Andra Hugo 13 years ago
Parent
Current commit
38e00e9d11
100 files changed, 7350 insertions and 1006 deletions
  1. 63 1
      .gitignore
  2. 2 0
      AUTHORS
  3. 85 1
      ChangeLog
  4. 22 8
      Makefile.am
  5. 17 1
      README
  6. 43 159
      README.dev
  7. 21 0
      STARPU-VERSION
  8. 9 0
      TODO
  9. 0 95
      acinclude.m4
  10. 326 107
      configure.ac
  11. 2 2
      libstarpu.pc.in
  12. 35 0
      starpu-1.0.pc.in
  13. 1 1
      starpu-top/StarPU-Top-common.pri
  14. 0 2
      starpu-top/StarPU-Top-qwt-system.pri
  15. 2 0
      starpu-top/StarPU-Top-qwt-system.pri.in
  16. 1 1
      starpu-top/aboutdialog.ui
  17. 3 3
      starpu-top/communicationmanager.cpp
  18. 3 3
      starpu-top/communicationmanager.h
  19. 3 3
      starpu-top/communicationthread.cpp
  20. 1 1
      starpu-top/configurationmanager.h
  21. 1 1
      starpu-top/dataaggregatorwidget.h
  22. 1 1
      starpu-top/datawidget.h
  23. 2 4
      starpu-top/extradist
  24. 9 9
      starpu-top/ganttwidget.cpp
  25. 6 6
      starpu-top/ganttwidget.h
  26. 0 0
      starpu-top/images/starpu_top.png
  27. 1 1
      starpu-top/interactivewidget.h
  28. 11 11
      starpu-top/mainwindow.cpp
  29. 4 4
      starpu-top/mainwindow.h
  30. 3 3
      starpu-top/mainwindow.ui
  31. 1 1
      starpu-top/preferencesdialog.h
  32. 1 1
      starpu-top/resources.qrc
  33. 1 1
      starpu-top/sessionsetupmanager.h
  34. 7 7
      starpu-top/starputoptypes.h
  35. 7 7
      starpu-top/taskmanager.cpp
  36. 3 3
      starpu-top/taskmanager.h
  37. 1 0
      starpufft/.gitignore
  38. 97 0
      starpufft/Makefile.am
  39. 19 0
      starpufft/cuda_kernels.cu
  40. 19 0
      starpufft/cudaf_kernels.cu
  41. 156 0
      starpufft/cudax_kernels.cu
  42. 23 0
      starpufft/cudax_kernels.h
  43. 51 0
      starpufft/double.h
  44. 19 0
      starpufft/examples/test.c
  45. 19 0
      starpufft/examples/test_threads.c
  46. 19 0
      starpufft/examples/testf.c
  47. 19 0
      starpufft/examples/testf_threads.c
  48. 283 0
      starpufft/examples/testx.c
  49. 113 0
      starpufft/examples/testx_threads.c
  50. 51 0
      starpufft/float.h
  51. 27 0
      starpufft/libstarpufft.pc.in
  52. 27 0
      starpufft/starpufft-1.0.pc.in
  53. 19 0
      starpufft/starpufft.c
  54. 60 0
      starpufft/starpufft.h
  55. 21 0
      starpufft/starpufft_common.c
  56. 19 0
      starpufft/starpufftf.c
  57. 454 0
      starpufft/starpufftx.c
  58. 847 0
      starpufft/starpufftx1d.c
  59. 850 0
      starpufft/starpufftx2d.c
  60. 272 32
      tests/Makefile.am
  61. 23 3
      tests/cholesky/prio.r
  62. 23 3
      tests/cholesky/sched.r
  63. 0 65
      tests/core/multithreaded_init.c
  64. 0 121
      tests/core/task_wait_api.c
  65. 8 5
      tests/datawizard/acquire_cb.c
  66. 43 25
      tests/datawizard/acquire_cb_insert.c
  67. 46 18
      tests/datawizard/acquire_release.c
  68. 42 16
      tests/datawizard/acquire_release2.c
  69. 34 30
      tests/datawizard/copy.c
  70. 24 15
      tests/datawizard/critical_section_with_void_interface.c
  71. 66 32
      tests/datawizard/data_implicit_deps.c
  72. 46 30
      tests/datawizard/data_invalidation.c
  73. 48 39
      tests/datawizard/data_lookup.c
  74. 35 18
      tests/datawizard/dining_philosophers.c
  75. 174 0
      tests/datawizard/double_parameter.c
  76. 142 28
      tests/datawizard/dsm_stress.c
  77. 139 0
      tests/datawizard/gpu_register.c
  78. 23 17
      tests/datawizard/handle_to_pointer.c
  79. 102 0
      tests/datawizard/in_place_partition.c
  80. 78 31
      tests/datawizard/increment_redux.c
  81. 255 0
      tests/datawizard/increment_redux_lazy.c
  82. 102 30
      tests/datawizard/increment_redux_v2.c
  83. 70 0
      tests/datawizard/interfaces/bcsr/bcsr_cuda.cu
  84. 198 0
      tests/datawizard/interfaces/bcsr/bcsr_interface.c
  85. 130 0
      tests/datawizard/interfaces/bcsr/bcsr_opencl.c
  86. 29 0
      tests/datawizard/interfaces/bcsr/bcsr_opencl_kernel.cl
  87. 80 0
      tests/datawizard/interfaces/block/block_cuda.cu
  88. 163 0
      tests/datawizard/interfaces/block/block_interface.c
  89. 120 0
      tests/datawizard/interfaces/block/block_opencl.c
  90. 46 0
      tests/datawizard/interfaces/block/block_opencl_kernel.cl
  91. 106 0
      tests/datawizard/interfaces/copy_interfaces.c
  92. 68 0
      tests/datawizard/interfaces/csr/csr_cuda.cu
  93. 170 0
      tests/datawizard/interfaces/csr/csr_interface.c
  94. 130 0
      tests/datawizard/interfaces/csr/csr_opencl.c
  95. 29 0
      tests/datawizard/interfaces/csr/csr_opencl_kernel.cl
  96. 71 0
      tests/datawizard/interfaces/matrix/matrix_cuda.cu
  97. 145 0
      tests/datawizard/interfaces/matrix/matrix_interface.c
  98. 129 0
      tests/datawizard/interfaces/matrix/matrix_opencl.c
  99. 31 0
      tests/datawizard/interfaces/matrix/matrix_opencl_kernel.cl
  100. 0 0
      tests/datawizard/interfaces/multiformat/multiformat_conversion_codelets.c

+ 63 - 1
.gitignore

@@ -3,7 +3,6 @@
 /config.status
 /autom4te.cache
 /libtool
-/libstarpu.pc
 /aclocal.m4
 /build-aux
 /GPATH
@@ -186,3 +185,66 @@ starpu.log
 /gcc-plugin/tests/lib-user
 /gcc-plugin/examples/matrix-mult
 /gcc-plugin/src/c-expr.c
+/gcc-plugin/tests/heap-allocated
+/gcc-plugin/tests/output-pointer
+/gcc-plugin/examples/vector_scal/vector_scal
+/doc/starpu.info-1
+/doc/starpu.info-2
+/examples/axpy/axpy
+/examples/basic_examples/mult_impl
+/examples/basic_examples/multiformat
+/examples/cg/cg
+/examples/cholesky/cholesky_grain_tag
+/examples/cholesky/cholesky_implicit
+/examples/cholesky/cholesky_tag
+/examples/cholesky/cholesky_tile_tag
+/examples/cpp/incrementer_cpp
+/examples/filters/custom_mf/custom_mf_filter
+/examples/filters/multiformat/multiformat_filter
+/examples/heat/heat
+/examples/lu/lu_example_double
+/examples/lu/lu_example_float
+/examples/lu/lu_implicit_example_double
+/examples/lu/lu_implicit_example_float
+/examples/mult/dgemm
+/examples/mult/sgemm
+/mpi/starpumpi-1.0.pc
+/socl/socl-1.0.pc
+/starpufft/starpufft-1.0.pc
+/tests/core/deprecated
+/tests/core/deprecated_buffer
+/tests/core/deprecated_func
+/tests/core/multiformat_data_release
+/tests/core/multiformat_handle_conversion
+/tests/core/starpu_init
+/tests/core/starpu_task_bundle
+/tests/core/starpu_worker_exists
+/tests/datawizard/copy
+/tests/datawizard/double_parameter
+/tests/datawizard/gpu_register
+/tests/datawizard/in_place_partition
+/tests/datawizard/increment_redux_lazy
+/tests/datawizard/interfaces/bcsr/bcsr_interface
+/tests/datawizard/interfaces/block/block_interface
+/tests/datawizard/interfaces/csr/csr_interface
+/tests/datawizard/interfaces/matrix/matrix_interface
+/tests/datawizard/interfaces/multiformat/advanced/multiformat_cuda_opencl
+/tests/datawizard/interfaces/multiformat/advanced/multiformat_data_release
+/tests/datawizard/interfaces/multiformat/advanced/multiformat_handle_conversion
+/tests/datawizard/interfaces/multiformat/advanced/multiformat_worker
+/tests/datawizard/interfaces/multiformat/advanced/same_handle
+/tests/datawizard/interfaces/multiformat/multiformat_interface
+/tests/datawizard/interfaces/test_interfaces
+/tests/datawizard/interfaces/test_vector_interface
+/tests/datawizard/interfaces/variable/variable_interface
+/tests/datawizard/interfaces/vector/test_vector_interface
+/tests/datawizard/interfaces/void/void_interface
+/tests/datawizard/partition_lazy
+/tests/loader
+/tests/starpu_machine_display
+/tools/starpu_calibrate_bus.1
+/tools/starpu_machine_display.1
+/tools/starpu_perfmodel_display.1
+/tools/starpu_perfmodel_plot.1
+/starpu-1.0.pc
+/gcc-plugin/examples/cholesky/cholesky

+ 2 - 0
AUTHORS

@@ -12,3 +12,5 @@ Jean-Marie Couteyen <jm.couteyen@gmail.com>
 Anthony Roy <theanthony33@gmail.com>
 David Gómez <david_gomez1380@yahoo.com.mx>
 Nguyen Quôc Dinh <nguyen.quocdinh@gmail.com>
+Antoine Lucas <antoine.lucas.33@gmail.com>
+Pierre André Wacrenier <wacrenier@labri.fr>

+ 85 - 1
ChangeLog

@@ -1,3 +1,87 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
+# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+StarPU 1.0 (svn revision xxxx)
+==============================================
+The extensions-again release
+
+  * struct starpu_data_interface_ops --- operations on a data
+        interface --- define a new function pointer allocate_new_data
+        which creates a new data interface of the given type based on
+        an existing handle
+  * Make environment variables take precedence over the configuration
+        passed to starpu_init()
+  * Add man pages for some of the tools
+  * Add reduction mode to starpu_mpi_insert_task
+  * Add C++ application example in examples/cpp/
+  * Increase default value for STARPU_MAXCPUS -- Maximum number of
+        CPUs supported -- to 64.
+  * Libtool interface versioning has been included in libraries names
+        (libstarpu-1.0.so, libstarpumpi-1.0.so,
+        libstarpufft-1.0.so, libsocl-1.0.so)
+  * Enable by default the SOCL extension.
+  * Enable by default the GCC plug-in extension.
+  * Add a field named magic to struct starpu_task which is set when
+        initialising the task. starpu_task_submit will fail if the
+        field does not have the right value. This will hence avoid
+        submitting tasks which have not been properly initialised.
+  * Make where field for struct starpu_codelet optional. When unset, its
+	value will be automatically set based on the availability of the
+	different XXX_funcs fields of the codelet.
+  * Add a hook function pre_exec_hook in struct starpu_sched_policy.
+        The function is meant to be called in drivers. Schedulers
+        can use it to be notified when a task is about being computed.
+  * Define access modes for data handles into starpu_codelet and no longer
+	in starpu_task. Hence mark (struct starpu_task).buffers as
+	deprecated, and add (struct starpu_task).handles and (struct
+	starpu_codelet).modes
+  * Install headers under $includedir/starpu/1.0.
+  * Deprecate cost_model, and introduce cost_function, which is provided
+	with the whole task structure, the target arch and implementation
+	number
+  * Permit the application to provide its own size base for performance
+	models
+  * Fields xxx_func of struct starpu_codelet are made deprecated. One
+	should use instead fields xxx_funcs.
+  * Applications can provide several implementations of a codelet for the
+	same architecture.
+  * A new multi-format interface permits to use different binary formats
+	on CPUs & GPUs, the conversion functions being provided by the
+	application and called by StarPU as needed (and as less as
+	possible).
+  * Add a gcc plugin to extend the C interface with pragmas which allows to
+	easily define codelets and issue tasks.
+  * Add codelet execution time statistics plot.
+  * Add bus speed in starpu_machine_display.
+  * Add a StarPU-Top feedback and steering interface.
+  * Documentation improvement.
+  * Add a STARPU_DATA_ACQUIRE_CB which permits to inline the code to be
+	done.
+  * Permit to specify MPI tags for more efficient starpu_mpi_insert_task
+  * Add SOCL, an OpenCL interface on top of StarPU.
+  * Add gdb functions.
+  * Add complex support to LU example.
+  * Add an OpenMP fork-join example.
+  * Permit to use the same data several times in write mode in the
+	parameters of the same task.
+  * Some types were renamed for consistency. The tools/dev/rename.sh
+	script can be used to port code using former names. You can also
+	choose to include starpu_deprecated_api.h (after starpu.h) to keep
+	using the old types.
+
 StarPU 0.9 (svn revision 3721)
 ==============================================
 The extensions release
@@ -58,7 +142,7 @@ The asynchronous heterogeneous multi-accelerator release
     - Implement starpu_worker_get_count
     - Implement starpu_display_codelet_stats
     - Implement starpu_data_prefetch_on_node
-    - Expose the starpu_data_set_wb_mask function
+    - Expose the starpu_data_set_wt_mask function
   * Support nvidia (heterogeneous) multi-GPU
   * Add the data request mechanism
     - All data transfers use data requests now

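As a rough illustration of the 1.0-style submission path summarised in the ChangeLog above (access modes declared on the codelet, data handles attached to the task, xxx_funcs instead of the deprecated xxx_func), a codelet and task might be set up as sketched below. Only the names quoted in the ChangeLog come from this commit; nbuffers, STARPU_RW and starpu_task_create() are assumed from the public StarPU API rather than shown here.

    static void scal_cpu(void *buffers[], void *cl_arg)
    {
            /* kernel body operating on the data registered for the task */
    }

    static struct starpu_codelet scal_cl =
    {
            .cpu_funcs = { scal_cpu, NULL },  /* xxx_funcs replaces the deprecated xxx_func */
            .nbuffers = 1,                    /* assumed field name */
            .modes = { STARPU_RW },           /* (struct starpu_codelet).modes, per the ChangeLog */
    };

    struct starpu_task *task = starpu_task_create();
    task->cl = &scal_cl;
    task->handles[0] = handle;                /* (struct starpu_task).handles replaces .buffers */
    starpu_task_submit(task);                 /* rejects tasks whose magic field was not initialised */
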
+ 22 - 8
Makefile.am

@@ -1,7 +1,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2009-2011  Université de Bordeaux 1
-# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+# Copyright (C) 2009-2012  Université de Bordeaux 1
+# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -34,7 +34,6 @@ if COND_OPT
 SUBDIRS += tests/opt examples/opt
 endif
 
-
 if BUILD_GCC_PLUGIN
 SUBDIRS += gcc-plugin
 endif
@@ -43,12 +42,16 @@ if BUILD_SCHED_CTX_HYPERVISOR
 SUBDIRS += sched_ctx_hypervisor
 endif
 
+if BUILD_STARPUFFT
+SUBDIRS += starpufft
+endif
+
 pkgconfigdir = $(libdir)/pkgconfig
-pkgconfig_DATA = libstarpu.pc
+pkgconfig_DATA = libstarpu.pc starpu-1.0.pc
 
-include_HEADERS = 				\
+versincludedir = $(includedir)/starpu/$(STARPU_EFFECTIVE_VERSION)
+versinclude_HEADERS = 				\
 	include/starpu.h			\
-	include/starpu_config.h			\
 	include/starpu_data_filters.h		\
 	include/starpu_data_interfaces.h	\
 	include/starpu_task.h			\
@@ -57,13 +60,19 @@ include_HEADERS = 				\
 	include/starpu_data.h			\
 	include/starpu_perfmodel.h		\
 	include/starpu_util.h			\
+	include/starpu_fxt.h			\
 	include/starpu_cuda.h			\
 	include/starpu_opencl.h			\
 	include/starpu_expert.h			\
 	include/starpu_profiling.h		\
 	include/starpu_bound.h			\
 	include/starpu_scheduler.h		\
-	include/starpu_top.h
+	include/starpu_top.h			\
+	include/starpu_deprecated_api.h         \
+	include/starpu_hash.h
+
+nodist_versinclude_HEADERS = 			\
+	include/starpu_config.h
 
 if BUILD_STARPU_TOP
 all-local:
@@ -86,6 +95,11 @@ else
 txtdir = ${docdir}
 endif
 txt_DATA = AUTHORS COPYING.LGPL README
-EXTRA_DIST = AUTHORS COPYING.LGPL README
+EXTRA_DIST = AUTHORS COPYING.LGPL README STARPU-VERSION
 
 include starpu-top/extradist
+
+showcheck:
+	for i in $(SUBDIRS) ; do \
+		make -C $$i showcheck ; \
+	done

+ 17 - 1
README

@@ -1,3 +1,19 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
+# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
 ++=================++
 || I. Introduction ||
 ++=================++
@@ -134,7 +150,7 @@ Some examples ready to run are installed into $prefix/lib/starpu/{examples,mpi}
 ++==============++
 
 To upgrade your source code from older version (there were quite a few
-renamings), use the tools/rename.sh script
+renamings), use the tools/dev/rename.sh script
 
 ++===============++
 || VIII. Contact ||

+ 43 - 159
README.dev

@@ -1,169 +1,53 @@
-Installing StarPU on windows
-----------------------------
-
-If you are building from a tarball downloaded from the website, you can skip the
-cygwin part.
-
-1. Install cygwin
-
-http://cygwin.com/install.html
-
-Make sure the following packages are available:
-- (Devel)/subversion
-- (Devel)/libtool
-- (Devel)/gcc
-- (Devel)/make
-- your favorite editor (vi, emacs, ...)
-- (Devel)/gdb
-- (Archive)/zip
-- (Devel)/pkg-config
-
-2. Install mingw
-
-http://sourceforge.net/projects/mingw/
-
-3. Install hwloc (not mandatory)
-
-http://www.open-mpi.org/projects/hwloc
-
-4. Install Microsoft Visual C++ Studio Express
-
-   http://www.microsoft.com/express/Downloads
-
-   Add in your path the following directories.
-   (adjusting where necessary for the Installation location according to VC
-    version and on 64 and 32bit Windows versions)
-
-   On cygwin, with Visual C++ 2010 e.g.;
-
-   export PATH="/cygdrive/c/Program Files (x86)/Microsoft Visual Studio 10.0/Common7/IDE":$PATH
-   export PATH="/cygdrive/c/Program Files (x86)/Microsoft Visual Studio 10.0/VC/bin":$PATH
-
-   On MingW, with Visual C++ 2010, e.g.;
-
-   export PATH="/c/Program Files (x86)/Microsoft Visual Studio 10.0/Common7/IDE":$PATH
-   export PATH="/c/Program Files (x86)/Microsoft Visual Studio 10.0/VC/bin":$PATH
-
-   Try to call <lib.exe> and <link.exe> without any option to make sure these
-   dump their help output, else no .def or .lib file will be produced.
-
-5. Install GPU Drivers (not mandatory)
-
-  5.1 Install Cuda
-
-      http://developer.nvidia.com/object/cuda_3_2_downloads.html
-
-      You need to install at least the CUDA toolkit.
-
-      libtool is not able to find the libraries automatically, you
-      need to make some copies:
-
-      copy c:\cuda\lib\cuda.lib c:\cuda\lib\libcuda.lib
-      copy c:\cuda\lib\cudart.lib c:\cuda\lib\libcudart.lib
-      copy c:\cuda\lib\cublas.lib c:\cuda\lib\libcublas.lib
-      copy c:\cuda\lib\cufft.lib c:\cuda\lib\libcufft.lib
-      copy c:\cuda\lib\OpenCL.lib c:\cuda\lib\libOpenCL.lib
-
-      (and if the version of your CUDA driver is >= 3.2)
-
-      copy c:\cuda\lib\curand.lib c:\cuda\lib\libcurand.lib
-
-      Add the CUDA bin directory in your path
-
-      export PATH=/cygdrive/c/CUDA/bin:$PATH
-
-      Since we tell nvcc to build CUDA code with gcc instead of Visual studio,
-      a fix is needed: c:\cuda\include\host_defines.h has a bogus CUDARTAPI
-      definition which makes linking fail completely. Replace the first
-      occurence of
-
-      #define CUDARTAPI
-
-      with
-
-      #ifdef _WIN32
-      #define CUDARTAPI __stdcall
-      #else
-      #define CUDARTAPI
-      #endif
-
-      While at it, you can also comment the __cdecl definition to avoid spurious
-      warnings.
-
-
-  5.2 Install OpenCL
-
-      http://developer.nvidia.com/object/opencl-download.html
-
-      You need to download the NVIDIA Drivers for your version of
-      Windows. Executing the file will extract all files in a given
-      directory. The the driver installation will start, it will fail
-      if no compatibles drivers can be found on your system.
-
-      Anyway, you should copy the *.dl_ files from the directory
-      (extraction path) in the bin directory of the CUDA installation
-      directory (the directory should be v3.2/bin/)
-
-  5.3 Install MsCompress
-
-      http://gnuwin32.sourceforge.net/packages/mscompress.htm
-
-      Go in the CUDA bin directory, uncompress .dl_ files and rename
-      them in .dll files
-
-      cp /cygdrive/c/NVIDIA/DisplayDriver/190.89/International/*.dl_ .
-      for i in *.dl_ ; do /cygdrive/c/Program\ Files/GnuWin32/bin/msexpand.exe  $i ; mv ${i%_} ${i%_}l ; done
-
-If you are building from a tarball downloaded from the website, you can skip the
-autogen.sh part.
-
-6. Start autogen.sh from cygwin
-
-   cd starpu-trunk
-   ./autogen.sh
-
-7. Start a MinGW shell
-
-   /cygdrive/c/MinGW/msys/1.0/bin/sh.exe --login -i
-
-8. Configure, make, install from MinGW
-
-   If you have a non-english version of windows, use
-
-     export LANG=C
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
+# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+Contents
+========
+
+- Developer Warnings
+- Naming Conventions
+- Coding Style
+
+Developer Warnings
+------------------
 
-   else libtool has troubles parsing the translated output of the toolchain.
+They are enabled only if the STARPU_DEVEL environment variable is
+defined to a non-empty value, when calling configure.
 
-   cd starpu-trunk
-   mkdir build
-   cd build
-   ../configure --prefix=$PWD/target --disable-default-drand48 \
-        --with-hwloc=<HWLOC installation directory> \
-        --with-cuda-dir=<CUDA installation directory> \
-        --with-cuda-lib-dir=<CUDA installation directory>/lib/Win32 \
-	--with-opencl-dir=<CUDA installation directory>
-   make
-   make install
+
 
-   Also convert a couple of files to CRLF:
+Naming Conventions
+------------------
 
-   sed -e 's/$/'$'\015'/ < README > $PWD/target/README.txt
-   sed -e 's/$/'$'\015'/ < AUTHORS > $PWD/target/AUTHORS.txt
-   sed -e 's/$/'$'\015'/ < COPYING.LGPL > $PWD/target/COPYING.LGPL.txt
+* Prefix names of public objects (types, functions, etc.) with "starpu"
 
-9. If you want your StarPU installation to be standalone, you need to
-   copy the DLL files from hwloc, Cuda, and OpenCL into the StarPU
-   installation bin directory, as well as MinGW/bin/libpthread*dll
+* Prefix names of internal objects (types, functions, etc.) with "_starpu"
 
-   cp <CUDA directory>/bin/*dll target/bin
-   cp <HWLOC directory>/bin/*dll target/bin
-   cp /cygdrive/c/MinGW/bin/libpthread*dll target/bin
+* Names for qualified types (struct, union, enum) do not end with _t, _s or similar.
+  Use _t only for typedef types, such as opaque public types, e.g
+       typedef struct _starpu_data_state* starpu_data_handle_t;
+  or
+       typedef uint64_t starpu_tag_t;
 
-   and set the StarPU bin directory in your path.
+* When a variable can only take a finite set of values, use an enum
+  type instead of defining macros for each of the values.
 
-   export PATH=<StarPU installation directory>/bin:$PATH
+
 
+Coding Style
+------------
 
-Developers warning
-------------------
-They are only enabled if the STARPU_DEVEL environment is defined to a non-empty value.
+* Curly braces always go on a new line

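Applied together, these conventions could look like the following sketch (all identifiers below are made up purely for illustration):

    /* public object: "starpu" prefix; _t is reserved for typedefs such as starpu_tag_t */
    void starpu_example_submit(starpu_data_handle_t handle, starpu_tag_t tag);

    /* finite set of values: prefer an enum over per-value macros */
    enum _starpu_example_state { _STARPU_EXAMPLE_IDLE, _STARPU_EXAMPLE_BUSY };

    /* internal helper: "_starpu" prefix, curly braces on their own line */
    static void _starpu_example_helper(void)
    {
            /* ... */
    }
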
+ 21 - 0
STARPU-VERSION

@@ -0,0 +1,21 @@
+# -*- sh -*-
+
+# Versioning (SONAMEs) for StarPU libraries.
+
+# Libtool interface versioning (info "(libtool) Versioning").
+LIBSTARPU_INTERFACE_CURRENT=0	# increment upon ABI change
+LIBSTARPU_INTERFACE_REVISION=0	# increment upon implementation change
+LIBSTARPU_INTERFACE_AGE=0	# set to CURRENT - PREVIOUS interface
+STARPU_EFFECTIVE_VERSION=1.0
+
+LIBSTARPUFFT_INTERFACE_CURRENT=0	# increment upon ABI change
+LIBSTARPUFFT_INTERFACE_REVISION=0	# increment upon implementation change
+LIBSTARPUFFT_INTERFACE_AGE=0		# set to CURRENT - PREVIOUS interface
+
+LIBSTARPUMPI_INTERFACE_CURRENT=0	# increment upon ABI change
+LIBSTARPUMPI_INTERFACE_REVISION=0	# increment upon implementation change
+LIBSTARPUMPI_INTERFACE_AGE=0		# set to CURRENT - PREVIOUS interface
+
+LIBSOCL_INTERFACE_CURRENT=0	# increment upon ABI change
+LIBSOCL_INTERFACE_REVISION=0	# increment upon implementation change
+LIBSOCL_INTERFACE_AGE=0		# set to CURRENT - PREVIOUS interface

+ 9 - 0
TODO

@@ -0,0 +1,9 @@
+
+Moving access modes for data handles from struct starpu_task to struct starpu_codelet
+=====================================================================================
+
+TODO list
+
+- Make struct starpu_buffer_descr private (or not, as it can still be used in tests and examples)
+
+- When cost_model is provided, but not cost_function, need to rebuild a struct starpu_buffer_descr

+ 0 - 95
acinclude.m4

@@ -1,95 +0,0 @@
-dnl Copyright (C) Free Software Foundation, Inc.
-dnl
-dnl This program is free software; you can redistribute it and/or modify
-dnl it under the terms of the GNU General Public License as published by
-dnl the Free Software Foundation; either version 2 of the License, or
-dnl (at your option) any later version.
-dnl 
-dnl This program is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-dnl GNU General Public License for more details.
-dnl 
-dnl You should have received a copy of the GNU General Public License
-dnl along with this program; if not, write to the Free Software
-dnl Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
-dnl
-dnl This test is taken from libgfortran
-
-dnl Check whether the target supports __sync_val_compare_and_swap.
-AC_DEFUN([STARPU_CHECK_SYNC_VAL_COMPARE_AND_SWAP], [
-  AC_CACHE_CHECK([whether the target supports __sync_val_compare_and_swap],
-		 ac_cv_have_sync_val_compare_and_swap, [
-  AC_LINK_IFELSE([AC_LANG_PROGRAM([int foo, bar;],
-			[bar = __sync_val_compare_and_swap(&foo, 0, 1);])],
-			[ac_cv_have_sync_val_compare_and_swap=yes],
-			[ac_cv_have_sync_val_compare_and_swap=no])])
-  if test $ac_cv_have_sync_val_compare_and_swap = yes; then
-    AC_DEFINE(STARPU_HAVE_SYNC_VAL_COMPARE_AND_SWAP, 1,
-	      [Define to 1 if the target supports __sync_val_compare_and_swap])
-  fi])
-
-dnl Check whether the target supports __sync_bool_compare_and_swap.
-AC_DEFUN([STARPU_CHECK_SYNC_BOOL_COMPARE_AND_SWAP], [
-  AC_CACHE_CHECK([whether the target supports __sync_bool_compare_and_swap],
-		 ac_cv_have_sync_bool_compare_and_swap, [
-  AC_LINK_IFELSE([AC_LANG_PROGRAM([int foo, bar;],
-			[bar = __sync_bool_compare_and_swap(&foo, 0, 1);])],
-			[ac_cv_have_sync_bool_compare_and_swap=yes],
-			[ac_cv_have_sync_bool_compare_and_swap=no])])
-  if test $ac_cv_have_sync_bool_compare_and_swap = yes; then
-    AC_DEFINE(STARPU_HAVE_SYNC_BOOL_COMPARE_AND_SWAP, 1,
-	      [Define to 1 if the target supports __sync_bool_compare_and_swap])
-  fi])
-
-dnl Check whether the target supports __sync_fetch_and_add.
-AC_DEFUN([STARPU_CHECK_SYNC_FETCH_AND_ADD], [
-  AC_CACHE_CHECK([whether the target supports __sync_fetch_and_add],
-		 ac_cv_have_sync_fetch_and_add, [
-  AC_LINK_IFELSE([AC_LANG_PROGRAM([int foo, bar;],
-			[bar = __sync_fetch_and_add(&foo, 1);])],
-			[ac_cv_have_sync_fetch_and_add=yes],
-			[ac_cv_have_sync_fetch_and_add=no])])
-  if test $ac_cv_have_sync_fetch_and_add = yes; then
-    AC_DEFINE(STARPU_HAVE_SYNC_FETCH_AND_ADD, 1,
-	      [Define to 1 if the target supports __sync_fetch_and_add])
-  fi])
-
-dnl Check whether the target supports __sync_fetch_and_or.
-AC_DEFUN([STARPU_CHECK_SYNC_FETCH_AND_OR], [
-  AC_CACHE_CHECK([whether the target supports __sync_fetch_and_or],
-		 ac_cv_have_sync_fetch_and_or, [
-  AC_LINK_IFELSE([AC_LANG_PROGRAM([int foo, bar;],
-			[bar = __sync_fetch_and_or(&foo, 1);])],
-			[ac_cv_have_sync_fetch_and_or=yes],
-			[ac_cv_have_sync_fetch_and_or=no])])
-  if test $ac_cv_have_sync_fetch_and_or = yes; then
-    AC_DEFINE(STARPU_HAVE_SYNC_FETCH_AND_OR, 1,
-	      [Define to 1 if the target supports __sync_fetch_and_or])
-  fi])
-
-dnl Check whether the target supports __sync_lock_test_and_set.
-AC_DEFUN([STARPU_CHECK_SYNC_LOCK_TEST_AND_SET], [
-  AC_CACHE_CHECK([whether the target supports __sync_lock_test_and_set],
-		 ac_cv_have_sync_lock_test_and_set, [
-  AC_LINK_IFELSE([AC_LANG_PROGRAM([int foo, bar;],
-			[bar = __sync_lock_test_and_set(&foo, 1);])],
-			[ac_cv_have_sync_lock_test_and_set=yes],
-			[ac_cv_have_sync_lock_test_and_set=no])])
-  if test $ac_cv_have_sync_lock_test_and_set = yes; then
-    AC_DEFINE(STARPU_HAVE_SYNC_LOCK_TEST_AND_SET, 1,
-	      [Define to 1 if the target supports __sync_lock_test_and_set])
-  fi])
-
-dnl Check whether the target supports __sync_synchronize.
-AC_DEFUN([STARPU_CHECK_SYNC_SYNCHRONIZE], [
-  AC_CACHE_CHECK([whether the target supports __sync_synchronize],
-		 ac_cv_have_sync_synchronize, [
-  AC_LINK_IFELSE([AC_LANG_PROGRAM(,
-			[__sync_synchronize();])],
-			[ac_cv_have_sync_synchronize=yes],
-			[ac_cv_have_sync_synchronize=no])])
-  if test $ac_cv_have_sync_synchronize = yes; then
-    AC_DEFINE(STARPU_HAVE_SYNC_SYNCHRONIZE, 1,
-	      [Define to 1 if the target supports __sync_synchronize])
-  fi])

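For reference, each of these deleted macros wraps a small link test; the __sync_fetch_and_add probe, for instance, amounts to compiling roughly the following C program (reconstructed from the AC_LINK_IFELSE body above) and defining STARPU_HAVE_SYNC_FETCH_AND_ADD when it links:

    int foo, bar;

    int main(void)
    {
            /* succeeds only if the target supports the GCC __sync builtins */
            bar = __sync_fetch_and_add(&foo, 1);
            return bar;
    }
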
+ 326 - 107
configure.ac

@@ -1,9 +1,9 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
-# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+# Copyright (C) 2009-2012  Université de Bordeaux 1
+# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
 # Copyright (C) 2011  Télécom-SudParis
-# Copyright (C) 2011  INRIA
+# Copyright (C) 2011, 2012  Institut National de Recherche en Informatique et Automatique
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -16,20 +16,51 @@
 #
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
-AC_INIT([StarPU],0.9.2, [starpu-bugs@lists.gforge.inria.fr], starpu)
+AC_INIT([StarPU],1.0.0rc2, [starpu-devel@lists.gforge.inria.fr], starpu)
 AC_CONFIG_SRCDIR(include/starpu.h)
 AC_CONFIG_AUX_DIR([build-aux])
+
+dnl Versioning.
+
+STARPU_MAJOR_VERSION="`echo $PACKAGE_VERSION | cut -d . -f 1`"
+STARPU_MINOR_VERSION="`echo $PACKAGE_VERSION | cut -d . -f 2`"
+AC_SUBST([STARPU_MAJOR_VERSION])
+AC_SUBST([STARPU_MINOR_VERSION])
+AC_SUBST([STARPU_EFFECTIVE_VERSION])
+AC_DEFINE_UNQUOTED([STARPU_MAJOR_VERSION], [$STARPU_MAJOR_VERSION],
+  [Major version number of StarPU.])
+AC_DEFINE_UNQUOTED([STARPU_MINOR_VERSION], [$STARPU_MINOR_VERSION],
+  [Major version number of StarPU.])
+
+. "$srcdir/STARPU-VERSION"
+AC_SUBST([LIBSTARPU_INTERFACE_CURRENT])
+AC_SUBST([LIBSTARPU_INTERFACE_REVISION])
+AC_SUBST([LIBSTARPU_INTERFACE_AGE])
+AC_SUBST([LIBSTARPUMPI_INTERFACE_CURRENT])
+AC_SUBST([LIBSTARPUMPI_INTERFACE_REVISION])
+AC_SUBST([LIBSTARPUMPI_INTERFACE_AGE])
+AC_SUBST([LIBSTARPUFFT_INTERFACE_CURRENT])
+AC_SUBST([LIBSTARPUFFT_INTERFACE_REVISION])
+AC_SUBST([LIBSTARPUFFT_INTERFACE_AGE])
+AC_SUBST([LIBSOCL_INTERFACE_CURRENT])
+AC_SUBST([LIBSOCL_INTERFACE_REVISION])
+AC_SUBST([LIBSOCL_INTERFACE_AGE])
+
 AC_CANONICAL_SYSTEM
 
 dnl Automake 1.11 introduced `silent-rules' and `color-tests'.  Use them
 dnl when they're available.
 m4_ifdef([AM_SILENT_RULES],
-  [AM_INIT_AUTOMAKE([1.11 -Wall -Werror foreign silent-rules color-tests])],
+  [AM_INIT_AUTOMAKE([1.11 -Wall -Werror foreign silent-rules color-tests parallel-tests])],
   [AM_INIT_AUTOMAKE([1.10 -Wall -Werror foreign])])
 
+m4_ifdef([AM_SILENT_RULES],
+  [AM_SILENT_RULES(yes)])
+
 AC_PREREQ(2.60)
 
 AC_PROG_CC
+AC_PROG_CXX
 AC_PROG_CPP
 AC_PROG_SED
 AC_PROG_LN_S
@@ -61,13 +92,18 @@ AM_CONDITIONAL([STARPU_HAVE_WINDOWS], [test "x$starpu_windows" = "xyes"])
 # on Darwin, GCC targets i386 by default, so we don't have atomic ops
 AC_CHECK_SIZEOF([void *])
 SIZEOF_VOID_P=$ac_cv_sizeof_void_p
-if test x$SIZEOF_VOID_P = x4; then
-	case "$target" in
-	i386-*darwin*) CFLAGS+=" -march=i686 " ;;
-	esac
-fi
-
-
+case $SIZEOF_VOID_P in
+	4)
+		case "$target" in
+		i386-*darwin*) CFLAGS+=" -march=i686 " ;;
+		esac
+		STARPU_MS_LIB_ARCH=X86
+		;;
+	8)
+		STARPU_MS_LIB_ARCH=X64
+		;;
+esac
+AC_SUBST(STARPU_MS_LIB_ARCH)
 
 # This will be useful for program which use CUDA (and .cubin files) which need
 # some path to the CUDA code at runtime.
@@ -122,8 +158,14 @@ else
   AC_DEFINE([starpu_erand48_r(xsubi, buffer, result)],[do {*(result) = ((double)(rand()) / RAND_MAX);} while (0);],[erand48_r equivalent function])
 fi
 
+# Some systems do not define strerror_r
+AC_CHECK_FUNC([strerror_r], [AC_DEFINE([STARPU_HAVE_STRERROR_R], [1], [Define to 1 if the function strerro_r is available.])])
+
+# Some systems do not define unsetenv
+AC_CHECK_FUNC([unsetenv], [AC_DEFINE([STARPU_HAVE_UNSETENV], [1], [Define to 1 if the function unsetenv is available.])])
+
 # Define slow machine
-AC_ARG_ENABLE(slow-machine, [AS_HELP_STRING([--disable-slow-machine],
+AC_ARG_ENABLE(slow-machine, [AS_HELP_STRING([--enable-slow-machine],
 				   [Lower default values for the testcases run by make check])],
 				   enable_slow_machine=$enableval, enable_slow_machine=false)
 if  test x$enable_slow_machine = xyes; then
@@ -132,6 +174,8 @@ fi
 
 AC_CHECK_HEADERS([malloc.h], [AC_DEFINE([STARPU_HAVE_MALLOC_H], [1], [Define to 1 if you have the <malloc.h> header file.])])
 
+AC_CHECK_HEADERS([valgrind/valgrind.h], [AC_DEFINE([STARPU_HAVE_VALGRIND_H], [1], [Define to 1 if you have the <valgrind/valgrind.h> header file.])])
+
 # This defines HAVE_SYNC_VAL_COMPARE_AND_SWAP
 STARPU_CHECK_SYNC_VAL_COMPARE_AND_SWAP
 
@@ -198,7 +242,7 @@ AM_CONDITIONAL([STARPU_USE_SCHED_CTX_HYPERVISOR], [test "x$build_sched_ctx_hyper
 AC_MSG_CHECKING(maximum number of CPUs)
 AC_ARG_ENABLE(maxcpus, [AS_HELP_STRING([--enable-maxcpus=<number>],
 			[maximum number of CPUs])],
-			maxcpus=$enableval, maxcpus=16)
+			maxcpus=$enableval, maxcpus=64)
 AC_MSG_RESULT($maxcpus)
 AC_DEFINE_UNQUOTED(STARPU_MAXCPUS, [$maxcpus], [Maximum number of CPUs supported])
 
@@ -312,7 +356,7 @@ AC_DEFUN([STARPU_CHECK_CUDA_RUNTIME],
     __cuda_include_dir=$2
     __cuda_lib_dir=$3
 
-    if test "$__cuda_dir" != "no" ; then
+    if test "$__cuda_dir" != "no" -a "$__cuda_dir" != "" ; then
 	AC_MSG_CHECKING(whether CUDA RT is available in $__cuda_dir)
     else
 	AC_MSG_CHECKING(whether CUDA RT is available)
@@ -349,8 +393,8 @@ AC_DEFUN([STARPU_CHECK_CUDA_RUNTIME],
         if test "$have_valid_cuda" = "no" ; then
             if test "$3" = "no" -a "$__cuda_dir" != "no" ; then
                 __cuda_lib_dir="$__cuda_dir/lib64"
+		LDFLAGS="${SAVED_LDFLAGS} -L$__cuda_lib_dir"
 	        STARPU_CUDA_LDFLAGS="${SAVED_STARPU_CUDA_LDFLAGS} -L$__cuda_lib_dir"
-	        LDFLAGS="${SAVED_LDFLAGS} -L$__cuda_lib_dir"
 	        AC_HAVE_LIBRARY([cudart],[have_valid_cuda=yes],[have_valid_cuda=no])
                 unset ac_cv_lib_cudart_main
             fi
@@ -359,6 +403,7 @@ AC_DEFUN([STARPU_CHECK_CUDA_RUNTIME],
 
     if test "$have_valid_cuda" = "yes" ; then
         STARPU_CUDA_LDFLAGS="$STARPU_CUDA_LDFLAGS -lcudart"
+	LDFLAGS="${SAVED_LDFLAGS} ${STARPU_CUDA_LDFLAGS}"
 	# we also check that CUBLAS is available
 	AC_HAVE_LIBRARY([cublas],[have_valid_cuda=yes],[have_valid_cuda=no])
         unset ac_cv_lib_cublas_main
@@ -379,7 +424,7 @@ AC_DEFUN([STARPU_CHECK_CUDA_RUNTIME],
 if test x$enable_cuda = xyes -o x$enable_cuda = xmaybe; then
     STARPU_CHECK_CUDA($cuda_dir, $cuda_lib_dir)
     if test "$have_valid_cuda" = "no" ; then
-        for f in "/usr/local/cuda" "/c/cuda" "/cygdrive/c/cuda" "/opt/cuda" "$CUDA_INC_PATH" "$CUDA_INSTALL_PATH"; do
+        for f in "/usr/local/cuda" "/c/cuda" "/cygdrive/c/cuda" "/opt/cuda" "$CUDA_INC_PATH/.." "$CUDA_INSTALL_PATH" "$CUDA_TOOLKIT"; do
             STARPU_CHECK_CUDA($f, "no")
             if test "$have_valid_cuda" = "yes" ; then
                 break
@@ -390,7 +435,7 @@ if test x$enable_cuda = xyes -o x$enable_cuda = xmaybe; then
     if test "$have_valid_cuda" = "yes" ; then
         STARPU_CHECK_CUDA_RUNTIME($cuda_dir, $cuda_include_dir, $cuda_lib_dir)
         if test "$have_valid_cuda" = "no" ; then
-            for f in "/usr/local/cuda" "/c/cuda" "/cygdrive/c/cuda" "/opt/cuda" "$CUDA_INC_PATH" "$CUDA_INSTALL_PATH"; do
+            for f in "/usr/local/cuda" "/c/cuda" "/cygdrive/c/cuda" "/opt/cuda" "$CUDA_INC_PATH/.." "$CUDA_INSTALL_PATH" "$CUDA_TOOLKIT"; do
                 STARPU_CHECK_CUDA_RUNTIME($f, "no", "no")
                 if test "$have_valid_cuda" = "yes" ; then
                     break
@@ -399,8 +444,24 @@ if test x$enable_cuda = xyes -o x$enable_cuda = xmaybe; then
         fi
     fi
 
+    # Check cuda is compatible with the C compiler
+    AC_MSG_CHECKING(whether CUDA is working)
+    if test "$have_valid_cuda" = "yes" ; then
+        SAVED_CPPFLAGS="${CPPFLAGS}"
+        CPPFLAGS="${CPPFLAGS} ${STARPU_CUDA_CPPFLAGS}"
+	AC_COMPILE_IFELSE([AC_LANG_PROGRAM(
+		[[#include <cuda.h>]],
+		[[]]
+		),
+	    [have_valid_cuda="yes"],
+	    [have_valid_cuda="no"]
+	])
+        CPPFLAGS="${SAVED_CPPFLAGS}"
+    fi
+    AC_MSG_RESULT($have_valid_cuda)
+
     # in case CUDA was explicitely required, but is not available, this is an error
-    if test x$enable_cuda = xyes -a x$have_valid_cuda = no; then
+    if test x$enable_cuda = xyes -a x$have_valid_cuda = xno; then
 	AC_MSG_ERROR([cannot find CUDA])
     fi
     # now we enable CUDA if and only if a proper setup is available
@@ -609,21 +670,28 @@ AC_ARG_WITH(opencl-lib-dir,
 		enable_opencl=yes
 	], [opencl_lib_dir=no])
 
-if test x$enable_opencl = xyes -o x$enable_opencl = xmaybe; then
-    	STARPU_CHECK_OPENCL($opencl_dir, $opencl_include_dir, $opencl_lib_dir)
-        if test "$have_valid_opencl" = "no" ; then
-            for f in "/usr/local/cuda" "/c/cuda" "/cygdrive/c/cuda" "/opt/cuda" "$CUDA_INC_PATH" "$CUDA_INSTALL_PATH" ; do
-                if test -n $f ; then
-    	            STARPU_CHECK_OPENCL($f, "no", "no")
-                    if test "$have_valid_opencl" = "yes" ; then
-                        break
-                    fi
-                fi
-            done
-        fi
+AC_DEFUN([STARPU_LOOK_FOR_OPENCL],
+[
+    	if test "x$has_opencl_being_checked" != "xyes" ; then
+    	    STARPU_CHECK_OPENCL($opencl_dir, $opencl_include_dir, $opencl_lib_dir)
+	    if test "$have_valid_opencl" = "no" ; then
+            	for f in "/usr/local/cuda" "/c/cuda" "/cygdrive/c/cuda" "/opt/cuda" "$CUDA_INC_PATH/.." "$CUDA_INSTALL_PATH" "$CUDA_TOOLKIT"; do
+		    if test -n $f ; then
+    			STARPU_CHECK_OPENCL($f, "no", "no")
+			if test "$have_valid_opencl" = "yes" ; then
+			    break
+			fi
+		    fi
+		done
+	    fi
+	    has_opencl_being_checked=yes
+	fi
+])
 
+if test x$enable_opencl = xyes -o x$enable_opencl = xmaybe; then
+	STARPU_LOOK_FOR_OPENCL()
 	# in case OpenCL was explicitely required, but is not available, this is an error
-	if test x$enable_opencl = xyes -a x$have_valid_opencl = no; then
+	if test x$enable_opencl = xyes -a x$have_valid_opencl = xno; then
 	    AC_MSG_ERROR([cannot find OpenCL])
 	fi
 
@@ -684,7 +752,7 @@ if test x$enable_gordon = xyes -o x$enable_gordon = xmaybe; then
 	# AC_CHECK_FUNC(gordon_init, [gordon], [have_valid_gordon=no])
 
 	# in case Gordon was explicitely required, but is not available, this is an error
-	if test x$enable_gordon = xyes -a x$have_valid_gordon = no; then
+	if test x$enable_gordon = xyes -a x$have_valid_gordon = xno; then
 		AC_MSG_ERROR([cannot find Gordon])
 	fi
 
@@ -727,6 +795,7 @@ AC_MSG_RESULT($enable_debug)
 
 if test x$enable_debug = xyes; then
 	CFLAGS="$CFLAGS -O0"
+	AC_DEFINE(STARPU_SPINLOCK_CHECK, [1], [check spinlock use])
 else
 	CFLAGS="$CFLAGS -O3"
 fi
@@ -741,6 +810,14 @@ if test x$enable_fast = xyes; then
 	AC_DEFINE(STARPU_NO_ASSERT, [1], [disable assertions])
 fi
 
+AC_MSG_CHECKING(whether memory status should be displayed)
+AC_ARG_ENABLE(memory-status, [AS_HELP_STRING([--enable-memory-status],
+			     [display memory status at the end of execution])],
+			     enable_memory_status=$enableval, enable_memory_status=no)
+AC_MSG_RESULT($enable_memory_status)
+if test x$enable_memory_status = xyes; then
+        AC_DEFINE(STARPU_MEMORY_STATUS, [1], [display memory status])
+fi
 
 
 AC_MSG_CHECKING(whether debug messages should be displayed)
@@ -927,7 +1004,7 @@ AC_DEFINE_UNQUOTED(STARPU_NMAXWORKERS, [$nmaxworkers], [Maximum number of worker
 AC_MSG_CHECKING(maximum number of implementations)
 AC_ARG_ENABLE(maximplementations, [AS_HELP_STRING([--enable-maximplementations=<number>],
 		[maximum number of implementations])],
-		maximplementations=$enableval, maximplementations=1)
+		maximplementations=$enableval, maximplementations=4)
 AC_MSG_RESULT($maximplementations)
 AC_DEFINE_UNQUOTED(STARPU_MAXIMPLEMENTATIONS, [$maximplementations],
 		[maximum number of implementations])
@@ -1031,45 +1108,63 @@ fi
 #                                                                             #
 ###############################################################################
 
-build_starpu_top=no
-AC_PATH_PROGS([QMAKE], [qmake-qt4 qmake], [not-found])
-if test x$QMAKE != xnot-found; then
-	QMAKE_VERSION=`$QMAKE --version 2>&1 | head -n 1 | cut -d '.' -f 1 | cut -d ' ' -f 3`
-	if test $QMAKE_VERSION -ge 2 ; then
-		PKG_CHECK_EXISTS([QtGui QtNetwork QtOpenGL QtSql], [
-			QT_MAJVERSION=`$PKG_CONFIG --modversion QtGui | cut -d '.' -f 1`
-			QT_MINVERSION=`$PKG_CONFIG --modversion QtGui | cut -d '.' -f 2`
-			if test $QT_MAJVERSION -gt 4 -o \( $QT_MAJVERSION -eq 4 -a $QT_MINVERSION -ge 7 \) ; then
-				build_starpu_top=yes
-			fi
-			QWT_PRI=embed
-			AC_ARG_WITH(qwt-include-dir,
-				[AS_HELP_STRING([--with-qwt-include-dir=<path>],
-				[specify installed libqwt include path])],
-				[
-					STARPU_QWT_CPPFLAGS="-I$withval"
-					AC_SUBST(STARPU_QWT_CPPFLAGS)
-					QWT_PRI=system
-				])
-			AC_ARG_WITH(qwt-lib-dir,
-				[AS_HELP_STRING([--with-qwt-lib-dir=<path>],
-				[specify installed libqwt library path])],
-				[
-					STARPU_QWT_LDFLAGS="-L$withval"
-					QWT_PRI=system
-				])
-			AC_ARG_WITH(qwt-lib,
-				[AS_HELP_STRING([--with-qwt-lib=<path>],
-				[specify installed libqwt library name])],
-				[
-					STARPU_QWT_LDFLAGS="${STARPU_QWT_LDFLAGS} -l$withval"
-					QWT_PRI=system
-				])
-			AC_SUBST(QWT_PRI)
-		])
+AC_ARG_ENABLE([starpu-top],
+  [AS_HELP_STRING([--disable-starpu-top],
+    [build StarPU-Top])],
+  [enable_starpu_top="no"],
+  [enable_starpu_top="maybe"])
+
+# Check whether StarPU-Top can be built
+AC_MSG_CHECKING(for StarPU-Top)
+
+if test "x$enable_starpu_top" = "xmaybe" ; then
+	can_build_starpu_top=no
+	AC_PATH_PROGS([QMAKE], [qmake-qt4 qmake], [not-found])
+	if test x$QMAKE != xnot-found; then
+		QMAKE_VERSION=`$QMAKE --version 2>&1 | head -n 1 | cut -d '.' -f 1 | cut -d ' ' -f 3`
+		if test $QMAKE_VERSION -ge 2 ; then
+			PKG_CHECK_EXISTS([QtGui QtNetwork QtOpenGL QtSql], [
+				QT_MAJVERSION=`$PKG_CONFIG --modversion QtGui | cut -d '.' -f 1`
+				QT_MINVERSION=`$PKG_CONFIG --modversion QtGui | cut -d '.' -f 2`
+				if test $QT_MAJVERSION -gt 4 -o \( $QT_MAJVERSION -eq 4 -a $QT_MINVERSION -ge 7 \) ; then
+					can_build_starpu_top=yes
+				fi
+				QWT_PRI=embed
+				AC_ARG_WITH(qwt-include-dir,
+					[AS_HELP_STRING([--with-qwt-include-dir=<path>],
+					[specify installed libqwt include path])],
+					[
+						STARPU_QWT_INCLUDE="$withval"
+						AC_SUBST(STARPU_QWT_INCLUDE)
+						QWT_PRI=system
+					])
+				AC_ARG_WITH(qwt-lib-dir,
+					[AS_HELP_STRING([--with-qwt-lib-dir=<path>],
+					[specify installed libqwt library path])],
+					[
+						STARPU_QWT_LDFLAGS="-L$withval"
+						QWT_PRI=system
+					])
+				AC_ARG_WITH(qwt-lib,
+					[AS_HELP_STRING([--with-qwt-lib=<name>],
+					[specify installed libqwt library name])],
+					[
+						STARPU_QWT_LDFLAGS="${STARPU_QWT_LDFLAGS} -l$withval"
+						QWT_PRI=system
+					])
+				AC_SUBST(STARPU_QWT_LDFLAGS)
+				AC_SUBST(QWT_PRI)
+			])
+		fi
 	fi
 fi
 
+if test "x$enable_starpu_top" = "xmaybe" ; then
+  build_starpu_top=$can_build_starpu_top
+else
+  build_starpu_top=no
+fi
+
 AM_CONDITIONAL(BUILD_STARPU_TOP, test x$build_starpu_top = xyes)
 
 ###############################################################################
@@ -1088,7 +1183,7 @@ AC_DEFUN([IS_SUPPORTED_CFLAG],
 	AC_MSG_CHECKING([whether compiler support $1])
 
 	SAVED_CFLAGS="$CFLAGS"
-	CFLAGS="$1 -we10006"
+	CFLAGS="$1" # -we10006"
 
 	AC_COMPILE_IFELSE(
 		AC_LANG_PROGRAM(
@@ -1117,6 +1212,11 @@ if test "x$STARPU_DEVEL" != x; then
 	AC_DEFINE(STARPU_DEVEL, [1], [enable developer warnings])
 fi
 
+# Same value as Automake's, for use in other places.
+pkglibdir="\${libdir}/$PACKAGE"
+AC_SUBST([pkglibdir])
+
+
 ###############################################################################
 #                                                                             #
 #                               GCC extensions                                #
@@ -1127,61 +1227,114 @@ AC_ARG_ENABLE([gcc-extensions],
   [AS_HELP_STRING([--enable-gcc-extensions],
     [build the GCC plug-in that provides C language extensions (experimental)])],
   [enable_gcc_plugin="$enableval"],
-  [enable_gcc_plugin="no"])
+  [enable_gcc_plugin="maybe"])
 
-if test "x$enable_gcc_plugin" = "xyes"; then
-   STARPU_GCC_PLUGIN_SUPPORT
+if test "x$enable_gcc_plugin" = "xyes" -o "x$enable_gcc_plugin" = "xmaybe" ; then
+    STARPU_GCC_PLUGIN_SUPPORT
 
-   if test "x$ac_cv_have_gcc_plugins" != "xyes"; then
-     AC_MSG_ERROR([This compiler lacks GCC plug-in support.])
-   fi
+    if test "x$ac_cv_have_gcc_plugins" = "xno" ; then
+        if test "x$enable_gcc_plugin" = "xyes" ; then
+    	    # Since this was explicitly asked for, error out.
+            AC_MSG_ERROR([This compiler lacks GCC plug-in support.])
+	else
+	    AC_MSG_WARN([GCC plug-ins not supported; StarPU's GCC plug-in will not be built])
+        fi
+    else
+        # What GCC version are we using?
+        STARPU_GCC_VERSION
+
+        # The `.so' itself cannot be called `starpu-gcc.so' (because
+	# `-fplugin-arg-' option names and such must match the `.so'
+	# name), so use a meaningful directory name.
+	gccplugindir="\${pkglibdir}/${STARPU_EFFECTIVE_VERSION}/gcc/${STARPU_GCC_VERSION_MAJOR}.${STARPU_GCC_VERSION_MINOR}"
+	AC_SUBST([gccplugindir])
+
+	# Lines to be inserted in the `.pc' file.
+	GCC_PLUGIN_DIR_PKGCONFIG="gccplugindir=$gccplugindir"
+	GCC_PLUGIN_PKGCONFIG="gccplugin=\${gccplugindir}/starpu.so"
+	AC_SUBST([GCC_PLUGIN_DIR_PKGCONFIG])
+	AC_SUBST([GCC_PLUGIN_PKGCONFIG])
+    fi
+fi
 
-   build_gcc_plugin="yes"
 
-   # GNU Guile 1.8/2.0 is used to run the test suite.
-   AC_PATH_PROG([GUILE], [guile])
-   if test "x$GUILE" != "x"; then
-      run_gcc_plugin_test_suite="yes"
-   else
-      run_gcc_plugin_test_suite="no"
-   fi
+if test "x$ac_cv_have_gcc_plugins" = "xyes" ; then
+    build_gcc_plugin="yes"
+
+    # GNU Guile 1.8/2.0 is used to run the test suite.
+    AC_PATH_PROG([GUILE], [guile])
+    if test "x$GUILE" != "x"; then
+        if test "x$enable_cpu" = "xyes"; then
+	   run_gcc_plugin_test_suite="yes"
+	else
+	   AC_MSG_WARN([CPU back-end disabled; GCC plug-in test suite will not be run])
+	   run_gcc_plugin_test_suite="no"
+	fi
+    else
+	run_gcc_plugin_test_suite="no"
+    fi
 else
-   build_gcc_plugin="no"
-   run_gcc_plugin_test_suite="no"
+    build_gcc_plugin="no"
+    run_gcc_plugin_test_suite="no"
 fi
 
 # Bison is used to generate the C expression parser.  The generated
 # parser is part of the distribution, though.
-AC_PROG_YACC
+AM_MISSING_PROG([YACC], [bison])
 
 AM_CONDITIONAL([BUILD_GCC_PLUGIN], [test "x$build_gcc_plugin" = "xyes"])
 AM_CONDITIONAL([HAVE_GUILE], [test "x$GUILE" != "x"])
 
 ###############################################################################
 #                                                                             #
-#                               OpenCL interface                              #
+#                               SOCL interface                                #
 #                                                                             #
 ###############################################################################
 
 AC_ARG_ENABLE([socl],
   [AS_HELP_STRING([--enable-socl],
-    [build the OpenCL interface (SOCL)])],
+    [build the OpenCL interface (experimental)])],
   [enable_socl="$enableval"],
-  [enable_socl="no"])
+  [enable_socl="maybe"])
 
-if test "x$enable_socl" = "xyes"; then
-   STARPU_SOCL_SUPPORT
-   build_socl="yes"
+AC_MSG_CHECKING(for SOCL)
+
+if test "x$enable_socl" = "xyes" -o "x$enable_socl" = "xmaybe" ; then
+    if test "$have_valid_opencl" = "no" ; then
+	STARPU_LOOK_FOR_OPENCL()
+    fi
+fi
+
+# in case SOCL was explicitely required, but is not available, this is an error
+if test "x$enable_socl" = "xyes" -a "$have_valid_opencl" = "no" ; then
+    AC_MSG_ERROR([SOCL cannot be enabled without OpenCL])
+fi
+
+# now we enable SOCL if and only if a proper setup is available
+if test "x$enable_socl" = "xyes" -o "x$enable_socl" = "xmaybe" ; then
+   build_socl=$have_valid_opencl
 else
-   build_socl="no"
-   run_socl_test_suite="no"
+   build_socl=no
 fi
 
+AC_MSG_RESULT($build_socl)
 AM_CONDITIONAL([BUILD_SOCL], [test "x$build_socl" = "xyes"])
 AM_CONDITIONAL([STARPU_USE_SOCL], [test "x$build_socl" = "xyes"])
 
 ###############################################################################
 #                                                                             #
+#                                 Debugging                                   #
+#                                                                             #
+###############################################################################
+
+AC_PATH_PROG([GDB], [gdb], [not-found])
+if test "x$GDB" != "xnot-found"; then
+   AC_DEFINE_UNQUOTED([STARPU_GDB_PATH], ["$GDB"],
+     [Path to the GNU debugger.])
+fi
+
+###############################################################################
+#                                                                             #
 #                                  Examples                                   #
 #                                                                             #
 ###############################################################################
@@ -1203,10 +1356,10 @@ AC_SUBST(STARPU_OPENGL_RENDER, $enable_opengl_render)
 AC_MSG_RESULT($enable_opengl_render)

 AC_PATH_XTRA
-if test "x$x_includes" != "xNONE"; then
+if test "x$no_x" != "xyes"; then
 	AC_DEFINE(STARPU_HAVE_X11, [1], [enable X11])
 fi
-AM_CONDITIONAL([HAVE_X11], [test "x$x_includes" != "xNONE"])
+AM_CONDITIONAL([HAVE_X11], [test "x$no_x" != "xyes"])

 # In case there are BLAS kernels that are used by the example applications
 # we may specify which library to use. Note that this is not used for StarPU
@@ -1330,6 +1483,11 @@ AC_SUBST(BLAS_LIB,$blas_lib)
 have_fftw=no
 have_fftwf=no
 have_fftwl=no
+fft_support=no
+
+AC_ARG_ENABLE(starpufft, [AS_HELP_STRING([--disable-starpufft],
+			[Disable build of StarPU-FFT])],
+			enable_starpufft=$enableval,enable_starpufft=yes)

 PKG_CHECK_MODULES([FFTW],  [fftw3],  [
   AC_DEFINE([STARPU_HAVE_FFTW], [1], [Define to 1 if you have the libfftw3 library.])
@@ -1337,7 +1495,7 @@ PKG_CHECK_MODULES([FFTW],  [fftw3],  [
   have_fftw=yes
 ], [:])
 AM_CONDITIONAL(STARPU_HAVE_FFTW, [test x$have_fftw = xyes])
- 
+
 PKG_CHECK_MODULES([FFTWF], [fftw3f], [
   AC_DEFINE([STARPU_HAVE_FFTWF], [1], [Define to 1 if you have the libfftw3f library.])
   AC_SUBST([STARPU_HAVE_FFTWF], [1])
@@ -1352,6 +1510,11 @@ PKG_CHECK_MODULES([FFTWL], [fftw3l], [
 ], [:])
 AM_CONDITIONAL(STARPU_HAVE_FFTWL, [test x$have_fftwl = xyes])

+if test x$enable_starpufft = xyes -a \( \( x$enable_cpu = xyes -a x$have_fftw = xyes -a x$have_fftwf = xyes \) -o x$have_cufftdoublecomplex = xyes \); then
+   fft_support=yes
+fi
+AM_CONDITIONAL(BUILD_STARPUFFT, [test x$fft_support = xyes])
+
 ##########################################
 # hwloc                                  #
 ##########################################
@@ -1407,10 +1570,56 @@ AC_ARG_ENABLE(optional_tests, [AS_HELP_STRING([--optional-tests],
 AC_MSG_RESULT($want_optional_tests)
 AM_CONDITIONAL([COND_OPT], [test "$want_optional_tests" = yes])

+# Check if icc is available
+AC_CHECK_PROGS([ICC], [icc])
+
+# If cuda and icc are both available, check they are compatible
+if test "$enable_cuda" = "yes" -a "$ICC" != ""; then
+   AC_MSG_CHECKING(whether CUDA and ICC are compatible)
+   OLD_CC="$CC"
+   CC="$ICC"
+   AC_COMPILE_IFELSE(
+       AC_LANG_PROGRAM(
+	   [[#include <cuda.h>]],
+	   [[]]
+	   ),
+       AC_MSG_RESULT(yes),
+       [ICC=""
+           AC_MSG_RESULT(no)]
+   )
+   CC="$OLD_CC"
+fi
+
+# Disable ICC on windows
+if test "x$ICC" != "x" -a "$starpu_windows" = "yes" ; then
+    ICC=""
+fi
+if test "x$ICC" != "x"; then
+  AC_DEFINE(STARPU_HAVE_ICC, [], [Define this if icc is available])
+fi
+AM_CONDITIONAL([STARPU_HAVE_ICC], [test "x$ICC" != "x"])
+
+# Do not generate manpages for the tools if we do not have help2man
+AC_CHECK_PROGS([HELP2MAN], [help2man])
+# Disable on windows
+if test "$starpu_windows" = "yes" ; then
+    HELP2MAN=""
+fi
+AM_CONDITIONAL([STARPU_HAVE_HELP2MAN], [test "x$HELP2MAN" != "x"])
+
+AC_CHECK_MEMBER([struct cudaDeviceProp.pciDomainID],
+  AC_DEFINE([STARPU_HAVE_DOMAINID],[1],[Define to 1 if CUDA device properties include DomainID]),
+  , [[#include <cuda_runtime_api.h>]])
+
+AC_CHECK_MEMBER([struct cudaDeviceProp.pciBusID],
+  AC_DEFINE([STARPU_HAVE_BUSID],[1],[Define to 1 if CUDA device properties include BusID]),
+  , [[#include <cuda_runtime_api.h>]])
+
 # File configuration
 AC_CONFIG_COMMANDS([executable-scripts], [
   chmod +x tests/regression/regression.sh
   chmod +x gcc-plugin/tests/run-test
+  chmod +x tools/starpu_workers_activity
 ])

 AC_CONFIG_FILES(tests/regression/regression.sh tests/regression/profiles tests/regression/profiles.build.only)
@@ -1420,19 +1629,27 @@ AC_OUTPUT([
 	Makefile
 	src/Makefile
 	tools/Makefile
+	tools/starpu_workers_activity
 	socl/Makefile
 	socl/src/Makefile
+	socl/examples/Makefile
+        socl/socl-1.0.pc
 	libstarpu.pc
+	starpu-1.0.pc
+	mpi/libstarpumpi.pc
+	mpi/starpumpi-1.0.pc
+	starpufft/Makefile
+	starpufft/libstarpufft.pc
+	starpufft/starpufft-1.0.pc
 	examples/Makefile
        examples/opt/Makefile
-	examples/starpufft/Makefile
 	examples/stencil/Makefile
-	examples/socl/Makefile
 	tests/Makefile
        tests/opt/Makefile
 	doc/Makefile
 	mpi/Makefile
 	starpu-top/StarPU-Top.pro
+	starpu-top/StarPU-Top-qwt-system.pri
        gcc-plugin/Makefile
 	gcc-plugin/src/Makefile
 	gcc-plugin/tests/Makefile
@@ -1450,9 +1667,6 @@ AC_MSG_NOTICE([
 	OpenCL enabled: $enable_opencl
 	Cell   enabled: $enable_gordon

-	GCC plug-in: $build_gcc_plugin
-	GCC plug-in test suite: $run_gcc_plugin_test_suite
-
 	Compile-time limits
 	(change these with --enable-maxcpus, --enable-maxcudadev,
 	--enable-maxopencldev, --enable-maxbuffers)
@@ -1466,12 +1680,17 @@ AC_MSG_NOTICE([
 	GPU-GPU transfers: $have_cuda_memcpy_peer
 	Allocation cache:  $enable_allocation_cache

-	MPI enabled:   $use_mpi
-	SOCL enabled:  $build_socl
 	Magma enabled: $have_magma
 	BLAS library:  $blas_lib
 	hwloc:         $have_valid_hwloc
-
 	FxT trace enabled: $use_fxt
 	StarPU-Top:        $build_starpu_top
+
+	StarPU Extensions:
+	       MPI enabled:   $use_mpi
+	       MPI test suite: $running_mpi_check
+	       FFT Support: $fft_support
+	       GCC plug-in: $build_gcc_plugin
+	       GCC plug-in test suite: $run_gcc_plugin_test_suite
+	       SOCL enabled:  $build_socl
 ])

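Taken together, the configure.ac changes above turn SOCL into an auto-detected feature (it is built whenever a usable OpenCL setup is found, and --enable-socl only turns a missing OpenCL into a hard error), probe for gdb, icc and help2man, wire in the new StarPU-FFT library and the versioned pkg-config files, and regroup the extension status lines in the final summary. A minimal sketch of exercising the new switches, assuming a source tree where the usual autotools bootstrap has already been run:

    # Default: OpenCL is probed and SOCL is enabled only if the probe succeeds.
    ./configure
    # Force the decision instead of relying on auto-detection:
    ./configure --enable-socl          # abort if no usable OpenCL is found
    ./configure --disable-starpufft    # skip building the new StarPU-FFT library
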
+ 2 - 2
libstarpu.pc.in

@@ -6,8 +6,8 @@ includedir=@includedir@
 Name: starpu
 Description: offers support for heterogeneous multicore architecture
 Version: @PACKAGE_VERSION@
-Cflags: -I${includedir} @STARPU_CUDA_CPPFLAGS@
-Libs: -L${libdir} -lstarpu @STARPU_CUDA_LDFLAGS@ @STARPU_OPENCL_LDFLAGS@ @STARPU_SCHED_CTX_HYPERVISOR@
+Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_CPPFLAGS@ -DSTARPU_USE_DEPRECATED_API
+Libs: -L${libdir} -lstarpu-@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_LDFLAGS@ @STARPU_OPENCL_LDFLAGS@ @STARPU_SCHED_CTX_HYPERVISOR@
 Libs.private: @LDFLAGS@ @LIBS@
 Requires: @HWLOC_REQUIRES@
 Requires.private: @GORDON_REQUIRES@

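The legacy libstarpu.pc now points at the versioned include directory and the renamed library, and adds -DSTARPU_USE_DEPRECATED_API so that applications written against the old headers keep building unchanged. A hedged usage sketch, assuming the installed .pc files are on PKG_CONFIG_PATH and a hypothetical legacy_app.c:

    gcc legacy_app.c -o legacy_app \
        $(pkg-config --cflags libstarpu) $(pkg-config --libs libstarpu)
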
+ 35 - 0
starpu-1.0.pc.in

@@ -0,0 +1,35 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
+# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+pkglibdir=@pkglibdir@
+includedir=@includedir@
+
+# When the GCC plug-in is available, the following lines indicate
+# where it is installed.
+@GCC_PLUGIN_DIR_PKGCONFIG@
+@GCC_PLUGIN_PKGCONFIG@
+
+Name: starpu
+Description: offers support for heterogeneous multicore architecture
+Version: @PACKAGE_VERSION@
+Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_CPPFLAGS@
+Libs: -L${libdir} -lstarpu-@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_LDFLAGS@ @STARPU_OPENCL_LDFLAGS@ @STARPU_SCHED_CTX_HYPERVISOR@
+Libs.private: @LDFLAGS@ @LIBS@
+Requires: @HWLOC_REQUIRES@
+Requires.private: @GORDON_REQUIRES@

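starpu-1.0.pc is the new versioned module: the same layout as libstarpu.pc but without the deprecated-API define, and it also exports the GCC plug-in location when that plug-in is built. New code would typically be compiled against it; an illustrative one-liner for a hypothetical app.c:

    gcc app.c -o app $(pkg-config --cflags starpu-1.0) $(pkg-config --libs starpu-1.0)
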
+ 1 - 1
starpu-top/StarPU-Top-common.pri

@@ -28,7 +28,7 @@ SOURCES += $$SRCDIR/main.cpp \
     $$SRCDIR/aboutdialog.cpp
 HEADERS += $$SRCDIR/mainwindow.h \
 #STARPU-TOP
-    $$SRCDIR/starputoptypes.h \
+    $$SRCDIR/starpu_top_types.h \
     $$SRCDIR/widgetwindowsmanager.h \
     $$SRCDIR/configurationmanager.h \
     $$SRCDIR/communicationthread.h \

+ 0 - 2
starpu-top/StarPU-Top-qwt-system.pri

@@ -1,2 +0,0 @@
-LIBS += -lqwt-qt4
-INCLUDEPATH += /usr/include/qwt-qt4

+ 2 - 0
starpu-top/StarPU-Top-qwt-system.pri.in

@@ -0,0 +1,2 @@
+LIBS += @STARPU_QWT_LDFLAGS@
+INCLUDEPATH += @STARPU_QWT_INCLUDE@

+ 1 - 1
starpu-top/aboutdialog.ui

@@ -112,7 +112,7 @@
       <string/>
      </property>
      <property name="pixmap">
-      <pixmap resource="resources.qrc">:/images/starputop.png</pixmap>
+      <pixmap resource="resources.qrc">:/images/starpu_top.png</pixmap>
      </property>
      <property name="scaledContents">
       <bool>true</bool>

+ 3 - 3
starpu-top/communicationmanager.cpp

@@ -70,7 +70,7 @@ void CommunicationManager::initializeSession()
 {
     _dataDescriptions = new QList<DataDescription*> ();
     _paramDescriptions = new QList<ParamDescription*> ();
-    _serverDevices = new QList<StarputopDevice> ;
+    _serverDevices = new QList<starpu_top_device> ;

     _serverInfoMsgCount = 0;
     _state = COM_STATE_INIT;
@@ -665,7 +665,7 @@ void CommunicationManager::parseInitDevMessage(QString messageString)
         Q_ASSERT_X(ok == true, "CommunicationManager::parseInitDevMessage()",
                    "Bogus message received in INIT DEV");

-        StarputopDeviceType deviceType;
+        starpu_top_device_type deviceType;

         Q_ASSERT_X(
                 deviceTypeString.compare(
@@ -701,7 +701,7 @@ void CommunicationManager::parseInitDevMessage(QString messageString)
             deviceType = SERVERDEVICE_GORDON;
         }

-        StarputopDevice device;
+        starpu_top_device device;
         device.id = deviceId;
         device.type = deviceType;
         device.name = deviceNameString;

+ 3 - 3
starpu-top/communicationmanager.h

@@ -27,7 +27,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 #define COMMUNICATIONMANAGER_H

 #include <QTcpSocket>
-#include "starputoptypes.h"
+#include "starpu_top_types.h"

 class CommunicationManager : public QTcpSocket
 { /* Receives protocol messages from server, parses them
@@ -54,7 +54,7 @@ private:
     qlonglong _serverTimestamp;
     QList<DataDescription*> *_dataDescriptions;
     QList<ParamDescription*> *_paramDescriptions;
-    QList<StarputopDevice> *_serverDevices;
+    QList<starpu_top_device> *_serverDevices;
     // Communication states
     CommunicationState _state;
     bool _initServerInfoCompleted;
@@ -125,7 +125,7 @@ signals:
     void serverInitCompleted(QString serverID,
                              QList<DataDescription*> *dataDescriptions,
                              QList<ParamDescription*> *paramDescriptions,
-                             QList<StarputopDevice> *serverDevices);
+                             QList<starpu_top_device> *serverDevices);
     // Notify GUI with a protocol message
     // Protocol error
     void protocolError(QString errorMessage);

+ 3 - 3
starpu-top/communicationthread.cpp

@@ -30,7 +30,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 #include "configurationmanager.h"
 #include "mainwindow.h"
 #include "communicationmanager.h"
-#include "starputoptypes.h"
+#include "starpu_top_types.h"

 const int MAX_CONNECTION_ATTEMPTS = 10;

@@ -103,12 +103,12 @@ void CommunicationThread::createNewCommunicationManager(void)
                      SIGNAL(serverInitCompleted(QString,
                                                 QList<DataDescription*>*,
                                                 QList<ParamDescription*>*,
-                                                QList<StarputopDevice>*)),
+                                                QList<Starpu_TopDevice>*)),
                      _mainWindow, SLOT(initClient(
                              QString,
                              QList<DataDescription*>*,
                              QList<ParamDescription*>*,
-                             QList<StarputopDevice>*)));
+                             QList<Starpu_TopDevice>*)));
     // Output data
     QObject::connect(_mainWindow, SIGNAL(clientLaunched()),
                      _communicationManager, SLOT(sendGoMessage()));

+ 1 - 1
starpu-top/configurationmanager.h

@@ -29,7 +29,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 #include <QSettings>

 static const QString CONFIG_FILE_DIR = ".";
-static const QString CONFIG_FILE_NAME = "starputop.cfg";
+static const QString CONFIG_FILE_NAME = "starpu_top.cfg";

 class ConfigurationManager
 { /* Contains and manages all the application settings

+ 1 - 1
starpu-top/dataaggregatorwidget.h

@@ -34,7 +34,7 @@ class QwtPlot;

 #include <QHash>
 #include <QAction>
-#include "starputoptypes.h"
+#include "starpu_top_types.h"
 #include "abstractwidgetwindow.h"

 class DataAggregatorWidget : public AbstractWidgetWindow

+ 1 - 1
starpu-top/datawidget.h

@@ -31,7 +31,7 @@ class WidgetWindowsManager;
 class QwtPlotCurve;
 class QwtPlot;

-#include "starputoptypes.h"
+#include "starpu_top_types.h"
 #include "abstractwidgetwindow.h"

 class DataWidget : public AbstractWidgetWindow

+ 2 - 4
starpu-top/extradist

@@ -9,9 +9,8 @@ EXTRA_DIST	+=	\
                 starpu-top/abstractwidgetwindow.cpp     \
                 starpu-top/communicationthread.h        \
                 starpu-top/configurationmanager.cpp     \
-                starpu-top/starputoptypes.h             \
+                starpu-top/starpu_top_types.h             \
                 starpu-top/mainwindow.ui                \
-                starpu-top/debug                        \
                 starpu-top/mainwindow.cpp               \
                 starpu-top/sessionsetupmanager.cpp      \
                 starpu-top/resources.qrc                \
@@ -19,7 +18,7 @@ EXTRA_DIST	+=	\
                 starpu-top/images/connect.png           \
                 starpu-top/images/debugon.png           \
                 starpu-top/images/help.png              \
-                starpu-top/images/starputop.png         \
+                starpu-top/images/starpu_top.png         \
                 starpu-top/images/widget.png            \
                 starpu-top/images/lock.png              \
                 starpu-top/images/about.png             \
@@ -45,7 +44,6 @@ EXTRA_DIST	+=	\
                 starpu-top/debugconsole.ui                      \
                 starpu-top/dataaggregatorwidget.cpp             \
                 starpu-top/datawidget.cpp                       \
-                starpu-top/release                              \
                 starpu-top/datawidget.h                         \
                 starpu-top/debugconsole.cpp                     \
                 starpu-top/ganttwidget.h                        \

+ 9 - 9
starpu-top/ganttwidget.cpp

@@ -469,7 +469,7 @@ void GanttWidget::drawFromTime(QPainter *painter, qlonglong timestamp)
         borneBefore = 0;
     }
     _tasks = _taskManager->tasks(borneBefore, _timePresent);
-    foreach(StarputopTask t, _tasks)
+    foreach(starpu_top_task t, _tasks)
     {
 	drawWorkPU(painter,t);
     }
@@ -526,7 +526,7 @@
 }

 /* draw forecasted working time for each processor */
-void GanttWidget::drawPrevWorkPU(QPainter *painter, StarputopTask t)
+void GanttWidget::drawPrevWorkPU(QPainter *painter, starpu_top_task t)
 {
     int starty = HEIGHT_TIME_AXIS + MARGIN;
     int widthAllowed = size().width() - WIDTH_PROGRAM - MARGIN
@@ -623,7 +623,7 @@ void GanttWidget::drawPrevWorkPU(QPainter *painter, StarputopTask t)
  we haven't to test if they are displayable or not. We just have to calculate
  which part of time is displayable.
  The task t has its begin or its end between time Before and timePresent */
-void GanttWidget::drawWorkPU(QPainter *painter, StarputopTask t)
+void GanttWidget::drawWorkPU(QPainter *painter, starpu_top_task t)
 {
     int starty = HEIGHT_TIME_AXIS + MARGIN;
     int widthAllowed = size().width() - WIDTH_PROGRAM - MARGIN
@@ -783,12 +783,12 @@ void GanttWidget::countPUs()
     _numPUs = length;
     delete _PUsByDevice;
     delete _PUsByPos;
-    _PUsByDevice = new StarputopDevice[length];
-    _PUsByPos = new StarputopDevice[length];
+    _PUsByDevice = new starpu_top_device[length];
+    _PUsByPos = new starpu_top_device[length];
     int pos = 0;

     /* CPUs */
-    foreach(StarputopDevice sD,*_mainWindow->serverDevices())
+    foreach(starpu_top_device sD,*_mainWindow->serverDevices())
     {
 	if(sD.type == 0)
 	{
@@ -806,7 +806,7 @@ void GanttWidget::countPUs()
     }

     /* GPUs */
-    foreach (StarputopDevice sD , *_mainWindow->serverDevices())
+    foreach (starpu_top_device sD , *_mainWindow->serverDevices())
     {
 	if(sD.type == 1 || sD.type == 2)
 	{
@@ -855,7 +855,7 @@ void GanttWidget::paint(QPainter *painter, QPaintEvent *event)
             }

             _tasks = _taskManager->tasks(borneBefore, _timePresent);
-            foreach (StarputopTask t, _tasks)
+            foreach (starpu_top_task t, _tasks)
             {
                 drawWorkPU(painter,t);
             }
@@ -863,7 +863,7 @@
             /* Future past */
             qlonglong borneAfter = _timePresent + _timeAfter;
             _tasks = _taskManager->prevTasks(_timePresent, borneAfter);
-            foreach		(StarputopTask t, _tasks)
+            foreach		(starpu_top_task t, _tasks)
             {
                 drawPrevWorkPU(painter,t);
             }

+ 6 - 6
starpu-top/ganttwidget.h

@@ -31,7 +31,7 @@ class TaskManager;

 #include <QGLWidget>
 #include <QPainter>
-#include "starputoptypes.h"
+#include "starpu_top_types.h"

 class GanttWidget : public QGLWidget
 {
@@ -58,9 +58,9 @@ protected:
     void drawTime(QPainter *painter);
     void drawProgram(QPainter *painter);
     void resizeGL (int width,int height);
-    void drawWorkPU(QPainter *painter, StarputopTask t);
+    void drawWorkPU(QPainter *painter, starpu_top_task t);
     void drawIdlePU(QPainter *painter);
-    void drawPrevWorkPU(QPainter *painter, StarputopTask t);
+    void drawPrevWorkPU(QPainter *painter, starpu_top_task t);
     void defaultScreen(QPainter *painter);
     void drawPresentLine(QPainter *painter);
     int computeTimeInterval(int timeTotal);
@@ -84,7 +84,7 @@ private:
     qreal _coordxPresentLine;
     int _numPUs;
     bool _wasRunning;
-    QList<StarputopTask> _tasks;
+    QList<starpu_top_task> _tasks;
     int _timeTotal;
     int _timeAfter;
     int _timeBefore;
@@ -92,8 +92,8 @@
     QTimer *_timer;
     qlonglong _timePresent;
     qlonglong _timeToShow;
-    StarputopDevice *_PUsByDevice;
-    StarputopDevice *_PUsByPos;
+    starpu_top_device *_PUsByDevice;
+    starpu_top_device *_PUsByPos;
     int _numCPUs;
     int _numGPUs;
     bool _initCompleted;

starpu-top/images/starputop.png → starpu-top/images/starpu_top.png


+ 1 - 1
starpu-top/interactivewidget.h

@@ -30,7 +30,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 #include <QCloseEvent>
 #include <QLabel>
 #include <QHBoxLayout>
-#include "starputoptypes.h"
+#include "starpu_top_types.h"

 class MainWindow;


+ 11 - 11
starpu-top/mainwindow.cpp

@@ -61,7 +61,7 @@ MainWindow::MainWindow(QWidget *parent) :
     _dataAggregatorWidgets = new QList<QPointer<DataAggregatorWidget> > ();
     _dataDescriptions = new QList<DataDescription*> ();
     _paramDescriptions = new QList<ParamDescription*> ();
-    _serverDevices = new QList<StarputopDevice> ();
+    _serverDevices = new QList<starpu_top_device> ();
     _nbDataWidgets = _nbInteractiveWidgets = _nbDataAggregatorWidgets = 0;

     // Init managers
@@ -97,18 +97,18 @@ MainWindow::MainWindow(QWidget *parent) :
     QObject::connect(settingsAction, SIGNAL(triggered()), this,
                      SLOT(on_actionPreferences_triggered()));
     connectButton->addAction(settingsAction);
-    ui->menuStarputop->addAction(_actionConnect);
+    ui->menu_starpu_top->addAction(_actionConnect);
     // Action launch
     _actionLaunch = ui->mainToolBar->addAction(QIcon(":/images/play.png"),
                                                tr("Launch StarPU"));
     _actionLaunch->setIconText("Launch StarPU");
     _actionLaunch->setToolTip("Launch StarPU");
     _actionLaunch->setShortcut(QKeySequence("Ctrl+L"));
-    ui->menuStarputop->addAction(_actionLaunch);
+    ui->menu_starpu_top->addAction(_actionLaunch);
     QObject::connect(_actionLaunch, SIGNAL(triggered()), this,
                      SLOT(on_actionLaunch_StarPU_triggered()));
     ui->mainToolBar->addSeparator();
-    ui->menuStarputop->addSeparator();
+    ui->menu_starpu_top->addSeparator();
     // Action debug
     _actionDebug = ui->mainToolBar->addAction(QIcon(":/images/debugon.png"),
                                               tr("Enable debug"));
@@ -116,7 +116,7 @@ MainWindow::MainWindow(QWidget *parent) :
     _actionDebug->setToolTip("Enable debug");
     _actionDebug->setShortcut(QKeySequence("Ctrl+D"));
     _actionDebug->setCheckable(true);
-    ui->menuStarputop->addAction(_actionDebug);
+    ui->menu_starpu_top->addAction(_actionDebug);
     QObject::connect(_actionDebug, SIGNAL(toggled(bool)),
                      this, SLOT(on_actionDebug_triggered(bool)));
     // Action save session setup
@@ -125,7 +125,7 @@ MainWindow::MainWindow(QWidget *parent) :
     _actionSaveSessionSetup->setIconText("Save session setup");
     _actionSaveSessionSetup->setToolTip("Save session setup");
     _actionSaveSessionSetup->setShortcut(QKeySequence("Ctrl+S"));
-    ui->menuStarputop->addAction(_actionSaveSessionSetup);
+    ui->menu_starpu_top->addAction(_actionSaveSessionSetup);
     QObject::connect(_actionSaveSessionSetup, SIGNAL(triggered()), this,
                      SLOT(on_actionSaveSessionSetup_triggered()));
     // Action add data aggregator widget
@@ -135,13 +135,13 @@ MainWindow::MainWindow(QWidget *parent) :
     _actionAddDataAggregatorWidget->setIconText("Add data aggregator widget");
     _actionAddDataAggregatorWidget->setToolTip("Add data aggregator widget");
     _actionAddDataAggregatorWidget->setShortcut(QKeySequence("Ctrl+G"));
-    ui->menuStarputop->addAction(_actionAddDataAggregatorWidget);
+    ui->menu_starpu_top->addAction(_actionAddDataAggregatorWidget);
     QObject::connect(_actionAddDataAggregatorWidget, SIGNAL(triggered()), this,
                      SLOT(on_actionAddDataAggregatorWidget_triggered()));
     ui->mainToolBar->addSeparator();
-    ui->menuStarputop->addSeparator();
+    ui->menu_starpu_top->addSeparator();
     // Action quit
-    QAction *actionQuit = ui->menuStarputop->addAction(
+    QAction *actionQuit = ui->menu_starpu_top->addAction(
             QIcon(":/images/quit.png"), tr("Quit"));
     actionQuit->setIconText("Quit");
     actionQuit->setToolTip("Quit");
@@ -540,7 +540,7 @@ void MainWindow::synchronizeSessionTime(qlonglong serverTimestamp)
 void MainWindow::initClient(QString serverID,
                             QList<DataDescription*> *dataDescriptions,
                             QList<ParamDescription*> *paramDescriptions,
-                            QList<StarputopDevice> *serverDevices)
+                            QList<starpu_top_device> *serverDevices)
 {
     _serverID = serverID;
     _dataDescriptions = dataDescriptions;
@@ -1213,7 +1213,7 @@ ParamDescription *MainWindow::paramDescriptionFromId(int paramId)
     return 0;
 }

-const QList<StarputopDevice> *MainWindow::serverDevices() const
+const QList<starpu_top_device> *MainWindow::serverDevices() const
 {
     return _serverDevices;
 }

+ 4 - 4
starpu-top/mainwindow.h

@@ -49,7 +49,7 @@ class TaskManager;
 #include <QAbstractSocket>
 #include <QTime>
 #include <QSpinBox>
-#include "starputoptypes.h"
+#include "starpu_top_types.h"

 namespace Ui
 {
@@ -79,7 +79,7 @@ public:
     const QList<ParamDescription*> *paramDescriptions() const;
     DataDescription *dataDescriptionFromId(int dataId);
     ParamDescription *paramDescriptionFromId(int interactiveId);
-    const QList<StarputopDevice> *serverDevices() const;
+    const QList<starpu_top_device> *serverDevices() const;
     // Get different widgets metadata
     const QHash<DataWidgetType, QString> *dataWidgetNames() const;
     const QHash<DataType, QSet<DataWidgetType> >
@@ -166,7 +166,7 @@ private:
     // Different descriptions
     QList<DataDescription*> *_dataDescriptions;
     QList<ParamDescription*> *_paramDescriptions;
-    QList<StarputopDevice> *_serverDevices;
+    QList<starpu_top_device> *_serverDevices;
     int _nbDataWidgets;
     int _nbDataAggregatorWidgets;
     int _nbInteractiveWidgets;
@@ -233,7 +233,7 @@ public slots:
     void initClient(QString serverID,
                     QList<DataDescription*> *dataDescriptions,
                     QList<ParamDescription*> *paramDescriptions,
-                    QList<StarputopDevice> *serverDevices);
+                    QList<starpu_top_device> *serverDevices);
     // Connection events handlers
     void connectionSucceeded();
     void connectionAborted(QString message);

+ 3 - 3
starpu-top/mainwindow.ui

@@ -21,7 +21,7 @@
   </property>
   <property name="windowIcon">
    <iconset resource="resources.qrc">
-    <normaloff>:/images/starputop.png</normaloff>:/images/starputop.png</iconset>
+    <normaloff>:/images/starpu_top.png</normaloff>:/images/starpu_top.png</iconset>
   </property>
   <widget class="QWidget" name="centralWidget">
    <layout class="QGridLayout" name="gridLayout_2">
@@ -42,7 +42,7 @@
      <height>21</height>
     </rect>
    </property>
-   <widget class="QMenu" name="menuStarputop">
+   <widget class="QMenu" name="menu_starpu_top">
     <property name="title">
      <string>StarPU-Top</string>
     </property>
@@ -59,7 +59,7 @@
     </property>
     <addaction name="actionPreferences"/>
    </widget>
-   <addaction name="menuStarputop"/>
+   <addaction name="menu_starpu_top"/>
    <addaction name="menuDisplay"/>
    <addaction name="menuHelp"/>
   </widget>

+ 1 - 1
starpu-top/preferencesdialog.h

@@ -33,7 +33,7 @@ class SessionSetupManager;
 #include <QMetaType>
 #include <QDialog>
 #include <QComboBox>
-#include "starputoptypes.h"
+#include "starpu_top_types.h"

 namespace Ui
 {

+ 1 - 1
starpu-top/resources.qrc

@@ -13,7 +13,7 @@
         <file>images/add.png</file>
         <file>images/remove.png</file>
         <file>images/widget.png</file>
-        <file>images/starputop.png</file>
+        <file>images/starpu_top.png</file>
         <file>images/windows.png</file>
         <file>images/lock.png</file>
     </qresource>

+ 1 - 1
starpu-top/sessionsetupmanager.h

@@ -28,7 +28,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA

 class MainWindow;

-#include "starputoptypes.h"
+#include "starpu_top_types.h"
 #include <QSettings>

 static const QString SESSION_SETUPS_DIR = "./sessionsetups";

+ 7 - 7
starpu-top/starputoptypes.h

@@ -23,8 +23,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 */


-#ifndef STARPUTOPTYPES_H
-#define STARPUTOPTYPES_H
+#ifndef STARPU_TOP_TYPES_H
+#define STARPU_TOP_TYPES_H

 #include <QString>
 #include <QStringList>
@@ -112,7 +112,7 @@ enum ParamType
     PARAM_TYPE_ENUM = 4,
 };

-enum StarputopDeviceType
+enum starpu_top_device_type
 {
     SERVERDEVICE_CPU = 0,
     SERVERDEVICE_CUDA = 1,
@@ -124,9 +124,9 @@ enum StarputopDeviceType
 typedef struct
 {
     int id;
-    StarputopDeviceType type;
+    starpu_top_device_type type;
     QString name;
-} StarputopDevice;
+} starpu_top_device;

 // Server tasks
 typedef struct
@@ -135,7 +135,7 @@ typedef struct
     int deviceId;
     qlonglong timestampStart;
     qlonglong timestampEnd;
-} StarputopTask;
+} starpu_top_task;

 // Descriptions
 typedef struct
@@ -318,4 +318,4 @@ typedef struct
     QList<int> dataIds;
 } DataAggregatorWidgetSetup;

-#endif // STARPUTOPTYPES_H
+#endif // STARPU_TOP_TYPES_H

+ 7 - 7
starpu-top/taskmanager.cpp

@@ -171,10 +171,10 @@ void TaskManager::addTaskEnd(int taskId, qlonglong timestampEnd)
     }
 }

-QList<StarputopTask> TaskManager::tasks(qlonglong timestampStart,
-                                        qlonglong timestampEnd)
+QList<starpu_top_task> TaskManager::tasks(qlonglong timestampStart,
+					  qlonglong timestampEnd)
 {
-    QList < StarputopTask > tasks;
+    QList < starpu_top_task > tasks;

     _selectTasksQuery.addBindValue(timestampStart);
     _selectTasksQuery.addBindValue(timestampEnd);
@@ -206,7 +206,7 @@ QList<StarputopTask> TaskManager::tasks(qlonglong timestampStart,
             qlonglong timestampEnd =
                     _selectTasksQuery.value(endField).toLongLong();

-            StarputopTask task;
+            starpu_top_task task;
             task.taskId = taskId;
             task.deviceId = deviceId;
             task.timestampStart = timestampStart;
@@ -220,10 +220,10 @@ QList<StarputopTask> TaskManager::tasks(qlonglong timestampStart,
     return tasks;
 }

-QList<StarputopTask> TaskManager::prevTasks(qlonglong timestampStart,
+QList<starpu_top_task> TaskManager::prevTasks(qlonglong timestampStart,
                                             qlonglong timestampEnd)
 {
-    QList < StarputopTask > prevTasks;
+    QList < starpu_top_task > prevTasks;

     _selectPrevTasksQuery.addBindValue(timestampStart);
     _selectPrevTasksQuery.addBindValue(timestampEnd);
@@ -255,7 +255,7 @@ QList<StarputopTask> TaskManager::prevTasks(qlonglong timestampStart,
             qlonglong timestampEnd =
                     _selectPrevTasksQuery.value(endField).toLongLong();

-            StarputopTask prevTask;
+            starpu_top_task prevTask;
             prevTask.taskId = taskId;
             prevTask.deviceId = deviceId;
             prevTask.timestampStart = timestampStart;

+ 3 - 3
starpu-top/taskmanager.h

@@ -26,7 +26,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 #ifndef TASKMANAGER_H
 #define TASKMANAGER_H

-#include "starputoptypes.h"
+#include "starpu_top_types.h"
 #include <QDebug>
 #include <QtSql/QSqlDatabase>
 #include <QtSql/QSqlQuery>
@@ -46,9 +46,9 @@ public:
     void addTaskStart(int taskId, int deviceId, qlonglong timestampStart);
     void addTaskEnd(int taskId, qlonglong timestampEnd);
     // Getters
-    QList<StarputopTask> tasks(qlonglong timestampStart,
+    QList<starpu_top_task> tasks(qlonglong timestampStart,
                                qlonglong timestampEnd);
-    QList<StarputopTask> prevTasks(qlonglong timestampStart,
+    QList<starpu_top_task> prevTasks(qlonglong timestampStart,
                                    qlonglong timestampEnd);

 private:

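The starpu-top changes above are essentially one renaming pass: starputoptypes.h becomes starpu_top_types.h, StarputopDevice/StarputopDeviceType/StarputopTask become starpu_top_device/starpu_top_device_type/starpu_top_task, the starputop.png image, starputop.cfg file and menuStarputop widget move to the starpu_top naming, and the hard-coded qwt-qt4 flags are replaced by configure substitutions. A quick, purely illustrative way to look for leftover occurrences of the old names in a working copy:

    grep -rn 'Starputop\|starputoptypes\|starputop\.' starpu-top/ \
        || echo "no stale references"
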
+ 1 - 0
starpufft/.gitignore

@@ -0,0 +1 @@
+/.deps

+ 97 - 0
starpufft/Makefile.am

@@ -0,0 +1,97 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009-2012  Université de Bordeaux 1
+# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_builddir)/include $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
+
+lib_LTLIBRARIES = libstarpufft-@STARPU_EFFECTIVE_VERSION@.la
+
+EXTRA_DIST =			\
+	float.h			\
+	double.h		\
+	cudax_kernels.h		\
+	starpufftx.c		\
+	starpufftx1d.c		\
+	starpufftx2d.c		\
+	cuda_kernels.cu		\
+	cudaf_kernels.cu	\
+	cudax_kernels.cu	\
+	examples/testx.c	\
+	examples/testx_threads.c\
+	examples/testf_threads.c\
+	examples/test_threads.c
+
+versincludedir = $(includedir)/starpu/$(STARPU_EFFECTIVE_VERSION)
+versinclude_HEADERS = 				\
+	starpufft.h
+
+pkgconfigdir = $(libdir)/pkgconfig
+pkgconfig_DATA = libstarpufft.pc starpufft-1.0.pc
+
+libstarpufft_@STARPU_EFFECTIVE_VERSION@_la_SOURCES = starpufft.c starpufftf.c starpufft_common.c
+libstarpufft_@STARPU_EFFECTIVE_VERSION@_la_LIBADD = $(top_builddir)/src/libstarpu-@STARPU_EFFECTIVE_VERSION@.la $(FFTW_LIBS) $(FFTWF_LIBS) $(STARPU_CUDA_LDFLAGS) $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUFFT_LDFLAGS)
+libstarpufft_@STARPU_EFFECTIVE_VERSION@_la_CFLAGS = $(FFTWF_CFLAGS)
+libstarpufft_@STARPU_EFFECTIVE_VERSION@_la_LDFLAGS = $(ldflags) -no-undefined					\
+  -version-info $(LIBSTARPUFFT_INTERFACE_CURRENT):$(LIBSTARPUFFT_INTERFACE_REVISION):$(LIBSTARPUFFT_INTERFACE_AGE)
+
+if STARPU_USE_CUDA
+NVCCFLAGS += -Xcompiler -fPIC -Xlinker -fPIC
+
+cudaf_kernels.o: cudaf_kernels.cu
+	$(NVCC) $(AM_CPPFLAGS) $< -c -o $@ --compiler-options -fno-strict-aliasing  $(NVCCFLAGS) -I${includedir}
+
+libstarpufft_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += cudaf_kernels.cu
+am_libstarpufft_@STARPU_EFFECTIVE_VERSION@_la_OBJECTS = cudaf_kernels.o starpufft.lo starpufftf.lo starpufft_common.lo
+
+if STARPU_HAVE_CUFFTDOUBLECOMPLEX
+cuda_kernels.o: cuda_kernels.cu
+	$(NVCC) $(AM_CPPFLAGS) $< -c -o $@ --compiler-options -fno-strict-aliasing  $(NVCCFLAGS) -I${includedir} -arch sm_13
+
+libstarpufft_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += cuda_kernels.cu
+am_libstarpufft_@STARPU_EFFECTIVE_VERSION@_la_OBJECTS += cuda_kernels.o
+endif
+
+libstarpufft_@STARPU_EFFECTIVE_VERSION@_la_LIBADD +=  $(STARPU_CUDA_LDFLAGS)
+endif
+
+examplebindir = $(libdir)/starpu/examples/starpufft
+examplebin_PROGRAMS =				\
+	examples/testf \
+	examples/test
+
+check_PROGRAMS = examples/testf
+examples_testf_LDADD = libstarpufft-@STARPU_EFFECTIVE_VERSION@.la $(top_builddir)/src/libstarpu-@STARPU_EFFECTIVE_VERSION@.la $(FFTWF_LIBS)
+
+# If we don't have CUDA, we assume that we have fftw available in double
+# precision anyway, we just want to make sure that if CUFFT is used, it also
+# supports double precision.
+if !STARPU_USE_CUDA
+check_PROGRAMS += examples/test
+else
+if STARPU_HAVE_CUFFTDOUBLECOMPLEX
+check_PROGRAMS += examples/test
+endif
+endif
+examples_test_LDADD = libstarpufft-@STARPU_EFFECTIVE_VERSION@.la $(top_builddir)/src/libstarpu-@STARPU_EFFECTIVE_VERSION@.la $(FFTW_LIBS)
+
+TESTS = $(check_PROGRAMS)
+
+
+#check_PROGRAMS += examples/test_threads examples/testf_threads
+#examples_test_threads_LDADD = libstarpufft-@STARPU_EFFECTIVE_VERSION@.la $(top_builddir)/src/libstarpu.la -lfftw3_threads
+#examples_testf_threads_LDADD = libstarpufft-@STARPU_EFFECTIVE_VERSION@.la $(top_builddir)/src/libstarpu.la -lfftw3f_threads
+
+showcheck:
+	-cat $(TEST_LOGS) /dev/null

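The new starpufft Makefile builds a versioned libstarpufft and, when CUDA is available, compiles the .cu kernels through explicit nvcc rules (single precision always, double precision only when cufftDoubleComplex is supported, hence the -arch sm_13 flag). Roughly what the cudaf_kernels.o rule expands to, with a placeholder standing in for the AM_CPPFLAGS include paths:

    nvcc -I/path/to/starpu/include cudaf_kernels.cu -c -o cudaf_kernels.o \
         --compiler-options -fno-strict-aliasing -Xcompiler -fPIC -Xlinker -fPIC
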
+ 19 - 0
starpufft/cuda_kernels.cu

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "double.h"
+#include "cudax_kernels.cu"

+ 19 - 0
starpufft/cudaf_kernels.cu

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "float.h"
+#include "cudax_kernels.cu"

+ 156 - 0
starpufft/cudax_kernels.cu

@@ -0,0 +1,156 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#define _externC extern "C"
+#include "cudax_kernels.h"
+
+/* Note: these assume that the sizes are powers of two */
+
+#define VARS_1d \
+	unsigned start = threadIdx.x + blockIdx.x * blockDim.x; \
+	unsigned numthreads = blockDim.x * gridDim.x;
+
+#define DISTRIB_1d(n, func,args) \
+	unsigned threads_per_block = 128; \
+\
+	if (n < threads_per_block) \
+	{			   \
+		dim3 dimGrid(n); \
+		func <<<dimGrid, 1, 0, starpu_cuda_get_local_stream()>>> args; \
+	} 					\
+	else 					\
+	{				     \
+		dim3 dimGrid(n / threads_per_block); \
+		dim3 dimBlock(threads_per_block); \
+		func <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> args; \
+	} \
+	cudaStreamSynchronize(starpu_cuda_get_local_stream()); \
+
+extern "C" __global__ void
+STARPUFFT(cuda_twist1_1d)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned n1, unsigned n2)
+{
+	unsigned j;
+	VARS_1d
+	unsigned end = n2;
+
+	for (j = start; j < end; j += numthreads)
+		twisted1[j] = in[i+j*n1];
+}
+
+extern "C" void
+STARPUFFT(cuda_twist1_1d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned n1, unsigned n2)
+{
+	DISTRIB_1d(n2, STARPUFFT(cuda_twist1_1d), (in, twisted1, i, n1, n2));
+}
+
+extern "C" __global__ void
+STARPUFFT(cuda_twiddle_1d)(_cuComplex * out, const _cuComplex * roots, unsigned n, unsigned i)
+{
+	unsigned j;
+	VARS_1d
+	unsigned end = n;
+
+	for (j = start; j < end; j += numthreads)
+		out[j] = _cuCmul(out[j], roots[i*j]);
+	return;
+}
+
+extern "C" void
+STARPUFFT(cuda_twiddle_1d_host)(_cuComplex *out, const _cuComplex *roots, unsigned n, unsigned i)
+{
+	DISTRIB_1d(n, STARPUFFT(cuda_twiddle_1d), (out, roots, n, i));
+}
+
+#define VARS_2d \
+	unsigned startx = threadIdx.x + blockIdx.x * blockDim.x; \
+	unsigned starty = threadIdx.y + blockIdx.y * blockDim.y; \
+	unsigned numthreadsx = blockDim.x * gridDim.x; \
+	unsigned numthreadsy = blockDim.y * gridDim.y;
+
+/* FIXME: introduce threads_per_dim_n / m instead */
+#define DISTRIB_2d(n, m, func, args) \
+	unsigned threads_per_dim = 16; \
+	if (n < threads_per_dim) \
+	{				   \
+		if (m < threads_per_dim) \
+		{			    \
+			dim3 dimGrid(n, m); \
+			func <<<dimGrid, 1, 0, starpu_cuda_get_local_stream()>>> args; \
+		} \
+		else \
+		{					      \
+			dim3 dimGrid(1, m / threads_per_dim); \
+			dim3 dimBlock(n, threads_per_dim); \
+			func <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> args; \
+		} \
+	} \
+	else \
+	{				   \
+		if (m < threads_per_dim) \
+		{					      \
+			dim3 dimGrid(n / threads_per_dim, 1); \
+			dim3 dimBlock(threads_per_dim, m); \
+			func <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> args; \
+		} \
+		else \
+		{							\
+			dim3 dimGrid(n / threads_per_dim, m / threads_per_dim); \
+			dim3 dimBlock(threads_per_dim, threads_per_dim); \
+			func <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> args; \
+		} \
+	} \
+	cudaStreamSynchronize(starpu_cuda_get_local_stream()); \
+
+extern "C" __global__ void
+STARPUFFT(cuda_twist1_2d)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned j, unsigned n1, unsigned n2, unsigned m1, unsigned m2)
+{
+	unsigned k, l;
+	VARS_2d
+	unsigned endx = n2;
+	unsigned endy = m2;
+	unsigned m = m1*m2;
+
+	for (k = startx; k < endx; k += numthreadsx)
+		for (l = starty; l < endy; l += numthreadsy)
+			twisted1[k*m2+l] = in[i*m+j+k*m*n1+l*m1];
+}
+
+extern "C" void
+STARPUFFT(cuda_twist1_2d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned j, unsigned n1, unsigned n2, unsigned m1, unsigned m2)
+{
+	DISTRIB_2d(n2, m2, STARPUFFT(cuda_twist1_2d), (in, twisted1, i, j, n1, n2, m1, m2));
+}
+
+extern "C" __global__ void
+STARPUFFT(cuda_twiddle_2d)(_cuComplex * out, const _cuComplex * roots0, const _cuComplex * roots1, unsigned n2, unsigned m2, unsigned i, unsigned j)
+{
+	unsigned k, l;
+	VARS_2d
+	unsigned endx = n2;
+	unsigned endy = m2;
+
+	for (k = startx; k < endx ; k += numthreadsx)
+		for (l = starty; l < endy ; l += numthreadsy)
+			out[k*m2 + l] = _cuCmul(_cuCmul(out[k*m2 + l], roots0[i*k]), roots1[j*l]);
+	return;
+}
+
+extern "C" void
+STARPUFFT(cuda_twiddle_2d_host)(_cuComplex *out, const _cuComplex *roots0, const _cuComplex *roots1, unsigned n2, unsigned m2, unsigned i, unsigned j)
+{
+	DISTRIB_2d(n2, m2, STARPUFFT(cuda_twiddle_2d), (out, roots0, roots1, n2, m2, i, j));
+}
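The launch configuration chosen by DISTRIB_1d above is easiest to see on a concrete, purely illustrative size: with threads_per_block = 128 and n = 100000, the else branch launches 100000/128 = 781 blocks of 128 threads (99968 threads in total), and the grid-stride loop in the kernels (j += numthreads) picks up the 32 remaining elements, so nothing is lost when n is not a multiple of the block size; for n < 128 the first branch simply launches n single-thread blocks.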

+ 23 - 0
starpufft/cudax_kernels.h

@@ -0,0 +1,23 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <cuComplex.h>
+#include <starpu_cuda.h>
+_externC void STARPUFFT(cuda_twist1_1d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned n1, unsigned n2);
+_externC void STARPUFFT(cuda_twiddle_1d_host)(_cuComplex *out, const _cuComplex *roots, unsigned n, unsigned i);
+_externC void STARPUFFT(cuda_twist1_2d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned j, unsigned n1, unsigned n2, unsigned m1, unsigned m2);
+_externC void STARPUFFT(cuda_twiddle_2d_host)(_cuComplex *out, const _cuComplex *roots0, const _cuComplex *roots1, unsigned n2, unsigned m2, unsigned i, unsigned j);

+ 51 - 0
starpufft/double.h

@@ -0,0 +1,51 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2012  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <complex.h>
+#include <starpu_config.h>
+
+#ifdef STARPU_HAVE_FFTW
+#include <fftw3.h>
+#endif
+
+#ifdef STARPU_USE_CUDA
+#include <cufft.h>
+#endif
+
+#undef  FLOAT
+#define DOUBLE
+
+typedef double real;
+#ifdef STARPU_HAVE_FFTW
+typedef fftw_complex _fftw_complex;
+typedef fftw_plan _fftw_plan;
+#endif
+#ifdef STARPU_USE_CUDA
+typedef cuDoubleComplex _cuComplex;
+typedef cufftDoubleComplex _cufftComplex;
+#define _cufftExecC2C cufftExecZ2Z
+#define _cufftExecR2C cufftExecD2Z
+#define _cufftExecC2R cufftExecZ2D
+#define _CUFFT_C2C CUFFT_Z2Z
+#define _CUFFT_R2C CUFFT_D2Z
+#define _CUFFT_C2R CUFFT_Z2D
+#define _cuCmul(x,y) cuCmul(x,y)
+#endif
+#define STARPUFFT(name) starpufft_##name
+#define _FFTW(name) fftw_##name
+
+#define TYPE ""

+ 19 - 0
starpufft/examples/test.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "double.h"
+#include "testx.c"

+ 19 - 0
starpufft/examples/test_threads.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "double.h"
+#include "testx_threads.c"

+ 19 - 0
starpufft/examples/testf.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "float.h"
+#include "testx.c"

+ 19 - 0
starpufft/examples/testf_threads.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "float.h"
+#include "testx_threads.c"

+ 283 - 0
starpufft/examples/testx.c

@@ -0,0 +1,283 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <complex.h>
+#include <math.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <sys/time.h>
+
+#include <starpu.h>
+
+#include <starpu_config.h>
+#include "starpufft.h"
+
+#undef STARPU_USE_CUDA
+
+#ifdef STARPU_HAVE_FFTW
+#include <fftw3.h>
+#endif
+#ifdef STARPU_USE_CUDA
+#include <cufft.h>
+#endif
+
+#define SIGN (-1)
+/* #define SIGN (1) */
+
+#ifdef STARPU_HAVE_FFTW
+static void check_fftw(STARPUFFT(complex) *out, STARPUFFT(complex) *out_fftw, int size)
+{
+	int i;
+	double max = 0., tot = 0., norm = 0., normdiff = 0.;
+	for (i = 0; i < size; i++)
+	{
+		double diff = cabs(out[i]-out_fftw[i]);
+		double diff2 = diff * diff;
+		double size = cabs(out_fftw[i]);
+		double size2 = size * size;
+		if (diff > max)
+			max = diff;
+		tot += diff;
+		normdiff += diff2;
+		norm += size2;
+	}
+	fprintf(stderr, "\nmaximum difference %g\n", max);
+	fprintf(stderr, "average difference %g\n", tot / size);
+	fprintf(stderr, "difference norm %g\n", sqrt(normdiff));
+	double relmaxdiff = max / sqrt(norm);
+	fprintf(stderr, "relative maximum difference %g\n", relmaxdiff);
+	double relavgdiff = (tot / size) / sqrt(norm);
+	fprintf(stderr, "relative average difference %g\n", relavgdiff);
+	if (!strcmp(TYPE, "f") && (relmaxdiff > 1e-7 || relavgdiff > 1e-7)) {
+		fprintf(stderr, "Failure: Difference too big (TYPE f)\n");
+		exit(EXIT_FAILURE);
+	}
+	if (!strcmp(TYPE, "") && (relmaxdiff > 1e-16 || relavgdiff > 1e-16))
+	{
+		fprintf(stderr, "Failure: Difference too big\n");
+		exit(EXIT_FAILURE);
+	}
+}
+#endif
+
+#ifdef STARPU_USE_CUDA
+static void check_cuda(STARPUFFT(complex) *out, STARPUFFT(complex) *out_fftw, int size)
+{
+	int i;
+	double max = 0., tot = 0., norm = 0., normdiff = 0.;
+	for (i = 0; i < size; i++)
+	{
+		double diff = cabs(out[i]-out_fftw[i]);
+		double diff2 = diff * diff;
+		double size = cabs(out_fftw[i]);
+		double size2 = size * size;
+		if (diff > max)
+			max = diff;
+		tot += diff;
+		normdiff += diff2;
+		norm += size2;
+	}
+	fprintf(stderr, "\nmaximum difference %g\n", max);
+	fprintf(stderr, "average difference %g\n", tot / size);
+	fprintf(stderr, "difference norm %g\n", sqrt(normdiff));
+	double relmaxdiff = max / sqrt(norm);
+	fprintf(stderr, "relative maximum difference %g\n", relmaxdiff);
+	double relavgdiff = (tot / size) / sqrt(norm);
+	fprintf(stderr, "relative average difference %g\n", relavgdiff);
+	if (!strcmp(TYPE, "f") && (relmaxdiff > 1e-8 || relavgdiff > 1e-8))
+		exit(EXIT_FAILURE);
+	if (!strcmp(TYPE, "") && (relmaxdiff > 1e-16 || relavgdiff > 1e-16))
+		exit(EXIT_FAILURE);
+}
+#endif
+
+int main(int argc, char *argv[])
+{
+	int i, ret;
+	int size;
+	int n = 0, m = 0;
+	STARPUFFT(plan) plan;
+	starpu_data_handle_t in_handle, out_handle;
+#ifdef STARPU_HAVE_FFTW
+	_FFTW(plan) fftw_plan;
+#endif
+#ifdef STARPU_USE_CUDA
+	cufftHandle cuda_plan;
+	cudaError_t cures;
+#endif
+#if defined(STARPU_HAVE_FFTW) || defined(STARPU_USE_CUDA)
+	struct timeval begin, end;
+	double timing;
+	size_t bytes;
+#endif
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	if (argc == 1)
+	{
+		n = 42;
+		/* 1D */
+		size = n;
+	}
+	else if (argc == 2)
+	{
+		n = atoi(argv[1]);
+
+		/* 1D */
+		size = n;
+	}
+	else if (argc == 3)
+	{
+		n = atoi(argv[1]);
+		m = atoi(argv[2]);
+
+		/* 2D */
+		size = n * m;
+	}
+	else
+	{
+		assert(0);
+	}
+
+#if defined(STARPU_HAVE_FFTW) || defined(STARPU_USE_CUDA)
+	bytes = size * sizeof(STARPUFFT(complex));
+#endif
+
+	STARPUFFT(complex) *in = STARPUFFT(malloc)(size * sizeof(*in));
+	starpu_srand48(0);
+	for (i = 0; i < size; i++)
+		in[i] = starpu_drand48() + I * starpu_drand48();
+
+	STARPUFFT(complex) *out = STARPUFFT(malloc)(size * sizeof(*out));
+
+#ifdef STARPU_HAVE_FFTW
+	STARPUFFT(complex) *out_fftw = STARPUFFT(malloc)(size * sizeof(*out_fftw));
+#endif
+
+#ifdef STARPU_USE_CUDA
+	STARPUFFT(complex) *out_cuda = STARPUFFT(malloc)(size * sizeof(*out_cuda));
+#endif
+
+	if (argc <= 2)
+	{
+		plan = STARPUFFT(plan_dft_1d)(n, SIGN, 0);
+#ifdef STARPU_HAVE_FFTW
+		fftw_plan = _FFTW(plan_dft_1d)(n, NULL, (void*) 1, SIGN, FFTW_ESTIMATE);
+#endif
+#ifdef STARPU_USE_CUDA
+		if (cufftPlan1d(&cuda_plan, n, _CUFFT_C2C, 1) != CUFFT_SUCCESS)
+			fprintf(stderr, "cufftPlan1d failed\n");
+#endif
+
+	}
+	else if (argc == 3)
+	{
+		plan = STARPUFFT(plan_dft_2d)(n, m, SIGN, 0);
+#ifdef STARPU_HAVE_FFTW
+		fftw_plan = _FFTW(plan_dft_2d)(n, m, NULL, (void*) 1, SIGN, FFTW_ESTIMATE);
+#endif
+#ifdef STARPU_USE_CUDA
+		STARPU_ASSERT(cufftPlan2d(&cuda_plan, n, m, _CUFFT_C2C) == CUFFT_SUCCESS);
+#endif
+	}
+	else
+	{
+		assert(0);
+	}
+
+#ifdef STARPU_HAVE_FFTW
+	gettimeofday(&begin, NULL);
+	_FFTW(execute_dft)(fftw_plan, in, out_fftw);
+	gettimeofday(&end, NULL);
+	_FFTW(destroy_plan)(fftw_plan);
+	timing = (double)((end.tv_sec - begin.tv_sec)*1000000 + (end.tv_usec - begin.tv_usec));
+	printf("FFTW took %2.2f ms (%2.2f MB/s)\n\n", timing/1000, bytes/timing);
+#endif
+#ifdef STARPU_USE_CUDA
+	gettimeofday(&begin, NULL);
+	if (cufftExecC2C(cuda_plan, (cufftComplex*) in, (cufftComplex*) out_cuda, CUFFT_FORWARD) != CUFFT_SUCCESS)
+		fprintf(stderr, "cufftExecC2C failed\n");
+	if ((cures = cudaThreadSynchronize()) != cudaSuccess)
+		STARPU_CUDA_REPORT_ERROR(cures);
+	gettimeofday(&end, NULL);
+	cufftDestroy(cuda_plan);
+	timing = (double)((end.tv_sec - begin.tv_sec)*1000000 + (end.tv_usec - begin.tv_usec));
+	printf("CUDA took %2.2f ms (%2.2f MB/s)\n\n", timing/1000, bytes/timing);
+#endif
+
+	STARPUFFT(execute)(plan, in, out);
+	STARPUFFT(showstats)(stdout);
+
+#ifdef STARPU_HAVE_FFTW
+	check_fftw(out, out_fftw, size);
+#endif
+#ifdef STARPU_USE_CUDA
+	check_cuda(out, out_cuda, size);
+#endif
+
+#if 1
+	starpu_vector_data_register(&in_handle, 0, (uintptr_t) in, size, sizeof(*in));
+	starpu_vector_data_register(&out_handle, 0, (uintptr_t) out, size, sizeof(*out));
+
+	STARPUFFT(execute_handle)(plan, in_handle, out_handle);
+
+	starpu_data_unregister(in_handle);
+	starpu_data_unregister(out_handle);
+
+#ifdef STARPU_HAVE_FFTW
+	check_fftw(out, out_fftw, size);
+#endif
+#ifdef STARPU_USE_CUDA
+	check_cuda(out, out_cuda, size);
+#endif
+#endif
+
+	STARPUFFT(showstats)(stdout);
+	STARPUFFT(destroy_plan)(plan);
+
+	printf("\n");
+#if 0
+	for (i = 0; i < 16; i++)
+		printf("(%f,%f) ", cimag(in[i]), creal(in[i]));
+	printf("\n\n");
+	for (i = 0; i < 16; i++)
+		printf("(%f,%f) ", cimag(out[i]), creal(out[i]));
+	printf("\n\n");
+#ifdef STARPU_HAVE_FFTW
+	for (i = 0; i < 16; i++)
+		printf("(%f,%f) ", cimag(out_fftw[i]), creal(out_fftw[i]));
+	printf("\n\n");
+#endif
+#endif
+
+	STARPUFFT(free)(in);
+	STARPUFFT(free)(out);
+
+#ifdef STARPU_HAVE_FFTW
+	STARPUFFT(free)(out_fftw);
+#endif
+
+#ifdef STARPU_USE_CUDA
+	free(out_cuda);
+#endif
+
+	starpu_shutdown();
+
+	return EXIT_SUCCESS;
+}

+ 113 - 0
starpufft/examples/testx_threads.c

@@ -0,0 +1,113 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <complex.h>
+#include <math.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <sys/time.h>
+
+#include <starpu.h>
+
+#include <starpu_config.h>
+#include "starpufft.h"
+
+#include <fftw3.h>
+
+#define SIGN (-1)
+/* #define SIGN (1) */
+
+int main(int argc, char *argv[])
+{
+	int i;
+	struct timeval begin, end;
+	int size;
+	size_t bytes;
+	int n = 0, m = 0;
+	_FFTW(plan) fftw_plan;
+	double timing;
+	char *num;
+	int num_threads = 1;
+
+	_FFTW(init_threads)();
+
+	num = getenv("NUM_THREADS");
+	if (num)
+		num_threads = atoi(num);
+	_FFTW(plan_with_nthreads)(num_threads);
+
+	if (argc < 2 || argc > 3)
+	{
+		fprintf(stderr,"need one or two size of vector\n");
+		exit(EXIT_FAILURE);
+	}
+
+	if (argc == 2)
+	{
+		n = atoi(argv[1]);
+
+		/* 1D */
+		size = n;
+	}
+	else if (argc == 3)
+	{
+		n = atoi(argv[1]);
+		m = atoi(argv[2]);
+
+		/* 2D */
+		size = n * m;
+	}
+	else
+	{
+		assert(0);
+	}
+
+	bytes = size * sizeof(_FFTW(complex));
+
+	_FFTW(complex) *in = _FFTW(malloc)(size * sizeof(*in));
+	starpu_srand48(0);
+	for (i = 0; i < size; i++)
+		in[i] = starpu_drand48() + I * starpu_drand48();
+
+	_FFTW(complex) *out_fftw = _FFTW(malloc)(size * sizeof(*out_fftw));
+
+	if (argc == 2)
+	{
+		fftw_plan = _FFTW(plan_dft_1d)(n, in, out_fftw, SIGN, FFTW_ESTIMATE);
+
+	}
+	else if (argc == 3)
+	{
+		fftw_plan = _FFTW(plan_dft_2d)(n, m, in, out_fftw, SIGN, FFTW_ESTIMATE);
+	}
+	else
+	{
+		assert(0);
+	}
+
+	gettimeofday(&begin, NULL);
+	_FFTW(execute)(fftw_plan);
+	gettimeofday(&end, NULL);
+	_FFTW(destroy_plan)(fftw_plan);
+	timing = (double)((end.tv_sec - begin.tv_sec)*1000000 + (end.tv_usec - begin.tv_usec));
+	printf("FFTW with %d threads took %2.2f ms (%2.2f MB/s)\n\n", num_threads, timing/1000, bytes/(timing*num_threads));
+
+	printf("\n");
+
+	return EXIT_SUCCESS;
+}

+ 51 - 0
starpufft/float.h

@@ -0,0 +1,51 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2012  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <complex.h>
+#include <starpu_config.h>
+
+#ifdef STARPU_HAVE_FFTW
+#include <fftw3.h>
+#endif
+
+#ifdef STARPU_USE_CUDA
+#include <cufft.h>
+#endif
+
+#undef  DOUBLE
+#define FLOAT
+
+typedef float real;
+#ifdef STARPU_HAVE_FFTW
+typedef fftwf_complex _fftw_complex;
+typedef fftwf_plan _fftw_plan;
+#endif
+#ifdef STARPU_USE_CUDA
+typedef cuComplex _cuComplex;
+typedef cufftComplex _cufftComplex;
+#define _cufftExecC2C cufftExecC2C
+#define _cufftExecR2C cufftExecR2C
+#define _cufftExecC2R cufftExecC2R
+#define _CUFFT_C2C CUFFT_C2C
+#define _CUFFT_R2C CUFFT_R2C
+#define _CUFFT_C2R CUFFT_C2R
+#define _cuCmul(x,y) cuCmulf(x,y)
+#endif
+#define STARPUFFT(name) starpufftf_##name
+#define _FFTW(name) fftwf_##name
+
+#define TYPE "f"
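float.h and double.h are the two precision headers the shared starpufft sources are compiled against; every STARPUFFT()/_FFTW() occurrence expands to the precision-specific name. Expanding the defines above, for example:

    STARPUFFT(plan_dft_1d)  ->  starpufftf_plan_dft_1d  (float.h)   /  starpufft_plan_dft_1d  (double.h)
    _FFTW(execute_dft)      ->  fftwf_execute_dft       (float.h)   /  fftw_execute_dft       (double.h)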

+ 27 - 0
starpufft/libstarpufft.pc.in

@@ -0,0 +1,27 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009-2012  Université de Bordeaux 1
+# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: starpufft
+Description: offers support for heterogeneous multicore architectures
+Version: @PACKAGE_VERSION@
+Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_CPPFLAGS@ -DSTARPU_USE_DEPRECATED_API
+Libs: -L${libdir} -lstarpufft-@STARPU_EFFECTIVE_VERSION@ 
+Libs.private: @LDFLAGS@ @LIBS@ @STARPU_CUFFT_LDFLAGS@ @FFTW_LIBS@ @FFTWF_LIBS@

+ 27 - 0
starpufft/starpufft-1.0.pc.in

@@ -0,0 +1,27 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009-2012  Université de Bordeaux 1
+# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: starpufft
+Description: offers support for heterogeneous multicore architectures
+Version: @PACKAGE_VERSION@
+Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_CPPFLAGS@
+Libs: -L${libdir} -lstarpufft-@STARPU_EFFECTIVE_VERSION@ 
+Libs.private: @LDFLAGS@ @LIBS@ @STARPU_CUFFT_LDFLAGS@ @FFTW_LIBS@ @FFTWF_LIBS@

+ 19 - 0
starpufft/starpufft.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "double.h"
+#include "starpufftx.c"

+ 60 - 0
starpufft/starpufft.h

@@ -0,0 +1,60 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdio.h>
+#include <complex.h>
+#include <starpu.h>
+
+#define STARPUFFT_FORWARD -1
+#define STARPUFFT_INVERSE 1
+
+#define __STARPUFFT(name) starpufft_##name
+#define __STARPUFFTF(name) starpufftf_##name
+#define __STARPUFFTL(name) starpufftl_##name
+
+#define __STARPUFFT_INTERFACE(starpufft,real) \
+typedef real _Complex starpufft(complex); \
+\
+typedef struct starpufft(plan) *starpufft(plan); \
+\
+starpufft(plan) starpufft(plan_dft_1d)(int n, int sign, unsigned flags); \
+starpufft(plan) starpufft(plan_dft_2d)(int n, int m, int sign, unsigned flags); \
+starpufft(plan) starpufft(plan_dft_r2c_1d)(int n, unsigned flags); \
+starpufft(plan) starpufft(plan_dft_c2r_1d)(int n, unsigned flags); \
+\
+void *starpufft(malloc)(size_t n); \
+void starpufft(free)(void *p); \
+\
+void starpufft(execute)(starpufft(plan) p, void *in, void *out); \
+struct starpu_task *starpufft(start)(starpufft(plan) p, void *in, void *out); \
+\
+void starpufft(execute_handle)(starpufft(plan) p, starpu_data_handle_t in, starpu_data_handle_t out); \
+struct starpu_task *starpufft(start_handle)(starpufft(plan) p, starpu_data_handle_t in, starpu_data_handle_t out); \
+\
+void starpufft(cleanup)(starpufft(plan) p); \
+void starpufft(destroy_plan)(starpufft(plan) p); \
+\
+void starpufft(startstats)(void); \
+void starpufft(stopstats)(void); \
+void starpufft(showstats)(FILE *out);
+
+__STARPUFFT_INTERFACE(__STARPUFFT, double)
+__STARPUFFT_INTERFACE(__STARPUFFTF, float)
+__STARPUFFT_INTERFACE(__STARPUFFTL, long double)
+
+/* Internal use */
+extern int starpufft_last_plan_number;
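examples/testx.c above exercises this interface in full; the following is only a minimal sketch, assuming starpu_init() has already succeeded and using the single-precision names declared by __STARPUFFT_INTERFACE(__STARPUFFTF, float):

    #include "starpufft.h"

    /* sketch: forward 1D FFT of n complex floats through starpufft */
    static void fft_once(int n)
    {
    	starpufftf_complex *in  = starpufftf_malloc(n * sizeof(*in));
    	starpufftf_complex *out = starpufftf_malloc(n * sizeof(*out));
    	/* ... fill in[0..n-1] ... */
    	starpufftf_plan p = starpufftf_plan_dft_1d(n, STARPUFFT_FORWARD, 0);
    	starpufftf_execute(p, in, out);	/* submits the tasks and waits for them */
    	starpufftf_destroy_plan(p);
    	starpufftf_free(in);
    	starpufftf_free(out);
    }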

+ 21 - 0
starpufft/starpufft_common.c

@@ -0,0 +1,21 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "starpufft.h"
+
+/* Used as an identifier in starpu tags to let plans run concurrently */
+int starpufft_last_plan_number;

+ 19 - 0
starpufft/starpufftf.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "float.h"
+#include "starpufftx.c"

+ 454 - 0
starpufft/starpufftx.c

@@ -0,0 +1,454 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#define PARALLEL 0
+
+#include <math.h>
+#include <pthread.h>
+#include <unistd.h>
+#include <sys/time.h>
+
+#include <starpu.h>
+#include <config.h>
+
+#include "starpufft.h"
+#ifdef STARPU_USE_CUDA
+#define _externC extern
+#include "cudax_kernels.h"
+
+#if defined(FLOAT) || defined(STARPU_HAVE_CUFFTDOUBLECOMPLEX)
+#  define __STARPU_USE_CUDA
+#else
+#  undef __STARPU_USE_CUDA
+#endif
+
+#endif
+
+#define _FFTW_FLAGS FFTW_ESTIMATE
+
+/* Steps for the parallel variant */
+enum steps
+{
+	SPECIAL, TWIST1, FFT1, JOIN, TWIST2, FFT2, TWIST3, END
+};
+
+#define NUMBER_BITS 5
+#define NUMBER_SHIFT (64 - NUMBER_BITS)
+#define STEP_BITS 3
+#define STEP_SHIFT (NUMBER_SHIFT - STEP_BITS)
+
+/* Tags for the steps of the parallel variant */
+#define _STEP_TAG(plan, step, i) (((starpu_tag_t) plan->number << NUMBER_SHIFT) | ((starpu_tag_t)(step) << STEP_SHIFT) | (starpu_tag_t) (i))
+
+
+#define I_BITS STEP_SHIFT
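Reading the defines above (NUMBER_SHIFT = 59, STEP_SHIFT = 56), a tag produced by _STEP_TAG packs its three coordinates into a single 64-bit starpu_tag_t as:

    bits 63..59  plan->number  (NUMBER_BITS = 5, i.e. up to 32 plans alive at once)
    bits 58..56  step          (STEP_BITS = 3, enough for the 8 values of enum steps)
    bits 55..0   i             (I_BITS = 56 bits of task index)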
+
+enum type
+{
+	R2C,
+	C2R,
+	C2C
+};
+
+static unsigned task_per_worker[STARPU_NMAXWORKERS];
+static unsigned samples_per_worker[STARPU_NMAXWORKERS];
+static struct timeval start, submit_tasks, end;
+
+/*
+ *
+ *	The actual kernels
+ *
+ */
+
+struct STARPUFFT(plan)
+{
+	int number;	/* uniquely identifies the plan, for starpu tags */
+
+	int *n;
+	int *n1;
+	int *n2;
+	int totsize;
+	int totsize1;	/* Number of first-round tasks */
+	int totsize2;	/* Size of first-round tasks */
+	int totsize3;	/* Number of second-round tasks */
+	int totsize4;	/* Size of second-round tasks */
+	int dim;
+	enum type type;
+	int sign;
+
+	STARPUFFT(complex) *roots[2];
+	starpu_data_handle_t roots_handle[2];
+
+	/* For each worker, we need some data */
+	struct
+	{
+#ifdef STARPU_USE_CUDA
+		/* CUFFT plans */
+		cufftHandle plan1_cuda, plan2_cuda;
+		/* Sequential version */
+		cufftHandle plan_cuda;
+#endif
+#ifdef STARPU_HAVE_FFTW
+		/* FFTW plans */
+		_fftw_plan plan1_cpu, plan2_cpu;
+		/* Sequential version */
+		_fftw_plan plan_cpu;
+#endif
+	} plans[STARPU_NMAXWORKERS];
+
+	/* Buffers for codelets */
+	STARPUFFT(complex) *in, *twisted1, *fft1, *twisted2, *fft2, *out;
+
+	/* corresponding starpu DSM handles */
+	starpu_data_handle_t in_handle, *twisted1_handle, *fft1_handle, *twisted2_handle, *fft2_handle, out_handle;
+
+	/* Tasks */
+	struct starpu_task **twist1_tasks, **fft1_tasks, **twist2_tasks, **fft2_tasks, **twist3_tasks;
+	struct starpu_task *join_task, *end_task;
+
+	/* Arguments for tasks */
+	struct STARPUFFT(args) *fft1_args, *fft2_args;
+};
+
+struct STARPUFFT(args)
+{
+	struct STARPUFFT(plan) *plan;
+	int i, j, jj, kk, ll, *iv, *kkv;
+};
+
+static void
+check_dims(STARPUFFT(plan) plan)
+{
+	int dim;
+	for (dim = 0; dim < plan->dim; dim++)
+		if (plan->n[dim] & (plan->n[dim]-1))
+		{
+			fprintf(stderr,"can't cope with non-power-of-2\n");
+			STARPU_ABORT();
+		}
+}
+
+static void
+compute_roots(STARPUFFT(plan) plan)
+{
+	int dim, k;
+
+	/* Compute the n-roots and m-roots of unity for twiddling */
+	for (dim = 0; dim < plan->dim; dim++)
+	{
+		STARPUFFT(complex) exp = (plan->sign * 2. * 4.*atan(1.)) * _Complex_I / (STARPUFFT(complex)) plan->n[dim];
+		plan->roots[dim] = malloc(plan->n[dim] * sizeof(**plan->roots));
+		for (k = 0; k < plan->n[dim]; k++)
+			plan->roots[dim][k] = cexp(exp*k);
+		starpu_vector_data_register(&plan->roots_handle[dim], 0, (uintptr_t) plan->roots[dim], plan->n[dim], sizeof(**plan->roots));
+
+#ifdef STARPU_USE_CUDA
+		if (plan->n[dim] > 100000)
+		{
+			/* prefetch the big root array on GPUs */
+			unsigned worker;
+			unsigned nworkers = starpu_worker_get_count();
+			for (worker = 0; worker < nworkers; worker++)
+			{
+				unsigned node = starpu_worker_get_memory_node(worker);
+				if (starpu_worker_get_type(worker) == STARPU_CUDA_WORKER)
+					starpu_data_prefetch_on_node(plan->roots_handle[dim], node, 0);
+			}
+		}
+#endif
+	}
+}
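In other words, the loop above precomputes the usual twiddle factors, with pi spelled as 4.*atan(1.):

    roots[dim][k] = cexp(sign * 2*pi*I * k / n[dim]),   k = 0 .. n[dim]-1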
+
+/* Only CUDA capability >= 1.3 supports doubles, rule old cards out. */
+#ifdef DOUBLE
+static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl) {
+	if (starpu_worker_get_type(workerid) == STARPU_CPU_WORKER)
+		return 1;
+#ifdef STARPU_USE_CUDA
+	{
+		/* Cuda device */
+		const struct cudaDeviceProp *props;
+		props = starpu_cuda_get_device_properties(workerid);
+		if (props->major >= 2 || props->minor >= 3)
+			/* At least compute capability 1.3, supports doubles */
+			return 1;
+		/* Old card does not support doubles */
+		return 0;
+	}
+#endif
+	return 0;
+}
+#define CAN_EXECUTE .can_execute = can_execute,
+#else
+#define CAN_EXECUTE
+#endif
+
+#include "starpufftx1d.c"
+#include "starpufftx2d.c"
+
+struct starpu_task *
+STARPUFFT(start)(STARPUFFT(plan) plan, void *_in, void *_out)
+{
+	struct starpu_task *task;
+	int z;
+
+	plan->in = _in;
+	plan->out = _out;
+
+	switch (plan->dim)
+	{
+		case 1:
+		{
+			switch (plan->type)
+			{
+			case C2C:
+				starpu_vector_data_register(&plan->in_handle, 0, (uintptr_t) plan->in, plan->totsize, sizeof(STARPUFFT(complex)));
+				if (!PARALLEL)
+					starpu_vector_data_register(&plan->out_handle, 0, (uintptr_t) plan->out, plan->totsize, sizeof(STARPUFFT(complex)));
+				if (PARALLEL)
+				{
+					for (z = 0; z < plan->totsize1; z++)
+						plan->twist1_tasks[z]->handles[0] = plan->in_handle;
+				}
+				task = STARPUFFT(start1dC2C)(plan, plan->in_handle, plan->out_handle);
+				break;
+			default:
+				STARPU_ABORT();
+				break;
+			}
+			break;
+		}
+		case 2:
+			starpu_vector_data_register(&plan->in_handle, 0, (uintptr_t) plan->in, plan->totsize, sizeof(STARPUFFT(complex)));
+			if (!PARALLEL)
+				starpu_vector_data_register(&plan->out_handle, 0, (uintptr_t) plan->out, plan->totsize, sizeof(STARPUFFT(complex)));
+			if (PARALLEL)
+			{
+				for (z = 0; z < plan->totsize1; z++)
+					plan->twist1_tasks[z]->handles[0] = plan->in_handle;
+			}
+			task = STARPUFFT(start2dC2C)(plan, plan->in_handle, plan->out_handle);
+			break;
+		default:
+			STARPU_ABORT();
+			break;
+	}
+	return task;
+}
+
+void
+STARPUFFT(cleanup)(STARPUFFT(plan) plan)
+{
+	if (plan->in_handle)
+		starpu_data_unregister(plan->in_handle);
+	if (!PARALLEL)
+	{
+		if (plan->out_handle)
+			starpu_data_unregister(plan->out_handle);
+	}
+}
+
+struct starpu_task *
+STARPUFFT(start_handle)(STARPUFFT(plan) plan, starpu_data_handle_t in, starpu_data_handle_t out)
+{
+	return STARPUFFT(start1dC2C)(plan, in, out);
+}
+
+void
+STARPUFFT(execute)(STARPUFFT(plan) plan, void *in, void *out)
+{
+	memset(task_per_worker, 0, sizeof(task_per_worker));
+	memset(samples_per_worker, 0, sizeof(samples_per_worker));
+
+	gettimeofday(&start, NULL);
+
+	struct starpu_task *task = STARPUFFT(start)(plan, in, out);
+	gettimeofday(&submit_tasks, NULL);
+	starpu_task_wait(task);
+
+	STARPUFFT(cleanup)(plan);
+
+	gettimeofday(&end, NULL);
+}
+
+void
+STARPUFFT(execute_handle)(STARPUFFT(plan) plan, starpu_data_handle_t in, starpu_data_handle_t out)
+{
+	struct starpu_task *task = STARPUFFT(start_handle)(plan, in, out);
+	starpu_task_wait(task);
+}
+
+/* Destroy FFTW plans, unregister and free buffers, and free tags */
+void
+STARPUFFT(destroy_plan)(STARPUFFT(plan) plan)
+{
+	int workerid, dim, i;
+
+	for (workerid = 0; workerid < starpu_worker_get_count(); workerid++)
+	{
+		switch (starpu_worker_get_type(workerid))
+		{
+		case STARPU_CPU_WORKER:
+#ifdef STARPU_HAVE_FFTW
+			if (PARALLEL)
+			{
+				_FFTW(destroy_plan)(plan->plans[workerid].plan1_cpu);
+				_FFTW(destroy_plan)(plan->plans[workerid].plan2_cpu);
+			}
+			else
+			{
+				_FFTW(destroy_plan)(plan->plans[workerid].plan_cpu);
+			}
+#endif
+			break;
+		case STARPU_CUDA_WORKER:
+#ifdef STARPU_USE_CUDA
+			/* FIXME: Can't deallocate */
+#endif
+			break;
+		default:
+			/* Do not care, we won't be executing anything there. */
+			break;
+		}
+	}
+
+	if (PARALLEL)
+	{
+		for (i = 0; i < plan->totsize1; i++)
+		{
+			starpu_data_unregister(plan->twisted1_handle[i]);
+			free(plan->twist1_tasks[i]);
+			starpu_data_unregister(plan->fft1_handle[i]);
+			free(plan->fft1_tasks[i]);
+		}
+
+		free(plan->twisted1_handle);
+		free(plan->twist1_tasks);
+		free(plan->fft1_handle);
+		free(plan->fft1_tasks);
+		free(plan->fft1_args);
+
+		free(plan->join_task);
+
+		for (i = 0; i < plan->totsize3; i++)
+		{
+			starpu_data_unregister(plan->twisted2_handle[i]);
+			free(plan->twist2_tasks[i]);
+			starpu_data_unregister(plan->fft2_handle[i]);
+			free(plan->fft2_tasks[i]);
+			free(plan->twist3_tasks[i]);
+		}
+
+		free(plan->end_task);
+
+		free(plan->twisted2_handle);
+		free(plan->twist2_tasks);
+		free(plan->fft2_handle);
+		free(plan->fft2_tasks);
+		free(plan->twist3_tasks);
+		free(plan->fft2_args);
+
+		for (dim = 0; dim < plan->dim; dim++)
+		{
+			starpu_data_unregister(plan->roots_handle[dim]);
+			free(plan->roots[dim]);
+		}
+
+		switch (plan->dim)
+		{
+		case 1:
+			STARPUFFT(free_1d_tags)(plan);
+			break;
+		case 2:
+			STARPUFFT(free_2d_tags)(plan);
+			break;
+		default:
+			STARPU_ABORT();
+			break;
+		}
+
+		free(plan->n1);
+		free(plan->n2);
+		STARPUFFT(free)(plan->twisted1);
+		STARPUFFT(free)(plan->fft1);
+		STARPUFFT(free)(plan->twisted2);
+		STARPUFFT(free)(plan->fft2);
+	}
+	free(plan->n);
+	free(plan);
+}
+
+void *
+STARPUFFT(malloc)(size_t n)
+{
+#ifdef STARPU_USE_CUDA
+	void *res;
+	starpu_malloc(&res, n);
+	return res;
+#else
+#  ifdef STARPU_HAVE_FFTW
+	return _FFTW(malloc)(n);
+#  else
+	return malloc(n);
+#  endif
+#endif
+}
+
+void
+STARPUFFT(free)(void *p)
+{
+#ifdef STARPU_USE_CUDA
+	starpu_free(p);
+#else
+#  ifdef STARPU_HAVE_FFTW
+	_FFTW(free)(p);
+#  else
+	free(p);
+#  endif
+#endif
+}
+
+void
+STARPUFFT(showstats)(FILE *out)
+{
+	int worker;
+	unsigned total;
+
+#define TIMING(begin,end) (double)((end.tv_sec - begin.tv_sec)*1000000 + (end.tv_usec - begin.tv_usec))
+#define MSTIMING(begin,end) (TIMING(begin,end)/1000.)
+	double paratiming = TIMING(start,end);
+	fprintf(out, "Tasks submission took %2.2f ms\n", MSTIMING(start,submit_tasks));
+	fprintf(out, "Tasks termination took %2.2f ms\n", MSTIMING(submit_tasks,end));
+
+	fprintf(out, "Total %2.2f ms\n", MSTIMING(start,end));
+
+	for (worker = 0, total = 0; worker < starpu_worker_get_count(); worker++)
+		total += task_per_worker[worker];
+
+	for (worker = 0; worker < starpu_worker_get_count(); worker++)
+	{
+		if (task_per_worker[worker])
+		{
+			char name[32];
+			starpu_worker_get_name(worker, name, sizeof(name));
+
+			unsigned long bytes = sizeof(STARPUFFT(complex))*samples_per_worker[worker];
+
+			fprintf(stderr, "\t%s -> %2.2f MB\t%2.2f\tMB/s\t%u %2.2f %%\n", name, (1.0*bytes)/(1024*1024), bytes/paratiming, task_per_worker[worker], (100.0*task_per_worker[worker])/total);
+		}
+	}
+}

+ 847 - 0
starpufft/starpufftx1d.c

@@ -0,0 +1,847 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ *
+ * Dumb parallel version
+ *
+ */
+
+#define DIV_1D 64
+
+  /*
+   * Overall strategy for an fft of size n:
+   * - perform n1 ffts of size n2
+   * - twiddle
+   * - perform n2 ffts of size n1
+   *
+   * - n1 defaults to DIV_1D, thus n2 defaults to n / DIV_1D.
+   *
+   * Precise tasks:
+   *
+   * - twist1: twist the whole n-element input (called "in") into n1 chunks of
+   *           size n2, by using n1 tasks taking the whole n-element input as a
+   *           R parameter and one n2 output as a W parameter. The result is
+   *           called twisted1.
+   * - fft1:   perform n1 ffts of size n2, by using n1 tasks doing one fft each. Also
+   *           twiddle the result to prepare for the fft2. The result is called
+   *           fft1.
+   * - join:   depends on all the fft1s, to gather the n1 results of size n2 in
+   *           the fft1 vector.
+   * - twist2: twist the fft1 vector into n2 chunks of size n1, called twisted2.
+   *           Since n2 is typically very large, this step is divided into
+   *           DIV_1D tasks, each of them handling n2/DIV_1D of the chunks.
+   * - fft2:   perform n2 ffts of size n1. This is divided into DIV_1D tasks of
+   *           n2/DIV_1D ffts, to be performed in batches. The result is called
+   *           fft2.
+   * - twist3: twist back the result of the fft2s above into the output buffer.
+   *           Only implemented on CPUs for simplicity of the gathering.
+   *
+   * The tag space thus uses 3 dimensions:
+   * - the number of the plan.
+   * - the step (TWIST1, FFT1, JOIN, TWIST2, FFT2, TWIST3, END)
+   * - an index i between 0 and DIV_1D-1.
+   */
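As a concrete, purely illustrative instance of this scheme: for n = 65536 with DIV_1D = 64, the plan takes n1 = 64 and n2 = 1024, so twist1 and fft1 each consist of 64 tasks working on 1024-point chunks, while twist2, fft2 and twist3 each consist of 64 batches of n3 = n2/DIV_1D = 16 FFTs of size 64; twist1 task i gathers twisted1[j] = in[i + j*n1] for j in [0, n2), exactly as the CPU and CUDA twist1 kernels below implement it.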
+
+#define STEP_TAG_1D(plan, step, i) _STEP_TAG(plan, step, i)
+
+#ifdef __STARPU_USE_CUDA
+/* twist1:
+ *
+ * Twist the full input vector (first parameter) into one chunk of size n2
+ * (second parameter) */
+static void
+STARPUFFT(twist1_1d_kernel_gpu)(void *descr[], void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int i = args->i;
+	int n1 = plan->n1[0];
+	int n2 = plan->n2[0];
+
+	_cufftComplex * restrict in = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[0]);
+	_cufftComplex * restrict twisted1 = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[1]);
+	
+	STARPUFFT(cuda_twist1_1d_host)(in, twisted1, i, n1, n2);
+
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}
+
+/* fft1:
+ *
+ * Perform one fft of size n2 */
+static void
+STARPUFFT(fft1_1d_plan_gpu)(void *args)
+{
+	STARPUFFT(plan) plan = args;
+	int n2 = plan->n2[0];
+	int workerid = starpu_worker_get_id();
+	cufftResult cures;
+
+	cures = cufftPlan1d(&plan->plans[workerid].plan1_cuda, n2, _CUFFT_C2C, 1);
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+	cufftSetStream(plan->plans[workerid].plan1_cuda, starpu_cuda_get_local_stream());
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+}
+
+static void
+STARPUFFT(fft1_1d_kernel_gpu)(void *descr[], void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int i = args->i;
+	int n2 = plan->n2[0];
+	cufftResult cures;
+
+	_cufftComplex * restrict in = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[0]);
+	_cufftComplex * restrict out = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[1]);
+	const _cufftComplex * restrict roots = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[2]);
+
+	int workerid = starpu_worker_get_id();
+
+	task_per_worker[workerid]++;
+
+	cures = _cufftExecC2C(plan->plans[workerid].plan1_cuda, in, out, plan->sign == -1 ? CUFFT_FORWARD : CUFFT_INVERSE);
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+
+	STARPUFFT(cuda_twiddle_1d_host)(out, roots, n2, i);
+
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}
+
+/* fft2:
+ *
+ * Perform n3 = n2/DIV_1D ffts of size n1 */
+static void
+STARPUFFT(fft2_1d_plan_gpu)(void *args)
+{
+	STARPUFFT(plan) plan = args;
+	int n1 = plan->n1[0];
+	int n2 = plan->n2[0];
+	int n3 = n2/DIV_1D;
+	cufftResult cures;
+	int workerid = starpu_worker_get_id();
+
+	cures = cufftPlan1d(&plan->plans[workerid].plan2_cuda, n1, _CUFFT_C2C, n3);
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+	cufftSetStream(plan->plans[workerid].plan2_cuda, starpu_cuda_get_local_stream());
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+}
+
+static void
+STARPUFFT(fft2_1d_kernel_gpu)(void *descr[], void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	cufftResult cures;
+
+	_cufftComplex * restrict in = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[0]);
+	_cufftComplex * restrict out = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	int workerid = starpu_worker_get_id();
+
+	task_per_worker[workerid]++;
+
+	/* NOTE using batch support */
+	cures = _cufftExecC2C(plan->plans[workerid].plan2_cuda, in, out, plan->sign == -1 ? CUFFT_FORWARD : CUFFT_INVERSE);
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}
+#endif
+
+/* twist1:
+ *
+ * Twist the full input vector (first parameter) into one chunk of size n2
+ * (second parameter) */
+static void
+STARPUFFT(twist1_1d_kernel_cpu)(void *descr[], void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int i = args->i;
+	int j;
+	int n1 = plan->n1[0];
+	int n2 = plan->n2[0];
+
+	STARPUFFT(complex) * restrict in = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[0]);
+	STARPUFFT(complex) * restrict twisted1 = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	/* printf("twist1 %d %g\n", i, (double) cabs(plan->in[i])); */
+
+	for (j = 0; j < n2; j++)
+		twisted1[j] = in[i+j*n1];
+}
+
+#ifdef STARPU_HAVE_FFTW
+/* fft1:
+ *
+ * Perform one fft of size n2 */
+static void
+STARPUFFT(fft1_1d_kernel_cpu)(void *descr[], void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int i = args->i;
+	int j;
+	int n2 = plan->n2[0];
+	int workerid = starpu_worker_get_id();
+
+	task_per_worker[workerid]++;
+
+	STARPUFFT(complex) * restrict twisted1 = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[0]);
+	STARPUFFT(complex) * restrict fft1 = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	/* printf("fft1 %d %g\n", i, (double) cabs(twisted1[0])); */
+
+	_FFTW(execute_dft)(plan->plans[workerid].plan1_cpu, twisted1, fft1);
+
+	/* twiddle fft1 buffer */
+	for (j = 0; j < n2; j++)
+		fft1[j] = fft1[j] * plan->roots[0][i*j];
+}
+#endif
+
+/* twist2:
+ *
+ * Twist the full vector (results of the fft1s) into one package of n2/DIV_1D
+ * chunks of size n1 */
+static void
+STARPUFFT(twist2_1d_kernel_cpu)(void *descr[], void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int jj = args->jj;	/* between 0 and DIV_1D */
+	int jjj;		/* between 0 and n3 */
+	int i;
+	int n1 = plan->n1[0];
+	int n2 = plan->n2[0];
+	int n3 = n2/DIV_1D;
+
+	STARPUFFT(complex) * restrict twisted2 = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[0]);
+
+	/* printf("twist2 %d %g\n", jj, (double) cabs(plan->fft1[jj])); */
+
+	for (jjj = 0; jjj < n3; jjj++) {
+		int j = jj * n3 + jjj;
+		for (i = 0; i < n1; i++)
+			twisted2[jjj*n1+i] = plan->fft1[i*n2+j];
+	}
+}
+
+#ifdef STARPU_HAVE_FFTW
+/* fft2:
+ *
+ * Perform n3 = n2/DIV_1D ffts of size n1 */
+static void
+STARPUFFT(fft2_1d_kernel_cpu)(void *descr[], void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	/* int jj = args->jj; */
+	int workerid = starpu_worker_get_id();
+
+	task_per_worker[workerid]++;
+
+	STARPUFFT(complex) * restrict twisted2 = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[0]);
+	STARPUFFT(complex) * restrict fft2 = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	/* printf("fft2 %d %g\n", jj, (double) cabs(twisted2[plan->totsize4-1])); */
+
+	_FFTW(execute_dft)(plan->plans[workerid].plan2_cpu, twisted2, fft2);
+}
+#endif
+
+/* twist3:
+ *
+ * Spread the package of n2/DIV_1D chunks of size n1 into the output vector */
+static void
+STARPUFFT(twist3_1d_kernel_cpu)(void *descr[], void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int jj = args->jj;	/* between 0 and DIV_1D */
+	int jjj;		/* between 0 and n3 */
+	int i;
+	int n1 = plan->n1[0];
+	int n2 = plan->n2[0];
+	int n3 = n2/DIV_1D;
+
+	const STARPUFFT(complex) * restrict fft2 = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[0]);
+
+	/* printf("twist3 %d %g\n", jj, (double) cabs(fft2[0])); */
+
+	for (jjj = 0; jjj < n3; jjj++) {
+		int j = jj * n3 + jjj;
+		for (i = 0; i < n1; i++)
+			plan->out[i*n2+j] = fft2[jjj*n1+i];
+	}
+}
+
+/* Performance models for the 5 kinds of tasks */
+static struct starpu_perfmodel STARPUFFT(twist1_1d_model) = {
+	.type = STARPU_HISTORY_BASED,
+	.symbol = TYPE"twist1_1d"
+};
+
+static struct starpu_perfmodel STARPUFFT(fft1_1d_model) = {
+	.type = STARPU_HISTORY_BASED,
+	.symbol = TYPE"fft1_1d"
+};
+
+static struct starpu_perfmodel STARPUFFT(twist2_1d_model) = {
+	.type = STARPU_HISTORY_BASED,
+	.symbol = TYPE"twist2_1d"
+};
+
+static struct starpu_perfmodel STARPUFFT(fft2_1d_model) = {
+	.type = STARPU_HISTORY_BASED,
+	.symbol = TYPE"fft2_1d"
+};
+
+static struct starpu_perfmodel STARPUFFT(twist3_1d_model) = {
+	.type = STARPU_HISTORY_BASED,
+	.symbol = TYPE"twist3_1d"
+};
+
+/* codelet pointers for the 5 kinds of tasks */
+static struct starpu_codelet STARPUFFT(twist1_1d_codelet) = {
+	.where =
+#ifdef __STARPU_USE_CUDA
+		STARPU_CUDA|
+#endif
+		STARPU_CPU,
+#ifdef __STARPU_USE_CUDA
+	.cuda_funcs = {STARPUFFT(twist1_1d_kernel_gpu), NULL},
+#endif
+	.cpu_funcs = {STARPUFFT(twist1_1d_kernel_cpu), NULL},
+	CAN_EXECUTE
+	.model = &STARPUFFT(twist1_1d_model),
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_W}
+};
+
+static struct starpu_codelet STARPUFFT(fft1_1d_codelet) = {
+	.where =
+#ifdef __STARPU_USE_CUDA
+		STARPU_CUDA|
+#endif
+#ifdef STARPU_HAVE_FFTW
+		STARPU_CPU|
+#endif
+		0,
+#ifdef __STARPU_USE_CUDA
+	.cuda_funcs = {STARPUFFT(fft1_1d_kernel_gpu), NULL},
+#endif
+#ifdef STARPU_HAVE_FFTW
+	.cpu_funcs = {STARPUFFT(fft1_1d_kernel_cpu), NULL},
+#endif
+	CAN_EXECUTE
+	.model = &STARPUFFT(fft1_1d_model),
+	.nbuffers = 3,
+	.modes = {STARPU_R, STARPU_W, STARPU_R}
+};
+
+static struct starpu_codelet STARPUFFT(twist2_1d_codelet) = {
+	.where = STARPU_CPU,
+	.cpu_funcs = {STARPUFFT(twist2_1d_kernel_cpu), NULL},
+	CAN_EXECUTE
+	.model = &STARPUFFT(twist2_1d_model),
+	.nbuffers = 1,
+	.modes = {STARPU_W}
+};
+
+static struct starpu_codelet STARPUFFT(fft2_1d_codelet) = {
+	.where =
+#ifdef __STARPU_USE_CUDA
+		STARPU_CUDA|
+#endif
+#ifdef STARPU_HAVE_FFTW
+		STARPU_CPU|
+#endif
+		0,
+#ifdef __STARPU_USE_CUDA
+	.cuda_funcs = {STARPUFFT(fft2_1d_kernel_gpu), NULL},
+#endif
+#ifdef STARPU_HAVE_FFTW
+	.cpu_funcs = {STARPUFFT(fft2_1d_kernel_cpu), NULL},
+#endif
+	CAN_EXECUTE
+	.model = &STARPUFFT(fft2_1d_model),
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_W}
+};
+
+static struct starpu_codelet STARPUFFT(twist3_1d_codelet) = {
+	.where = STARPU_CPU,
+	.cpu_funcs = {STARPUFFT(twist3_1d_kernel_cpu), NULL},
+	CAN_EXECUTE
+	.model = &STARPUFFT(twist3_1d_model),
+	.nbuffers = 1,
+	.modes = {STARPU_R}
+};
+
+/*
+ *
+ * Sequential version
+ *
+ */
+
+#ifdef __STARPU_USE_CUDA
+/* Perform one fft of size n */
+static void
+STARPUFFT(fft_1d_plan_gpu)(void *args)
+{
+	STARPUFFT(plan) plan = args;
+	cufftResult cures;
+	int n = plan->n[0];
+	int workerid = starpu_worker_get_id();
+
+	cures = cufftPlan1d(&plan->plans[workerid].plan_cuda, n, _CUFFT_C2C, 1);
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+	cufftSetStream(plan->plans[workerid].plan_cuda, starpu_cuda_get_local_stream());
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+}
+
+static void
+STARPUFFT(fft_1d_kernel_gpu)(void *descr[], void *args)
+{
+	STARPUFFT(plan) plan = args;
+	cufftResult cures;
+
+	_cufftComplex * restrict in = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[0]);
+	_cufftComplex * restrict out = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	int workerid = starpu_worker_get_id();
+
+	task_per_worker[workerid]++;
+
+	cures = _cufftExecC2C(plan->plans[workerid].plan_cuda, in, out, plan->sign == -1 ? CUFFT_FORWARD : CUFFT_INVERSE);
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}
+#endif
+
+#ifdef STARPU_HAVE_FFTW
+/* Perform one fft of size n */
+static void
+STARPUFFT(fft_1d_kernel_cpu)(void *descr[], void *_args)
+{
+	STARPUFFT(plan) plan = _args;
+	int workerid = starpu_worker_get_id();
+
+	task_per_worker[workerid]++;
+
+	STARPUFFT(complex) * restrict in = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[0]);
+	STARPUFFT(complex) * restrict out = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	_FFTW(execute_dft)(plan->plans[workerid].plan_cpu, in, out);
+}
+#endif
+
+static struct starpu_perfmodel STARPUFFT(fft_1d_model) = {
+	.type = STARPU_HISTORY_BASED,
+	.symbol = TYPE"fft_1d"
+};
+
+static struct starpu_codelet STARPUFFT(fft_1d_codelet) = {
+	.where =
+#ifdef __STARPU_USE_CUDA
+		STARPU_CUDA|
+#endif
+#ifdef STARPU_HAVE_FFTW
+		STARPU_CPU|
+#endif
+		0,
+#ifdef __STARPU_USE_CUDA
+	.cuda_funcs = {STARPUFFT(fft_1d_kernel_gpu), NULL},
+#endif
+#ifdef STARPU_HAVE_FFTW
+	.cpu_funcs = {STARPUFFT(fft_1d_kernel_cpu), NULL},
+#endif
+	CAN_EXECUTE
+	.model = &STARPUFFT(fft_1d_model),
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_W}
+};
+
+/* Planning:
+ *
+ * - For each CPU worker, we need to plan the two fftw stages.
+ * - For GPU workers, we need to do the planning in the CUDA context, so we do
+ *   this lazily through the initialised1 and initialised2 flags; TODO: use
+ *   starpu_execute_on_each_worker instead (done in the omp branch).
+ * - We allocate all the temporary buffers and register them to starpu.
+ * - We create all the tasks, but do not submit them yet. It will be possible
+ *   to reuse them at will to perform several ffts with the same planning.
+ */
+STARPUFFT(plan)
+STARPUFFT(plan_dft_1d)(int n, int sign, unsigned flags)
+{
+	int workerid;
+	int n1 = DIV_1D;
+	int n2 = n / n1;
+	int n3;
+	int z;
+	struct starpu_task *task;
+
+if (PARALLEL) {
+#ifdef __STARPU_USE_CUDA
+	/* cufft 1D limited to 8M elements */
+	while (n2 > 8 << 20) {
+		n1 *= 2;
+		n2 /= 2;
+	}
+#endif
+	STARPU_ASSERT(n == n1*n2);
+	STARPU_ASSERT(n1 < (1ULL << I_BITS));
+
+	/* distribute the n2 second ffts into DIV_1D packages */
+	n3 = n2 / DIV_1D;
+	STARPU_ASSERT(n2 == n3*DIV_1D);
+}
+
+	/* TODO: flags? Automatically set FFTW_MEASURE on calibration? */
+	STARPU_ASSERT(flags == 0);
+
+	STARPUFFT(plan) plan = malloc(sizeof(*plan));
+	memset(plan, 0, sizeof(*plan));
+
+if (PARALLEL) {
+	plan->number = STARPU_ATOMIC_ADD(&starpufft_last_plan_number, 1) - 1;
+
+	/* The plan number has a limited size */
+	STARPU_ASSERT(plan->number < (1ULL << NUMBER_BITS));
+}
+
+	/* Just one dimension */
+	plan->dim = 1;
+	plan->n = malloc(plan->dim * sizeof(*plan->n));
+	plan->n[0] = n;
+
+if (PARALLEL) {
+	check_dims(plan);
+
+	plan->n1 = malloc(plan->dim * sizeof(*plan->n1));
+	plan->n1[0] = n1;
+	plan->n2 = malloc(plan->dim * sizeof(*plan->n2));
+	plan->n2[0] = n2;
+}
+
+	/* Note: this is for coherency with the 2D case */
+	plan->totsize = n;
+
+if (PARALLEL) {
+	plan->totsize1 = n1;
+	plan->totsize2 = n2;
+	plan->totsize3 = DIV_1D;
+	plan->totsize4 = plan->totsize / plan->totsize3;
+}
+	plan->type = C2C;
+	plan->sign = sign;
+
+if (PARALLEL) {
+	/* Compute the w^k just once. */
+	compute_roots(plan);
+}
+
+	/* Initialize per-worker working set */
+	for (workerid = 0; workerid < starpu_worker_get_count(); workerid++) {
+		switch (starpu_worker_get_type(workerid)) {
+		case STARPU_CPU_WORKER:
+#ifdef STARPU_HAVE_FFTW
+if (PARALLEL) {
+			/* first fft plan: one fft of size n2.
+			 * FFTW imposes that buffer pointers are known at
+			 * planning time. */
+			plan->plans[workerid].plan1_cpu = _FFTW(plan_dft_1d)(n2, NULL, (void*) 1, sign, _FFTW_FLAGS);
+			STARPU_ASSERT(plan->plans[workerid].plan1_cpu);
+
+			/* second fft plan: n3 ffts of size n1 */
+			plan->plans[workerid].plan2_cpu = _FFTW(plan_many_dft)(plan->dim,
+					plan->n1, n3,
+					NULL, NULL, 1, plan->totsize1,
+					(void*) 1, NULL, 1, plan->totsize1,
+					sign, _FFTW_FLAGS);
+			STARPU_ASSERT(plan->plans[workerid].plan2_cpu);
+} else {
+			/* fft plan: one fft of size n. */
+			plan->plans[workerid].plan_cpu = _FFTW(plan_dft_1d)(n, NULL, (void*) 1, sign, _FFTW_FLAGS);
+			STARPU_ASSERT(plan->plans[workerid].plan_cpu);
+}
+#else
+/* #warning libstarpufft can not work correctly if libfftw3 is not installed */
+#endif
+			break;
+		case STARPU_CUDA_WORKER:
+			break;
+		default:
+			/* Do not care, we won't be executing anything there. */
+			break;
+		}
+	}
+#ifdef __STARPU_USE_CUDA
+if (PARALLEL) {
+	starpu_execute_on_each_worker(STARPUFFT(fft1_1d_plan_gpu), plan, STARPU_CUDA);
+	starpu_execute_on_each_worker(STARPUFFT(fft2_1d_plan_gpu), plan, STARPU_CUDA);
+} else {
+	starpu_execute_on_each_worker(STARPUFFT(fft_1d_plan_gpu), plan, STARPU_CUDA);
+}
+#endif
+
+if (PARALLEL) {
+	/* Allocate buffers. */
+	plan->twisted1 = STARPUFFT(malloc)(plan->totsize * sizeof(*plan->twisted1));
+	memset(plan->twisted1, 0, plan->totsize * sizeof(*plan->twisted1));
+	plan->fft1 = STARPUFFT(malloc)(plan->totsize * sizeof(*plan->fft1));
+	memset(plan->fft1, 0, plan->totsize * sizeof(*plan->fft1));
+	plan->twisted2 = STARPUFFT(malloc)(plan->totsize * sizeof(*plan->twisted2));
+	memset(plan->twisted2, 0, plan->totsize * sizeof(*plan->twisted2));
+	plan->fft2 = STARPUFFT(malloc)(plan->totsize * sizeof(*plan->fft2));
+	memset(plan->fft2, 0, plan->totsize * sizeof(*plan->fft2));
+
+	/* Allocate handle arrays */
+	plan->twisted1_handle = malloc(plan->totsize1 * sizeof(*plan->twisted1_handle));
+	plan->fft1_handle = malloc(plan->totsize1 * sizeof(*plan->fft1_handle));
+	plan->twisted2_handle = malloc(plan->totsize3 * sizeof(*plan->twisted2_handle));
+	plan->fft2_handle = malloc(plan->totsize3 * sizeof(*plan->fft2_handle));
+
+	/* Allocate task arrays */
+	plan->twist1_tasks = malloc(plan->totsize1 * sizeof(*plan->twist1_tasks));
+	plan->fft1_tasks = malloc(plan->totsize1 * sizeof(*plan->fft1_tasks));
+	plan->twist2_tasks = malloc(plan->totsize3 * sizeof(*plan->twist2_tasks));
+	plan->fft2_tasks = malloc(plan->totsize3 * sizeof(*plan->fft2_tasks));
+	plan->twist3_tasks = malloc(plan->totsize3 * sizeof(*plan->twist3_tasks));
+
+	/* Allocate codelet argument arrays */
+	plan->fft1_args = malloc(plan->totsize1 * sizeof(*plan->fft1_args));
+	plan->fft2_args = malloc(plan->totsize3 * sizeof(*plan->fft2_args));
+
+	/* Create first-round tasks: DIV_1D tasks of type twist1 and fft1 */
+	for (z = 0; z < plan->totsize1; z++) {
+		int i = z;
+#define STEP_TAG(step)	STEP_TAG_1D(plan, step, i)
+
+		/* TODO: get rid of tags */
+
+		plan->fft1_args[z].plan = plan;
+		plan->fft1_args[z].i = i;
+
+		/* Register the twisted1 buffer of size n2. */
+		starpu_vector_data_register(&plan->twisted1_handle[z], 0, (uintptr_t) &plan->twisted1[z*plan->totsize2], plan->totsize2, sizeof(*plan->twisted1));
+		/* Register the fft1 buffer of size n2. */
+		starpu_vector_data_register(&plan->fft1_handle[z], 0, (uintptr_t) &plan->fft1[z*plan->totsize2], plan->totsize2, sizeof(*plan->fft1));
+
+		/* We'll need the result of fft1 on the CPU for the second
+		 * twist anyway, so tell starpu to not keep the fft1 buffer in
+		 * the GPU. */
+		starpu_data_set_wt_mask(plan->fft1_handle[z], 1<<0);
+
+		/* Create twist1 task */
+		plan->twist1_tasks[z] = task = starpu_task_create();
+		task->cl = &STARPUFFT(twist1_1d_codelet);
+		/* task->handles[0] = to be filled at execution to point
+		   to the application input. */
+		task->handles[1] = plan->twisted1_handle[z];
+		task->cl_arg = &plan->fft1_args[z];
+		task->tag_id = STEP_TAG(TWIST1);
+		task->use_tag = 1;
+		task->destroy = 0;
+
+		/* Tell that fft1 depends on twisted1 */
+		starpu_tag_declare_deps(STEP_TAG(FFT1),
+				1, STEP_TAG(TWIST1));
+
+		/* Create FFT1 task */
+		plan->fft1_tasks[z] = task = starpu_task_create();
+		task->cl = &STARPUFFT(fft1_1d_codelet);
+		task->handles[0] = plan->twisted1_handle[z];
+		task->handles[1] = plan->fft1_handle[z];
+		task->handles[2] = plan->roots_handle[0];
+		task->cl_arg = &plan->fft1_args[z];
+		task->tag_id = STEP_TAG(FFT1);
+		task->use_tag = 1;
+		task->destroy = 0;
+
+		/* Tell that the join task will depend on the fft1 task. */
+		starpu_tag_declare_deps(STEP_TAG_1D(plan, JOIN, 0),
+				1, STEP_TAG(FFT1));
+#undef STEP_TAG
+	}
+
+	/* Create the join task, only serving as a dependency point between
+	 * fft1 and twist2 tasks */
+	plan->join_task = task = starpu_task_create();
+	task->cl = NULL;
+	task->tag_id = STEP_TAG_1D(plan, JOIN, 0);
+	task->use_tag = 1;
+	task->destroy = 0;
+
+	/* Create second-round tasks: DIV_1D batches of n2/DIV_1D twist2, fft2,
+	 * and twist3 */
+	for (z = 0; z < plan->totsize3; z++) {
+		int jj = z;
+#define STEP_TAG(step)	STEP_TAG_1D(plan, step, jj)
+
+		plan->fft2_args[z].plan = plan;
+		plan->fft2_args[z].jj = jj;
+
+		/* Register n3 twisted2 buffers of size n1 */
+		starpu_vector_data_register(&plan->twisted2_handle[z], 0, (uintptr_t) &plan->twisted2[z*plan->totsize4], plan->totsize4, sizeof(*plan->twisted2));
+		starpu_vector_data_register(&plan->fft2_handle[z], 0, (uintptr_t) &plan->fft2[z*plan->totsize4], plan->totsize4, sizeof(*plan->fft2));
+
+		/* We'll need the result of fft2 on the CPU for the third
+		 * twist anyway, so tell starpu to not keep the fft2 buffer in
+		 * the GPU. */
+		starpu_data_set_wt_mask(plan->fft2_handle[z], 1<<0);
+
+		/* Tell that twisted2 depends on the join task */
+		starpu_tag_declare_deps(STEP_TAG(TWIST2),
+				1, STEP_TAG_1D(plan, JOIN, 0));
+
+		/* Create twist2 task */
+		plan->twist2_tasks[z] = task = starpu_task_create();
+		task->cl = &STARPUFFT(twist2_1d_codelet);
+		task->handles[0] = plan->twisted2_handle[z];
+		task->cl_arg = &plan->fft2_args[z];
+		task->tag_id = STEP_TAG(TWIST2);
+		task->use_tag = 1;
+		task->destroy = 0;
+
+		/* Tell that fft2 depends on twisted2 */
+		starpu_tag_declare_deps(STEP_TAG(FFT2),
+				1, STEP_TAG(TWIST2));
+
+		/* Create FFT2 task */
+		plan->fft2_tasks[z] = task = starpu_task_create();
+		task->cl = &STARPUFFT(fft2_1d_codelet);
+		task->handles[0] = plan->twisted2_handle[z];
+		task->handles[1] = plan->fft2_handle[z];
+		task->cl_arg = &plan->fft2_args[z];
+		task->tag_id = STEP_TAG(FFT2);
+		task->use_tag = 1;
+		task->destroy = 0;
+
+		/* Tell that twist3 depends on fft2 */
+		starpu_tag_declare_deps(STEP_TAG(TWIST3),
+				1, STEP_TAG(FFT2));
+
+		/* Create twist3 tasks */
+		/* These run only on CPUs and thus write directly into the
+		 * application output buffer. */
+		plan->twist3_tasks[z] = task = starpu_task_create();
+		task->cl = &STARPUFFT(twist3_1d_codelet);
+		task->handles[0] = plan->fft2_handle[z];
+		task->cl_arg = &plan->fft2_args[z];
+		task->tag_id = STEP_TAG(TWIST3);
+		task->use_tag = 1;
+		task->destroy = 0;
+
+		/* Tell that to be completely finished we need to have finished
+		 * this twist3 task */
+		starpu_tag_declare_deps(STEP_TAG_1D(plan, END, 0),
+				1, STEP_TAG(TWIST3));
+#undef STEP_TAG
+	}
+
+	/* Create end task, only serving as a join point. */
+	plan->end_task = task = starpu_task_create();
+	task->cl = NULL;
+	task->tag_id = STEP_TAG_1D(plan, END, 0);
+	task->use_tag = 1;
+	task->destroy = 0;
+
+}
+
+	return plan;
+}
+
+/* Actually submit all the tasks. */
+static struct starpu_task *
+STARPUFFT(start1dC2C)(STARPUFFT(plan) plan, starpu_data_handle_t in, starpu_data_handle_t out)
+{
+	STARPU_ASSERT(plan->type == C2C);
+	int z;
+	int ret;
+
+if (PARALLEL) {
+	for (z=0; z < plan->totsize1; z++) {
+		ret = starpu_task_submit(plan->twist1_tasks[z]);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+		ret = starpu_task_submit(plan->fft1_tasks[z]);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	ret = starpu_task_submit(plan->join_task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+	for (z=0; z < plan->totsize3; z++) {
+		ret = starpu_task_submit(plan->twist2_tasks[z]);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+		ret = starpu_task_submit(plan->fft2_tasks[z]);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+		ret = starpu_task_submit(plan->twist3_tasks[z]);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	ret = starpu_task_submit(plan->end_task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+	return plan->end_task;
+} else /* !PARALLEL */ {
+	struct starpu_task *task;
+
+	/* Create FFT task */
+	task = starpu_task_create();
+	task->detach = 0;
+	task->cl = &STARPUFFT(fft_1d_codelet);
+	task->handles[0] = in;
+	task->handles[1] = out;
+	task->cl_arg = plan;
+
+	ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	return task;
+}
+}
+
+/* Free all the tags. The generic code handles freeing the buffers. */
+static void
+STARPUFFT(free_1d_tags)(STARPUFFT(plan) plan)
+{
+	unsigned i;
+	int n1 = plan->n1[0];
+
+	if (!PARALLEL)
+		return;
+
+	for (i = 0; i < n1; i++) {
+		starpu_tag_remove(STEP_TAG_1D(plan, TWIST1, i));
+		starpu_tag_remove(STEP_TAG_1D(plan, FFT1, i));
+	}
+
+	starpu_tag_remove(STEP_TAG_1D(plan, JOIN, 0));
+
+	for (i = 0; i < DIV_1D; i++) {
+		starpu_tag_remove(STEP_TAG_1D(plan, TWIST2, i));
+		starpu_tag_remove(STEP_TAG_1D(plan, FFT2, i));
+		starpu_tag_remove(STEP_TAG_1D(plan, TWIST3, i));
+	}
+
+	starpu_tag_remove(STEP_TAG_1D(plan, END, 0));
+}
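
Note: the task graph built above stays hidden behind the plan/execute interface declared in starpufft/starpufft.h. As a rough usage sketch for the double-precision 1D case (entry-point names assumed from that header's STARPUFFT() naming convention, i.e. starpufft_plan_dft_1d, starpufft_execute, starpufft_destroy_plan and starpufft_malloc/starpufft_free, as exercised by starpufft/examples/testx.c), an application would do something like:

#include <complex.h>
#include <starpu.h>
#include <starpufft.h>

int main(void)
{
	int i, n = 1 << 16;	/* divisible by DIV_1D, so the parallel path applies */

	if (starpu_init(NULL) != 0)
		return 1;

	/* Allocate through starpufft so the buffers can be pinned for GPU transfers. */
	double complex *in = starpufft_malloc(n * sizeof(*in));
	double complex *out = starpufft_malloc(n * sizeof(*out));
	for (i = 0; i < n; i++)
		in[i] = i;

	/* sign == -1 requests the forward transform, the same convention as FFTW/cuFFT above. */
	starpufft_plan plan = starpufft_plan_dft_1d(n, -1, 0);
	starpufft_execute(plan, in, out);	/* runs the twist/fft task graph built above */

	starpufft_destroy_plan(plan);
	starpufft_free(in);
	starpufft_free(out);
	starpu_shutdown();
	return 0;
}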

+ 850 - 0
starpufft/starpufftx2d.c

@@ -0,0 +1,850 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#define DIV_2D_N 8
+#define DIV_2D_M 8
+
+#define I_SHIFT (I_BITS/2)
+#define J_BITS I_SHIFT
+
+#define STEP_TAG_2D(plan, step, i, j) _STEP_TAG(plan, step, ((starpu_tag_t) i << I_SHIFT) | (starpu_tag_t) j)
+
+#ifdef __STARPU_USE_CUDA
+/* Twist the full vector into an (n2,m2) chunk */
+static void
+STARPUFFT(twist1_2d_kernel_gpu)(void *descr[], void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int i = args->i;
+	int j = args->j;
+	int n1 = plan->n1[0];
+	int n2 = plan->n2[0];
+	int m1 = plan->n1[1];
+	int m2 = plan->n2[1];
+
+	_cufftComplex * restrict in = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[0]);
+	_cufftComplex * restrict twisted1 = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	STARPUFFT(cuda_twist1_2d_host)(in, twisted1, i, j, n1, n2, m1, m2);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}
+
+/* fft1:
+ *
+ * Perform one fft of size n2,m2 */
+static void
+STARPUFFT(fft1_2d_plan_gpu)(void *args)
+{
+	STARPUFFT(plan) plan = args;
+	int n2 = plan->n2[0];
+	int m2 = plan->n2[1];
+	int workerid = starpu_worker_get_id();
+	cufftResult cures;
+
+	cures = cufftPlan2d(&plan->plans[workerid].plan1_cuda, n2, m2, _CUFFT_C2C);
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+	cures = cufftSetStream(plan->plans[workerid].plan1_cuda, starpu_cuda_get_local_stream());
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+}
+
+static void
+STARPUFFT(fft1_2d_kernel_gpu)(void *descr[], void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int i = args->i;
+	int j = args->j;
+	int n2 = plan->n2[0];
+	int m2 = plan->n2[1];
+	cufftResult cures;
+
+	_cufftComplex * restrict in = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[0]);
+	_cufftComplex * restrict out = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[1]);
+	const _cufftComplex * restrict roots0 = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[2]);
+	const _cufftComplex * restrict roots1 = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[3]);
+
+	int workerid = starpu_worker_get_id();
+
+	task_per_worker[workerid]++;
+
+	cures = _cufftExecC2C(plan->plans[workerid].plan1_cuda, in, out, plan->sign == -1 ? CUFFT_FORWARD : CUFFT_INVERSE);
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+
+	/* synchronization is done after the twiddling */
+	STARPUFFT(cuda_twiddle_2d_host)(out, roots0, roots1, n2, m2, i, j);
+
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}
+
+/* fft2:
+ *
+ * Perform n3*m3 ffts of size n1,m1 */
+static void
+STARPUFFT(fft2_2d_plan_gpu)(void *args)
+{
+	STARPUFFT(plan) plan = args;
+	int n1 = plan->n1[0];
+	int m1 = plan->n1[1];
+	cufftResult cures;
+	int workerid = starpu_worker_get_id();
+
+	cures = cufftPlan2d(&plan->plans[workerid].plan2_cuda, n1, m1, _CUFFT_C2C);
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+	cures = cufftSetStream(plan->plans[workerid].plan2_cuda, starpu_cuda_get_local_stream());
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+}
+
+static void
+STARPUFFT(fft2_2d_kernel_gpu)(void *descr[], void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int n1 = plan->n1[0];
+	int n2 = plan->n2[0];
+	int m1 = plan->n1[1];
+	int m2 = plan->n2[1];
+	int n3 = n2/DIV_2D_N;
+	int m3 = m2/DIV_2D_M;
+	int n;
+	cufftResult cures;
+
+	_cufftComplex * restrict in = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[0]);
+	_cufftComplex * restrict out = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	int workerid = starpu_worker_get_id();
+
+	task_per_worker[workerid]++;
+
+	for (n = 0; n < n3*m3; n++) {
+		cures = _cufftExecC2C(plan->plans[workerid].plan2_cuda, in + n * n1*m1, out + n * n1*m1, plan->sign == -1 ? CUFFT_FORWARD : CUFFT_INVERSE);
+		STARPU_ASSERT(cures == CUFFT_SUCCESS);
+	}
+
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}
+#endif
+
+/* Twist the full vector into an (n2,m2) chunk */
+static void
+STARPUFFT(twist1_2d_kernel_cpu)(void *descr[], void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int i = args->i;
+	int j = args->j;
+	int k, l;
+	int n1 = plan->n1[0];
+	int n2 = plan->n2[0];
+	int m1 = plan->n1[1];
+	int m2 = plan->n2[1];
+	int m = plan->n[1];
+
+	STARPUFFT(complex) * restrict in = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[0]);
+	STARPUFFT(complex) * restrict twisted1 = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	/* printf("twist1 %d %d %g\n", i, j, (double) cabs(plan->in[i+j])); */
+
+	for (k = 0; k < n2; k++)
+		for (l = 0; l < m2; l++)
+			twisted1[k*m2+l] = in[i*m+j+k*m*n1+l*m1];
+}
+
+#ifdef STARPU_HAVE_FFTW
+/* Perform an n2,m2 fft */
+static void
+STARPUFFT(fft1_2d_kernel_cpu)(void *descr[], void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int i = args->i;
+	int j = args->j;
+	int k, l;
+	int n2 = plan->n2[0];
+	int m2 = plan->n2[1];
+	int workerid = starpu_worker_get_id();
+
+	task_per_worker[workerid]++;
+
+	STARPUFFT(complex) *twisted1 = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[0]);
+	STARPUFFT(complex) *fft1 = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	/* printf("fft1 %d %d %g\n", i, j, (double) cabs(twisted1[0])); */
+
+	_FFTW(execute_dft)(plan->plans[workerid].plan1_cpu, twisted1, fft1);
+	for (k = 0; k < n2; k++)
+		for (l = 0; l < m2; l++)
+			fft1[k*m2 + l] = fft1[k*m2 + l] * plan->roots[0][i*k] * plan->roots[1][j*l];
+}
+#endif
+
+/* Twist the full vector into a package of n2/DIV_2D_N,m2/DIV_2D_M (n1,m1) chunks */
+static void
+STARPUFFT(twist2_2d_kernel_cpu)(void *descr[], void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int kk = args->kk;	/* between 0 and DIV_2D_N */
+	int ll = args->ll;	/* between 0 and DIV_2D_M */
+	int kkk, lll;		/* between 0,0 and n3,m3 */
+	int i, j;
+	int n1 = plan->n1[0];
+	int n2 = plan->n2[0];
+	int m1 = plan->n1[1];
+	int m2 = plan->n2[1];
+	int n3 = n2/DIV_2D_N;
+	int m3 = m2/DIV_2D_M;
+
+	STARPUFFT(complex) * restrict twisted2 = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[0]);
+
+	/* printf("twist2 %d %d %g\n", kk, ll, (double) cabs(plan->fft1[kk+ll])); */
+
+	for (kkk = 0; kkk < n3; kkk++) {
+		int k = kk * n3 + kkk;
+		for (lll = 0; lll < m3; lll++) {
+			int l = ll * m3 + lll;
+			for (i = 0; i < n1; i++)
+				for (j = 0; j < m1; j++)
+					twisted2[kkk*m3*n1*m1+lll*n1*m1+i*m1+j] = plan->fft1[i*n1*n2*m2+j*n2*m2+k*m2+l];
+		}
+	}
+}
+
+#ifdef STARPU_HAVE_FFTW
+/* Perform (n2/DIV_2D_N)*(m2/DIV_2D_M) (n1,m1) ffts */
+static void
+STARPUFFT(fft2_2d_kernel_cpu)(void *descr[], void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	/* int kk = args->kk; */
+	/* int ll = args->ll; */
+	int workerid = starpu_worker_get_id();
+
+	task_per_worker[workerid]++;
+
+	STARPUFFT(complex) *twisted2 = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[0]);
+	STARPUFFT(complex) *fft2 = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	/* printf("fft2 %d %d %g\n", kk, ll, (double) cabs(twisted2[plan->totsize4-1])); */
+
+	_FFTW(execute_dft)(plan->plans[workerid].plan2_cpu, twisted2, fft2);
+}
+#endif
+
+/* Spread the package of (n2/DIV_2D_N)*(m2/DIV_2D_M) (n1,m1) chunks into the full vector */
+static void
+STARPUFFT(twist3_2d_kernel_cpu)(void *descr[], void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int kk = args->kk;	/* between 0 and DIV_2D_N */
+	int ll = args->ll;	/* between 0 and DIV_2D_M */
+	int kkk, lll;		/* between 0,0 and n3,m3 */
+	int i, j;
+	int n1 = plan->n1[0];
+	int n2 = plan->n2[0];
+	int m1 = plan->n1[1];
+	int m2 = plan->n2[1];
+	int n3 = n2/DIV_2D_N;
+	int m3 = m2/DIV_2D_M;
+	int m = plan->n[1];
+
+	const STARPUFFT(complex) * restrict fft2 = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[0]);
+
+	/* printf("twist3 %d %d %g\n", kk, ll, (double) cabs(fft2[0])); */
+
+	for (kkk = 0; kkk < n3; kkk++) {
+		int k = kk * n3 + kkk;
+		for (lll = 0; lll < m3; lll++) {
+			int l = ll * m3 + lll;
+			for (i = 0; i < n1; i++)
+				for (j = 0; j < m1; j++)
+					plan->out[i*n2*m+j*m2+k*m+l] = fft2[kkk*m3*n1*m1+lll*n1*m1+i*m1+j];
+		}
+	}
+}
+
+struct starpu_perfmodel STARPUFFT(twist1_2d_model) = {
+	.type = STARPU_HISTORY_BASED,
+	.symbol = TYPE"twist1_2d"
+};
+
+struct starpu_perfmodel STARPUFFT(fft1_2d_model) = {
+	.type = STARPU_HISTORY_BASED,
+	.symbol = TYPE"fft1_2d"
+};
+
+struct starpu_perfmodel STARPUFFT(twist2_2d_model) = {
+	.type = STARPU_HISTORY_BASED,
+	.symbol = TYPE"twist2_2d"
+};
+
+struct starpu_perfmodel STARPUFFT(fft2_2d_model) = {
+	.type = STARPU_HISTORY_BASED,
+	.symbol = TYPE"fft2_2d"
+};
+
+struct starpu_perfmodel STARPUFFT(twist3_2d_model) = {
+	.type = STARPU_HISTORY_BASED,
+	.symbol = TYPE"twist3_2d"
+};
+
+static struct starpu_codelet STARPUFFT(twist1_2d_codelet) = {
+	.where =
+#ifdef __STARPU_USE_CUDA
+		STARPU_CUDA|
+#endif
+		STARPU_CPU,
+#ifdef __STARPU_USE_CUDA
+	.cuda_funcs = {STARPUFFT(twist1_2d_kernel_gpu), NULL},
+#endif
+	.cpu_funcs = {STARPUFFT(twist1_2d_kernel_cpu), NULL},
+	CAN_EXECUTE
+	.model = &STARPUFFT(twist1_2d_model),
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_W}
+};
+
+static struct starpu_codelet STARPUFFT(fft1_2d_codelet) = {
+	.where =
+#ifdef __STARPU_USE_CUDA
+		STARPU_CUDA|
+#endif
+#ifdef STARPU_HAVE_FFTW
+		STARPU_CPU|
+#endif
+		0,
+#ifdef __STARPU_USE_CUDA
+	.cuda_funcs = {STARPUFFT(fft1_2d_kernel_gpu), NULL},
+#endif
+#ifdef STARPU_HAVE_FFTW
+	.cpu_funcs = {STARPUFFT(fft1_2d_kernel_cpu), NULL},
+#endif
+	CAN_EXECUTE
+	.model = &STARPUFFT(fft1_2d_model),
+	.nbuffers = 4,
+	.modes = {STARPU_R, STARPU_W, STARPU_R, STARPU_R}
+};
+
+static struct starpu_codelet STARPUFFT(twist2_2d_codelet) = {
+	.where = STARPU_CPU,
+	.cpu_funcs = {STARPUFFT(twist2_2d_kernel_cpu), NULL},
+	CAN_EXECUTE
+	.model = &STARPUFFT(twist2_2d_model),
+	.nbuffers = 1,
+	.modes = {STARPU_W}
+};
+
+static struct starpu_codelet STARPUFFT(fft2_2d_codelet) = {
+	.where =
+#ifdef __STARPU_USE_CUDA
+		STARPU_CUDA|
+#endif
+#ifdef STARPU_HAVE_FFTW
+		STARPU_CPU|
+#endif
+		0,
+#ifdef __STARPU_USE_CUDA
+	.cuda_funcs = {STARPUFFT(fft2_2d_kernel_gpu), NULL},
+#endif
+#ifdef STARPU_HAVE_FFTW
+	.cpu_funcs = {STARPUFFT(fft2_2d_kernel_cpu), NULL},
+#endif
+	CAN_EXECUTE
+	.model = &STARPUFFT(fft2_2d_model),
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_W}
+};
+
+static struct starpu_codelet STARPUFFT(twist3_2d_codelet) = {
+	.where = STARPU_CPU,
+	.cpu_funcs = {STARPUFFT(twist3_2d_kernel_cpu), NULL},
+	CAN_EXECUTE
+	.model = &STARPUFFT(twist3_2d_model),
+	.nbuffers = 1,
+	.modes = {STARPU_R}
+};
+
+/*
+ *
+ * Sequential version
+ *
+ */
+
+#ifdef __STARPU_USE_CUDA
+/* Perform one fft of size n,m */
+static void
+STARPUFFT(fft_2d_plan_gpu)(void *args)
+{
+	STARPUFFT(plan) plan = args;
+	cufftResult cures;
+	int n = plan->n[0];
+	int m = plan->n[1];
+	int workerid = starpu_worker_get_id();
+
+	cures = cufftPlan2d(&plan->plans[workerid].plan_cuda, n, m, _CUFFT_C2C);
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+	cures = cufftSetStream(plan->plans[workerid].plan_cuda, starpu_cuda_get_local_stream());
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+}
+
+static void
+STARPUFFT(fft_2d_kernel_gpu)(void *descr[], void *args)
+{
+	STARPUFFT(plan) plan = args;
+	cufftResult cures;
+
+	_cufftComplex * restrict in = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[0]);
+	_cufftComplex * restrict out = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	int workerid = starpu_worker_get_id();
+
+	task_per_worker[workerid]++;
+
+	cures = _cufftExecC2C(plan->plans[workerid].plan_cuda, in, out, plan->sign == -1 ? CUFFT_FORWARD : CUFFT_INVERSE);
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}
+#endif
+
+#ifdef STARPU_HAVE_FFTW
+/* Perform one fft of size n,m */
+static void
+STARPUFFT(fft_2d_kernel_cpu)(void *descr[], void *_args)
+{
+	STARPUFFT(plan) plan = _args;
+	int workerid = starpu_worker_get_id();
+
+	task_per_worker[workerid]++;
+
+	STARPUFFT(complex) * restrict in = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[0]);
+	STARPUFFT(complex) * restrict out = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	_FFTW(execute_dft)(plan->plans[workerid].plan_cpu, in, out);
+}
+#endif
+
+static struct starpu_perfmodel STARPUFFT(fft_2d_model) = {
+	.type = STARPU_HISTORY_BASED,
+	.symbol = TYPE"fft_2d"
+};
+
+static struct starpu_codelet STARPUFFT(fft_2d_codelet) = {
+	.where =
+#ifdef __STARPU_USE_CUDA
+		STARPU_CUDA|
+#endif
+#ifdef STARPU_HAVE_FFTW
+		STARPU_CPU|
+#endif
+		0,
+#ifdef __STARPU_USE_CUDA
+	.cuda_funcs = {STARPUFFT(fft_2d_kernel_gpu), NULL},
+#endif
+#ifdef STARPU_HAVE_FFTW
+	.cpu_funcs = {STARPUFFT(fft_2d_kernel_cpu), NULL},
+#endif
+	CAN_EXECUTE
+	.model = &STARPUFFT(fft_2d_model),
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_W}
+};
+
+STARPUFFT(plan)
+STARPUFFT(plan_dft_2d)(int n, int m, int sign, unsigned flags)
+{
+	int workerid;
+	int n1 = DIV_2D_N;
+	int n2 = n / n1;
+	int n3;
+	int m1 = DIV_2D_M;
+	int m2 = m / m1;
+	int m3;
+	int z;
+	struct starpu_task *task;
+
+if (PARALLEL) {
+	/*
+	 * Simple strategy:
+	 *
+	 * - twist1: twist input in n1*m1 (n2,m2) chunks
+	 * - fft1:   perform n1*m1 (n2,m2) ffts
+	 * - twist2: twist into n2*m2 (n1,m1) chunks distributed in
+	 *           DIV_2D_N*DIV_2D_M groups
+	 * - fft2:   perform DIV_2D_N*DIV_2D_M times n3*m3 (n1,m1) ffts
+	 * - twist3: twist back into output
+	 */
+
+#ifdef __STARPU_USE_CUDA
+	/* cufft 2D-3D limited to [2,16384] */
+	while (n2 > 16384) {
+		n1 *= 2;
+		n2 /= 2;
+	}
+#endif
+	STARPU_ASSERT(n == n1*n2);
+	STARPU_ASSERT(n1 < (1ULL << J_BITS));
+
+
+#ifdef __STARPU_USE_CUDA
+	/* cufft 2D-3D limited to [2,16384] */
+	while (m2 > 16384) {
+		m1 *= 2;
+		m2 /= 2;
+	}
+#endif
+	STARPU_ASSERT(m == m1*m2);
+	STARPU_ASSERT(m1 < (1ULL << J_BITS));
+
+	/* distribute the n2*m2 second ffts into DIV_2D_N*DIV_2D_M packages */
+	n3 = n2 / DIV_2D_N;
+	STARPU_ASSERT(n2 == n3*DIV_2D_N);
+	m3 = m2 / DIV_2D_M;
+	STARPU_ASSERT(m2 == m3*DIV_2D_M);
+}
+
+	/* TODO: flags? Automatically set FFTW_MEASURE on calibration? */
+	STARPU_ASSERT(flags == 0);
+
+	STARPUFFT(plan) plan = malloc(sizeof(*plan));
+	memset(plan, 0, sizeof(*plan));
+
+if (PARALLEL) {
+	plan->number = STARPU_ATOMIC_ADD(&starpufft_last_plan_number, 1) - 1;
+
+	/* 4bit limitation in the tag space */
+	STARPU_ASSERT(plan->number < (1ULL << NUMBER_BITS));
+}
+
+	plan->dim = 2;
+	plan->n = malloc(plan->dim * sizeof(*plan->n));
+	plan->n[0] = n;
+	plan->n[1] = m;
+
+if (PARALLEL) {
+	check_dims(plan);
+
+	plan->n1 = malloc(plan->dim * sizeof(*plan->n1));
+	plan->n1[0] = n1;
+	plan->n1[1] = m1;
+	plan->n2 = malloc(plan->dim * sizeof(*plan->n2));
+	plan->n2[0] = n2;
+	plan->n2[1] = m2;
+}
+
+	plan->totsize = n * m;
+
+if (PARALLEL) {
+	plan->totsize1 = n1 * m1;
+	plan->totsize2 = n2 * m2;
+	plan->totsize3 = DIV_2D_N * DIV_2D_M;
+	plan->totsize4 = plan->totsize / plan->totsize3;
+}
+	plan->type = C2C;
+	plan->sign = sign;
+
+if (PARALLEL) {
+	/* Compute the w^k just once. */
+	compute_roots(plan);
+}
+
+	/* Initialize per-worker working set */
+	for (workerid = 0; workerid < starpu_worker_get_count(); workerid++) {
+		switch (starpu_worker_get_type(workerid)) {
+		case STARPU_CPU_WORKER:
+#ifdef STARPU_HAVE_FFTW
+if (PARALLEL) {
+			/* first fft plan: one n2*m2 fft */
+			plan->plans[workerid].plan1_cpu = _FFTW(plan_dft_2d)(n2, m2, NULL, (void*) 1, sign, _FFTW_FLAGS);
+			STARPU_ASSERT(plan->plans[workerid].plan1_cpu);
+
+			/* second fft plan: n3*m3 n1*m1 ffts */
+			plan->plans[workerid].plan2_cpu = _FFTW(plan_many_dft)(plan->dim,
+					plan->n1, n3*m3,
+					NULL, NULL, 1, plan->totsize1,
+					(void*) 1, NULL, 1, plan->totsize1,
+					sign, _FFTW_FLAGS);
+			STARPU_ASSERT(plan->plans[workerid].plan2_cpu);
+} else {
+			/* fft plan: one fft of size n, m. */
+			plan->plans[workerid].plan_cpu = _FFTW(plan_dft_2d)(n, m, NULL, (void*) 1, sign, _FFTW_FLAGS);
+			STARPU_ASSERT(plan->plans[workerid].plan_cpu);
+}
+#else
+/* #warning libstarpufft cannot work correctly if libfftw3 is not installed */
+#endif
+			break;
+		case STARPU_CUDA_WORKER:
+			break;
+		default:
+			/* Do not care, we won't be executing anything there. */
+			break;
+		}
+	}
+#ifdef __STARPU_USE_CUDA
+if (PARALLEL) {
+	starpu_execute_on_each_worker(STARPUFFT(fft1_2d_plan_gpu), plan, STARPU_CUDA);
+	starpu_execute_on_each_worker(STARPUFFT(fft2_2d_plan_gpu), plan, STARPU_CUDA);
+} else {
+	starpu_execute_on_each_worker(STARPUFFT(fft_2d_plan_gpu), plan, STARPU_CUDA);
+}
+#endif
+
+if (PARALLEL) {
+	/* Allocate buffers. */
+	plan->twisted1 = STARPUFFT(malloc)(plan->totsize * sizeof(*plan->twisted1));
+	memset(plan->twisted1, 0, plan->totsize * sizeof(*plan->twisted1));
+	plan->fft1 = STARPUFFT(malloc)(plan->totsize * sizeof(*plan->fft1));
+	memset(plan->fft1, 0, plan->totsize * sizeof(*plan->fft1));
+	plan->twisted2 = STARPUFFT(malloc)(plan->totsize * sizeof(*plan->twisted2));
+	memset(plan->twisted2, 0, plan->totsize * sizeof(*plan->twisted2));
+	plan->fft2 = STARPUFFT(malloc)(plan->totsize * sizeof(*plan->fft2));
+	memset(plan->fft2, 0, plan->totsize * sizeof(*plan->fft2));
+
+	/* Allocate handle arrays */
+	plan->twisted1_handle = malloc(plan->totsize1 * sizeof(*plan->twisted1_handle));
+	plan->fft1_handle = malloc(plan->totsize1 * sizeof(*plan->fft1_handle));
+	plan->twisted2_handle = malloc(plan->totsize3 * sizeof(*plan->twisted2_handle));
+	plan->fft2_handle = malloc(plan->totsize3 * sizeof(*plan->fft2_handle));
+
+	/* Allocate task arrays */
+	plan->twist1_tasks = malloc(plan->totsize1 * sizeof(*plan->twist1_tasks));
+	plan->fft1_tasks = malloc(plan->totsize1 * sizeof(*plan->fft1_tasks));
+	plan->twist2_tasks = malloc(plan->totsize3 * sizeof(*plan->twist2_tasks));
+	plan->fft2_tasks = malloc(plan->totsize3 * sizeof(*plan->fft2_tasks));
+	plan->twist3_tasks = malloc(plan->totsize3 * sizeof(*plan->twist3_tasks));
+
+	/* Allocate codelet argument arrays */
+	plan->fft1_args = malloc(plan->totsize1 * sizeof(*plan->fft1_args));
+	plan->fft2_args = malloc(plan->totsize3 * sizeof(*plan->fft2_args));
+
+	/* Create first-round tasks */
+	for (z = 0; z < plan->totsize1; z++) {
+		int i = z / m1, j = z % m1;
+#define STEP_TAG(step)	STEP_TAG_2D(plan, step, i, j)
+
+		/* TODO: get rid of tags */
+
+		plan->fft1_args[z].plan = plan;
+		plan->fft1_args[z].i = i;
+		plan->fft1_args[z].j = j;
+
+		/* Register (n2,m2) chunks */
+		starpu_vector_data_register(&plan->twisted1_handle[z], 0, (uintptr_t) &plan->twisted1[z*plan->totsize2], plan->totsize2, sizeof(*plan->twisted1));
+		starpu_vector_data_register(&plan->fft1_handle[z], 0, (uintptr_t) &plan->fft1[z*plan->totsize2], plan->totsize2, sizeof(*plan->fft1));
+
+		/* We'll need it on the CPU for the second twist anyway */
+		starpu_data_set_wt_mask(plan->fft1_handle[z], 1<<0);
+
+		/* Create twist1 task */
+		plan->twist1_tasks[z] = task = starpu_task_create();
+		task->cl = &STARPUFFT(twist1_2d_codelet);
+		/* task->handles[0] = to be filled at execution */
+		task->handles[1] = plan->twisted1_handle[z];
+		task->cl_arg = &plan->fft1_args[z];
+		task->tag_id = STEP_TAG(TWIST1);
+		task->use_tag = 1;
+		task->destroy = 0;
+
+		/* Tell that fft1 depends on twisted1 */
+		starpu_tag_declare_deps(STEP_TAG(FFT1),
+				1, STEP_TAG(TWIST1));
+
+		/* Create FFT1 task */
+		plan->fft1_tasks[z] = task = starpu_task_create();
+		task->cl = &STARPUFFT(fft1_2d_codelet);
+		task->handles[0] = plan->twisted1_handle[z];
+		task->handles[1] = plan->fft1_handle[z];
+		task->handles[2] = plan->roots_handle[0];
+		task->handles[3] = plan->roots_handle[1];
+		task->cl_arg = &plan->fft1_args[z];
+		task->tag_id = STEP_TAG(FFT1);
+		task->use_tag = 1;
+		task->destroy = 0;
+
+		/* Tell that to be done with the first step we need to have
+		 * finished this fft1 */
+		starpu_tag_declare_deps(STEP_TAG_2D(plan, JOIN, 0, 0),
+				1, STEP_TAG(FFT1));
+#undef STEP_TAG
+	}
+
+	/* Create join task */
+	plan->join_task = task = starpu_task_create();
+	task->cl = NULL;
+	task->tag_id = STEP_TAG_2D(plan, JOIN, 0, 0);
+	task->use_tag = 1;
+	task->destroy = 0;
+
+	/* Create second-round tasks */
+	for (z = 0; z < plan->totsize3; z++) {
+		int kk = z / DIV_2D_M, ll = z % DIV_2D_M;
+#define STEP_TAG(step)	STEP_TAG_2D(plan, step, kk, ll)
+
+		plan->fft2_args[z].plan = plan;
+		plan->fft2_args[z].kk = kk;
+		plan->fft2_args[z].ll = ll;
+
+		/* Register n3*m3 (n1,m1) chunks */
+		starpu_vector_data_register(&plan->twisted2_handle[z], 0, (uintptr_t) &plan->twisted2[z*plan->totsize4], plan->totsize4, sizeof(*plan->twisted2));
+		starpu_vector_data_register(&plan->fft2_handle[z], 0, (uintptr_t) &plan->fft2[z*plan->totsize4], plan->totsize4, sizeof(*plan->fft2));
+
+		/* We'll need it on the CPU for the last twist anyway */
+		starpu_data_set_wt_mask(plan->fft2_handle[z], 1<<0);
+
+		/* Tell that twisted2 depends on the whole first step to be
+		 * done */
+		starpu_tag_declare_deps(STEP_TAG(TWIST2),
+				1, STEP_TAG_2D(plan, JOIN, 0, 0));
+
+		/* Create twist2 task */
+		plan->twist2_tasks[z] = task = starpu_task_create();
+		task->cl = &STARPUFFT(twist2_2d_codelet);
+		task->handles[0] = plan->twisted2_handle[z];
+		task->cl_arg = &plan->fft2_args[z];
+		task->tag_id = STEP_TAG(TWIST2);
+		task->use_tag = 1;
+		task->destroy = 0;
+
+		/* Tell that fft2 depends on twisted2 */
+		starpu_tag_declare_deps(STEP_TAG(FFT2),
+				1, STEP_TAG(TWIST2));
+
+		/* Create FFT2 task */
+		plan->fft2_tasks[z] = task = starpu_task_create();
+		task->cl = &STARPUFFT(fft2_2d_codelet);
+		task->handles[0] = plan->twisted2_handle[z];
+		task->handles[1] = plan->fft2_handle[z];
+		task->cl_arg = &plan->fft2_args[z];
+		task->tag_id = STEP_TAG(FFT2);
+		task->use_tag = 1;
+		task->destroy = 0;
+
+		/* Tell that twist3 depends on fft2 */
+		starpu_tag_declare_deps(STEP_TAG(TWIST3),
+				1, STEP_TAG(FFT2));
+
+		/* Create twist3 tasks */
+		/* These run only on CPUs and thus write directly into the
+		 * application output buffer. */
+		plan->twist3_tasks[z] = task = starpu_task_create();
+		task->cl = &STARPUFFT(twist3_2d_codelet);
+		task->handles[0] = plan->fft2_handle[z];
+		task->cl_arg = &plan->fft2_args[z];
+		task->tag_id = STEP_TAG(TWIST3);
+		task->use_tag = 1;
+		task->destroy = 0;
+
+		/* Tell that to be completely finished we need to have finished this twist3 task */
+		starpu_tag_declare_deps(STEP_TAG_2D(plan, END, 0, 0),
+				1, STEP_TAG(TWIST3));
+#undef STEP_TAG
+	}
+
+	/* Create end task */
+	plan->end_task = task = starpu_task_create();
+	task->cl = NULL;
+	task->tag_id = STEP_TAG_2D(plan, END, 0, 0);
+	task->use_tag = 1;
+	task->destroy = 0;
+
+}
+
+	return plan;
+}
+
+/* Actually submit all the tasks. */
+static struct starpu_task *
+STARPUFFT(start2dC2C)(STARPUFFT(plan) plan, starpu_data_handle_t in, starpu_data_handle_t out)
+{
+	STARPU_ASSERT(plan->type == C2C);
+	int z;
+	int ret;
+
+if (PARALLEL) {
+	for (z=0; z < plan->totsize1; z++) {
+		ret = starpu_task_submit(plan->twist1_tasks[z]);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+		ret = starpu_task_submit(plan->fft1_tasks[z]);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	ret = starpu_task_submit(plan->join_task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+	for (z=0; z < plan->totsize3; z++) {
+		ret = starpu_task_submit(plan->twist2_tasks[z]);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+		ret = starpu_task_submit(plan->fft2_tasks[z]);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+		ret = starpu_task_submit(plan->twist3_tasks[z]);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	ret = starpu_task_submit(plan->end_task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+	return plan->end_task;
+} else /* !PARALLEL */ {
+	struct starpu_task *task;
+
+	/* Create FFT task */
+	task = starpu_task_create();
+	task->detach = 0;
+	task->cl = &STARPUFFT(fft_2d_codelet);
+	task->handles[0] = in;
+	task->handles[1] = out;
+	task->cl_arg = plan;
+
+	ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	return task;
+}
+}
+
+/* Free all the tags. The generic code handles freeing the buffers. */
+static void
+STARPUFFT(free_2d_tags)(STARPUFFT(plan) plan)
+{
+	unsigned i, j;
+	int n1 = plan->n1[0];
+	int m1 = plan->n1[1];
+
+	if (!PARALLEL)
+		return;
+
+	for (i = 0; i < n1; i++) {
+		for (j = 0; j < m1; j++) {
+			starpu_tag_remove(STEP_TAG_2D(plan, TWIST1, i, j));
+			starpu_tag_remove(STEP_TAG_2D(plan, FFT1, i, j));
+		}
+	}
+
+	starpu_tag_remove(STEP_TAG_2D(plan, JOIN, 0, 0));
+
+	for (i = 0; i < DIV_2D_N; i++) {
+		for (j = 0; j < DIV_2D_M; j++) {
+			starpu_tag_remove(STEP_TAG_2D(plan, TWIST2, i, j));
+			starpu_tag_remove(STEP_TAG_2D(plan, FFT2, i, j));
+			starpu_tag_remove(STEP_TAG_2D(plan, TWIST3, i, j));
+		}
+	}
+
+	starpu_tag_remove(STEP_TAG_2D(plan, END, 0, 0));
+}
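
To make the decomposition performed by plan_dft_2d above concrete, here is a worked example of the size arithmetic only, for a hypothetical 1024 x 1024 input with the default DIV_2D_N = DIV_2D_M = 8 (a standalone sketch, not code from the library):

#include <stdio.h>

/* Worked example of the sizes computed by plan_dft_2d for a hypothetical 1024 x 1024 input. */
int main(void)
{
	int n = 1024, m = 1024;
	int n1 = 8,  m1 = 8;		   /* DIV_2D_N, DIV_2D_M; n2 and m2 stay below the 16384 cufft limit */
	int n2 = n / n1, m2 = m / m1;	   /* 128 x 128: size of each first-round fft */
	int n3 = n2 / 8, m3 = m2 / 8;	   /* 16 x 16: (n1,m1) ffts per second-round batch */
	int totsize1 = n1 * m1;		   /* 64 twist1 + fft1 task pairs */
	int totsize3 = 8 * 8;		   /* 64 twist2/fft2/twist3 batches */
	int totsize4 = (n * m) / totsize3; /* 16384 points per batch, i.e. n3 * m3 = 256 ffts of 8 x 8 */

	printf("%d first-round ffts of %dx%d, %d batches of %dx%d ffts of %dx%d (%d points each)\n",
	       totsize1, n2, m2, totsize3, n3, m3, n1, m1, totsize4);
	return 0;
}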

+ 272 - 32
tests/Makefile.am

@@ -1,8 +1,8 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
-# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
-# Copyright (C) 2010, 2011  Institut National de Recherche en Informatique et Automatique
+# Copyright (C) 2009, 2010, 2011-2012  Université de Bordeaux 1
+# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+# Copyright (C) 2010, 2011, 2012  Institut National de Recherche en Informatique et Automatique
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -16,15 +16,28 @@
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
 AM_CFLAGS = $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
-LIBS = $(top_builddir)/src/libstarpu.la $(HWLOC_LIBS) @LIBS@
+LIBS = $(top_builddir)/src/libstarpu-@STARPU_EFFECTIVE_VERSION@.la $(HWLOC_LIBS) @LIBS@
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_builddir)/src -I$(top_srcdir)/src/
 AM_LDFLAGS = $(STARPU_CUDA_LDFLAGS) $(STARPU_OPENCL_LDFLAGS)
 
 EXTRA_DIST =					\
+	helper.h				\
+	datawizard/scal.h			\
 	microbenchs/null_kernel_gordon.c	\
 	datawizard/sync_and_notify_data_gordon_kernels.c \
 	datawizard/sync_and_notify_data_opencl_codelet.cl\
-	coverage/coverage.sh
+	coverage/coverage.sh			\
+	datawizard/interfaces/test_interfaces.h	\
+	datawizard/interfaces/bcsr/bcsr_opencl_kernel.cl \
+	datawizard/interfaces/matrix/matrix_opencl_kernel.cl \
+	datawizard/interfaces/variable/variable_opencl_kernel.cl \
+	datawizard/interfaces/vector/test_vector_opencl_kernel.cl \
+	datawizard/interfaces/multiformat/multiformat_types.h \
+	datawizard/interfaces/multiformat/multiformat_opencl_kernel.cl \
+	datawizard/interfaces/multiformat/multiformat_conversion_codelets_kernel.cl \
+	datawizard/interfaces/multiformat/advanced/generic.h \
+	datawizard/interfaces/csr/csr_opencl_kernel.cl \
+	datawizard/interfaces/block/block_opencl_kernel.cl
 
 CLEANFILES = 					\
 	*.gcno *.gcda *.linkinfo		\
@@ -43,7 +56,7 @@ if STARPU_USE_CUDA
 # TODO define NVCCFLAGS
 NVCC ?= nvcc
 
-NVCCFLAGS += -I$(top_srcdir)/include/ -I$(top_builddir)/include $(HWLOC_CFLAGS)
+NVCCFLAGS += -I$(top_srcdir)/include/ -I$(top_srcdir)/src -I$(top_builddir)/src -I$(top_builddir)/include $(HWLOC_CFLAGS)
 
 .cu.cubin:
 	$(MKDIR_P) `dirname $@`
@@ -83,7 +96,7 @@ if !STARPU_HAVE_WINDOWS
 ## test loader program
 LOADER			=	loader
 LOADER_BIN		=	$(abs_top_builddir)/tests/$(LOADER)
-TESTS_ENVIRONMENT	=	$(LOADER_BIN)
+TESTS_ENVIRONMENT	=	top_builddir="$(abs_top_builddir)" $(LOADER_BIN)
 endif
 
 TESTS = $(noinst_PROGRAMS)
@@ -92,31 +105,39 @@ if STARPU_COVERAGE_ENABLED
 TESTS	+=	coverage/coverage.sh
 endif
 
+starpu_machine_display_SOURCES	=	../tools/starpu_machine_display.c
+
 noinst_PROGRAMS =				\
-	core/restart				\
-	core/execute_on_a_specific_worker	\
-	core/insert_task			\
-	core/multithreaded			\
-	core/multithreaded_init			\
-	core/starpu_task_wait_for_all		\
-	core/starpu_task_wait			\
-	core/static_restartable			\
-	core/static_restartable_using_initializer\
-	core/static_restartable_tag		\
-	core/regenerate				\
-	core/wait_all_regenerable_tasks		\
-	core/subgraph_repeat			\
-	core/subgraph_repeat_regenerate		\
-	core/empty_task				\
-	core/empty_task_sync_point		\
-	core/empty_task_sync_point_tasks	\
-	core/empty_task_chain			\
-	core/tag_wait_api			\
-	core/task_wait_api			\
-	core/declare_deps_in_callback		\
-	core/declare_deps_after_submission	\
-	core/declare_deps_after_submission_synchronous	\
-	core/get_current_task			\
+	starpu_machine_display			\
+	main/deprecated_func			\
+	main/deprecated_buffer			\
+	main/restart				\
+	main/execute_on_a_specific_worker	\
+	main/insert_task			\
+	main/multithreaded			\
+	main/multithreaded_init			\
+	main/starpu_task_bundle			\
+	main/starpu_task_wait_for_all		\
+	main/starpu_task_wait			\
+	main/static_restartable			\
+	main/static_restartable_using_initializer\
+	main/static_restartable_tag		\
+	main/regenerate				\
+	main/wait_all_regenerable_tasks		\
+	main/subgraph_repeat			\
+	main/subgraph_repeat_regenerate		\
+	main/empty_task				\
+	main/empty_task_sync_point		\
+	main/empty_task_sync_point_tasks	\
+	main/empty_task_chain			\
+	main/tag_wait_api			\
+	main/task_wait_api			\
+	main/declare_deps_in_callback		\
+	main/declare_deps_after_submission	\
+	main/declare_deps_after_submission_synchronous	\
+	main/get_current_task			\
+	main/starpu_init			\
+	main/starpu_worker_exists               \
 	datawizard/acquire_cb			\
 	datawizard/acquire_cb_insert		\
 	datawizard/acquire_release		\
@@ -128,6 +149,7 @@ noinst_PROGRAMS =				\
 	datawizard/sync_and_notify_data		\
 	datawizard/sync_and_notify_data_implicit\
 	datawizard/dsm_stress			\
+	datawizard/double_parameter		\
 	datawizard/write_only_tmp_buffer	\
 	datawizard/data_invalidation		\
 	datawizard/dining_philosophers		\
@@ -144,8 +166,26 @@ noinst_PROGRAMS =				\
 	datawizard/critical_section_with_void_interface\
 	datawizard/increment_redux		\
 	datawizard/increment_redux_v2		\
+	datawizard/increment_redux_lazy		\
 	datawizard/handle_to_pointer		\
 	datawizard/lazy_allocation		\
+	datawizard/interfaces/copy_interfaces	\
+	datawizard/interfaces/block/block_interface \
+	datawizard/interfaces/bcsr/bcsr_interface \
+	datawizard/interfaces/csr/csr_interface \
+	datawizard/interfaces/matrix/matrix_interface \
+	datawizard/interfaces/multiformat/multiformat_interface \
+	datawizard/interfaces/multiformat/advanced/multiformat_cuda_opencl \
+	datawizard/interfaces/multiformat/advanced/multiformat_data_release \
+	datawizard/interfaces/multiformat/advanced/multiformat_worker \
+	datawizard/interfaces/multiformat/advanced/multiformat_handle_conversion \
+	datawizard/interfaces/multiformat/advanced/same_handle \
+	datawizard/interfaces/variable/variable_interface    \
+	datawizard/interfaces/vector/test_vector_interface   \
+	datawizard/interfaces/void/void_interface \
+	datawizard/in_place_partition   	\
+	datawizard/partition_lazy		\
+	datawizard/gpu_register   		\
 	errorcheck/starpu_init_noworker		\
 	errorcheck/starpu_init_noworker		\
 	errorcheck/invalid_blocking_calls	\
 	errorcheck/invalid_tasks		\
@@ -165,7 +205,7 @@ noinst_PROGRAMS =				\
 	parallel_tasks/parallel_kernels		\
 	parallel_tasks/parallel_kernels_spmd	\
 	perfmodels/regression_based		\
-	perfmodels/non_linear_regression_based
+	perfmodels/non_linear_regression_based
 
 if STARPU_HAVE_WINDOWS
 check_PROGRAMS = $(noinst_PROGRAMS)
@@ -236,6 +276,42 @@ datawizard_sync_and_notify_data_implicit_SOURCES +=	\
 	datawizard/sync_and_notify_data_opencl.c
 endif
 
+	datawizard/in_place_partition.c	\
+	datawizard/scal.c
+if STARPU_USE_CUDA
+datawizard_in_place_partition_SOURCES +=	\
+	datawizard/scal_cuda.cu
+endif
+if STARPU_USE_OPENCL
+datawizard_in_place_partition_SOURCES +=	\
+	datawizard/scal_opencl.cl
+endif
+
+datawizard_partition_lazy_SOURCES =	\
+	datawizard/partition_lazy.c	\
+	datawizard/scal.c
+if STARPU_USE_CUDA
+datawizard_partition_lazy_SOURCES +=	\
+	datawizard/scal_cuda.cu
+endif
+if STARPU_USE_OPENCL
+datawizard_partition_lazy_SOURCES +=	\
+	datawizard/scal_opencl.cl
+endif
+
+datawizard_gpu_register_SOURCES =	\
+	datawizard/gpu_register.c	\
+	datawizard/scal.c
+if STARPU_USE_CUDA
+datawizard_gpu_register_SOURCES +=	\
+	datawizard/scal_cuda.cu
+endif
+if STARPU_USE_OPENCL
+datawizard_gpu_register_SOURCES +=	\
+	datawizard/scal_opencl.cl
+endif
+
 if STARPU_USE_GORDON
 datawizard_sync_and_notify_data_SOURCES +=	\
 	datawizard/sync_and_notify_data_gordon_kernels.c
@@ -245,3 +321,167 @@ BUILT_SOURCES += 						\
 	datawizard/sync_and_notify_data_gordon_kernels.spuelf	\
 	microbenchs/null_kernel_gordon.spuelf
 endif
+
+###################
+# Block interface #
+###################
+datawizard_interfaces_block_block_interface_SOURCES= \
+	datawizard/interfaces/test_interfaces.c  \
+	datawizard/interfaces/block/block_interface.c
+
+if STARPU_USE_CUDA
+datawizard_interfaces_block_block_interface_SOURCES+= \
+	datawizard/interfaces/block/block_cuda.cu
+endif
+
+if STARPU_USE_OPENCL
+datawizard_interfaces_block_block_interface_SOURCES+= \
+	datawizard/interfaces/block/block_opencl.c
+nobase_STARPU_OPENCL_DATA_DATA += \
+	datawizard/interfaces/block/block_opencl_kernel.cl
+endif
+
+##################
+# BSCR interface #
+##################
+datawizard_interfaces_bcsr_bcsr_interface_SOURCES= \
+	datawizard/interfaces/test_interfaces.c \
+	datawizard/interfaces/bcsr/bcsr_interface.c 
+
+if STARPU_USE_CUDA
+datawizard_interfaces_bcsr_bcsr_interface_SOURCES+= \
+	datawizard/interfaces/bcsr/bcsr_cuda.cu
+endif
+
+if STARPU_USE_OPENCL
+datawizard_interfaces_bcsr_bcsr_interface_SOURCES+= \
+	datawizard/interfaces/bcsr/bcsr_opencl.c
+nobase_STARPU_OPENCL_DATA_DATA += \
+	datawizard/interfaces/bcsr/bcsr_opencl_kernel.cl
+endif
+
+#################
+# CSR interface #
+#################
+datawizard_interfaces_csr_csr_interface_SOURCES= \
+	datawizard/interfaces/test_interfaces.c  \
+	datawizard/interfaces/csr/csr_interface.c
+
+if STARPU_USE_CUDA
+datawizard_interfaces_csr_csr_interface_SOURCES+= \
+	datawizard/interfaces/csr/csr_cuda.cu
+endif
+
+if STARPU_USE_OPENCL
+datawizard_interfaces_csr_csr_interface_SOURCES+= \
+	datawizard/interfaces/csr/csr_opencl.c
+nobase_STARPU_OPENCL_DATA_DATA += \
+	datawizard/interfaces/csr/csr_opencl_kernel.cl
+endif
+
+
+datawizard_interfaces_vector_test_vector_interface_SOURCES =               \
+	datawizard/interfaces/vector/test_vector_interface.c               \
+	datawizard/interfaces/test_interfaces.c
+
+if STARPU_USE_CUDA
+datawizard_interfaces_vector_test_vector_interface_SOURCES +=               \
+	datawizard/interfaces/vector/test_vector_cuda.cu
+endif
+
+if STARPU_USE_OPENCL
+datawizard_interfaces_vector_test_vector_interface_SOURCES +=               \
+	datawizard/interfaces/vector/test_vector_opencl.c 
+nobase_STARPU_OPENCL_DATA_DATA += \
+	datawizard/interfaces/vector/test_vector_opencl_kernel.cl
+endif
+
+####################
+# Matrix interface #
+####################
+datawizard_interfaces_matrix_matrix_interface_SOURCES= \
+	datawizard/interfaces/test_interfaces.c        \
+	datawizard/interfaces/matrix/matrix_interface.c
+
+if STARPU_USE_CUDA
+datawizard_interfaces_matrix_matrix_interface_SOURCES+= \
+	datawizard/interfaces/matrix/matrix_cuda.cu
+endif
+
+if STARPU_USE_OPENCL
+datawizard_interfaces_matrix_matrix_interface_SOURCES+= \
+	datawizard/interfaces/matrix/matrix_opencl.c
+nobase_STARPU_OPENCL_DATA_DATA+= \
+	datawizard/interfaces/matrix/matrix_opencl_kernel.cl
+endif
+
+
+#########################
+# Multiformat interface #
+#########################
+datawizard_interfaces_multiformat_multiformat_interface_SOURCES =           \
+	datawizard/interfaces/test_interfaces.c                             \
+	datawizard/interfaces/multiformat/multiformat_interface.c           \
+	datawizard/interfaces/multiformat/multiformat_conversion_codelets.c
+
+if STARPU_USE_CUDA
+datawizard_interfaces_multiformat_multiformat_interface_SOURCES+=                  \
+	datawizard/interfaces/multiformat/multiformat_cuda.cu                      \
+	datawizard/interfaces/multiformat/multiformat_conversion_codelets_cuda.cu
+endif
+
+if STARPU_USE_OPENCL
+datawizard_interfaces_multiformat_multiformat_interface_SOURCES+=                  \
+	datawizard/interfaces/multiformat/multiformat_opencl.c                     \
+	datawizard/interfaces/multiformat/multiformat_conversion_codelets_opencl.c
+nobase_STARPU_OPENCL_DATA_DATA +=                                                          \
+	datawizard/interfaces/multiformat/multiformat_opencl_kernel.cl                     \
+	datawizard/interfaces/multiformat/multiformat_conversion_codelets_kernel.cl
+endif
+
+datawizard_interfaces_multiformat_advanced_multiformat_cuda_opencl_SOURCES=\
+	datawizard/interfaces/multiformat/advanced/generic.c               \
+	datawizard/interfaces/multiformat/advanced/multiformat_cuda_opencl.c
+
+datawizard_interfaces_multiformat_advanced_multiformat_data_release_SOURCES = \
+	datawizard/interfaces/multiformat/advanced/generic.c                  \
+	datawizard/interfaces/multiformat/advanced/multiformat_data_release.c
+
+datawizard_interfaces_multiformat_advanced_multiformat_worker_SOURCES=\
+	datawizard/interfaces/multiformat/advanced/generic.c               \
+	datawizard/interfaces/multiformat/advanced/multiformat_worker.c
+
+datawizard_interfaces_multiformat_advanced_multiformat_handle_conversion_SOURCES = \
+	datawizard/interfaces/multiformat/advanced/generic.c \
+	datawizard/interfaces/multiformat/advanced/multiformat_handle_conversion.c
+
+datawizard_interfaces_multiformat_advanced_same_handle_SOURCES= \
+	datawizard/interfaces/multiformat/advanced/generic.c               \
+	datawizard/interfaces/multiformat/advanced/same_handle.c
+
+
+datawizard_interfaces_variable_variable_interface_SOURCES=   \
+	datawizard/interfaces/test_interfaces.c              \
+	datawizard/interfaces/variable/variable_interface.c
+
+if STARPU_USE_CUDA
+datawizard_interfaces_variable_variable_interface_SOURCES+= \
+	datawizard/interfaces/variable/variable_cuda.cu
+endif
+
+if STARPU_USE_OPENCL
+datawizard_interfaces_variable_variable_interface_SOURCES+= \
+	datawizard/interfaces/variable/variable_opencl.c
+nobase_STARPU_OPENCL_DATA_DATA += \
+	datawizard/interfaces/variable/variable_opencl_kernel.cl
+endif
+
+##################
+# Void interface #
+##################
+datawizard_interfaces_void_void_interface_SOURCES=\
+	datawizard/interfaces/test_interfaces.c        \
+	datawizard/interfaces/void/void_interface.c
+
+showcheck:
+	-cat $(TEST_LOGS) /dev/null

+ 23 - 3
tests/cholesky/prio.r

@@ -1,3 +1,20 @@
+
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2010  Université de Bordeaux 1
+# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
 sizelist <- seq(2048, 24576, 2048);
 schedlist <- c("greedy", "prio", "dm", "random");
 
@@ -15,7 +32,8 @@ parse <- function (size, sched)
 	filename = paste("timings_sched/sched", sched, size, sep=".");
 
 	if (file.exists(filename))
-	{	ret <- scan(paste("timings_sched/sched", sched, size, sep="."));
+	{
+		ret <- scan(paste("timings_sched/sched", sched, size, sep="."));
 		return(ret);
 	};
 
@@ -35,7 +53,8 @@ handle_sched <- function(sched)
 	gflopstab <- NULL;
 	sizetab <- NULL;
 
-	for (size in sizelist) {
+	for (size in sizelist)
+	{
 		list <- handle_size(size, sched);
 		gflopstab <- c(gflopstab, list);
 		sizetab <- c(sizetab, array(size, c(length(list))));
@@ -51,7 +70,8 @@ handle_sched_mean <- function(sched)
 	meantab <- NULL;
 	sizetab <- NULL;
 
-	for (size in sizelist) {
+	for (size in sizelist)
+	{
 		list <- mean(handle_size(size, sched));
 		meantab <- c(meantab, list);
 		sizetab <- c(sizetab, array(size, c(length(list))));

+ 23 - 3
tests/cholesky/sched.r

@@ -1,3 +1,20 @@
+
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2010  Université de Bordeaux 1
+# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
 sizelist <- seq(2048, 24576, 2048);
 schedlist <- c("greedy", "prio", "dm", "random");
 
@@ -15,7 +32,8 @@ parse <- function (size, sched)
 	filename = paste("timings_sched/sched", sched, size, sep=".");
 
 	if (file.exists(filename))
-	{	ret <- scan(paste("timings_sched/sched", sched, size, sep="."));
+	{
+		ret <- scan(paste("timings_sched/sched", sched, size, sep="."));
 		return(ret);
 	};
 
@@ -35,7 +53,8 @@ handle_sched <- function(sched)
 	gflopstab <- NULL;
 	sizetab <- NULL;
 
-	for (size in sizelist) {
+	for (size in sizelist)
+	{
 		list <- handle_size(size, sched);
 		gflopstab <- c(gflopstab, list);
 		sizetab <- c(sizetab, array(size, c(length(list))));
@@ -51,7 +70,8 @@ handle_sched_mean <- function(sched)
 	meantab <- NULL;
 	sizetab <- NULL;
 
-	for (size in sizelist) {
+	for (size in sizelist)
+	{
 		list <- mean(handle_size(size, sched));
 		meantab <- c(meantab, list);
 		sizetab <- c(sizetab, array(size, c(length(list))));

+ 0 - 65
tests/core/multithreaded_init.c

@@ -1,65 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2010  Institut National de Recherche en Informatique et Automatique
- * Copyright (C) 2010-2011  Université de Bordeaux 1
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-#include <sys/time.h>
-#include <stdio.h>
-#include <pthread.h>
-#include <starpu.h>
-
-#define NUM_THREADS 5
-#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
-
-void *launch_starpu(void *id)
-{ 
-   starpu_init(NULL);
-   return NULL;
-}
-
-int main(int argc, char **argv)
-{ 
-  unsigned i;
-  double timing;
-  struct timeval start;
-  struct timeval end;
-
-  pthread_t threads[NUM_THREADS];
-  
-  gettimeofday(&start, NULL);
-
-  for (i = 0; i < NUM_THREADS; ++i)
-    {
-      int ret = pthread_create(&threads[i], NULL, launch_starpu, NULL);
-      STARPU_ASSERT(ret == 0);
-    }
-
-  for (i = 0; i < NUM_THREADS; ++i)
-    {
-      int ret = pthread_join(threads[i], NULL);
-      STARPU_ASSERT(ret == 0);
-    }
-
-  gettimeofday(&end, NULL);
-
-  timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
-
-  FPRINTF(stderr, "Success : %d threads launching simultaneously starpu_init\n", NUM_THREADS);
-  FPRINTF(stderr, "Total: %f secs\n", timing/1000000);
-  FPRINTF(stderr, "Per task: %f usecs\n", timing/NUM_THREADS);
-
-  starpu_shutdown();
-
-  return 0;
-}

+ 0 - 121
tests/core/task_wait_api.c

@@ -1,121 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include <pthread.h>
-#include <stdio.h>
-#include <unistd.h>
-
-#include <starpu.h>
-
-#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
-
-static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attribute__ ((unused)))
-{
-}
-
-static starpu_codelet dummy_codelet =
-{
-	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
-	.cpu_func = dummy_func,
-	.cuda_func = dummy_func,
-	.opencl_func = dummy_func,
-        .model = NULL,
-	.nbuffers = 0
-};
-
-static struct starpu_task *create_dummy_task(void)
-{
-	struct starpu_task *task = starpu_task_create();
-
-	task->cl = &dummy_codelet;
-	task->cl_arg = NULL;
-	task->detach = 0;
-
-	return task;
-}
-
-int main(int argc, char **argv)
-{
-	starpu_init(NULL);
-
-	FPRINTF(stderr, "{ A } -> { B }\n");
-	fflush(stderr);
-
-	struct starpu_task *taskA, *taskB;
-
-	taskA = create_dummy_task();
-	taskB = create_dummy_task();
-
-	/* B depends on A */
-	starpu_task_declare_deps_array(taskB, 1, &taskA);
-
-	starpu_task_submit(taskB);
-	starpu_task_submit(taskA);
-
-	starpu_task_wait(taskB);
-
-	FPRINTF(stderr, "{ C, D, E, F } -> { G }\n");
-
-	struct starpu_task *taskC, *taskD, *taskE, *taskF, *taskG;
-
-	taskC = create_dummy_task();
-	taskD = create_dummy_task();
-	taskE = create_dummy_task();
-	taskF = create_dummy_task();
-	taskG = create_dummy_task();
-
-	struct starpu_task *tasksCDEF[4] = {taskC, taskD, taskE, taskF};
-	starpu_task_declare_deps_array(taskG, 4, tasksCDEF);
-
-	starpu_task_submit(taskC);
-	starpu_task_submit(taskD);
-	starpu_task_submit(taskG);
-	starpu_task_submit(taskE);
-	starpu_task_submit(taskF);
-
-	starpu_task_wait(taskG);
-
-	FPRINTF(stderr, "{ H, I } -> { J, K, L }\n");
-
-	struct starpu_task *taskH, *taskI, *taskJ, *taskK, *taskL;
-
-	taskH = create_dummy_task();
-	taskI = create_dummy_task();
-	taskJ = create_dummy_task();
-	taskK = create_dummy_task();
-	taskL = create_dummy_task();
-
-	struct starpu_task *tasksHI[2] = {taskH, taskI};
-
-	starpu_task_declare_deps_array(taskJ, 2, tasksHI);
-	starpu_task_declare_deps_array(taskK, 2, tasksHI);
-	starpu_task_declare_deps_array(taskL, 2, tasksHI);
-
-	starpu_task_submit(taskH);
-	starpu_task_submit(taskI);
-	starpu_task_submit(taskJ);
-	starpu_task_submit(taskK);
-	starpu_task_submit(taskL);
-
-	starpu_task_wait(taskJ);
-	starpu_task_wait(taskK);
-	starpu_task_wait(taskL);
-
-	starpu_shutdown();
-
-	return 0;
-}

+ 8 - 5
tests/datawizard/acquire_cb.c

@@ -15,11 +15,10 @@
  */
  */
 
 
 #include <starpu.h>
 #include <starpu.h>
-
-#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+#include "../helper.h"
 
 
 unsigned token = 0;
 unsigned token = 0;
-starpu_data_handle token_handle;
+starpu_data_handle_t token_handle;
 
 
 void callback(void *arg __attribute__ ((unused)))
 void callback(void *arg __attribute__ ((unused)))
 {
 {
@@ -29,7 +28,11 @@ void callback(void *arg __attribute__ ((unused)))
 
 
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
-        starpu_init(NULL);
+	int ret;
+
+        ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 
 	starpu_variable_data_register(&token_handle, 0, (uintptr_t)&token, sizeof(unsigned));
 	starpu_variable_data_register(&token_handle, 0, (uintptr_t)&token, sizeof(unsigned));
         starpu_data_acquire_cb(token_handle, STARPU_RW, callback, NULL);
         starpu_data_acquire_cb(token_handle, STARPU_RW, callback, NULL);
@@ -41,5 +44,5 @@ int main(int argc, char **argv)
 
 
 	starpu_shutdown();
 	starpu_shutdown();
 
 
-	return 0;
+	return EXIT_SUCCESS;
 }
 }
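Aside: the hunk above is the smallest instance of the conversion pattern applied throughout this commit. A minimal sketch of the resulting test skeleton, assuming the STARPU_CHECK_RETURN_VALUE and STARPU_TEST_SKIPPED helpers come from tests/helper.h as included in these hunks:

#include <errno.h>
#include <stdlib.h>
#include <starpu.h>
#include "../helper.h"

int main(int argc, char **argv)
{
	int ret;

	/* Skip the test instead of failing when no worker can be initialized. */
	ret = starpu_init(NULL);
	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");

	/* ... register data, submit tasks, wait for them ... */

	starpu_shutdown();
	return EXIT_SUCCESS;
}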

+ 43 - 25
tests/datawizard/acquire_cb_insert.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -14,30 +14,38 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
  */
 
 
+#include <config.h>
 #include <starpu.h>
 #include <starpu.h>
+#include "../helper.h"
+
+#warning memory leak
 
 
 #define N 16
 #define N 16
 #define M 4
 #define M 4
 #define X 2
 #define X 2
 
 
-#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
-
 void which_index_cpu(void *descr[], void *_args)
 void which_index_cpu(void *descr[], void *_args)
 {
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	int *x0 = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	int *x0 = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
 
 
 	/* A real case would actually compute something */
 	/* A real case would actually compute something */
 	*x0 = X;
 	*x0 = X;
 }
 }
 
 
-starpu_codelet which_index = {
+struct starpu_codelet which_index =
+{
 	.where = STARPU_CPU,
 	.where = STARPU_CPU,
-	.cpu_func = which_index_cpu,
-        .nbuffers = 1
+	.cpu_funcs = {which_index_cpu, NULL},
+        .nbuffers = 1,
+	.modes = {STARPU_W}
 };
 };
 
 
 void work_cpu(void *descr[], void *_args)
 void work_cpu(void *descr[], void *_args)
 {
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	int i, n = STARPU_VECTOR_GET_NX(descr[0]);
 	int i, n = STARPU_VECTOR_GET_NX(descr[0]);
 	float *x0 = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	float *x0 = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 
 
@@ -45,16 +53,19 @@ void work_cpu(void *descr[], void *_args)
 		x0[i] = i + 1;
 		x0[i] = i + 1;
 }
 }
 
 
-starpu_codelet work = {
+struct starpu_codelet work =
+{
 	.where = STARPU_CPU,
 	.where = STARPU_CPU,
-	.cpu_func = work_cpu,
-        .nbuffers = 1
+	.cpu_funcs = {work_cpu, NULL},
+        .nbuffers = 1,
+	.modes = {STARPU_W}
 };
 };
 
 
 static int x;
 static int x;
-static starpu_data_handle x_handle, f_handle;
+static starpu_data_handle_t x_handle, f_handle;
 
 
-void callback(void *arg) {
+void callback(void *arg)
+{
 	starpu_insert_task(&work, STARPU_W, starpu_data_get_sub_data(f_handle, 1, x), 0);
 	starpu_insert_task(&work, STARPU_W, starpu_data_get_sub_data(f_handle, 1, x), 0);
 	starpu_data_release(x_handle);
 	starpu_data_release(x_handle);
 }
 }
@@ -64,18 +75,22 @@ int main(int argc, char **argv)
         int i, ret;
         int i, ret;
 	float *f;
 	float *f;
 
 
-	starpu_init(NULL);
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 
 	/* Declare x */
 	/* Declare x */
 	starpu_variable_data_register(&x_handle, 0, (uintptr_t)&x, sizeof(x));
 	starpu_variable_data_register(&x_handle, 0, (uintptr_t)&x, sizeof(x));
 
 
 	/* Allocate and Declare f */
 	/* Allocate and Declare f */
-	starpu_malloc((void**)&f, N * sizeof(*f));
+	ret = starpu_malloc((void**)&f, N * sizeof(*f));
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_malloc");
 	memset(f, 0, N * sizeof(*f));
 	memset(f, 0, N * sizeof(*f));
 	starpu_vector_data_register(&f_handle, 0, (uintptr_t)f, N, sizeof(*f));
 	starpu_vector_data_register(&f_handle, 0, (uintptr_t)f, N, sizeof(*f));
 
 
 	/* Partition f */
 	/* Partition f */
-	struct starpu_data_filter filter = {
+	struct starpu_data_filter filter =
+	{
 		.filter_func = starpu_block_filter_func_vector,
 		.filter_func = starpu_block_filter_func_vector,
 		.nchildren = M,
 		.nchildren = M,
 	};
 	};
@@ -84,6 +99,7 @@ int main(int argc, char **argv)
 	/* Compute which portion we will work on */
 	/* Compute which portion we will work on */
         ret = starpu_insert_task(&which_index, STARPU_W, x_handle, 0);
         ret = starpu_insert_task(&which_index, STARPU_W, x_handle, 0);
 	if (ret == -ENODEV) goto enodev;
 	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 
 
 	/* And submit the corresponding task */
 	/* And submit the corresponding task */
 #ifdef __GCC__
 #ifdef __GCC__
@@ -96,30 +112,32 @@ int main(int argc, char **argv)
 	starpu_data_acquire_cb(x_handle, STARPU_W, callback, NULL);
 	starpu_data_acquire_cb(x_handle, STARPU_W, callback, NULL);
 #endif
 #endif
 
 
-	starpu_task_wait_for_all();
+	ret = starpu_task_wait_for_all();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
 	starpu_data_unpartition(f_handle, 0);
 	starpu_data_unpartition(f_handle, 0);
 	starpu_data_unregister(f_handle);
 	starpu_data_unregister(f_handle);
 	starpu_data_unregister(x_handle);
 	starpu_data_unregister(x_handle);
 
 
         FPRINTF(stderr, "VALUES: %d", x);
         FPRINTF(stderr, "VALUES: %d", x);
-
-        for(i=0 ; i<N ; i++) {
+        for(i=0 ; i<N ; i++)
+	{
 		FPRINTF(stderr, " %f", f[i]);
 		FPRINTF(stderr, " %f", f[i]);
         }
         }
-
-	STARPU_ASSERT(f[X*(N/M)] == 1);
-	STARPU_ASSERT(f[X*(N/M)+1] == 2);
-	STARPU_ASSERT(f[X*(N/M)+2] == 3);
-	STARPU_ASSERT(f[X*(N/M)+3] == 4);
-
 	FPRINTF(stderr, "\n");
 	FPRINTF(stderr, "\n");
 
 
+	ret = EXIT_SUCCESS;
+	if (f[X*(N/M)] != 1 || f[X*(N/M)+1] != 2 ||
+	    f[X*(N/M)+2] != 3 || f[X*(N/M)+3] != 4)
+		ret = EXIT_FAILURE;
+
+	starpu_free(f);
 	starpu_shutdown();
 	starpu_shutdown();
-	return 0;
+	STARPU_RETURN(ret);
 
 
 enodev:
 enodev:
 	fprintf(stderr, "WARNING: No one can execute this task\n");
 	fprintf(stderr, "WARNING: No one can execute this task\n");
 	/* yes, we do not perform the computation but we did detect that no one
 	/* yes, we do not perform the computation but we did detect that no one
  	 * could perform the kernel, so this is not an error from StarPU */
  	 * could perform the kernel, so this is not an error from StarPU */
-	return 77;
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
 }
 }
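For reference, a sketch of the codelet declaration style these hunks converge on: the old typedef'd starpu_codelet with a single cpu_func becomes a struct starpu_codelet whose cpu_funcs array is NULL-terminated, and the access modes move out of task->buffers[].mode into the codelet itself. The kernel body below is illustrative only:

#include <starpu.h>

static void work_cpu(void *descr[], void *arg __attribute__ ((unused)))
{
	/* Fill the vector passed as buffer 0. */
	unsigned i, n = STARPU_VECTOR_GET_NX(descr[0]);
	float *v = (float *) STARPU_VECTOR_GET_PTR(descr[0]);
	for (i = 0; i < n; i++)
		v[i] = i + 1;
}

static struct starpu_codelet work =
{
	.where = STARPU_CPU,
	.cpu_funcs = {work_cpu, NULL},	/* was: .cpu_func = work_cpu */
	.nbuffers = 1,
	.modes = {STARPU_W}		/* was: task->buffers[0].mode = STARPU_W */
};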

+ 46 - 18
tests/datawizard/acquire_release.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010  Université de Bordeaux 1
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -15,11 +15,15 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
  */
 
 
+#include <config.h>
 #include <starpu.h>
 #include <starpu.h>
+#include "../helper.h"
 
 
-#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
-
+#ifdef STARPU_SLOW_MACHINE
+static unsigned ntasks = 10;
+#else
 static unsigned ntasks = 10000;
 static unsigned ntasks = 10000;
+#endif
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 extern void increment_cuda(void *descr[], __attribute__ ((unused)) void *_args);
 extern void increment_cuda(void *descr[], __attribute__ ((unused)) void *_args);
@@ -27,30 +31,35 @@ extern void increment_cuda(void *descr[], __attribute__ ((unused)) void *_args);
 
 
 void increment_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 void increment_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 {
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned *tokenptr = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	unsigned *tokenptr = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	(*tokenptr)++;
 	(*tokenptr)++;
 }
 }
 
 
-static starpu_codelet increment_cl = {
+static struct starpu_codelet increment_cl =
+{
+	.modes = { STARPU_RW },
         .where = STARPU_CPU|STARPU_CUDA,
         .where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = increment_cpu,
+	.cpu_funcs = {increment_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-	.cuda_func = increment_cuda,
+	.cuda_funcs = {increment_cuda, NULL},
 #endif
 #endif
 	.nbuffers = 1
 	.nbuffers = 1
 };
 };
 
 
 unsigned token = 0;
 unsigned token = 0;
-starpu_data_handle token_handle;
+starpu_data_handle_t token_handle;
 
 
-void increment_token()
+int increment_token()
 {
 {
+	int ret;
 	struct starpu_task *task = starpu_task_create();
 	struct starpu_task *task = starpu_task_create();
         task->synchronous = 1;
         task->synchronous = 1;
 	task->cl = &increment_cl;
 	task->cl = &increment_cl;
-	task->buffers[0].handle = token_handle;
-	task->buffers[0].mode = STARPU_RW;
-	starpu_task_submit(task);
+	task->handles[0] = token_handle;
+	ret = starpu_task_submit(task);
+	return ret;
 }
 }
 
 
 void callback(void *arg __attribute__ ((unused)))
 void callback(void *arg __attribute__ ((unused)))
@@ -61,8 +70,12 @@ void callback(void *arg __attribute__ ((unused)))
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
 	int i;
 	int i;
+	int ret;
+
+        ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 
-        starpu_init(NULL);
 	starpu_variable_data_register(&token_handle, 0, (uintptr_t)&token, sizeof(unsigned));
 	starpu_variable_data_register(&token_handle, 0, (uintptr_t)&token, sizeof(unsigned));
 
 
         FPRINTF(stderr, "Token: %u\n", token);
         FPRINTF(stderr, "Token: %u\n", token);
@@ -70,21 +83,36 @@ int main(int argc, char **argv)
 	for(i=0; i<ntasks; i++)
 	for(i=0; i<ntasks; i++)
 	{
 	{
 		/* synchronize data in RAM */
 		/* synchronize data in RAM */
-                starpu_data_acquire(token_handle, STARPU_R);
+                ret = starpu_data_acquire(token_handle, STARPU_R);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
+
                 token ++;
                 token ++;
                 starpu_data_release(token_handle);
                 starpu_data_release(token_handle);
 
 
-                increment_token();
+                ret = increment_token();
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 
 
-                starpu_data_acquire_cb(token_handle, STARPU_RW, callback, NULL);
+                ret = starpu_data_acquire_cb(token_handle, STARPU_RW, callback, NULL);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire_cb");
 	}
 	}
 
 
 	starpu_data_unregister(token_handle);
 	starpu_data_unregister(token_handle);
 
 
+	starpu_shutdown();
+
         FPRINTF(stderr, "Token: %u\n", token);
         FPRINTF(stderr, "Token: %u\n", token);
-        STARPU_ASSERT(token==ntasks*2);
+	if (token == ntasks * 2)
+		ret = EXIT_SUCCESS;
+	else
+		ret = EXIT_FAILURE;
+	STARPU_RETURN(ret);
 
 
+enodev:
+	starpu_data_unregister(token_handle);
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
 	starpu_shutdown();
 	starpu_shutdown();
-
-	return 0;
+	return STARPU_TEST_SKIPPED;
 }
 }

+ 42 - 16
tests/datawizard/acquire_release2.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -14,9 +14,11 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
  */
 
 
+#include <config.h>
 #include <starpu.h>
 #include <starpu.h>
+#include "../helper.h"
 
 
-#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+#warning memory leak
 
 
 static unsigned ntasks = 40000;
 static unsigned ntasks = 40000;
 
 
@@ -26,30 +28,33 @@ extern void increment_cuda(void *descr[], __attribute__ ((unused)) void *_args);
 
 
 void increment_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 void increment_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 {
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned *tokenptr = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	unsigned *tokenptr = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	(*tokenptr)++;
 	(*tokenptr)++;
 }
 }
 
 
-static starpu_codelet increment_cl = {
+static struct starpu_codelet increment_cl =
+{
+	.modes = { STARPU_RW },
         .where = STARPU_CPU|STARPU_CUDA,
         .where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = increment_cpu,
+	.cpu_funcs = {increment_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-	.cuda_func = increment_cuda,
+	.cuda_funcs = {increment_cuda, NULL},
 #endif
 #endif
 	.nbuffers = 1
 	.nbuffers = 1
 };
 };
 
 
 unsigned token = 0;
 unsigned token = 0;
-starpu_data_handle token_handle;
+starpu_data_handle_t token_handle;
 
 
-void increment_token(int synchronous)
+int increment_token(int synchronous)
 {
 {
 	struct starpu_task *task = starpu_task_create();
 	struct starpu_task *task = starpu_task_create();
         task->synchronous = synchronous;
         task->synchronous = synchronous;
 	task->cl = &increment_cl;
 	task->cl = &increment_cl;
-	task->buffers[0].handle = token_handle;
-	task->buffers[0].mode = STARPU_RW;
-	starpu_task_submit(task);
+	task->handles[0] = token_handle;
+	return starpu_task_submit(task);
 }
 }
 
 
 void callback(void *arg __attribute__ ((unused)))
 void callback(void *arg __attribute__ ((unused)))
@@ -62,8 +67,12 @@ void callback(void *arg __attribute__ ((unused)))
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
 	int i;
 	int i;
+	int ret;
+
+        ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 
-        starpu_init(NULL);
 	starpu_variable_data_register(&token_handle, 0, (uintptr_t)&token, sizeof(unsigned));
 	starpu_variable_data_register(&token_handle, 0, (uintptr_t)&token, sizeof(unsigned));
 
 
         FPRINTF(stderr, "Token: %u\n", token);
         FPRINTF(stderr, "Token: %u\n", token);
@@ -74,16 +83,33 @@ int main(int argc, char **argv)
 
 
 	for(i=0; i<ntasks; i++)
 	for(i=0; i<ntasks; i++)
 	{
 	{
-                starpu_data_acquire_cb(token_handle, STARPU_W, callback, NULL);  // recv
-                increment_token(0);
+                ret = starpu_data_acquire_cb(token_handle, STARPU_W, callback, NULL);  // recv
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire_cb");
+
+                ret = increment_token(0);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
                 starpu_data_acquire_cb(token_handle, STARPU_R, callback, NULL);  // send
                 starpu_data_acquire_cb(token_handle, STARPU_R, callback, NULL);  // send
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire_cb");
 	}
 	}
 
 
 	starpu_data_unregister(token_handle);
 	starpu_data_unregister(token_handle);
-        FPRINTF(stderr, "Token: %u\n", token);
-        assert(token==ntasks);
 
 
 	starpu_shutdown();
 	starpu_shutdown();
 
 
-	return 0;
+        FPRINTF(stderr, "Token: %u\n", token);
+	if (token == ntasks)
+		ret = EXIT_SUCCESS;
+	else
+		ret = EXIT_FAILURE;
+	STARPU_RETURN(ret);
+
+enodev:
+	starpu_data_unregister(token_handle);
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
 }
 }

+ 34 - 30
tests/datawizard/copy.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010-2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -16,8 +16,7 @@
  */
  */
 
 
 #include <starpu.h>
 #include <starpu.h>
-
-#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+#include "../helper.h"
 
 
 static unsigned nloops = 1000;
 static unsigned nloops = 1000;
 
 
@@ -25,37 +24,41 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 {
 {
 }
 }
 
 
-static starpu_codelet cpu_codelet =
+static struct starpu_codelet cpu_codelet =
 {
 {
         .where = STARPU_CPU,
         .where = STARPU_CPU,
-        .cpu_func = dummy_func,
+        .cpu_funcs = {dummy_func, NULL},
         .model = NULL,
         .model = NULL,
-        .nbuffers = 1
+        .nbuffers = 1,
+	.modes = {STARPU_RW}
 };
 };
 
 
-static starpu_codelet gpu_codelet =
+static struct starpu_codelet gpu_codelet =
 {
 {
         .where = STARPU_CUDA|STARPU_OPENCL,
         .where = STARPU_CUDA|STARPU_OPENCL,
-        .cuda_func = dummy_func,
-        .opencl_func = dummy_func,
+        .cuda_funcs = {dummy_func, NULL},
+        .opencl_funcs = {dummy_func, NULL},
         .model = NULL,
         .model = NULL,
-        .nbuffers = 1
+        .nbuffers = 1,
+	.modes = {STARPU_RW}
 };
 };
 
 
 
 
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
         float foo;
         float foo;
-	starpu_data_handle float_array_handle;
-        int i;
+	starpu_data_handle_t float_array_handle;
+        int i, ret;
 
 
-        starpu_init(NULL);
+        ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 
 	if (starpu_worker_get_count_by_type(STARPU_CUDA_WORKER) == 0 && starpu_worker_get_count_by_type(STARPU_OPENCL_WORKER) == 0)
 	if (starpu_worker_get_count_by_type(STARPU_CUDA_WORKER) == 0 && starpu_worker_get_count_by_type(STARPU_OPENCL_WORKER) == 0)
 	{
 	{
 		FPRINTF(stderr, "This application requires a CUDA or OpenCL Worker\n");
 		FPRINTF(stderr, "This application requires a CUDA or OpenCL Worker\n");
 		starpu_shutdown();
 		starpu_shutdown();
-		return 77;
+		return STARPU_TEST_SKIPPED;
 	}
 	}
 
 
         foo = 0.0f;
         foo = 0.0f;
@@ -71,32 +74,33 @@ int main(int argc, char **argv)
 
 
 		task_cpu->cl = &cpu_codelet;
 		task_cpu->cl = &cpu_codelet;
 		task_cpu->callback_func = NULL;
 		task_cpu->callback_func = NULL;
-		task_cpu->buffers[0].handle = float_array_handle;
-		task_cpu->buffers[0].mode = STARPU_RW;
+		task_cpu->handles[0] = float_array_handle;
 
 
 		task_gpu->cl = &gpu_codelet;
 		task_gpu->cl = &gpu_codelet;
 		task_gpu->callback_func = NULL;
 		task_gpu->callback_func = NULL;
-		task_gpu->buffers[0].handle = float_array_handle;
-		task_gpu->buffers[0].mode = STARPU_RW;
+		task_gpu->handles[0] = float_array_handle;
 
 
 		ret = starpu_task_submit(task_cpu);
 		ret = starpu_task_submit(task_cpu);
-		if (STARPU_UNLIKELY(ret == -ENODEV))
-		{
-			FPRINTF(stderr, "No worker may execute this task\n");
-			exit(0);
-		}
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 
 
 		ret = starpu_task_submit(task_gpu);
 		ret = starpu_task_submit(task_gpu);
-		if (STARPU_UNLIKELY(ret == -ENODEV))
-		{
-			FPRINTF(stderr, "No worker may execute this task\n");
-			exit(0);
-		}
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
         }
         }
 
 
-	starpu_task_wait_for_all();
+	ret = starpu_task_wait_for_all();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
 	starpu_data_unregister(float_array_handle);
 	starpu_data_unregister(float_array_handle);
         starpu_shutdown();
         starpu_shutdown();
 
 
-        return 0;
+        return EXIT_SUCCESS;
+
+enodev:
+	starpu_data_unregister(float_array_handle);
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
 }
 }
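The task-side counterpart, sketched as a self-contained program: task->buffers[i].handle/.mode pairs become task->handles[i] (the mode now lives in the codelet), and -ENODEV from starpu_task_submit leads to unregistering, shutting down and skipping the test. The no-op kernel and the registered variable are placeholders:

#include <errno.h>
#include <stdlib.h>
#include <starpu.h>
#include "../helper.h"

static void noop(void *descr[] __attribute__ ((unused)), void *arg __attribute__ ((unused)))
{
}

static struct starpu_codelet cl =
{
	.where = STARPU_CPU,
	.cpu_funcs = {noop, NULL},
	.nbuffers = 1,
	.modes = {STARPU_RW}
};

int main(int argc, char **argv)
{
	float foo = 0.0f;
	starpu_data_handle_t handle;
	int ret;

	ret = starpu_init(NULL);
	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");

	starpu_variable_data_register(&handle, 0, (uintptr_t)&foo, sizeof(foo));

	struct starpu_task *task = starpu_task_create();
	task->cl = &cl;
	task->handles[0] = handle;	/* was: task->buffers[0].handle/.mode */

	ret = starpu_task_submit(task);
	if (ret == -ENODEV) goto enodev;
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");

	ret = starpu_task_wait_for_all();
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
	starpu_data_unregister(handle);
	starpu_shutdown();
	return EXIT_SUCCESS;

enodev:
	/* No worker can run the kernel: not a StarPU error, skip the test. */
	starpu_data_unregister(handle);
	starpu_shutdown();
	return STARPU_TEST_SKIPPED;
}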

+ 24 - 15
tests/datawizard/critical_section_with_void_interface.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010  Université de Bordeaux 1
  * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2012  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -14,29 +15,35 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
  */
 
 
+#include <config.h>
 #include <stdio.h>
 #include <stdio.h>
 #include <unistd.h>
 #include <unistd.h>
 #include <errno.h>
 #include <errno.h>
 #include <starpu.h>
 #include <starpu.h>
 #include <stdlib.h>
 #include <stdlib.h>
+#include "../helper.h"
 
 
-starpu_data_handle void_handle;
+starpu_data_handle_t void_handle;
 
 
 int critical_var;
 int critical_var;
 
 
 static void critical_section(void *descr[], __attribute__ ((unused)) void *_args)
 static void critical_section(void *descr[], __attribute__ ((unused)) void *_args)
 {
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	/* We do not protect this variable because it is only accessed when the
 	/* We do not protect this variable because it is only accessed when the
 	 * "void_handle" piece of data is accessed. */
 	 * "void_handle" piece of data is accessed. */
 	critical_var++;
 	critical_var++;
 }
 }
 
 
-static starpu_codelet cl = {
+static struct starpu_codelet cl =
+{
 	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
-	.cpu_func = critical_section,
-	.cuda_func = critical_section,
-	.opencl_func = critical_section,
-	.nbuffers = 1
+	.cpu_funcs = {critical_section, NULL},
+	.cuda_funcs = {critical_section, NULL},
+	.opencl_funcs = {critical_section, NULL},
+	.nbuffers = 1,
+	.modes = {STARPU_RW}
 };
 };
 
 
 int main(int argc, char **argv)
 int main(int argc, char **argv)
@@ -48,7 +55,9 @@ int main(int argc, char **argv)
 	ntasks /= 10;
 	ntasks /= 10;
 #endif
 #endif
 
 
-	starpu_init(NULL);
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 
 	critical_var = 0;
 	critical_var = 0;
 
 
@@ -59,13 +68,12 @@ int main(int argc, char **argv)
 	for (i = 0; i < ntasks; i++)
 	for (i = 0; i < ntasks; i++)
 	{
 	{
 		struct starpu_task *task = starpu_task_create();
 		struct starpu_task *task = starpu_task_create();
-			task->cl = &cl;
-			task->buffers[0].handle = void_handle;
-			task->buffers[0].mode = STARPU_RW;
-	
+		task->cl = &cl;
+		task->handles[0] = void_handle;
+
 		ret = starpu_task_submit(task);
 		ret = starpu_task_submit(task);
-		if (ret == -ENODEV)
-			goto enodev;
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	}
 	}
 
 
 	starpu_data_unregister(void_handle);
 	starpu_data_unregister(void_handle);
@@ -74,11 +82,12 @@ int main(int argc, char **argv)
 
 
 	starpu_shutdown();
 	starpu_shutdown();
 
 
-	return 0;
+	return EXIT_SUCCESS;
 
 
 enodev:
 enodev:
 	fprintf(stderr, "WARNING: No one can execute this task\n");
 	fprintf(stderr, "WARNING: No one can execute this task\n");
 	/* yes, we do not perform the computation but we did detect that no one
 	/* yes, we do not perform the computation but we did detect that no one
  	 * could perform the kernel, so this is not an error from StarPU */
  	 * could perform the kernel, so this is not an error from StarPU */
-	return 77;
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
 }
 }

+ 66 - 32
tests/datawizard/data_implicit_deps.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010  Université de Bordeaux 1
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -15,61 +15,78 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
  */
 
 
+#include <config.h>
 #include <stdio.h>
 #include <stdio.h>
 #include <unistd.h>
 #include <unistd.h>
 #include <errno.h>
 #include <errno.h>
 #include <starpu.h>
 #include <starpu.h>
 #include <stdlib.h>
 #include <stdlib.h>
+#include "../helper.h"
 
 
 #define VECTORSIZE	1024
 #define VECTORSIZE	1024
-#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
 
 
 static unsigned *A, *B, *C, *D;
 static unsigned *A, *B, *C, *D;
-starpu_data_handle A_handle, B_handle, C_handle, D_handle;
+starpu_data_handle_t A_handle, B_handle, C_handle, D_handle;
 
 
 static unsigned var = 0;
 static unsigned var = 0;
 
 
 static void f(void *descr[], __attribute__ ((unused)) void *_args)
 static void f(void *descr[], __attribute__ ((unused)) void *_args)
 {
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	usleep(200000);
 	usleep(200000);
 }
 }
 
 
-static starpu_codelet cl_f = {
+static struct starpu_codelet cl_f =
+{
+	.modes = { STARPU_R, STARPU_RW },
 	.where = STARPU_CPU|STARPU_CUDA,
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = f,
-	.cuda_func = f,
+	.cpu_funcs = {f, NULL},
+	.cuda_funcs = {f, NULL},
 	.nbuffers = 2
 	.nbuffers = 2
 };
 };
 
 
 static void g(void *descr[], __attribute__ ((unused)) void *_args)
 static void g(void *descr[], __attribute__ ((unused)) void *_args)
 {
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	usleep(100000);
 	usleep(100000);
 	var = 42;
 	var = 42;
 }
 }
 
 
-static starpu_codelet cl_g = {
+static struct starpu_codelet cl_g =
+{
+	.modes = { STARPU_R, STARPU_RW },
 	.where = STARPU_CPU|STARPU_CUDA,
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = g,
-	.cuda_func = g,
+	.cpu_funcs = {g, NULL},
+	.cuda_funcs = {g, NULL},
 	.nbuffers = 2
 	.nbuffers = 2
 };
 };
 
 
 static void h(void *descr[], __attribute__ ((unused)) void *_args)
 static void h(void *descr[], __attribute__ ((unused)) void *_args)
 {
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	FPRINTF(stderr, "VAR %u (should be 42)\n", var);
 	FPRINTF(stderr, "VAR %u (should be 42)\n", var);
 	STARPU_ASSERT(var == 42);
 	STARPU_ASSERT(var == 42);
 }
 }
 
 
-static starpu_codelet cl_h = {
+static struct starpu_codelet cl_h =
+{
+	.modes = { STARPU_R, STARPU_RW },
 	.where = STARPU_CPU|STARPU_CUDA,
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = h,
-	.cuda_func = h,
+	.cpu_funcs = {h, NULL},
+	.cuda_funcs = {h, NULL},
 	.nbuffers = 2
 	.nbuffers = 2
 };
 };
 
 
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
-	starpu_init(NULL);
+	int ret;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 
 	A = (unsigned *) malloc(VECTORSIZE*sizeof(unsigned));
 	A = (unsigned *) malloc(VECTORSIZE*sizeof(unsigned));
 	B = (unsigned *) malloc(VECTORSIZE*sizeof(unsigned));
 	B = (unsigned *) malloc(VECTORSIZE*sizeof(unsigned));
@@ -81,12 +98,12 @@ int main(int argc, char **argv)
 	starpu_vector_data_register(&C_handle, 0, (uintptr_t)C, VECTORSIZE, sizeof(unsigned));
 	starpu_vector_data_register(&C_handle, 0, (uintptr_t)C, VECTORSIZE, sizeof(unsigned));
 	starpu_vector_data_register(&D_handle, 0, (uintptr_t)D, VECTORSIZE, sizeof(unsigned));
 	starpu_vector_data_register(&D_handle, 0, (uintptr_t)D, VECTORSIZE, sizeof(unsigned));
 
 
-	#if 0
+#if 0
 	starpu_data_set_sequential_consistency_flag(A_handle, 0);
 	starpu_data_set_sequential_consistency_flag(A_handle, 0);
 	starpu_data_set_sequential_consistency_flag(B_handle, 0);
 	starpu_data_set_sequential_consistency_flag(B_handle, 0);
 	starpu_data_set_sequential_consistency_flag(C_handle, 0);
 	starpu_data_set_sequential_consistency_flag(C_handle, 0);
 	starpu_data_set_sequential_consistency_flag(D_handle, 0);
 	starpu_data_set_sequential_consistency_flag(D_handle, 0);
-	#endif
+#endif
 
 
 	/* 	f(Ar, Brw): sleep 
 	/* 	f(Ar, Brw): sleep 
 	 *	g(Br; Crw); sleep, var = 42
 	 *	g(Br; Crw); sleep, var = 42
@@ -94,29 +111,35 @@ int main(int argc, char **argv)
 	 */
 	 */
 	struct starpu_task *task_f = starpu_task_create();
 	struct starpu_task *task_f = starpu_task_create();
 	task_f->cl = &cl_f;
 	task_f->cl = &cl_f;
-	task_f->buffers[0].handle = A_handle;
-	task_f->buffers[0].mode = STARPU_R;
-	task_f->buffers[1].handle = B_handle;
-	task_f->buffers[1].mode = STARPU_RW;
-	starpu_task_submit(task_f);
+	task_f->handles[0] = A_handle;
+	task_f->handles[1] = B_handle;
+	ret = starpu_task_submit(task_f);
+	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 
 
 	struct starpu_task *task_g = starpu_task_create();
 	struct starpu_task *task_g = starpu_task_create();
 	task_g->cl = &cl_g;
 	task_g->cl = &cl_g;
-	task_g->buffers[0].handle = B_handle;
-	task_g->buffers[0].mode = STARPU_R;
-	task_g->buffers[1].handle = C_handle;
-	task_g->buffers[1].mode = STARPU_RW;
-	starpu_task_submit(task_g);
+	task_g->handles[0] = B_handle;
+	task_g->handles[1] = C_handle;
+	ret = starpu_task_submit(task_g);
+	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 
 
 	struct starpu_task *task_h = starpu_task_create();
 	struct starpu_task *task_h = starpu_task_create();
 	task_h->cl = &cl_h;
 	task_h->cl = &cl_h;
-	task_h->buffers[0].handle = C_handle;
-	task_h->buffers[0].mode = STARPU_R;
-	task_h->buffers[1].handle = D_handle;
-	task_h->buffers[1].mode = STARPU_RW;
-	starpu_task_submit(task_h);
+	task_h->handles[0] = C_handle;
+	task_h->handles[1] = D_handle;
+	ret = starpu_task_submit(task_h);
+	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 
 
-	starpu_task_wait_for_all();
+	ret = starpu_task_wait_for_all();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
+
+	starpu_data_unregister(A_handle);
+	starpu_data_unregister(B_handle);
+	starpu_data_unregister(C_handle);
+	starpu_data_unregister(D_handle);
 
 
 	free(A);
 	free(A);
 	free(B);
 	free(B);
@@ -125,5 +148,16 @@ int main(int argc, char **argv)
 
 
 	starpu_shutdown();
 	starpu_shutdown();
 
 
-	return 0;
+	return EXIT_SUCCESS;
+
+enodev:
+	free(A);
+	free(B);
+	free(C);
+	free(D);
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
 }
 }

+ 46 - 30
tests/datawizard/data_invalidation.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012  Université de Bordeaux 1
+ * Copyright (C) 2012  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -14,18 +15,23 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
  */
 
 
+#include <config.h>
 #include <stdio.h>
 #include <stdio.h>
 #include <unistd.h>
 #include <unistd.h>
 #include <errno.h>
 #include <errno.h>
 #include <starpu.h>
 #include <starpu.h>
 #include <starpu_cuda.h>
 #include <starpu_cuda.h>
 #include <stdlib.h>
 #include <stdlib.h>
+#include "../helper.h"
 
 
+#ifdef STARPU_SLOW_MACHINE
+#define NLOOPS		100
+#else
 #define NLOOPS		1000
 #define NLOOPS		1000
+#endif
 #define VECTORSIZE	1024
 #define VECTORSIZE	1024
-#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
 
 
-static starpu_data_handle v_handle;
+static starpu_data_handle_t v_handle;
 
 
 /*
 /*
  *	Memset
  *	Memset
@@ -34,6 +40,8 @@ static starpu_data_handle v_handle;
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 static void cuda_memset_codelet(void *descr[], __attribute__ ((unused)) void *_args)
 static void cuda_memset_codelet(void *descr[], __attribute__ ((unused)) void *_args)
 {
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	char *buf = (char *)STARPU_VECTOR_GET_PTR(descr[0]);
 	char *buf = (char *)STARPU_VECTOR_GET_PTR(descr[0]);
 	unsigned length = STARPU_VECTOR_GET_NX(descr[0]);
 	unsigned length = STARPU_VECTOR_GET_NX(descr[0]);
 
 
@@ -44,19 +52,23 @@ static void cuda_memset_codelet(void *descr[], __attribute__ ((unused)) void *_a
 
 
 static void cpu_memset_codelet(void *descr[], __attribute__ ((unused)) void *_args)
 static void cpu_memset_codelet(void *descr[], __attribute__ ((unused)) void *_args)
 {
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	char *buf = (char *)STARPU_VECTOR_GET_PTR(descr[0]);
 	char *buf = (char *)STARPU_VECTOR_GET_PTR(descr[0]);
 	unsigned length = STARPU_VECTOR_GET_NX(descr[0]);
 	unsigned length = STARPU_VECTOR_GET_NX(descr[0]);
 
 
 	memset(buf, 42, length);
 	memset(buf, 42, length);
 }
 }
 
 
-static starpu_codelet memset_cl = {
+static struct starpu_codelet memset_cl =
+{
 	.where = STARPU_CPU|STARPU_CUDA,
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = cpu_memset_codelet,
+	.cpu_funcs = {cpu_memset_codelet, NULL},
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-	.cuda_func = cuda_memset_codelet,
+	.cuda_funcs = {cuda_memset_codelet, NULL},
 #endif
 #endif
-	.nbuffers = 1
+	.nbuffers = 1,
+	.modes = {STARPU_W}
 };
 };
 
 
 /*
 /*
@@ -65,6 +77,8 @@ static starpu_codelet memset_cl = {
 
 
 static void cpu_check_content_codelet(void *descr[], __attribute__ ((unused)) void *_args)
 static void cpu_check_content_codelet(void *descr[], __attribute__ ((unused)) void *_args)
 {
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	char *buf = (char *)STARPU_VECTOR_GET_PTR(descr[0]);
 	char *buf = (char *)STARPU_VECTOR_GET_PTR(descr[0]);
 	unsigned length = STARPU_VECTOR_GET_NX(descr[0]);
 	unsigned length = STARPU_VECTOR_GET_NX(descr[0]);
 
 
@@ -79,10 +93,12 @@ static void cpu_check_content_codelet(void *descr[], __attribute__ ((unused)) vo
 	}
 	}
 }
 }
 
 
-static starpu_codelet check_content_cl = {
+static struct starpu_codelet check_content_cl =
+{
 	.where = STARPU_CPU,
 	.where = STARPU_CPU,
-	.cpu_func = cpu_check_content_codelet,
-	.nbuffers = 1
+	.cpu_funcs = {cpu_check_content_codelet, NULL},
+	.nbuffers = 1,
+	.modes = {STARPU_R}
 };
 };
 
 
 
 
@@ -90,7 +106,9 @@ int main(int argc, char **argv)
 {
 {
 	int ret;
 	int ret;
 
 
-	starpu_init(NULL);
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 
 	/* The buffer should never be explicitely allocated */
 	/* The buffer should never be explicitely allocated */
 	starpu_vector_data_register(&v_handle, (uint32_t)-1, (uintptr_t)NULL, VECTORSIZE, sizeof(char));
 	starpu_vector_data_register(&v_handle, (uint32_t)-1, (uintptr_t)NULL, VECTORSIZE, sizeof(char));
@@ -103,31 +121,27 @@ int main(int argc, char **argv)
 
 
 		memset_task = starpu_task_create();
 		memset_task = starpu_task_create();
 		memset_task->cl = &memset_cl;
 		memset_task->cl = &memset_cl;
-		memset_task->buffers[0].handle = v_handle;
-		memset_task->buffers[0].mode = STARPU_W;
+		memset_task->handles[0] = v_handle;
 		memset_task->detach = 0;
 		memset_task->detach = 0;
-	
+
 		ret = starpu_task_submit(memset_task);
 		ret = starpu_task_submit(memset_task);
-		if (ret == -ENODEV)
-				goto enodev;
-	
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
 		ret = starpu_task_wait(memset_task);
 		ret = starpu_task_wait(memset_task);
-		if (ret)
-			exit(-1);
-		
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait");
+
 		check_content_task = starpu_task_create();
 		check_content_task = starpu_task_create();
 		check_content_task->cl = &check_content_cl;
 		check_content_task->cl = &check_content_cl;
-		check_content_task->buffers[0].handle = v_handle;
-		check_content_task->buffers[0].mode = STARPU_R;
+		check_content_task->handles[0] = v_handle;
 		check_content_task->detach = 0;
 		check_content_task->detach = 0;
-	
+
 		ret = starpu_task_submit(check_content_task);
 		ret = starpu_task_submit(check_content_task);
-		if (ret == -ENODEV)
-				goto enodev;
-	
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
 		ret = starpu_task_wait(check_content_task);
 		ret = starpu_task_wait(check_content_task);
-		if (ret)
-			exit(-1);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait");
 
 
 		starpu_data_invalidate(v_handle);
 		starpu_data_invalidate(v_handle);
 	}
 	}
@@ -137,11 +151,13 @@ int main(int argc, char **argv)
 
 
 	starpu_shutdown();
 	starpu_shutdown();
 
 
-	return 0;
+	return EXIT_SUCCESS;
 
 
 enodev:
 enodev:
+	starpu_data_unregister(v_handle);
 	fprintf(stderr, "WARNING: No one can execute this task\n");
 	fprintf(stderr, "WARNING: No one can execute this task\n");
 	/* yes, we do not perform the computation but we did detect that no one
 	/* yes, we do not perform the computation but we did detect that no one
  	 * could perform the kernel, so this is not an error from StarPU */
  	 * could perform the kernel, so this is not an error from StarPU */
-	return 77;
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
 }
 }
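A brief sketch of the lazy-allocation idiom this test exercises, assuming only the calls visible in the hunks above: registering a vector with memory node (uint32_t)-1 and a NULL pointer lets StarPU allocate the buffer on first use, and starpu_data_invalidate() drops every cached copy so the data must be rewritten before the next read:

#include <errno.h>
#include <stdlib.h>
#include <starpu.h>
#include "../helper.h"

#define SIZE 1024

int main(int argc, char **argv)
{
	starpu_data_handle_t v_handle;
	int ret;

	ret = starpu_init(NULL);
	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");

	/* No initial pointer: node (uint32_t)-1 and a NULL address mean the
	 * buffer is allocated lazily, wherever it is first written. */
	starpu_vector_data_register(&v_handle, (uint32_t)-1, (uintptr_t)NULL,
				    SIZE, sizeof(char));

	/* ... submit a task writing the vector, then one reading it back ... */

	/* Drop every cached copy; the data must be written again before any
	 * further read. */
	starpu_data_invalidate(v_handle);

	starpu_data_unregister(v_handle);
	starpu_shutdown();
	return EXIT_SUCCESS;
}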

+ 48 - 39
tests/datawizard/data_lookup.c

@@ -20,6 +20,7 @@
 #include <starpu.h>
 #include <starpu.h>
 #include <stdlib.h>
 #include <stdlib.h>
 #include <sys/types.h>
 #include <sys/types.h>
+#include "../helper.h"
 
 
 static void task(void **buffers, void *args)
 static void task(void **buffers, void *args)
 {
 {
@@ -27,17 +28,19 @@ static void task(void **buffers, void *args)
 	size_t size, i;
 	size_t size, i;
 
 
 	numbers = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);
 	numbers = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);
-	starpu_unpack_cl_args (args, &size);
+	starpu_codelet_unpack_args (args, &size);
 	for(i = 0; i < size; i++)
 	for(i = 0; i < size; i++)
 	{
 	{
 		numbers[i] = i;
 		numbers[i] = i;
 	}
 	}
 }
 }
 
 
-static starpu_codelet cl = {
+static struct starpu_codelet cl =
+{
 	.where = STARPU_CPU,
 	.where = STARPU_CPU,
-	.cpu_func = task,
-	.nbuffers = 1
+	.cpu_funcs = {task, NULL},
+	.nbuffers = 1,
+	.modes = {STARPU_W}
 };
 };
 
 
 static int test_lazy_allocation()
 static int test_lazy_allocation()
@@ -46,7 +49,7 @@ static int test_lazy_allocation()
 
 
 	size_t i;
 	size_t i;
 	void *pointer;
 	void *pointer;
-	starpu_data_handle handle;
+	starpu_data_handle_t handle;
 	int ret;
 	int ret;
 
 
 	/* Lazily-allocated vector.  */
 	/* Lazily-allocated vector.  */
@@ -58,28 +61,31 @@ static int test_lazy_allocation()
 				 STARPU_VALUE, &count, sizeof(size_t),
 				 STARPU_VALUE, &count, sizeof(size_t),
 				 0);
 				 0);
 	if (ret == -ENODEV) return ret;
 	if (ret == -ENODEV) return ret;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+
 	/* yes, we do not perform the computation but we did detect that no one
 	/* yes, we do not perform the computation but we did detect that no one
 	 * could perform the kernel, so this is not an error from StarPU */
 	 * could perform the kernel, so this is not an error from StarPU */
 
 
 	/* Acquire the handle, forcing a local allocation.  */
 	/* Acquire the handle, forcing a local allocation.  */
-	starpu_data_acquire(handle, STARPU_R);
+	ret = starpu_data_acquire(handle, STARPU_R);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
 
 
 	/* Make sure we have a local pointer to it.  */
 	/* Make sure we have a local pointer to it.  */
 	pointer = starpu_handle_get_local_ptr(handle);
 	pointer = starpu_handle_get_local_ptr(handle);
-	assert(pointer != NULL);
+	STARPU_ASSERT(pointer != NULL);
 	for(i = 0; i < count; i++)
 	for(i = 0; i < count; i++)
 	{
 	{
 		float *numbers = (float *)pointer;
 		float *numbers = (float *)pointer;
-		assert(numbers[i] == i);
+		STARPU_ASSERT(numbers[i] == i);
 	}
 	}
 
 
 	/* Make sure the pointer/handle mapping is up-to-date.  */
 	/* Make sure the pointer/handle mapping is up-to-date.  */
-	assert(starpu_data_lookup(pointer) == handle);
+	STARPU_ASSERT(starpu_data_lookup(pointer) == handle);
 
 
 	starpu_data_release(handle);
 	starpu_data_release(handle);
 	starpu_data_unregister(handle);
 	starpu_data_unregister(handle);
 
 
-	assert(starpu_data_lookup(pointer) == NULL);
+	STARPU_ASSERT(starpu_data_lookup(pointer) == NULL);
 	return 0;
 	return 0;
 }
 }
 
 
@@ -91,12 +97,12 @@ static int test_lazy_allocation()
 static void test_filters()
 static void test_filters()
 {
 {
 #define CHILDREN_COUNT 10
 #define CHILDREN_COUNT 10
-	int err, i;
+	int ret, i;
 	int *ptr, *children_pointers[CHILDREN_COUNT];
 	int *ptr, *children_pointers[CHILDREN_COUNT];
-	starpu_data_handle handle;
+	starpu_data_handle_t handle;
 
 
-	err = starpu_malloc((void**)&ptr, VECTOR_SIZE * sizeof(*ptr));
-	assert(err == 0);
+	ret = starpu_malloc((void**)&ptr, VECTOR_SIZE * sizeof(*ptr));
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_malloc");
 
 
 	starpu_vector_data_register(&handle, 0, (uintptr_t)ptr,
 	starpu_vector_data_register(&handle, 0, (uintptr_t)ptr,
 				    VECTOR_SIZE, sizeof(*ptr));
 				    VECTOR_SIZE, sizeof(*ptr));
@@ -107,18 +113,18 @@ static void test_filters()
 		.nchildren = CHILDREN_COUNT
 		.nchildren = CHILDREN_COUNT
 	};
 	};
 	starpu_data_partition(handle, &f);
 	starpu_data_partition(handle, &f);
-	assert(starpu_data_get_nb_children(handle) == CHILDREN_COUNT);
+	STARPU_ASSERT(starpu_data_get_nb_children(handle) == CHILDREN_COUNT);
 
 
 	for (i = 0; i < CHILDREN_COUNT; i++)
 	for (i = 0; i < CHILDREN_COUNT; i++)
 	{
 	{
-                starpu_data_handle child;
+                starpu_data_handle_t child;
 
 
 		child = starpu_data_get_sub_data(handle, 1, i);
 		child = starpu_data_get_sub_data(handle, 1, i);
 		children_pointers[i] = (int *) starpu_handle_get_local_ptr(child);
 		children_pointers[i] = (int *) starpu_handle_get_local_ptr(child);
-		assert(children_pointers[i] != NULL);
+		STARPU_ASSERT(children_pointers[i] != NULL);
 
 
 		/* Make sure we have a pointer -> handle mapping for CHILD.  */
 		/* Make sure we have a pointer -> handle mapping for CHILD.  */
-		assert(starpu_data_lookup(children_pointers[i]) == child);
+		STARPU_ASSERT(starpu_data_lookup(children_pointers[i]) == child);
 	}
 	}
 
 
 	starpu_data_unpartition(handle, 0);
 	starpu_data_unpartition(handle, 0);
@@ -127,11 +133,11 @@ static void test_filters()
 	{
 	{
 		if (children_pointers[i] != ptr)
 		if (children_pointers[i] != ptr)
 			/* Make sure the pointer -> handle mapping is gone.  */
 			/* Make sure the pointer -> handle mapping is gone.  */
-			assert(starpu_data_lookup(children_pointers[i]) == NULL);
+			STARPU_ASSERT(starpu_data_lookup(children_pointers[i]) == NULL);
 	}
 	}
 
 
 	/* Make sure the parent's mapping is back.  */
 	/* Make sure the parent's mapping is back.  */
-	assert(starpu_data_lookup(ptr) == handle);
+	STARPU_ASSERT(starpu_data_lookup(ptr) == handle);
 
 
 	starpu_data_unregister(handle);
 	starpu_data_unregister(handle);
 	starpu_free(ptr);
 	starpu_free(ptr);
@@ -141,20 +147,22 @@ static void test_filters()
 
 
 int main(int argc, char *argv[])
 int main(int argc, char *argv[])
 {
 {
-	int err;
+	int ret;
 	size_t i;
 	size_t i;
 	void *vectors[VECTOR_COUNT], *variables[VARIABLE_COUNT];
 	void *vectors[VECTOR_COUNT], *variables[VARIABLE_COUNT];
-	starpu_data_handle vector_handles[VECTOR_COUNT];
-	starpu_data_handle variable_handles[VARIABLE_COUNT];
+	starpu_data_handle_t vector_handles[VECTOR_COUNT];
+	starpu_data_handle_t variable_handles[VARIABLE_COUNT];
 
 
-	starpu_init(NULL);
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 
 	/* Register data regions.  */
 	/* Register data regions.  */
 
 
 	for(i = 0; i < VARIABLE_COUNT; i++)
 	for(i = 0; i < VARIABLE_COUNT; i++)
 	{
 	{
-		err = starpu_malloc(&variables[i], sizeof(float));
-		assert(err == 0);
+		ret = starpu_malloc(&variables[i], sizeof(float));
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_malloc");
 		starpu_variable_data_register(&variable_handles[i], 0,
 		starpu_variable_data_register(&variable_handles[i], 0,
 					      (uintptr_t)variables[i],
 					      (uintptr_t)variables[i],
 					      sizeof(float));
 					      sizeof(float));
@@ -162,8 +170,8 @@ int main(int argc, char *argv[])
 
 
 	for(i = 0; i < VECTOR_COUNT; i++)
 	for(i = 0; i < VECTOR_COUNT; i++)
 	{
 	{
-		err = starpu_malloc(&vectors[i], VECTOR_SIZE * sizeof(float));
-		assert(err == 0);
+		ret = starpu_malloc(&vectors[i], VECTOR_SIZE * sizeof(float));
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_malloc");
 		starpu_vector_data_register(&vector_handles[i], 0,
 		starpu_vector_data_register(&vector_handles[i], 0,
 					    (uintptr_t)vectors[i],
 					    (uintptr_t)vectors[i],
 					    VECTOR_SIZE, sizeof(float));
 					    VECTOR_SIZE, sizeof(float));
@@ -173,18 +181,18 @@ int main(int argc, char *argv[])
 
 
 	for(i = 0; i < VARIABLE_COUNT; i++)
 	for(i = 0; i < VARIABLE_COUNT; i++)
 	{
 	{
-		starpu_data_handle handle;
+		starpu_data_handle_t handle;
 
 
 		handle = starpu_data_lookup(variables[i]);
 		handle = starpu_data_lookup(variables[i]);
-		assert(handle == variable_handles[i]);
+		STARPU_ASSERT(handle == variable_handles[i]);
 	}
 	}
 
 
 	for(i = 0; i < VECTOR_COUNT; i++)
 	for(i = 0; i < VECTOR_COUNT; i++)
 	{
 	{
-		starpu_data_handle handle;
+		starpu_data_handle_t handle;
 
 
 		handle = starpu_data_lookup(vectors[i]);
 		handle = starpu_data_lookup(vectors[i]);
-		assert(handle == vector_handles[i]);
+		STARPU_ASSERT(handle == vector_handles[i]);
 	}
 	}
 
 
 	/* Unregister them.  */
 	/* Unregister them.  */
@@ -203,24 +211,24 @@ int main(int argc, char *argv[])
 
 
 	for(i = 0; i < VARIABLE_COUNT; i++)
 	for(i = 0; i < VARIABLE_COUNT; i++)
 	{
 	{
-		starpu_data_handle handle;
+		starpu_data_handle_t handle;
 
 
 		handle = starpu_data_lookup(variables[i]);
 		handle = starpu_data_lookup(variables[i]);
-		assert(handle == NULL);
+		STARPU_ASSERT(handle == NULL);
 		starpu_free(variables[i]);
 		starpu_free(variables[i]);
 	}
 	}
 
 
 	for(i = 0; i < VECTOR_COUNT; i++)
 	for(i = 0; i < VECTOR_COUNT; i++)
 	{
 	{
-		starpu_data_handle handle;
+		starpu_data_handle_t handle;
 
 
 		handle = starpu_data_lookup(vectors[i]);
 		handle = starpu_data_lookup(vectors[i]);
-		assert(handle == NULL);
+		STARPU_ASSERT(handle == NULL);
 		starpu_free(vectors[i]);
 		starpu_free(vectors[i]);
 	}
 	}
 
 
-	err = test_lazy_allocation();
-	if (err == -ENODEV) goto enodev;
+	ret = test_lazy_allocation();
+	if (ret == -ENODEV) goto enodev;
 	test_filters();
 	test_filters();
 
 
 	starpu_shutdown();
 	starpu_shutdown();
@@ -231,5 +239,6 @@ enodev:
 	fprintf(stderr, "WARNING: No one can execute this task\n");
 	fprintf(stderr, "WARNING: No one can execute this task\n");
 	/* yes, we do not perform the computation but we did detect that no one
 	/* yes, we do not perform the computation but we did detect that no one
  	 * could perform the kernel, so this is not an error from StarPU */
  	 * could perform the kernel, so this is not an error from StarPU */
-	return 77;
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
 }
 }
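A short sketch of the argument-packing interface as renamed here: scalars passed to starpu_insert_task() via STARPU_VALUE are retrieved in the kernel with starpu_codelet_unpack_args() (formerly starpu_unpack_cl_args). The vector size and kernel are illustrative:

#include <errno.h>
#include <stdlib.h>
#include <starpu.h>
#include "../helper.h"

static void fill(void *buffers[], void *args)
{
	size_t size, i;
	float *v = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);

	/* Retrieve the scalar packed with STARPU_VALUE at submission time. */
	starpu_codelet_unpack_args(args, &size);
	for (i = 0; i < size; i++)
		v[i] = i;
}

static struct starpu_codelet cl =
{
	.where = STARPU_CPU,
	.cpu_funcs = {fill, NULL},
	.nbuffers = 1,
	.modes = {STARPU_W}
};

int main(int argc, char **argv)
{
	float *v;
	size_t count = 16;
	starpu_data_handle_t handle;
	int ret;

	ret = starpu_init(NULL);
	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");

	ret = starpu_malloc((void **)&v, count * sizeof(*v));
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_malloc");
	starpu_vector_data_register(&handle, 0, (uintptr_t)v, count, sizeof(*v));

	/* The size travels with the task as a packed argument. */
	ret = starpu_insert_task(&cl, STARPU_W, handle,
				 STARPU_VALUE, &count, sizeof(count),
				 0);
	if (ret != -ENODEV)
		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");

	starpu_task_wait_for_all();
	starpu_data_unregister(handle);
	starpu_free(v);
	starpu_shutdown();
	return ret == -ENODEV ? STARPU_TEST_SKIPPED : EXIT_SUCCESS;
}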

+ 35 - 18
tests/datawizard/dining_philosophers.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -16,28 +16,29 @@
  */
  */
 
 
 #include <starpu.h>
 #include <starpu.h>
+#include "../helper.h"
 
 
 /* number of philosophers */
 /* number of philosophers */
 #define N	16
 #define N	16
 
 
-starpu_data_handle fork_handles[N];
+starpu_data_handle_t fork_handles[N];
 unsigned forks[N];
 unsigned forks[N];
 
 
-#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
-
 static void eat_kernel(void *descr[], void *arg)
 static void eat_kernel(void *descr[], void *arg)
 {
 {
 }
 }
 
 
-static starpu_codelet eating_cl = {
+static struct starpu_codelet eating_cl =
+{
+	.modes = { STARPU_RW, STARPU_RW },
 	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
-	.cuda_func = eat_kernel,
-	.cpu_func = eat_kernel,
-        .opencl_func = eat_kernel,
+	.cuda_funcs = {eat_kernel, NULL},
+	.cpu_funcs = {eat_kernel, NULL},
+        .opencl_funcs = {eat_kernel, NULL},
 	.nbuffers = 2
 	.nbuffers = 2
 };
 };
 
 
-void submit_one_task(unsigned p)
+int submit_one_task(unsigned p)
 {
 {
 	struct starpu_task *task = starpu_task_create();
 	struct starpu_task *task = starpu_task_create();
 
 
@@ -46,18 +47,20 @@ void submit_one_task(unsigned p)
 	unsigned left = p;
 	unsigned left = p;
 	unsigned right = (p+1)%N;
 	unsigned right = (p+1)%N;
 
 
-	task->buffers[0].handle = fork_handles[left];
-	task->buffers[0].mode = STARPU_RW;
-	task->buffers[1].handle = fork_handles[right];
-	task->buffers[1].mode = STARPU_RW;
+	task->handles[0] = fork_handles[left];
+	task->handles[1] = fork_handles[right];
 
 
 	int ret = starpu_task_submit(task);
 	int ret = starpu_task_submit(task);
-	STARPU_ASSERT(!ret);
+	return ret;
 }
 }
 
 
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
-	starpu_init(NULL);
+	int ret;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 
 	/* initialize the forks */
 	/* initialize the forks */
 	unsigned f;
 	unsigned f;
@@ -75,10 +78,13 @@ int main(int argc, char **argv)
 	{
 	{
 		/* select one philosopher randomly */
 		/* select one philosopher randomly */
 		unsigned philosopher = rand() % N;
 		unsigned philosopher = rand() % N;
-		submit_one_task(philosopher);
+		ret = submit_one_task(philosopher);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	}
 	}
 
 
-	starpu_task_wait_for_all();
+	ret = starpu_task_wait_for_all();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
 
 
 	FPRINTF(stderr, "waiting done\n");
 	FPRINTF(stderr, "waiting done\n");
 	for (f = 0; f < N; f++)
 	for (f = 0; f < N; f++)
@@ -88,5 +94,16 @@ int main(int argc, char **argv)
 
 
 	starpu_shutdown();
 	starpu_shutdown();
 
 
-	return 0;
+	return EXIT_SUCCESS;
+
+enodev:
+	for (f = 0; f < N; f++)
+	{
+		starpu_data_unregister(fork_handles[f]);
+	}
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
 }
 }

+ 174 - 0
tests/datawizard/double_parameter.c

@@ -0,0 +1,174 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Université de Bordeaux 1
+ * Copyright (C) 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include "../helper.h"
+
+static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attribute__ ((unused)))
+{
+}
+
+static struct starpu_codelet codelet_R_R =
+{
+        .where = STARPU_CPU,
+        .cpu_funcs = { dummy_func, NULL },
+        .model = NULL,
+        .nbuffers = 2,
+	.modes = {STARPU_R, STARPU_R}
+};
+
+static struct starpu_codelet codelet_R_W =
+{
+        .where = STARPU_CPU,
+        .cpu_funcs = { dummy_func, NULL },
+        .model = NULL,
+        .nbuffers = 2,
+	.modes = {STARPU_R, STARPU_W}
+};
+
+static struct starpu_codelet codelet_R_RW =
+{
+        .where = STARPU_CPU,
+        .cpu_funcs = { dummy_func, NULL },
+        .model = NULL,
+        .nbuffers = 2,
+	.modes = {STARPU_R, STARPU_RW}
+};
+
+static struct starpu_codelet codelet_W_R =
+{
+        .where = STARPU_CPU,
+        .cpu_funcs = { dummy_func, NULL },
+        .model = NULL,
+        .nbuffers = 2,
+	.modes = {STARPU_W, STARPU_R}
+};
+
+static struct starpu_codelet codelet_W_W =
+{
+        .where = STARPU_CPU,
+        .cpu_funcs = { dummy_func, NULL },
+        .model = NULL,
+        .nbuffers = 2,
+	.modes = {STARPU_W, STARPU_W}
+};
+
+static struct starpu_codelet codelet_W_RW =
+{
+        .where = STARPU_CPU,
+        .cpu_funcs = { dummy_func, NULL },
+        .model = NULL,
+        .nbuffers = 2,
+	.modes = {STARPU_W, STARPU_RW}
+};
+
+static struct starpu_codelet codelet_RW_R =
+{
+        .where = STARPU_CPU,
+        .cpu_funcs = { dummy_func, NULL },
+        .model = NULL,
+        .nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_R}
+};
+
+static struct starpu_codelet codelet_RW_W =
+{
+        .where = STARPU_CPU,
+        .cpu_funcs = { dummy_func, NULL },
+        .model = NULL,
+        .nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_W}
+};
+
+static struct starpu_codelet codelet_RW_RW =
+{
+        .where = STARPU_CPU,
+        .cpu_funcs = { dummy_func, NULL },
+        .model = NULL,
+        .nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_RW}
+};
+
+int main(int argc, char **argv)
+{
+	float foo = 0.0f;
+	starpu_data_handle_t handle;
+	int ret;
+	struct starpu_task *task;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	starpu_variable_data_register(&handle, 0, (uintptr_t)&foo, sizeof(foo));
+
+#define SUBMIT(mode0, mode1) \
+	{ \
+		task = starpu_task_create();	\
+		task->handles[0] = handle;	\
+		task->handles[1] = handle;		 \
+		enum starpu_access_mode smode0 = STARPU_##mode0;	\
+		enum starpu_access_mode smode1 = STARPU_##mode1;	\
+		if      (smode0 == STARPU_R && smode1 == STARPU_R)	\
+			task->cl = &codelet_R_R;			\
+		else if (smode0 == STARPU_R && smode1 == STARPU_W)	\
+			task->cl = &codelet_R_W;			\
+		else if (smode0 == STARPU_R && smode1 == STARPU_RW)	\
+			task->cl = &codelet_R_RW;			\
+		else if (smode0 == STARPU_W && smode1 == STARPU_R)	\
+			task->cl = &codelet_W_R;			\
+		else if (smode0 == STARPU_W && smode1 == STARPU_W)	\
+			task->cl = &codelet_W_W;			\
+		else if (smode0 == STARPU_W && smode1 == STARPU_RW)	\
+			task->cl = &codelet_W_RW;			\
+		else if (smode0 == STARPU_RW && smode1 == STARPU_R)	\
+			task->cl = &codelet_RW_R;			\
+		else if (smode0 == STARPU_RW && smode1 == STARPU_W)	\
+			task->cl = &codelet_RW_W;			\
+		else if (smode0 == STARPU_RW && smode1 == STARPU_RW)	\
+			task->cl = &codelet_RW_RW;			\
+									\
+		ret = starpu_task_submit(task);				\
+		if (ret == -ENODEV) goto enodev;			\
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");   \
+	}
+
+	SUBMIT(R,R);
+	SUBMIT(R,W);
+	SUBMIT(R,RW);
+	SUBMIT(W,R);
+	SUBMIT(W,W);
+	SUBMIT(W,RW);
+	SUBMIT(RW,R);
+	SUBMIT(RW,W);
+	SUBMIT(RW,RW);
+
+	ret = starpu_task_wait_for_all();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
+	starpu_data_unregister(handle);
+	starpu_shutdown();
+
+        return EXIT_SUCCESS;
+
+enodev:
+	starpu_data_unregister(handle);
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
+}

+ 142 - 28
tests/datawizard/dsm_stress.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -21,6 +21,7 @@
 #include <starpu.h>
 #include <stdlib.h>
 #include <pthread.h>
+#include "../helper.h"
 
 #define N	10000
 
@@ -33,7 +34,7 @@ static unsigned finished = 0;
 
 static unsigned cnt = N;
 
-starpu_data_handle v_handle, v_handle2;
+starpu_data_handle_t v_handle, v_handle2;
 static unsigned *v;
 static unsigned *v2;
 
@@ -43,10 +44,10 @@ static void callback(void *arg)
 
 	if (res == 0)
 	{
-		pthread_mutex_lock(&mutex);
+		_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 		finished = 1;
-		pthread_cond_signal(&cond);
-		pthread_mutex_unlock(&mutex);
+		_STARPU_PTHREAD_COND_SIGNAL(&cond);
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 	}
 }
 
@@ -64,11 +65,12 @@ static void cpu_codelet_null(void *descr[], __attribute__ ((unused)) void *_args
 {
 }
 
-static starpu_access_mode select_random_mode(void)
+static enum starpu_access_mode select_random_mode(void)
 {
 	int r = rand();
 
-	switch (r % 3) {
+	switch (r % 3)
+	{
 		case 0:
 			return STARPU_R;
 		case 1:
@@ -79,22 +81,109 @@ static starpu_access_mode select_random_mode(void)
 	return STARPU_RW;
 }
 
+static struct starpu_codelet cl_r_r =
+{
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
+	.cpu_funcs = {cpu_codelet_null, NULL},
+	.cuda_funcs = {cuda_codelet_null, NULL},
+        .opencl_funcs = {opencl_codelet_null, NULL},
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_R}
+};
+
+static struct starpu_codelet cl_r_w =
+{
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
+	.cpu_funcs = {cpu_codelet_null, NULL},
+	.cuda_funcs = {cuda_codelet_null, NULL},
+        .opencl_funcs = {opencl_codelet_null, NULL},
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_W}
+};
+
+static struct starpu_codelet cl_r_rw =
+{
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
+	.cpu_funcs = {cpu_codelet_null, NULL},
+	.cuda_funcs = {cuda_codelet_null, NULL},
+        .opencl_funcs = {opencl_codelet_null, NULL},
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_RW}
+};
+
+static struct starpu_codelet cl_w_r =
+{
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
+	.cpu_funcs = {cpu_codelet_null, NULL},
+	.cuda_funcs = {cuda_codelet_null, NULL},
+        .opencl_funcs = {opencl_codelet_null, NULL},
+	.nbuffers = 2,
+	.modes = {STARPU_W, STARPU_R}
+};
+
+static struct starpu_codelet cl_w_w =
+{
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
+	.cpu_funcs = {cpu_codelet_null, NULL},
+	.cuda_funcs = {cuda_codelet_null, NULL},
+        .opencl_funcs = {opencl_codelet_null, NULL},
+	.nbuffers = 2,
+	.modes = {STARPU_W, STARPU_W}
+};
+
+static struct starpu_codelet cl_w_rw =
+{
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
+	.cpu_funcs = {cpu_codelet_null, NULL},
+	.cuda_funcs = {cuda_codelet_null, NULL},
+        .opencl_funcs = {opencl_codelet_null, NULL},
+	.nbuffers = 2,
+	.modes = {STARPU_W, STARPU_RW}
+};
 

-static starpu_codelet cl = {
+static struct starpu_codelet cl_rw_r =
+{
 	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
-	.cpu_func = cpu_codelet_null,
-	.cuda_func = cuda_codelet_null,
-        .opencl_func = opencl_codelet_null,
-	.nbuffers = 2
+	.cuda_funcs = {cuda_codelet_null, NULL},
+        .opencl_funcs = {opencl_codelet_null, NULL},
+	.nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_R}
+};
+
+static struct starpu_codelet cl_rw_w =
+{
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
+	.cpu_funcs = {cpu_codelet_null, NULL},
+	.cuda_funcs = {cuda_codelet_null, NULL},
+        .opencl_funcs = {opencl_codelet_null, NULL},
+	.nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_W}
+};
+
+static struct starpu_codelet cl_rw_rw =
+{
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
+	.cpu_funcs = {cpu_codelet_null, NULL},
+	.cuda_funcs = {cuda_codelet_null, NULL},
+        .opencl_funcs = {opencl_codelet_null, NULL},
+	.nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_RW}
 };
 };
 
 
 int main(int argc, char **argv)
 {
-	starpu_init(NULL);
+	int ret;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-	starpu_malloc((void **)&v, VECTORSIZE*sizeof(unsigned));
-	starpu_malloc((void **)&v2, VECTORSIZE*sizeof(unsigned));
+	ret = starpu_malloc((void **)&v, VECTORSIZE*sizeof(unsigned));
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_malloc");
+	ret = starpu_malloc((void **)&v2, VECTORSIZE*sizeof(unsigned));
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_malloc");
 
 	starpu_vector_data_register(&v_handle, 0, (uintptr_t)v, VECTORSIZE, sizeof(unsigned));
 	starpu_vector_data_register(&v_handle2, 0, (uintptr_t)v2, VECTORSIZE, sizeof(unsigned));
@@ -103,36 +192,61 @@ int main(int argc, char **argv)
 	for (iter = 0; iter < N; iter++)
 	{
 		struct starpu_task *task = starpu_task_create();
-		task->cl = &cl;
 
-		task->buffers[0].mode = select_random_mode();
-
-		task->buffers[1].handle = v_handle2;
-		task->buffers[1].mode = select_random_mode();
+		task->handles[0] = v_handle;
+		task->handles[1] = v_handle2;
+
+		enum starpu_access_mode mode0 = select_random_mode();
+		enum starpu_access_mode mode1 = select_random_mode();
+
+		if (mode0 == STARPU_R && mode1 == STARPU_R)
+			task->cl = &cl_r_r;
+		else if (mode0 == STARPU_R && mode1 == STARPU_W)
+			task->cl = &cl_r_w;
+		else if (mode0 == STARPU_R && mode1 == STARPU_RW)
+			task->cl = &cl_r_rw;
+		else if (mode0 == STARPU_W && mode1 == STARPU_R)
+			task->cl = &cl_w_r;
+		else if (mode0 == STARPU_W && mode1 == STARPU_W)
+			task->cl = &cl_w_w;
+		else if (mode0 == STARPU_W && mode1 == STARPU_RW)
+			task->cl = &cl_w_rw;
+		else if (mode0 == STARPU_RW && mode1 == STARPU_R)
+			task->cl = &cl_rw_r;
+		else if (mode0 == STARPU_RW && mode1 == STARPU_W)
+			task->cl = &cl_rw_w;
+		else if (mode0 == STARPU_RW && mode1 == STARPU_RW)
+			task->cl = &cl_rw_rw;

 		task->callback_func = callback;
 		task->callback_arg = NULL;
 
 		int ret = starpu_task_submit(task);
-		if (ret == -ENODEV)
-			goto enodev;
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	}
 
-	pthread_mutex_lock(&mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	if (!finished)
-		pthread_cond_wait(&cond, &mutex);
-	pthread_mutex_unlock(&mutex);
+		_STARPU_PTHREAD_COND_WAIT(&cond, &mutex);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 
+	starpu_data_unregister(v_handle);
+	starpu_data_unregister(v_handle2);
 	starpu_free(v);
 	starpu_free(v2);
 	starpu_shutdown();
 
-	return 0;
+	return EXIT_SUCCESS;
 
 enodev:
+	starpu_data_unregister(v_handle);
+	starpu_data_unregister(v_handle2);
+	starpu_free(v);
+	starpu_free(v2);
+	starpu_shutdown();
 	fprintf(stderr, "WARNING: No one can execute this task\n");
 	/* yes, we do not perform the computation but we did detect that no one
  	 * could perform the kernel, so this is not an error from StarPU */
-	return 77;
+	return STARPU_TEST_SKIPPED;
 }

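In dsm_stress.c the single generic codelet disappears: since access modes are now a static property of the codelet, the test declares one codelet per (mode0, mode1) combination and picks the right one after drawing the modes at random. Completion is still detected from a callback that signals a condition variable; below is a hedged sketch of that pattern using plain pthread calls and an assumed atomic decrement (the test itself uses StarPU's internal _STARPU_PTHREAD_* wrappers and its own counter handling):

static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static unsigned remaining = 10000;	/* one per submitted task */
static unsigned finished = 0;

static void callback(void *arg)
{
	(void)arg;
	/* assumption: the real test decrements its counter atomically */
	if (__sync_sub_and_fetch(&remaining, 1) == 0)
	{
		pthread_mutex_lock(&mutex);
		finished = 1;
		pthread_cond_signal(&cond);
		pthread_mutex_unlock(&mutex);
	}
}

static void wait_for_callbacks(void)
{
	pthread_mutex_lock(&mutex);
	while (!finished)
		pthread_cond_wait(&cond, &mutex);
	pthread_mutex_unlock(&mutex);
}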
+ 139 - 0
tests/datawizard/gpu_register.c

@@ -0,0 +1,139 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <starpu_opencl.h>
+#include <starpu_cuda.h>
+#include "../helper.h"
+#include "scal.h"
+
+int main(int argc, char **argv)
+{
+	int ret;
+#ifdef STARPU_USE_CUDA
+#if CUDART_VERSION >= 4000
+	unsigned *foo_gpu;
+	unsigned *foo;
+	starpu_data_handle_t handle;
+	int n, i, size, pieces;
+	int devid;
+	unsigned workerid;
+	int chosen = -1;
+	cudaError_t cures;
+#endif
+#endif
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+#ifdef STARPU_USE_CUDA
+#if CUDART_VERSION >= 4000 /* We need thread-safety of CUDA */
+	/* TODO OpenCL, too */
+	for (workerid = 0; workerid < starpu_worker_get_count(); workerid++) {
+		if (starpu_worker_get_type(workerid) == STARPU_CUDA_WORKER) {
+			chosen = workerid;
+			break;
+		}
+	}
+
+	if (chosen == -1)
+		return STARPU_TEST_SKIPPED;
+
+#ifdef STARPU_USE_OPENCL
+	ret = starpu_opencl_load_opencl_from_file("tests/datawizard/scal_opencl.cl", &opencl_program, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
+#endif
+
+	n = starpu_worker_get_count();
+	size = 10 * n;
+
+	devid = starpu_worker_get_devid(chosen);
+	cudaSetDevice(devid);
+	cudaMalloc((void**)&foo_gpu, size * sizeof(*foo_gpu));
+
+	foo = calloc(size, sizeof(*foo));
+	for (i = 0; i < size; i++)
+		foo[i] = i;
+
+	cures = cudaMemcpy(foo_gpu, foo, size * sizeof(*foo_gpu), cudaMemcpyHostToDevice);
+	if (STARPU_UNLIKELY(cures))
+		STARPU_CUDA_REPORT_ERROR(cures);
+
+	starpu_vector_data_register(&handle, starpu_worker_get_memory_node(chosen), (uintptr_t)foo_gpu, size, sizeof(*foo_gpu));
+
+	/* Broadcast the data to force in-place partitioning */
+	for (i = 0; i < n; i++)
+		starpu_data_prefetch_on_node(handle, starpu_worker_get_memory_node(i), 0);
+
+	/* Even with just one worker, split in at least two */
+	if (n == 1)
+		pieces = 2;
+	else
+		pieces = n;
+
+	struct starpu_data_filter f =
+	{
+		.filter_func = starpu_block_filter_func_vector,
+		.nchildren = pieces,
+	};
+
+	starpu_data_partition(handle, &f);
+
+	for (i = 0; i < pieces; i++) {
+		struct starpu_task *task = starpu_task_create();
+
+		task->handles[0] = starpu_data_get_sub_data(handle, 1, i);
+		task->cl = &scal_codelet;
+		task->execute_on_a_specific_worker = 1;
+		task->workerid = i%n;
+
+		ret = starpu_task_submit(task);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	ret = starpu_task_wait_for_all();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
+
+	starpu_data_unpartition(handle, starpu_worker_get_memory_node(chosen));
+	starpu_data_unregister(handle);
+
+	cudaSetDevice(devid);
+	cures = cudaMemcpy(foo, foo_gpu, size * sizeof(*foo_gpu), cudaMemcpyDeviceToHost);
+	if (STARPU_UNLIKELY(cures))
+		STARPU_CUDA_REPORT_ERROR(cures);
+
+	starpu_shutdown();
+
+	for (i = 0; i < size; i++) {
+		if (foo[i] != i*2) {
+			fprintf(stderr,"value %d is %d instead of %d\n", i, foo[i], 2*i);
+			return EXIT_FAILURE;
+		}
+	}
+
+        return EXIT_SUCCESS;
+
+enodev:
+	starpu_data_unregister(handle);
+#endif
+#endif
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
+}

+ 23 - 17
tests/datawizard/handle_to_pointer.c

@@ -19,6 +19,7 @@
 
 #include <starpu.h>
 #include <stdlib.h>
+#include "../helper.h"
 
 static void cpu_task(void **buffers, void *args)
 {
@@ -27,7 +28,7 @@ static void cpu_task(void **buffers, void *args)
 	size_t size;
 
 	numbers = (int *) STARPU_VECTOR_GET_PTR(buffers[0]);
-	starpu_unpack_cl_args (args, &size);
+	starpu_codelet_unpack_args (args, &size);
 
 	for(i = 0; i < size; i++)
 	{
@@ -43,7 +44,7 @@ static void cuda_task(void **buffers, void *args)
 	size_t size;
 
 	numbers = (int *)STARPU_VECTOR_GET_PTR(buffers[0]);
-	starpu_unpack_cl_args (args, &size);
+	starpu_codelet_unpack_args (args, &size);
 
 	for(i = 0; i < size; i++)
 	{
@@ -52,41 +53,44 @@ static void cuda_task(void **buffers, void *args)
 }
 #endif
 
-static starpu_codelet cl = {
+static struct starpu_codelet cl =
+{
 	.where = STARPU_CPU | STARPU_CUDA,
-	.cpu_func = cpu_task,
+	.cpu_funcs = {cpu_task, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = cuda_task,
+	.cuda_funcs = {cuda_task, NULL},
 #endif
-	.nbuffers = 1
+	.nbuffers = 1,
+	.modes = {STARPU_W}
 };
 
 int main(int argc, char *argv[])
 {
-	int err;
+	int err, ret;
 	size_t i;
 	int *pointer;
-	starpu_data_handle handle;
+	starpu_data_handle_t handle;
 	static const size_t count = 123;
 
-	starpu_init(NULL);
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
 
 	err = starpu_malloc((void **)&pointer, count * sizeof(int));
-	assert((err == 0) && (pointer != NULL));
+	STARPU_ASSERT((err == 0) && (pointer != NULL));
 
 	starpu_variable_data_register(&handle, 0, (uintptr_t)pointer,
 				      sizeof(int));
-	assert(starpu_handle_to_pointer(handle, 0) == pointer);
+	STARPU_ASSERT(starpu_handle_to_pointer(handle, 0) == pointer);
 	starpu_data_unregister(handle);
 
 	starpu_vector_data_register(&handle, 0, (uintptr_t)pointer,
 				    count, sizeof(int));
-	assert(starpu_handle_to_pointer(handle, 0) == pointer);
+	STARPU_ASSERT(starpu_handle_to_pointer(handle, 0) == pointer);
 	starpu_data_unregister(handle);
 
 	starpu_matrix_data_register(&handle, 0, (uintptr_t)pointer, 0,
 				    count, 1, sizeof(int));
-	assert(starpu_handle_to_pointer(handle, 0) == pointer);
+	STARPU_ASSERT(starpu_handle_to_pointer(handle, 0) == pointer);
 	starpu_data_unregister(handle);
 
 	starpu_free(pointer);
@@ -95,24 +99,26 @@ int main(int argc, char *argv[])
 	/* Lazy allocation.  */
 	starpu_vector_data_register(&handle, -1, 0 /* NULL */,
 				    count, sizeof(int));
-	assert(starpu_handle_to_pointer(handle, 0) == NULL);
+	STARPU_ASSERT(starpu_handle_to_pointer(handle, 0) == NULL);
 
 	/* Pass the handle to a task.  */
-	starpu_insert_task(&cl,
+	err = starpu_insert_task(&cl,
 			   STARPU_W, handle,
 			   STARPU_VALUE, &count, sizeof(count),
 			   0);
+	if (err == -ENODEV)
+		return STARPU_TEST_SKIPPED;
 
 	/* Acquire the handle, forcing a local allocation.  */
 	starpu_data_acquire(handle, STARPU_R);
 
 	/* Make sure we have a local pointer to it.  */
 	pointer = (int *) starpu_handle_to_pointer(handle, 0);
-	assert(pointer != NULL);
+	STARPU_ASSERT(pointer != NULL);
 	for(i = 0; i < count; i++)
 	{
 		int *numbers = (int *)pointer;
-		assert(numbers[i] == i);
+		STARPU_ASSERT(numbers[i] == i);
 	}
 	starpu_data_release(handle);
 

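handle_to_pointer.c also picks up the renamed argument helpers: starpu_unpack_cl_args() becomes starpu_codelet_unpack_args(), the counterpart of starpu_insert_task()'s STARPU_VALUE arguments, which are copied at submission time. A short sketch of that round trip; the kernel, codelet and handle names here are illustrative, not part of the commit:

static void fill_task(void *buffers[], void *cl_arg)
{
	size_t size, i;
	int *numbers = (int *) STARPU_VECTOR_GET_PTR(buffers[0]);

	/* retrieve the value packed with STARPU_VALUE at submission */
	starpu_codelet_unpack_args(cl_arg, &size);

	for (i = 0; i < size; i++)
		numbers[i] = (int) i;
}

static struct starpu_codelet fill_cl =
{
	.where = STARPU_CPU,
	.cpu_funcs = {fill_task, NULL},
	.nbuffers = 1,
	.modes = {STARPU_W}
};

static int fill(starpu_data_handle_t handle, size_t count)
{
	/* the value is copied when the task is submitted */
	return starpu_insert_task(&fill_cl,
				  STARPU_W, handle,
				  STARPU_VALUE, &count, sizeof(count),
				  0);
}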
+ 102 - 0
tests/datawizard/in_place_partition.c

@@ -0,0 +1,102 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <starpu_opencl.h>
+#include "../helper.h"
+#include "scal.h"
+
+int main(int argc, char **argv)
+{
+	unsigned *foo;
+	starpu_data_handle_t handle;
+	int ret;
+	int n, i, size;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+#ifdef STARPU_USE_OPENCL
+	ret = starpu_opencl_load_opencl_from_file("tests/datawizard/scal_opencl.cl", &opencl_program, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
+#endif
+
+	n = starpu_worker_get_count();
+	if (n == 1)
+	{
+		starpu_shutdown();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	size = 10 * n;
+
+	foo = (unsigned *) calloc(size, sizeof(*foo));
+	for (i = 0; i < size; i++)
+		foo[i] = i;
+
+	starpu_vector_data_register(&handle, 0, (uintptr_t)foo, size, sizeof(*foo));
+
+	/* Broadcast the data to force in-place partitioning */
+	for (i = 0; i < n; i++)
+		starpu_data_prefetch_on_node(handle, starpu_worker_get_memory_node(i), 0);
+
+	struct starpu_data_filter f =
+	{
+		.filter_func = starpu_block_filter_func_vector,
+		.nchildren = n,
+	};
+
+	starpu_data_partition(handle, &f);
+
+	for (i = 0; i < f.nchildren; i++) {
+		struct starpu_task *task = starpu_task_create();
+
+		task->handles[0] = starpu_data_get_sub_data(handle, 1, i);
+		task->cl = &scal_codelet;
+		task->execute_on_a_specific_worker = 1;
+		task->workerid = i;
+
+		ret = starpu_task_submit(task);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	ret = starpu_task_wait_for_all();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
+
+	starpu_data_unpartition(handle, 0);
+	starpu_data_unregister(handle);
+	starpu_shutdown();
+
+	ret = EXIT_SUCCESS;
+	for (i = 0; i < size; i++) {
+		if (foo[i] != i*2) {
+			FPRINTF(stderr,"value %d is %d instead of %d\n", i, foo[i], 2*i);
+			ret = EXIT_FAILURE;
+		}
+	}
+
+        return ret;
+
+enodev:
+	starpu_data_unregister(handle);
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
+}

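in_place_partition.c and gpu_register.c above both rely on the vector block filter to split one registered vector into sub-handles that independent tasks can modify in place. A condensed sketch of that partition/unpartition cycle, reusing the scal_codelet declared in tests/datawizard/scal.h (error handling and worker pinning omitted):

static void scale_in_pieces(starpu_data_handle_t handle, unsigned pieces)
{
	struct starpu_data_filter f =
	{
		.filter_func = starpu_block_filter_func_vector,	/* contiguous blocks */
		.nchildren = pieces,
	};
	unsigned i;

	starpu_data_partition(handle, &f);

	for (i = 0; i < pieces; i++)
	{
		struct starpu_task *task = starpu_task_create();
		task->cl = &scal_codelet;
		task->handles[0] = starpu_data_get_sub_data(handle, 1, i);
		starpu_task_submit(task);
	}

	starpu_task_wait_for_all();
	starpu_data_unpartition(handle, 0);	/* gather the children back on node 0 */
}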
+ 78 - 31
tests/datawizard/increment_redux.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -15,7 +15,9 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
+#include <config.h>
 #include <starpu.h>
+#include "../helper.h"
 
 #ifdef STARPU_USE_CUDA
 #include <starpu_cuda.h>
@@ -24,9 +26,10 @@
 #include <starpu_opencl.h>
 #endif
 
+#warning memory leak
 
 static unsigned var = 0;
-static starpu_data_handle handle;
+static starpu_data_handle_t handle;
 
 /*
  *	Reduction methods
@@ -35,6 +38,8 @@ static starpu_data_handle handle;
 #ifdef STARPU_USE_CUDA
 static void redux_cuda_kernel(void *descr[], void *arg)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned *dst = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	unsigned *src = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[1]);
 
@@ -53,6 +58,8 @@ static void redux_cuda_kernel(void *descr[], void *arg)
 
 static void neutral_cuda_kernel(void *descr[], void *arg)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned *dst = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 
 	/* This is a dummy technique of course */
@@ -65,6 +72,8 @@ static void neutral_cuda_kernel(void *descr[], void *arg)
 #ifdef STARPU_USE_OPENCL
 static void redux_opencl_kernel(void *descr[], void *arg)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned h_dst, h_src;
 
 	cl_mem d_dst = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[0]);
@@ -79,18 +88,20 @@ static void redux_opencl_kernel(void *descr[], void *arg)
 
 	h_dst += h_src;
 
-	clEnqueueWriteBuffer(queue, d_dst, CL_TRUE, 0, sizeof(unsigned), (void *)&h_dst, 0, NULL, NULL); 
+	clEnqueueWriteBuffer(queue, d_dst, CL_TRUE, 0, sizeof(unsigned), (void *)&h_dst, 0, NULL, NULL);
 }
 
 static void neutral_opencl_kernel(void *descr[], void *arg)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned h_dst = 0;
 	cl_mem d_dst = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[0]);
 
 	cl_command_queue queue;
 	starpu_opencl_get_current_queue(&queue);
 
-	clEnqueueWriteBuffer(queue, d_dst, CL_TRUE, 0, sizeof(unsigned), (void *)&h_dst, 0, NULL, NULL); 
+	clEnqueueWriteBuffer(queue, d_dst, CL_TRUE, 0, sizeof(unsigned), (void *)&h_dst, 0, NULL, NULL);
 }
 #endif
 
@@ -98,6 +109,8 @@ static void neutral_opencl_kernel(void *descr[], void *arg)
 
 static void redux_cpu_kernel(void *descr[], void *arg)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned *dst = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	unsigned *src = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[1]);
 	*dst = *dst + *src;
@@ -105,31 +118,35 @@ static void redux_cpu_kernel(void *descr[], void *arg)
 
 static void neutral_cpu_kernel(void *descr[], void *arg)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned *dst = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	*dst = 0;
 }
 
-static starpu_codelet redux_cl = {
+static struct starpu_codelet redux_cl =
+{
 	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 #ifdef STARPU_USE_CUDA
-	.cuda_func = redux_cuda_kernel,
+	.cuda_funcs = {redux_cuda_kernel, NULL},
 #endif
 #ifdef STARPU_USE_OPENCL
-	.opencl_func = redux_opencl_kernel,
+	.opencl_funcs = {redux_opencl_kernel, NULL},
 #endif
-	.cpu_func = redux_cpu_kernel,
+	.cpu_funcs = {redux_cpu_kernel, NULL},
 	.nbuffers = 2
 };
 
-static starpu_codelet neutral_cl = {
+static struct starpu_codelet neutral_cl =
+{
 	.where = STARPU_CPU|STARPU_CUDA,
 #ifdef STARPU_USE_CUDA
-	.cuda_func = neutral_cuda_kernel,
+	.cuda_funcs = {neutral_cuda_kernel, NULL},
 #endif
 #ifdef STARPU_USE_OPENCL
-	.opencl_func = neutral_opencl_kernel,
+	.opencl_funcs = {neutral_opencl_kernel, NULL},
 #endif
-	.cpu_func = neutral_cpu_kernel,
+	.cpu_funcs = {neutral_cpu_kernel, NULL},
 	.nbuffers = 1
 };
 
@@ -141,6 +158,8 @@ static starpu_codelet neutral_cl = {
 /* dummy OpenCL implementation */
 static void increment_opencl_kernel(void *descr[], void *cl_arg __attribute__((unused)))
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	cl_mem d_token = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[0]);
 	unsigned h_token;
 
@@ -149,7 +168,7 @@ static void increment_opencl_kernel(void *descr[], void *cl_arg __attribute__((u
 
 	clEnqueueReadBuffer(queue, d_token, CL_TRUE, 0, sizeof(unsigned), (void *)&h_token, 0, NULL, NULL);
 	h_token++;
-	clEnqueueWriteBuffer(queue, d_token, CL_TRUE, 0, sizeof(unsigned), (void *)&h_token, 0, NULL, NULL); 
+	clEnqueueWriteBuffer(queue, d_token, CL_TRUE, 0, sizeof(unsigned), (void *)&h_token, 0, NULL, NULL);
 }
 #endif
 
@@ -157,6 +176,8 @@ static void increment_opencl_kernel(void *descr[], void *cl_arg __attribute__((u
 #ifdef STARPU_USE_CUDA
 static void increment_cuda_kernel(void *descr[], void *arg)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned *tokenptr = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	unsigned host_token;
 
@@ -173,25 +194,33 @@ static void increment_cuda_kernel(void *descr[], void *arg)
 
 static void increment_cpu_kernel(void *descr[], void *arg)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned *tokenptr = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	*tokenptr = *tokenptr + 1;
 }
 
-static starpu_codelet increment_cl = {
+static struct starpu_codelet increment_cl =
+{
 	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 #ifdef STARPU_USE_CUDA
-	.cuda_func = increment_cuda_kernel,
+	.cuda_funcs = {increment_cuda_kernel, NULL},
 #endif
 #ifdef STARPU_USE_OPENCL
-	.opencl_func = increment_opencl_kernel,
+	.opencl_funcs = {increment_opencl_kernel, NULL},
 #endif
-	.cpu_func = increment_cpu_kernel,
-	.nbuffers = 1
+	.cpu_funcs = {increment_cpu_kernel, NULL},
+	.nbuffers = 1,
+	.modes = {STARPU_REDUX}
 };
 
 int main(int argc, char **argv)
 {
-	starpu_init(NULL);
+	int ret;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	starpu_variable_data_register(&handle, 0, (uintptr_t)&var, sizeof(unsigned));
 
@@ -208,26 +237,44 @@ int main(int argc, char **argv)
 		for (t = 0; t < ntasks; t++)
 		{
 			struct starpu_task *task = starpu_task_create();
-	
+
 			task->cl = &increment_cl;
-	
-			task->buffers[0].mode = STARPU_REDUX;
-			task->buffers[0].handle = handle;
-	
-			int ret = starpu_task_submit(task);
-			STARPU_ASSERT(!ret);
+			task->handles[0] = handle;
 
+			int ret = starpu_task_submit(task);
+			if (ret == -ENODEV) goto enodev;
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 		}
 
-		starpu_data_acquire(handle, STARPU_R);
-		STARPU_ASSERT(var == ntasks*(loop + 1));
+		ret = starpu_data_acquire(handle, STARPU_R);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
+		if (var != ntasks * (loop+1))
+		{
+			starpu_data_release(handle);
+			starpu_data_unregister(handle);
+			goto err;
+		}
 		starpu_data_release(handle);
 	}
 
 	starpu_data_unregister(handle);
-	STARPU_ASSERT(var == ntasks*nloops);
-	
+	if (var != ntasks * nloops)
+		goto err;
+
+	starpu_shutdown();
+
+	return EXIT_SUCCESS;
+
+enodev:
+	starpu_data_unregister(handle);
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
+
+err:
 	starpu_shutdown();
+	STARPU_RETURN(EXIT_FAILURE);
 
-	return 0;
 }

+ 255 - 0
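increment_redux.c now marks its single buffer STARPU_REDUX, so concurrent increments go to per-worker copies that StarPU combines with the reduction codelets attached to the handle (redux_cl to accumulate, neutral_cl to initialise); the lazy variant added next shows the starpu_data_set_reduction_methods() call explicitly. A condensed, CPU-only sketch of that setup, with only the wiring illustrative:

static void redux_cpu(void *descr[], void *arg)
{	/* dst += src */
	unsigned *dst = (unsigned *) STARPU_VARIABLE_GET_PTR(descr[0]);
	unsigned *src = (unsigned *) STARPU_VARIABLE_GET_PTR(descr[1]);
	(void)arg;
	*dst += *src;
}

static void init_cpu(void *descr[], void *arg)
{	/* neutral element of the reduction */
	unsigned *dst = (unsigned *) STARPU_VARIABLE_GET_PTR(descr[0]);
	(void)arg;
	*dst = 0;
}

static struct starpu_codelet redux_cl = { .where = STARPU_CPU, .cpu_funcs = {redux_cpu, NULL}, .nbuffers = 2 };
static struct starpu_codelet init_cl  = { .where = STARPU_CPU, .cpu_funcs = {init_cpu, NULL},  .nbuffers = 1 };

/* after registering the variable:
 *   starpu_data_set_reduction_methods(handle, &redux_cl, &init_cl);
 * tasks then access it through a codelet whose .modes = {STARPU_REDUX}. */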
tests/datawizard/increment_redux_lazy.c

@@ -0,0 +1,255 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include "../helper.h"
+
+#ifdef STARPU_USE_CUDA
+#include <starpu_cuda.h>
+#endif
+#ifdef STARPU_USE_OPENCL
+#include <starpu_opencl.h>
+#endif
+
+#warning memory leak
+
+static starpu_data_handle_t handle;
+
+/*
+ *	Reduction methods
+ */
+
+#ifdef STARPU_USE_CUDA
+static void redux_cuda_kernel(void *descr[], void *arg)
+{
+	unsigned *dst = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	unsigned *src = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[1]);
+
+	unsigned host_dst, host_src;
+
+	/* This is a dummy technique of course */
+	cudaMemcpy(&host_src, src, sizeof(unsigned), cudaMemcpyDeviceToHost);
+	cudaMemcpy(&host_dst, dst, sizeof(unsigned), cudaMemcpyDeviceToHost);
+	cudaThreadSynchronize();
+
+	host_dst += host_src;
+
+	cudaMemcpy(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice);
+	cudaThreadSynchronize();
+}
+
+static void neutral_cuda_kernel(void *descr[], void *arg)
+{
+	unsigned *dst = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
+
+	/* This is a dummy technique of course */
+	unsigned host_dst = 0;
+	cudaMemcpy(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice);
+	cudaThreadSynchronize();
+}
+#endif
+
+#ifdef STARPU_USE_OPENCL
+static void redux_opencl_kernel(void *descr[], void *arg)
+{
+	unsigned h_dst, h_src;
+
+	cl_mem d_dst = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[0]);
+	cl_mem d_src = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[1]);
+
+	cl_command_queue queue;
+	starpu_opencl_get_current_queue(&queue);
+
+	/* This is a dummy technique of course */
+	clEnqueueReadBuffer(queue, d_dst, CL_TRUE, 0, sizeof(unsigned), (void *)&h_dst, 0, NULL, NULL);
+	clEnqueueReadBuffer(queue, d_src, CL_TRUE, 0, sizeof(unsigned), (void *)&h_src, 0, NULL, NULL);
+
+	h_dst += h_src;
+
+	clEnqueueWriteBuffer(queue, d_dst, CL_TRUE, 0, sizeof(unsigned), (void *)&h_dst, 0, NULL, NULL);
+}
+
+static void neutral_opencl_kernel(void *descr[], void *arg)
+{
+	unsigned h_dst = 0;
+	cl_mem d_dst = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[0]);
+
+	cl_command_queue queue;
+	starpu_opencl_get_current_queue(&queue);
+
+	clEnqueueWriteBuffer(queue, d_dst, CL_TRUE, 0, sizeof(unsigned), (void *)&h_dst, 0, NULL, NULL);
+}
+#endif
+
+
+
+static void redux_cpu_kernel(void *descr[], void *arg)
+{
+	unsigned *dst = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	unsigned *src = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[1]);
+	*dst = *dst + *src;
+}
+
+static void neutral_cpu_kernel(void *descr[], void *arg)
+{
+	unsigned *dst = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	*dst = 0;
+}
+
+static struct starpu_codelet redux_cl =
+{
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {redux_cuda_kernel, NULL},
+#endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_funcs = {redux_opencl_kernel, NULL},
+#endif
+	.cpu_funcs = {redux_cpu_kernel, NULL},
+	.nbuffers = 2
+};
+
+static struct starpu_codelet neutral_cl =
+{
+	.where = STARPU_CPU|STARPU_CUDA,
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {neutral_cuda_kernel, NULL},
+#endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_funcs = {neutral_opencl_kernel, NULL},
+#endif
+	.cpu_funcs = {neutral_cpu_kernel, NULL},
+	.nbuffers = 1
+};
+
+/*
+ *	Increment codelet
+ */
+
+#ifdef STARPU_USE_OPENCL
+/* dummy OpenCL implementation */
+static void increment_opencl_kernel(void *descr[], void *cl_arg __attribute__((unused)))
+{
+	cl_mem d_token = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[0]);
+	unsigned h_token;
+
+	cl_command_queue queue;
+	starpu_opencl_get_current_queue(&queue);
+
+	clEnqueueReadBuffer(queue, d_token, CL_TRUE, 0, sizeof(unsigned), (void *)&h_token, 0, NULL, NULL);
+	h_token++;
+	clEnqueueWriteBuffer(queue, d_token, CL_TRUE, 0, sizeof(unsigned), (void *)&h_token, 0, NULL, NULL);
+}
+#endif
+
+
+#ifdef STARPU_USE_CUDA
+static void increment_cuda_kernel(void *descr[], void *arg)
+{
+	unsigned *tokenptr = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	unsigned host_token;
+
+	/* This is a dummy technique of course */
+	cudaMemcpy(&host_token, tokenptr, sizeof(unsigned), cudaMemcpyDeviceToHost);
+	cudaThreadSynchronize();
+
+	host_token++;
+
+	cudaMemcpy(tokenptr, &host_token, sizeof(unsigned), cudaMemcpyHostToDevice);
+	cudaThreadSynchronize();
+}
+#endif
+
+static void increment_cpu_kernel(void *descr[], void *arg)
+{
+	unsigned *tokenptr = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	*tokenptr = *tokenptr + 1;
+}
+
+static struct starpu_codelet increment_cl =
+{
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {increment_cuda_kernel, NULL},
+#endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_funcs = {increment_opencl_kernel, NULL},
+#endif
+	.cpu_funcs = {increment_cpu_kernel, NULL},
+	.nbuffers = 1,
+	.modes = {STARPU_REDUX}
+};
+
+int main(int argc, char **argv)
+{
+	int ret;
+	unsigned *var;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	starpu_variable_data_register(&handle, -1, (uintptr_t)NULL, sizeof(unsigned));
+
+	starpu_data_set_reduction_methods(handle, &redux_cl, &neutral_cl);
+
+	unsigned ntasks = 1024;
+	unsigned nloops = 16;
+
+	unsigned loop;
+	unsigned t;
+
+	for (loop = 0; loop < nloops; loop++)
+	{
+		for (t = 0; t < ntasks; t++)
+		{
+			struct starpu_task *task = starpu_task_create();
+
+			task->cl = &increment_cl;
+			task->handles[0] = handle;
+
+			int ret = starpu_task_submit(task);
+			if (ret == -ENODEV) goto enodev;
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+		}
+
+		ret = starpu_data_acquire(handle, STARPU_R);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
+		var = (unsigned*) starpu_variable_get_local_ptr(handle);
+		STARPU_ASSERT(*var == ntasks*(loop + 1));
+		starpu_data_release(handle);
+	}
+
+	ret = starpu_data_acquire(handle, STARPU_R);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
+	var = (unsigned*) starpu_variable_get_local_ptr(handle);
+	STARPU_ASSERT(*var == ntasks*nloops);
+	starpu_data_release(handle);
+	starpu_data_unregister(handle);
+
+	starpu_shutdown();
+
+	STARPU_RETURN(EXIT_SUCCESS);
+
+enodev:
+	starpu_data_unregister(handle);
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_shutdown();
+	STARPU_RETURN(STARPU_TEST_SKIPPED);
+}

+ 102 - 30
tests/datawizard/increment_redux_v2.c

@@ -14,7 +14,9 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
+#include <config.h>
 #include <starpu.h>
+#include "../helper.h"
 
 #ifdef STARPU_USE_CUDA
 #include <starpu_cuda.h>
@@ -23,9 +25,10 @@
 #include <starpu_opencl.h>
 #endif
 
+#warning memory leak
 
 static unsigned var = 0;
-static starpu_data_handle handle;
+static starpu_data_handle_t handle;
 
 /*
  *	Reduction methods
@@ -34,6 +37,8 @@ static starpu_data_handle handle;
 #ifdef STARPU_USE_CUDA
 static void redux_cuda_kernel(void *descr[], void *arg)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned *dst = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	unsigned *src = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[1]);
 
@@ -52,6 +57,8 @@ static void redux_cuda_kernel(void *descr[], void *arg)
 
 static void neutral_cuda_kernel(void *descr[], void *arg)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned *dst = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 
 	/* This is a dummy technique of course */
@@ -64,6 +71,8 @@ static void neutral_cuda_kernel(void *descr[], void *arg)
 #ifdef STARPU_USE_OPENCL
 static void redux_opencl_kernel(void *descr[], void *arg)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned h_dst, h_src;
 
 	cl_mem d_dst = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[0]);
@@ -78,18 +87,20 @@ static void redux_opencl_kernel(void *descr[], void *arg)
 
 	h_dst += h_src;
 
-	clEnqueueWriteBuffer(queue, d_dst, CL_TRUE, 0, sizeof(unsigned), (void *)&h_dst, 0, NULL, NULL); 
+	clEnqueueWriteBuffer(queue, d_dst, CL_TRUE, 0, sizeof(unsigned), (void *)&h_dst, 0, NULL, NULL);
 }
 
 static void neutral_opencl_kernel(void *descr[], void *arg)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned h_dst = 0;
 	cl_mem d_dst = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[0]);
 
 	cl_command_queue queue;
 	starpu_opencl_get_current_queue(&queue);
 
-	clEnqueueWriteBuffer(queue, d_dst, CL_TRUE, 0, sizeof(unsigned), (void *)&h_dst, 0, NULL, NULL); 
+	clEnqueueWriteBuffer(queue, d_dst, CL_TRUE, 0, sizeof(unsigned), (void *)&h_dst, 0, NULL, NULL);
 }
 #endif
 
@@ -97,6 +108,8 @@ static void neutral_opencl_kernel(void *descr[], void *arg)
 
 static void redux_cpu_kernel(void *descr[], void *arg)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned *dst = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	unsigned *src = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[1]);
 	*dst = *dst + *src;
@@ -104,31 +117,35 @@ static void redux_cpu_kernel(void *descr[], void *arg)
 
 static void neutral_cpu_kernel(void *descr[], void *arg)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned *dst = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	*dst = 0;
 }
 
-static starpu_codelet redux_cl = {
+static struct starpu_codelet redux_cl =
+{
 	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 #ifdef STARPU_USE_CUDA
-	.cuda_func = redux_cuda_kernel,
+	.cuda_funcs = {redux_cuda_kernel, NULL},
 #endif
 #ifdef STARPU_USE_OPENCL
-	.opencl_func = redux_opencl_kernel,
+	.opencl_funcs = {redux_opencl_kernel, NULL},
 #endif
-	.cpu_func = redux_cpu_kernel,
+	.cpu_funcs = {redux_cpu_kernel, NULL},
 	.nbuffers = 2
 };
 
-static starpu_codelet neutral_cl = {
+static struct starpu_codelet neutral_cl =
+{
 	.where = STARPU_CPU|STARPU_CUDA,
 #ifdef STARPU_USE_CUDA
-	.cuda_func = neutral_cuda_kernel,
+	.cuda_funcs = {neutral_cuda_kernel, NULL},
 #endif
 #ifdef STARPU_USE_OPENCL
-	.opencl_func = neutral_opencl_kernel,
+	.opencl_funcs = {neutral_opencl_kernel, NULL},
 #endif
-	.cpu_func = neutral_cpu_kernel,
+	.cpu_funcs = {neutral_cpu_kernel, NULL},
 	.nbuffers = 1
 };
 
@@ -140,6 +157,8 @@ static starpu_codelet neutral_cl = {
 /* dummy OpenCL implementation */
 static void increment_opencl_kernel(void *descr[], void *cl_arg __attribute__((unused)))
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	cl_mem d_token = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[0]);
 	unsigned h_token;
 
@@ -148,7 +167,7 @@ static void increment_opencl_kernel(void *descr[], void *cl_arg __attribute__((u
 
 	clEnqueueReadBuffer(queue, d_token, CL_TRUE, 0, sizeof(unsigned), (void *)&h_token, 0, NULL, NULL);
 	h_token++;
-	clEnqueueWriteBuffer(queue, d_token, CL_TRUE, 0, sizeof(unsigned), (void *)&h_token, 0, NULL, NULL); 
+	clEnqueueWriteBuffer(queue, d_token, CL_TRUE, 0, sizeof(unsigned), (void *)&h_token, 0, NULL, NULL);
 }
 #endif
 
@@ -156,6 +175,8 @@ static void increment_opencl_kernel(void *descr[], void *cl_arg __attribute__((u
 #ifdef STARPU_USE_CUDA
 static void increment_cuda_kernel(void *descr[], void *arg)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned *tokenptr = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	unsigned host_token;
 
@@ -172,25 +193,47 @@ static void increment_cuda_kernel(void *descr[], void *arg)
 
 static void increment_cpu_kernel(void *descr[], void *arg)
 {
+	STARPU_SKIP_IF_VALGRIND;
+
 	unsigned *tokenptr = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	*tokenptr = *tokenptr + 1;
 }
 
-static starpu_codelet increment_cl = {
+static struct starpu_codelet increment_cl =
+{
 	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 #ifdef STARPU_USE_CUDA
-	.cuda_func = increment_cuda_kernel,
+	.cuda_funcs = {increment_cuda_kernel, NULL},
 #endif
 #ifdef STARPU_USE_OPENCL
-	.opencl_func = increment_opencl_kernel,
+	.opencl_funcs = {increment_opencl_kernel, NULL},
 #endif
-	.cpu_func = increment_cpu_kernel,
-	.nbuffers = 1
+	.cpu_funcs = {increment_cpu_kernel, NULL},
+	.nbuffers = 1,
+	.modes = {STARPU_RW}
+};
+
+static struct starpu_codelet increment_cl_redux =
+{
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {increment_cuda_kernel, NULL},
+#endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_funcs = {increment_opencl_kernel, NULL},
+#endif
+	.cpu_funcs = {increment_cpu_kernel, NULL},
+	.nbuffers = 1,
+	.modes = {STARPU_REDUX}
 };
 
 int main(int argc, char **argv)
 {
-	starpu_init(NULL);
+	int ret;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	starpu_variable_data_register(&handle, 0, (uintptr_t)&var, sizeof(unsigned));
 
@@ -207,26 +250,55 @@ int main(int argc, char **argv)
 		for (t = 0; t < ntasks; t++)
 		{
 			struct starpu_task *task = starpu_task_create();
-	
-			task->cl = &increment_cl;
-	
-			task->buffers[0].mode = (t % 10 == 0)?STARPU_RW:STARPU_REDUX;
-			task->buffers[0].handle = handle;
-	
-			int ret = starpu_task_submit(task);
-			STARPU_ASSERT(!ret);
 
+			if (t % 10 == 0)
+			{
+				task->cl = &increment_cl;
+			}
+			else
+			{
+				task->cl = &increment_cl_redux;
+			}
+			task->handles[0] = handle;
+
+			int ret = starpu_task_submit(task);
+			if (ret == -ENODEV) goto enodev;
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 		}
 
-		starpu_data_acquire(handle, STARPU_R);
-		STARPU_ASSERT(var == ntasks*(loop + 1));
+		ret = starpu_data_acquire(handle, STARPU_R);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
+		if (var != ntasks *(loop+1))
+		{
+			_STARPU_DEBUG("%d != %d\n", var, ntasks*(loop+1));
+			starpu_data_release(handle);
+			starpu_data_unregister(handle);
+			goto err;
+		}
 		starpu_data_release(handle);
 	}
 
 	starpu_data_unregister(handle);
-	STARPU_ASSERT(var == ntasks*nloops);
+	if (var != ntasks *nloops)
+	{
+		_STARPU_DEBUG("%d != %d\n", var, ntasks*nloops);
+		goto err;
+	}
 	
+
 	starpu_shutdown();
 
-	return 0;
+	return EXIT_SUCCESS;
+
+enodev:
+	starpu_data_unregister(handle);
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_shutdown();
+	return STARPU_TEST_SKIPPED;
+
+err:
+	starpu_shutdown();
+	STARPU_RETURN(EXIT_FAILURE);
 }

+ 70 - 0
tests/datawizard/interfaces/bcsr/bcsr_cuda.cu

@@ -0,0 +1,70 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <starpu.h>
+#include <starpu_cuda.h>
+#include "../test_interfaces.h"
+
+extern struct test_config bcsr_config;
+
+__global__ void bcsr_cuda(int *nzval, uint32_t nnz, int *err, int factor)
+{
+        unsigned i =  blockIdx.x*blockDim.x + threadIdx.x;
+
+	if (i >= nnz)
+		return;
+
+	if (nzval[i] != i*factor)
+		*err = 1;
+	else
+		nzval[i] = -nzval[i];
+}
+
+extern "C" void test_bcsr_cuda_func(void *buffers[], void *args)
+{
+	int factor;
+	int *ret;
+	int *val;
+	cudaError_t error;
+	uint32_t nnz = STARPU_BCSR_GET_NNZ(buffers[0]);
+	unsigned threads_per_block = 64;
+	unsigned nblocks = (nnz + threads_per_block-1) / threads_per_block;
+
+	factor = *(int *) args;
+	//val = (int *) starpu_bcsr_get_local_nzval((starpu_data_handle_t)buffers[0]);
+	val = (int *) STARPU_BCSR_GET_NZVAL(buffers[0]);
+
+	error = cudaMalloc(&ret, sizeof(int));
+	if (error != cudaSuccess)
+		STARPU_CUDA_REPORT_ERROR(error);
+
+	error = cudaMemcpy(ret,
+			   &bcsr_config.copy_failed,
+			   sizeof(int),
+			   cudaMemcpyHostToDevice);
+	if (error != cudaSuccess)
+		STARPU_CUDA_REPORT_ERROR(error);
+
+        bcsr_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>
+		(val, nnz, ret, factor);
+
+	error = cudaMemcpy(&bcsr_config.copy_failed,
+			   ret,
+			   sizeof(int),
+			   cudaMemcpyDeviceToHost);
+	
+	cudaFree(ret);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}

+ 198 - 0
tests/datawizard/interfaces/bcsr/bcsr_interface.c

@@ -0,0 +1,198 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <config.h>
+#include <starpu.h>
+#include "../test_interfaces.h"
+#include "../../../helper.h"
+
+/*
+ * XXX : These values should not be changed. If you really understand all that
+ * BCSR stuff, feel free to write a better example :)
+ */
+
+/* Size of the matrix */
+#define WIDTH          4
+#define HEIGHT         4
+#define SIZE           (WIDTH * HEIGHT)
+
+/* Size of the blocks */
+#define R              2
+#define C              2
+#define BLOCK_SIZE     (R*C)
+
+/* The matrix is simply 0 1 2... There are SIZE-1 non zero values... */
+#define NNZ            (SIZE-1)
+
+/* ... and SIZE/BLOCK_SIZE non zero blocks */
+#define NNZ_BLOCKS     (SIZE/BLOCK_SIZE)
+
+
+#ifdef STARPU_USE_CPU
+static void test_bcsr_cpu_func(void *buffers[], void *args);
+#endif /* !STARPU_USE_CPU */
+#ifdef STARPU_USE_CUDA
+extern void test_bcsr_cuda_func(void *buffers[], void *_args);
+#endif
+#ifdef STARPU_USE_OPENCL
+extern void test_bcsr_opencl_func(void *buffers[], void *args);
+#endif
+
+
+static int nzval[NNZ];
+static int nzval2[NNZ];
+
+static uint32_t colind[NNZ_BLOCKS];
+static uint32_t colind2[NNZ_BLOCKS];
+
+static uint32_t rowptr[1+WIDTH/R];
+static uint32_t rowptr2[1+WIDTH/R];
+
+static starpu_data_handle_t bcsr_handle;
+static starpu_data_handle_t bcsr2_handle;
+
+
+struct test_config bcsr_config =
+{
+#ifdef STARPU_USE_CPU
+	.cpu_func      = test_bcsr_cpu_func,
+#endif /* !STARPU_USE_CPU */
+#ifdef STARPU_USE_CUDA
+	.cuda_func     = test_bcsr_cuda_func,
+#endif /* !STARPU_USE_CUDA */
+#ifdef STARPU_USE_OPENCL
+	.opencl_func   = test_bcsr_opencl_func,
+#endif /* !STARPU_USE_OPENCL */
+	.handle        = &bcsr_handle,
+	.dummy_handle  = &bcsr2_handle,
+	.copy_failed   = 0,
+	.name          = "bcsr_interface"
+};
+
+static void
+register_data(void)
+{
+	int i;
+
+	for (i = 0; i < NNZ; i++)
+		nzval[i] = i;
+
+	colind[0] = 0;
+	colind[1] = 2;
+	colind[2] = 0;
+	colind[3] = 2;
+
+	rowptr[0] = 0;
+	rowptr[1] = 2;
+	rowptr[2] = 4;
+	
+	starpu_bcsr_data_register(&bcsr_handle,
+				  0,
+				  NNZ_BLOCKS,
+				  HEIGHT/R,
+				  (uintptr_t) nzval,
+				  colind,
+				  rowptr,
+				  0,
+				  R,
+				  C,
+				  sizeof(nzval[0]));
+
+	starpu_bcsr_data_register(&bcsr2_handle,
+				  0,
+				  NNZ_BLOCKS,
+				  HEIGHT/R,
+				  (uintptr_t) nzval2,
+				  colind2,
+				  rowptr2,
+				  0,
+				  R,
+				  C,
+				  sizeof(nzval2[0]));
+}
+
+static void
+unregister_data(void)
+{
+	starpu_data_unregister(bcsr_handle);
+	starpu_data_unregister(bcsr2_handle);
+}
+
+static void
+test_bcsr_cpu_func(void *buffers[], void *args)
+{
+	STARPU_SKIP_IF_VALGRIND;
+
+	int *val;
+	int factor;
+	int i;
+
+	uint32_t nnz = STARPU_BCSR_GET_NNZ(buffers[0]);
+	val = (int *) STARPU_BCSR_GET_NZVAL(buffers[0]);
+	factor = *(int *) args;
+
+	for (i = 0; i < nnz; i++)
+	{
+		if (val[i] != i * factor)
+		{
+			bcsr_config.copy_failed = 1;
+			return;
+		}
+		val[i] *= -1;
+	}
+
+	/* Check colind */
+	uint32_t *col = STARPU_BCSR_GET_COLIND(buffers[0]);
+	for (i = 0; i < NNZ_BLOCKS; i++)
+		if (col[i] != colind[i])
+			bcsr_config.copy_failed = 1;
+
+	/* Check rowptr */
+	uint32_t *row = STARPU_BCSR_GET_ROWPTR(buffers[0]);
+	for (i = 0; i < 1 + WIDTH/R; i++)
+		if (row[i] != rowptr[i])
+			bcsr_config.copy_failed = 1;
+}
+
+int
+main(void)
+{
+	data_interface_test_summary *summary;
+	struct starpu_conf conf =
+	{
+		.ncpus   = -1,
+		.ncuda   = 2,
+		.nopencl = 1
+	};
+
+	if (starpu_init(&conf) == -ENODEV)
+		return STARPU_TEST_SKIPPED;
+
+	register_data();
+
+	summary = run_tests(&bcsr_config);
+	if (!summary)
+		exit(EXIT_FAILURE);
+
+	unregister_data();
+
+	starpu_shutdown();
+
+	data_interface_test_summary_print(stderr, summary);
+
+	return data_interface_test_summary_success(summary);
+}
+

+ 130 - 0
tests/datawizard/interfaces/bcsr/bcsr_opencl.c

@@ -0,0 +1,130 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <config.h>
+#include <starpu.h>
+#include <starpu_opencl.h>
+#include "../test_interfaces.h"
+
+#define KERNEL_LOCATION "tests/datawizard/interfaces/bcsr/bcsr_opencl_kernel.cl"
+extern struct test_config bcsr_config;
+static struct starpu_opencl_program opencl_program;
+
+void
+test_bcsr_opencl_func(void *buffers[], void *args)
+{
+	STARPU_SKIP_IF_VALGRIND;
+
+	int id, devid, ret;
+	int factor = *(int *) args;
+
+        cl_int             err;
+	cl_kernel          kernel;
+	cl_command_queue   queue;
+	cl_event           event;
+
+	ret = starpu_opencl_load_opencl_from_file(KERNEL_LOCATION, &opencl_program, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
+
+	uint32_t nnz = STARPU_BCSR_GET_NNZ(buffers[0]);
+	cl_mem nzval = (cl_mem)STARPU_BCSR_GET_NZVAL(buffers[0]);
+
+	cl_context context;
+	id = starpu_worker_get_id();
+	devid = starpu_worker_get_devid(id);
+	starpu_opencl_get_context(devid, &context);
+
+	cl_mem fail = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+		sizeof(int), &bcsr_config.copy_failed, &err);
+
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+
+	err = starpu_opencl_load_kernel(&kernel,
+					&queue,
+					&opencl_program,
+					"test_bcsr_opencl",
+					devid);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	int nargs;
+	nargs = starpu_opencl_set_kernel_args(&err, &kernel,
+					      sizeof(nzval), &nzval,
+					      sizeof(nnz), &nnz,
+					      sizeof(fail), &fail,
+					      sizeof(factor), &factor,
+					      0);
+
+	if (nargs != 4)
+	{
+		fprintf(stderr, "Failed to set argument #%d\n", nargs);
+		STARPU_OPENCL_REPORT_ERROR(err);
+	}
+			
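+	/* Pick a work-group size: query the kernel's maximum work-group
+	 * size for this device and clamp it to the global size so that
+	 * small problems can still be enqueued. */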
+	{
+		size_t global = nnz;
+		size_t local;
+                size_t s;
+                cl_device_id device;
+
+                starpu_opencl_get_device(devid, &device);
+
+                err = clGetKernelWorkGroupInfo (kernel,
+						device,
+						CL_KERNEL_WORK_GROUP_SIZE,
+						sizeof(local),
+						&local,
+						&s);
+                if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+
+                if (local > global)
+			local = global;
+
+		err = clEnqueueNDRangeKernel(queue,
+					kernel,
+					1,
+					NULL,
+					&global,
+					&local,
+					0,
+					NULL,
+					&event);
+
+		if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+	}
+
+	err = clEnqueueReadBuffer(queue,
+				  fail,
+				  CL_TRUE,
+				  0, 
+				  sizeof(int),
+				  &bcsr_config.copy_failed,
+				  0,
+				  NULL,
+				  NULL);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	clFinish(queue);
+	starpu_opencl_collect_stats(event);
+	clReleaseEvent(event);
+
+	starpu_opencl_release_kernel(kernel);
+        starpu_opencl_unload_opencl(&opencl_program);
+}

+ 29 - 0
tests/datawizard/interfaces/bcsr/bcsr_opencl_kernel.cl

@@ -0,0 +1,29 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+__kernel void test_bcsr_opencl(__global int *val,
+			       unsigned int nx,
+			       __global int *err,
+			       int factor)
+{
+        const int i = get_global_id(0);
+        if (i >=  nx)
+		return;
+
+	if (val[i] != i * factor)
+		*err = 1;
+	else
+		val[i] = - val[i];
+}

+ 80 - 0
tests/datawizard/interfaces/block/block_cuda.cu

@@ -0,0 +1,80 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <starpu.h>
+#include <starpu_cuda.h>
+#include "../test_interfaces.h"
+
+extern struct test_config block_config;
+
+static __global__ void block_cuda(int *block,
+				  int nx, int ny, int nz,
+				  unsigned ldy, unsigned ldz,
+				  float factor, int *err)
+{
+        int i, j, k;
+	int val = 0;
+
+        for (k = 0; k < nz ;k++)
+	{
+                for (j = 0; j < ny ;j++)
+		{
+                        for(i = 0; i < nx ;i++)
+			{
+				if (block[(k*ldz)+(j*ldy)+i] != factor * val)
+				{
+					*err = 1;
+					return;
+				}
+				else
+				{
+					block[(k*ldz)+(j*ldy)+i] *= -1;
+					val++;
+				}
+			}
+                }
+        }
+}
+
+extern "C" void test_block_cuda_func(void *buffers[], void *args)
+{
+	cudaError_t error;
+	int *ret;
+
+	error = cudaMalloc(&ret, sizeof(int));
+	if (error != cudaSuccess)
+		STARPU_CUDA_REPORT_ERROR(error);
+
+	error = cudaMemcpy(ret, &block_config.copy_failed, sizeof(int), cudaMemcpyHostToDevice);
+	if (error != cudaSuccess)
+		STARPU_CUDA_REPORT_ERROR(error);
+
+	int nx = STARPU_BLOCK_GET_NX(buffers[0]);
+	int ny = STARPU_BLOCK_GET_NY(buffers[0]);
+	int nz = STARPU_BLOCK_GET_NZ(buffers[0]);
+        unsigned ldy = STARPU_BLOCK_GET_LDY(buffers[0]);
+        unsigned ldz = STARPU_BLOCK_GET_LDZ(buffers[0]);
+	int *block = (int *) STARPU_BLOCK_GET_PTR(buffers[0]);
+	int factor = *(int*) args;
+
+        block_cuda<<<1,1, 0, starpu_cuda_get_local_stream()>>>
+		(block, nx, ny, nz, ldy, ldz, factor, ret);
+	error = cudaMemcpy(&block_config.copy_failed, ret, sizeof(int), cudaMemcpyDeviceToHost);
+	if (error != cudaSuccess)
+		STARPU_CUDA_REPORT_ERROR(error);
+
+	cudaFree(ret);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}

+ 163 - 0
tests/datawizard/interfaces/block/block_interface.c

@@ -0,0 +1,163 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <config.h>
+#include <starpu.h>
+#include "../test_interfaces.h"
+#include "../../../helper.h"
+
+#define NX 16
+#define NY NX
+#define NZ NX
+
+/* Prototypes */
+static void register_data(void);
+static void unregister_data(void);
+static void test_block_cpu_func(void *buffers[], void *args);
+#ifdef STARPU_USE_CUDA
+extern void test_block_cuda_func(void *buffers[], void *_args);
+#endif
+#ifdef STARPU_USE_OPENCL
+extern void test_block_opencl_func(void *buffers[], void *args);
+#endif
+
+
+static starpu_data_handle_t block_handle;
+static starpu_data_handle_t block2_handle;
+
+struct test_config block_config =
+{
+	.cpu_func      = test_block_cpu_func,
+#ifdef STARPU_USE_CUDA
+	.cuda_func     = test_block_cuda_func,
+#endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_func   = test_block_opencl_func,
+#endif
+	.handle        = &block_handle,
+	.dummy_handle  = &block2_handle,
+	.copy_failed   = 0,
+	.name          = "block_interface"
+};
+
+static int block[NX*NY*NZ];
+static int block2[NX*NY*NZ];
+
+static void
+register_data(void)
+{
+	/* Initializing data */
+	int val = 0;
+	int i, j, k;
+	for (k = 0; k < NZ; k++)
+		for (j = 0; j < NY; j++)
+			for (i = 0; i < NX; i++)
+                                block[(k*NX*NY)+(j*NX)+i] = val++;
+
+	/* Registering data */
+	starpu_block_data_register(&block_handle,
+                                    0,
+                                    (uintptr_t)block,
+				    NX,
+				    NX * NY,
+				    NX,
+				    NY,
+				    NZ,
+				    sizeof(block[0]));
+	starpu_block_data_register(&block2_handle,
+                                    0,
+                                    (uintptr_t)block2,
+				    NX,
+				    NX * NY,
+				    NX,
+				    NY,
+				    NZ,
+				    sizeof(block2[0]));
+}
+
+static void
+unregister_data(void)
+{
+	starpu_data_unregister(block_handle);
+	starpu_data_unregister(block2_handle);
+}
+
+static void test_block_cpu_func(void *buffers[], void *args)
+{
+	STARPU_SKIP_IF_VALGRIND;
+
+	int factor = *(int*)args;
+	int nx = STARPU_BLOCK_GET_NX(buffers[0]);
+	int ny = STARPU_BLOCK_GET_NY(buffers[0]);
+	int nz = STARPU_BLOCK_GET_NZ(buffers[0]);
+        unsigned ldy = STARPU_BLOCK_GET_LDY(buffers[0]);
+        unsigned ldz = STARPU_BLOCK_GET_LDZ(buffers[0]);
+	int *block = (int *) STARPU_BLOCK_GET_PTR(buffers[0]);
+	unsigned int i, j, k;
+	int val = 0;
+	block_config.copy_failed = 0;
+	for (k = 0; k < nz; k++)
+	{
+		for (j = 0; j < ny; j++)
+		{
+			for (i = 0; i < nx; i++)
+			{
+                                if (block[(k*ldz)+(j*ldy)+i] != factor * val)
+				{
+					block_config.copy_failed = 1;
+					return;
+				}
+				else
+				{
+					block[(k*ldz)+(j*ldy)+i] *= -1;
+					val++;
+				}
+			}
+		}
+	}
+}
+
+int
+main(void)
+{
+	data_interface_test_summary *summary;
+	struct starpu_conf conf =
+	{
+		.ncpus   = -1,
+		.ncuda   = 2,
+		.nopencl = 1
+	};
+
+	if (starpu_init(&conf) == -ENODEV)
+		goto enodev;
+
+	register_data();
+
+	summary = run_tests(&block_config);
+	if (!summary)
+		exit(EXIT_FAILURE);
+
+	unregister_data();
+
+	starpu_shutdown();
+
+	data_interface_test_summary_print(stderr, summary);
+
+	return data_interface_test_summary_success(summary);
+
+enodev:
+	return STARPU_TEST_SKIPPED;
+}
+

+ 120 - 0
tests/datawizard/interfaces/block/block_opencl.c

@@ -0,0 +1,120 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <config.h>
+#include <starpu.h>
+#include <starpu_opencl.h>
+#include "../test_interfaces.h"
+
+#define KERNEL_LOCATION "tests/datawizard/interfaces/block/block_opencl_kernel.cl"
+extern struct test_config block_config;
+static struct starpu_opencl_program opencl_program;
+
+void
+test_block_opencl_func(void *buffers[], void *args)
+{
+	STARPU_SKIP_IF_VALGRIND;
+
+	int id, devid, ret;
+	int factor = *(int *) args;
+
+        cl_int             err;
+	cl_kernel          kernel;
+	cl_command_queue   queue;
+	cl_event           event;
+
+	ret = starpu_opencl_load_opencl_from_file(KERNEL_LOCATION, &opencl_program, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
+
+	int nx = STARPU_BLOCK_GET_NX(buffers[0]);
+	int ny = STARPU_BLOCK_GET_NY(buffers[0]);
+	int nz = STARPU_BLOCK_GET_NZ(buffers[0]);
+        unsigned ldy = STARPU_BLOCK_GET_LDY(buffers[0]);
+        unsigned ldz = STARPU_BLOCK_GET_LDZ(buffers[0]);
+	cl_mem block = (cl_mem) STARPU_BLOCK_GET_DEV_HANDLE(buffers[0]);
+
+	cl_context context;
+	id = starpu_worker_get_id();
+	devid = starpu_worker_get_devid(id);
+	starpu_opencl_get_context(devid, &context);
+
+	cl_mem fail = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+		sizeof(int), &block_config.copy_failed, &err);
+
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+
+	err = starpu_opencl_load_kernel(&kernel,
+					&queue,
+					&opencl_program,
+					"block_opencl",
+					devid);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	int nargs;
+	nargs = starpu_opencl_set_kernel_args(&err, &kernel,
+					      sizeof(block), &block,
+					      sizeof(nx), &nx,
+					      sizeof(ny), &ny,
+					      sizeof(nz), &nz,
+					      sizeof(ldy), &ldy,
+					      sizeof(ldz), &ldz,
+					      sizeof(factor), &factor,
+					      sizeof(fail), &fail,
+					      0);
+
+	if (nargs != 8)
+	{
+		fprintf(stderr, "Failed to set argument #%d\n", nargs);
+		STARPU_OPENCL_REPORT_ERROR(err);
+	}
+			
+	{
+		size_t global = nx * ny * nz;
+		err = clEnqueueNDRangeKernel(queue,
+					     kernel,
+					     1,
+					     NULL,
+					     &global,
+					     NULL,
+					     0,
+					     NULL,
+					     &event);
+
+		if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+	}
+
+	err = clEnqueueReadBuffer(queue,
+				  fail,
+				  CL_TRUE,
+				  0, 
+				  sizeof(int),
+				  &block_config.copy_failed,
+				  0,
+				  NULL,
+				  NULL);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	clFinish(queue);
+	starpu_opencl_collect_stats(event);
+	clReleaseEvent(event);
+
+	starpu_opencl_release_kernel(kernel);
+        starpu_opencl_unload_opencl(&opencl_program);
+}

+ 46 - 0
tests/datawizard/interfaces/block/block_opencl_kernel.cl

@@ -0,0 +1,46 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+__kernel void block_opencl(__global int *block,
+			   int nx, int ny, int nz,
+			   int ldy, int ldz,
+			   int factor, __global int *err)
+{
+        const int id = get_global_id(0);
+	if (id > 0)
+		return;
+
+	unsigned int i, j, k;
+	int val = 0;
+	for (k = 0; k < nz; k++)
+	{
+		for (j = 0; j < ny; j++)
+		{
+			for (i = 0; i < nx; i++)
+			{
+                                if (block[(k*ldz)+(j*ldy)+i] != factor * val)
+				{
+					*err = 1;
+					return;
+				}
+				else
+				{
+					block[(k*ldz)+(j*ldy)+i] *= -1;
+					val++;
+				}
+			}
+		}
+	}
+}

+ 106 - 0
tests/datawizard/interfaces/copy_interfaces.c

@@ -0,0 +1,106 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include "../../helper.h"
+#include <datawizard/coherency.h>
+
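+/* Register a second handle describing the same data as 'handle' with
+ * starpu_data_register_same(), optionally display it, then compare the two
+ * interfaces with the interface's compare() method.  Returns 0 when they
+ * match, 1 otherwise. */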
+static int check_copy(starpu_data_handle_t handle, char *header)
+{
+	void *old_interface, *new_interface;
+	starpu_data_handle_t new_handle;
+	int ret=0;
+
+	starpu_data_register_same(&new_handle, handle);
+
+	if (!getenv("STARPU_SSILENT") && new_handle->ops->display)
+	{
+		fprintf(stderr, "%s: ", header);
+		new_handle->ops->display(new_handle, stderr);
+		fprintf(stderr, "\n");
+	}
+
+	old_interface = starpu_data_get_interface_on_node(handle, 0);
+	new_interface = starpu_data_get_interface_on_node(new_handle, 0);
+
+	if (new_handle->ops->compare(old_interface, new_interface) == 0)
+	{
+		FPRINTF(stderr, "Error when copying %s data\n", header);
+		assert(0);
+		ret = 1;
+	}
+	starpu_data_unregister(handle);
+	starpu_data_unregister(new_handle);
+	return ret;
+}
+
+int main(int argc, char **argv)
+{
+	int ret;
+	starpu_data_handle_t handle;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	{
+		int x=42;
+		starpu_variable_data_register(&handle, 0, (uintptr_t)&x, sizeof(x));
+		ret = check_copy(handle, "variable");
+	}
+
+	if (ret == 0)
+	{
+		int xx[] = {12, 23, 45};
+		starpu_vector_data_register(&handle, 0, (uintptr_t)xx, 3, sizeof(xx[0]));
+		ret = check_copy(handle, "vector");
+	}
+
+	if (ret == 0)
+	{
+		int NX=3;
+		int NY=2;
+		int matrix[NX][NY];
+		starpu_matrix_data_register(&handle, 0, (uintptr_t)matrix, NX, NX, NY, sizeof(matrix[0][0]));
+		ret = check_copy(handle, "matrix");
+	}
+
+	if (ret == 0)
+	{
+		int NX=3;
+		int NY=2;
+		int NZ=4;
+		int block[NX*NY*NZ];
+		starpu_block_data_register(&handle, 0, (uintptr_t)block, NX, NX*NY, NX, NY, NZ, sizeof(block[0]));
+		ret = check_copy(handle, "block");
+	}
+
+	if (ret == 0)
+	{
+		uint32_t nnz = 2;
+		unsigned nrow = 5;
+		float nzvalA[20];
+		uint32_t colind[1];
+		uint32_t rowptr[2];
+		starpu_csr_data_register(&handle, 0, nnz, nrow, (uintptr_t)nzvalA, colind, rowptr, 0, sizeof(float));
+		ret = check_copy(handle, "csr");
+	}
+
+	starpu_shutdown();
+	return ret;
+}
+
+

+ 68 - 0
tests/datawizard/interfaces/csr/csr_cuda.cu

@@ -0,0 +1,68 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <starpu.h>
+#include <starpu_cuda.h>
+#include "../test_interfaces.h"
+
+extern struct test_config csr_config;
+
+__global__ void csr_cuda(int *nzval, uint32_t nnz, int *err, int factor)
+{
+        unsigned i =  blockIdx.x*blockDim.x + threadIdx.x;
+
+	if (i >= nnz)
+		return;
+
+	if (nzval[i] != (i+1)*factor)
+		*err = 1;
+	else
+		nzval[i] = -nzval[i];
+}
+
+extern "C" void test_csr_cuda_func(void *buffers[], void *args)
+{
+	int factor;
+	int *ret;
+	int *val;
+	cudaError_t error;
+	uint32_t nnz = STARPU_CSR_GET_NNZ(buffers[0]);
+	unsigned threads_per_block = 64;
+	unsigned nblocks = (nnz + threads_per_block-1) / threads_per_block;
+
+	factor = *(int *) args;
+	val = (int *) STARPU_CSR_GET_NZVAL(buffers[0]);
+
+	error = cudaMalloc(&ret, sizeof(int));
+	if (error != cudaSuccess)
+		STARPU_CUDA_REPORT_ERROR(error);
+
+	error = cudaMemcpy(ret,
+			   &csr_config.copy_failed,
+			   sizeof(int),
+			   cudaMemcpyHostToDevice);
+	if (error != cudaSuccess)
+		STARPU_CUDA_REPORT_ERROR(error);
+
+        csr_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>> (val, nnz, ret, factor);
+
+	error = cudaMemcpy(&csr_config.copy_failed,
+			   ret,
+			   sizeof(int),
+			   cudaMemcpyDeviceToHost);
+	
+	cudaFree(ret);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}

+ 170 - 0
tests/datawizard/interfaces/csr/csr_interface.c

@@ -0,0 +1,170 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <config.h>
+#include <starpu.h>
+#include "../test_interfaces.h"
+#include "../../../helper.h"
+
+#define WIDTH  8
+#define HEIGHT 4
+#define SIZE   (WIDTH * HEIGHT)
+#define NNZ    (SIZE-1)
+
+#ifdef STARPU_USE_CPU
+static void test_csr_cpu_func(void *buffers[], void *args);
+#endif /* !STARPU_USE_CPU */
+#ifdef STARPU_USE_CUDA
+extern void test_csr_cuda_func(void *buffers[], void *_args);
+#endif
+#ifdef STARPU_USE_OPENCL
+extern void test_csr_opencl_func(void *buffers[], void *args);
+#endif
+
+
+static int nzval[NNZ];
+static int nzval2[NNZ];
+
+static uint32_t colind[NNZ];
+static uint32_t colind2[NNZ];
+
+static uint32_t rowptr[HEIGHT+1];
+static uint32_t rowptr2[HEIGHT+1];
+
+static starpu_data_handle_t csr_handle;
+static starpu_data_handle_t csr2_handle;
+
+struct test_config csr_config =
+{
+#ifdef STARPU_USE_CPU
+	.cpu_func      = test_csr_cpu_func,
+#endif /* ! STARPU_USE_CPU */
+#ifdef STARPU_USE_CUDA
+	.cuda_func     = test_csr_cuda_func,
+#endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_func   = test_csr_opencl_func,
+#endif
+	.handle        = &csr_handle,
+	.dummy_handle  = &csr2_handle,
+	.copy_failed   = 0,
+	.name          = "csr_interface"
+};
+
+static void
+register_data(void)
+{
+	int i;
+	for (i = 1; i < SIZE; i++)
+	{
+		nzval[i-1] = i;
+		nzval2[i-1] = 42;
+
+		colind[i-1] = i % WIDTH;
+		colind2[i-1] = colind[i-1];
+	}
+
+	rowptr[0] = 1;
+	rowptr2[0] = 1;
+	for (i = 1; i < HEIGHT; i++)
+	{
+		rowptr[i] = i * WIDTH;
+		rowptr2[i] = rowptr[i];
+	}
+	rowptr[HEIGHT] = NNZ + 1;
+	rowptr2[HEIGHT] = rowptr[HEIGHT];
+
+	starpu_csr_data_register(&csr_handle,
+				 0,
+				 NNZ,
+				 HEIGHT,
+				 (uintptr_t) nzval,
+				 colind,
+				 rowptr,
+				 0,
+				 sizeof(nzval[0]));
+	starpu_csr_data_register(&csr2_handle,
+				 0,
+				 NNZ,
+				 HEIGHT,
+				 (uintptr_t) nzval2,
+				 colind2,
+				 rowptr2,
+				 0,
+				 sizeof(nzval2[0]));
+}
+
+static void
+unregister_data(void)
+{
+	starpu_data_unregister(csr_handle);
+	starpu_data_unregister(csr2_handle);
+}
+
+static void
+test_csr_cpu_func(void *buffers[], void *args)
+{
+	STARPU_SKIP_IF_VALGRIND;
+
+	int *val;
+	int factor;
+	int i;
+
+	uint32_t nnz = STARPU_CSR_GET_NNZ(buffers[0]);
+	val = (int *) STARPU_CSR_GET_NZVAL(buffers[0]);
+	factor = *(int *) args;
+
+	for (i = 0; i < nnz; i++)
+	{
+		if (val[i] != (i+1) * factor)
+		{
+			csr_config.copy_failed = 1;
+			return;
+		}
+		val[i] *= -1;
+	}
+}
+
+int
+main(void)
+{
+	data_interface_test_summary *summary;
+	struct starpu_conf conf =
+	{
+		.ncpus   = -1,
+		.ncuda   = 2,
+		.nopencl = 1
+	};
+
+	if (starpu_init(&conf) == -ENODEV)
+		goto enodev;
+
+	register_data();
+
+	summary = run_tests(&csr_config);
+	if (!summary)
+		exit(EXIT_FAILURE);
+
+	unregister_data();
+
+	starpu_shutdown();
+
+	data_interface_test_summary_print(stderr, summary);
+
+	return data_interface_test_summary_success(summary);
+
+enodev:
+	return STARPU_TEST_SKIPPED;
+}

+ 130 - 0
tests/datawizard/interfaces/csr/csr_opencl.c

@@ -0,0 +1,130 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <config.h>
+#include <starpu.h>
+#include <starpu_opencl.h>
+#include "../test_interfaces.h"
+
+#define KERNEL_LOCATION "tests/datawizard/interfaces/csr/csr_opencl_kernel.cl"
+extern struct test_config csr_config;
+static struct starpu_opencl_program opencl_program;
+
+void
+test_csr_opencl_func(void *buffers[], void *args)
+{
+	STARPU_SKIP_IF_VALGRIND;
+
+	int id, devid, ret;
+	int factor = *(int *) args;
+
+        cl_int             err;
+	cl_kernel          kernel;
+	cl_command_queue   queue;
+	cl_event           event;
+
+	ret = starpu_opencl_load_opencl_from_file(KERNEL_LOCATION, &opencl_program, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
+
+	uint32_t nnz = STARPU_CSR_GET_NNZ(buffers[0]);
+	cl_mem nzval = (cl_mem)STARPU_CSR_GET_NZVAL(buffers[0]);
+
+	cl_context context;
+	id = starpu_worker_get_id();
+	devid = starpu_worker_get_devid(id);
+	starpu_opencl_get_context(devid, &context);
+
+	cl_mem fail = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+		sizeof(int), &csr_config.copy_failed, &err);
+
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+
+	err = starpu_opencl_load_kernel(&kernel,
+					&queue,
+					&opencl_program,
+					"test_csr_opencl",
+					devid);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	int nargs;
+	nargs = starpu_opencl_set_kernel_args(&err, &kernel,
+					      sizeof(nzval), &nzval,
+					      sizeof(nnz), &nnz,
+					      sizeof(fail), &fail,
+					      sizeof(factor), &factor,
+					      0);
+
+	if (nargs != 4)
+	{
+		fprintf(stderr, "Failed to set argument #%d\n", nargs);
+		STARPU_OPENCL_REPORT_ERROR(err);
+	}
+			
+	{
+		size_t global = nnz;
+		size_t local;
+                size_t s;
+                cl_device_id device;
+
+                starpu_opencl_get_device(devid, &device);
+
+                err = clGetKernelWorkGroupInfo (kernel,
+						device,
+						CL_KERNEL_WORK_GROUP_SIZE,
+						sizeof(local),
+						&local,
+						&s);
+                if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+
+                if (local > global)
+			local = global;
+
+		err = clEnqueueNDRangeKernel(queue,
+					kernel,
+					1,
+					NULL,
+					&global,
+					&local,
+					0,
+					NULL,
+					&event);
+
+		if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+	}
+
+	err = clEnqueueReadBuffer(queue,
+				  fail,
+				  CL_TRUE,
+				  0, 
+				  sizeof(int),
+				  &csr_config.copy_failed,
+				  0,
+				  NULL,
+				  NULL);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	clFinish(queue);
+	starpu_opencl_collect_stats(event);
+	clReleaseEvent(event);
+
+	starpu_opencl_release_kernel(kernel);
+        starpu_opencl_unload_opencl(&opencl_program);
+}

+ 29 - 0
tests/datawizard/interfaces/csr/csr_opencl_kernel.cl

@@ -0,0 +1,29 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+__kernel void test_csr_opencl(__global int *val,
+			      unsigned int nx,
+			      __global int *err,
+			      int factor)
+{
+        const int i = get_global_id(0);
+        if (i >=  nx)
+		return;
+
+	if (val[i] != (i+1) * factor)
+		*err = 1;
+	else
+		val[i] = - val[i];
+}

+ 71 - 0
tests/datawizard/interfaces/matrix/matrix_cuda.cu

@@ -0,0 +1,71 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <starpu.h>
+#include <starpu_cuda.h>
+#include "../test_interfaces.h"
+
+extern struct test_config matrix_config;
+
+__global__ void matrix_cuda(int *val, unsigned n, int *err, int factor)
+{
+        unsigned i =  blockIdx.x*blockDim.x + threadIdx.x;
+
+	if (i >= n)
+		return;
+
+	if (val[i] != i*factor)
+		*err = 1;
+	else
+		val[i] = -val[i];
+}
+
+extern "C" void test_matrix_cuda_func(void *buffers[], void *args)
+{
+	int factor;
+	int *ret;
+	int *val;
+	cudaError_t error;
+	unsigned int nx, ny, n;
+
+	nx = STARPU_MATRIX_GET_NX(buffers[0]);
+	ny = STARPU_MATRIX_GET_NY(buffers[0]);
+	n = nx * ny;
+	unsigned threads_per_block = 64;
+	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
+	factor = *(int *) args;
+	val = (int *) STARPU_MATRIX_GET_PTR(buffers[0]);
+
+	error = cudaMalloc(&ret, sizeof(int));
+	if (error != cudaSuccess)
+		STARPU_CUDA_REPORT_ERROR(error);
+
+	error = cudaMemcpy(ret,
+			   &matrix_config.copy_failed,
+			   sizeof(int),
+			   cudaMemcpyHostToDevice);
+	if (error != cudaSuccess)
+		STARPU_CUDA_REPORT_ERROR(error);
+
+        matrix_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(val, n, ret, factor);
+
+	error = cudaMemcpy(&matrix_config.copy_failed,
+			   ret,
+			   sizeof(int),
+			   cudaMemcpyDeviceToHost);
+	
+	cudaFree(ret);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}

+ 145 - 0
tests/datawizard/interfaces/matrix/matrix_interface.c

@@ -0,0 +1,145 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <config.h>
+#include <starpu.h>
+#include "../test_interfaces.h"
+#include "../../../helper.h"
+
+#define WIDTH  16
+#define HEIGHT 16
+
+#ifdef STARPU_USE_CPU
+static void test_matrix_cpu_func(void *buffers[], void *args);
+#endif /* !STARPU_USE_CPU */
+#ifdef STARPU_USE_CUDA
+extern void test_matrix_cuda_func(void *buffers[], void *_args);
+#endif
+#ifdef STARPU_USE_OPENCL
+extern void test_matrix_opencl_func(void *buffers[], void *args);
+#endif
+
+
+static starpu_data_handle_t matrix_handle;
+static starpu_data_handle_t matrix2_handle;
+
+struct test_config matrix_config =
+{
+#ifdef STARPU_USE_CPU
+	.cpu_func      = test_matrix_cpu_func,
+#endif /* ! STARPU_USE_CPU */
+#ifdef STARPU_USE_CUDA
+	.cuda_func     = test_matrix_cuda_func,
+#endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_func   = test_matrix_opencl_func,
+#endif
+	.handle        = &matrix_handle,
+	.dummy_handle  = &matrix2_handle,
+	.copy_failed   = 0,
+	.name          = "matrix_interface"
+};
+
+static int matrix[WIDTH * HEIGHT];
+static int matrix2[WIDTH * HEIGHT];
+
+static void
+register_data(void)
+{
+	int i;
+	int size = WIDTH * HEIGHT;
+	for (i = 0; i < size; i++)
+		matrix[i] = i;
+
+	starpu_matrix_data_register(&matrix_handle,
+				    0,
+				    (uintptr_t) matrix,
+				    WIDTH, /* ld */
+				    WIDTH,
+				    HEIGHT,
+				    sizeof(matrix[0]));
+	starpu_matrix_data_register(&matrix2_handle,
+				    0,
+				    (uintptr_t) matrix2,
+				    WIDTH, /* ld */
+				    WIDTH,
+				    HEIGHT,
+				    sizeof(matrix[0]));
+}
+
+static void
+unregister_data(void)
+{
+	starpu_data_unregister(matrix_handle);
+	starpu_data_unregister(matrix2_handle);
+}
+
+static void
+test_matrix_cpu_func(void *buffers[], void *args)
+{
+	STARPU_SKIP_IF_VALGRIND;
+
+	int *val;
+	int factor;
+	int i;
+	unsigned int nx, ny;
+
+	nx = STARPU_MATRIX_GET_NX(buffers[0]);
+	ny = STARPU_MATRIX_GET_NY(buffers[0]);
+	val = (int *) STARPU_MATRIX_GET_PTR(buffers[0]);
+	factor = *(int *) args;
+
+	for (i = 0; i < nx*ny; i++)
+	{
+		if (val[i] != i * factor)
+		{
+			matrix_config.copy_failed = 1;
+			return;
+		}
+		val[i] *= -1;
+	}
+}
+
+int
+main(void)
+{
+	data_interface_test_summary *summary;
+	struct starpu_conf conf =
+	{
+		.ncpus   = -1,
+		.ncuda   = 2,
+		.nopencl = 1
+	};
+
+	if (starpu_init(&conf) == -ENODEV)
+		goto enodev;
+
+	register_data();
+
+	summary = run_tests(&matrix_config);
+	if (!summary)
+		exit(EXIT_FAILURE);
+
+	unregister_data();
+
+	starpu_shutdown();
+
+	data_interface_test_summary_print(stderr, summary);
+
+	return data_interface_test_summary_success(summary);
+
+enodev:
+	return STARPU_TEST_SKIPPED;
+}

+ 129 - 0
tests/datawizard/interfaces/matrix/matrix_opencl.c

@@ -0,0 +1,129 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <config.h>
+#include <starpu.h>
+#include <starpu_opencl.h>
+#include "../test_interfaces.h"
+
+#define KERNEL_LOCATION "tests/datawizard/interfaces/matrix/matrix_opencl_kernel.cl"
+
+extern struct test_config matrix_config;
+static struct starpu_opencl_program matrix_program;
+
+void test_matrix_opencl_func(void *buffers[], void *args)
+{
+	STARPU_SKIP_IF_VALGRIND;
+
+	int id, devid, factor, ret;
+	unsigned int n;
+
+        cl_int             err;
+	cl_kernel          kernel;
+	cl_command_queue   queue;
+	cl_event           event;
+	cl_context         context;
+	cl_mem             val, fail;
+
+	ret = starpu_opencl_load_opencl_from_file(KERNEL_LOCATION,
+						  &matrix_program,
+						  NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
+
+	factor = *(int *)args;
+	n = STARPU_MATRIX_GET_NX(buffers[0]);
+	n*= STARPU_MATRIX_GET_NY(buffers[0]);
+	val = (cl_mem)STARPU_MATRIX_GET_DEV_HANDLE(buffers[0]);
+
+	id = starpu_worker_get_id();
+	devid = starpu_worker_get_devid(id);
+	starpu_opencl_get_context(devid, &context);
+
+	err = starpu_opencl_load_kernel(&kernel,
+					&queue,
+					&matrix_program,
+					"matrix_opencl",
+					devid);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	fail = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+		sizeof(int), &matrix_config.copy_failed, &err);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	/* Setting args */
+	int nargs;
+	nargs = starpu_opencl_set_kernel_args(&err, &kernel,
+					sizeof(val), &val,
+					sizeof(n), &n,
+					sizeof(fail), &fail,
+					sizeof(factor), &factor,
+					0);
+	if (nargs != 4)
+		STARPU_OPENCL_REPORT_ERROR(err);
+	{
+		size_t global=n;
+		size_t local;
+                size_t s;
+                cl_device_id device;
+
+                starpu_opencl_get_device(devid, &device);
+
+                err = clGetKernelWorkGroupInfo (kernel,
+						device,
+						CL_KERNEL_WORK_GROUP_SIZE,
+						sizeof(local),
+						&local,
+						&s);
+                if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+
+                if (local > global)
+			local = global;
+
+		err = clEnqueueNDRangeKernel(queue,
+					kernel,
+					1,
+					NULL,
+					&global,
+					&local,
+					0,
+					NULL,
+					&event);
+
+		if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+	}
+
+	err = clEnqueueReadBuffer(queue,
+				  fail,
+				  CL_TRUE,
+				  0, 
+				  sizeof(int),
+				  &matrix_config.copy_failed,
+				  0,
+				  NULL,
+				  NULL);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	clFinish(queue);
+	starpu_opencl_collect_stats(event);
+	clReleaseEvent(event);
+
+	starpu_opencl_release_kernel(kernel);
+        starpu_opencl_unload_opencl(&matrix_program);
+}
+

+ 31 - 0
tests/datawizard/interfaces/matrix/matrix_opencl_kernel.cl

@@ -0,0 +1,31 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+__kernel void matrix_opencl(__global int *val,
+				 unsigned int nx,
+				 __global int *err,
+				 int factor)
+{
+        const int i = get_global_id(0);
+	if (i >= nx)
+		return;
+
+	if (val[i] != i * factor)
+		*err = 1;
+	else
+		val[i] *= -1;
+}
+

+ 0 - 0
tests/datawizard/interfaces/multiformat/multiformat_conversion_codelets.c


Some files were not shown because too many files changed in this diff