Sfoglia il codice sorgente

merge with trunk - part 1

Andra Hugo 14 anni fa
parent
commit
03b6b6a55b
100 ha cambiato i file con 16047 aggiunte e 1833 eliminazioni
  1. 4 0
      .dir-locals.el
  2. 187 0
      .gitignore
  3. 4 0
      AUTHORS
  4. 7 6
      ChangeLog
  5. 32 4
      Makefile.am
  6. 28 41
      README
  7. 169 0
      README.dev
  8. 16 0
      acinclude.m4
  9. 528 205
      configure.ac
  10. 1098 630
      doc/starpu.texi
  11. 45 0
      doc/tutorial/Makefile
  12. 33 0
      doc/tutorial/README
  13. 70 0
      doc/tutorial/hello_world.c
  14. 124 0
      doc/tutorial/vector_scal.c
  15. 50 0
      doc/tutorial/vector_scal_cpu.c
  16. 43 0
      doc/tutorial/vector_scal_cuda.cu
  17. 60 0
      doc/tutorial/vector_scal_opencl.c
  18. 25 0
      doc/tutorial/vector_scal_opencl_kernel.cl
  19. 1 1
      doc/vector_scal_c.texi
  20. 2 1
      doc/vector_scal_cuda.texi
  21. 3 3
      doc/vector_scal_opencl.texi
  22. 1 0
      examples/.gitignore
  23. 197 235
      examples/Makefile.am
  24. 5 7
      examples/audio/starpu_audio_processing.c
  25. 18 13
      examples/axpy/axpy.c
  26. 13 10
      examples/basic_examples/block.c
  27. 7 8
      examples/basic_examples/block_opencl.c
  28. 21 15
      examples/basic_examples/hello_world.c
  29. 36 79
      examples/basic_examples/mult.c
  30. 9 6
      examples/basic_examples/variable.c
  31. 2 2
      examples/basic_examples/variable_kernels_opencl.c
  32. 10 5
      examples/basic_examples/vector_scal.c
  33. 2 2
      examples/basic_examples/vector_scal_cuda.cu
  34. 3 3
      examples/basic_examples/vector_scal_opencl.c
  35. 73 0
      examples/callback/callback.c
  36. 21 22
      examples/cg/cg.c
  37. 2 2
      examples/cg/cg.h
  38. 19 0
      examples/cg/cg_dot_kernel.cu
  39. 9 4
      examples/cg/cg_kernels.c
  40. 3 2
      examples/cholesky/cholesky.h
  41. 31 33
      examples/cholesky/cholesky_grain_tag.c
  42. 30 32
      examples/cholesky/cholesky_implicit.c
  43. 26 9
      examples/cholesky/cholesky_kernels.c
  44. 9 9
      examples/cholesky/cholesky_models.c
  45. 34 36
      examples/cholesky/cholesky_tag.c
  46. 7 7
      examples/cholesky/cholesky_tile_tag.c
  47. 2 2
      examples/common/blas.h
  48. 3 3
      examples/common/blas_model.c
  49. 2 2
      examples/common/blas_model.h
  50. 20 18
      examples/filters/fblock.c
  51. 2 2
      examples/filters/fblock_opencl.c
  52. 13 12
      examples/filters/fmatrix.c
  53. 11 10
      examples/filters/fvector.c
  54. 21 23
      examples/heat/dw_factolu.c
  55. 47 45
      examples/heat/dw_factolu.h
  56. 24 26
      examples/heat/dw_factolu_grain.c
  57. 17 17
      examples/heat/dw_factolu_kernels.c
  58. 18 20
      examples/heat/dw_factolu_tag.c
  59. 7 6
      examples/heat/dw_sparse_cg.c
  60. 3 3
      examples/heat/dw_sparse_cg.h
  61. 0 4
      examples/heat/dw_sparse_cg_kernels.c
  62. 10 10
      examples/heat/heat.c
  63. 5 3
      examples/heat/heat.h
  64. 6 6
      examples/heat/heat_display.c
  65. 14 14
      examples/heat/lu_kernels_model.c
  66. 2 2
      examples/heat/lu_kernels_model.h
  67. 13 9
      examples/incrementer/incrementer.c
  68. 2 2
      examples/incrementer/incrementer_kernels_opencl.c
  69. 3 3
      examples/lu/lu_example.c
  70. 9 11
      examples/lu/xlu.c
  71. 10 12
      examples/lu/xlu_implicit.c
  72. 10 12
      examples/lu/xlu_implicit_pivot.c
  73. 36 30
      examples/mandelbrot/mandelbrot.c
  74. 36 30
      examples/matvecmult/matvecmult.c
  75. 26 22
      examples/mult/xgemm.c
  76. 105 0
      examples/openmp/vector_scal.c
  77. 78 0
      examples/opt/Makefile.am
  78. 50 0
      examples/opt/pi/SobolQRNG/CforCUDA_SDK_license.txt
  79. 60 0
      examples/opt/pi/SobolQRNG/sobol.h
  80. 141 0
      examples/opt/pi/SobolQRNG/sobol_gold.c
  81. 61 0
      examples/opt/pi/SobolQRNG/sobol_gold.h
  82. 170 0
      examples/opt/pi/SobolQRNG/sobol_gpu.cu
  83. 61 0
      examples/opt/pi/SobolQRNG/sobol_gpu.h
  84. 10271 0
      examples/opt/pi/SobolQRNG/sobol_primitives.c
  85. 75 0
      examples/opt/pi/SobolQRNG/sobol_primitives.h
  86. 175 0
      examples/opt/pi/pi.c
  87. 33 0
      examples/opt/pi/pi.h
  88. 150 0
      examples/opt/pi/pi_kernel.cu
  89. 362 0
      examples/opt/pi/pi_redux.c
  90. 128 0
      examples/opt/pi/pi_redux_kernel.cu
  91. 3 3
      examples/ppm_downscaler/ppm_downscaler.c
  92. 7 11
      examples/ppm_downscaler/yuv_downscaler.c
  93. 10 8
      examples/profiling/profiling.c
  94. 36 11
      examples/reductions/dot_product.c
  95. 9 3
      examples/reductions/minmax_reduction.c
  96. 12 6
      examples/scheduler/dummy_sched.c
  97. 51 0
      examples/socl/Makefile.am
  98. 211 0
      examples/socl/basic/basic.c
  99. 299 0
      examples/socl/clinfo/clinfo.c
  100. 0 0
      examples/socl/mandelbrot/mandelbrot.c

+ 4 - 0
.dir-locals.el

@@ -0,0 +1,4 @@
+;; Hey Emacs, use the ugly style!
+
+((c-mode . ((c-file-style . "linux")
+	    (indent-tabs-mode . t))))

+ 187 - 0
.gitignore

@@ -0,0 +1,187 @@
+/configure
+/config.log
+/config.status
+/autom4te.cache
+/libtool
+/libstarpu.pc
+/aclocal.m4
+/build-aux
+/GPATH
+/GRTAGS
+/GTAGS
+/config.cache
+/doc/starpu.info
+*~
+,*
+Makefile
+Makefile.in
+.libs
+.deps
+*.o
+*.lo
+*.la
+*.swp
+.dirstamp
+stamp-h[0-9]*
+starpu.log
+/gcc-plugin/src/starpu-gcc-config.h
+/gcc-plugin/tests/*.c.[0-9]*.*
+/tests/datawizard/handle_to_pointer
+/tests/datawizard/data_lookup
+/doc/stamp-vti
+/doc/version.texi
+/examples/basic_examples/block
+/examples/basic_examples/hello_world
+/examples/basic_examples/mult
+/examples/basic_examples/variable
+/examples/basic_examples/vector_scal
+/examples/callback/callback
+/examples/filters/fblock
+/examples/filters/fmatrix
+/examples/filters/fvector
+/examples/incrementer/incrementer
+/examples/mandelbrot/mandelbrot
+/examples/matvecmult/matvecmult
+/examples/pi/pi
+/examples/pi/pi_redux
+/examples/ppm_downscaler/ppm_downscaler
+/examples/ppm_downscaler/yuv_downscaler
+/examples/profiling/profiling
+/examples/reductions/dot_product
+/examples/reductions/minmax_reduction
+/examples/scheduler/dummy_sched
+/examples/spmv/dw_spmv
+/examples/spmv/spmv
+/examples/stencil/stencil
+/examples/tag_example/tag_example
+/examples/tag_example/tag_example2
+/examples/tag_example/tag_example3
+/examples/tag_example/tag_restartable
+/mpi/examples/stencil/stencil5
+/mpi/tests/block_interface
+/mpi/tests/block_interface_pinned
+/mpi/tests/insert_task
+/mpi/tests/insert_task_block
+/mpi/tests/insert_task_cache
+/mpi/tests/insert_task_owner
+/mpi/tests/insert_task_owner2
+/mpi/tests/mpi_detached_tag
+/mpi/tests/mpi_irecv
+/mpi/tests/mpi_irecv_detached
+/mpi/tests/mpi_isend
+/mpi/tests/mpi_isend_detached
+/mpi/tests/mpi_test
+/mpi/tests/multiple_send
+/mpi/tests/pingpong
+/mpi/tests/ring
+/mpi/tests/ring_async
+/mpi/tests/ring_async_implicit
+/tests/core/declare_deps_after_submission
+/tests/core/declare_deps_after_submission_synchronous
+/tests/core/declare_deps_in_callback
+/tests/core/empty_task
+/tests/core/empty_task_chain
+/tests/core/empty_task_sync_point
+/tests/core/empty_task_sync_point_tasks
+/tests/core/execute_on_a_specific_worker
+/tests/core/get_current_task
+/tests/core/insert_task
+/tests/core/multithreaded
+/tests/core/multithreaded_init
+/tests/core/regenerate
+/tests/core/restart
+/tests/core/starpu_task_wait
+/tests/core/starpu_task_wait_for_all
+/tests/core/static_restartable
+/tests/core/static_restartable_tag
+/tests/core/static_restartable_using_initializer
+/tests/core/subgraph_repeat
+/tests/core/subgraph_repeat_regenerate
+/tests/core/tag_wait_api
+/tests/core/task_wait_api
+/tests/core/wait_all_regenerable_tasks
+/tests/datawizard/acquire_cb
+/tests/datawizard/acquire_release
+/tests/datawizard/acquire_release2
+/tests/datawizard/critical_section_with_void_interface
+/tests/datawizard/data_implicit_deps
+/tests/datawizard/data_invalidation
+/tests/datawizard/dining_philosophers
+/tests/datawizard/dsm_stress
+/tests/datawizard/increment_redux
+/tests/datawizard/increment_redux_v2
+/tests/datawizard/lazy_allocation.c
+/tests/datawizard/manual_reduction
+/tests/datawizard/mpi_like
+/tests/datawizard/mpi_like_async
+/tests/datawizard/readers_and_writers
+/tests/datawizard/reclaim
+/tests/datawizard/scratch
+/tests/datawizard/sync_and_notify_data
+/tests/datawizard/sync_and_notify_data_implicit
+/tests/datawizard/sync_with_data_with_mem
+/tests/datawizard/sync_with_data_with_mem_non_blocking
+/tests/datawizard/sync_with_data_with_mem_non_blocking_implicit
+/tests/datawizard/unpartition
+/tests/datawizard/user_interaction_implicit
+/tests/datawizard/write_only_tmp_buffer
+/tests/errorcheck/invalid_blocking_calls
+/tests/errorcheck/invalid_tasks
+/tests/errorcheck/starpu_init_noworker
+/tests/helper/cublas_init
+/tests/helper/execute_on_all
+/tests/helper/pinned_memory
+/tests/helper/starpu_create_sync_task
+/tests/helper/starpu_data_cpy
+/tests/microbenchs/async_tasks_overhead
+/tests/microbenchs/display_structures_size
+/tests/microbenchs/local_pingpong
+/tests/microbenchs/prefetch_data_on_node
+/tests/microbenchs/redundant_buffer
+/tests/microbenchs/sync_tasks_overhead
+/tests/microbenchs/tasks_overhead
+/tests/overlap/overlap
+/tests/parallel_tasks/explicit_combined_worker
+/tests/parallel_tasks/parallel_kernels
+/tests/parallel_tasks/parallel_kernels_spmd
+/tests/parallel_tasks/spmd_pgreedy
+/tests/perfmodels/non_linear_regression_based
+/tests/perfmodels/regression_based
+/tools/cbc2paje
+/tools/lp2paje
+/tools/starpu_calibrate_bus
+/tools/starpu_machine_display
+/tools/starpu_perfmodel_display
+/tools/starpu_regression_display
+/gcc-plugin/tests/scalar-tasks
+/gcc-plugin/tests/pointers
+/tests/datawizard/lazy_allocation
+/gcc-plugin/tests/pointer-tasks
+/gcc-plugin/tests/*.s
+/gcc-plugin/tests/base
+/gcc-plugin/tests/core
+/mpi/tests/insert_task_owner_data
+/mpi/examples/scatter_gather/mpi_scatter_gather
+/examples/top/hello_world_top
+/doc/starpu.aux
+/doc/starpu.cp
+/doc/starpu.cps
+/doc/starpu.fn
+/doc/starpu.fns
+/doc/starpu.html
+/doc/starpu.ky
+/doc/starpu.pdf
+/doc/starpu.pg
+/doc/starpu.toc
+/doc/starpu.tp
+/doc/starpu.tps
+/doc/starpu.vr
+/gcc-plugin/tests/register
+/tests/datawizard/acquire_cb_insert
+/tools/starpu_perfmodel_plot
+/gcc-plugin/tests/run-test
+/gcc-plugin/tests/register-errors
+/gcc-plugin/tests/acquire
+/gcc-plugin/tests/unregister
+/gcc-plugin/tests/lib-user
+/gcc-plugin/examples/matrix-mult

+ 4 - 0
AUTHORS

@@ -6,3 +6,7 @@ Sylvain Henry <sylvain.henry@inria.fr>
 Mehdi Juhoor <mjuhoor@gmail.com>
 François Tessier <francois.tessier@inria.fr>
 Samuel Thibault <samuel.thibault@labri.fr>
+William Braik <wbraik@gmail.com>
+Yann Courtois <yann.courtois33@gmail.com>
+Jean-Marie Couteyen <jm.couteyen@gmail.com>
+Anthony Roy <theanthony33@gmail.com>

+ 7 - 6
ChangeLog

@@ -1,7 +1,7 @@
-StarPU 0.5 (svn revision ????)
+StarPU 0.9 (svn revision 3721)
 ==============================================
-The yet-more-stuff release
-  
+The extensions release
+
   * Provide the STARPU_REDUX data access mode
   * Externalize the scheduler API.
   * Add theoretical bound computation
@@ -19,11 +19,12 @@ The yet-more-stuff release
   * Add mandelbrot OpenCL example
   * Add cg example
   * Add stencil MPI example
+  * Initial support for CUDA4
 
 StarPU 0.4 (svn revision 2535)
 ==============================================
 The API strengthening release
-  
+
   * Major API improvements
     - Provide the STARPU_SCRATCH data access mode
     - Rework data filter interface
@@ -41,7 +42,7 @@ The API strengthening release
   * Provide a library to help accelerating MPI applications
   * Improve data transfers overhead prediction
     - Transparently benchmark buses to generate performance models
-    - Bind accelerator-controlling threads with respect to NUMA locality 
+    - Bind accelerator-controlling threads with respect to NUMA locality
   * Improve StarPU's portability
     - Add OpenCL support
     - Add support for Windows
@@ -63,7 +64,7 @@ The asynchronous heterogeneous multi-accelerator release
     - All data transfers use data requests now
     - Implement asynchronous data transfers
     - Implement prefetch mechanism
-    - Chain data requests to support GPU->RAM->GPU transfers 
+    - Chain data requests to support GPU->RAM->GPU transfers
   * Make it possible to bypass the scheduler and to assign a task to a specific
     worker
   * Support restartable tasks to reinstanciate dependencies task graphs

+ 32 - 4
Makefile.am

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2009, 2010  Université de Bordeaux 1
+# Copyright (C) 2009-2011  Université de Bordeaux 1
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -21,8 +21,22 @@ SUBDIRS = src
 if USE_MPI
 SUBDIRS += mpi
 endif
+
+if BUILD_SOCL
+SUBDIRS += socl
+endif
+
 SUBDIRS += tools examples tests doc
 
+if COND_OPT
+SUBDIRS += tests/opt examples/opt
+endif
+
+
+if BUILD_GCC_PLUGIN
+SUBDIRS += gcc-plugin
+endif
+
 pkgconfigdir = $(libdir)/pkgconfig
 pkgconfig_DATA = libstarpu.pc
 
@@ -42,10 +56,24 @@ include_HEADERS = 				\
 	include/starpu_expert.h			\
 	include/starpu_profiling.h		\
 	include/starpu_bound.h			\
-	include/starpu_scheduler.h
+	include/starpu_scheduler.h		\
+	include/starpu_top.h
+
+if BUILD_STARPU_TOP
+all-local:
+	cd starpu-top ; $(QMAKE) ; $(MAKE)
+clean-local:
+	cd starpu-top ; $(MAKE) clean
+# TODO: resources
+install-exec-local:
+	$(MKDIR_P) $(DESTDIR)$(bindir)
+	$(INSTALL_STRIP_PROGRAM) starpu-top/StarPU-Top $(DESTDIR)$(bindir)
+endif
 
+if STARPU_HAVE_WINDOWS
 txtdir = ${prefix}
+else
+txtdir = ${docdir}
+endif
 txt_DATA = AUTHORS COPYING.LGPL README
 EXTRA_DIST = AUTHORS COPYING.LGPL README
-
-

+ 28 - 41
README

@@ -59,6 +59,12 @@ advantage of their specificities in a portable fashion.
    units according to the machine topology. For more details on hwloc, see
    http://www.open-mpi.org/projects/hwloc/ .
 
+ * To build the StarPU-Top tool the following are also required:
+   * libqt4 >= 4.7
+   * libqt4-network
+   * libqt4-opengl
+   * libqt4-sql
+
 ++=====================++
 || III. Getting StarPU ||
 ++=====================++
@@ -97,61 +103,42 @@ we provide MinGW-built binaries.  The build process produces libstarpu.dll,
 libstarpu.def, and libstarpu.lib, which should be enough to use it from e.g.
 Microsoft Visual Studio.
 
-A few details need to be fixed when building StarPU on windows:
-
-- To get a .def file built, make sure that MSVC's lib.exe tool is in PATH.
-
-- Update the video drivers to the latest stable release available for your
-  hardware. Older ATI drivers (< 2.3) contain bugs that cause OpenCL support in
-  StarPU to hang or exhibit incorrect behaviour.
+Update the video drivers to the latest stable release available for your
+hardware. Old ATI drivers (< 2.3) contain bugs that cause OpenCL support in
+StarPU to hang or exhibit incorrect behaviour.
 
-- c:\cuda\include\host_defines.h has a bogus CUDARTAPI definition which makes
-  linking fail completely. Replace the first occurence of
+For details on the Windows build process, see the README.dev file in the
+subversion tree.
 
-    #define CUDARTAPI
-    
-  with
-    
-    #ifdef _WIN32
-    #define CUDARTAPI __stdcall
-    #else
-    #define CUDARTAPI
-    #endif
-
-  While at it, you can also comment the __cdecl definition to avoid spurious
-  warnings.
-
-- If you have a non-english version of windows, use
+++==================++
+|| V. Documentation ||
+++==================++
 
-    export LANG=C
+Texinfo documentation is available in doc/ . If LaTeX is available on the
+machine, a pdf can be generated by running
 
-  else libtool has troubles parsing the translated output of the toolchain.
+  $ make -C doc pdf
 
-- libtool is not able to find the libraries automatically, you need to make some
-  copies:
+If makeinfo is available on the machine, html pages can be generated by running
 
-    copy c:\cuda\lib\cuda.lib c:\cuda\lib\libcuda.lib
-    copy c:\cuda\lib\cudart.lib c:\cuda\lib\libcudart.lib
-    copy c:\cuda\lib\cublas.lib c:\cuda\lib\libcublas.lib
-    copy c:\cuda\lib\cufft.lib c:\cuda\lib\libcufft.lib
-    copy c:\cuda\lib\OpenCL.lib c:\cuda\lib\libOpenCL.lib
+  $ make -C doc html
 
-++===========++
-|| V. Trying ||
-++===========++
+++============++
+|| VI. Trying ||
+++============++
 
 Some examples ready to run are installed into $prefix/lib/starpu/{examples,mpi}
 
-++=============++
-|| VI. Upgrade ||
-++=============++
+++==============++
+|| VII. Upgrade ||
+++==============++
 
 To upgrade your source code from older version (there were quite a few
 renamings), use the tools/rename.sh script
 
-++==============++
-|| VII. Contact ||
-++==============++
+++===============++
+|| VIII. Contact ||
+++===============++
 
 For any questions regarding StarPU, please contact the starpu-devel
 mailing-list at starpu-devel@lists.gforge.inria.fr .

+ 169 - 0
README.dev

@@ -0,0 +1,169 @@
+Installing StarPU on windows
+----------------------------
+
+If you are building from a tarball downloaded from the website, you can skip the
+cygwin part.
+
+1. Install cygwin
+
+http://cygwin.com/install.html
+
+Make sure the following packages are available:
+- (Devel)/subversion
+- (Devel)/libtool
+- (Devel)/gcc
+- (Devel)/make
+- your favorite editor (vi, emacs, ...)
+- (Devel)/gdb
+- (Archive)/zip
+- (Devel)/pkg-config
+
+2. Install mingw
+
+http://sourceforge.net/projects/mingw/
+
+3. Install hwloc (not mandatory)
+
+http://www.open-mpi.org/projects/hwloc
+
+4. Install Microsoft Visual C++ Studio Express
+
+   http://www.microsoft.com/express/Downloads
+
+   Add in your path the following directories.
+   (adjusting where necessary for the Installation location according to VC
+    version and on 64 and 32bit Windows versions)
+
+   On cygwin, with Visual C++ 2010 e.g.;
+
+   export PATH="/cygdrive/c/Program Files (x86)/Microsoft Visual Studio 10.0/Common7/IDE":$PATH
+   export PATH="/cygdrive/c/Program Files (x86)/Microsoft Visual Studio 10.0/VC/bin":$PATH
+
+   On MingW, with Visual C++ 2010, e.g.;
+
+   export PATH="/c/Program Files (x86)/Microsoft Visual Studio 10.0/Common7/IDE":$PATH
+   export PATH="/c/Program Files (x86)/Microsoft Visual Studio 10.0/VC/bin":$PATH
+
+   Try to call <lib.exe> and <link.exe> without any option to make sure these
+   dump their help output, else no .def or .lib file will be produced.
+
+5. Install GPU Drivers (not mandatory)
+
+  5.1 Install Cuda
+
+      http://developer.nvidia.com/object/cuda_3_2_downloads.html
+
+      You need to install at least the CUDA toolkit.
+
+      libtool is not able to find the libraries automatically, you
+      need to make some copies:
+
+      copy c:\cuda\lib\cuda.lib c:\cuda\lib\libcuda.lib
+      copy c:\cuda\lib\cudart.lib c:\cuda\lib\libcudart.lib
+      copy c:\cuda\lib\cublas.lib c:\cuda\lib\libcublas.lib
+      copy c:\cuda\lib\cufft.lib c:\cuda\lib\libcufft.lib
+      copy c:\cuda\lib\OpenCL.lib c:\cuda\lib\libOpenCL.lib
+
+      (and if the version of your CUDA driver is >= 3.2)
+
+      copy c:\cuda\lib\curand.lib c:\cuda\lib\libcurand.lib
+
+      Add the CUDA bin directory in your path
+
+      export PATH=/cygdrive/c/CUDA/bin:$PATH
+
+      Since we tell nvcc to build CUDA code with gcc instead of Visual studio,
+      a fix is needed: c:\cuda\include\host_defines.h has a bogus CUDARTAPI
+      definition which makes linking fail completely. Replace the first
+      occurrence of
+
+      #define CUDARTAPI
+
+      with
+
+      #ifdef _WIN32
+      #define CUDARTAPI __stdcall
+      #else
+      #define CUDARTAPI
+      #endif
+
+      While at it, you can also comment the __cdecl definition to avoid spurious
+      warnings.
+
+
+  5.2 Install OpenCL
+
+      http://developer.nvidia.com/object/opencl-download.html
+
+      You need to download the NVIDIA Drivers for your version of
+      Windows. Executing the file will extract all files in a given
+      directory. Then the driver installation will start; it will fail
+      if no compatible driver can be found on your system.
+
+      In either case, copy the *.dl_ files from the extraction
+      directory into the bin directory of the CUDA installation
+      (that directory should be v3.2/bin/).
+
+  5.3 Install MsCompress
+
+      http://gnuwin32.sourceforge.net/packages/mscompress.htm
+
+      Go to the CUDA bin directory, uncompress the .dl_ files and
+      rename them to .dll files:
+
+      cp /cygdrive/c/NVIDIA/DisplayDriver/190.89/International/*.dl_ .
+      for i in *.dl_ ; do /cygdrive/c/Program\ Files/GnuWin32/bin/msexpand.exe  $i ; mv ${i%_} ${i%_}l ; done
+
+If you are building from a tarball downloaded from the website, you can skip the
+autogen.sh part.
+
+6. Start autogen.sh from cygwin
+
+   cd starpu-trunk
+   ./autogen.sh
+
+7. Start a MinGW shell
+
+   /cygdrive/c/MinGW/msys/1.0/bin/sh.exe --login -i
+
+8. Configure, make, install from MinGW
+
+   If you have a non-english version of windows, use
+
+     export LANG=C
+
+   otherwise libtool has trouble parsing the translated output of the toolchain.
+
+   cd starpu-trunk
+   mkdir build
+   cd build
+   ../configure --prefix=$PWD/target --disable-default-drand48 \
+        --with-hwloc=<HWLOC installation directory> \
+        --with-cuda-dir=<CUDA installation directory> \
+        --with-cuda-lib-dir=<CUDA installation directory>/lib/Win32 \
+	--with-opencl-dir=<CUDA installation directory>
+   make
+   make install
+
+   Also convert a couple of files to CRLF:
+
+   sed -e 's/$/'$'\015'/ < README > $PWD/target/README.txt
+   sed -e 's/$/'$'\015'/ < AUTHORS > $PWD/target/AUTHORS.txt
+   sed -e 's/$/'$'\015'/ < COPYING.LGPL > $PWD/target/COPYING.LGPL.txt
+
+9. If you want your StarPU installation to be standalone, you need to
+   copy the DLL files from hwloc, Cuda, and OpenCL into the StarPU
+   installation bin directory, as well as MinGW/bin/libpthread*dll
+
+   cp <CUDA directory>/bin/*dll target/bin
+   cp <HWLOC directory>/bin/*dll target/bin
+   cp /cygdrive/c/MinGW/bin/libpthread*dll target/bin
+
+   and set the StarPU bin directory in your path.
+
+   export PATH=<StarPU installation directory>/bin:$PATH
+
+
+Developers warning
+------------------
+Developer warnings are only enabled if the STARPU_DEVEL environment variable is set to a non-empty value.

+ 16 - 0
acinclude.m4

@@ -1,3 +1,19 @@
+dnl Copyright (C) Free Software Foundation, Inc.
+dnl
+dnl This program is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU General Public License as published by
+dnl the Free Software Foundation; either version 2 of the License, or
+dnl (at your option) any later version.
+dnl 
+dnl This program is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+dnl GNU General Public License for more details.
+dnl 
+dnl You should have received a copy of the GNU General Public License
+dnl along with this program; if not, write to the Free Software
+dnl Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+dnl
 dnl This test is taken from libgfortran
 
 dnl Check whether the target supports __sync_val_compare_and_swap.

File diff suppressed because it is too large
+ 528 - 205
configure.ac


File diff suppressed because it is too large
+ 1098 - 630
doc/starpu.texi


+ 45 - 0
doc/tutorial/Makefile

@@ -0,0 +1,45 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009-2011  Université de Bordeaux 1
+# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+# Compile and link flags come from pkg-config. The doubled '$$' leaves a
+# literal $(...) in the variable, so the command substitution is performed
+# by the shell that runs each recipe.
+CFLAGS          +=      $$(pkg-config --cflags libstarpu)
+LDFLAGS         +=      $$(pkg-config --libs libstarpu)
+
+# Non-empty when the libstarpu link flags mention CUDA / OpenCL.
+HAS_CUDA	=	$(shell pkg-config --libs libstarpu|grep -i cuda)
+NVCC		?=	nvcc
+HAS_OPENCL	=	$(shell pkg-config --libs libstarpu|grep -i opencl)
+
+# 'all' and 'clean' do not name files; keep them working even if a file
+# with one of these names appears in the directory.
+.PHONY: all clean
+
+# CUDA objects are built with the overridable $(NVCC), like the link step.
+%.o: %.cu
+	$(NVCC) $(CFLAGS) $< -c
+
+all: hello_world vector_scal
+
+# Objects linked into vector_scal; the CUDA and OpenCL ones are only added
+# when the corresponding support was detected above.
+VECTOR_SCAL_PREREQUISITES	=	vector_scal.o vector_scal_cpu.o
+ifneq ($(strip $(HAS_CUDA)),)
+VECTOR_SCAL_PREREQUISITES	+=	vector_scal_cuda.o
+VECTOR_SCAL_COMPILER		=	$(NVCC)
+else
+VECTOR_SCAL_COMPILER		=	$(CC)
+endif
+ifneq ($(strip $(HAS_OPENCL)),)
+VECTOR_SCAL_PREREQUISITES += vector_scal_opencl.o
+endif
+
+vector_scal: $(VECTOR_SCAL_PREREQUISITES)
+	$(VECTOR_SCAL_COMPILER) $(LDFLAGS) $^ -o $@
+
+clean:
+	rm -f hello_world vector_scal *.o
+

+ 33 - 0
doc/tutorial/README

@@ -0,0 +1,33 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009-2011  Université de Bordeaux 1
+# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+
+Instructions on how to compile and run StarPU examples
+------------------------------------------------------
+
+% export STARPU_DIR=<directory where StarPU is installed>
+% export PKG_CONFIG_PATH=$STARPU_DIR/lib/pkgconfig:$PKG_CONFIG_PATH
+% export LD_LIBRARY_PATH=$STARPU_DIR/lib:$LD_LIBRARY_PATH
+
+% make hello_world
+% ./hello_world
+
+% make vector_scal
+% ./vector_scal
+
+% STARPU_NCPUS=0 ./vector_scal
+% STARPU_NCPUS=0 STARPU_NCUDA=0 ./vector_scal
+

+ 70 - 0
doc/tutorial/hello_world.c

@@ -0,0 +1,70 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+
+struct params { /* codelet argument: main() passes it via task->cl_arg, cpu_func prints it */
+    int i;   /* printed with %i */
+    float f; /* printed with %f */
+};
+
+/* CPU implementation of the codelet: prints both fields of the
+ * struct params that cl_arg points to. The buffers array is not used. */
+void cpu_func(void *buffers[], void *cl_arg)
+{
+    const struct params *args = cl_arg;
+    const int i = args->i;
+    const float f = args->f;
+
+    printf("Hello world (params = {%i, %f} )\n", i, f);
+}
+
+starpu_codelet cl = /* codelet that main() attaches to its task through task->cl */
+{
+    .where = STARPU_CPU,  /* NOTE(review): presumably restricts execution to CPU workers -- confirm */
+    .cpu_func = cpu_func, /* the function defined above */
+    .nbuffers = 0         /* this codelet is given no data buffers */
+};
+
+/* Callback installed through task->callback_func: prints the opaque
+ * pointer it is given as callback_arg. */
+void callback_func(void *callback_arg)
+{
+    /* %p is the printf conversion defined for a void *; handing a pointer
+     * to %x is undefined behaviour (and truncates on LP64 targets). */
+    printf("Callback function (arg %p)\n", callback_arg);
+}
+
+/* Builds one synchronous task around the codelet, submits it and shuts
+ * StarPU down. Returns 0 on success, 1 if StarPU could not be initialized
+ * or the task could not be submitted. */
+int main(int argc, char **argv)
+{
+    /* initialize StarPU; give up if the runtime reports a failure */
+    if (starpu_init(NULL) != 0)
+        return 1;
+
+    struct starpu_task *task = starpu_task_create();
+
+    task->cl = &cl; /* Pointer to the codelet defined above */
+
+    /* argument handed to the codelet function through cl_arg */
+    struct params params = { 1, 2.0f };
+    task->cl_arg = &params;
+    task->cl_arg_size = sizeof(params);
+
+    /* the callback receives its argument as a void *, so the integer tag
+     * is converted with an explicit cast */
+    task->callback_func = callback_func;
+    task->callback_arg = (void *) 0x42;
+
+    /* starpu_task_submit will be a blocking call */
+    task->synchronous = 1;
+
+    /* submit the task to StarPU and remember whether that worked */
+    int ret = starpu_task_submit(task);
+
+    /* terminate StarPU */
+    starpu_shutdown();
+
+    return ret != 0;
+}

+ 124 - 0
doc/tutorial/vector_scal.c

@@ -0,0 +1,124 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This example demonstrates how to use StarPU to scale an array by a factor.
+ * It shows how to manipulate data with StarPU's data management library.
+ *  1- how to declare a piece of data to StarPU (starpu_vector_data_register)
+ *  2- how to describe which data are accessed by a task (task->buffers[0])
+ *  3- how a kernel can manipulate the data (buffers[0].vector.ptr)
+ */
+#include <starpu.h>
+#include <starpu_opencl.h>
+
+#define    NX    2048
+
+extern void scal_cpu_func(void *buffers[], void *_args);
+extern void scal_cuda_func(void *buffers[], void *_args);
+extern void scal_opencl_func(void *buffers[], void *_args);
+
+static starpu_codelet cl = { /* Codelet attached to the task created in main().
+     * NOTE(review): .where lists CUDA and OpenCL unconditionally, while the
+     * matching functions below are only set when STARPU_USE_CUDA /
+     * STARPU_USE_OPENCL are defined -- confirm this is intended. */
+    .where = STARPU_CPU | STARPU_CUDA | STARPU_OPENCL,
+    /* CPU implementation of the codelet */
+    .cpu_func = scal_cpu_func,
+#ifdef STARPU_USE_CUDA
+    /* CUDA implementation of the codelet */
+    .cuda_func = scal_cuda_func,
+#endif
+#ifdef STARPU_USE_OPENCL
+    /* OpenCL implementation of the codelet */
+    .opencl_func = scal_opencl_func,
+#endif
+    .nbuffers = 1 /* one buffer: main() fills task->buffers[0] */
+};
+
+#ifdef STARPU_USE_OPENCL
+struct starpu_opencl_program programs; /* filled from vector_scal_opencl_kernel.cl in main(), unloaded there before shutdown */
+#endif
+
+/* Scales a vector of NX floats by a constant factor through one synchronous
+ * StarPU task. Returns 0 on success, 1 if StarPU could not be initialized
+ * or the task could not be submitted. */
+int main(int argc, char **argv)
+{
+    /* We consider a vector of float that is initialized just like any
+     * other C data */
+    float vector[NX];
+    unsigned i;
+    for (i = 0; i < NX; i++)
+        vector[i] = 1.0f;
+
+    fprintf(stderr, "BEFORE : First element was %f\n", vector[0]);
+
+    /* Initialize StarPU with default configuration; give up if the
+     * runtime reports a failure */
+    if (starpu_init(NULL) != 0)
+        return 1;
+
+#ifdef STARPU_USE_OPENCL
+    starpu_opencl_load_opencl_from_file("vector_scal_opencl_kernel.cl", &programs, NULL);
+#endif
+
+    /* Tell StarPU to associate the "vector" vector with the "vector_handle"
+     * identifier. When a task needs to access a piece of data, it should
+     * refer to the handle that is associated to it.
+     * In the case of the "vector" data interface:
+     *  - the first argument of the registration method is a pointer to the
+     *    handle that should describe the data
+     *  - the second argument is the memory node where the data (ie. "vector")
+     *    resides initially: 0 stands for an address in main memory, as
+     *    opposed to an address on a GPU for instance.
+     *  - the third argument is the address of the vector in RAM
+     *  - the fourth argument is the number of elements in the vector
+     *  - the fifth argument is the size of each element.
+     */
+    starpu_data_handle vector_handle;
+    starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector,
+                                NX, sizeof(vector[0]));
+
+    float factor = 3.14;
+
+    /* create a synchronous task: any call to starpu_task_submit will block
+     * until it is terminated */
+    struct starpu_task *task = starpu_task_create();
+    task->synchronous = 1;
+
+    task->cl = &cl;
+
+    /* the codelet manipulates one buffer in RW mode */
+    task->buffers[0].handle = vector_handle;
+    task->buffers[0].mode = STARPU_RW;
+
+    /* an argument is passed to the codelet, beware that this is a
+     * READ-ONLY buffer and that the codelet may be given a pointer to a
+     * COPY of the argument */
+    task->cl_arg = &factor;
+    task->cl_arg_size = sizeof(factor);
+
+    /* execute the task on any eligible computational resource and
+     * remember whether the submission worked */
+    int ret = starpu_task_submit(task);
+
+    /* StarPU does not need to manipulate the array anymore so we can stop
+     * monitoring it */
+    starpu_data_unregister(vector_handle);
+
+#ifdef STARPU_USE_OPENCL
+    starpu_opencl_unload_opencl(&programs);
+#endif
+
+    /* terminate StarPU, no task can be submitted after */
+    starpu_shutdown();
+
+    fprintf(stderr, "AFTER First element is %f\n", vector[0]);
+
+    return ret != 0;
+}

+ 50 - 0
doc/tutorial/vector_scal_cpu.c

@@ -0,0 +1,50 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+
+/* CPU kernel of the codelet: multiplies every element of the vector described
+ * by buffers[0] by the float that cl_arg points to. */
+void scal_cpu_func(void *buffers[], void *cl_arg)
+{
+    /*
+     * Entry k of "buffers" matches task->buffers[k]. Since
+     * task->buffers[0].handle refers to a piece of data using the vector
+     * "interface", buffers[0] points to the structure describing that
+     * vector: nx is its number of elements, ptr is the location of the
+     * array (which was possibly migrated/replicated) and elemsize is the
+     * size of each element.
+     */
+    starpu_vector_interface_t *vector = buffers[0];
+
+    /* the scaling factor is handed over through the codelet argument */
+    float *scale = cl_arg;
+
+    /* a vector may hold any type of element, so the interface stores the
+     * address as a uintptr_t: cast it back to the float array it really is */
+    float *cur = (float *)STARPU_VECTOR_GET_PTR(vector);
+
+    /* number of elements that still have to be scaled */
+    unsigned remaining = STARPU_VECTOR_GET_NX(vector);
+
+    /* walk through the array, scaling one element per iteration */
+    while (remaining-- > 0)
+        *cur++ *= *scale;
+}
+

+ 43 - 0
doc/tutorial/vector_scal_cuda.cu

@@ -0,0 +1,43 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <starpu_cuda.h>
+
+/* Device kernel: each thread scales at most one element of val by factor. */
+static __global__ void vector_mult_cuda(float *val, unsigned n, float factor)
+{
+        /* global index of this thread within the 1D grid */
+        const unsigned idx = blockIdx.x*blockDim.x + threadIdx.x;
+
+        /* threads of the last block that fall past the end have nothing to do */
+        if (idx >= n)
+                return;
+
+        val[idx] *= factor;
+}
+
+/* CUDA implementation of the codelet: launches vector_mult_cuda on the stream
+ * returned by starpu_cuda_get_local_stream() and waits for that stream. */
+extern "C" void scal_cuda_func(void *buffers[], void *_args)
+{
+        const unsigned block_size = 64;
+
+        /* scaling factor, passed as the codelet argument */
+        float *scale = (float *)_args;
+
+        /* length of the vector */
+        unsigned nelems = STARPU_VECTOR_GET_NX(buffers[0]);
+        /* local copy of the vector pointer */
+        float *data = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);
+
+        /* round up so that a partially filled last block is launched too */
+        unsigned grid_size = (nelems + block_size-1) / block_size;
+
+        vector_mult_cuda<<<grid_size, block_size, 0, starpu_cuda_get_local_stream()>>>(data, nelems, *scale);
+
+        /* do not return before the work queued on the stream has completed */
+        cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}
+

+ 60 - 0
doc/tutorial/vector_scal_opencl.c

@@ -0,0 +1,60 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <starpu_opencl.h>
+
+extern struct starpu_opencl_program programs;
+
+/* OpenCL implementation of the codelet: scales the vector described by
+ * buffers[0] by the float that _args points to, using the
+ * "vector_mult_opencl" kernel loaded from the global "programs" object.
+ * Errors reported by the OpenCL calls are passed to
+ * STARPU_OPENCL_REPORT_ERROR. */
+void scal_opencl_func(void *buffers[], void *_args)
+{
+    float *factor = _args;
+    int id, devid, err;
+    cl_kernel kernel;
+    cl_command_queue queue;
+    cl_event event;
+
+    /* length of the vector */
+    unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
+    /* OpenCL copy of the vector pointer */
+    cl_mem val = (cl_mem) STARPU_VECTOR_GET_PTR(buffers[0]);
+
+    /* nothing to scale; besides, OpenCL rejects an empty global work size */
+    if (n == 0)
+        return;
+
+    id = starpu_worker_get_id();
+    devid = starpu_worker_get_devid(id);
+
+    err = starpu_opencl_load_kernel(&kernel, &queue, &programs,
+                    "vector_mult_opencl", devid);   /* Name of the kernel in the .cl file */
+    if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+    /* the arguments follow the order of the kernel's parameter list */
+    err = clSetKernelArg(kernel, 0, sizeof(val), &val);
+    err |= clSetKernelArg(kernel, 1, sizeof(n), &n);
+    err |= clSetKernelArg(kernel, 2, sizeof(*factor), factor);
+    if (err) STARPU_OPENCL_REPORT_ERROR(err);
+
+    {
+        /* The kernel scales the single element selected by
+         * get_global_id(0), so one work-item per element is needed: with a
+         * global size of 1 only val[0] would be scaled. Passing NULL as the
+         * local size lets the OpenCL implementation pick a work-group size. */
+        size_t global = n;
+        err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, &event);
+        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+    }
+
+    /* wait for the kernel to complete before handing the event over */
+    clFinish(queue);
+    starpu_opencl_collect_stats(event);
+    clReleaseEvent(event);
+
+    starpu_opencl_release_kernel(kernel);
+}

+ 25 - 0
doc/tutorial/vector_scal_opencl_kernel.cl

@@ -0,0 +1,25 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/* Each work-item scales the element of val selected by its global id. */
+__kernel void vector_mult_opencl(__global float* val, int nx, float factor)
+{
+        const int idx = get_global_id(0);
+
+        /* work-items past the end of the vector have nothing to do */
+        if (idx >= nx)
+                return;
+
+        val[idx] *= factor;
+}
+

+ 1 - 1
doc/vector_scal_c.texi

@@ -49,7 +49,7 @@ int main(int argc, char **argv)
 
 #ifdef STARPU_USE_OPENCL
         starpu_opencl_load_opencl_from_file(
-               "examples/basic_examples/vector_scal_opencl_kernel.cl", &programs);
+               "examples/basic_examples/vector_scal_opencl_kernel.cl", &programs, NULL);
 #endif
 
     /* Tell StaPU to associate the "vector" vector with the "vector_handle"

+ 2 - 1
doc/vector_scal_cuda.texi

@@ -1,9 +1,10 @@
 #include <starpu.h>
+#include <starpu_cuda.h>
 
 static __global__ void vector_mult_cuda(float *val, unsigned n,
                                         float factor)
 @{
-        unsigned i;
+        unsigned i =  blockIdx.x*blockDim.x + threadIdx.x;
         if (i < n)
                val[i] *= factor;
 @}

+ 3 - 3
doc/vector_scal_opencl.texi

@@ -13,8 +13,8 @@ void scal_opencl_func(void *buffers[], void *_args)
 
     /* length of the vector */
     unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
-    /* local copy of the vector pointer */
-    float *val = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);
+    /* OpenCL copy of the vector pointer */
+    cl_mem val = (cl_mem)STARPU_VECTOR_GET_PTR(buffers[0]);
 
     id = starpu_worker_get_id();
     devid = starpu_worker_get_devid(id);
@@ -23,7 +23,7 @@ void scal_opencl_func(void *buffers[], void *_args)
                                     devid);
     if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
-    err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &val);
+    err = clSetKernelArg(kernel, 0, sizeof(val), &val);
     err |= clSetKernelArg(kernel, 1, sizeof(n), &n);
     err |= clSetKernelArg(kernel, 2, sizeof(*factor), factor);
     if (err) STARPU_OPENCL_REPORT_ERROR(err);

+ 1 - 0
examples/.gitignore

@@ -0,0 +1 @@
+/.deps

+ 197 - 235
examples/Makefile.am

@@ -1,7 +1,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2009, 2010  Université de Bordeaux 1
-# Copyright (C) 2010  Centre National de la Recherche Scientifique
+# Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
+# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -14,24 +14,23 @@
 #
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
-AUTOMAKE_OPTIONS = subdir-objects
-
-AM_CFLAGS = $(HWLOC_CFLAGS) -Wall
-LIBS = $(top_builddir)/src/libstarpu.la $(HWLOC_LIBS) @LIBS@
+AM_CFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
+LIBS = $(top_builddir)/src/libstarpu.la $(MAGMA_LIBS) $(HWLOC_LIBS) @LIBS@
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include
+AM_LDFLAGS = $(STARPU_CUDA_LDFLAGS) $(STARPU_OPENCL_LDFLAGS)
 
-TESTS	=	$(check_PROGRAMS)
+SUBDIRS = stencil
 
-SUBDIRS = stencil stencil_ctx
+if STARPU_USE_SOCL
+SUBDIRS += socl
+endif
 
 if STARPU_HAVE_FFTW
-if STARPU_HAVE_FFTWL
+if STARPU_HAVE_FFTWF
 SUBDIRS += starpufft
 endif
 endif
 
-check_PROGRAMS =
-
 BUILT_SOURCES =
 
 if STARPU_USE_OPENCL
@@ -40,7 +39,9 @@ endif
 
 EXTRA_DIST = 					\
 	basic_examples/vector_scal_opencl_kernel.cl \
+	common/blas_model.c			\
 	spmv/spmv_cuda.cu			\
+	spmv/spmv_opencl.cl			\
 	gordon/null_kernel_gordon.c		\
 	mult/xgemm.c				\
 	lu/xlu.c				\
@@ -54,6 +55,7 @@ EXTRA_DIST = 					\
 	basic_examples/variable_kernels_opencl_kernel.cl	\
 	matvecmult/matvecmult_kernel.cl				\
 	basic_examples/block_opencl_kernel.cl			\
+	openmp/vector_scal.c			\
 	filters/fblock_opencl_kernel.cl
 
 CLEANFILES = 					\
@@ -64,7 +66,7 @@ CLEANFILES += *.gcno *.gcda *.linkinfo
 
 if STARPU_USE_CUDA
 
-NVCCFLAGS += --compiler-options -fno-strict-aliasing  $(HWLOC_CFLAGS) -I$(top_srcdir)/include/ -I$(top_builddir)/include/ -I$(top_srcdir)/examples/  -arch sm_13
+NVCCFLAGS += --compiler-options -fno-strict-aliasing  -I$(top_srcdir)/include/ -I$(top_builddir)/include/ -I$(top_srcdir)/examples/  $(HWLOC_CFLAGS) -arch sm_13
 
 .cu.o:
 	$(NVCC) $< -c -o $@ $(NVCCFLAGS)
@@ -106,11 +108,6 @@ noinst_HEADERS = 				\
 	lu/xlu_kernels.h			\
 	lu/float.h				\
 	lu/double.h				\
-	pi/pi.h					\
-	pi/SobolQRNG/sobol.h			\
-	pi/SobolQRNG/sobol_gold.h		\
-	pi/SobolQRNG/sobol_gpu.h		\
-	pi/SobolQRNG/sobol_primitives.h		\
 	cholesky/cholesky.h			\
 	common/blas_model.h			\
 	common/blas.h				\
@@ -122,22 +119,134 @@ noinst_HEADERS = 				\
 	ppm_downscaler/yuv_downscaler.h		\
 	spmv/matrix_market/mmio.h		\
 	spmv/matrix_market/mm_to_bcsr.h		\
-	spmv/dw_spmv.h				\
+	spmv/spmv.h				\
 	spmv/dw_block_spmv.h
 
+#####################################
+# What to install and what to check #
+#####################################
 
-##################
-# Basic examples #
-##################
+STARPU_EXAMPLES	=
+TESTS		=	$(STARPU_EXAMPLES)
+
+if STARPU_HAVE_WINDOWS
+check_PROGRAMS	=	$(STARPU_EXAMPLES)
+else
+check_PROGRAMS	=	$(LOADER) $(STARPU_EXAMPLES)
+endif
+
+if !STARPU_HAVE_WINDOWS
+## test loader program
+LOADER			=	loader
+LOADER_BIN		=	$(abs_top_builddir)/examples/$(LOADER)
+loader_SOURCES		=	../tests/loader.c
+TESTS_ENVIRONMENT	=	$(LOADER_BIN)
+endif
 
 examplebin_PROGRAMS +=				\
-	basic_examples/hello_world
+	basic_examples/hello_world		\
+	basic_examples/vector_scal		\
+	basic_examples/mult			\
+	basic_examples/block			\
+	basic_examples/variable			\
+	filters/fvector				\
+	filters/fblock				\
+	filters/fmatrix				\
+	tag_example/tag_example			\
+	tag_example/tag_example3		\
+	tag_example/tag_example2		\
+	tag_example/tag_restartable		\
+	spmv/spmv				\
+	callback/callback			\
+	incrementer/incrementer			\
+	matvecmult/matvecmult			\
+	profiling/profiling			\
+	scheduler/dummy_sched			\
+	reductions/dot_product			\
+	reductions/minmax_reduction		\
+	mandelbrot/mandelbrot			\
+	ppm_downscaler/ppm_downscaler		\
+	ppm_downscaler/yuv_downscaler
 
-basic_examples_hello_world_SOURCES =		\
-	basic_examples/hello_world.c
+if STARPU_HAVE_F77_H
+examplebin_PROGRAMS +=				\
+	basic_examples/vector_scal_fortran
+endif
+
+if !NO_BLAS_LIB
+examplebin_PROGRAMS +=				\
+	axpy/axpy				\
+	mult/sgemm 				\
+	mult/dgemm				\
+	cholesky/cholesky_tag			\
+	cholesky/cholesky_tile_tag		\
+	cholesky/cholesky_grain_tag		\
+	cholesky/cholesky_implicit		\
+	lu/lu_example_float			\
+	lu/lu_example_double			\
+	lu/lu_implicit_example_float		\
+	lu/lu_implicit_example_double		\
+	heat/heat				\
+	cg/cg
+endif
 
+if ATLAS_BLAS_LIB
 examplebin_PROGRAMS +=				\
-	basic_examples/vector_scal
+	spmv/dw_block_spmv
+endif
+
+STARPU_EXAMPLES +=				\
+	basic_examples/hello_world		\
+	basic_examples/vector_scal		\
+	basic_examples/mult			\
+	basic_examples/block			\
+	basic_examples/variable			\
+	filters/fvector				\
+	filters/fblock				\
+	filters/fmatrix				\
+	tag_example/tag_example			\
+	tag_example/tag_example3		\
+	tag_example/tag_example2		\
+	tag_example/tag_restartable		\
+	spmv/spmv				\
+	callback/callback			\
+	incrementer/incrementer			\
+	matvecmult/matvecmult			\
+	profiling/profiling			\
+	scheduler/dummy_sched			\
+	reductions/dot_product			\
+	reductions/minmax_reduction
+
+if STARPU_HAVE_F77_H
+STARPU_EXAMPLES +=				\
+	basic_examples/vector_scal_fortran
+endif
+
+if !NO_BLAS_LIB
+STARPU_EXAMPLES +=				\
+	axpy/axpy				\
+	mult/sgemm 				\
+	mult/dgemm				\
+	cholesky/cholesky_tag			\
+	cholesky/cholesky_tile_tag		\
+	cholesky/cholesky_grain_tag		\
+	cholesky/cholesky_implicit		\
+	lu/lu_example_float			\
+	lu/lu_example_double			\
+	lu/lu_implicit_example_float		\
+	lu/lu_implicit_example_double		\
+	heat/heat				\
+	cg/cg
+endif
+
+if ATLAS_BLAS_LIB
+STARPU_EXAMPLES +=				\
+	spmv/dw_block_spmv
+endif
+
+##################
+# Basic examples #
+##################
 
 basic_examples_vector_scal_SOURCES =		\
 	basic_examples/vector_scal.c		\
@@ -156,9 +265,6 @@ nobase_STARPU_OPENCL_DATA_DATA += \
 endif
 
 if STARPU_HAVE_F77_H
-examplebin_PROGRAMS +=				\
-	basic_examples/vector_scal_fortran
-
 basic_examples_vector_scal_fortran_SOURCES =	\
 	basic_examples/vector_scal_fortran.F	\
 	basic_examples/vector_scal_c.c		\
@@ -167,25 +273,15 @@ basic_examples_vector_scal_fortran_SOURCES =	\
 if STARPU_USE_CUDA
 basic_examples_vector_scal_fortran_SOURCES +=	\
 	basic_examples/vector_scal_cuda.cu
+basic_examples_vector_scal_fortran_LDADD =	\
+	$(STARPU_CUDA_FORTRAN_LDFLAGS)
 endif
 endif
 
-examplebin_PROGRAMS +=				\
-	basic_examples/mult
-
-basic_examples_mult_SOURCES =			\
-	basic_examples/mult.c
-
 #################
 # block example #
 #################
 
-check_PROGRAMS +=				\
-	basic_examples/block
-
-examplebin_PROGRAMS +=				\
-	basic_examples/block
-
 basic_examples_block_SOURCES =			\
 	basic_examples/block.c			\
 	basic_examples/block_cpu.c
@@ -206,12 +302,6 @@ endif
 # Variable example #
 ####################
 
-check_PROGRAMS +=				\
-	basic_examples/variable
-
-examplebin_PROGRAMS +=				\
-	basic_examples/variable
-
 basic_examples_variable_SOURCES =		\
 	basic_examples/variable.c		\
 	basic_examples/variable_kernels_cpu.c
@@ -232,14 +322,6 @@ endif
 # Filters #
 ###########
 
-examplebin_PROGRAMS +=				\
-	filters/fvector				\
-	filters/fblock				\
-	filters/fmatrix
-
-filters_fvector_SOURCES =			\
-	filters/fvector.c
-
 filters_fblock_SOURCES =			\
 	filters/fblock.c			\
 	filters/fblock_cpu.c
@@ -255,69 +337,17 @@ nobase_STARPU_OPENCL_DATA_DATA += \
 	filters/fblock_opencl_kernel.cl
 endif
 
-filters_fmatrix_SOURCES =			\
-	filters/fmatrix.c
-
-###################
-# PPM downscaling #
-###################
-
-examplebin_PROGRAMS +=				\
-	ppm_downscaler/ppm_downscaler
-
-ppm_downscaler_ppm_downscaler_SOURCES =		\
-	ppm_downscaler/ppm_downscaler.c
-
-examplebin_PROGRAMS +=				\
-	ppm_downscaler/yuv_downscaler
-
-ppm_downscaler_yuv_downscaler_SOURCES =		\
-	ppm_downscaler/yuv_downscaler.c
-
-######
-# Pi #
-######
-
-check_PROGRAMS +=				\
-	pi/pi_redux
-
-examplebin_PROGRAMS +=				\
-	pi/pi					\
-	pi/pi_redux
-
-pi_pi_SOURCES =					\
-	pi/pi.c					\
-	pi/SobolQRNG/sobol_gold.c		\
-	pi/SobolQRNG/sobol_primitives.c
-
-if STARPU_USE_CUDA
-pi_pi_SOURCES +=				\
-	pi/pi_kernel.cu				\
-	pi/SobolQRNG/sobol_gpu.cu
-endif
-
-pi_pi_redux_SOURCES =				\
-	pi/pi_redux.c
-
-if STARPU_USE_CUDA
-pi_pi_redux_SOURCES +=				\
-	pi/pi_redux_kernel.cu
-endif
-
-
 ################
 # AXPY example #
 ################
 
 if !NO_BLAS_LIB
-
-examplebin_PROGRAMS +=				\
-	axpy/axpy
-
 axpy_axpy_SOURCES =				\
 	axpy/axpy.c				\
 	common/blas.c
 
+axpy_axpy_LDADD =				\
+	$(STARPU_BLAS_LDFLAGS)
 endif
 
 ################
@@ -326,18 +356,20 @@ endif
 
 if !NO_BLAS_LIB
 
-examplebin_PROGRAMS += 				\
-	mult/sgemm 				\
-	mult/dgemm
-
 mult_sgemm_SOURCES = 				\
 	mult/sgemm.c				\
 	common/blas.c
 
+mult_sgemm_LDADD =				\
+	$(STARPU_BLAS_LDFLAGS)
+
 mult_dgemm_SOURCES = 				\
 	mult/dgemm.c				\
 	common/blas.c
 
+mult_dgemm_LDADD =				\
+	$(STARPU_BLAS_LDFLAGS)
+
 endif
 
 ####################
@@ -346,36 +378,42 @@ endif
 
 if !NO_BLAS_LIB
 
-examplebin_PROGRAMS += 				\
-	cholesky/cholesky_tag			\
-	cholesky/cholesky_tile_tag		\
-	cholesky/cholesky_grain_tag		\
-	cholesky/cholesky_implicit
-
 cholesky_cholesky_tag_SOURCES =			\
 	cholesky/cholesky_tag.c			\
 	cholesky/cholesky_models.c		\
 	cholesky/cholesky_kernels.c		\
 	common/blas.c
 
+cholesky_cholesky_tag_LDADD =			\
+	$(STARPU_BLAS_LDFLAGS)
+
 cholesky_cholesky_tile_tag_SOURCES =		\
 	cholesky/cholesky_tile_tag.c		\
 	cholesky/cholesky_models.c		\
 	cholesky/cholesky_kernels.c		\
 	common/blas.c
 
+cholesky_cholesky_tile_tag_LDADD =		\
+	$(STARPU_BLAS_LDFLAGS)
+
 cholesky_cholesky_grain_tag_SOURCES =		\
 	cholesky/cholesky_grain_tag.c		\
 	cholesky/cholesky_models.c		\
 	cholesky/cholesky_kernels.c		\
 	common/blas.c
 
+cholesky_cholesky_grain_tag_LDADD =		\
+	$(STARPU_BLAS_LDFLAGS)
+
 cholesky_cholesky_implicit_SOURCES =		\
 	cholesky/cholesky_implicit.c		\
 	cholesky/cholesky_models.c		\
 	cholesky/cholesky_kernels.c		\
 	common/blas.c
 
+cholesky_cholesky_implicit_LDADD =		\
+	$(STARPU_BLAS_LDFLAGS)
+
 endif
 
 ##############
@@ -384,14 +422,6 @@ endif
 
 if !NO_BLAS_LIB
 
-check_PROGRAMS +=				\
-	lu/lu_example_float			\
-	lu/lu_implicit_example_float
-
-examplebin_PROGRAMS += 				\
-	lu/lu_example_float			\
-	lu/lu_example_double
-
 lu_lu_example_float_SOURCES =			\
 	lu/lu_example_float.c			\
 	lu/slu.c				\
@@ -399,6 +429,9 @@ lu_lu_example_float_SOURCES =			\
 	lu/slu_kernels.c			\
 	common/blas.c
 
+lu_lu_example_float_LDADD =			\
+	$(STARPU_BLAS_LDFLAGS)
+
 lu_lu_example_double_SOURCES =			\
 	lu/lu_example_double.c			\
 	lu/dlu.c				\
@@ -406,9 +439,8 @@ lu_lu_example_double_SOURCES =			\
 	lu/dlu_kernels.c			\
 	common/blas.c
 
-examplebin_PROGRAMS += 				\
-	lu/lu_implicit_example_float		\
-	lu/lu_implicit_example_double
+lu_lu_example_double_LDADD =			\
+	$(STARPU_BLAS_LDFLAGS)
 
 lu_lu_implicit_example_float_SOURCES =		\
 	lu/lu_example_float.c			\
@@ -417,6 +449,9 @@ lu_lu_implicit_example_float_SOURCES =		\
 	lu/slu_kernels.c			\
 	common/blas.c
 
+lu_lu_implicit_example_float_LDADD =		\
+	$(STARPU_BLAS_LDFLAGS)
+
 lu_lu_implicit_example_double_SOURCES =		\
 	lu/lu_example_double.c			\
 	lu/dlu_implicit.c			\
@@ -424,6 +459,8 @@ lu_lu_implicit_example_double_SOURCES =		\
 	lu/dlu_kernels.c			\
 	common/blas.c
 
+lu_lu_implicit_example_double_LDADD =		\
+	$(STARPU_BLAS_LDFLAGS)
 endif
 
 ###########################
@@ -448,8 +485,6 @@ endif
 
 if !NO_BLAS_LIB
 
-examplebin_PROGRAMS += heat/heat
-
 heat_heat_SOURCES =				\
 	heat/heat.c				\
 	heat/dw_factolu.c			\
@@ -462,6 +497,10 @@ heat_heat_SOURCES =				\
 	heat/dw_factolu_kernels.c		\
 	common/blas.c
 
+heat_heat_LDADD =				\
+	$(STARPU_OPENGL_RENDER_LDFLAGS)		\
+	$(STARPU_BLAS_LDFLAGS)
+
 endif
 
 ##############
@@ -470,8 +509,6 @@ endif
 
 if !NO_BLAS_LIB
 
-examplebin_PROGRAMS += cg/cg
-
 cg_cg_SOURCES =					\
 	cg/cg.c					\
 	cg/cg_kernels.c				\
@@ -482,62 +519,33 @@ cg_cg_SOURCES +=				\
 	cg/cg_dot_kernel.cu
 endif
 
-endif
-
+cg_cg_LDADD =					\
+	$(STARPU_BLAS_LDFLAGS)
 
-
-################
-# Tag examples #
-################
-
-check_PROGRAMS +=			\
-	tag_example/tag_example			\
-	tag_example/tag_example3			\
-	tag_example/tag_example2	\
-	tag_example/tag_restartable
-
-examplebin_PROGRAMS +=			\
-	tag_example/tag_example			\
-	tag_example/tag_example3		\
-	tag_example/tag_example2	\
-	tag_example/tag_restartable
-
-tag_example_tag_example_SOURCES =		\
-	tag_example/tag_example.c
-
-tag_example_tag_example2_SOURCES =		\
-	tag_example/tag_example2.c
-
-tag_example_tag_example3_SOURCES =		\
-	tag_example/tag_example3.c
-
-tag_example_tag_restartable_SOURCES =		\
-	tag_example/tag_restartable.c
+endif
 
 ################
 # SpMV example #
 ################
 
-examplebin_PROGRAMS += 				\
-	spmv/dw_spmv
-
-spmv_dw_spmv_SOURCES = 				\
-	spmv/dw_spmv.c
+spmv_spmv_SOURCES = 				\
+	spmv/spmv.c				\
+	spmv/spmv_kernels.c
 
 if STARPU_USE_CUDA
-spmv_dw_spmv_SOURCES +=				\
+spmv_spmv_SOURCES +=				\
 	spmv/spmv_cuda.cu
 endif
 
 if ATLAS_BLAS_LIB
-examplebin_PROGRAMS += 				\
-	spmv/dw_block_spmv
-
 spmv_dw_block_spmv_SOURCES =			\
 	spmv/dw_block_spmv.c			\
 	spmv/dw_block_spmv_kernels.c		\
 	spmv/matrix_market/mm_to_bcsr.c		\
 	spmv/matrix_market/mmio.c
+
+spmv_dw_block_spmv_LDADD =			\
+	$(STARPU_BLAS_LDFLAGS)
 endif
 
 #######################
@@ -545,12 +553,6 @@ endif
 #######################
 
 
-check_PROGRAMS +=				\
-	incrementer/incrementer
-
-examplebin_PROGRAMS +=				\
-	incrementer/incrementer
-
 incrementer_incrementer_SOURCES =	\
 	incrementer/incrementer.c
 if STARPU_USE_CUDA
@@ -568,78 +570,38 @@ endif
 # matVecMult example #
 ######################
 
-check_PROGRAMS +=				\
-	matvecmult/matvecmult
-
-examplebin_PROGRAMS +=				\
-	matvecmult/matvecmult
-
-matvecmult_matvecmult_SOURCES =	\
-	matvecmult/matvecmult.c
-
 if STARPU_USE_OPENCL
 nobase_STARPU_OPENCL_DATA_DATA += \
 	matvecmult/matvecmult_kernel.cl
 endif
 
-#####################
-# profiling example #
-#####################
-
-check_PROGRAMS +=				\
-	profiling/profiling
-
-examplebin_PROGRAMS +=				\
-	profiling/profiling
-
-profiling_profiling_SOURCES =			\
-	profiling/profiling.c
-
-#####################
-# scheduler example #
-#####################
-
-check_PROGRAMS +=				\
-	scheduler/dummy_sched
-
-examplebin_PROGRAMS +=				\
-	scheduler/dummy_sched
-
-scheduler_dummy_sched_SOURCES =			\
-	scheduler/dummy_sched.c
-
 #######################
 # dot_product example #
 #######################
 
-check_PROGRAMS +=				\
-	reductions/dot_product
-
-examplebin_PROGRAMS +=				\
-	reductions/dot_product
-
 reductions_dot_product_SOURCES =		\
 	reductions/dot_product.c
-
-#####################
-# Min/Max reduction #
-#####################
-
-check_PROGRAMS +=				\
-	reductions/minmax_reduction
-
-examplebin_PROGRAMS +=				\
-	reductions/minmax_reduction
-
-reductions_minmax_reduction_SOURCES =		\
-	reductions/minmax_reduction.c
+if STARPU_USE_CUDA
+reductions_dot_product_SOURCES +=		\
+	reductions/dot_product_kernels.cu
+endif
 
 ##################
 # Mandelbrot Set #
 ##################
 
+mandelbrot_mandelbrot_CPPFLAGS = $(AM_CFLAGS) $(AM_CPPFLAGS)
+if HAVE_X11
+mandelbrot_mandelbrot_CPPFLAGS += $(X_CFLAGS)
+mandelbrot_mandelbrot_LDADD = $(X_PRE_LIBS) -lX11 $(X_LIBS) $(X_EXTRA_LIBS)
+endif
+
+################
+# Top Examples #
+################
+
 examplebin_PROGRAMS +=				\
-	mandelbrot/mandelbrot
+	top/hello_world_top
 
-mandelbrot_mandelbrot_SOURCES =			\
-	mandelbrot/mandelbrot.c
+top_hello_world_top_SOURCES =			\
+	top/hello_world_top.c

+ 5 - 7
examples/audio/starpu_audio_processing.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
@@ -30,7 +30,7 @@
 #include <cufft.h>
 #endif
 
-//#define SAVE_RAW	1
+/* #define SAVE_RAW	1 */
 
 #define DEFAULTINPUTFILE	"input.wav"
 #define DEFAULTOUTPUTFILE	"output.wav"
@@ -328,14 +328,14 @@ static void init_problem(void)
 	/* allocate a buffer to store the content of input file */
 	if (use_pin)
 	{
-		starpu_data_malloc_pinned_if_possible((void **)&A, length_data*sizeof(float));
+		starpu_malloc((void **)&A, length_data*sizeof(float));
 	}
 	else {
 		A = malloc(length_data*sizeof(float));
 	}
 
 	/* allocate working buffer (this could be done online, but we'll keep it simple) */
-	//starpu_data_malloc_pinned_if_possible((void **)&outdata, length_data*sizeof(fftwf_complex));
+	/* starpu_data_malloc_pinned_if_possible((void **)&outdata, length_data*sizeof(fftwf_complex)); */
 
 	/* read input data into buffer "A" */
 	read_16bit_wav(infile, length_data, A, infile_raw);
@@ -396,9 +396,7 @@ int main(int argc, char **argv)
 	struct starpu_data_filter f = 
 	{
 		.filter_func = starpu_block_filter_func_vector,
-		.nchildren = niter,
-		.get_nchildren = NULL,
-		.get_child_ops = NULL
+		.nchildren = niter
 	};
 
 	starpu_data_partition(A_handle, &f);

+ 18 - 13
examples/axpy/axpy.c

@@ -1,8 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -36,6 +36,8 @@
 
 #define NBLOCKS	8
 
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
 TYPE *vec_x, *vec_y;
 
 /* descriptors for StarPU */
@@ -93,21 +95,21 @@ int main(int argc, char **argv)
 		vec_a = malloc(N*sizeof(TYPE));
 		vec_b = malloc(N*sizeof(TYPE));
 	*/
-	starpu_data_malloc_pinned_if_possible((void **)&vec_x, N*sizeof(TYPE));
+	starpu_malloc((void **)&vec_x, N*sizeof(TYPE));
 	assert(vec_x);
 
-	starpu_data_malloc_pinned_if_possible((void **)&vec_y, N*sizeof(TYPE));
+	starpu_malloc((void **)&vec_y, N*sizeof(TYPE));
 	assert(vec_y);
 
 	unsigned i;
 	for (i = 0; i < N; i++)
 	{
-		vec_x[i] = 1.0f;//(TYPE)starpu_drand48();
-		vec_y[i] = 4.0f;//(TYPE)starpu_drand48();
+		vec_x[i] = 1.0f; /*(TYPE)starpu_drand48(); */
+		vec_y[i] = 4.0f; /*(TYPE)starpu_drand48(); */
 	}
 
-	fprintf(stderr, "BEFORE x[0] = %2.2f\n", vec_x[0]);
-	fprintf(stderr, "BEFORE y[0] = %2.2f\n", vec_y[0]);
+	FPRINTF(stderr, "BEFORE x[0] = %2.2f\n", vec_x[0]);
+	FPRINTF(stderr, "BEFORE y[0] = %2.2f\n", vec_y[0]);
 
 	/* Declare the data to StarPU */
 	starpu_vector_data_register(&handle_x, 0, (uintptr_t)vec_x, N, sizeof(TYPE));
@@ -116,9 +118,7 @@ int main(int argc, char **argv)
 	/* Divide the vector into blocks */
 	struct starpu_data_filter block_filter = {
 		.filter_func = starpu_block_filter_func_vector,
-		.nchildren = NBLOCKS,
-		.get_nchildren = NULL,
-		.get_child_ops = NULL
+		.nchildren = NBLOCKS
 	};
 
 	starpu_data_partition(handle_x, &block_filter);
@@ -151,16 +151,21 @@ int main(int argc, char **argv)
 
 	starpu_task_wait_for_all();
 
+	starpu_data_unpartition(handle_x, 0);
 	starpu_data_unpartition(handle_y, 0);
+	starpu_data_unregister(handle_x);
 	starpu_data_unregister(handle_y);
 
 	gettimeofday(&end, NULL);
         double timing = (double)((end.tv_sec - start.tv_sec)*1000000 +
                                         (end.tv_usec - start.tv_usec));
 
-	fprintf(stderr, "timing -> %2.2f us %2.2f MB/s\n", timing, 3*N*sizeof(TYPE)/timing);
+	FPRINTF(stderr, "timing -> %2.2f us %2.2f MB/s\n", timing, 3*N*sizeof(TYPE)/timing);
+
+	FPRINTF(stderr, "AFTER y[0] = %2.2f (ALPHA = %2.2f)\n", vec_y[0], alpha);
 
-	fprintf(stderr, "AFTER y[0] = %2.2f (ALPHA = %2.2f)\n", vec_y[0], alpha);
+	starpu_free((void *)vec_x);
+	starpu_free((void *)vec_y);
 
 	/* Stop StarPU */
 	starpu_shutdown();

+ 13 - 10
examples/basic_examples/block.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -20,6 +20,8 @@
 #include <pthread.h>
 #include <math.h>
 
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
 extern void cpu_codelet(void *descr[], void *_args);
 #ifdef STARPU_USE_CUDA
 extern void cuda_codelet(void *descr[], void *_args);
@@ -52,24 +54,23 @@ int execute_on(uint32_t where, device_func func, float *block, int pnx, int pny,
         task->buffers[0].handle = block_handle;
         task->buffers[0].mode = STARPU_RW;
 	task->cl_arg = &multiplier;
+	task->cl_arg_size = sizeof(multiplier);
 
         int ret = starpu_task_submit(task);
         if (STARPU_UNLIKELY(ret == -ENODEV)) {
-                fprintf(stderr, "No worker may execute this task\n");
+                FPRINTF(stderr, "No worker may execute this task\n");
                 return 1;
 	}
 
 	starpu_task_wait_for_all();
 
 	/* update the array in RAM */
-        starpu_data_acquire(block_handle, STARPU_R);
+	starpu_data_unregister(block_handle);
 
         for(i=0 ; i<pnx*pny*pnz; i++) {
-          fprintf(stderr, "%f ", block[i]);
+          FPRINTF(stderr, "%f ", block[i]);
         }
-        fprintf(stderr, "\n");
-
-        starpu_data_release(block_handle);
+        FPRINTF(stderr, "\n");
 
         return 0;
 }
@@ -98,7 +99,7 @@ int main(int argc, char **argv)
         ret = execute_on(STARPU_CPU, cpu_codelet, block, nx, ny, nz, 1.0);
         if (!ret) multiplier *= 1.0;
 #ifdef STARPU_USE_OPENCL
-        starpu_opencl_load_opencl_from_file("examples/basic_examples/block_opencl_kernel.cl", &opencl_code);
+        starpu_opencl_load_opencl_from_file("examples/basic_examples/block_opencl_kernel.cl", &opencl_code, NULL);
         ret = execute_on(STARPU_OPENCL, opencl_codelet, block, nx, ny, nz, 2.0);
         if (!ret) multiplier *= 2.0;
 #endif
@@ -107,7 +108,7 @@ int main(int argc, char **argv)
         if (!ret) multiplier *= 3.0;
 #endif
 
-        // Check result is correct
+        /* Check result is correct */
         ret=1;
         for(i=0 ; i<nx*ny*nz ; i++) {
           if (block[i] != (i+1) * multiplier) {
@@ -116,7 +117,9 @@ int main(int argc, char **argv)
           }
         }
 
-        fprintf(stderr,"TEST %s\n", ret==1?"PASSED":"FAILED");
+        FPRINTF(stderr,"TEST %s\n", ret==1?"PASSED":"FAILED");
+	free(block);
+
         starpu_shutdown();
 
 	return 0;

+ 7 - 8
examples/basic_examples/block_opencl.c

@@ -25,8 +25,8 @@ void opencl_codelet(void *descr[], void *_args)
 	cl_kernel kernel;
 	cl_command_queue queue;
 	cl_event event;
-	int id, devid, err, n;
-	float *block = (float *)STARPU_BLOCK_GET_PTR(descr[0]);
+	int id, devid, err;
+	cl_mem block = (cl_mem)STARPU_BLOCK_GET_PTR(descr[0]);
 	int nx = (int)STARPU_BLOCK_GET_NX(descr[0]);
 	int ny = (int)STARPU_BLOCK_GET_NY(descr[0]);
 	int nz = (int)STARPU_BLOCK_GET_NZ(descr[0]);
@@ -41,14 +41,13 @@ void opencl_codelet(void *descr[], void *_args)
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
 	err = 0;
-        n=0;
-	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &block);
-	err = clSetKernelArg(kernel, 1, sizeof(int), &nx);
-	err = clSetKernelArg(kernel, 2, sizeof(int), &ny);
-	err = clSetKernelArg(kernel, 3, sizeof(int), &nz);
+	err = clSetKernelArg(kernel, 0, sizeof(block), &block);
+	err = clSetKernelArg(kernel, 1, sizeof(nx), &nx);
+	err = clSetKernelArg(kernel, 2, sizeof(ny), &ny);
+	err = clSetKernelArg(kernel, 3, sizeof(nz), &nz);
 	err = clSetKernelArg(kernel, 4, sizeof(ldy), &ldy);
 	err = clSetKernelArg(kernel, 5, sizeof(ldz), &ldz);
-	err = clSetKernelArg(kernel, 6, sizeof(float), multiplier);
+	err = clSetKernelArg(kernel, 6, sizeof(*multiplier), multiplier);
         if (err) STARPU_OPENCL_REPORT_ERROR(err);
 
 	{

+ 21 - 15
examples/basic_examples/hello_world.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -31,12 +31,14 @@
 #include <stdint.h>
 #include <starpu.h>
 
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
 /* When the task is done, task->callback_func(task->callback_arg) is called. Any
  * callback function must have the prototype void (*)(void *).
  * NB: Callbacks are NOT allowed to perform potentially blocking operations */
 void callback_func(void *callback_arg)
 {
-	printf("Callback function got argument %p\n", callback_arg);
+        FPRINTF(stdout, "Callback function got argument %p\n", callback_arg);
 }
 
 /* Every implementation of a codelet must have this prototype, the first
@@ -52,22 +54,16 @@ void cpu_func(void *buffers[], void *cl_arg)
 {
 	struct params *params = cl_arg;
 
-	printf("Hello world (params = {%i, %f} )\n", params->i, params->f);
+	FPRINTF(stdout, "Hello world (params = {%i, %f} )\n", params->i, params->f);
 }
 
-starpu_codelet cl =
-{
-	/* this codelet may only be executed on a CPU, and its cpu
- 	 * implementation is function "cpu_func" */
-	.where = STARPU_CPU,
-	.cpu_func = cpu_func,
-	/* the codelet does not manipulate any data that is managed
-	 * by our DSM */
-	.nbuffers = 0
-};
+starpu_codelet cl;
 
 int main(int argc, char **argv)
 {
+	struct starpu_task *task;
+	struct params params = {1, 2.0f};
+
 	/* initialize StarPU : passing a NULL argument means that we use
  	* default configuration for the scheduling policies and the number of
 	* processors/accelerators */
@@ -76,7 +72,15 @@ int main(int argc, char **argv)
 	/* create a new task that is non-blocking by default : the task is not
 	 * submitted to the scheduler until the starpu_task_submit function is
 	 * called */
-	struct starpu_task *task = starpu_task_create();
+	task = starpu_task_create();
+
+	/* this codelet may only be executed on a CPU, and its cpu
+ 	 * implementation is function "cpu_func" */
+	cl.where = STARPU_CPU;
+	cl.cpu_func = cpu_func;
+	/* the codelet does not manipulate any data that is managed
+	 * by our DSM */
+	cl.nbuffers = 0;
 
 	/* the task uses codelet "cl" */
 	task->cl = &cl;
@@ -89,7 +93,6 @@ int main(int argc, char **argv)
 	 * is read-only so that any modification is not passed to other copies
 	 * of the buffer.  For this reason, a buffer passed as a codelet
 	 * argument (cl_arg) is NOT a valid synchronization medium! */
-	struct params params = { 1, 2.0f };
 	task->cl_arg = &params;
 	task->cl_arg_size = sizeof(params);
 		
@@ -103,6 +106,9 @@ int main(int argc, char **argv)
 	
 	/* submit the task to StarPU */
 	starpu_task_submit(task);
+
+	/* destroy the task */
+	starpu_task_destroy(task);
 	
 	/* terminate StarPU: statistics and other debug outputs are not
 	 * guaranteed to be generated unless this function is called. Once it

+ 36 - 79
examples/basic_examples/mult.c

@@ -1,8 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -28,8 +28,7 @@
  *    monitoring data (starpu_data_unregister)
  *  - how to manipulate subsets of data (starpu_data_get_sub_data)
  *  - how to construct an autocalibrated performance model (starpu_perfmodel_t)
- *  - how to submit asynchronous tasks and how to use callback to handle task
- *    termination
+ *  - how to submit asynchronous tasks
  */
 
 #include <string.h>
@@ -44,11 +43,6 @@
 static float *A, *B, *C;
 static starpu_data_handle A_handle, B_handle, C_handle;
 
-static pthread_mutex_t mutex;
-static pthread_cond_t cond;
-static unsigned taskcounter;
-static unsigned terminated = 0;
-
 static unsigned nslicesx = 4;
 static unsigned nslicesy = 4;
 static unsigned xdim = 1024;
@@ -77,37 +71,11 @@ static unsigned zdim = 512;
 
  */
 
-static void callback_func(void *arg)
-{
-	/* the argument is a pointer to a counter of the remaining tasks */
-	int *counterptr = arg;
-
-	/* counterptr points to a variable with the number of remaining tasks,
- 	 * when it reaches 0, all tasks are done */
-	int counter = STARPU_ATOMIC_ADD(counterptr, -1);
-	if (counter == 0)
-	{
-		/* IMPORTANT : note that we CANNOT call blocking operations
-		 * within callbacks as it may lead to a deadlock of StarPU.
-		 * starpu_data_unpartition is for instance called by the main
-		 * thread since it may cause /potentially/ blocking operations
-		 * such as memory transfers from a GPU to a CPU. */
-		
-		/* wake the application to notify the termination of all the
- 		 * tasks */
-		pthread_mutex_lock(&mutex);
-		terminated = 1;
-		pthread_cond_signal(&cond);
-		pthread_mutex_unlock(&mutex);
-	}
-}
-
 /*
  * The codelet is passed 3 matrices, the "descr" union-type field gives a
  * description of the layout of those 3 matrices in the local memory (ie. RAM
  * in the case of CPU, GPU frame buffer in the case of GPU etc.). Since we have
- * registered data with the "blas" data interface, we manipulate the .blas
- * field of the descr[x] elements which are union types.
+ * registered data with the "matrix" data interface, we use the matrix macros.
  */
 
 static void cpu_mult(void *descr[], __attribute__((unused))  void *arg)
@@ -218,18 +186,14 @@ static void partition_mult_data(void)
 	/* StarPU supplies some basic filters such as the partition of a matrix
 	 * into blocks, note that we are using a FORTRAN ordering so that the
 	 * name of the filters are a bit misleading */
-	struct starpu_data_filter f = {
+	struct starpu_data_filter vert = {
 		.filter_func = starpu_vertical_block_filter_func,
-		.nchildren = nslicesx,
-		.get_nchildren = NULL,
-		.get_child_ops = NULL
+		.nchildren = nslicesx
 	};
 		
-	struct starpu_data_filter f2 = {
+	struct starpu_data_filter horiz = {
 		.filter_func = starpu_block_filter_func,
-		.nchildren = nslicesy,
-		.get_nchildren = NULL,
-		.get_child_ops = NULL
+		.nchildren = nslicesy
 	};
 		
 /*
@@ -269,17 +233,17 @@ static void partition_mult_data(void)
  *	enforce memory consistency.
  */
 
-	starpu_data_partition(B_handle, &f);
-	starpu_data_partition(A_handle, &f2);
+	starpu_data_partition(B_handle, &vert);
+	starpu_data_partition(A_handle, &horiz);
 
 	/* starpu_data_map_filters is a variable-arity function, the first argument
 	 * is the handle of the data to partition, the second argument is the
 	 * number of filters to apply recursively. Filters are applied in the
 	 * same order as the arguments.
-	 * This would be equivalent to starpu_data_partition(C_handle, &f) and
-	 * then applying f2 on each sub-data (ie. each column of C)
+	 * This would be equivalent to starpu_data_partition(C_handle, &vert) and
+	 * then applying horiz on each sub-data (ie. each column of C)
 	 */
-	starpu_data_map_filters(C_handle, 2, &f, &f2);
+	starpu_data_map_filters(C_handle, 2, &vert, &horiz);
 }
 
 static struct starpu_perfmodel_t mult_perf_model = {
@@ -287,28 +251,23 @@ static struct starpu_perfmodel_t mult_perf_model = {
 	.symbol = "mult_perf_model"
 };
 
+/* Codelet used by every task submitted from launch_tasks(). */
+static starpu_codelet cl = {
+        /* this kernel only has a CPU implementation for now */
+        .where = STARPU_CPU,
+        /* CPU implementation of the codelet */
+        .cpu_func = cpu_mult,
+        /* the codelet manipulates 3 buffers that are managed by the
+         * DSM */
+        .nbuffers = 3,
+        /* performance model, for scheduling policies that make use of one */
+        .model = &mult_perf_model
+};
+
 static void launch_tasks(void)
 {
 	/* partition the work into slices */
 	unsigned taskx, tasky;
 
-	/* the callback decrements this value every time a task is terminated
-	 * and notify the termination of the computation to the application
-	 * when the counter reaches 0 */
-	taskcounter = nslicesx * nslicesy;
-
-	starpu_codelet cl = {
-		/* we can only execute that kernel on a CPU yet */
-		.where = STARPU_CPU,
-		/* CPU implementation of the codelet */
-		.cpu_func = cpu_mult,
-		/* the codelet manipulates 3 buffers that are managed by the
- 		 * DSM */
-		.nbuffers = 3,
-		/* in case the scheduling policy may use performance models */
-		.model = &mult_perf_model
-	};
-
 	for (taskx = 0; taskx < nslicesx; taskx++) 
 	{
 		for (tasky = 0; tasky < nslicesy; tasky++)
@@ -322,9 +281,6 @@ static void launch_tasks(void)
 			/* this task implements codelet "cl" */
 			task->cl = &cl;
 
-			task->callback_func = callback_func;
-			task->callback_arg = &taskcounter;
-
 			/*
 			 *              |---|---|---|---|
 			 *              |   | * |   |   | B
@@ -371,9 +327,6 @@ static void launch_tasks(void)
 int main(__attribute__ ((unused)) int argc, 
 	 __attribute__ ((unused)) char **argv)
 {
-	pthread_mutex_init(&mutex, NULL);
-	pthread_cond_init(&cond, NULL);
-
 	/* start the runtime */
 	starpu_init(NULL);
 
@@ -387,26 +340,30 @@ int main(__attribute__ ((unused)) int argc,
 	/* submit all tasks in an asynchronous fashion */
 	launch_tasks();
 
-	/* the different tasks are asynchronous so we use a callback to get
-	 * notified of the termination of the computation */
-	pthread_mutex_lock(&mutex);
-	if (!terminated)
-		pthread_cond_wait(&cond, &mutex);
-	pthread_mutex_unlock(&mutex);
+	/* wait for termination */
+        starpu_task_wait_for_all();
 
 	/* remove the filters applied by the means of starpu_data_map_filters; now
  	 * it's not possible to manipulate a subset of C using starpu_data_get_sub_data until
 	 * starpu_data_map_filters is called again on C_handle.
 	 * The second argument is the memory node where the different subsets
 	 * should be reassembled, 0 = main memory (RAM) */
+	starpu_data_unpartition(A_handle, 0);
+	starpu_data_unpartition(B_handle, 0);
 	starpu_data_unpartition(C_handle, 0);
 
 	/* stop monitoring matrices A, B and C : after this, it is not possible
 	 * to pass them (or any subset of them) as a codelet input/output. This
 	 * also implements a barrier so that the piece of data is put back into
 	 * main memory in case it was only available on a GPU for instance. */
+	starpu_data_unregister(A_handle);
+	starpu_data_unregister(B_handle);
 	starpu_data_unregister(C_handle);
-	
+
+	free(A);
+	free(B);
+	free(C);
+
 	starpu_shutdown();
 
 	return 0;

+ 9 - 6
examples/basic_examples/variable.c

@@ -18,6 +18,8 @@
 #include <starpu.h>
 #include <pthread.h>
 
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
 static unsigned niter = 50000;
 
 extern void cpu_codelet(void *descr[], __attribute__ ((unused)) void *_args);
@@ -41,6 +43,9 @@ int main(int argc, char **argv)
 
 	starpu_init(NULL);
 
+#ifdef STARPU_SLOW_MACHINE
+	niter /= 100;
+#endif
         if (argc == 2) niter = atoi(argv[1]);
         foo = 0.0f;
 
@@ -48,7 +53,7 @@ int main(int argc, char **argv)
                                       (uintptr_t)&foo, sizeof(float));
 
 #ifdef STARPU_USE_OPENCL
-        starpu_opencl_load_opencl_from_file("examples/basic_examples/variable_kernels_opencl_kernel.cl", &opencl_program);
+        starpu_opencl_load_opencl_from_file("examples/basic_examples/variable_kernels_opencl_kernel.cl", &opencl_program, NULL);
 #endif
 
 	cl.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL;
@@ -77,7 +82,7 @@ int main(int argc, char **argv)
 		ret = starpu_task_submit(task);
 		if (STARPU_UNLIKELY(ret == -ENODEV))
 		{
-			fprintf(stderr, "No worker may execute this task\n");
+			FPRINTF(stderr, "No worker may execute this task\n");
 			exit(0);
 		}
 	}
@@ -85,11 +90,9 @@ int main(int argc, char **argv)
 	starpu_task_wait_for_all();
 
 	/* update the array in RAM */
-	starpu_data_acquire(float_array_handle, STARPU_R);
-
-	fprintf(stderr, "variable -> %f\n", foo);
+	starpu_data_unregister(float_array_handle);
 
-	starpu_data_release(float_array_handle);
+	FPRINTF(stderr, "variable -> %f\n", foo);
 
 	starpu_shutdown();
 

+ 2 - 2
examples/basic_examples/variable_kernels_opencl.c

@@ -21,7 +21,7 @@
 extern struct starpu_opencl_program opencl_program;
 void opencl_codelet(void *descr[], void *_args)
 {
-	float *val = (float *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	cl_mem val = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[0]);
 	cl_kernel kernel;
 	cl_command_queue queue;
 	cl_event event;
@@ -34,7 +34,7 @@ void opencl_codelet(void *descr[], void *_args)
 	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
 	err = 0;
-	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &val);
+	err = clSetKernelArg(kernel, 0, sizeof(val), &val);
 	if (err) STARPU_OPENCL_REPORT_ERROR(err);
 
 	{

+ 10 - 5
examples/basic_examples/vector_scal.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -28,6 +28,7 @@
 #include <stdio.h>
 
 #define	NX	2048
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
 
 extern void scal_cpu_func(void *buffers[], void *_args);
 extern void scal_cuda_func(void *buffers[], void *_args);
@@ -71,16 +72,17 @@ int main(int argc, char **argv)
 	float vector[NX];
 	unsigned i;
 	for (i = 0; i < NX; i++)
-		vector[i] = 1.0f;
+                vector[i] = (i+1.0f);
 
-	fprintf(stderr, "BEFORE : First element was %f\n", vector[0]);
+	FPRINTF(stderr, "BEFORE: First element was %f\n", vector[0]);
+	FPRINTF(stderr, "BEFORE: Last element was %f\n", vector[NX-1]);
 
 	/* Initialize StarPU with default configuration */
 	starpu_init(NULL);
 
 #ifdef STARPU_USE_OPENCL
 	starpu_opencl_load_opencl_from_file("examples/basic_examples/vector_scal_opencl_kernel.cl",
-					    &opencl_program);
+					    &opencl_program, NULL);
 #endif
 
 	/* Tell StarPU to associate the "vector" vector with the "vector_handle"
@@ -125,6 +127,8 @@ int main(int argc, char **argv)
  	 * monitoring it */
 	starpu_data_unregister(vector_handle);
 
+	starpu_task_destroy(task);
+
 #ifdef STARPU_USE_OPENCL
         starpu_opencl_unload_opencl(&opencl_program);
 #endif
@@ -132,7 +136,8 @@ int main(int argc, char **argv)
 	/* terminate StarPU, no task can be submitted after */
 	starpu_shutdown();
 
-	fprintf(stderr, "AFTER First element is %f\n", vector[0]);
+	FPRINTF(stderr, "AFTER: First element is %f\n", vector[0]);
+	FPRINTF(stderr, "AFTER: Last element is %f\n", vector[NX-1]);
 
 	return 0;
 }

+ 2 - 2
examples/basic_examples/vector_scal_cuda.cu

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  * Copyright (C) 2010  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -25,7 +25,7 @@
 static __global__ void vector_mult_cuda(float *val, unsigned n,
                                         float factor)
 {
-        unsigned i = threadIdx.x;
+        unsigned i =  blockIdx.x*blockDim.x + threadIdx.x;
 
 	if (i < n)
                val[i] *= factor;

+ 3 - 3
examples/basic_examples/vector_scal_opencl.c

@@ -36,8 +36,8 @@ void scal_opencl_func(void *buffers[], void *_args)
 
 	/* length of the vector */
 	unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
-	/* local copy of the vector pointer */
-	float *val = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);
+	/* OpenCL copy of the vector pointer */
+	cl_mem val = (cl_mem)STARPU_VECTOR_GET_PTR(buffers[0]);
 
 	id = starpu_worker_get_id();
 	devid = starpu_worker_get_devid(id);
@@ -45,7 +45,7 @@ void scal_opencl_func(void *buffers[], void *_args)
 	err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, "vector_mult_opencl", devid);
 	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
-	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &val);
+	err = clSetKernelArg(kernel, 0, sizeof(val), &val);
 	err |= clSetKernelArg(kernel, 1, sizeof(n), &n);
 	err |= clSetKernelArg(kernel, 2, sizeof(*factor), factor);
 	if (err) STARPU_OPENCL_REPORT_ERROR(err);

+ 73 - 0
examples/callback/callback.c

@@ -0,0 +1,73 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <pthread.h>
+#include <sys/time.h>
+
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
+starpu_data_handle handle;
+
+/* CPU kernel: adds 1 to the int reached through the variable interface in
+ * descr[0]. The second argument is unused. */
+void cpu_codelet(void *descr[], __attribute__ ((unused)) void *_args)
+{
+	int *val = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
+
+	*val += 1;
+}
+
+/* Codelet used by both tasks of this example (the one submitted by main()
+ * and the one submitted from the callback): CPU only, one data buffer. */
+starpu_codelet cl =
+{
+	.where = STARPU_CPU,
+	.cpu_func = cpu_codelet,
+	.nbuffers = 1
+};
+
+/* Task callback: creates and submits a second task that runs codelet "cl"
+ * on the global "handle" in read-write mode. callback_arg is not used, and
+ * the value returned by starpu_task_submit() is not checked. */
+void callback_func(void *callback_arg)
+{
+	struct starpu_task *task = starpu_task_create();
+	task->cl = &cl;
+	task->buffers[0].handle = handle;
+	task->buffers[0].mode = STARPU_RW;
+	starpu_task_submit(task);
+}
+
+/* Example entry point: registers an int (initially 40) with StarPU, submits
+ * one task whose callback submits a second task on the same data, waits for
+ * all tasks, then prints the value. argc and argv are not used. */
+int main(int argc, char **argv)
+{
+	int v=40;
+
+	starpu_init(NULL);
+	starpu_variable_data_register(&handle, 0, (uintptr_t)&v, sizeof(int));
+
+	/* first task: increments v, then callback_func() submits another one */
+	struct starpu_task *task = starpu_task_create();
+	task->cl = &cl;
+	task->callback_func = callback_func;
+	task->callback_arg = NULL;
+	task->buffers[0].handle = handle;
+	task->buffers[0].mode = STARPU_RW;
+
+	/* the return value of starpu_task_submit() is not checked here */
+	starpu_task_submit(task);
+
+	/* NOTE(review): presumably this also waits for the task submitted from
+	 * the callback -- confirm against the StarPU documentation */
+	starpu_task_wait_for_all();
+	/* unregister before reading v from the application */
+	starpu_data_unregister(handle);
+
+	FPRINTF(stderr, "v -> %d\n", v);
+
+	starpu_shutdown();
+
+	return 0;
+}

+ 21 - 22
examples/cg/cg.c

@@ -13,6 +13,7 @@
  *
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
+
 #include <math.h>
 #include <assert.h>
 #include <sys/time.h>
@@ -24,6 +25,8 @@
 #include <cublas.h>
 #endif
 
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
 /*
  *	Conjugate Gradient
  *
@@ -92,23 +95,19 @@ extern starpu_codelet bzero_vector_cl;
 
 static void generate_random_problem(void)
 {
-	srand48(0xdeadbeef);
-
 	int i, j;
 
-	starpu_data_malloc_pinned_if_possible((void **)&A, n*n*sizeof(TYPE));
-	starpu_data_malloc_pinned_if_possible((void **)&b, n*sizeof(TYPE));
-	starpu_data_malloc_pinned_if_possible((void **)&x, n*sizeof(TYPE));
+	starpu_malloc((void **)&A, n*n*sizeof(TYPE));
+	starpu_malloc((void **)&b, n*sizeof(TYPE));
+	starpu_malloc((void **)&x, n*sizeof(TYPE));
 	assert(A && b && x);
 
-	/* Create a random matrix (A) and two random vectors (x and b) */
 	for (j = 0; j < n; j++)
 	{
 		b[j] = (TYPE)1.0;
 		x[j] = (TYPE)0.0;
 
 		/* We take the Hilbert matrix, which is ill-conditioned but positive definite: H(i,j) = 1/(1+i+j) */
-
 		for (i = 0; i < n; i++)
 		{
 			A[n*j + i] = (TYPE)(1.0/(1.0+i+j));
@@ -116,9 +115,9 @@ static void generate_random_problem(void)
 	}
 
 	/* Internal vectors */
-	starpu_data_malloc_pinned_if_possible((void **)&r, n*sizeof(TYPE));
-	starpu_data_malloc_pinned_if_possible((void **)&d, n*sizeof(TYPE));
-	starpu_data_malloc_pinned_if_possible((void **)&q, n*sizeof(TYPE));
+	starpu_malloc((void **)&r, n*sizeof(TYPE));
+	starpu_malloc((void **)&d, n*sizeof(TYPE));
+	starpu_malloc((void **)&q, n*sizeof(TYPE));
 	assert(r && d && q);
 
 	memset(r, 0, n*sizeof(TYPE));
@@ -205,12 +204,12 @@ static void display_vector(starpu_data_handle handle, TYPE *ptr)
 		starpu_data_acquire(starpu_data_get_sub_data(handle, 1, b), STARPU_R);
 		for (ind = 0; ind < block_size; ind++)
 		{
-			fprintf(stderr, "%2.2e ", ptr[b*block_size + ind]);
+			FPRINTF(stderr, "%2.2e ", ptr[b*block_size + ind]);
 		}
-		fprintf(stderr, "| ");
+		FPRINTF(stderr, "| ");
 		starpu_data_release(starpu_data_get_sub_data(handle, 1, b));
 	}
-	fprintf(stderr, "\n");
+	FPRINTF(stderr, "\n");
 }
 
 static void display_matrix(void)
@@ -220,9 +219,9 @@ static void display_matrix(void)
 	{
 		for (j = 0; j < n; j++)
 		{
-			fprintf(stderr, "%2.2e ", A[j*n + i]);
+			FPRINTF(stderr, "%2.2e ", A[j*n + i]);
 		}
-		fprintf(stderr, "\n");
+		FPRINTF(stderr, "\n");
 	}
 }
 #endif
@@ -255,8 +254,8 @@ static void cg(void)
 	delta_0 = delta_new;
 	starpu_data_release(rtr_handle);
 
-	fprintf(stderr, "*************** INITIAL ************ \n");
-	fprintf(stderr, "Delta 0: %e\n", delta_new);
+	FPRINTF(stderr, "*************** INITIAL ************ \n");
+	FPRINTF(stderr, "Delta 0: %e\n", delta_new);
 
 	struct timeval start;
 	struct timeval end;
@@ -307,8 +306,8 @@ static void cg(void)
 		{
 			/* We here take the error as ||r||_2 / (n||b||_2) */
 			double error = sqrt(delta_new/delta_0)/(1.0*n);
-			fprintf(stderr, "*****************************************\n");
-			fprintf(stderr, "iter %d DELTA %e - %e\n", i, delta_new, error);
+			FPRINTF(stderr, "*****************************************\n");
+			FPRINTF(stderr, "iter %d DELTA %e - %e\n", i, delta_new, error);
 		}
 
 		i++;
@@ -317,8 +316,8 @@ static void cg(void)
 	gettimeofday(&end, NULL);
 
 	double timing = (double)(((double)end.tv_sec - (double)start.tv_sec)*10e6 + ((double)end.tv_usec - (double)start.tv_usec));
-	fprintf(stderr, "Total timing : %2.2f seconds\n", timing/10e6);
-	fprintf(stderr, "Seconds per iteration : %2.2e\n", timing/10e6/i);
+	FPRINTF(stderr, "Total timing : %2.2f seconds\n", timing/10e6);
+	FPRINTF(stderr, "Seconds per iteration : %2.2e\n", timing/10e6/i);
 }
 
 static int check(void)
@@ -351,7 +350,7 @@ static void parse_args(int argc, char **argv)
 		}
 
 	        if (strcmp(argv[i], "-h") == 0) {
-			fprintf(stderr, "usage: %s [-h] [-nblocks #blocks] [-n problem_size] [-no-reduction] [-maxiter i]\n", argv[0]);
+			FPRINTF(stderr, "usage: %s [-h] [-nblocks #blocks] [-n problem_size] [-no-reduction] [-maxiter i]\n", argv[0]);
 			exit(-1);
 			continue;
 		}

+ 2 - 2
examples/cg/cg.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -82,4 +82,4 @@ void copy_handle(starpu_data_handle dst,
 		starpu_data_handle src,
 		unsigned nblocks);
 
-#endif // __STARPU_EXAMPLE_CG_H__
+#endif /* __STARPU_EXAMPLE_CG_H__ */

+ 19 - 0
examples/cg/cg_dot_kernel.cu

@@ -126,3 +126,22 @@ extern "C" void dot_host(TYPE *x, TYPE *y, unsigned nelems, TYPE *dot)
 
 	cudaFree(per_block_sum);
 }
+
+static __global__ void zero_vector_device(TYPE *x, unsigned nelems, unsigned nelems_per_thread)
+{
+	unsigned i;
+	unsigned first_i = blockDim.x * blockIdx.x + threadIdx.x;
+
+	for (i = first_i; i < nelems; i += nelems_per_thread)
+		x[i] = 0.0;
+}
+
+extern "C" void zero_vector(TYPE *x, unsigned nelems)
+{
+	unsigned nblocks = STARPU_MIN(128, nelems);
+	unsigned nthread_per_block = STARPU_MIN(MAXTHREADSPERBLOCK, (nelems / nblocks));
+
+	unsigned nelems_per_thread = nelems / (nblocks * nthread_per_block);
+
+	zero_vector_device<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(x, nelems, nelems_per_thread);
+}

+ 9 - 4
examples/cg/cg_kernels.c

@@ -16,6 +16,7 @@
 
 #include "cg.h"
 #include <math.h>
+#include <limits.h>
 
 #if 0
 static void print_vector_from_descr(unsigned nx, TYPE *v)
@@ -123,11 +124,14 @@ starpu_codelet accumulate_vector_cl = {
  */
 
 #ifdef STARPU_USE_CUDA
+extern void zero_vector(TYPE *x, unsigned nelems);
+
 static void bzero_variable_cuda(void *descr[], void *cl_arg)
 {
 	TYPE *v = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
+
+	zero_vector(v, 1);
  
-	cublasscal (1, (TYPE)0.0, v, 1);
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
@@ -159,7 +163,8 @@ static void bzero_vector_cuda(void *descr[], void *cl_arg)
 	TYPE *v = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
  
-	cublasscal (n, (TYPE)0.0, v, 1);
+	zero_vector(v, n);
+
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
@@ -578,8 +583,8 @@ static void copy_handle_cuda(void *descr[], void *cl_arg)
 	unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);
 	size_t elemsize = STARPU_VECTOR_GET_ELEMSIZE(descr[0]);
 
-	cudaMemcpy(dst, src, nx*elemsize, cudaMemcpyDeviceToDevice);
-	cudaThreadSynchronize();
+	cudaMemcpyAsync(dst, src, nx*elemsize, cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 

+ 3 - 2
examples/cholesky/cholesky.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -31,6 +31,7 @@
 #include <common/blas.h>
 #include <starpu.h>
 
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
 #define NMAXBLOCKS	32
 
 #define TAG11(k)	((starpu_tag_t)( (1ULL<<60) | (unsigned long long)(k)))
@@ -112,4 +113,4 @@ static void __attribute__((unused)) parse_args(int argc, char **argv)
 	}
 }
 
-#endif // __DW_CHOLESKY_H__
+#endif /* __DW_CHOLESKY_H__ */

+ 31 - 33
examples/cholesky/cholesky_grain_tag.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -49,7 +49,7 @@ static starpu_codelet cl11 =
 
 static struct starpu_task * create_task_11(starpu_data_handle dataA, unsigned k, unsigned reclevel)
 {
-//	printf("task 11 k = %d TAG = %llx\n", k, (TAG11(k)));
+/*	FPRINTF(stdout, "task 11 k = %d TAG = %llx\n", k, (TAG11(k))); */
 
 	struct starpu_task *task = create_task(TAG11_AUX(k, reclevel));
 	
@@ -121,7 +121,7 @@ static starpu_codelet cl22 =
 
 static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, unsigned j, unsigned reclevel)
 {
-//	printf("task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22_AUX(k,i,j));
+/*	FPRINTF(stdout, "task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22_AUX(k,i,j)); */
 
 	struct starpu_task *task = create_task(TAG22_AUX(k, i, j, reclevel));
 
@@ -173,17 +173,15 @@ static void cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 
 	starpu_data_set_sequential_consistency_flag(dataA, 0);
 
-	struct starpu_data_filter f;
-		f.filter_func = starpu_vertical_block_filter_func;
-		f.nchildren = nblocks;
-		f.get_nchildren = NULL;
-		f.get_child_ops = NULL;
+	struct starpu_data_filter f = {
+		.filter_func = starpu_vertical_block_filter_func,
+		.nchildren = nblocks
+	};
 
-	struct starpu_data_filter f2;
-		f2.filter_func = starpu_block_filter_func;
-		f2.nchildren = nblocks;
-		f2.get_nchildren = NULL;
-		f2.get_child_ops = NULL;
+	struct starpu_data_filter f2 = {
+		.filter_func = starpu_block_filter_func,
+		.nchildren = nblocks
+	};
 
 	starpu_data_map_filters(dataA, 2, &f, &f2);
 
@@ -214,7 +212,7 @@ static void cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 	int ret = starpu_task_submit(entry_task);
 	if (STARPU_UNLIKELY(ret == -ENODEV))
 	{
-		fprintf(stderr, "No worker may execute this task\n");
+		FPRINTF(stderr, "No worker may execute this task\n");
 		exit(-1);
 	}
 
@@ -261,7 +259,7 @@ static void initialize_system(float **A, unsigned dim, unsigned pinned)
 
 	if (pinned)
 	{
-		starpu_data_malloc_pinned_if_possible((void **)A, dim*dim*sizeof(float));
+		starpu_malloc((void **)A, dim*dim*sizeof(float));
 	} 
 	else {
 		*A = malloc(dim*dim*sizeof(float));
@@ -280,11 +278,11 @@ void cholesky_grain(float *matA, unsigned size, unsigned ld, unsigned nblocks, u
 	gettimeofday(&end, NULL);
 
 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
-	fprintf(stderr, "Computation took (in ms)\n");
-	printf("%2.2f\n", timing/1000);
+	FPRINTF(stderr, "Computation took (in ms)\n");
+	FPRINTF(stdout, "%2.2f\n", timing/1000);
 
 	double flop = (1.0f*size*size*size)/3.0f;
-	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
 
 	starpu_helper_cublas_shutdown();
 
@@ -311,26 +309,26 @@ int main(int argc, char **argv)
 		for (j = 0; j < size; j++)
 		{
 			mat[j +i*size] = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
-			//mat[j +i*size] = ((i == j)?1.0f*size:0.0f);
+			/* mat[j +i*size] = ((i == j)?1.0f*size:0.0f); */
 		}
 	}
 
 
 #ifdef CHECK_OUTPUT
-	printf("Input :\n");
+	FPRINTF(stdout, "Input :\n");
 
 	for (j = 0; j < size; j++)
 	{
 		for (i = 0; i < size; i++)
 		{
 			if (i <= j) {
-				printf("%2.2f\t", mat[j +i*size]);
+				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
 			}
 			else {
-				printf(".\t");
+				FPRINTF(stdout, ".\t");
 			}
 		}
-		printf("\n");
+		FPRINTF(stdout, "\n");
 	}
 #endif
 
@@ -338,43 +336,43 @@ int main(int argc, char **argv)
 	cholesky_grain(mat, size, size, nblocks, nbigblocks);
 
 #ifdef CHECK_OUTPUT
-	printf("Results :\n");
+	FPRINTF(stdout, "Results :\n");
 
 	for (j = 0; j < size; j++)
 	{
 		for (i = 0; i < size; i++)
 		{
 			if (i <= j) {
-				printf("%2.2f\t", mat[j +i*size]);
+				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
 			}
 			else {
-				printf(".\t");
-				mat[j+i*size] = 0.0f; // debug
+				FPRINTF(stdout, ".\t");
+				mat[j+i*size] = 0.0f; /* debug */
 			}
 		}
-		printf("\n");
+		FPRINTF(stdout, "\n");
 	}
 
-	fprintf(stderr, "compute explicit LLt ...\n");
+	FPRINTF(stderr, "compute explicit LLt ...\n");
 	float *test_mat = malloc(size*size*sizeof(float));
 	STARPU_ASSERT(test_mat);
 
 	SSYRK("L", "N", size, size, 1.0f, 
 				mat, size, 0.0f, test_mat, size);
 
-	fprintf(stderr, "comparing results ...\n");
+	FPRINTF(stderr, "comparing results ...\n");
 	for (j = 0; j < size; j++)
 	{
 		for (i = 0; i < size; i++)
 		{
 			if (i <= j) {
-				printf("%2.2f\t", test_mat[j +i*size]);
+                                FPRINTF(stdout, "%2.2f\t", test_mat[j +i*size]);
 			}
 			else {
-				printf(".\t");
+				FPRINTF(stdout, ".\t");
 			}
 		}
-		printf("\n");
+		FPRINTF(stdout, "\n");
 	}
 #endif
 

+ 30 - 32
examples/cholesky/cholesky_implicit.c

@@ -126,13 +126,13 @@ static void _cholesky(starpu_data_handle dataA, unsigned nblocks)
 	gettimeofday(&end, NULL);
 
 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
-	fprintf(stderr, "Computation took (in ms)\n");
-	printf("%2.2f\n", timing/1000);
+	FPRINTF(stderr, "Computation took (in ms)\n");
+	FPRINTF(stdout, "%2.2f\n", timing/1000);
 
 	unsigned long n = starpu_matrix_get_nx(dataA);
 
 	double flop = (1.0f*n*n*n)/3.0f;
-	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
 }
 
 static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
@@ -143,17 +143,15 @@ static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 	 * one block is now determined by 2 unsigned (i,j) */
 	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(float));
 
-	struct starpu_data_filter f;
-		f.filter_func = starpu_vertical_block_filter_func;
-		f.nchildren = nblocks;
-		f.get_nchildren = NULL;
-		f.get_child_ops = NULL;
+	struct starpu_data_filter f = {
+		.filter_func = starpu_vertical_block_filter_func,
+		.nchildren = nblocks
+	};
 
-	struct starpu_data_filter f2;
-		f2.filter_func = starpu_block_filter_func;
-		f2.nchildren = nblocks;
-		f2.get_nchildren = NULL;
-		f2.get_child_ops = NULL;
+	struct starpu_data_filter f2 = {
+		.filter_func = starpu_block_filter_func,
+		.nchildren = nblocks
+	};
 
 	starpu_data_map_filters(dataA, 2, &f, &f2);
 
@@ -174,7 +172,7 @@ int main(int argc, char **argv)
 	starpu_helper_cublas_init();
 
 	float *mat;
-	starpu_data_malloc_pinned_if_possible((void **)&mat, (size_t)size*size*sizeof(float));
+	starpu_malloc((void **)&mat, (size_t)size*size*sizeof(float));
 
 	unsigned i,j;
 	for (i = 0; i < size; i++)
@@ -182,58 +180,58 @@ int main(int argc, char **argv)
 		for (j = 0; j < size; j++)
 		{
 			mat[j +i*size] = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
-			//mat[j +i*size] = ((i == j)?1.0f*size:0.0f);
+			/* mat[j +i*size] = ((i == j)?1.0f*size:0.0f); */
 		}
 	}
 
-//#define PRINT_OUTPUT
+/* #define PRINT_OUTPUT */
 #ifdef PRINT_OUTPUT
-	printf("Input :\n");
+	FPRINTF(stdout, "Input :\n");
 
 	for (j = 0; j < size; j++)
 	{
 		for (i = 0; i < size; i++)
 		{
 			if (i <= j) {
-				printf("%2.2f\t", mat[j +i*size]);
+				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
 			}
 			else {
-				printf(".\t");
+				FPRINTF(stdout, ".\t");
 			}
 		}
-		printf("\n");
+		FPRINTF(stdout, "\n");
 	}
 #endif
 
 	cholesky(mat, size, size, nblocks);
 
 #ifdef PRINT_OUTPUT
-	printf("Results :\n");
+	FPRINTF(stdout, "Results :\n");
 	for (j = 0; j < size; j++)
 	{
 		for (i = 0; i < size; i++)
 		{
 			if (i <= j) {
-				printf("%2.2f\t", mat[j +i*size]);
+				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
 			}
 			else {
-				printf(".\t");
-				mat[j+i*size] = 0.0f; // debug
+				FPRINTF(stdout, ".\t");
+				mat[j+i*size] = 0.0f; /* debug */
 			}
 		}
-		printf("\n");
+		FPRINTF(stdout, "\n");
 	}
 #endif
 
 	if (check)
 	{
-		fprintf(stderr, "compute explicit LLt ...\n");
+		FPRINTF(stderr, "compute explicit LLt ...\n");
 		for (j = 0; j < size; j++)
 		{
 			for (i = 0; i < size; i++)
 			{
 				if (i > j) {
-					mat[j+i*size] = 0.0f; // debug
+					mat[j+i*size] = 0.0f; /* debug */
 				}
 			}
 		}
@@ -243,20 +241,20 @@ int main(int argc, char **argv)
 		SSYRK("L", "N", size, size, 1.0f,
 					mat, size, 0.0f, test_mat, size);
 	
-		fprintf(stderr, "comparing results ...\n");
+		FPRINTF(stderr, "comparing results ...\n");
 #ifdef PRINT_OUTPUT
 		for (j = 0; j < size; j++)
 		{
 			for (i = 0; i < size; i++)
 			{
 				if (i <= j) {
-					printf("%2.2f\t", test_mat[j +i*size]);
+					FPRINTF(stdout, "%2.2f\t", test_mat[j +i*size]);
 				}
 				else {
-					printf(".\t");
+					FPRINTF(stdout, ".\t");
 				}
 			}
-			printf("\n");
+			FPRINTF(stdout, "\n");
 		}
 #endif
 	
@@ -268,7 +266,7 @@ int main(int argc, char **argv)
 	                                float orig = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
 	                                float err = abs(test_mat[j +i*size] - orig);
 	                                if (err > 0.00001) {
-	                                        fprintf(stderr, "Error[%d, %d] --> %2.2f != %2.2f (err %2.2f)\n", i, j, test_mat[j +i*size], orig, err);
+	                                        FPRINTF(stderr, "Error[%u, %u] --> %2.2f != %2.2f (err %2.2f)\n", i, j, test_mat[j +i*size], orig, err);
 	                                        assert(0);
 	                                }
 	                        }

+ 26 - 9
examples/cholesky/cholesky_kernels.c

@@ -20,6 +20,10 @@
 #include "../common/blas.h"
 #ifdef STARPU_USE_CUDA
 #include <starpu_cuda.h>
+#ifdef STARPU_HAVE_MAGMA
+#include "magma.h"
+#include "magma_lapack.h"
+#endif
 #endif
 
 /*
@@ -28,7 +32,7 @@
 
 static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, __attribute__((unused)) void *_args)
 {
-	//printf("22\n");
+	/* printf("22\n"); */
 	float *left 	= (float *)STARPU_MATRIX_GET_PTR(descr[0]);
 	float *right 	= (float *)STARPU_MATRIX_GET_PTR(descr[1]);
 	float *center 	= (float *)STARPU_MATRIX_GET_PTR(descr[2]);
@@ -88,7 +92,7 @@ void chol_cublas_codelet_update_u22(void *descr[], void *_args)
 {
 	chol_common_cpu_codelet_update_u22(descr, 1, _args);
 }
-#endif// STARPU_USE_CUDA
+#endif /* STARPU_USE_CUDA */
 
 /* 
  * U21
@@ -96,7 +100,7 @@ void chol_cublas_codelet_update_u22(void *descr[], void *_args)
 
 static inline void chol_common_codelet_update_u21(void *descr[], int s, __attribute__((unused)) void *_args)
 {
-//	printf("21\n");
+/*	printf("21\n"); */
 	float *sub11;
 	float *sub21;
 
@@ -143,7 +147,7 @@ void chol_cublas_codelet_update_u21(void *descr[], void *_args)
 
 static inline void chol_common_codelet_update_u11(void *descr[], int s, __attribute__((unused)) void *_args) 
 {
-//	printf("11\n");
+/*	printf("11\n"); */
 	float *sub11;
 
 	sub11 = (float *)STARPU_MATRIX_GET_PTR(descr[0]); 
@@ -179,13 +183,27 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, __attrib
 			break;
 #ifdef STARPU_USE_CUDA
 		case 1:
+#ifdef STARPU_HAVE_MAGMA
 			{
+			int ret;
+			int info;
+			ret = magma_spotrf_gpu('L', nx, sub11, ld, &info);
+			if (ret != MAGMA_SUCCESS) {
+				fprintf(stderr, "Error in Magma: %d\n", ret);
+				STARPU_ABORT();
+			}
+			cudaError_t cures = cudaThreadSynchronize();
+			STARPU_ASSERT(!cures);
+			}
+#else
+			{
+
 			float *lambda11;
 			cudaHostAlloc((void **)&lambda11, sizeof(float), 0);
 
 			for (z = 0; z < nx; z++)
 			{
-
+				
 				cudaMemcpyAsync(lambda11, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
 				cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
@@ -193,7 +211,7 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, __attrib
 				
 				*lambda11 = sqrt(*lambda11);
 
-//				cublasSetVector(1, sizeof(float), lambda11, sizeof(float), &sub11[z+z*ld], sizeof(float));
+/*				cublasSetVector(1, sizeof(float), lambda11, sizeof(float), &sub11[z+z*ld], sizeof(float)); */
 				cudaMemcpyAsync(&sub11[z+z*ld], lambda11, sizeof(float), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
 
 				cublasSscal(nx - z - 1, 1.0f/(*lambda11), &sub11[(z+1)+z*ld], 1);
@@ -206,8 +224,7 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, __attrib
 			cudaStreamSynchronize(starpu_cuda_get_local_stream());
 			cudaFreeHost(lambda11);
 			}
-		
-
+#endif
 			break;
 #endif
 		default:
@@ -227,4 +244,4 @@ void chol_cublas_codelet_update_u11(void *descr[], void *_args)
 {
 	chol_common_codelet_update_u11(descr, 1, _args);
 }
-#endif// STARPU_USE_CUDA
+#endif/* STARPU_USE_CUDA */

+ 9 - 9
examples/cholesky/cholesky_models.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -26,7 +26,7 @@
 
 #include <starpu.h>
 
-//#define USE_PERTURBATION	1
+/* #define USE_PERTURBATION	1 */
 
 #ifdef USE_PERTURBATION
 #define PERTURBATE(a)	((starpu_drand48()*2.0f*(AMPL) + 1.0f - (AMPL))*(a))
@@ -43,7 +43,7 @@ static double cpu_chol_task_11_cost(starpu_buffer_descr *descr)
 	double cost = (((double)(n)*n*n)/1000.0f*0.894/0.79176);
 
 #ifdef STARPU_MODEL_DEBUG
-	printf("cpu_chol_task_11_cost n %d cost %e\n", n, cost);
+	FPRINTF(stdout, "cpu_chol_task_11_cost n %d cost %e\n", n, cost);
 #endif
 
 	return PERTURBATE(cost);
@@ -58,7 +58,7 @@ static double cuda_chol_task_11_cost(starpu_buffer_descr *descr)
 	double cost = (((double)(n)*n*n)/50.0f/10.75/5.088633/0.9883);
 
 #ifdef STARPU_MODEL_DEBUG
-	printf("cuda_chol_task_11_cost n %d cost %e\n", n, cost);
+	FPRINTF(stdout, "cuda_chol_task_11_cost n %d cost %e\n", n, cost);
 #endif
 
 	return PERTURBATE(cost);
@@ -73,7 +73,7 @@ static double cpu_chol_task_21_cost(starpu_buffer_descr *descr)
 	double cost = (((double)(n)*n*n)/7706.674/0.95/0.9965);
 
 #ifdef STARPU_MODEL_DEBUG
-	printf("cpu_chol_task_21_cost n %d cost %e\n", n, cost);
+	FPRINTF(stdout, "cpu_chol_task_21_cost n %d cost %e\n", n, cost);
 #endif
 
 	return PERTURBATE(cost);
@@ -88,7 +88,7 @@ static double cuda_chol_task_21_cost(starpu_buffer_descr *descr)
 	double cost = (((double)(n)*n*n)/50.0f/10.75/87.29520);
 
 #ifdef STARPU_MODEL_DEBUG
-	printf("cuda_chol_task_21_cost n %d cost %e\n", n, cost);
+	FPRINTF(stdout, "cuda_chol_task_21_cost n %d cost %e\n", n, cost);
 #endif
 
 	return PERTURBATE(cost);
@@ -103,7 +103,7 @@ static double cpu_chol_task_22_cost(starpu_buffer_descr *descr)
 	double cost = (((double)(n)*n*n)/50.0f/10.75/8.0760);
 
 #ifdef STARPU_MODEL_DEBUG
-	printf("cpu_chol_task_22_cost n %d cost %e\n", n, cost);
+	FPRINTF(stdout, "cpu_chol_task_22_cost n %d cost %e\n", n, cost);
 #endif
 
 	return PERTURBATE(cost);
@@ -118,7 +118,7 @@ static double cuda_chol_task_22_cost(starpu_buffer_descr *descr)
 	double cost = (((double)(n)*n*n)/50.0f/10.75/76.30666);
 
 #ifdef STARPU_MODEL_DEBUG
-	printf("cuda_chol_task_22_cost n %d cost %e\n", n, cost);
+	FPRINTF(stdout, "cuda_chol_task_22_cost n %d cost %e\n", n, cost);
 #endif
 
 	return PERTURBATE(cost);

+ 34 - 36
examples/cholesky/cholesky_tag.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -49,7 +49,7 @@ static starpu_codelet cl11 =
 
 static struct starpu_task * create_task_11(starpu_data_handle dataA, unsigned k)
 {
-//	printf("task 11 k = %d TAG = %llx\n", k, (TAG11(k)));
+/*	FPRINTF(stdout, "task 11 k = %d TAG = %llx\n", k, (TAG11(k))); */
 
 	struct starpu_task *task = create_task(TAG11(k));
 	
@@ -108,7 +108,7 @@ static void create_task_21(starpu_data_handle dataA, unsigned k, unsigned j)
 
 	int ret = starpu_task_submit(task);
         if (STARPU_UNLIKELY(ret == -ENODEV)) {
-                fprintf(stderr, "No worker may execute this task\n");
+                FPRINTF(stderr, "No worker may execute this task\n");
                 exit(0);
         }
 
@@ -127,7 +127,7 @@ static starpu_codelet cl22 =
 
 static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, unsigned j)
 {
-//	printf("task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j));
+/*	FPRINTF(stdout, "task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j)); */
 
 	struct starpu_task *task = create_task(TAG22(k, i, j));
 
@@ -155,7 +155,7 @@ static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, uns
 
 	int ret = starpu_task_submit(task);
         if (STARPU_UNLIKELY(ret == -ENODEV)) {
-                fprintf(stderr, "No worker may execute this task\n");
+                FPRINTF(stderr, "No worker may execute this task\n");
                 exit(0);
         }
 }
@@ -189,7 +189,7 @@ static void _cholesky(starpu_data_handle dataA, unsigned nblocks)
 		else {
 			int ret = starpu_task_submit(task);
                         if (STARPU_UNLIKELY(ret == -ENODEV)) {
-                                fprintf(stderr, "No worker may execute this task\n");
+                                FPRINTF(stderr, "No worker may execute this task\n");
                                 exit(0);
                         }
 
@@ -210,7 +210,7 @@ static void _cholesky(starpu_data_handle dataA, unsigned nblocks)
 	/* schedule the codelet */
 	int ret = starpu_task_submit(entry_task);
         if (STARPU_UNLIKELY(ret == -ENODEV)) {
-                fprintf(stderr, "No worker may execute this task\n");
+                FPRINTF(stderr, "No worker may execute this task\n");
                 exit(0);
         }
 
@@ -224,13 +224,13 @@ static void _cholesky(starpu_data_handle dataA, unsigned nblocks)
 
 
 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
-	fprintf(stderr, "Computation took (in ms)\n");
-	printf("%2.2f\n", timing/1000);
+	FPRINTF(stderr, "Computation took (in ms)\n");
+	FPRINTF(stdout, "%2.2f\n", timing/1000);
 
 	unsigned n = starpu_matrix_get_nx(dataA);
 
 	double flop = (1.0f*n*n*n)/3.0f;
-	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
 }
 
 static void initialize_system(float **A, unsigned dim, unsigned pinned)
@@ -241,7 +241,7 @@ static void initialize_system(float **A, unsigned dim, unsigned pinned)
 
 	if (pinned)
 	{
-		starpu_data_malloc_pinned_if_possible((void **)A, (size_t)dim*dim*sizeof(float));
+		starpu_malloc((void **)A, (size_t)dim*dim*sizeof(float));
 	} 
 	else {
 		*A = malloc(dim*dim*sizeof(float));
@@ -258,17 +258,15 @@ static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 
 	starpu_data_set_sequential_consistency_flag(dataA, 0);
 
-	struct starpu_data_filter f;
-		f.filter_func = starpu_vertical_block_filter_func;
-		f.nchildren = nblocks;
-		f.get_nchildren = NULL;
-		f.get_child_ops = NULL;
+	struct starpu_data_filter f = {
+		.filter_func = starpu_vertical_block_filter_func,
+		.nchildren = nblocks
+	};
 
-	struct starpu_data_filter f2;
-		f2.filter_func = starpu_block_filter_func;
-		f2.nchildren = nblocks;
-		f2.get_nchildren = NULL;
-		f2.get_child_ops = NULL;
+	struct starpu_data_filter f2 = {
+		.filter_func = starpu_block_filter_func,
+		.nchildren = nblocks
+	};
 
 	starpu_data_map_filters(dataA, 2, &f, &f2);
 
@@ -299,26 +297,26 @@ int main(int argc, char **argv)
 		for (j = 0; j < size; j++)
 		{
 			mat[j +i*size] = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
-			//mat[j +i*size] = ((i == j)?1.0f*size:0.0f);
+			/* mat[j +i*size] = ((i == j)?1.0f*size:0.0f); */
 		}
 	}
 
 
 #ifdef CHECK_OUTPUT
-	printf("Input :\n");
+	FPRINTF(stdout, "Input :\n");
 
 	for (j = 0; j < size; j++)
 	{
 		for (i = 0; i < size; i++)
 		{
 			if (i <= j) {
-				printf("%2.2f\t", mat[j +i*size]);
+				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
 			}
 			else {
-				printf(".\t");
+				FPRINTF(stdout, ".\t");
 			}
 		}
-		printf("\n");
+		FPRINTF(stdout, "\n");
 	}
 #endif
 
@@ -326,43 +324,43 @@ int main(int argc, char **argv)
 	cholesky(mat, size, size, nblocks);
 
 #ifdef CHECK_OUTPUT
-	printf("Results :\n");
+	FPRINTF(stdout, "Results :\n");
 
 	for (j = 0; j < size; j++)
 	{
 		for (i = 0; i < size; i++)
 		{
 			if (i <= j) {
-				printf("%2.2f\t", mat[j +i*size]);
+				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
 			}
 			else {
-				printf(".\t");
-				mat[j+i*size] = 0.0f; // debug
+				FPRINTF(stdout, ".\t");
+				mat[j+i*size] = 0.0f; /* debug */
 			}
 		}
-		printf("\n");
+		FPRINTF(stdout, "\n");
 	}
 
-	fprintf(stderr, "compute explicit LLt ...\n");
+	FPRINTF(stderr, "compute explicit LLt ...\n");
 	float *test_mat = malloc(size*size*sizeof(float));
 	STARPU_ASSERT(test_mat);
 
 	SSYRK("L", "N", size, size, 1.0f, 
 				mat, size, 0.0f, test_mat, size);
 
-	fprintf(stderr, "comparing results ...\n");
+	FPRINTF(stderr, "comparing results ...\n");
 	for (j = 0; j < size; j++)
 	{
 		for (i = 0; i < size; i++)
 		{
 			if (i <= j) {
-				printf("%2.2f\t", test_mat[j +i*size]);
+				FPRINTF(stdout, "%2.2f\t", test_mat[j +i*size]);
 			}
 			else {
-				printf(".\t");
+				FPRINTF(stdout, ".\t");
 			}
 		}
-		printf("\n");
+		FPRINTF(stdout, "\n");
 	}
 #endif
 

+ 7 - 7
examples/cholesky/cholesky_tile_tag.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -59,7 +59,7 @@ static starpu_codelet cl11 =
 
 static struct starpu_task * create_task_11(unsigned k, unsigned nblocks)
 {
-//	printf("task 11 k = %d TAG = %llx\n", k, (TAG11(k)));
+/*	FPRINTF(stdout, "task 11 k = %d TAG = %llx\n", k, (TAG11(k))); */
 
 	struct starpu_task *task = create_task(TAG11(k));
 	
@@ -145,7 +145,7 @@ static starpu_codelet cl22 =
 
 static void create_task_22(unsigned k, unsigned i, unsigned j)
 {
-//	printf("task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j));
+/*	FPRINTF(stdout, "task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j)); */
 
 	struct starpu_task *task = create_task(TAG22(k, i, j));
 
@@ -224,11 +224,11 @@ static void cholesky_no_stride(void)
 	gettimeofday(&end, NULL);
 
 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
-	fprintf(stderr, "Computation took (in ms)\n");
-	printf("%2.2f\n", timing/1000);
+	FPRINTF(stderr, "Computation took (in ms)\n");
+	FPRINTF(stdout, "%2.2f\n", timing/1000);
 
 	double flop = (1.0f*size*size*size)/3.0f;
-	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
 }
 
 int main(int argc, char **argv)
@@ -239,7 +239,7 @@ int main(int argc, char **argv)
 	parse_args(argc, argv);
 	assert(nblocks <= NMAXBLOCKS);
 
-	fprintf(stderr, "BLOCK SIZE = %d\n", size / nblocks);
+	FPRINTF(stderr, "BLOCK SIZE = %d\n", size / nblocks);
 
 	starpu_init(NULL);
 

+ 2 - 2
examples/common/blas.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -154,4 +154,4 @@ extern void dswap_(const int *n, double *x, const int *incx, double *y, const in
 
 #endif
 
-#endif // __BLAS_H__
+#endif /* __BLAS_H__ */

+ 3 - 3
examples/common/blas_model.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -37,11 +37,11 @@ double gemm_cost(starpu_buffer_descr *descr)
 	nyC = starpu_matrix_get_ny(descr[2].handle);
 	nxA = starpu_matrix_get_nx(descr[0].handle);
 
-//	printf("nxC %d nxC %d nxA %d\n", nxC, nyC, nxA);
+/*	printf("nxC %d nyC %d nxA %d\n", nxC, nyC, nxA); */
 
 	double cost = ((double)nxC)*((double)nyC)*((double)nxA/1000.0f/4.11f);
 
-//	printf("cost %e \n", cost);
+/*	printf("cost %e \n", cost); */
 
 	return cost;
 }

+ 2 - 2
examples/common/blas_model.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -54,4 +54,4 @@ static struct starpu_perfmodel_t starpu_dgemm_model_common = {
 	.type = STARPU_COMMON,
 };
 
-#endif // __BLAS_MODEL_H__
+#endif /* __BLAS_MODEL_H__ */

+ 20 - 18
examples/filters/fblock.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -23,6 +23,8 @@
 #define NZ    3
 #define PARTS 2
 
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
 extern void cpu_func(void *buffers[], void *cl_arg);
 
 #ifdef STARPU_USE_CUDA
@@ -36,17 +38,17 @@ extern void opencl_func(void *buffers[], void *cl_arg);
 void print_block(int *block, int nx, int ny, int nz, unsigned ldy, unsigned ldz)
 {
         int i, j, k;
-        fprintf(stderr, "block=%p nx=%d ny=%d nz=%d ldy=%d ldz=%d\n", block, nx, ny, nz, ldy, ldz);
+        FPRINTF(stderr, "block=%p nx=%d ny=%d nz=%d ldy=%u ldz=%u\n", block, nx, ny, nz, ldy, ldz);
         for(k=0 ; k<nz ; k++) {
                 for(j=0 ; j<ny ; j++) {
                         for(i=0 ; i<nx ; i++) {
-                                fprintf(stderr, "%2d ", block[(k*ldz)+(j*ldy)+i]);
+                                FPRINTF(stderr, "%2d ", block[(k*ldz)+(j*ldy)+i]);
                         }
-                        fprintf(stderr,"\n");
+                        FPRINTF(stderr,"\n");
                 }
-                fprintf(stderr,"\n");
+                FPRINTF(stderr,"\n");
         }
-        fprintf(stderr,"\n");
+        FPRINTF(stderr,"\n");
 }
 
 void print_data(starpu_data_handle block_handle)
@@ -96,30 +98,28 @@ int main(int argc, char **argv)
         starpu_init(NULL);
 
 #ifdef STARPU_USE_OPENCL
-        starpu_opencl_load_opencl_from_file("examples/filters/fblock_opencl_kernel.cl", &opencl_program);
+        starpu_opencl_load_opencl_from_file("examples/filters/fblock_opencl_kernel.cl", &opencl_program, NULL);
 #endif
 
         /* Declare data to StarPU */
         starpu_block_data_register(&handle, 0, (uintptr_t)block, NX, NX*NY, NX, NY, NZ, sizeof(int));
-        fprintf(stderr, "IN  Block\n");
+        FPRINTF(stderr, "IN  Block\n");
         print_data(handle);
 
         /* Partition the block in PARTS sub-blocks */
 	struct starpu_data_filter f =
 	{
 		.filter_func = starpu_block_filter_func_block,
-		.nchildren = PARTS,
-		.get_nchildren = NULL,
-		.get_child_ops = NULL
+		.nchildren = PARTS
 	};
         starpu_data_partition(handle, &f);
 
-        fprintf(stderr,"Nb of partitions : %d\n",starpu_data_get_nb_children(handle));
+        FPRINTF(stderr,"Nb of partitions : %d\n",starpu_data_get_nb_children(handle));
 
         for(i=0 ; i<starpu_data_get_nb_children(handle) ; i++)
         {
                 starpu_data_handle sblock = starpu_data_get_sub_data(handle, 1, i);
-                fprintf(stderr, "Sub block %d\n", i);
+                FPRINTF(stderr, "Sub block %d\n", i);
                 print_data(sblock);
         }
 
@@ -129,7 +129,7 @@ int main(int argc, char **argv)
                 int ret,multiplier=i;
                 struct starpu_task *task = starpu_task_create();
 
-                fprintf(stderr,"Dealing with sub-block %d\n", i);
+                FPRINTF(stderr,"Dealing with sub-block %d\n", i);
                 task->cl = &cl;
                 task->synchronous = 1;
                 task->callback_func = NULL;
@@ -139,9 +139,10 @@ int main(int argc, char **argv)
 
                 ret = starpu_task_submit(task);
                 if (ret) {
-                        fprintf(stderr, "Error when submitting task\n");
+                        FPRINTF(stderr, "Error when submitting task\n");
                         exit(ret);
                 }
+		starpu_task_destroy(task);
         }
 
         /* Unpartition the data, unregister it from StarPU and shutdown */
@@ -150,10 +151,11 @@ int main(int argc, char **argv)
         starpu_data_unregister(handle);
 
         /* Print result block */
-        fprintf(stderr, "OUT Block\n");
+        FPRINTF(stderr, "OUT Block\n");
         print_block(block, NX, NY, NZ, NX, NX*NY);
 
-	starpu_shutdown();
+	free(block);
 
+	starpu_shutdown();
 	return 0;
 }

+ 2 - 2
examples/filters/fblock_opencl.c

@@ -28,7 +28,7 @@ void opencl_func(void *buffers[], void *cl_arg)
 	cl_event event;
 
         int *factor = cl_arg;
-	int *block = (int *)STARPU_BLOCK_GET_PTR(buffers[0]);
+	cl_mem block = (cl_mem)STARPU_BLOCK_GET_PTR(buffers[0]);
 	int nx = (int)STARPU_BLOCK_GET_NX(buffers[0]);
 	int ny = (int)STARPU_BLOCK_GET_NY(buffers[0]);
 	int nz = (int)STARPU_BLOCK_GET_NZ(buffers[0]);
@@ -42,7 +42,7 @@ void opencl_func(void *buffers[], void *cl_arg)
 	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
 	err = 0;
-	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &block);
+	err = clSetKernelArg(kernel, 0, sizeof(block), &block);
 	err = clSetKernelArg(kernel, 1, sizeof(nx), &nx);
 	err = clSetKernelArg(kernel, 2, sizeof(ny), &ny);
 	err = clSetKernelArg(kernel, 3, sizeof(nz), &nz);

+ 13 - 12
examples/filters/fmatrix.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -20,6 +20,8 @@
 #define NY    4
 #define PARTS 2
 
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
 void cpu_func(void *buffers[], void *cl_arg)
 {
         unsigned i, j;
@@ -43,15 +45,15 @@ int main(int argc, char **argv)
 	unsigned i, j, n=1;
         int matrix[NX*NY];
 
-        fprintf(stderr,"IN  Matrix: \n");
+        FPRINTF(stderr,"IN  Matrix: \n");
         for(j=0 ; j<NY ; j++) {
                 for(i=0 ; i<NX ; i++) {
                         matrix[(j*NX)+i] = n++;
-                        fprintf(stderr, "%2d ", matrix[(j*NX)+i]);
+                        FPRINTF(stderr, "%2d ", matrix[(j*NX)+i]);
                 }
-                fprintf(stderr,"\n");
+                FPRINTF(stderr,"\n");
         }
-        fprintf(stderr,"\n");
+        FPRINTF(stderr,"\n");
 
         starpu_data_handle handle;
         starpu_codelet cl = {
@@ -68,9 +70,7 @@ int main(int argc, char **argv)
 	struct starpu_data_filter f =
 	{
 		.filter_func = starpu_block_filter_func,
-		.nchildren = PARTS,
-		.get_nchildren = NULL,
-		.get_child_ops = NULL
+		.nchildren = PARTS
 	};
 	starpu_data_partition(handle, &f);
 
@@ -86,6 +86,7 @@ int main(int argc, char **argv)
                 task->cl_arg = &factor;
                 task->cl_arg_size = sizeof(factor);
 		starpu_task_submit(task);
+		starpu_task_destroy(task);
 	}
 
         /* Unpartition the data, unregister it from StarPU and shutdown */
@@ -94,14 +95,14 @@ int main(int argc, char **argv)
 	starpu_shutdown();
 
         /* Print result matrix */
-        fprintf(stderr,"OUT Matrix: \n");
+        FPRINTF(stderr,"OUT Matrix: \n");
         for(j=0 ; j<NY ; j++) {
                 for(i=0 ; i<NX ; i++) {
-                        fprintf(stderr, "%2d ", matrix[(j*NX)+i]);
+                        FPRINTF(stderr, "%2d ", matrix[(j*NX)+i]);
                 }
-                fprintf(stderr,"\n");
+                FPRINTF(stderr,"\n");
         }
-        fprintf(stderr,"\n");
+        FPRINTF(stderr,"\n");
 
 	return 0;
 }

+ 11 - 10
examples/filters/fvector.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -19,6 +19,8 @@
 #define NX    21
 #define PARTS 3
 
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
 void cpu_func(void *buffers[], void *cl_arg)
 {
         unsigned i;
@@ -47,9 +49,9 @@ int main(int argc, char **argv)
         };
 
         for(i=0 ; i<NX ; i++) vector[i] = i;
-        fprintf(stderr,"IN  Vector: ");
-        for(i=0 ; i<NX ; i++) fprintf(stderr, "%5d ", vector[i]);
-        fprintf(stderr,"\n");
+        FPRINTF(stderr,"IN  Vector: ");
+        for(i=0 ; i<NX ; i++) FPRINTF(stderr, "%5d ", vector[i]);
+        FPRINTF(stderr,"\n");
 
 	starpu_init(NULL);
 
@@ -60,9 +62,7 @@ int main(int argc, char **argv)
 	struct starpu_data_filter f =
 	{
 		.filter_func = starpu_block_filter_func_vector,
-		.nchildren = PARTS,
-		.get_nchildren = NULL,
-		.get_child_ops = NULL
+		.nchildren = PARTS
 	};
 	starpu_data_partition(handle, &f);
 
@@ -81,15 +81,16 @@ int main(int argc, char **argv)
                 task->cl_arg_size = sizeof(factor);
 
 		starpu_task_submit(task);
+		starpu_task_destroy(task);
 	}
 
 	starpu_data_unpartition(handle, 0);
         starpu_data_unregister(handle);
 	starpu_shutdown();
 
-        fprintf(stderr,"OUT Vector: ");
-        for(i=0 ; i<NX ; i++) fprintf(stderr, "%5d ", vector[i]);
-        fprintf(stderr,"\n");
+        FPRINTF(stderr,"OUT Vector: ");
+        for(i=0 ; i<NX ; i++) FPRINTF(stderr, "%5d ", vector[i]);
+        FPRINTF(stderr,"\n");
 
 	return 0;
 }

+ 21 - 23
examples/heat/dw_factolu.c

@@ -1,8 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -624,12 +624,12 @@ void dw_codelet_facto(starpu_data_handle dataA, unsigned nblocks)
 	gettimeofday(&end, NULL);
 
 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
-	fprintf(stderr, "Computation took (in ms)\n");
-	printf("%2.2f\n", timing/1000);
+	FPRINTF(stderr, "Computation took (in ms)\n");
+	FPRINTF(stdout, "%2.2f\n", timing/1000);
 
 	unsigned n = starpu_matrix_get_nx(dataA);
 	double flop = (2.0f*n*n*n)/3.0f;
-	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
 }
 
 void dw_codelet_facto_v2(starpu_data_handle dataA, unsigned nblocks)
@@ -666,7 +666,7 @@ void dw_codelet_facto_v2(starpu_data_handle dataA, unsigned nblocks)
 	int ret = starpu_task_submit(task);
 	if (STARPU_UNLIKELY(ret == -ENODEV))
 	{
-		fprintf(stderr, "No worker may execute this task\n");
+		FPRINTF(stderr, "No worker may execute this task\n");
 		exit(0);
 	}
 
@@ -681,12 +681,12 @@ void dw_codelet_facto_v2(starpu_data_handle dataA, unsigned nblocks)
 	gettimeofday(&end, NULL);
 
 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
-	fprintf(stderr, "Computation took (in ms)\n");
-	printf("%2.2f\n", timing/1000);
+	FPRINTF(stderr, "Computation took (in ms)\n");
+	FPRINTF(stdout, "%2.2f\n", timing/1000);
 
 	unsigned n = starpu_matrix_get_nx(dataA);
 	double flop = (2.0f*n*n*n)/3.0f;
-	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
 }
 
 void initialize_system(float **A, float **B, unsigned dim, unsigned pinned)
@@ -697,8 +697,8 @@ void initialize_system(float **A, float **B, unsigned dim, unsigned pinned)
 
 	if (pinned)
 	{
-		starpu_data_malloc_pinned_if_possible((void **)A, (size_t)dim*dim*sizeof(float));
-		starpu_data_malloc_pinned_if_possible((void **)B, (size_t)dim*sizeof(float));
+		starpu_malloc((void **)A, (size_t)dim*dim*sizeof(float));
+		starpu_malloc((void **)B, (size_t)dim*sizeof(float));
 	} 
 	else {
 		*A = malloc((size_t)dim*dim*sizeof(float));
@@ -714,7 +714,7 @@ void dw_factoLU(float *matA, unsigned size,
 {
 
 #ifdef CHECK_RESULTS
-	fprintf(stderr, "Checking results ...\n");
+	FPRINTF(stderr, "Checking results ...\n");
 	float *Asaved;
 	Asaved = malloc((size_t)ld*ld*sizeof(float));
 
@@ -730,17 +730,15 @@ void dw_factoLU(float *matA, unsigned size,
 	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, 
 			size, size, sizeof(float));
 
-	struct starpu_data_filter f;
-		f.filter_func = starpu_vertical_block_filter_func;
-		f.nchildren = nblocks;
-		f.get_nchildren = NULL;
-		f.get_child_ops = NULL;
-
-	struct starpu_data_filter f2;
-		f2.filter_func = starpu_block_filter_func;
-		f2.nchildren = nblocks;
-		f2.get_nchildren = NULL;
-		f2.get_child_ops = NULL;
+	struct starpu_data_filter f = {
+		.filter_func = starpu_vertical_block_filter_func,
+		.nchildren = nblocks
+	};
+
+	struct starpu_data_filter f2 = {
+		.filter_func = starpu_block_filter_func,
+		.nchildren = nblocks
+	};
 
 	starpu_data_map_filters(dataA, 2, &f, &f2);
 

+ 47 - 45
examples/heat/dw_factolu.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -36,6 +36,8 @@
 
 #include "lu_kernels_model.h"
 
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
 #define BLAS3_FLOP(n1,n2,n3)    \
         (2*((uint64_t)n1)*((uint64_t)n2)*((uint64_t)n3))
 
@@ -82,53 +84,53 @@ static void __attribute__ ((unused)) compare_A_LU(float *A, float *LU,
 
 #if 0
 	/* display LU */
-	printf("(LU): \n");
+	FPRINTF(stdout, "(LU): \n");
 	for (j = 0; j < size; j++)
 	{
 		for (i = 0; i < size; i++)
 		{
-//			if (i <= j) {
-				printf("%2.2f\t", LU[j +i*size]);
-//			}
-//			else {
-//				printf(".\t");
-//			}
+/*			if (i <= j) { */
+				FPRINTF(stdout, "%2.2f\t", LU[j +i*size]);
+/*			}
+			else {
+				FPRINTF(stdout, ".\t");
+			} */
 		}
-		printf("\n");
+		FPRINTF(stdout, "\n");
 	}
 
 
 
 	/* display L */
-	printf("L: \n");
+	FPRINTF(stdout, "L: \n");
 	for (j = 0; j < size; j++)
 	{
 		for (i = 0; i < size; i++)
 		{
-//			if (i <= j) {
-				printf("%2.2f\t", L[j +i*size]);
-//			}
-//			else {
-//				printf(".\t");
-//			}
+/*			if (i <= j) { */
+				FPRINTF(stdout, "%2.2f\t", L[j +i*size]);
+/*			}
+			else {
+				FPRINTF(stdout, ".\t");
+			} */
 		}
-		printf("\n");
+		FPRINTF(stdout, "\n");
 	}
 
 	/* display U */
-	printf("U: \n");
+	FPRINTF(stdout, "U: \n");
 	for (j = 0; j < size; j++)
 	{
 		for (i = 0; i < size; i++)
 		{
-//			if (i <= j) {
-				printf("%2.2f\t", U[j +i*size]);
-//			}
-//			else {
-//				printf(".\t");
-//			}
+/*			if (i <= j) { */
+				FPRINTF(stdout, "%2.2f\t", U[j +i*size]);
+/*			}
+			else {
+				FPRINTF(stdout, ".\t");
+			} */
 		}
-		printf("\n");
+		FPRINTF(stdout, "\n");
 	}
 
 #endif
@@ -148,42 +150,42 @@ static void __attribute__ ((unused)) compare_A_LU(float *A, float *LU,
 
 #if 0
 	/* display A */
-	printf("A: \n");
+	FPRINTF(stdout, "A: \n");
 	for (j = 0; j < size; j++)
 	{
 		for (i = 0; i < size; i++)
 		{
-	//		if (i <= j) {
-	      			printf("%2.2f\t", A[j +i*size]);
-	//		}
-	//		else {
-	//			printf(".\t");
-	//		}
+	/*		if (i <= j) { */
+	      			FPRINTF(stdout, "%2.2f\t", A[j +i*size]);
+	/*		}
+			else {
+				FPRINTF(stdout, ".\t");
+			} */
 		}
-		printf("\n");
+		FPRINTF(stdout, "\n");
 	}
 
 
 	/* display LU */
-	printf("LU: \n");
+	FPRINTF(stdout, "LU: \n");
 	for (j = 0; j < size; j++)
 	{
 		for (i = 0; i < size; i++)
 		{
-	//		if (i <= j) {
-	      			printf("%2.2f\t", L[j +i*size]);
-	//		}
-	//		else {
-	//			printf(".\t");
-	//		}
+	/*		if (i <= j) { */
+	      			FPRINTF(stdout, "%2.2f\t", L[j +i*size]);
+	/*		}
+			else {
+				FPRINTF(stdout, ".\t");
+			} */
 		}
-		printf("\n");
+		FPRINTF(stdout, "\n");
 	}
 #endif
 
-	printf("max error between A and L*U = %f \n", max_err);
+	FPRINTF(stdout, "max error between A and L*U = %f \n", max_err);
 }
-#endif // CHECK_RESULTS
+#endif /* CHECK_RESULTS */
 
 void dw_cpu_codelet_update_u11(void **, void *);
 void dw_cpu_codelet_update_u12(void **, void *);
@@ -211,4 +213,4 @@ extern struct starpu_perfmodel_t model_12;
 extern struct starpu_perfmodel_t model_21;
 extern struct starpu_perfmodel_t model_22;
 
-#endif // __DW_FACTO_LU_H__
+#endif /* __DW_FACTO_LU_H__ */

+ 24 - 26
examples/heat/dw_factolu_grain.c

@@ -1,8 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -54,7 +54,7 @@ static starpu_codelet cl11 = {
 
 static struct starpu_task *create_task_11(starpu_data_handle dataA, unsigned k, unsigned tag_prefix)
 {
-//	printf("task 11 k = %d TAG = %llx\n", k, (TAG11(k)));
+/*	FPRINTF(stdout, "task 11 k = %d TAG = %llx\n", k, (TAG11(k))); */
 
 	struct starpu_task *task = create_task(TAG11(k, tag_prefix));
 
@@ -87,7 +87,7 @@ static starpu_codelet cl12 = {
 
 static void create_task_12(starpu_data_handle dataA, unsigned k, unsigned i, unsigned tag_prefix)
 {
-//	printf("task 12 k,i = %d,%d TAG = %llx\n", k,i, TAG12(k,i));
+/*	FPRINTF(stdout, "task 12 k,i = %d,%d TAG = %llx\n", k,i, TAG12(k,i)); */
 
 	struct starpu_task *task = create_task(TAG12(k, i, tag_prefix));
 	
@@ -163,7 +163,7 @@ static starpu_codelet cl22 = {
 
 static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, unsigned j, unsigned tag_prefix)
 {
-//	printf("task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j));
+/*	FPRINTF(stdout, "task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j)); */
 
 	struct starpu_task *task = create_task(TAG22(k, i, j, tag_prefix));
 
@@ -207,17 +207,15 @@ static void dw_factoLU_grain_inner(float *matA, unsigned size, unsigned inner_si
 	unsigned nblocks = size / blocksize;
 	unsigned maxk = inner_size / blocksize;
 
-	struct starpu_data_filter f;
-		f.filter_func = starpu_vertical_block_filter_func;
-		f.nchildren = nblocks;
-		f.get_nchildren = NULL;
-		f.get_child_ops = NULL;
+	struct starpu_data_filter f = {
+		.filter_func = starpu_vertical_block_filter_func,
+		.nchildren = nblocks
+	};
 
-	struct starpu_data_filter f2;
-		f2.filter_func = starpu_block_filter_func;
-		f2.nchildren = nblocks;
-		f2.get_nchildren = NULL;
-		f2.get_child_ops = NULL;
+	struct starpu_data_filter f2 = {
+		.filter_func = starpu_block_filter_func,
+		.nchildren = nblocks
+	};
 
 	starpu_data_map_filters(dataA, 2, &f, &f2);
 
@@ -262,7 +260,7 @@ static void dw_factoLU_grain_inner(float *matA, unsigned size, unsigned inner_si
 	int ret = starpu_task_submit(entry_task);
 	if (STARPU_UNLIKELY(ret == -ENODEV))
 	{
-		fprintf(stderr, "No worker may execute this task\n");
+		FPRINTF(stderr, "No worker may execute this task\n");
 		exit(-1);
 	}
 
@@ -299,13 +297,13 @@ static void dw_factoLU_grain_inner(float *matA, unsigned size, unsigned inner_si
 
 		float *newmatA = &matA[inner_size*(ld+1)];
 
-//		if (tag_prefix < 2)
-//		{
-//			dw_factoLU_grain_inner(newmatA, size-inner_size, (size-inner_size)/2, ld, blocksize/2, tag_prefix+1);
-//		}
-//		else {
+/*		if (tag_prefix < 2)
+		{
+			dw_factoLU_grain_inner(newmatA, size-inner_size, (size-inner_size)/2, ld, blocksize/2, tag_prefix+1);
+		}
+		else { */
 			dw_factoLU_grain_inner(newmatA, size-inner_size, size-inner_size, ld, blocksize/2, tag_prefix+1);
-//		}
+/*		} */
 	}
 
 }
@@ -314,7 +312,7 @@ void dw_factoLU_grain(float *matA, unsigned size, unsigned ld, unsigned nblocks,
 {
 
 #ifdef CHECK_RESULTS
-	fprintf(stderr, "Checking results ...\n");
+	FPRINTF(stderr, "Checking results ...\n");
 	float *Asaved;
 	Asaved = malloc(ld*ld*sizeof(float));
 
@@ -333,12 +331,12 @@ void dw_factoLU_grain(float *matA, unsigned size, unsigned ld, unsigned nblocks,
 	gettimeofday(&end, NULL);
 
 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
-	fprintf(stderr, "Computation took (in ms)\n");
-	printf("%2.2f\n", timing/1000);
+	FPRINTF(stderr, "Computation took (in ms)\n");
+	FPRINTF(stdout, "%2.2f\n", timing/1000);
 
 	unsigned n = size;
 	double flop = (2.0f*n*n*n)/3.0f;
-	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
 
 #ifdef CHECK_RESULTS
 	compare_A_LU(Asaved, matA, size, ld);

+ 17 - 17
examples/heat/dw_factolu_kernels.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -33,7 +33,7 @@ void display_stat_heat(void)
 {
 	unsigned nworkers = starpu_worker_get_count();
 
-	fprintf(stderr, "STATS : \n");
+	FPRINTF(stderr, "STATS : \n");
 
 	unsigned worker;
 	for (worker = 0; worker < nworkers; worker++)
@@ -49,7 +49,7 @@ void display_stat_heat(void)
 		count_22_total += count_22_per_worker[worker];
 	}
 
-	fprintf(stderr, "\t11 (diagonal block LU)\n");
+	FPRINTF(stderr, "\t11 (diagonal block LU)\n");
 	for (worker = 0; worker < nworkers; worker++)
 	{
 		if (count_total_per_worker[worker])
@@ -57,11 +57,11 @@ void display_stat_heat(void)
 			char name[32];
 			starpu_worker_get_name(worker, name, 32);
 			
-			fprintf(stderr, "\t\t%s -> %d / %d (%2.2f %%)\n", name, count_11_per_worker[worker], count_11_total, (100.0*count_11_per_worker[worker])/count_11_total);
+			FPRINTF(stderr, "\t\t%s -> %u / %u (%2.2f %%)\n", name, count_11_per_worker[worker], count_11_total, (100.0*count_11_per_worker[worker])/count_11_total);
 		}
 	}
 
-	fprintf(stderr, "\t12 (TRSM)\n");
+	FPRINTF(stderr, "\t12 (TRSM)\n");
 	for (worker = 0; worker < nworkers; worker++)
 	{
 		if (count_total_per_worker[worker])
@@ -69,12 +69,12 @@ void display_stat_heat(void)
 			char name[32];
 			starpu_worker_get_name(worker, name, 32);
 			
-			fprintf(stderr, "\t\t%s -> %d / %d (%2.2f %%)\n", name, count_12_per_worker[worker], count_12_total, (100.0*count_12_per_worker[worker])/count_12_total);
+			FPRINTF(stderr, "\t\t%s -> %u / %u (%2.2f %%)\n", name, count_12_per_worker[worker], count_12_total, (100.0*count_12_per_worker[worker])/count_12_total);
 		}
 	}
 	
 	
-	fprintf(stderr, "\t21 (TRSM)\n");
+	FPRINTF(stderr, "\t21 (TRSM)\n");
 	for (worker = 0; worker < nworkers; worker++)
 	{
 		if (count_total_per_worker[worker])
@@ -82,11 +82,11 @@ void display_stat_heat(void)
 			char name[32];
 			starpu_worker_get_name(worker, name, 32);
 			
-			fprintf(stderr, "\t\t%s -> %d / %d (%2.2f %%)\n", name, count_21_per_worker[worker], count_21_total, (100.0*count_21_per_worker[worker])/count_21_total);
+			FPRINTF(stderr, "\t\t%s -> %u / %u (%2.2f %%)\n", name, count_21_per_worker[worker], count_21_total, (100.0*count_21_per_worker[worker])/count_21_total);
 		}
 	}
 	
-	fprintf(stderr, "\t22 (SGEMM)\n");
+	FPRINTF(stderr, "\t22 (SGEMM)\n");
 	for (worker = 0; worker < nworkers; worker++)
 	{
 		if (count_total_per_worker[worker])
@@ -94,7 +94,7 @@ void display_stat_heat(void)
 			char name[32];
 			starpu_worker_get_name(worker, name, 32);
 			
-			fprintf(stderr, "\t\t%s -> %d / %d (%2.2f %%)\n", name, count_22_per_worker[worker], count_22_total, (100.0*count_22_per_worker[worker])/count_22_total);
+			FPRINTF(stderr, "\t\t%s -> %u / %u (%2.2f %%)\n", name, count_22_per_worker[worker], count_22_total, (100.0*count_22_per_worker[worker])/count_22_total);
 		}
 	}
 }
@@ -162,7 +162,7 @@ void dw_cublas_codelet_update_u22(void *descr[], void *_args)
 	int id = starpu_worker_get_id();
 	count_22_per_worker[id]++;
 }
-#endif// STARPU_USE_CUDA
+#endif /* STARPU_USE_CUDA */
 
 /*
  * U12
@@ -225,7 +225,7 @@ void dw_cublas_codelet_update_u12(void *descr[], void *_args)
 	int id = starpu_worker_get_id();
 	count_12_per_worker[id]++;
 }
-#endif // STARPU_USE_CUDA
+#endif /* STARPU_USE_CUDA */
 
 /* 
  * U21
@@ -298,12 +298,12 @@ static inline void debug_print(float *tab, unsigned ld, unsigned n)
 	{
 		for (i = 0; i < n; i++)
 		{
-			fprintf(stderr, "%2.2f\t", tab[(size_t)j+(size_t)i*ld]);
+			FPRINTF(stderr, "%2.2f\t", tab[(size_t)j+(size_t)i*ld]);
 		}
-		fprintf(stderr, "\n");
+		FPRINTF(stderr, "\n");
 	}
 	
-	fprintf(stderr, "\n");
+	FPRINTF(stderr, "\n");
 }
 
 static inline void dw_common_codelet_update_u11(void *descr[], int s, __attribute__((unused)) void *_args) 
@@ -378,4 +378,4 @@ void dw_cublas_codelet_update_u11(void *descr[], void *_args)
 	int id = starpu_worker_get_id();
 	count_11_per_worker[id]++;
 }
-#endif// STARPU_USE_CUDA
+#endif /* STARPU_USE_CUDA */

+ 18 - 20
examples/heat/dw_factolu_tag.c

@@ -1,8 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -56,7 +56,7 @@ static starpu_codelet cl11 = {
 
 static struct starpu_task *create_task_11(starpu_data_handle dataA, unsigned k)
 {
-//	printf("task 11 k = %d TAG = %llx\n", k, (TAG11(k)));
+/*	printf("task 11 k = %d TAG = %llx\n", k, (TAG11(k))); */
 
 	struct starpu_task *task = create_task(TAG11(k));
 
@@ -90,7 +90,7 @@ static starpu_codelet cl12 = {
 
 static void create_task_12(starpu_data_handle dataA, unsigned k, unsigned i)
 {
-//	printf("task 12 k,i = %d,%d TAG = %llx\n", k,i, TAG12(k,i));
+/*	printf("task 12 k,i = %d,%d TAG = %llx\n", k,i, TAG12(k,i)); */
 
 	struct starpu_task *task = create_task(TAG12(k, i));
 	
@@ -166,7 +166,7 @@ static starpu_codelet cl22 = {
 
 static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, unsigned j)
 {
-//	printf("task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j));
+/*	printf("task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j)); */
 
 	struct starpu_task *task = create_task(TAG22(k, i, j));
 
@@ -241,7 +241,7 @@ static void dw_codelet_facto_v3(starpu_data_handle dataA, unsigned nblocks)
 	int ret = starpu_task_submit(entry_task);
 	if (STARPU_UNLIKELY(ret == -ENODEV))
 	{
-		fprintf(stderr, "No worker may execute this task\n");
+		FPRINTF(stderr, "No worker may execute this task\n");
 		exit(-1);
 	}
 
@@ -253,19 +253,19 @@ static void dw_codelet_facto_v3(starpu_data_handle dataA, unsigned nblocks)
 	gettimeofday(&end, NULL);
 
 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
-	fprintf(stderr, "Computation took (in ms)\n");
+	FPRINTF(stderr, "Computation took (in ms)\n");
 	printf("%2.2f\n", timing/1000);
 
 	unsigned n = starpu_matrix_get_nx(dataA);
 	double flop = (2.0f*n*n*n)/3.0f;
-	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
 }
 
 void dw_factoLU_tag(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned _no_prio)
 {
 
 #ifdef CHECK_RESULTS
-	fprintf(stderr, "Checking results ...\n");
+	FPRINTF(stderr, "Checking results ...\n");
 	float *Asaved;
 	Asaved = malloc((size_t)ld*ld*sizeof(float));
 
@@ -280,17 +280,15 @@ void dw_factoLU_tag(float *matA, unsigned size, unsigned ld, unsigned nblocks, u
 	 * one block is now determined by 2 unsigned (i,j) */
 	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(float));
 
-	struct starpu_data_filter f;
-		f.filter_func = starpu_vertical_block_filter_func;
-		f.nchildren = nblocks;
-		f.get_nchildren = NULL;
-		f.get_child_ops = NULL;
-
-	struct starpu_data_filter f2;
-		f2.filter_func = starpu_block_filter_func;
-		f2.nchildren = nblocks;
-		f2.get_nchildren = NULL;
-		f2.get_child_ops = NULL;
+	struct starpu_data_filter f = {
+		.filter_func = starpu_vertical_block_filter_func,
+		.nchildren = nblocks
+	};
+
+	struct starpu_data_filter f2 = {
+		.filter_func = starpu_block_filter_func,
+		.nchildren = nblocks
+	};
 
 	starpu_data_map_filters(dataA, 2, &f, &f2);
 

+ 7 - 6
examples/heat/dw_sparse_cg.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -20,6 +20,7 @@
  */
 
 #include "dw_sparse_cg.h"
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
 
 static struct starpu_task *create_task(starpu_tag_t id)
 {
@@ -298,13 +299,13 @@ void iteration_cg(void *problem)
 {
 	struct cg_problem *pb = problem;
 
-	printf("i : %d (MAX %d)\n\tdelta_new %f (%f)\n", pb->i, MAXITER, pb->delta_new, sqrt(pb->delta_new / pb->size));
+	FPRINTF(stdout, "i : %d (MAX %d)\n\tdelta_new %f (%f)\n", pb->i, MAXITER, pb->delta_new, sqrt(pb->delta_new / pb->size));
 
 	if ((pb->i < MAXITER) && 
 		(pb->delta_new > pb->epsilon) )
 	{
 		if (pb->i % 1000 == 0)
-			printf("i : %d\n\tdelta_new %f (%f)\n", pb->i, pb->delta_new, sqrt(pb->delta_new / pb->size));
+			FPRINTF(stdout, "i : %d\n\tdelta_new %f (%f)\n", pb->i, pb->delta_new, sqrt(pb->delta_new / pb->size));
 
 		pb->i++;
 
@@ -313,8 +314,8 @@ void iteration_cg(void *problem)
 	}
 	else {
 		/* we may stop */
-		printf("We are done ... after %d iterations \n", pb->i - 1);
-		printf("i : %d\n\tdelta_new %2.5f\n", pb->i, pb->delta_new);
+		FPRINTF(stdout, "We are done ... after %d iterations \n", pb->i - 1);
+		FPRINTF(stdout, "i : %d\n\tdelta_new %2.5f\n", pb->i, pb->delta_new);
 		sem_post(pb->sem);
 	}
 }
@@ -353,7 +354,7 @@ void conjugate_gradient(float *nzvalA, float *vecb, float *vecx, uint32_t nnz,
 		ptr_vecq[i] = 0.0f;
 	}
 
-	printf("nrow = %d \n", nrow);
+	FPRINTF(stdout, "nrow = %u \n", nrow);
 
 	/* and register them as well */
 	starpu_vector_data_register(&ds_vecr, 0, (uintptr_t)ptr_vecr, nrow, sizeof(float));

+ 3 - 3
examples/heat/dw_sparse_cg.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -101,7 +101,7 @@ static void __attribute__ ((unused)) print_results(float *result, unsigned size)
 
 	for (i = 0; i < STARPU_MIN(size, 16); i++)
 	{
-		printf("%d -> %f\n", i, result[i]);
+		printf("%u -> %f\n", i, result[i]);
 	}
 }
 
@@ -134,4 +134,4 @@ void iteration_cg(void *problem);
 void conjugate_gradient(float *nzvalA, float *vecb, float *vecx, uint32_t nnz,
 			unsigned nrow, uint32_t *colind, uint32_t *rowptr);
 
-#endif // __DW_SPARSE_CG_H__
+#endif /* __DW_SPARSE_CG_H__ */

+ 0 - 4
examples/heat/dw_sparse_cg_kernels.c

@@ -64,10 +64,8 @@ void cpu_codelet_func_1(void *descr[], __attribute__((unused)) void *arg)
 	float *vecb = (float *)STARPU_VECTOR_GET_PTR(descr[3]);
 
 
-	uint32_t nnz;
 	uint32_t nrow;
 
-	nnz = STARPU_CSR_GET_NNZ(descr[0]);
 	nrow = STARPU_CSR_GET_NROW(descr[0]);
 
 	unsigned row;
@@ -173,10 +171,8 @@ void cpu_codelet_func_4(void *descr[], __attribute__((unused)) void *arg)
 	float *vecd = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
 	float *vecq = (float *)STARPU_VECTOR_GET_PTR(descr[2]);
 
-	uint32_t nnz;
 	uint32_t nrow;
 
-	nnz = STARPU_CSR_GET_NNZ(descr[0]);
 	nrow = STARPU_CSR_GET_NROW(descr[0]);
 
 	unsigned row;

+ 10 - 10
examples/heat/heat.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -321,7 +321,7 @@ static void solve_system(unsigned size, unsigned subsize, float *result, int *Re
 	/* solve the actual problem LU X = B */
         /* solve LX' = Y with X' = UX */
         /* solve UX = X' */
-	fprintf(stderr, "Solving the problem ...\n");
+	FPRINTF(stderr, "Solving the problem ...\n");
 
 	float *savedB;
 	float *LUB;
@@ -360,10 +360,10 @@ static void solve_system(unsigned size, unsigned subsize, float *result, int *Re
 	
 		/* check if LUB is close to the 0 vector */
 		int maxind = ISAMAX(subsize, LUB, 1);
-		fprintf(stderr, "max error (LUX - B) = %e\n",LUB[maxind - 1]);
+		FPRINTF(stderr, "max error (LUX - B) = %e\n",LUB[maxind - 1]);
 
 		float sum = SASUM(subsize, LUB, 1);
-		fprintf(stderr,"avg. error %e\n", sum/subsize);
+		FPRINTF(stderr,"avg. error %e\n", sum/subsize);
 	
 		free(LUB);
 		free(savedB);
@@ -494,10 +494,10 @@ static unsigned long build_neighbour_vector(unsigned long*neighbours, unsigned n
 				if ((former_theta + dtheta) >= 0 && (former_theta + dtheta) <= (int)ntheta )
 				{
 					/* we got a possible neighbour */
-					unsigned node = 
+					unsigned pnode = 
 						NODE_NUMBER((former_theta + dtheta), (former_thick + dthick));
 
-					neighbours[nneighbours++] = TRANSLATEBACK(node);
+					neighbours[nneighbours++] = TRANSLATEBACK(pnode);
 				}
 			}
 		}
@@ -569,10 +569,10 @@ static void build_sparse_stiffness_matrix_B(point *pmesh, float *B, float *Bform
 
 		for (neighbour = 0; neighbour < nneighbours; neighbour++)
 		{
-			unsigned i = neighbours[neighbour]; 
-			if (i >= newsize)
+			unsigned n = neighbours[neighbour]; 
+			if (n >= newsize)
 			{
-				B[j] -= compute_A_value(TRANSLATE(i), TRANSLATE(j), pmesh)*Bformer[TRANSLATE(i)];
+				B[j] -= compute_A_value(TRANSLATE(n), TRANSLATE(j), pmesh)*Bformer[TRANSLATE(n)];
 			}
 		}
 	}
@@ -729,7 +729,7 @@ int main(int argc, char **argv)
 
 		build_dense_stiffness_matrix_A(pmesh, A, newsize, RefArray, RefArrayBack);
 
-		fprintf(stderr, "Problem size : %dx%d (%dx%d) (%ld MB)\n", newsize, newsize, DIM, DIM, ((unsigned long)newsize*newsize*4UL)/(1024*1024));
+		FPRINTF(stderr, "Problem size : %ux%u (%ux%u) (%lu MB)\n", newsize, newsize, DIM, DIM, ((unsigned long)newsize*newsize*4UL)/(1024*1024));
 
 		STARPU_ASSERT(newsize % nblocks == 0);
 

+ 5 - 3
examples/heat/heat.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -24,7 +24,7 @@
 #include <assert.h>
 #include <math.h>
 
-// needed for STARPU_OPENGL_RENDER
+/* needed for STARPU_OPENGL_RENDER */
 #include <starpu_config.h>
 #include <starpu.h>
 
@@ -36,6 +36,8 @@
 #include <GL/glut.h>
 #endif
 
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
 #define X	0
 #define Y	1
 
@@ -66,4 +68,4 @@ void display_stat_heat(void);
 extern void opengl_render(unsigned _ntheta, unsigned _nthick, float *_result, point *_pmesh, int argc_, char **argv_);
 #endif
 
-#endif // __HEAT_H__
+#endif /* __HEAT_H__ */

+ 6 - 6
examples/heat/heat_display.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -133,8 +133,8 @@ static void display(void)
 	float factor = 1.0/amplitude;
 	glScalef (factor, factor, factor);      /* modeling transformation */
 	gluLookAt (xcenter, ycenter, 30.0f, xcenter, ycenter, 0.0f, 0.0f, 1.0f, 0.0f);
-//	printf("factor %f\n", factor);
-	//   glRotatef(-0,0.0,0.0,0.0);
+/*	printf("factor %f\n", factor);
+	   glRotatef(-0,0.0,0.0,0.0); */
 	generate_graph();
 	glFlush ();
 }
@@ -211,7 +211,7 @@ void find_limits(void)
 
 void opengl_render(unsigned _ntheta, unsigned _nthick, float *_result, point *_pmesh, int argc_, char **argv_)
 {
-	fprintf(stderr, "OpenGL rendering ... \n");
+	FPRINTF(stderr, "OpenGL rendering ... \n");
 
 	ntheta = _ntheta;
 	nthick = _nthick;
@@ -236,4 +236,4 @@ void opengl_render(unsigned _ntheta, unsigned _nthick, float *_result, point *_p
 	glutReshapeFunc(reshape);
 	glutMainLoop();
 }
-#endif // STARPU_OPENGL_RENDER
+#endif /* STARPU_OPENGL_RENDER */

+ 14 - 14
examples/heat/lu_kernels_model.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -26,7 +26,7 @@
  *	Number of flops of Gemm 
  */
 
-//#define USE_PERTURBATION	1
+/* #define USE_PERTURBATION	1 */
 
 
 #ifdef USE_PERTURBATION
@@ -58,10 +58,10 @@ double task_12_cost(starpu_buffer_descr *descr)
 
 	n = starpu_matrix_get_nx(descr[0].handle);
 
-//	double cost = ((n*n*n)/1744.695);
+/*	double cost = ((n*n*n)/1744.695); */
 	double cost = ((n*n*n)/3210.80);
 
-	//fprintf(stderr, "task 12 predicts %e\n", cost);
+	/* fprintf(stderr, "task 12 predicts %e\n", cost); */
 	return PERTURBATE(cost);
 }
 
@@ -72,10 +72,10 @@ double task_21_cost(starpu_buffer_descr *descr)
 
 	n = starpu_matrix_get_nx(descr[0].handle);
 
-//	double cost = ((n*n*n)/1744.695);
+/*	double cost = ((n*n*n)/1744.695); */
 	double cost = ((n*n*n)/3691.53);
 
-	//fprintf(stderr, "task 12 predicts %e\n", cost);
+	/* fprintf(stderr, "task 21 predicts %e\n", cost); */
 	return PERTURBATE(cost);
 }
 
@@ -109,7 +109,7 @@ double task_11_cost_cuda(starpu_buffer_descr *descr)
 
 	double cost = ((n*n*n)/1853.7806);
 
-//	printf("CUDA task 11 ; predict %e\n", cost);
+/*	printf("CUDA task 11 ; predict %e\n", cost); */
 	return PERTURBATE(cost);
 }
 
@@ -121,7 +121,7 @@ double task_12_cost_cuda(starpu_buffer_descr *descr)
 
 	double cost = ((n*n*n)/42838.5718);
 
-//	printf("CUDA task 12 ; predict %e\n", cost);
+/*	printf("CUDA task 12 ; predict %e\n", cost); */
 	return PERTURBATE(cost);
 }
 
@@ -134,7 +134,7 @@ double task_21_cost_cuda(starpu_buffer_descr *descr)
 
 	double cost = ((n*n*n)/49208.667);
 
-//	printf("CUDA task 21 ; predict %e\n", cost);
+/*	printf("CUDA task 21 ; predict %e\n", cost); */
 	return PERTURBATE(cost);
 }
 
@@ -150,7 +150,7 @@ double task_22_cost_cuda(starpu_buffer_descr *descr)
 
 	double cost = ((nx*ny*nz)/57523.560);
 
-//	printf("CUDA task 22 ; predict %e\n", cost);
+/*	printf("CUDA task 22 ; predict %e\n", cost); */
 	return PERTURBATE(cost);
 }
 
@@ -168,7 +168,7 @@ double task_11_cost_cpu(starpu_buffer_descr *descr)
 
 	double cost = ((n*n*n)/537.5);
 
-//	printf("CPU task 11 ; predict %e\n", cost);
+/*	printf("CPU task 11 ; predict %e\n", cost); */
 	return PERTURBATE(cost);
 }
 
@@ -180,7 +180,7 @@ double task_12_cost_cpu(starpu_buffer_descr *descr)
 
 	double cost = ((n*n*n)/6668.224);
 
-//	printf("CPU task 12 ; predict %e\n", cost);
+/*	printf("CPU task 12 ; predict %e\n", cost); */
 	return PERTURBATE(cost);
 }
 
@@ -193,7 +193,7 @@ double task_21_cost_cpu(starpu_buffer_descr *descr)
 
 	double cost = ((n*n*n)/6793.8423);
 
-//	printf("CPU task 21 ; predict %e\n", cost);
+/*	printf("CPU task 21 ; predict %e\n", cost); */
 	return PERTURBATE(cost);
 }
 
@@ -209,7 +209,7 @@ double task_22_cost_cpu(starpu_buffer_descr *descr)
 
 	double cost = ((nx*ny*nz)/4203.0175);
 
-//	printf("CPU task 22 ; predict %e\n", cost);
+/*	printf("CPU task 22 ; predict %e\n", cost); */
 	return PERTURBATE(cost);
 }
 

+ 2 - 2
examples/heat/lu_kernels_model.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -20,4 +20,4 @@
 
 #include <starpu.h>
 
-#endif // __LU_KERNELS_MODEL_H__
+#endif /* __LU_KERNELS_MODEL_H__ */

+ 13 - 9
examples/incrementer/incrementer.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -20,6 +20,7 @@
 #include <sys/time.h>
 
 static unsigned niter = 50000;
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
 
 #ifdef STARPU_USE_CUDA
 extern void cuda_codelet(void *descr[], __attribute__ ((unused)) void *_args);
@@ -42,6 +43,9 @@ int main(int argc, char **argv)
 {
 	starpu_init(NULL);
 
+#ifdef STARPU_SLOW_MACHINE
+	niter /= 100;
+#endif
 	if (argc == 2)
 		niter = atoi(argv[1]);
 
@@ -52,7 +56,7 @@ int main(int argc, char **argv)
 			(uintptr_t)&float_array, 4, sizeof(float));
 
 #ifdef STARPU_USE_OPENCL
-        starpu_opencl_load_opencl_from_file("examples/incrementer/incrementer_kernels_opencl_kernel.cl", &opencl_program);
+        starpu_opencl_load_opencl_from_file("examples/incrementer/incrementer_kernels_opencl_kernel.cl", &opencl_program, NULL);
 #endif
 
 	starpu_codelet cl =
@@ -88,7 +92,7 @@ int main(int argc, char **argv)
 		int ret = starpu_task_submit(task);
 		if (STARPU_UNLIKELY(ret == -ENODEV))
 		{
-			fprintf(stderr, "No worker may execute this task\n");
+			FPRINTF(stderr, "No worker may execute this task\n");
 			exit(0);
 		}
 	}
@@ -96,24 +100,24 @@ int main(int argc, char **argv)
 	starpu_task_wait_for_all();
 
 	/* update the array in RAM */
-	starpu_data_acquire(float_array_handle, STARPU_R);
+	starpu_data_unregister(float_array_handle);
 
 	gettimeofday(&end, NULL);
 
-	fprintf(stderr, "array -> %f, %f, %f, %f\n", float_array[0],
+	FPRINTF(stderr, "array -> %f, %f, %f, %f\n", float_array[0],
                 float_array[1], float_array[2], float_array[3]);
 
+	STARPU_ASSERT(float_array[0] == niter);
+
 	if (float_array[0] != float_array[1] + float_array[2] + float_array[3]) {
-		fprintf(stderr, "Incorrect result\n");
+		FPRINTF(stderr, "Incorrect result\n");
 		return 1;
 	}
 
-	starpu_data_release(float_array_handle);
-
 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 +
 					(end.tv_usec - start.tv_usec));
 
-	fprintf(stderr, "%d elems took %lf ms\n", niter, timing/1000);
+	FPRINTF(stderr, "%u elems took %f ms\n", niter, timing/1000);
 
 	starpu_shutdown();
 

+ 2 - 2
examples/incrementer/incrementer_kernels_opencl.c

@@ -21,7 +21,7 @@
 extern struct starpu_opencl_program opencl_program;
 void opencl_codelet(void *descr[], void *_args)
 {
-	float *val = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
+	cl_mem val = (cl_mem)STARPU_VECTOR_GET_PTR(descr[0]);
 	cl_kernel kernel;
 	cl_command_queue queue;
 	cl_event event;
@@ -34,7 +34,7 @@ void opencl_codelet(void *descr[], void *_args)
 	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
 	err = 0;
-	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &val);
+	err = clSetKernelArg(kernel, 0, sizeof(val), &val);
 	if (err) STARPU_OPENCL_REPORT_ERROR(err);
 
 	{

+ 3 - 3
examples/lu/lu_example.c

@@ -137,7 +137,7 @@ void copy_matrix_into_blocks(void)
 	for (bj = 0; bj < nblocks; bj++)
 	for (bi = 0; bi < nblocks; bi++)
 	{
-		starpu_data_malloc_pinned_if_possible((void **)&A_blocks[bi+nblocks*bj], (size_t)blocksize*blocksize*sizeof(TYPE));
+		starpu_malloc((void **)&A_blocks[bi+nblocks*bj], (size_t)blocksize*blocksize*sizeof(TYPE));
 
 		for (j = 0; j < blocksize; j++)
 		for (i = 0; i < blocksize; i++)
@@ -151,7 +151,7 @@ void copy_matrix_into_blocks(void)
 static void init_matrix(void)
 {
 	/* allocate matrix */
-	starpu_data_malloc_pinned_if_possible((void **)&A, (size_t)size*size*sizeof(TYPE));
+	starpu_malloc((void **)&A, (size_t)size*size*sizeof(TYPE));
 	STARPU_ASSERT(A);
 
 	starpu_srand48((long int)time(NULL));
@@ -341,7 +341,7 @@ int main(int argc, char **argv)
 		} else {
 			starpu_bound_compute(&min, NULL, 0);
 			if (min != 0.)
-				FPRINTF(stderr, "theoretical min: %lf ms\n", min);
+				FPRINTF(stderr, "theoretical min: %f ms\n", min);
 		}
 	}
 

+ 9 - 11
examples/lu/xlu.c

@@ -236,17 +236,15 @@ void STARPU_LU(lu_decomposition)(TYPE *matA, unsigned size, unsigned ld, unsigne
 	/* We already enforce deps by hand */
 	starpu_data_set_sequential_consistency_flag(dataA, 0);
 
-	struct starpu_data_filter f;
-		f.filter_func = starpu_vertical_block_filter_func;
-		f.nchildren = nblocks;
-		f.get_nchildren = NULL;
-		f.get_child_ops = NULL;
-
-	struct starpu_data_filter f2;
-		f2.filter_func = starpu_block_filter_func;
-		f2.nchildren = nblocks;
-		f2.get_nchildren = NULL;
-		f2.get_child_ops = NULL;
+	struct starpu_data_filter f = {
+		.filter_func = starpu_vertical_block_filter_func,
+		.nchildren = nblocks
+	};
+
+	struct starpu_data_filter f2 = {
+		.filter_func = starpu_block_filter_func,
+		.nchildren = nblocks
+	};
 
 	starpu_data_map_filters(dataA, 2, &f, &f2);
 

+ 10 - 12
examples/lu/xlu_implicit.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
@@ -143,17 +143,15 @@ void STARPU_LU(lu_decomposition)(TYPE *matA, unsigned size, unsigned ld, unsigne
 	 * one block is now determined by 2 unsigned (i,j) */
 	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(TYPE));
 	
-	struct starpu_data_filter f;
-		f.filter_func = starpu_vertical_block_filter_func;
-		f.nchildren = nblocks;
-		f.get_nchildren = NULL;
-		f.get_child_ops = NULL;
-
-	struct starpu_data_filter f2;
-		f2.filter_func = starpu_block_filter_func;
-		f2.nchildren = nblocks;
-		f2.get_nchildren = NULL;
-		f2.get_child_ops = NULL;
+	struct starpu_data_filter f = {
+		.filter_func = starpu_vertical_block_filter_func,
+		.nchildren = nblocks
+	};
+
+	struct starpu_data_filter f2 = {
+		.filter_func = starpu_block_filter_func,
+		.nchildren = nblocks
+	};
 
 	starpu_data_map_filters(dataA, 2, &f, &f2);
 

+ 10 - 12
examples/lu/xlu_implicit_pivot.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
@@ -189,17 +189,15 @@ void STARPU_LU(lu_decomposition_pivot)(TYPE *matA, unsigned *ipiv, unsigned size
 	 * one block is now determined by 2 unsigned (i,j) */
 	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(TYPE));
 
-	struct starpu_data_filter f;
-		f.filter_func = starpu_vertical_block_filter_func;
-		f.nchildren = nblocks;
-		f.get_nchildren = NULL;
-		f.get_child_ops = NULL;
-
-	struct starpu_data_filter f2;
-		f2.filter_func = starpu_block_filter_func;
-		f2.nchildren = nblocks;
-		f2.get_nchildren = NULL;
-		f2.get_child_ops = NULL;
+	struct starpu_data_filter f = {
+		.filter_func = starpu_vertical_block_filter_func,
+		.nchildren = nblocks
+	};
+
+	struct starpu_data_filter f2 = {
+		.filter_func = starpu_block_filter_func,
+		.nchildren = nblocks
+	};
 
 	starpu_data_map_filters(dataA, 2, &f, &f2);
 

+ 36 - 30
examples/mandelbrot/mandelbrot.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -29,14 +29,15 @@ int use_x11 = 1;
 #endif
 
 int demo = 0;
+static double demozoom = 0.05;
 
 /* NB: The X11 code is inspired from the http://locklessinc.com/articles/mandelbrot/ article */
 
 static int nblocks = 20;
 static int height = 400;
 static int width = 640;
-static int maxIt = 20000; // max number of iteration in the Mandelbrot function
-static int niter = -1; // number of loops in case we don't use X11, -1 means infinite
+static int maxIt = 20000; /* max number of iteration in the Mandelbrot function */
+static int niter = -1; /* number of loops in case we don't use X11, -1 means infinite */
 static int use_spmd = 0;
 
 static double leftX = -0.745;
@@ -233,7 +234,7 @@ static void compute_block_opencl(void *descr[], void *cl_arg)
 {
 	int iby, block_size;
 	double stepX, stepY;
-	int *pcnt; // unused for CUDA tasks
+	int *pcnt; /* unused for CUDA tasks */
 	starpu_unpack_cl_args(cl_arg, &iby, &block_size, &stepX, &stepY, &pcnt);
 
 	cl_mem data = (cl_mem)STARPU_VECTOR_GET_PTR(descr[0]);
@@ -247,15 +248,15 @@ static void compute_block_opencl(void *descr[], void *cl_arg)
 
 	starpu_opencl_load_kernel(&kernel, &queue, &opencl_programs, "mandelbrot_kernel", devid);
 
-	clSetKernelArg(kernel, 0, sizeof(cl_mem), &data);
-	clSetKernelArg(kernel, 1, sizeof(double), &leftX);
-	clSetKernelArg(kernel, 2, sizeof(double), &topY);
-	clSetKernelArg(kernel, 3, sizeof(double), &stepX);
-	clSetKernelArg(kernel, 4, sizeof(double), &stepY);
-	clSetKernelArg(kernel, 5, sizeof(int), &maxIt);
-	clSetKernelArg(kernel, 6, sizeof(int), &iby);
-	clSetKernelArg(kernel, 7, sizeof(int), &block_size);
-	clSetKernelArg(kernel, 8, sizeof(int), &width);
+	clSetKernelArg(kernel, 0, sizeof(data), &data);
+	clSetKernelArg(kernel, 1, sizeof(leftX), &leftX);
+	clSetKernelArg(kernel, 2, sizeof(topY), &topY);
+	clSetKernelArg(kernel, 3, sizeof(stepX), &stepX);
+	clSetKernelArg(kernel, 4, sizeof(stepY), &stepY);
+	clSetKernelArg(kernel, 5, sizeof(maxIt), &maxIt);
+	clSetKernelArg(kernel, 6, sizeof(iby), &iby);
+	clSetKernelArg(kernel, 7, sizeof(block_size), &block_size);
+	clSetKernelArg(kernel, 8, sizeof(width), &width);
 
 	unsigned dim = 16;
 	size_t local[2] = {dim, 1};
@@ -278,7 +279,7 @@ static void compute_block(void *descr[], void *cl_arg)
 
 	int iby, block_size;
 	double stepX, stepY;
-	int *pcnt; // unused for sequential tasks
+	int *pcnt; /* unused for sequential tasks */
 	starpu_unpack_cl_args(cl_arg, &iby, &block_size, &stepX, &stepY, &pcnt);
 
 	unsigned *data = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
@@ -291,7 +292,7 @@ static void compute_block(void *descr[], void *cl_arg)
 		{
 			double cx = leftX + ix * stepX;
 			double cy = topY - iy * stepY;
-			// Z = X+I*Y
+			/* Z = X+I*Y */
 			double x = 0;
 			double y = 0;
 			int it;
@@ -300,13 +301,13 @@ static void compute_block(void *descr[], void *cl_arg)
 				double x2 = x*x;
 				double y2 = y*y;
 
-				// Stop iterations when |Z| > 2
+				/* Stop iterations when |Z| > 2 */
 				if (x2 + y2 > 4.0)
 					break;
 
 				double twoxy = 2.0*x*y;
 
-				// Z = Z^2 + C
+				/* Z = Z^2 + C */
 				x = x2 - y2 + cx;
 				y = twoxy + cy;
 			}
@@ -327,8 +328,8 @@ static void compute_block_spmd(void *descr[], void *cl_arg)
 
 	unsigned *data = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
 
-	int ix, iy; // global coordinates
-	int local_iy; // current line
+	int ix, iy; /* global coordinates */
+	int local_iy; /* current line */
 
 	while (1)
 	{
@@ -342,7 +343,7 @@ static void compute_block_spmd(void *descr[], void *cl_arg)
 		{
 			double cx = leftX + ix * stepX;
 			double cy = topY - iy * stepY;
-			// Z = X+I*Y
+			/* Z = X+I*Y */
 			double x = 0;
 			double y = 0;
 			int it;
@@ -351,13 +352,13 @@ static void compute_block_spmd(void *descr[], void *cl_arg)
 				double x2 = x*x;
 				double y2 = y*y;
 
-				// Stop iterations when |Z| > 2
+				/* Stop iterations when |Z| > 2 */
 				if (x2 + y2 > 4.0)
 					break;
 
 				double twoxy = 2.0*x*y;
 
-				// Z = Z^2 + C
+				/* Z = Z^2 + C */
 				x = x2 - y2 + cx;
 				y = twoxy + cy;
 			}
@@ -396,7 +397,7 @@ static void parse_args(int argc, char **argv)
 	int i;
 	for (i = 1; i < argc; i++) {
 		if (strcmp(argv[i], "-h") == 0) {
-			fprintf(stderr, "Usage: %s [-h] [ -width 800] [-height 600] [-nblocks 16] [-no-x11] [-pos leftx:rightx:bottomy:topy] [-niter 1000] [-spmd]\n", argv[0]);
+			fprintf(stderr, "Usage: %s [-h] [ -width 800] [-height 600] [-nblocks 16] [-no-x11] [-pos leftx:rightx:bottomy:topy] [-niter 1000] [-spmd] [-demo] [-demozoom 0.2]\n", argv[0]);
 			exit(-1);
 		}
 
@@ -434,6 +435,11 @@ static void parse_args(int argc, char **argv)
 
 		}
 
+		if (strcmp(argv[i], "-demozoom") == 0) {
+			/* The zoom factor is the following argument.  Only consume it
+			 * when it is really there: with "-demozoom" as the last option,
+			 * argv[++i] would be the NULL terminator of argv.  demozoom is a
+			 * double, so parse it with strtod rather than strtof. */
+			if (i + 1 < argc)
+				demozoom = strtod(argv[++i], NULL);
+		}
+
 		if (strcmp(argv[i], "-no-x11") == 0) {
 #ifdef STARPU_HAVE_X11
 			use_x11 = 0;
@@ -461,7 +467,7 @@ int main(int argc, char **argv)
 	starpu_init(&conf);
 
 	unsigned *buffer;
-	starpu_data_malloc_pinned_if_possible((void **)&buffer, height*width*sizeof(unsigned));
+	starpu_malloc((void **)&buffer, height*width*sizeof(unsigned));
 
 #ifdef STARPU_HAVE_X11
 	if (use_x11)
@@ -472,7 +478,7 @@ int main(int argc, char **argv)
 	STARPU_ASSERT((height % nblocks) == 0);
 
 #ifdef STARPU_USE_OPENCL
-	starpu_opencl_load_opencl_from_string(mandelbrot_opencl_src, &opencl_programs);
+	starpu_opencl_load_opencl_from_string(mandelbrot_opencl_src, &opencl_programs, NULL);
 #endif
 
 	starpu_data_handle block_handles[nblocks];
@@ -520,24 +526,24 @@ int main(int argc, char **argv)
 
 		for (iby = 0; iby < nblocks; iby++)
 		{
-			starpu_data_acquire(block_handles[iby], STARPU_R);
 #ifdef STARPU_HAVE_X11
 			if (use_x11)
 			{
+				starpu_data_acquire(block_handles[iby], STARPU_R);
 				XPutImage(dpy, win, gc, bitmap,
 					0, iby*block_size,
 					0, iby*block_size,
 					width, block_size);
+				starpu_data_release(block_handles[iby]);
 			}
 #endif
-			starpu_data_release(block_handles[iby]);
 		}
 
 
 		if (demo)
 		{
 			/* Zoom in */
-			double zoom_factor = 0.05;
+			double zoom_factor = demozoom;
 			double widthX = rightX - leftX;
 			double heightY = topY - bottomY;
 
@@ -554,7 +560,7 @@ int main(int argc, char **argv)
 				gettimeofday(&end, NULL);
 				double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
 
-				fprintf(stderr, "Time to generate %d frames : %f s\n", iter, timing/1000000.0);
+				fprintf(stderr, "Time to generate %u frames : %f s\n", iter, timing/1000000.0);
 				fprintf(stderr, "Average FPS: %f\n", ((double)iter*1e+6)/timing);
 
 				/* Reset counters */
@@ -583,7 +589,7 @@ int main(int argc, char **argv)
 	for (iby = 0; iby < nblocks; iby++)
 		starpu_data_unregister(block_handles[iby]);
 
-//	starpu_data_free_pinned_if_possible(buffer);
+/*	starpu_data_free_pinned_if_possible(buffer); */
 
 	starpu_shutdown();
 

+ 36 - 30
examples/matvecmult/matvecmult.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -20,6 +20,8 @@
 #include <pthread.h>
 #include <math.h>
 
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
 #ifdef STARPU_USE_OPENCL
 struct starpu_opencl_program opencl_code;
 void opencl_codelet(void *descr[], __attribute__ ((unused)) void *_args)
@@ -27,9 +29,9 @@ void opencl_codelet(void *descr[], __attribute__ ((unused)) void *_args)
 	cl_kernel kernel;
 	cl_command_queue queue;
 	int id, devid, err, n;
-	float *matrix = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
-	float *vector = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
-	float *mult = (float *)STARPU_VECTOR_GET_PTR(descr[2]);
+	cl_mem matrix = (cl_mem)STARPU_MATRIX_GET_PTR(descr[0]);
+	cl_mem vector = (cl_mem)STARPU_VECTOR_GET_PTR(descr[1]);
+	cl_mem mult = (cl_mem)STARPU_VECTOR_GET_PTR(descr[2]);
 	int nx = STARPU_MATRIX_GET_NX(descr[0]);
 	int ny = STARPU_MATRIX_GET_NY(descr[0]);
 	cl_event event;
@@ -41,11 +43,11 @@ void opencl_codelet(void *descr[], __attribute__ ((unused)) void *_args)
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
         n=0;
-        err = clSetKernelArg(kernel, n++, sizeof(cl_mem), &matrix);
-        err |= clSetKernelArg(kernel, n++, sizeof(cl_mem), &vector);
-        err |= clSetKernelArg(kernel, n++, sizeof(int), (void*)&nx);
-        err |= clSetKernelArg(kernel, n++, sizeof(int), (void*)&ny);
-        err |= clSetKernelArg(kernel, n++, sizeof(cl_mem), &mult);
+        err = clSetKernelArg(kernel, n++, sizeof(matrix), &matrix);
+        err |= clSetKernelArg(kernel, n++, sizeof(vector), &vector);
+        err |= clSetKernelArg(kernel, n++, sizeof(nx), (void*)&nx);
+        err |= clSetKernelArg(kernel, n++, sizeof(ny), (void*)&ny);
+	err |= clSetKernelArg(kernel, n++, sizeof(mult), &mult);
         if (err) STARPU_OPENCL_REPORT_ERROR(err);
 
 	{
@@ -73,9 +75,9 @@ void fillArray(float* pfData, int iSize) {
 void printArray(float* pfData, int iSize) {
     int i;
     for (i = 0; i < iSize; ++i) {
-            fprintf(stderr, "%f ", pfData[i]);
+            FPRINTF(stderr, "%f ", pfData[i]);
     }
-    fprintf(stderr, "\n");
+    FPRINTF(stderr, "\n");
 }
 
 void matVecMult(const float *matrix, const float *vector, int width, int height, float *mult) {
@@ -121,8 +123,8 @@ int main(int argc, char **argv)
                 .nopencl = 1,
 	};
 
-        //int width=1100;
-        //int height=244021;
+        /* int width=1100; */
+        /* int height=244021; */
         int width=20;
         int height=4;
 
@@ -131,8 +133,14 @@ int main(int argc, char **argv)
         unsigned int mem_size_matrix, mem_size_vector, mem_size_mult;
 
 	starpu_data_handle matrix_handle, vector_handle, mult_handle;
+	int ret, submit;
 
-        starpu_init(&conf);
+        ret = starpu_init(&conf);
+	if (STARPU_UNLIKELY(ret == -ENODEV)) {
+                FPRINTF(stderr, "This application requires an OpenCL worker.\n");
+		starpu_shutdown();
+		exit(0);
+	}
 
         mem_size_matrix = width * height * sizeof(float);
         matrix = (float*)malloc(mem_size_matrix);
@@ -157,7 +165,7 @@ int main(int argc, char **argv)
 	starpu_vector_data_register(&mult_handle, 0, (uintptr_t)mult, height, sizeof(float));
 
 #ifdef STARPU_USE_OPENCL
-        starpu_opencl_load_opencl_from_file("examples/matvecmult/matvecmult_kernel.cl", &opencl_code);
+        starpu_opencl_load_opencl_from_file("examples/matvecmult/matvecmult_kernel.cl", &opencl_code, NULL);
 #endif
 
 	cl.where = STARPU_OPENCL;
@@ -177,30 +185,28 @@ int main(int argc, char **argv)
         task->buffers[2].handle = mult_handle;
         task->buffers[2].mode = STARPU_RW;
 
-        int ret = starpu_task_submit(task);
-        if (STARPU_UNLIKELY(ret == -ENODEV)) {
-                fprintf(stderr, "No worker may execute this task. This application requires an OpenCL worker.\n");
-                exit(0);
+        submit = starpu_task_submit(task);
+        if (STARPU_UNLIKELY(submit == -ENODEV)) {
+                FPRINTF(stderr, "No worker may execute this task. This application requires an OpenCL worker.\n");
+	}
+	else {
+		starpu_task_wait_for_all();
 	}
 
-	starpu_task_wait_for_all();
+	starpu_data_unregister(matrix_handle);
+	starpu_data_unregister(vector_handle);
+	starpu_data_unregister(mult_handle);
 
-	/* update the array in RAM */
-        starpu_data_acquire(matrix_handle, STARPU_R);
-        starpu_data_acquire(vector_handle, STARPU_R);
-        starpu_data_acquire(mult_handle, STARPU_R);
+        if (STARPU_LIKELY(submit != -ENODEV)) {
+		int res = compareL2fe(correctResult, mult, height, 1e-6f);
+		FPRINTF(stdout, "TEST %s\n\n", (res == 0) ? "PASSED" : "FAILED !!!");
+	}
 
-        int res = compareL2fe(correctResult, mult, height, 1e-6f);
-        printf("TEST %s\n\n", (res == 0) ? "PASSED" : "FAILED !!!");
 #if 0
         printArray(matrix, width*height);
         printArray(vector, width);
         printArray(mult, height);
 #endif
-        starpu_data_release(matrix_handle);
-        starpu_data_release(vector_handle);
-        starpu_data_release(mult_handle);
-
         starpu_shutdown();
 
 	return 0;

+ 26 - 22
examples/mult/xgemm.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -42,6 +42,8 @@ static unsigned check = 0;
 static TYPE *A, *B, *C;
 static starpu_data_handle A_handle, B_handle, C_handle;
 
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
 static void check_output(void)
 {
 	/* compute C = C - AB */
@@ -52,14 +54,14 @@ static void check_output(void)
 	err = CPU_ASUM(xdim*ydim, C, 1);
 
 	if (err < xdim*ydim*0.001) {
-		fprintf(stderr, "Results are OK\n");
+		FPRINTF(stderr, "Results are OK\n");
 	}
 	else {
 		int max;
 		max = CPU_IAMAX(xdim*ydim, C, 1);
 
-		fprintf(stderr, "There were errors ... err = %f\n", err);
-		fprintf(stderr, "Max error : %e\n", C[max]);
+		FPRINTF(stderr, "There were errors ... err = %f\n", err);
+		FPRINTF(stderr, "Max error : %e\n", C[max]);
 	}
 }
 
@@ -67,9 +69,9 @@ static void init_problem_data(void)
 {
 	unsigned i,j;
 
-	starpu_data_malloc_pinned_if_possible((void **)&A, zdim*ydim*sizeof(TYPE));
-	starpu_data_malloc_pinned_if_possible((void **)&B, xdim*zdim*sizeof(TYPE));
-	starpu_data_malloc_pinned_if_possible((void **)&C, xdim*ydim*sizeof(TYPE));
+	starpu_malloc((void **)&A, zdim*ydim*sizeof(TYPE));
+	starpu_malloc((void **)&B, xdim*zdim*sizeof(TYPE));
+	starpu_malloc((void **)&C, xdim*ydim*sizeof(TYPE));
 
 	/* fill the A and B matrices */
 	for (j=0; j < ydim; j++) {
@@ -100,20 +102,20 @@ static void partition_mult_data(void)
 	starpu_matrix_data_register(&C_handle, 0, (uintptr_t)C, 
 		ydim, ydim, xdim, sizeof(TYPE));
 
-	struct starpu_data_filter f;
-	memset(&f, 0, sizeof(f));
-	f.filter_func = starpu_vertical_block_filter_func;
-	f.nchildren = nslicesx;
+	struct starpu_data_filter vert;
+	memset(&vert, 0, sizeof(vert));
+	vert.filter_func = starpu_vertical_block_filter_func;
+	vert.nchildren = nslicesx;
 		
-	struct starpu_data_filter f2;
-	memset(&f2, 0, sizeof(f2));
-	f2.filter_func = starpu_block_filter_func;
-	f2.nchildren = nslicesy;
+	struct starpu_data_filter horiz;
+	memset(&horiz, 0, sizeof(horiz));
+	horiz.filter_func = starpu_block_filter_func;
+	horiz.nchildren = nslicesy;
 		
-	starpu_data_partition(B_handle, &f);
-	starpu_data_partition(A_handle, &f2);
+	starpu_data_partition(B_handle, &vert);
+	starpu_data_partition(A_handle, &horiz);
 
-	starpu_data_map_filters(C_handle, 2, &f, &f2);
+	starpu_data_map_filters(C_handle, 2, &vert, &horiz);
 }
 
 static void mult_kernel_common(void *descr[], int type)
@@ -145,10 +147,12 @@ static void mult_kernel_common(void *descr[], int type)
 			int block_size = (nyC + worker_size - 1)/worker_size;
 			int new_nyC = STARPU_MIN(nyC, block_size*(rank+1)) - block_size*rank;
 
-			TYPE *new_subA = &subA[block_size*rank];
+			/* Consistency check: nyC must match the NY dimension of descr[1].
+			 * This has to be a comparison; an assignment would overwrite nyC
+			 * and make the assertion depend only on the value being non-zero. */
+			STARPU_ASSERT(nyC == STARPU_MATRIX_GET_NY(descr[1]));
+
+			TYPE *new_subB = &subB[block_size*rank];
 			TYPE *new_subC = &subC[block_size*rank];
 
-			CPU_GEMM("N", "N", nxC, new_nyC, nyA, (TYPE)1.0, new_subA, ldA, subB, ldB, (TYPE)0.0, new_subC, ldC);
+			CPU_GEMM("N", "N", nxC, new_nyC, nyA, (TYPE)1.0, subA, ldA, new_subB, ldB, (TYPE)0.0, new_subC, ldC);
 		}
 	}
 #ifdef STARPU_USE_CUDA
@@ -282,11 +286,11 @@ int main(int argc, char **argv)
 	gettimeofday(&end, NULL);
 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
 
-	fprintf(stderr, "Time: %2.2f ms\n", timing/1000.0);
+	FPRINTF(stderr, "Time: %2.2f ms\n", timing/1000.0);
 
 	double flops = 2.0*((unsigned long)niter)*((unsigned long)xdim)
 				*((unsigned long)ydim)*((unsigned long)zdim);
-	fprintf(stderr, "GFlop/s: %.2f\n", flops/timing/1000.0);
+	FPRINTF(stderr, "GFlop/s: %.2f\n", flops/timing/1000.0);
 
 	starpu_data_unpartition(C_handle, 0);
 	starpu_data_unregister(C_handle);

+ 105 - 0
examples/openmp/vector_scal.c

@@ -0,0 +1,105 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/* gcc build:
+
+   gcc -fopenmp vector_scal.c -o vector_scal $(pkg-config --cflags libstarpu) $(pkg-config --libs libstarpu)
+
+ */
+
+#include <starpu.h>
+#include <stdio.h>
+#include <limits.h>
+
+#define	NX	2048
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
+void scal_cpu_func(void *buffers[], void *_args) {
+	/* CPU kernel: multiply every element of the vector in buffers[0] by the
+	 * float that _args points to, splitting the loop across as many OpenMP
+	 * threads as starpu_combined_worker_get_size() reports. */
+	starpu_vector_interface_t *vec_iface = buffers[0];
+	float *data = (float *)STARPU_VECTOR_GET_PTR(vec_iface);
+	unsigned count = STARPU_VECTOR_GET_NX(vec_iface);
+	float *scale = _args;
+	unsigned idx;
+
+	FPRINTF(stderr, "running task with %d CPUs.\n", starpu_combined_worker_get_size());
+
+#pragma omp parallel for num_threads(starpu_combined_worker_get_size())
+	for (idx = 0; idx < count; idx++)
+		data[idx] = data[idx] * (*scale);
+}
+
+/* Performance model referenced by the codelet below: history-based type,
+ * registered under the symbol "vector_scale_parallel". */
+static struct starpu_perfmodel_t vector_scal_model = {
+	.symbol = "vector_scale_parallel",
+	.type = STARPU_HISTORY_BASED
+};
+
+/* Codelet for the scaling task: CPU-only, fork-join type, one data buffer,
+ * implemented by scal_cpu_func and using vector_scal_model.  max_parallelism
+ * is INT_MAX, i.e. no explicit cap is set here on the parallelism. */
+static starpu_codelet cl = {
+	.cpu_func = scal_cpu_func,
+	.where = STARPU_CPU,
+	.nbuffers = 1,
+	.type = STARPU_FORKJOIN,
+	.max_parallelism = INT_MAX,
+	.model = &vector_scal_model
+};
+
+/* Scale an NX-element vector by a constant factor with one synchronous StarPU
+ * task, printing the first and last elements before and after the scaling.
+ *
+ * Returns 0 on success, 1 when StarPU cannot be initialised or the task cannot
+ * be submitted (in which case the "AFTER" values are not printed, since the
+ * vector was never scaled). */
+int main(int argc, char **argv)
+{
+	struct starpu_conf conf;
+	float vector[NX];
+	unsigned i;
+	int ret;
+
+	for (i = 0; i < NX; i++)
+		vector[i] = (i+1.0f);
+
+	FPRINTF(stderr, "BEFORE: First element was %f\n", vector[0]);
+	FPRINTF(stderr, "BEFORE: Last element was %f\n", vector[NX-1]);
+
+	starpu_conf_init(&conf);
+
+	/* Most OpenMP implementations do not support concurrent parallel
+	 * sections, so only create one big worker */
+	conf.single_combined_worker = 1;
+
+	/* Do not go on with an uninitialised runtime. */
+	ret = starpu_init(&conf);
+	if (ret != 0) {
+		FPRINTF(stderr, "StarPU initialization failed (error %d)\n", ret);
+		return 1;
+	}
+
+	starpu_data_handle vector_handle;
+	starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector, NX, sizeof(vector[0]));
+
+	float factor = 3.14f;
+
+	struct starpu_task *task = starpu_task_create();
+	/* Synchronous task: starpu_task_submit() is expected to return only
+	 * once the task has been executed. */
+	task->synchronous = 1;
+
+	task->cl = &cl;
+
+	task->buffers[0].handle = vector_handle;
+	task->buffers[0].mode = STARPU_RW;
+	task->cl_arg = &factor;
+	task->cl_arg_size = sizeof(factor);
+
+	/* Report a failed submission instead of silently printing an
+	 * unscaled vector; the clean-up below runs in both cases. */
+	ret = starpu_task_submit(task);
+	if (ret != 0)
+		FPRINTF(stderr, "Task submission failed (error %d)\n", ret);
+
+	starpu_data_unregister(vector_handle);
+
+	/* NOTE(review): kept from the original code; confirm that a synchronous
+	 * task obtained from starpu_task_create() is not already destroyed
+	 * automatically by the runtime. */
+	starpu_task_destroy(task);
+
+	/* terminate StarPU, no task can be submitted after */
+	starpu_shutdown();
+
+	if (ret != 0)
+		return 1;
+
+	FPRINTF(stderr, "AFTER: First element is %f\n", vector[0]);
+	FPRINTF(stderr, "AFTER: Last element is %f\n", vector[NX-1]);
+
+	return 0;
+}

+ 78 - 0
examples/opt/Makefile.am

@@ -0,0 +1,78 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
+# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+AM_CFLAGS = $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
+LIBS = $(top_builddir)/src/libstarpu.la $(HWLOC_LIBS) @LIBS@
+AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include
+AM_LDFLAGS = $(STARPU_CUDA_LDFLAGS) $(STARPU_OPENCL_LDFLAGS)
+
+if STARPU_USE_CUDA
+
+NVCCFLAGS += --compiler-options -fno-strict-aliasing  -I$(top_srcdir)/include/ -I$(top_builddir)/include/ -I$(top_srcdir)/examples/ $(HWLOC_CFLAGS) -arch sm_13
+
+.cu.o:
+	$(NVCC) $< -c -o $@ $(NVCCFLAGS)
+
+endif
+
+TESTS	=	$(check_PROGRAMS)
+
+check_PROGRAMS =
+
+examplebindir = $(libdir)/starpu/examples/
+
+examplebin_PROGRAMS =
+
+noinst_HEADERS = 				\
+	pi/SobolQRNG/sobol.h			\
+	pi/SobolQRNG/sobol_gold.h		\
+	pi/SobolQRNG/sobol_gpu.h		\
+	pi/SobolQRNG/sobol_primitives.h
+
+######
+# Pi #
+######
+
+check_PROGRAMS +=				\
+	pi/pi					\
+	pi/pi_redux
+
+examplebin_PROGRAMS +=				\
+	pi/pi					\
+	pi/pi_redux
+
+pi_pi_SOURCES =					\
+	pi/pi.c					\
+	pi/SobolQRNG/sobol_gold.c		\
+	pi/SobolQRNG/sobol_primitives.c
+
+if STARPU_USE_CUDA
+pi_pi_SOURCES +=				\
+	pi/pi_kernel.cu				\
+	pi/SobolQRNG/sobol_gpu.cu
+endif
+
+pi_pi_redux_SOURCES =				\
+	pi/pi_redux.c
+
+if STARPU_USE_CUDA
+pi_pi_redux_SOURCES +=				\
+	pi/pi_redux_kernel.cu
+pi_pi_redux_LDADD =				\
+	$(STARPU_CURAND_LDFLAGS)
+endif
+
+

File diff suppressed because it is too large
+ 50 - 0
examples/opt/pi/SobolQRNG/CforCUDA_SDK_license.txt


+ 60 - 0
examples/opt/pi/SobolQRNG/sobol.h

@@ -0,0 +1,60 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+/*
+ * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual property and 
+ * proprietary rights in and to this software and related documentation and 
+ * any modifications thereto.  Any use, reproduction, disclosure, or distribution 
+ * of this software and related documentation without an express license 
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ * 
+ */
+ 
+ /*
+ * Portions Copyright (c) 1993-2009 NVIDIA Corporation.  All rights reserved.
+ * Portions Copyright (c) 2009 Mike Giles, Oxford University.  All rights reserved.
+ * Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe.  All rights reserved.
+ *
+ * Sobol Quasi-random Number Generator example
+ *
+ * Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
+ * http://people.maths.ox.ac.uk/~gilesm/
+ *
+ * and C code developed by Stephen Joe, University of Waikato, New Zealand
+ * and Frances Kuo, University of New South Wales, Australia
+ * http://web.maths.unsw.edu.au/~fkuo/sobol/
+ *
+ * For theoretical background see:
+ *
+ * P. Bratley and B.L. Fox.
+ * Implementing Sobol's quasirandom sequence generator
+ * http://portal.acm.org/citation.cfm?id=42288
+ * ACM Trans. on Math. Software, 14(1):88-100, 1988
+ *
+ * S. Joe and F. Kuo.
+ * Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
+ * http://portal.acm.org/citation.cfm?id=641879
+ * ACM Trans. on Math. Software, 29(1):49-57, 2003
+ */
+
+#ifndef SOBOL_H
+#define SOBOL_H
+
+/* Number of direction vectors is fixed to 32 */
+#define n_directions 32
+
+#endif

+ 141 - 0
examples/opt/pi/SobolQRNG/sobol_gold.c

@@ -0,0 +1,141 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+/*
+ * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual property and 
+ * proprietary rights in and to this software and related documentation and 
+ * any modifications thereto.  Any use, reproduction, disclosure, or distribution 
+ * of this software and related documentation without an express license 
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ * 
+ */
+ 
+ /*
+ * Portions Copyright (c) 1993-2009 NVIDIA Corporation.  All rights reserved.
+ * Portions Copyright (c) 2009 Mike Giles, Oxford University.  All rights reserved.
+ * Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe.  All rights reserved.
+ *
+ * Sobol Quasi-random Number Generator example
+ *
+ * Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
+ * http://people.maths.ox.ac.uk/~gilesm/
+ *
+ * and C code developed by Stephen Joe, University of Waikato, New Zealand
+ * and Frances Kuo, University of New South Wales, Australia
+ * http://web.maths.unsw.edu.au/~fkuo/sobol/
+ *
+ * For theoretical background see:
+ *
+ * P. Bratley and B.L. Fox.
+ * Implementing Sobol's quasirandom sequence generator
+ * http://portal.acm.org/citation.cfm?id=42288
+ * ACM Trans. on Math. Software, 14(1):88-100, 1988
+ *
+ * S. Joe and F. Kuo.
+ * Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
+ * http://portal.acm.org/citation.cfm?id=641879
+ * ACM Trans. on Math. Software, 29(1):49-57, 2003
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+
+#include "sobol.h"
+#include "sobol_gold.h"
+#include "sobol_primitives.h"
+
+#define k_2powneg32 2.3283064E-10F
+
+/* Create the direction numbers, based on the primitive polynomials.
+ *
+ * n_dimensions: number of Sobol dimensions to initialise.
+ * directions:   output array; the n_directions direction numbers of dimension
+ *               dim are written at directions[dim * n_directions], so the
+ *               array must hold n_dimensions * n_directions entries.
+ *
+ * Every dimension but the first reads its polynomial (degree, coefficients a
+ * and initial values m) from sobol_primitives[dim]. */
+void initSobolDirectionVectors(int n_dimensions, unsigned int *directions)
+{
+    unsigned int *v = directions;
+
+    int dim;
+    for (dim = 0 ; dim < n_dimensions ; dim++)
+    {
+        /* First dimension is a special case */
+        if (dim == 0)
+        {
+            int i;
+            for (i = 0 ; i < n_directions ; i++)
+            {
+                /* All m's are 1. The constant is unsigned on purpose:
+                   shifting a signed 1 into bit 31 (i == 0) overflows int,
+                   which is undefined behaviour. */
+                v[i] = 1U << (31 - i);
+            }
+        }
+        else
+        {
+            int d = sobol_primitives[dim].degree;
+            /* The first direction numbers (up to the degree of the polynomial)
+               are simply v[i] = m[i] / 2^i (stored in Q0.32 format) */
+            int i;
+            for (i = 0 ; i < d ; i++)
+            {
+                v[i] = sobol_primitives[dim].m[i] << (31 - i);
+            }
+            /* The remaining direction numbers are computed as described in
+               the Bratley and Fox paper. */
+            /* v[i] = a[1]v[i-1] ^ a[2]v[i-2] ^ ... ^ a[d-1]v[i-d+1] ^ v[i-d] ^ v[i-d]/2^d */
+            for (i = d ; i < n_directions ; i++)
+            {
+                /* First do the v[i-d] ^ v[i-d]/2^d part */
+                v[i] = v[i - d] ^ (v[i - d] >> d);
+                /* Now do the a[1]v[i-1] ^ a[2]v[i-2] ^ ... part
+                   Note that the coefficients a[] are zero or one and for compactness in
+                   the input tables they are stored as bits of a single integer. To extract
+                   the relevant bit we use right shift and mask with 1.
+                   For example, for a 10 degree polynomial there are ten useful bits in a,
+                   so to get a[2] we need to right shift 7 times (to get the 8th bit into
+                   the LSB) and then mask with 1. */
+                int j;
+                for (j = 1 ; j < d ; j++)
+                {
+                    v[i] ^= (((sobol_primitives[dim].a >> (d - 1 - j)) & 1) * v[i - j]);
+                }
+            }
+        }
+        /* Move on to the slot of the next dimension */
+        v += n_directions;
+    }
+}
+
+/* Host-side reference generator.
+ *
+ * For each of the n_dimensions dimensions, writes n_vectors values into
+ * output, one dimension after the other (dimension dim starts at
+ * output[n_vectors * dim]). The direction numbers of dimension dim are read
+ * from directions[dim * n_directions]. */
+void sobolCPU(int n_vectors, int n_dimensions, unsigned int *directions, float *output)
+{
+    int dim;
+    for (dim = 0 ; dim < n_dimensions ; dim++)
+    {
+        /* Direction numbers and output slot of the current dimension */
+        unsigned int *dim_directions = directions + dim * n_directions;
+        float *dim_output = output + n_vectors * dim;
+
+        /* Running integer state; the first value is zero in all dimensions */
+        unsigned int state = 0;
+        dim_output[0] = 0.0;
+
+        int idx;
+        for (idx = 1 ; idx < n_vectors ; idx++)
+        {
+            /* Equation (**) of the Bratley and Fox paper:
+               x[idx] = x[idx-1] ^ v[c], with c the position of the lowest
+               zero bit of idx - 1 (ffs counts from one, arrays from zero) */
+            int c = ffs(~(idx - 1)) - 1;
+            state ^= dim_directions[c];
+            dim_output[idx] = (float)state * k_2powneg32;
+        }
+    }
+}

+ 61 - 0
examples/opt/pi/SobolQRNG/sobol_gold.h

@@ -0,0 +1,61 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+/*
+ * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual property and 
+ * proprietary rights in and to this software and related documentation and 
+ * any modifications thereto.  Any use, reproduction, disclosure, or distribution 
+ * of this software and related documentation without an express license 
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ * 
+ */
+ 
+ /*
+ * Portions Copyright (c) 1993-2009 NVIDIA Corporation.  All rights reserved.
+ * Portions Copyright (c) 2009 Mike Giles, Oxford University.  All rights reserved.
+ * Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe.  All rights reserved.
+ *
+ * Sobol Quasi-random Number Generator example
+ *
+ * Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
+ * http://people.maths.ox.ac.uk/~gilesm/
+ *
+ * and C code developed by Stephen Joe, University of Waikato, New Zealand
+ * and Frances Kuo, University of New South Wales, Australia
+ * http://web.maths.unsw.edu.au/~fkuo/sobol/
+ *
+ * For theoretical background see:
+ *
+ * P. Bratley and B.L. Fox.
+ * Implementing Sobol's quasirandom sequence generator
+ * http://portal.acm.org/citation.cfm?id=42288
+ * ACM Trans. on Math. Software, 14(1):88-100, 1988
+ *
+ * S. Joe and F. Kuo.
+ * Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
+ * http://portal.acm.org/citation.cfm?id=641879
+ * ACM Trans. on Math. Software, 29(1):49-57, 2003
+ *
+ */
+
+#ifndef SOBOL_GOLD_H
+#define SOBOL_GOLD_H
+
+/* Fill directions with the direction numbers of n_dimensions dimensions
+ * (n_directions entries per dimension, one dimension after the other). */
+void initSobolDirectionVectors(int n_dimensions, unsigned int *directions);
+/* Host reference generator: writes n_vectors values per dimension into
+ * output, one dimension after the other, using the given direction numbers. */
+void sobolCPU(int n_vectors, int n_dimensions, unsigned int *directions, float *output);
+
+#endif

+ 170 - 0
examples/opt/pi/SobolQRNG/sobol_gpu.cu

@@ -0,0 +1,170 @@
+/*
+ * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual property and 
+ * proprietary rights in and to this software and related documentation and 
+ * any modifications thereto.  Any use, reproduction, disclosure, or distribution 
+ * of this software and related documentation without an express license 
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ * 
+ */
+ 
+ /*
+ * Portions Copyright (c) 1993-2009 NVIDIA Corporation.  All rights reserved.
+ * Portions Copyright (c) 2009 Mike Giles, Oxford University.  All rights reserved.
+ * Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe.  All rights reserved.
+ *
+ * Sobol Quasi-random Number Generator example
+ *
+ * Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
+ * http://people.maths.ox.ac.uk/~gilesm/
+ *
+ * and C code developed by Stephen Joe, University of Waikato, New Zealand
+ * and Frances Kuo, University of New South Wales, Australia
+ * http://web.maths.unsw.edu.au/~fkuo/sobol/
+ *
+ * For theoretical background see:
+ *
+ * P. Bratley and B.L. Fox.
+ * Implementing Sobol's quasirandom sequence generator
+ * http://portal.acm.org/citation.cfm?id=42288
+ * ACM Trans. on Math. Software, 14(1):88-100, 1988
+ *
+ * S. Joe and F. Kuo.
+ * Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
+ * http://portal.acm.org/citation.cfm?id=641879
+ * ACM Trans. on Math. Software, 29(1):49-57, 2003
+ *
+ */
+
+#include "sobol.h"
+#include "sobol_gpu.h"
+#include <starpu.h>
+#include <starpu_cuda.h>
+
+#define k_2powneg32 2.3283064E-10F
+
+// Device-side Sobol generator.
+//
+// The block y coordinate selects the dimension: it offsets both d_directions
+// (n_directions entries per dimension) and d_output (n_vectors entries per
+// dimension). Within a dimension, each thread computes the vector of index
+// i0 = threadIdx.x + blockIdx.x * blockDim.x and then every stride-th vector
+// after it, with stride = gridDim.x * blockDim.x.
+//
+// Only the threads with threadIdx.x < n_directions load a direction number
+// into shared memory, so the block presumably needs at least n_directions
+// threads for v[] to be fully initialised.
+// NOTE(review): the code uses __ffs(stride) as log2(stride) and stride - 1 as
+// a bit mask, which looks like it expects stride to be a power of two --
+// confirm against the launch configuration.
+// The n_dimensions parameter is not read in the kernel body.
+__global__ void sobolGPU_kernel(unsigned n_vectors, unsigned n_dimensions, unsigned *d_directions, float *d_output)
+{
+    __shared__ unsigned int v[n_directions];
+
+    // Offset into the correct dimension as specified by the
+    // block y coordinate
+    d_directions = d_directions + n_directions * blockIdx.y;
+    d_output = d_output +  n_vectors * blockIdx.y;
+
+    // Copy the direction numbers for this dimension into shared
+    // memory - there are only 32 direction numbers so only the
+    // first 32 (n_directions) threads need participate.
+    if (threadIdx.x < n_directions)
+    {
+	    v[threadIdx.x] = d_directions[threadIdx.x];
+    }
+    __syncthreads();
+
+    // Set initial index (i.e. which vector this thread is
+    // computing first) and stride (i.e. step to the next vector
+    // for this thread)
+    int i0     = threadIdx.x + blockIdx.x * blockDim.x;
+    int stride = gridDim.x * blockDim.x;
+
+    // Get the gray code of the index
+    // c.f. Numerical Recipes in C, chapter 20
+    // http://www.nrbook.com/a/bookcpdf/c20-2.pdf
+    unsigned int g = i0 ^ (i0 >> 1);
+
+    // Initialisation for first point x[i0]
+    // In the Bratley and Fox paper this is equation (*), where
+    // we are computing the value for x[n] without knowing the
+    // value of x[n-1].
+    unsigned int X = 0;
+    unsigned int mask;
+    for (unsigned int k = 0 ; k < __ffs(stride) - 1 ; k++)
+    {
+        // We want X ^= g_k * v[k], where g_k is one or zero.
+        // We do this by setting a mask with all bits equal to
+        // g_k. In reality we keep shifting g so that g_k is the
+        // LSB of g. This way we avoid multiplication.
+        mask = - (g & 1);
+        X ^= mask & v[k];
+        g = g >> 1;
+    }
+    // Threads whose first index is past the end write nothing
+    if (i0 < n_vectors)
+    {
+        d_output[i0] = (float)X * k_2powneg32;
+    }
+
+    // Now do rest of points, using the stride
+    // Here we want to generate x[i] from x[i-stride] where we
+    // don't have any of the x in between, therefore we have to
+    // revisit the equation (**), this is easiest with an example
+    // so assume stride is 16.
+    // From x[n] to x[n+16] there will be:
+    //   8 changes in the first bit
+    //   4 changes in the second bit
+    //   2 changes in the third bit
+    //   1 change in the fourth
+    //   1 change in one of the remaining bits
+    //
+    // What this means is that in the equation:
+    //   x[n+1] = x[n] ^ v[p]
+    //   x[n+2] = x[n+1] ^ v[q] = x[n] ^ v[p] ^ v[q]
+    //   ...
+    // We will apply xor with v[1] eight times, v[2] four times,
+    // v[3] twice, v[4] once and one other direction number once.
+    // Since two xors cancel out, we can skip even applications
+    // and just apply xor with v[4] (i.e. log2(16)) and with
+    // the current applicable direction number.
+    // Note that all these indices count from 1, so we need to
+    // subtract 1 from them all to account for C arrays counting
+    // from zero.
+    unsigned int v_log2stridem1 = v[__ffs(stride) - 2];
+    unsigned int v_stridemask = stride - 1;
+    for (unsigned int i = i0 + stride ; i < n_vectors ; i += stride)
+    {
+        // x[i] = x[i-stride] ^ v[b] ^ v[c]
+        //  where b is log2(stride) minus 1 for C array indexing
+        //  where c is the index of the rightmost zero bit in i,
+        //  not including the bottom log2(stride) bits, minus 1
+        //  for C array indexing
+        // In the Bratley and Fox paper this is equation (**)
+        X ^= v_log2stridem1 ^ v[__ffs(~((i - stride) | v_stridemask)) - 1];
+        d_output[i] = (float)X * k_2powneg32;
+    }
+}
+
+// Host-side launcher of sobolGPU_kernel.
+//
+// Enqueues the generation of n_vectors values for each of the n_dimensions
+// dimensions on the stream returned by starpu_cuda_get_local_stream(); the
+// call does not synchronise with that stream. d_directions and d_output are
+// handed to the kernel as they are.
+// Nothing is launched when n_vectors or n_dimensions is not positive.
+extern "C"
+void sobolGPU(int n_vectors, int n_dimensions, unsigned int *d_directions, float *d_output)
+{
+    const int threadsperblock = 64;
+
+    // Empty requests would otherwise divide by zero below (n_dimensions == 0)
+    // or ask for a grid without any block (n_vectors == 0).
+    if (n_vectors <= 0 || n_dimensions <= 0)
+    {
+        return;
+    }
+
+    // Set up the execution configuration
+    dim3 dimGrid;
+    dim3 dimBlock;
+
+    // This implementation of the generator outputs all the draws for
+    // one dimension in a contiguous region of memory, followed by the
+    // next dimension and so on.
+    // Therefore all threads within a block will be processing different
+    // vectors from the same dimension. As a result we want the total
+    // number of blocks to be a multiple of the number of dimensions.
+    dimGrid.y = n_dimensions;
+
+    // If the number of dimensions is large then we will set the number
+    // of blocks to equal the number of dimensions (i.e. dimGrid.x = 1)
+    // but if the number of dimensions is small (e.g. less than 32) then
+    // we'll partition the vectors across blocks (as well as threads).
+    // We also need to cap the dimGrid.x where the number of vectors
+    // is too small to be partitioned.
+    dimGrid.x = 1 + 31 / n_dimensions;
+    if (dimGrid.x > (unsigned int)(n_vectors / threadsperblock))
+    {
+        dimGrid.x = (n_vectors + threadsperblock - 1) / threadsperblock;
+    }
+    
+    // Fix the number of threads
+    dimBlock.x = threadsperblock;
+
+    // Execute GPU kernel
+    sobolGPU_kernel<<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>>(n_vectors, n_dimensions, d_directions, d_output);
+}

+ 61 - 0
examples/opt/pi/SobolQRNG/sobol_gpu.h

@@ -0,0 +1,61 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+/*
+ * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual property and 
+ * proprietary rights in and to this software and related documentation and 
+ * any modifications thereto.  Any use, reproduction, disclosure, or distribution 
+ * of this software and related documentation without an express license 
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ * 
+ */
+ 
+ /*
+ * Portions Copyright (c) 1993-2009 NVIDIA Corporation.  All rights reserved.
+ * Portions Copyright (c) 2009 Mike Giles, Oxford University.  All rights reserved.
+ * Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe.  All rights reserved.
+ *
+ * Sobol Quasi-random Number Generator example
+ *
+ * Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
+ * http://people.maths.ox.ac.uk/~gilesm/
+ *
+ * and C code developed by Stephen Joe, University of Waikato, New Zealand
+ * and Frances Kuo, University of New South Wales, Australia
+ * http://web.maths.unsw.edu.au/~fkuo/sobol/
+ *
+ * For theoretical background see:
+ *
+ * P. Bratley and B.L. Fox.
+ * Implementing Sobol's quasirandom sequence generator
+ * http://portal.acm.org/citation.cfm?id=42288
+ * ACM Trans. on Math. Software, 14(1):88-100, 1988
+ *
+ * S. Joe and F. Kuo.
+ * Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
+ * http://portal.acm.org/citation.cfm?id=641879
+ * ACM Trans. on Math. Software, 29(1):49-57, 2003
+ *
+ */
+
+#ifndef SOBOL_GPU_H
+#define SOBOL_GPU_H
+
+extern "C"
+void sobolGPU(int n_vectors, int n_dimensions, unsigned int *d_directions, float *d_output);
+
+#endif

File diff suppressed because it is too large
+ 10271 - 0
examples/opt/pi/SobolQRNG/sobol_primitives.c


+ 75 - 0
examples/opt/pi/SobolQRNG/sobol_primitives.h

@@ -0,0 +1,75 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+/*
+ * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual property and 
+ * proprietary rights in and to this software and related documentation and 
+ * any modifications thereto.  Any use, reproduction, disclosure, or distribution 
+ * of this software and related documentation without an express license 
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ * 
+ */
+ 
+ /*
+ * Portions Copyright (c) 1993-2009 NVIDIA Corporation.  All rights reserved.
+ * Portions Copyright (c) 2009 Mike Giles, Oxford University.  All rights reserved.
+ * Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe.  All rights reserved.
+ *
+ * Sobol Quasi-random Number Generator example
+ *
+ * Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
+ * http://people.maths.ox.ac.uk/~gilesm/
+ *
+ * and C code developed by Stephen Joe, University of Waikato, New Zealand
+ * and Frances Kuo, University of New South Wales, Australia
+ * http://web.maths.unsw.edu.au/~fkuo/sobol/
+ *
+ * For theoretical background see:
+ *
+ * P. Bratley and B.L. Fox.
+ * Implementing Sobol's quasirandom sequence generator
+ * http://portal.acm.org/citation.cfm?id=42288
+ * ACM Trans. on Math. Software, 14(1):88-100, 1988
+ *
+ * S. Joe and F. Kuo.
+ * Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
+ * http://portal.acm.org/citation.cfm?id=641879
+ * ACM Trans. on Math. Software, 29(1):49-57, 2003
+ *
+ */
+
+#ifndef SOBOL_PRIMITIVES_H
+#define SOBOL_PRIMITIVES_H
+
+#define max_m 17
+
+/* Description of one primitive polynomial, i.e. of one entry of the
+   sobol_primitives table. */
+struct primitive
+{
+    /* dimension number of the polynomial (unused) */
+    unsigned int dimension;
+    /* degree of the polynomial */
+    unsigned int degree;
+    /* binary word representing the coefficients, one bit per coefficient */
+    unsigned int a;
+    /* array of m values (at most max_m of them) */
+    unsigned int m[max_m];
+};
+
+extern const struct primitive sobol_primitives[];
+
+#endif

+ 175 - 0
examples/opt/pi/pi.c

@@ -0,0 +1,175 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "SobolQRNG/sobol.h"
+#include "SobolQRNG/sobol_gold.h"
+#include "pi.h"
+#include <sys/time.h>
+
+#ifdef STARPU_USE_CUDA
+void cuda_kernel(void **descr, void *cl_arg);
+#endif
+
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
+/* default value */
+static unsigned ntasks = 1024;
+
+/* CPU implementation of the codelet.
+ *
+ * descr[0]: vector holding the Sobol direction numbers (read).
+ * descr[1]: vector whose first element receives the number of shots, out of
+ *           NSHOT_PER_TASK, that fall within the unit disk (written).
+ * cl_arg is not used. */
+static void cpu_kernel(void *descr[], void *cl_arg)
+{
+	unsigned *directions = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
+	unsigned nx = NSHOT_PER_TASK;
+
+	/* sobolCPU writes n_vectors values for each dimension. Round the
+	 * number of vectors up so that at least 2*nx values are generated:
+	 * with a truncating division the last entries of the y half would be
+	 * read without ever having been written. */
+	unsigned n_vectors = (2*nx + n_dimensions - 1)/n_dimensions;
+
+	TYPE *random_numbers = malloc((size_t)n_vectors*n_dimensions*sizeof(TYPE));
+	STARPU_ASSERT(random_numbers);
+	sobolCPU(n_vectors, n_dimensions, directions, random_numbers);
+
+	/* The first nx values are the abscissas, the next nx the ordinates */
+	TYPE *random_numbers_x = &random_numbers[0];
+	TYPE *random_numbers_y = &random_numbers[nx];
+
+	unsigned current_cnt = 0;
+
+	unsigned i;
+	for (i = 0; i < nx; i++)
+	{
+		TYPE x = random_numbers_x[i];
+		TYPE y = random_numbers_y[i];
+
+		TYPE dist = (x*x + y*y);
+
+		/* The shot is successful when it lands within the unit disk */
+		unsigned success = (dist <= 1.0);
+		current_cnt += success;
+	}
+
+	unsigned *cnt = (unsigned *)STARPU_VECTOR_GET_PTR(descr[1]);
+	*cnt = current_cnt;
+
+	free(random_numbers);
+}
+
+/* Parse the command line: "-ntasks <n>" overrides the default number of
+ * tasks, every other argument is ignored. When the value is missing or does
+ * not start with a number, a message is printed and ntasks is left as is. */
+static void parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++) {
+		if (strcmp(argv[i], "-ntasks") == 0) {
+			/* Without a following argument, argv[++i] would be the
+			 * NULL entry that terminates argv and strtol would
+			 * dereference it. */
+			if (i + 1 >= argc) {
+				FPRINTF(stderr, "-ntasks requires a value, keeping %u\n", ntasks);
+				break;
+			}
+
+			char *argptr;
+			long value = strtol(argv[++i], &argptr, 10);
+
+			/* argptr is left on the argument when no digit was read */
+			if (argptr == argv[i])
+				FPRINTF(stderr, "invalid -ntasks value '%s', keeping %u\n", argv[i], ntasks);
+			else
+				ntasks = value;
+		}
+	}
+}
+
+/* Estimate Pi with ntasks tasks of NSHOT_PER_TASK shots each.
+ *
+ * Every task reads the shared Sobol direction numbers and writes its number
+ * of successful shots into its own element of cnt_array; the counts are then
+ * summed on the host and the estimate, the elapsed time and the throughput
+ * are printed unless STARPU_SSILENT is set. Returns 0. */
+int main(int argc, char **argv)
+{
+	unsigned i;
+
+	parse_args(argc, argv);
+
+	starpu_init(NULL);
+
+	/* Initialize the random number generator */
+	unsigned *sobol_qrng_directions = malloc(n_dimensions*n_directions*sizeof(unsigned));
+	STARPU_ASSERT(sobol_qrng_directions);
+
+	initSobolDirectionVectors(n_dimensions, sobol_qrng_directions);
+
+	/* Any worker may use that array now */
+	starpu_data_handle sobol_qrng_direction_handle;
+	starpu_vector_data_register(&sobol_qrng_direction_handle, 0,
+		(uintptr_t)sobol_qrng_directions, n_dimensions*n_directions, sizeof(unsigned));
+
+	unsigned *cnt_array = malloc(ntasks*sizeof(unsigned));
+	STARPU_ASSERT(cnt_array);
+	starpu_data_handle cnt_array_handle;
+	starpu_vector_data_register(&cnt_array_handle, 0, (uintptr_t)cnt_array, ntasks, sizeof(unsigned));
+
+	/* Use a write-through policy : when the data is modified on an
+	 * accelerator, we know that it will only be modified once and be
+	 * accessed by the CPU later on */
+	starpu_data_set_wt_mask(cnt_array_handle, (1<<0));
+
+	/* One element of cnt_array per task */
+	struct starpu_data_filter f = {
+		.filter_func = starpu_block_filter_func_vector,
+		.nchildren = ntasks
+	};
+	
+	starpu_data_partition(cnt_array_handle, &f);
+
+	static struct starpu_perfmodel_t model = {
+		.type = STARPU_HISTORY_BASED,
+		.symbol = "monte_carlo_pi"
+	};
+
+	struct starpu_codelet_t cl = {
+		.where = STARPU_CPU|STARPU_CUDA,
+		.cpu_func = cpu_kernel,
+#ifdef STARPU_USE_CUDA
+		.cuda_func = cuda_kernel,
+#endif
+		.nbuffers = 2,
+		.model = &model
+	};
+
+	struct timeval start;
+	struct timeval end;
+
+	gettimeofday(&start, NULL);
+
+	for (i = 0; i < ntasks; i++)
+	{
+		struct starpu_task *task = starpu_task_create();
+
+		task->cl = &cl;
+
+		STARPU_ASSERT(starpu_data_get_sub_data(cnt_array_handle, 1, i));
+
+		task->buffers[0].handle = sobol_qrng_direction_handle;
+		task->buffers[0].mode   = STARPU_R;
+		task->buffers[1].handle = starpu_data_get_sub_data(cnt_array_handle, 1, i);
+		task->buffers[1].mode   = STARPU_W;
+
+		int ret = starpu_task_submit(task);
+		STARPU_ASSERT(!ret);
+	}
+
+	starpu_task_wait_for_all();
+
+	/* Get the cnt_array back in main memory */
+	starpu_data_unpartition(cnt_array_handle, 0);
+	starpu_data_unregister(cnt_array_handle);
+
+	/* Count the total number of entries */
+	unsigned long total_cnt = 0;
+	for (i = 0; i < ntasks; i++)
+		total_cnt += cnt_array[i];
+
+	gettimeofday(&end, NULL);
+
+	/* Elapsed time in microseconds */
+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+
+	unsigned long total_shot_cnt = ntasks * NSHOT_PER_TASK;
+
+	/* Total surface : Pi * r^ 2 = Pi*1^2, total square surface : 2^2 = 4, probability to impact the disk: pi/4 */
+	/* Both counters are unsigned long, hence %lu */
+	FPRINTF(stderr, "Pi approximation : %f (%lu / %lu)\n", ((TYPE)total_cnt*4)/(total_shot_cnt), total_cnt, total_shot_cnt);
+	FPRINTF(stderr, "Total time : %f ms\n", timing/1000.0);
+	FPRINTF(stderr, "Speed : %f GShot/s\n", total_shot_cnt/(1e3*timing));
+
+	if (!getenv("STARPU_SSILENT")) starpu_display_codelet_stats(&cl);
+
+	/* The direction numbers are no longer needed by any task: release the
+	 * handle (outside of the timed section) and the host buffers */
+	starpu_data_unregister(sobol_qrng_direction_handle);
+	free(sobol_qrng_directions);
+	free(cnt_array);
+
+	starpu_shutdown();
+
+	return 0;
+}

+ 33 - 0
examples/opt/pi/pi.h

@@ -0,0 +1,33 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __PI_H__
+#define __PI_H__
+
+#include <starpu.h>
+#include <starpu_cuda.h>
+#include <stdio.h>
+
+#define NSHOT_PER_TASK	(16*1024*1024ULL)
+
+#define TYPE	float
+
+/* extern "C" void cuda_kernel(void *descr[], void *cl_arg); */
+
+/* Number of Sobol dimensions handed to the generator. The variable is static,
+ * so every source file including this header gets its own copy. */
+static int n_dimensions = 100;
+
+#endif /* __PI_H__ */

+ 150 - 0
examples/opt/pi/pi_kernel.cu

@@ -0,0 +1,150 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "SobolQRNG/sobol_gpu.h"
+#include "pi.h"
+#include <starpu_cuda.h>
+
+#define MAXNBLOCKS	128
+#define MAXTHREADSPERBLOCK	256
+
+/* Count, for each block, how many of the n points
+ * (random_numbers_x[i], random_numbers_y[i]) satisfy x*x + y*y <= 1.
+ *
+ * Each thread handles the points tid, tid + nthreads, ... and accumulates its
+ * hits in its own slot of the shared array scnt; the slots are then summed
+ * within the block and thread 0 stores the block total in
+ * output_cnt[blockIdx.x].
+ * scnt has MAXTHREADSPERBLOCK slots and is indexed by threadIdx.x, so blocks
+ * are presumably launched with at most MAXTHREADSPERBLOCK threads. */
+static __global__ void monte_carlo(TYPE *random_numbers_x, TYPE *random_numbers_y,
+						unsigned n, unsigned *output_cnt)
+{
+	__shared__ unsigned scnt[MAXTHREADSPERBLOCK];
+
+	/* Index of this thread within the whole grid */
+	const int tid = threadIdx.x + blockIdx.x*blockDim.x;
+
+	/* Total number of threads in the grid */
+	const int nthreads = gridDim.x * blockDim.x;
+
+	/* Blank the shared mem buffer */
+	if (threadIdx.x < MAXTHREADSPERBLOCK)
+		scnt[threadIdx.x] = 0;
+
+	__syncthreads();
+	int ind;
+	for (ind = tid; ind < n; ind += nthreads)
+	{ 
+		TYPE x = random_numbers_x[ind];
+		TYPE y = random_numbers_y[ind];
+		TYPE dist = (x*x + y*y);
+
+		/* Do we have a successful shot ? */
+		unsigned success = (dist <= 1.0f)?1:0;
+
+		scnt[threadIdx.x] += success;
+
+	}
+
+	__syncthreads();
+
+	/* Perform a reduction to compute the sum on each thread within that block */
+
+	/* NB: We assume that the number of threads per block is a power of 2 ! */
+	unsigned s;
+	for (s = blockDim.x/2; s!=0; s>>=1)
+	{
+		if (threadIdx.x < s)
+			scnt[threadIdx.x] += scnt[threadIdx.x + s];
+
+		__syncthreads();
+	}
+
+	/* report the number of successful shots in the block */
+	if (threadIdx.x == 0)
+		output_cnt[blockIdx.x] = scnt[0];
+
+	__syncthreads();
+}
+
+/* Sum the first blockDim.x entries of output_cnt and store the total in *cnt.
+ *
+ * The entries are copied into the shared array accumulator, which has
+ * MAXNBLOCKS slots, and summed with the same halving reduction as in
+ * monte_carlo, so the number of threads is expected to be a power of 2 that
+ * does not exceed MAXNBLOCKS. */
+static __global__ void sum_per_block_cnt(unsigned *output_cnt, unsigned *cnt)
+{
+	__shared__ unsigned accumulator[MAXNBLOCKS];
+
+	/* Load the values from global mem: each thread fetches its own entry
+	 * instead of having every thread copy the whole array */
+	accumulator[threadIdx.x] = output_cnt[threadIdx.x];
+
+	__syncthreads();
+
+	/* Perform a reduction in shared memory */
+	unsigned s;
+	for (s = blockDim.x/2; s!=0; s>>=1)
+	{
+		if (threadIdx.x < s)
+			accumulator[threadIdx.x] += accumulator[threadIdx.x + s];
+
+		__syncthreads();
+	}
+
+	/* Save the result in global memory */
+	if (threadIdx.x == 0)
+		*cnt = accumulator[0];
+}
+
+/* CUDA implementation of the codelet.
+ *
+ * descr[0]: vector holding the Sobol direction numbers (read).
+ * descr[1]: vector whose first element receives the number of shots, out of
+ *           NSHOT_PER_TASK, that fall within the unit disk (written).
+ * cl_arg is not used.
+ *
+ * All the work is enqueued on the stream returned by
+ * starpu_cuda_get_local_stream(), which is synchronised before returning.
+ * CUDA errors are reported through STARPU_CUDA_REPORT_ERROR. */
+extern "C" void cuda_kernel(void *descr[], void *cl_arg)
+{
+	cudaError_t cures;
+
+	unsigned *directions = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
+	unsigned nx = NSHOT_PER_TASK;
+
+	/* sobolGPU produces n_vectors values for each dimension. Round the
+	 * number of vectors up so that at least 2*nx values are generated:
+	 * with a truncating division the last entries of the y half would be
+	 * read without ever having been written. */
+	unsigned n_vectors = (2*nx + n_dimensions - 1)/n_dimensions;
+
+	/* Generate Random numbers */
+	float *random_numbers = NULL;
+	cures = cudaMalloc((void **)&random_numbers, (size_t)n_vectors*n_dimensions*sizeof(float));
+	if (cures)
+		STARPU_CUDA_REPORT_ERROR(cures);
+	STARPU_ASSERT(random_numbers);
+	
+	sobolGPU(n_vectors, n_dimensions, directions, random_numbers);
+	cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
+	if (cures)
+		STARPU_CUDA_REPORT_ERROR(cures);
+
+	/* The first nx values are the abscissas, the next nx the ordinates */
+	TYPE *random_numbers_x = &random_numbers[0];
+	TYPE *random_numbers_y = &random_numbers[nx];
+
+	unsigned *cnt = (unsigned *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	/* How many blocks do we use ? */ 
+	unsigned nblocks = 128; // TODO
+
+	STARPU_ASSERT(nblocks <= MAXNBLOCKS);
+	
+	unsigned *per_block_cnt = NULL;
+	cures = cudaMalloc((void **)&per_block_cnt, nblocks*sizeof(unsigned));
+	if (cures)
+		STARPU_CUDA_REPORT_ERROR(cures);
+
+	STARPU_ASSERT((nx % nblocks) == 0);
+
+	/* How many threads per block ? At most 256, but no more threads than
+	 * there are entries to process per block. */
+	unsigned nthread_per_block = STARPU_MIN(MAXTHREADSPERBLOCK, (nx / nblocks));
+
+	/* each entry of per_block_cnt contains the number of successful shots
+	 * in the corresponding block. */
+	monte_carlo<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(random_numbers_x, random_numbers_y, nx, per_block_cnt);
+
+	/* Note that we do not synchronize between kernel calls because there is an implicit serialization */
+
+	/* compute the total number of successful shots by adding the elements
+	 * of the per_block_cnt array */
+	sum_per_block_cnt<<<1, nblocks, 0, starpu_cuda_get_local_stream()>>>(per_block_cnt, cnt);
+	cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
+	if (cures)
+		STARPU_CUDA_REPORT_ERROR(cures);
+
+	cudaFree(per_block_cnt);
+	cudaFree(random_numbers);
+}

+ 362 - 0
examples/opt/pi/pi_redux.c

@@ -0,0 +1,362 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include <starpu_config.h>
+
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+#define PI	3.14159265358979323846
+
+#if defined(STARPU_USE_CUDA) && !defined(STARPU_HAVE_CURAND)
+#warning CURAND is required to run that example on CUDA devices
+#endif
+
+#ifdef STARPU_HAVE_CURAND
+#include <cuda.h>
+#include <curand.h>
+#include <starpu_cuda.h>
+#endif
+
+#define NSHOT_PER_TASK	(1024*1024)
+
+/* default value */
+/* Number of timed tasks; overridden by the "-ntasks n" option. */
+static unsigned long ntasks = 1024;
+/* Number of untimed tasks submitted before the measurement starts; set to a
+ * non-zero value by the "-warmup" option. */
+static unsigned long ntasks_warmup = 0;
+
+/* Non-zero: the shot counter is accessed in STARPU_REDUX mode, otherwise in
+ * STARPU_RW mode; cleared by the "-noredux" option. */
+static unsigned use_redux = 1;
+/* Set to 1 by the "-warmup" option. */
+static unsigned do_warmup = 0;
+
+/*
+ *	Initialization of the Random Number Generators (RNG)
+ */
+
+#ifdef STARPU_HAVE_CURAND
+/* RNG for the CURAND library */
+static curandGenerator_t curandgens[STARPU_NMAXWORKERS];
+#endif 
+
+/* state for the erand48 function : note the huge padding to avoid false-sharing */
+/* Each worker only uses the entries starting at index PADDING*workerid of the
+ * two arrays below, so the state of two different workers is PADDING elements
+ * apart. */
+#define PADDING	1024
+static unsigned short xsubi[STARPU_NMAXWORKERS*PADDING];
+static struct drand48_data randbuffer[STARPU_NMAXWORKERS*PADDING];
+
+/* Function to initialize the random number generator in the current worker */
+/*
+ * Set up the random number generator of the worker executing this function.
+ * CPU workers seed their slot of randbuffer/xsubi with their worker id; CUDA
+ * workers (only when CURAND is available) create a CURAND generator seeded
+ * with their worker id. Any other kind of worker aborts the program.
+ * The argument is unused.
+ */
+static void init_rng(void *arg __attribute__((unused)))
+{
+#ifdef STARPU_HAVE_CURAND
+	curandStatus_t res;
+#endif
+
+	const int workerid = starpu_worker_get_id();
+
+	/* First entry owned by this worker in the padded state arrays */
+	const int slot = PADDING*workerid;
+	const unsigned short seed16 = (unsigned short)workerid;
+
+	switch (starpu_worker_get_type(workerid)) {
+		case STARPU_CPU_WORKER:
+			/* The worker id is used as the seed */
+			starpu_srand48_r((long int)workerid, &randbuffer[slot]);
+
+			xsubi[slot + 0] = seed16;
+			xsubi[slot + 1] = seed16;
+			xsubi[slot + 2] = seed16;
+			break;
+#ifdef STARPU_HAVE_CURAND
+		case STARPU_CUDA_WORKER:
+
+			/* One pseudo-random generator per CUDA worker */
+			res = curandCreateGenerator(&curandgens[workerid],
+						CURAND_RNG_PSEUDO_DEFAULT);
+			STARPU_ASSERT(res == CURAND_STATUS_SUCCESS);
+
+			/* The worker id is used as the seed here too */
+			res = curandSetPseudoRandomGeneratorSeed(curandgens[workerid],
+							(unsigned long long)workerid);
+			STARPU_ASSERT(res == CURAND_STATUS_SUCCESS);
+			break;
+#endif
+		default:
+			STARPU_ABORT();
+			break;
+	}
+}
+
+/*
+ * Parse the command line.
+ *
+ *   -ntasks n   number of timed tasks (non-negative decimal integer)
+ *   -noredux    access the shot counter in RW mode instead of REDUX mode
+ *   -warmup     submit a few untimed tasks before the measurement
+ *   -h          print the usage message and exit
+ *
+ * A missing or malformed value after -ntasks prints the usage message and
+ * exits instead of handing a NULL or garbage string to strtol/strcmp.
+ */
+static void parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++) {
+		if (strcmp(argv[i], "-ntasks") == 0) {
+			char *argptr;
+			long value;
+
+			/* The option needs a value: do not read past argv */
+			if (i + 1 >= argc)
+				goto usage;
+
+			value = strtol(argv[++i], &argptr, 10);
+			if (argptr == argv[i] || *argptr != '\0' || value < 0)
+				goto usage;
+
+			ntasks = (unsigned long)value;
+
+			/* argv[i] is the value that was just consumed */
+			continue;
+		}
+
+		if (strcmp(argv[i], "-noredux") == 0) {
+			use_redux = 0;
+		}
+
+		if (strcmp(argv[i], "-warmup") == 0) {
+			do_warmup = 1;
+			ntasks_warmup = 8; /* arbitrary number of warmup tasks */
+		}
+
+		if (strcmp(argv[i], "-h") == 0)
+			goto usage;
+	}
+
+	return;
+
+usage:
+	fprintf(stderr, "Usage: %s [-ntasks n] [-noredux] [-warmup] [-h]\n", argv[0]);
+	exit(-1);
+}
+
+/*
+ *	Monte-carlo kernel
+ */
+
+/*
+ * CPU implementation of the Monte-Carlo task: draw NSHOT_PER_TASK points in
+ * [-1,1]^2 with the random number state of the current worker, count those
+ * that fall strictly inside the unit circle and add that count to the
+ * variable found in descr[1]. descr[0] and cl_arg are not used.
+ */
+static void pi_func_cpu(void *descr[], void *cl_arg __attribute__ ((unused)))
+{
+	const int workerid = starpu_worker_get_id();
+
+	/* Random number state owned by this worker */
+	unsigned short *worker_xsub = &xsubi[PADDING*workerid];
+	struct drand48_data *buffer = &randbuffer[PADDING*workerid];
+
+	unsigned long hits = 0;
+
+	int shot;
+	for (shot = 0; shot < NSHOT_PER_TASK; shot++)
+	{
+		double randx, randy;
+
+		starpu_erand48_r(worker_xsub, buffer, &randx);
+		starpu_erand48_r(worker_xsub, buffer, &randy);
+
+		/* Rescale both coordinates to [-1,1] */
+		const double x = 2.0*randx - 1.0;
+		const double y = 2.0*randy - 1.0;
+
+		if (x*x + y*y < 1.0)
+			hits++;
+	}
+
+	/* Accumulate the contribution of this task into the counter */
+	unsigned long *cnt = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[1]);
+	*cnt += hits;
+}
+
+extern void pi_redux_cuda_kernel(float *x, float *y, unsigned n, unsigned long *shot_cnt);
+
+#ifdef STARPU_HAVE_CURAND
+/*
+ * CUDA implementation of the Monte-Carlo task: fill the scratchpad vector
+ * (descr[0]) with 2*NSHOT_PER_TASK uniform random numbers using the CURAND
+ * generator of the current worker, then let pi_redux_cuda_kernel count the
+ * successful shots into the variable found in descr[1]. cl_arg is not used.
+ */
+static void pi_func_cuda(void *descr[], void *cl_arg __attribute__ ((unused)))
+{
+	curandStatus_t res;
+
+	const int workerid = starpu_worker_get_id();
+
+	/* CURAND is a bit silly: it assumes that any error is fatal. Calling
+	 * cudaGetLastError resets the last error value; the returned value is
+	 * deliberately ignored. */
+	(void) cudaGetLastError();
+
+	/* Fill the scratchpad with random numbers. Note that both x and y
+	 * arrays are stored in the same vector. */
+	float *scratchpad_xy = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
+	res = curandGenerateUniform(curandgens[workerid], scratchpad_xy, 2*NSHOT_PER_TASK);
+	STARPU_ASSERT(res == CURAND_STATUS_SUCCESS);
+
+	/* First half: x coordinates, second half: y coordinates */
+	float *x = scratchpad_xy;
+	float *y = scratchpad_xy + NSHOT_PER_TASK;
+
+	unsigned long *shot_cnt = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[1]);
+	pi_redux_cuda_kernel(x, y, NSHOT_PER_TASK, shot_cnt);
+}
+#endif
+
+/*
+ * Codelet of the Monte-Carlo task. It always has a CPU implementation; the
+ * CUDA implementation is only registered when CURAND is available.
+ * The tasks built in main attach two buffers to it: buffer 0 is the
+ * scratchpad vector, buffer 1 is the shot counter.
+ */
+static struct starpu_codelet_t pi_cl = {
+	.where =
+#ifdef STARPU_HAVE_CURAND
+		STARPU_CUDA|
+#endif
+		STARPU_CPU,
+	.cpu_func = pi_func_cpu,
+#ifdef STARPU_HAVE_CURAND
+	.cuda_func = pi_func_cuda,
+#endif
+	.nbuffers = 2,
+	/* no performance model is attached */
+	.model = NULL
+};
+
+/*
+ *	Codelets to implement reduction
+ */
+
+/* CPU side of the initialization codelet: reset the unsigned long found in
+ * descr[0] to 0. cl_arg is not used. */
+static void init_cpu_func(void *descr[], void *cl_arg)
+{
+        unsigned long *counter;
+
+        counter = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[0]);
+        *counter = 0;
+}
+
+#ifdef STARPU_HAVE_CURAND
+/* CUDA side of the initialization codelet: zero the unsigned long found in
+ * descr[0] with cudaMemset, then wait for the device. cl_arg is not used. */
+static void init_cuda_func(void *descr[], void *cl_arg)
+{
+        unsigned long *d_counter;
+
+        d_counter = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[0]);
+
+        cudaMemset(d_counter, 0, sizeof(unsigned long));
+        cudaThreadSynchronize();
+}
+#endif
+
+/*
+ * Codelet that resets a shot counter to zero. main registers it as the
+ * initialization method of the shot counter through
+ * starpu_data_set_reduction_methods. The CUDA implementation is only
+ * registered when CURAND is available. It works on a single buffer.
+ */
+static struct starpu_codelet_t init_codelet = {
+	.where =
+#ifdef STARPU_HAVE_CURAND
+		STARPU_CUDA|
+#endif
+		STARPU_CPU,
+        .cpu_func = init_cpu_func,
+#ifdef STARPU_HAVE_CURAND
+        .cuda_func = init_cuda_func,
+#endif
+        .nbuffers = 1
+};
+
+#ifdef STARPU_HAVE_CURAND
+/* Dummy implementation of the addition of two unsigned longs in CUDA */
+/*
+ * CUDA side of the reduction codelet: add the unsigned long found in
+ * descr[1] to the one found in descr[0]. Both values are copied to the host,
+ * added there, and the sum is copied back into descr[0]. cl_arg is not used.
+ */
+static void redux_cuda_func(void *descr[], void *cl_arg)
+{
+	unsigned long *d_a = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	unsigned long *d_b = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[1]);
+
+	unsigned long h_a, h_b;
+
+	/* Fetch both operands from the device */
+	cudaMemcpy(&h_a, d_a, sizeof(h_a), cudaMemcpyDeviceToHost);
+	cudaMemcpy(&h_b, d_b, sizeof(h_b), cudaMemcpyDeviceToHost);
+
+	h_a += h_b;
+
+	/* Only the first operand is updated */
+	cudaMemcpy(d_a, &h_a, sizeof(h_a), cudaMemcpyHostToDevice);
+}
+#endif
+
+/* CPU side of the reduction codelet: add the unsigned long found in descr[1]
+ * to the one found in descr[0]. cl_arg is not used. */
+static void redux_cpu_func(void *descr[], void *cl_arg)
+{
+	unsigned long *a = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	unsigned long *b = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[1]);
+
+	*a += *b;
+}
+
+/*
+ * Codelet that adds its second buffer into its first one. main registers it
+ * as the reduction method of the shot counter through
+ * starpu_data_set_reduction_methods. The CUDA implementation is only
+ * registered when CURAND is available.
+ */
+static struct starpu_codelet_t redux_codelet = {
+	.where =
+#ifdef STARPU_HAVE_CURAND
+		STARPU_CUDA|
+#endif
+		STARPU_CPU,
+	.cpu_func = redux_cpu_func,
+#ifdef STARPU_HAVE_CURAND
+	.cuda_func = redux_cuda_func,
+#endif
+	.nbuffers = 2
+};
+
+/*
+ *	Main program
+ */
+
+int main(int argc, char **argv)
+{
+	unsigned i;
+
+	parse_args(argc, argv);
+
+	starpu_init(NULL);
+
+	/* Launch a Random Number Generator (RNG) on each worker */
+	starpu_execute_on_each_worker(init_rng, NULL, STARPU_CPU|STARPU_CUDA);
+
+	/* Create a scratchpad data */
+	starpu_data_handle xy_scratchpad_handle;
+	starpu_vector_data_register(&xy_scratchpad_handle, -1, (uintptr_t)NULL,
+		2*NSHOT_PER_TASK, sizeof(float));
+
+	/* Create a variable that will be used to count the number of shots
+	 * that actually hit the unit circle when shooting randomly in
+	 * [-1,1]^2. */
+	unsigned long shot_cnt = 0;
+	starpu_data_handle shot_cnt_handle;
+	starpu_variable_data_register(&shot_cnt_handle, 0,
+			(uintptr_t)&shot_cnt, sizeof(shot_cnt));
+
+	starpu_data_set_reduction_methods(shot_cnt_handle,
+					&redux_codelet, &init_codelet);
+
+	struct timeval start;
+	struct timeval end;
+
+	for (i = 0; i < ntasks_warmup; i++)
+	{
+		struct starpu_task *task = starpu_task_create();
+
+		task->cl = &pi_cl;
+
+		task->buffers[0].handle = xy_scratchpad_handle;
+		task->buffers[0].mode   = STARPU_SCRATCH;
+		task->buffers[1].handle = shot_cnt_handle;
+		task->buffers[1].mode   = use_redux?STARPU_REDUX:STARPU_RW;
+
+		int ret = starpu_task_submit(task);
+		STARPU_ASSERT(!ret);
+	}
+
+
+	gettimeofday(&start, NULL);
+
+	for (i = 0; i < ntasks; i++)
+	{
+		struct starpu_task *task = starpu_task_create();
+
+		task->cl = &pi_cl;
+
+		task->buffers[0].handle = xy_scratchpad_handle;
+		task->buffers[0].mode   = STARPU_SCRATCH;
+		task->buffers[1].handle = shot_cnt_handle;
+		task->buffers[1].mode   = use_redux?STARPU_REDUX:STARPU_RW;
+
+		int ret = starpu_task_submit(task);
+		STARPU_ASSERT(!ret);
+	}
+
+	starpu_data_unregister(shot_cnt_handle);
+
+	gettimeofday(&end, NULL);
+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	/* Total surface : Pi * r^ 2 = Pi*1^2, total square surface : 2^2 = 4,
+	 * probability to impact the disk: pi/4 */
+	unsigned long total = (ntasks + ntasks_warmup)*NSHOT_PER_TASK;
+	double pi_approx = ((double)shot_cnt*4.0)/total;
+
+	FPRINTF(stderr, "Reductions? %s\n", use_redux?"yes":"no");
+	FPRINTF(stderr, "Pi approximation : %f (%ld / %ld)\n", pi_approx, shot_cnt, total);
+	FPRINTF(stderr, "Error %e \n", pi_approx - PI);
+	FPRINTF(stderr, "Total time : %f ms\n", timing/1000.0);
+	FPRINTF(stderr, "Speed : %f GShot/s\n", total/(1e3*timing));
+
+	starpu_shutdown();
+
+	if (abs(pi_approx - PI) > 1.0)
+		return 1;
+
+	return 0;
+}

+ 128 - 0
examples/opt/pi/pi_redux_kernel.cu

@@ -0,0 +1,128 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <starpu_cuda.h>
+
+#define MAXNBLOCKS	128
+#define MAXTHREADSPERBLOCK	256
+
+/*
+ * Count, for each block, how many of the n points (x[i], y[i]) fall inside
+ * the unit circle once both coordinates are rescaled from [0,1] to [-1,1].
+ * The count of block b is written to output_cnt[b].
+ */
+static __global__ void monte_carlo(float *x, float *y, unsigned n, unsigned long *output_cnt)
+{
+	/* One counter per thread of the block */
+	__shared__ unsigned scnt[MAXTHREADSPERBLOCK];
+
+	/* Total number of threads in the grid and global index of this one */
+	const int nthreads = gridDim.x * blockDim.x;
+	const int tid = blockIdx.x*blockDim.x + threadIdx.x;
+
+	/* Blank the shared mem buffer */
+	if (threadIdx.x < MAXTHREADSPERBLOCK)
+		scnt[threadIdx.x] = 0;
+
+	__syncthreads();
+
+	/* Each thread handles the entries tid, tid + nthreads, ... */
+	int ind = tid;
+	while (ind < n)
+	{
+		const float xval = (2.0f * x[ind] - 1.0f);
+		const float yval = (2.0f * y[ind] - 1.0f);
+		const float dist = (xval*xval + yval*yval);
+
+		/* Do we have a successful shot ? */
+		const unsigned long success = (dist <= 1.0f)?1:0;
+
+		scnt[threadIdx.x] += success;
+
+		ind += nthreads;
+	}
+
+	__syncthreads();
+
+	/* Sum the per-thread counters of the block into scnt[0].
+	 * NB: We assume that the number of threads per block is a power of 2 ! */
+	unsigned long half = blockDim.x/2;
+	while (half != 0)
+	{
+		if (threadIdx.x < half)
+			scnt[threadIdx.x] += scnt[threadIdx.x + half];
+
+		__syncthreads();
+
+		half >>= 1;
+	}
+
+	/* report the number of successful shots in the block */
+	if (threadIdx.x == 0)
+		output_cnt[blockIdx.x] = scnt[0];
+
+	__syncthreads();
+}
+
+/*
+ * Add the blockDim.x first entries of output_cnt to *cnt.
+ *
+ * The host code in this file launches this kernel with a single block of
+ * nblocks threads (nblocks <= MAXNBLOCKS), one thread per entry.
+ */
+static __global__ void sum_per_block_cnt(unsigned long *output_cnt, unsigned long *cnt)
+{
+	__shared__ unsigned long accumulator[MAXNBLOCKS];
+
+	/* Load the values from global mem: each thread only fetches its own
+	 * entry instead of every thread copying the whole array. */
+	accumulator[threadIdx.x] = output_cnt[threadIdx.x];
+
+	__syncthreads();
+
+	/* Perform a reduction in shared memory */
+	unsigned s;
+	for (s = blockDim.x/2; s!=0; s>>=1)
+	{
+		if (threadIdx.x < s)
+			accumulator[threadIdx.x] += accumulator[threadIdx.x + s];
+
+		__syncthreads();
+	}
+
+	/* Accumulate the result into the counter in global memory */
+	if (threadIdx.x == 0)
+		*cnt = *cnt + accumulator[0];
+}
+
+/*
+ * Count how many of the n points (x[i], y[i]) hit the unit circle and add
+ * that number to *shot_cnt. x, y and shot_cnt are passed straight to the
+ * kernels, so they have to be device-accessible pointers. n has to be a
+ * multiple of the number of blocks. The function returns once the local
+ * CUDA stream has been synchronized.
+ */
+extern "C" void pi_redux_cuda_kernel(float *x, float *y, unsigned n, unsigned long *shot_cnt)
+{
+	cudaError_t cures;
+
+	/* How many blocks do we use ? */
+	unsigned nblocks = 128; // TODO
+	STARPU_ASSERT(nblocks <= MAXNBLOCKS);
+	STARPU_ASSERT((n % nblocks) == 0);
+
+	/* Temporary per-block counters; the pointer is initialised so that a
+	 * failed allocation is detected instead of using garbage. */
+	unsigned long *per_block_cnt = NULL;
+	cures = cudaMalloc((void **)&per_block_cnt, nblocks*sizeof(unsigned long));
+	if (cures)
+		STARPU_CUDA_REPORT_ERROR(cures);
+	STARPU_ASSERT(per_block_cnt);
+
+	/* How many threads per block ? At most 256, but no more threads than
+	 * there are entries to process per block. */
+	unsigned nthread_per_block = STARPU_MIN(MAXTHREADSPERBLOCK, (n / nblocks));
+
+	/* each entry of per_block_cnt contains the number of successful shots
+	 * in the corresponding block. */
+	monte_carlo<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(x, y, n, per_block_cnt);
+
+	/* Note that we do not synchronize between kernel calls because there is an implicit serialization */
+
+	/* compute the total number of successful shots by adding the elements
+	 * of the per_block_cnt array */
+	sum_per_block_cnt<<<1, nblocks, 0, starpu_cuda_get_local_stream()>>>(per_block_cnt, shot_cnt);
+	cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
+	if (cures)
+		STARPU_CUDA_REPORT_ERROR(cures);
+
+	cudaFree(per_block_cnt);
+}

+ 3 - 3
examples/ppm_downscaler/ppm_downscaler.c

@@ -76,7 +76,7 @@ struct ppm_image *file_to_ppm(char *filename)
 	unsigned i;
 	for (i = 0; i < ppm->ncols*ppm->nlines; i++)
 	{
-//		fprintf(stderr, "READ (index %d) -> r %d g %d b %d\n", i, ppm->data[i].r, ppm->data[i].g, ppm->data[i].b);
+/*		fprintf(stderr, "READ (index %d) -> r %d g %d b %d\n", i, ppm->data[i].r, ppm->data[i].g, ppm->data[i].b); */
 	}
 
 	fclose(file);
@@ -136,7 +136,7 @@ void dummy_downscale(struct ppm_image *input_ppm, struct ppm_image *output_ppm)
 				{
 					unsigned index = (big_col + i)+(big_line + j)*input_ppm->ncols;
 
-//					fprintf(stderr, "(col %d, line %d) i %d j %d index %d -> r %d g %d b %d\n", col, line, i, j, index, in[index].r, in[index].g, in[index].b);
+/*					fprintf(stderr, "(col %d, line %d) i %d j %d index %d -> r %d g %d b %d\n", col, line, i, j, index, in[index].r, in[index].g, in[index].b); */
 
 					sum_r += (unsigned)in[index].r;
 					sum_g += (unsigned)in[index].g;
@@ -148,7 +148,7 @@ void dummy_downscale(struct ppm_image *input_ppm, struct ppm_image *output_ppm)
 			out[col + line*output_ppm->ncols].g = (unsigned char)(sum_g/(FACTOR*FACTOR));
 			out[col + line*output_ppm->ncols].b = (unsigned char)(sum_b/(FACTOR*FACTOR));
 
-//			fprintf(stderr, "col %d line %d -> sum_r = %d out -> %d\n", col, line, sum_r, out[col + line*FACTOR].r);
+/*			fprintf(stderr, "col %d line %d -> sum_r = %d out -> %d\n", col, line, sum_r, out[col + line*FACTOR].r); */
 	
 		}
 	}

+ 7 - 11
examples/ppm_downscaler/yuv_downscaler.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
@@ -92,16 +92,12 @@ static struct starpu_codelet_t ds_codelet = {
 /* each block contains BLOCK_HEIGHT consecutive lines */
 static struct starpu_data_filter filter_y = {
 	.filter_func = starpu_block_filter_func,
-	.nchildren= HEIGHT/BLOCK_HEIGHT,
-	.get_nchildren = NULL,
-	.get_child_ops = NULL
+	.nchildren= HEIGHT/BLOCK_HEIGHT
 };
 	
 static struct starpu_data_filter filter_uv = {
 	.filter_func = starpu_block_filter_func,
-	.nchildren = (HEIGHT/2)/BLOCK_HEIGHT,
-	.get_nchildren = NULL,
-	.get_child_ops = NULL
+	.nchildren = (HEIGHT/2)/BLOCK_HEIGHT
 };
 
 int main(int argc, char **argv)
@@ -111,7 +107,7 @@ int main(int argc, char **argv)
 	
 	parse_args(argc, argv);
 
-//	fprintf(stderr, "Reading input file ...\n");
+/*	fprintf(stderr, "Reading input file ...\n"); */
 
 	/* how many frames ? */
 	struct stat stbuf;
@@ -120,7 +116,7 @@ int main(int argc, char **argv)
 
 	unsigned nframes = filesize/FRAMESIZE; 
 
-//	fprintf(stderr, "filesize %lx (FRAME SIZE %lx NEW SIZE %lx); nframes %d\n", filesize, FRAMESIZE, NEW_FRAMESIZE, nframes);
+/*	fprintf(stderr, "filesize %lx (FRAME SIZE %lx NEW SIZE %lx); nframes %d\n", filesize, FRAMESIZE, NEW_FRAMESIZE, nframes); */
 	assert((filesize % sizeof(struct yuv_frame)) == 0);
 
 	/* fetch input data */
@@ -134,7 +130,7 @@ int main(int argc, char **argv)
 	FILE *f_out = fopen(filename_out, "w+");
 	assert(f_out);
 
-//	fprintf(stderr, "Alloc output file ...\n");
+/*	fprintf(stderr, "Alloc output file ...\n"); */
 	struct yuv_new_frame *yuv_out_buffer = calloc(nframes, NEW_FRAMESIZE);
 	assert(yuv_out_buffer);
 
@@ -199,7 +195,7 @@ int main(int argc, char **argv)
 
 	unsigned ntasks = (nblocks_y + 2*nblocks_uv)*nframes;
 
-	fprintf(stderr, "Start computation: there will be %d tasks for %d frames\n", ntasks, nframes);
+	fprintf(stderr, "Start computation: there will be %u tasks for %u frames\n", ntasks, nframes);
 	gettimeofday(&start, NULL);
 
 	/* do the computation */

+ 10 - 8
examples/profiling/profiling.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -20,6 +20,8 @@
 #include <assert.h>
 #include <unistd.h>
 
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
 static unsigned niter = 500;
 
 void sleep_codelet(__attribute__ ((unused)) void *descr[],
@@ -70,7 +72,7 @@ int main(int argc, char **argv)
 		int ret = starpu_task_submit(task);
 		if (STARPU_UNLIKELY(ret == -ENODEV))
 		{
-			fprintf(stderr, "No worker may execute this task\n");
+			FPRINTF(stderr, "No worker may execute this task\n");
 			exit(0);
 		}
 	}
@@ -97,8 +99,8 @@ int main(int argc, char **argv)
 
 	free(tasks);
 
-	fprintf(stderr, "Avg. delay : %2.2lf us\n", (delay_sum)/niter);
-	fprintf(stderr, "Avg. length : %2.2lf us\n", (length_sum)/niter);
+	FPRINTF(stderr, "Avg. delay : %2.2lf us\n", (delay_sum)/niter);
+	FPRINTF(stderr, "Avg. length : %2.2lf us\n", (length_sum)/niter);
 
 	/* Display the occupancy of all workers during the test */
 	int worker;
@@ -117,10 +119,10 @@ int main(int argc, char **argv)
 
 		char workername[128];
 		starpu_worker_get_name(worker, workername, 128);
-		fprintf(stderr, "Worker %s:\n", workername);
-		fprintf(stderr, "\ttotal time : %.2lf ms\n", total_time*1e-3);
-		fprintf(stderr, "\texec time  : %.2lf ms (%.2f %%)\n", executing_time*1e-3, executing_ratio);
-		fprintf(stderr, "\tblocked time  : %.2lf ms (%.2f %%)\n", sleeping_time*1e-3, sleeping_ratio);
+		FPRINTF(stderr, "Worker %s:\n", workername);
+		FPRINTF(stderr, "\ttotal time : %.2lf ms\n", total_time*1e-3);
+		FPRINTF(stderr, "\texec time  : %.2lf ms (%.2f %%)\n", executing_time*1e-3, executing_ratio);
+		FPRINTF(stderr, "\tblocked time  : %.2lf ms (%.2f %%)\n", sleeping_time*1e-3, sleeping_ratio);
 	}
 
 	starpu_shutdown();

+ 36 - 11
examples/reductions/dot_product.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -22,13 +22,15 @@
 #include <cublas.h>
 #endif
 
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
 static float *x;
 static float *y;
 static starpu_data_handle *x_handles;
 static starpu_data_handle *y_handles;
 
 static unsigned nblocks = 4096;
-static unsigned entries_per_bock = 1024;
+static unsigned entries_per_block = 1024;
 
 #define DOT_TYPE double
 
@@ -75,9 +77,16 @@ void redux_cpu_func(void *descr[], void *cl_arg)
 	*dota = *dota + *dotb;
 }
 
+#ifdef STARPU_USE_CUDA
+extern void redux_cuda_func(void *descr[], void *_args);
+#endif
+
 static struct starpu_codelet_t redux_codelet = {
-	.where = STARPU_CPU,
+	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_func = redux_cpu_func,
+#ifdef STARPU_USE_CUDA
+	.cuda_func = redux_cuda_func,
+#endif
 	.nbuffers = 2
 };
 
@@ -118,11 +127,11 @@ void dot_cuda_func(void *descr[], void *cl_arg)
 
 	cudaMemcpy(&current_dot, dot, sizeof(DOT_TYPE), cudaMemcpyDeviceToHost);
 
-	int ret = cudaThreadSynchronize();
+	cudaThreadSynchronize();
 
 	local_dot = (DOT_TYPE)cublasSdot(n, local_x, 1, local_y, 1);
 
-	//fprintf(stderr, "current_dot %f local dot %f -> %f\n", current_dot, local_dot, current_dot + local_dot);
+	/* FPRINTF(stderr, "current_dot %f local dot %f -> %f\n", current_dot, local_dot, current_dot + local_dot); */
 	current_dot += local_dot;
 
 	cudaThreadSynchronize();
@@ -146,15 +155,13 @@ static struct starpu_codelet_t dot_codelet = {
  *	Tasks initialization
  */
 
-extern void starpu_data_end_reduction_mode(starpu_data_handle handle);
-
 int main(int argc, char **argv)
 {
 	starpu_init(NULL);
 
 	starpu_helper_cublas_init();
 
-	unsigned long nelems = nblocks*entries_per_bock;
+	unsigned long nelems = nblocks*entries_per_block;
 	size_t size = nelems*sizeof(float);
 
 	x = malloc(size);
@@ -182,9 +189,9 @@ int main(int argc, char **argv)
 	for (block = 0; block < nblocks; block++)
 	{
 		starpu_vector_data_register(&x_handles[block], 0,
-			(uintptr_t)&x[entries_per_bock*block], entries_per_bock, sizeof(float));
+			(uintptr_t)&x[entries_per_block*block], entries_per_block, sizeof(float));
 		starpu_vector_data_register(&y_handles[block], 0,
-			(uintptr_t)&y[entries_per_bock*block], entries_per_bock, sizeof(float));
+			(uintptr_t)&y[entries_per_block*block], entries_per_block, sizeof(float));
 	}
 
 	starpu_variable_data_register(&dot_handle, 0, (uintptr_t)&dot, sizeof(DOT_TYPE));
@@ -199,6 +206,7 @@ int main(int argc, char **argv)
 		struct starpu_task *task = starpu_task_create();
 
 		task->cl = &dot_codelet;
+		task->destroy = 1;
 
 		task->buffers[0].handle = x_handles[block];
 		task->buffers[0].mode = STARPU_R;
@@ -208,16 +216,33 @@ int main(int argc, char **argv)
 		task->buffers[2].mode = STARPU_REDUX;
 
 		int ret = starpu_task_submit(task);
+		if (ret == -ENODEV) goto enodev;
 		STARPU_ASSERT(!ret);
 	}
 
+	for (block = 0; block < nblocks; block++)
+	{
+		starpu_data_unregister(x_handles[block]);
+		starpu_data_unregister(y_handles[block]);
+	}
 	starpu_data_unregister(dot_handle);
 
-	fprintf(stderr, "Reference : %e vs. %e (Delta %e)\n", reference_dot, dot, reference_dot - dot);
+	FPRINTF(stderr, "Reference : %e vs. %e (Delta %e)\n", reference_dot, dot, reference_dot - dot);
 
 	starpu_helper_cublas_shutdown();
 
 	starpu_shutdown();
 
+	free(x);
+	free(y);
+	free(x_handles);
+	free(y_handles);
+
 	return 0;
+
+enodev:
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
+	return 77;
 }

+ 9 - 3
examples/reductions/minmax_reduction.c

@@ -22,6 +22,8 @@
 static unsigned nblocks = 8192;
 static unsigned entries_per_bock = 1024;
 
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
 #define TYPE		double
 #define TYPE_MAX	DBL_MAX
 #define TYPE_MIN	DBL_MIN
@@ -171,15 +173,19 @@ int main(int argc, char **argv)
 		if (ret)
 		{
 			STARPU_ASSERT(ret == -ENODEV);
-			fprintf(stderr, "This test can only run on CPUs, but there are no CPU workers (this is not a bug).\n");
+			FPRINTF(stderr, "This test can only run on CPUs, but there are no CPU workers (this is not a bug).\n");
 			return 0;
 		}
 	}
 
+	for (block = 0; block < nblocks; block++)
+	{
+		starpu_data_unregister(x_handles[block]);
+	}
 	starpu_data_unregister(minmax_handle);
 
-	fprintf(stderr, "Min : %e\n", minmax[0]);
-	fprintf(stderr, "Max : %e\n", minmax[1]);
+	FPRINTF(stderr, "Min : %e\n", minmax[0]);
+	FPRINTF(stderr, "Max : %e\n", minmax[1]);
 
 	STARPU_ASSERT(minmax[0] <= minmax[1]);
 

+ 12 - 6
examples/scheduler/dummy_sched.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -19,6 +19,7 @@
 #include <starpu.h>
 
 #define NTASKS	32000
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
 
 struct starpu_task_list sched_list;
 
@@ -38,7 +39,7 @@ static void init_dummy_sched(struct starpu_machine_topology_s *topology,
 	for (workerid = 0; workerid < topology->nworkers; workerid++)
 		starpu_worker_set_sched_condition(workerid, &sched_cond, &sched_mutex);
 
-	fprintf(stderr, "Initialising Dummy scheduler\n");
+	FPRINTF(stderr, "Initialising Dummy scheduler\n");
 }
 
 static void deinit_dummy_sched(struct starpu_machine_topology_s *topology,
@@ -49,7 +50,7 @@ static void deinit_dummy_sched(struct starpu_machine_topology_s *topology,
 	pthread_cond_destroy(&sched_cond);
 	pthread_mutex_destroy(&sched_mutex);
 
-	fprintf(stderr, "Destroying Dummy scheduler\n");
+	FPRINTF(stderr, "Destroying Dummy scheduler\n");
 }
 
 static int push_task_dummy(struct starpu_task *task)
@@ -80,7 +81,6 @@ static struct starpu_sched_policy_s dummy_sched_policy = {
 	.init_sched = init_dummy_sched,
 	.deinit_sched = deinit_dummy_sched,
 	.push_task = push_task_dummy,
-	.push_prio_task = NULL,
 	.pop_task = pop_task_dummy,
 	.post_exec_hook = NULL,
 	.pop_every_task = NULL,
@@ -118,10 +118,16 @@ static starpu_codelet dummy_codelet =
 
 int main(int argc, char **argv)
 {
+	int ntasks = NTASKS;
+
 	starpu_init(&conf);
 
+#ifdef STARPU_SLOW_MACHINE
+	ntasks /= 100;
+#endif
+
 	unsigned i;
-	for (i = 0; i < NTASKS; i++)
+	for (i = 0; i < ntasks; i++)
 	{
 		struct starpu_task *task = starpu_task_create();
 	

+ 51 - 0
examples/socl/Makefile.am

@@ -0,0 +1,51 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
+# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+AM_CFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
+LIBS = $(top_builddir)/socl/src/libsocl.la
+AM_CPPFLAGS = -I$(top_srcdir)/socl/include/ 
+AM_LDFLAGS = $(STARPU_CUDA_LDFLAGS) $(STARPU_OPENCL_LDFLAGS)
+
+
+SOCL_EXAMPLES	=
+TESTS		=	$(SOCL_EXAMPLES)
+
+# Build the programs listed in TESTS for "make check". The list defined in
+# this file is SOCL_EXAMPLES.
+check_PROGRAMS	=	$(SOCL_EXAMPLES)
+
+examplebindir = $(libdir)/starpu/examples/socl/
+examplebin_PROGRAMS =
+
+
+examplebin_PROGRAMS +=				\
+	basic/basic		\
+	mandelbrot/mandelbrot		\
+	clinfo/clinfo
+
+
+SOCL_EXAMPLES +=				\
+	basic/basic		\
+	mandelbrot/mandelbrot		\
+	clinfo/clinfo
+
+basic_basic_SOURCES = basic/basic.c
+clinfo_clinfo_SOURCES = clinfo/clinfo.c
+mandelbrot_mandelbrot_SOURCES = mandelbrot/mandelbrot.c
+
+mandelbrot_mandelbrot_CPPFLAGS = $(AM_CFLAGS) $(AM_CPPFLAGS)
+if HAVE_X11
+mandelbrot_mandelbrot_CPPFLAGS += $(X_CFLAGS)
+mandelbrot_mandelbrot_LDADD = $(X_PRE_LIBS) -lX11 $(X_LIBS) $(X_EXTRA_LIBS)
+endif

+ 211 - 0
examples/socl/basic/basic.c

@@ -0,0 +1,211 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010,2011 University of Bordeaux
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <CL/cl.h>
+
+#define error(...) do { fprintf(stderr, "Error: " __VA_ARGS__); exit(EXIT_FAILURE); } while(0)
+#define check(err, str) do { if(err != CL_SUCCESS) { fprintf(stderr, "OpenCL Error (%d): %s\n",err, str); exit(EXIT_FAILURE); }} while(0)
+
+#ifdef UNUSED
+#elif defined(__GNUC__)
+# define UNUSED(x) UNUSED_ ## x __attribute__((unused))
+#else
+# define UNUSED(x) x
+#endif
+
+#define SIZE 1024
+#define TYPE float
+#define REALSIZE (SIZE * sizeof(TYPE))
+
+const char * kernel_src = "__kernel void add(__global float*s1, __global float*s2, __global float*d) { \
+   size_t x = get_global_id(0);\
+   size_t y = get_global_id(1);\
+   size_t w = get_global_size(0); \
+   int idx = y*w+x; \
+   d[idx] = s1[idx] + s2[idx];\
+}";
+
+
+
+/*
+ * Minimal OpenCL example: element-wise addition of two float vectors.
+ *
+ * Selects the first platform that exposes at least one GPU device, builds the
+ * "add" kernel from kernel_src, uploads s1 and s2, runs the kernel, reads the
+ * result back into d, prints it and releases every OpenCL object.
+ *
+ * Any OpenCL failure terminates the process through check()/error().
+ * Returns 0 on success; the process also exits with status 0 when no OpenCL
+ * platform is available.
+ */
+int main(int UNUSED(argc), char** UNUSED(argv)) {
+   cl_platform_id platforms[15];
+   /* Counts start at zero so that a failed query reads as "nothing found". */
+   cl_uint num_platforms = 0;
+   cl_device_id devices[15];
+   cl_uint num_devices = 0;
+   const cl_uint max_platforms = sizeof(platforms)/sizeof(cl_platform_id);
+   const cl_uint max_devices = sizeof(devices)/sizeof(cl_device_id);
+   cl_context context;
+   cl_program program;
+   cl_kernel kernel;
+   cl_mem s1m, s2m, dm;
+   cl_command_queue cq;
+   cl_int err;
+
+   TYPE s1[SIZE],s2[SIZE],d[SIZE];
+
+   /* Fill the inputs; d gets a sentinel that the kernel must overwrite. */
+   {
+      int i;
+      for (i=0; i<SIZE; i++) {
+         s1[i] = 2.0;
+         s2[i] = 7.0;
+         d[i] = 98.0;
+      }
+   }
+
+   printf("Querying platform...\n");
+   err = clGetPlatformIDs(0, NULL, &num_platforms);
+   if (num_platforms == 0) {
+      printf("No OpenCL platform found. If you use SOCL, this could mean StarPU wasn't configured for OpenCL. Try disabling CUDA support in StarPU (export STARPU_NCUDA=0).\n");
+      exit(0);
+   }
+   check(err, "clGetPlatformIDs");
+   /* Only max_platforms entries are stored, so never iterate past them. */
+   if (num_platforms > max_platforms)
+      num_platforms = max_platforms;
+   err = clGetPlatformIDs(max_platforms, platforms, NULL);
+   check(err, "clGetPlatformIDs");
+
+   printf("Querying devices...\n");
+   unsigned int platform_idx;
+   for (platform_idx=0; platform_idx<num_platforms; platform_idx++) {
+      num_devices = 0;
+      err = clGetDeviceIDs(platforms[platform_idx], CL_DEVICE_TYPE_GPU, max_devices, devices, &num_devices);
+      if (err == CL_DEVICE_NOT_FOUND) {
+         /* No GPU on this platform is not fatal: try the next platform. */
+         num_devices = 0;
+         continue;
+      }
+      check(err, "clGetDeviceIDs");
+      /* The reported count may exceed what fits in devices[]. */
+      if (num_devices > max_devices)
+         num_devices = max_devices;
+      if (num_devices != 0)
+         break;
+   }
+   if (num_devices == 0)
+      error("No OpenCL device found\n");
+
+   printf("Creating context...\n");
+   cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[platform_idx], 0};
+   context = clCreateContext(properties, num_devices, devices, NULL, NULL, &err);
+   check(err, "clCreateContext");
+
+   printf("Creating program...\n");
+   program = clCreateProgramWithSource(context, 1, &kernel_src, NULL, &err);
+   check(err, "clCreateProgram");
+
+   printf("Building program...\n");
+   err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
+   check(err, "clBuildProgram");
+
+   printf("Creating kernel...\n");
+   kernel = clCreateKernel(program, "add", &err);
+   check(err, "clCreateKernel");
+
+   printf("Creating buffers...\n");
+   s1m = clCreateBuffer(context, CL_MEM_READ_WRITE, REALSIZE, NULL, &err);
+   check(err, "clCreateBuffer s1");
+   s2m = clCreateBuffer(context, CL_MEM_READ_ONLY, REALSIZE, NULL, &err);
+   check(err, "clCreateBuffer s2");
+   dm = clCreateBuffer(context, CL_MEM_WRITE_ONLY, REALSIZE, NULL, &err);
+   check(err, "clCreateBuffer d");
+
+   printf("Creating command queue...\n");
+   cl_event eventW1, eventW2, eventK, eventR;
+
+#ifdef PROFILING
+   cq = clCreateCommandQueue(context, devices[0], CL_QUEUE_PROFILING_ENABLE, &err);
+#else
+   cq = clCreateCommandQueue(context, devices[0], 0, &err);
+#endif
+   check(err, "clCreateCommandQueue");
+
+   printf("Enqueueing WriteBuffers...\n");
+   err = clEnqueueWriteBuffer(cq, s1m, CL_FALSE, 0, REALSIZE, s1, 0, NULL, &eventW1);
+   check(err, "clEnqueueWriteBuffer s1");
+   err = clEnqueueWriteBuffer(cq, s2m, CL_FALSE, 0, REALSIZE, s2, 0, NULL, &eventW2);
+   check(err, "clEnqueueWriteBuffer s2");
+
+   printf("Setting kernel arguments...\n");
+   err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &s1m);
+   check(err, "clSetKernelArg 0");
+   err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &s2m);
+   check(err, "clSetKernelArg 1");
+   err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &dm);
+   check(err, "clSetKernelArg 2");
+
+   printf("Enqueueing NDRangeKernel...\n");
+   size_t local[3] = {16, 1, 1};
+   /* One work-item per vector element. */
+   size_t global[3] = {SIZE, 1, 1};
+   /* The kernel may only start once both input uploads have completed. */
+   cl_event deps[] = {eventW1,eventW2};
+   err = clEnqueueNDRangeKernel(cq, kernel, 3, NULL, global, local, 2, deps, &eventK);
+   check(err, "clEnqueueNDRangeKernel");
+
+   printf("Enqueueing ReadBuffer...\n");
+   err = clEnqueueReadBuffer(cq, dm, CL_FALSE, 0, REALSIZE, d, 0, NULL, &eventR);
+   check(err, "clEnqueueReadBuffer");
+
+   /* The read is non-blocking: d is only valid once the queue has drained. */
+   err = clFinish(cq);
+   check(err, "clFinish");
+
+   {
+      int i;
+      for (i=0; i<SIZE; i++) {
+        printf("%f ", d[i]);
+      }
+      printf("\n");
+   }
+
+#ifdef PROFILING
+   /* Print the time elapsed between the start and the end of an event. */
+   #define DURATION(event,label) do { \
+      cl_ulong t0,t1; \
+      err = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &t0, NULL);\
+      check(err, "clGetEventProfilingInfo");\
+      err = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &t1, NULL);\
+      check(err, "clGetEventProfilingInfo");\
+      printf("Profiling %s: %llu nanoseconds\n", label, (unsigned long long)(t1-t0));\
+   } while (0)
+
+   DURATION(eventW1, "first buffer writing");
+   DURATION(eventW2, "second buffer writing");
+   DURATION(eventK, "kernel execution");
+   DURATION(eventR, "result buffer reading");
+#endif
+
+   
+   printf("Releasing events...\n");
+   err = clReleaseEvent(eventW1);
+   err |= clReleaseEvent(eventW2);
+   err |= clReleaseEvent(eventK);
+   err |= clReleaseEvent(eventR);
+   check(err, "clReleaseEvent");
+
+   printf("Releasing command queue...\n");
+   err = clReleaseCommandQueue(cq);
+   check(err, "clReleaseCommandQueue");
+
+   printf("Releasing buffers...\n");
+   err = clReleaseMemObject(s1m);
+   check(err, "clReleaseMemObject s1");
+   err = clReleaseMemObject(s2m);
+   check(err, "clReleaseMemObject s2");
+   err = clReleaseMemObject(dm);
+   check(err, "clReleaseMemObject d");
+
+   printf("Releasing kernel...\n");
+   err = clReleaseKernel(kernel);
+   check(err, "clReleaseKernel");
+
+   printf("Releasing program...\n");
+   err = clReleaseProgram(program);
+   check(err, "clReleaseProgram");
+
+   printf("Releasing context...\n");
+   err = clReleaseContext(context);
+   check(err, "clReleaseContext");
+
+   return 0;
+}

+ 299 - 0
examples/socl/clinfo/clinfo.c

@@ -0,0 +1,299 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010,2011 University of Bordeaux
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <CL/cl.h>
+
+/*
+ * Terminate the program when an OpenCL call did not succeed.
+ *
+ * err:  status code returned by the OpenCL call.
+ * name: text printed on stderr, together with the code, when err is not
+ *       CL_SUCCESS.
+ *
+ * Declared static: a plain "inline" definition provides no external
+ * definition in C99 and later, which can leave calls unresolved at link time
+ * when the compiler chooses not to inline them.
+ */
+static inline void
+checkErr(cl_int err, const char * name) {
+    if (err != CL_SUCCESS) {
+        fprintf(stderr, "ERROR: %s (%d)\n", name, err);
+        exit(1);
+    }
+}
+
+/* Allocate size bytes or terminate the program; never returns NULL. */
+static void *
+clinfo_alloc(size_t size) {
+   /* Ask for at least one byte so that NULL always means failure. */
+   void *ptr = malloc(size ? size : 1);
+   if (ptr == NULL) {
+      fprintf(stderr, "ERROR: out of memory\n");
+      exit(1);
+   }
+   return ptr;
+}
+
+/*
+ * Print the properties of every OpenCL platform, then of every device of
+ * each platform.
+ *
+ * Any failing OpenCL query terminates the process through checkErr().
+ * Returns 0 once everything has been printed, including when the platform
+ * count is zero.
+ */
+int
+main(void) {
+   cl_int err;
+   cl_uint num_platforms = 0;
+   cl_platform_id *platforms;
+
+   // Platform info
+   err = clGetPlatformIDs(0, NULL, &num_platforms);
+   checkErr(err, "Unable to get platform count");
+
+   printf("Number of platforms:\t\t\t\t %u\n", num_platforms);
+
+   // Nothing to enumerate: asking for a zero-entry list would be an error
+   if (num_platforms == 0)
+      return 0;
+
+   platforms = clinfo_alloc(sizeof(cl_platform_id)*num_platforms);
+   err = clGetPlatformIDs(num_platforms, platforms, NULL);
+   checkErr(err, "Unable to get platform list");
+   
+   
+   // Iterate over platforms
+   {
+      unsigned int i;
+      for (i=0; i<num_platforms; i++) {
+         // Sized for the extension list, which can be long
+         char str[4096];
+         err = clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, sizeof(str), str, NULL);
+         checkErr(err, "clGetPlatformInfo(CL_PLATFORM_PROFILE)");
+         printf("  Plaform Profile:\t\t\t\t %s\n", str);    
+
+         err= clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, sizeof(str), str, NULL);
+         checkErr(err, "clGetPlatformInfo(CL_PLATFORM_VERSION)");
+         printf("  Plaform Version:\t\t\t\t %s\n", str);    
+
+         err = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, sizeof(str), str, NULL);
+         checkErr(err, "clGetPlatformInfo(CL_PLATFORM_NAME)");
+         printf("  Plaform Name:\t\t\t\t\t %s\n", str);    
+
+         err = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(str), str, NULL);
+         checkErr(err, "clGetPlatformInfo(CL_PLATFORM_VENDOR)");
+         printf("  Plaform Vendor:\t\t\t\t %s\n", str);    
+
+         err = clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, sizeof(str), str, NULL);
+         checkErr(err, "clGetPlatformInfo(CL_PLATFORM_EXTENSIONS)");
+         printf("  Plaform Extensions:\t\t\t %s\n", str);    
+      }
+   }
+
+   printf("\n\n");
+
+   // Now iterate over each platform and its devices
+   {
+      unsigned int i;
+      for (i=0; i<num_platforms; i++) {
+         char str[256];
+         cl_device_id * devices;
+         cl_uint num_devices = 0;
+
+         err = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, sizeof(str), str, NULL);
+         checkErr(err, "clGetPlatformInfo(CL_PLATFORM_NAME)");
+         printf("  Plaform Name:\t\t\t\t\t %s\n", str);    
+
+         err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
+         checkErr(err, "clGetDeviceIds(CL_DEVICE_TYPE_ALL)");
+         devices = clinfo_alloc(sizeof(cl_device_id)*num_devices);
+         
+         err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
+         checkErr(err, "clGetDeviceIds(CL_DEVICE_TYPE_ALL)");
+
+         printf("  Number of devices:\t\t\t\t %u\n", num_devices);
+         {
+            unsigned int j;
+            for (j=0; j<num_devices; j++) {
+               cl_device_type dev_type;
+               printf("\n  DEVICE %u\n", j);
+               
+               err = clGetDeviceInfo(devices[j], CL_DEVICE_TYPE, sizeof(dev_type), &dev_type, NULL);
+               checkErr(err, "clGetDeviceInfo(CL_DEVICE_TYPE)");
+
+               printf("  Device Type:\t\t\t\t\t ");
+               if (dev_type & CL_DEVICE_TYPE_ACCELERATOR)
+                  printf("CL_DEVICE_TYPE_ACCELERATOR ");
+               else if (dev_type & CL_DEVICE_TYPE_CPU)
+                  printf("CL_DEVICE_TYPE_CPU ");
+               else if (dev_type & CL_DEVICE_TYPE_GPU)
+                  printf("CL_DEVICE_TYPE_GPU ");
+               else if (dev_type & CL_DEVICE_TYPE_DEFAULT)
+                  printf("CL_DEVICE_TYPE_DEFAULT ");
+
+               printf("\n");
+
+               {
+                  cl_uint vendor_id;
+                  err = clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR_ID, sizeof(vendor_id), &vendor_id, NULL);
+                  checkErr(err, "clGetDeviceInfo(CL_DEVICE_VENDOR_ID)");
+                  printf("  Device ID:\t\t\t\t\t %u\n", vendor_id); 
+               }
+               {
+                  cl_uint units;
+                  err = clGetDeviceInfo(devices[j], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(units), &units, NULL);
+                  checkErr(err, "clGetDeviceInfo(CL_DEVICE_MAX_COMPUTE_UNITS)");
+                  printf("  Max compute units:\t\t\t\t %u\n", units); 
+               }
+
+               {
+                  cl_uint dims;
+                  size_t *sizes;
+                  err = clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(dims), &dims, NULL);
+                  checkErr(err, "clGetDeviceInfo(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS)");
+                  printf("  Max work item dimensions:\t\t\t %u\n", dims); 
+
+                  // One maximum per work-item dimension
+                  sizes = clinfo_alloc(dims * sizeof(size_t));
+                  err = clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*dims, sizes, NULL);
+                  checkErr(err, "clGetDeviceInfo(CL_DEVICE_MAX_WORK_ITEM_SIZES)");
+
+                  {
+                     unsigned int k;
+                     printf("    Max work items:\t\t\t\t (");
+                     for (k=0; k<dims; k++) {
+                        printf("%u", (unsigned int)sizes[k]);
+                        if (k != dims-1)
+                           printf(",");
+                     }
+                     printf(")\n");
+                  }
+                  free(sizes);
+               }
+
+// Each helper below queries one property of devices[j] and prints it with
+// the given format string; a failing query ends the program via checkErr().
+#define GET_SIZET(CL_D,str) { \
+   size_t val; \
+   err = clGetDeviceInfo(devices[j], CL_D, sizeof(val), &val, NULL); \
+   checkErr(err, "clGetDeviceInfo(" #CL_D ")"); \
+   printf(str, (unsigned int)val); \
+}
+
+#define GET_STRING(CL_D,str,size) { \
+   char val[size]; \
+   err = clGetDeviceInfo(devices[j], CL_D, sizeof(val), val, NULL); \
+   checkErr(err, "clGetDeviceInfo(" #CL_D ")"); \
+   printf(str, val); \
+}
+
+#define GET_UINT(CL_D,str) { \
+   cl_uint val; \
+   err = clGetDeviceInfo(devices[j], CL_D, sizeof(val), &val, NULL); \
+   checkErr(err, "clGetDeviceInfo(" #CL_D ")"); \
+   printf(str, val); \
+}
+
+/* cl_ulong is 64 bits wide: print it through unsigned long long / %llu. */
+#define GET_ULONG(CL_D,str) { \
+   cl_ulong val; \
+   err = clGetDeviceInfo(devices[j], CL_D, sizeof(val), &val, NULL); \
+   checkErr(err, "clGetDeviceInfo(" #CL_D ")"); \
+   printf(str, (unsigned long long)val); \
+}
+
+#define GET_BOOL(CL_D,str) { \
+   cl_bool val; \
+   err = clGetDeviceInfo(devices[j], CL_D, sizeof(val), &val, NULL); \
+   checkErr(err, "clGetDeviceInfo(" #CL_D ")"); \
+   printf(str, (val == CL_TRUE ? "Yes" : "No")); \
+}
+
+#define GET_BOOL_CUSTOM(CL_D,str,t,f) { \
+   cl_bool val; \
+   err = clGetDeviceInfo(devices[j], CL_D, sizeof(val), &val, NULL); \
+   checkErr(err, "clGetDeviceInfo(" #CL_D ")"); \
+   printf(str, (val == CL_TRUE ? t : f)); \
+}
+
+/* A flag is set when the masked value is non-zero; comparing the masked */
+/* value with CL_TRUE (1) would only ever match a flag stored in bit 0.  */
+#define GET_BITSET_AND(TYPE,CL_D,test,str) { \
+   TYPE val; \
+   err = clGetDeviceInfo(devices[j], CL_D, sizeof(val), &val, NULL); \
+   checkErr(err, "clGetDeviceInfo(" #CL_D ")"); \
+   printf(str, ((val & (test)) != 0 ? "Yes" : "No")); \
+}
+      
+               GET_SIZET(CL_DEVICE_MAX_WORK_GROUP_SIZE, "  Max work group size:\t\t\t\t %u\n")
+               
+               GET_UINT(CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, "  Preferred vector width char:\t\t\t %u\n")
+               GET_UINT(CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, "  Preferred vector width short:\t\t\t %u\n")
+               GET_UINT(CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, "  Preferred vector width int:\t\t\t %u\n")
+               GET_UINT(CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, "  Preferred vector width long:\t\t\t %u\n")
+               GET_UINT(CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, "  Preferred vector width float:\t\t\t %u\n")
+               GET_UINT(CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, "  Preferred vector width double:\t\t %u\n")
+               GET_UINT(CL_DEVICE_MAX_CLOCK_FREQUENCY, "  Max clock frequency:\t\t\t\t %uMHz\n")
+               GET_UINT(CL_DEVICE_ADDRESS_BITS, "  Address bits:\t\t\t\t\t %ubits\n")
+               GET_ULONG(CL_DEVICE_MAX_MEM_ALLOC_SIZE, "  Max memory allocation:\t\t\t %llu bytes\n")
+
+               GET_BOOL(CL_DEVICE_IMAGE_SUPPORT, "  Image support:\t\t\t\t %s\n")
+
+               GET_SIZET(CL_DEVICE_MAX_PARAMETER_SIZE, "  Max size of kernel argument:\t\t\t %u\n")
+               GET_UINT(CL_DEVICE_MEM_BASE_ADDR_ALIGN, "  Alignment of base addres:\t\t\t %u bits\n")
+               GET_UINT(CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, "  Minimum alignment for any datatype:\t\t %u bytes\n")
+
+               printf("  Single precision floating point capability\n");
+               GET_BITSET_AND(cl_device_fp_config,CL_DEVICE_SINGLE_FP_CONFIG, CL_FP_DENORM, "    Denorms:\t\t\t\t\t %s\n")
+               GET_BITSET_AND(cl_device_fp_config,CL_DEVICE_SINGLE_FP_CONFIG, CL_FP_INF_NAN, "    Quiet NaNs:\t\t\t\t\t %s\n")
+               GET_BITSET_AND(cl_device_fp_config,CL_DEVICE_SINGLE_FP_CONFIG, CL_FP_ROUND_TO_NEAREST, "    Round to nearest even:\t\t\t %s\n")
+               GET_BITSET_AND(cl_device_fp_config,CL_DEVICE_SINGLE_FP_CONFIG, CL_FP_ROUND_TO_ZERO, "    Round to zero:\t\t\t\t %s\n")
+               GET_BITSET_AND(cl_device_fp_config,CL_DEVICE_SINGLE_FP_CONFIG, CL_FP_ROUND_TO_INF, "    Round to +ve and infinity:\t\t\t %s\n")
+               GET_BITSET_AND(cl_device_fp_config,CL_DEVICE_SINGLE_FP_CONFIG, CL_FP_FMA, "    IEEE754-2008 fused multiply-add:\t\t %s\n")
+
+               {
+                  cl_device_mem_cache_type cache;
+                  err = clGetDeviceInfo(devices[j], CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, sizeof(cache), &cache, NULL);
+                  checkErr(err, "clGetDeviceInfo(CL_DEVICE_GLOBAL_MEM_CACHE_TYPE)");
+                  printf("  Cache type:\t\t\t\t\t ");
+                  switch (cache) {
+                     case CL_NONE:
+                        printf("None\n");
+                        break;
+                     case CL_READ_ONLY_CACHE:
+                        printf("Read only\n");
+                        break;
+                     case CL_READ_WRITE_CACHE:
+                        printf("Read/Write\n");
+                        break;
+                     default:
+                        // Keep the output line terminated for unexpected values
+                        printf("Unknown\n");
+                        break;
+                  }
+               }
+
+               GET_UINT(CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, "  Cache line size:\t\t\t\t %u bytes\n")
+               GET_ULONG(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, "  Cache size:\t\t\t\t\t %llu bytes\n")
+               GET_ULONG(CL_DEVICE_GLOBAL_MEM_SIZE, "  Global memory size:\t\t\t\t %llu bytes\n")
+               GET_ULONG(CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, "  Constant buffer size:\t\t\t\t %llu bytes\n")
+               GET_UINT(CL_DEVICE_MAX_CONSTANT_ARGS, "  Max number of constant args:\t\t\t %u\n")
+
+               {
+                  cl_device_local_mem_type cache;
+                  err = clGetDeviceInfo(devices[j], CL_DEVICE_LOCAL_MEM_TYPE, sizeof(cache), &cache, NULL);
+                  checkErr(err, "clGetDeviceInfo(CL_DEVICE_LOCAL_MEM_TYPE)");
+                  printf("  Local memory type:\t\t\t\t ");
+                  switch (cache) {
+                     case CL_LOCAL:
+                        printf("Local\n");
+                        break;
+                     case CL_GLOBAL:
+                        printf("Global\n");
+                        break;
+                     default:
+                        // Keep the output line terminated for unexpected values
+                        printf("Unknown\n");
+                        break;
+                  }
+               }
+
+               GET_ULONG(CL_DEVICE_LOCAL_MEM_SIZE, "  Local memory size:\t\t\t\t %llu bytes\n")
+               GET_SIZET(CL_DEVICE_PROFILING_TIMER_RESOLUTION, "  Profiling timer resolution:\t\t\t %u\n")
+               GET_BOOL_CUSTOM(CL_DEVICE_ENDIAN_LITTLE, "  Device endianess:\t\t\t\t %s\n", "Little", "Big")
+               GET_BOOL(CL_DEVICE_AVAILABLE, "  Available:\t\t\t\t\t %s\n")
+               GET_BOOL(CL_DEVICE_COMPILER_AVAILABLE, "  Compiler available:\t\t\t\t %s\n")
+
+               printf("  Execution capabilities:\t\t\t\t \n");
+               GET_BITSET_AND(cl_device_exec_capabilities, CL_DEVICE_EXECUTION_CAPABILITIES, CL_EXEC_KERNEL, "  Execute OpenCL kernels:\t\t\t %s\n")
+               GET_BITSET_AND(cl_device_exec_capabilities, CL_DEVICE_EXECUTION_CAPABILITIES, CL_EXEC_NATIVE_KERNEL, "  Execute native kernels:\t\t\t %s\n")
+
+               printf("  Queue properties:\t\t\t\t\n ");
+               GET_BITSET_AND(cl_command_queue_properties, CL_DEVICE_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, "   Out-of-Order:\t\t\t\t %s\n")
+               GET_BITSET_AND(cl_command_queue_properties, CL_DEVICE_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, "    Profiling:\t\t\t\t\t %s\n")
+
+
+               // A buffer shorter than the returned string makes the query
+               // fail, so the version strings get generous buffers too.
+               GET_STRING(CL_DEVICE_NAME, "  Name:\t\t\t\t\t\t %s\n", 256);
+               GET_STRING(CL_DEVICE_VENDOR, "  Vendor:\t\t\t\t\t %s\n", 256);
+               GET_STRING(CL_DRIVER_VERSION, "  Driver version:\t\t\t\t %s\n", 256);
+               GET_STRING(CL_DEVICE_PROFILE, "  Profile:\t\t\t\t\t %s\n", 30);
+               GET_STRING(CL_DEVICE_VERSION, "  Version:\t\t\t\t\t %s\n", 256);
+               GET_STRING(CL_DEVICE_EXTENSIONS, "  Extensions:\t\t\t\t\t %s\n", 4096);
+            
+               printf("\n");
+            }
+         }
+         free(devices);
+      }
+   }
+
+   free(platforms);
+
+   return 0;
+}

+ 0 - 0
examples/socl/mandelbrot/mandelbrot.c


Some files were not shown because too many files changed in this diff