ソースを参照

merge with trunk - part 1

Andra Hugo 14 年 前
コミット
03b6b6a55b
共有100 個のファイルを変更した16047 個の追加1833 個の削除を含む
  1. 4 0
      .dir-locals.el
  2. 187 0
      .gitignore
  3. 4 0
      AUTHORS
  4. 7 6
      ChangeLog
  5. 32 4
      Makefile.am
  6. 28 41
      README
  7. 169 0
      README.dev
  8. 16 0
      acinclude.m4
  9. 528 205
      configure.ac
  10. 1098 630
      doc/starpu.texi
  11. 45 0
      doc/tutorial/Makefile
  12. 33 0
      doc/tutorial/README
  13. 70 0
      doc/tutorial/hello_world.c
  14. 124 0
      doc/tutorial/vector_scal.c
  15. 50 0
      doc/tutorial/vector_scal_cpu.c
  16. 43 0
      doc/tutorial/vector_scal_cuda.cu
  17. 60 0
      doc/tutorial/vector_scal_opencl.c
  18. 25 0
      doc/tutorial/vector_scal_opencl_kernel.cl
  19. 1 1
      doc/vector_scal_c.texi
  20. 2 1
      doc/vector_scal_cuda.texi
  21. 3 3
      doc/vector_scal_opencl.texi
  22. 1 0
      examples/.gitignore
  23. 197 235
      examples/Makefile.am
  24. 5 7
      examples/audio/starpu_audio_processing.c
  25. 18 13
      examples/axpy/axpy.c
  26. 13 10
      examples/basic_examples/block.c
  27. 7 8
      examples/basic_examples/block_opencl.c
  28. 21 15
      examples/basic_examples/hello_world.c
  29. 36 79
      examples/basic_examples/mult.c
  30. 9 6
      examples/basic_examples/variable.c
  31. 2 2
      examples/basic_examples/variable_kernels_opencl.c
  32. 10 5
      examples/basic_examples/vector_scal.c
  33. 2 2
      examples/basic_examples/vector_scal_cuda.cu
  34. 3 3
      examples/basic_examples/vector_scal_opencl.c
  35. 73 0
      examples/callback/callback.c
  36. 21 22
      examples/cg/cg.c
  37. 2 2
      examples/cg/cg.h
  38. 19 0
      examples/cg/cg_dot_kernel.cu
  39. 9 4
      examples/cg/cg_kernels.c
  40. 3 2
      examples/cholesky/cholesky.h
  41. 31 33
      examples/cholesky/cholesky_grain_tag.c
  42. 30 32
      examples/cholesky/cholesky_implicit.c
  43. 26 9
      examples/cholesky/cholesky_kernels.c
  44. 9 9
      examples/cholesky/cholesky_models.c
  45. 34 36
      examples/cholesky/cholesky_tag.c
  46. 7 7
      examples/cholesky/cholesky_tile_tag.c
  47. 2 2
      examples/common/blas.h
  48. 3 3
      examples/common/blas_model.c
  49. 2 2
      examples/common/blas_model.h
  50. 20 18
      examples/filters/fblock.c
  51. 2 2
      examples/filters/fblock_opencl.c
  52. 13 12
      examples/filters/fmatrix.c
  53. 11 10
      examples/filters/fvector.c
  54. 21 23
      examples/heat/dw_factolu.c
  55. 47 45
      examples/heat/dw_factolu.h
  56. 24 26
      examples/heat/dw_factolu_grain.c
  57. 17 17
      examples/heat/dw_factolu_kernels.c
  58. 18 20
      examples/heat/dw_factolu_tag.c
  59. 7 6
      examples/heat/dw_sparse_cg.c
  60. 3 3
      examples/heat/dw_sparse_cg.h
  61. 0 4
      examples/heat/dw_sparse_cg_kernels.c
  62. 10 10
      examples/heat/heat.c
  63. 5 3
      examples/heat/heat.h
  64. 6 6
      examples/heat/heat_display.c
  65. 14 14
      examples/heat/lu_kernels_model.c
  66. 2 2
      examples/heat/lu_kernels_model.h
  67. 13 9
      examples/incrementer/incrementer.c
  68. 2 2
      examples/incrementer/incrementer_kernels_opencl.c
  69. 3 3
      examples/lu/lu_example.c
  70. 9 11
      examples/lu/xlu.c
  71. 10 12
      examples/lu/xlu_implicit.c
  72. 10 12
      examples/lu/xlu_implicit_pivot.c
  73. 36 30
      examples/mandelbrot/mandelbrot.c
  74. 36 30
      examples/matvecmult/matvecmult.c
  75. 26 22
      examples/mult/xgemm.c
  76. 105 0
      examples/openmp/vector_scal.c
  77. 78 0
      examples/opt/Makefile.am
  78. 50 0
      examples/opt/pi/SobolQRNG/CforCUDA_SDK_license.txt
  79. 60 0
      examples/opt/pi/SobolQRNG/sobol.h
  80. 141 0
      examples/opt/pi/SobolQRNG/sobol_gold.c
  81. 61 0
      examples/opt/pi/SobolQRNG/sobol_gold.h
  82. 170 0
      examples/opt/pi/SobolQRNG/sobol_gpu.cu
  83. 61 0
      examples/opt/pi/SobolQRNG/sobol_gpu.h
  84. 10271 0
      examples/opt/pi/SobolQRNG/sobol_primitives.c
  85. 75 0
      examples/opt/pi/SobolQRNG/sobol_primitives.h
  86. 175 0
      examples/opt/pi/pi.c
  87. 33 0
      examples/opt/pi/pi.h
  88. 150 0
      examples/opt/pi/pi_kernel.cu
  89. 362 0
      examples/opt/pi/pi_redux.c
  90. 128 0
      examples/opt/pi/pi_redux_kernel.cu
  91. 3 3
      examples/ppm_downscaler/ppm_downscaler.c
  92. 7 11
      examples/ppm_downscaler/yuv_downscaler.c
  93. 10 8
      examples/profiling/profiling.c
  94. 36 11
      examples/reductions/dot_product.c
  95. 9 3
      examples/reductions/minmax_reduction.c
  96. 12 6
      examples/scheduler/dummy_sched.c
  97. 51 0
      examples/socl/Makefile.am
  98. 211 0
      examples/socl/basic/basic.c
  99. 299 0
      examples/socl/clinfo/clinfo.c
  100. 0 0
      examples/socl/mandelbrot/mandelbrot.c

+ 4 - 0
.dir-locals.el

@@ -0,0 +1,4 @@
+;; Hey Emacs, use the ugly style!
+
+((c-mode . ((c-file-style . "linux")
+	    (indent-tabs-mode . t))))

+ 187 - 0
.gitignore

@@ -0,0 +1,187 @@
+/configure
+/config.log
+/config.status
+/autom4te.cache
+/libtool
+/libstarpu.pc
+/aclocal.m4
+/build-aux
+/GPATH
+/GRTAGS
+/GTAGS
+/config.cache
+/doc/starpu.info
+*~
+,*
+Makefile
+Makefile.in
+.libs
+.deps
+*.o
+*.lo
+*.la
+*.swp
+.dirstamp
+stamp-h[0-9]*
+starpu.log
+/gcc-plugin/src/starpu-gcc-config.h
+/gcc-plugin/tests/*.c.[0-9]*.*
+/tests/datawizard/handle_to_pointer
+/tests/datawizard/data_lookup
+/doc/stamp-vti
+/doc/version.texi
+/examples/basic_examples/block
+/examples/basic_examples/hello_world
+/examples/basic_examples/mult
+/examples/basic_examples/variable
+/examples/basic_examples/vector_scal
+/examples/callback/callback
+/examples/filters/fblock
+/examples/filters/fmatrix
+/examples/filters/fvector
+/examples/incrementer/incrementer
+/examples/mandelbrot/mandelbrot
+/examples/matvecmult/matvecmult
+/examples/pi/pi
+/examples/pi/pi_redux
+/examples/ppm_downscaler/ppm_downscaler
+/examples/ppm_downscaler/yuv_downscaler
+/examples/profiling/profiling
+/examples/reductions/dot_product
+/examples/reductions/minmax_reduction
+/examples/scheduler/dummy_sched
+/examples/spmv/dw_spmv
+/examples/spmv/spmv
+/examples/stencil/stencil
+/examples/tag_example/tag_example
+/examples/tag_example/tag_example2
+/examples/tag_example/tag_example3
+/examples/tag_example/tag_restartable
+/mpi/examples/stencil/stencil5
+/mpi/tests/block_interface
+/mpi/tests/block_interface_pinned
+/mpi/tests/insert_task
+/mpi/tests/insert_task_block
+/mpi/tests/insert_task_cache
+/mpi/tests/insert_task_owner
+/mpi/tests/insert_task_owner2
+/mpi/tests/mpi_detached_tag
+/mpi/tests/mpi_irecv
+/mpi/tests/mpi_irecv_detached
+/mpi/tests/mpi_isend
+/mpi/tests/mpi_isend_detached
+/mpi/tests/mpi_test
+/mpi/tests/multiple_send
+/mpi/tests/pingpong
+/mpi/tests/ring
+/mpi/tests/ring_async
+/mpi/tests/ring_async_implicit
+/tests/core/declare_deps_after_submission
+/tests/core/declare_deps_after_submission_synchronous
+/tests/core/declare_deps_in_callback
+/tests/core/empty_task
+/tests/core/empty_task_chain
+/tests/core/empty_task_sync_point
+/tests/core/empty_task_sync_point_tasks
+/tests/core/execute_on_a_specific_worker
+/tests/core/get_current_task
+/tests/core/insert_task
+/tests/core/multithreaded
+/tests/core/multithreaded_init
+/tests/core/regenerate
+/tests/core/restart
+/tests/core/starpu_task_wait
+/tests/core/starpu_task_wait_for_all
+/tests/core/static_restartable
+/tests/core/static_restartable_tag
+/tests/core/static_restartable_using_initializer
+/tests/core/subgraph_repeat
+/tests/core/subgraph_repeat_regenerate
+/tests/core/tag_wait_api
+/tests/core/task_wait_api
+/tests/core/wait_all_regenerable_tasks
+/tests/datawizard/acquire_cb
+/tests/datawizard/acquire_release
+/tests/datawizard/acquire_release2
+/tests/datawizard/critical_section_with_void_interface
+/tests/datawizard/data_implicit_deps
+/tests/datawizard/data_invalidation
+/tests/datawizard/dining_philosophers
+/tests/datawizard/dsm_stress
+/tests/datawizard/increment_redux
+/tests/datawizard/increment_redux_v2
+/tests/datawizard/lazy_allocation.c
+/tests/datawizard/manual_reduction
+/tests/datawizard/mpi_like
+/tests/datawizard/mpi_like_async
+/tests/datawizard/readers_and_writers
+/tests/datawizard/reclaim
+/tests/datawizard/scratch
+/tests/datawizard/sync_and_notify_data
+/tests/datawizard/sync_and_notify_data_implicit
+/tests/datawizard/sync_with_data_with_mem
+/tests/datawizard/sync_with_data_with_mem_non_blocking
+/tests/datawizard/sync_with_data_with_mem_non_blocking_implicit
+/tests/datawizard/unpartition
+/tests/datawizard/user_interaction_implicit
+/tests/datawizard/write_only_tmp_buffer
+/tests/errorcheck/invalid_blocking_calls
+/tests/errorcheck/invalid_tasks
+/tests/errorcheck/starpu_init_noworker
+/tests/helper/cublas_init
+/tests/helper/execute_on_all
+/tests/helper/pinned_memory
+/tests/helper/starpu_create_sync_task
+/tests/helper/starpu_data_cpy
+/tests/microbenchs/async_tasks_overhead
+/tests/microbenchs/display_structures_size
+/tests/microbenchs/local_pingpong
+/tests/microbenchs/prefetch_data_on_node
+/tests/microbenchs/redundant_buffer
+/tests/microbenchs/sync_tasks_overhead
+/tests/microbenchs/tasks_overhead
+/tests/overlap/overlap
+/tests/parallel_tasks/explicit_combined_worker
+/tests/parallel_tasks/parallel_kernels
+/tests/parallel_tasks/parallel_kernels_spmd
+/tests/parallel_tasks/spmd_pgreedy
+/tests/perfmodels/non_linear_regression_based
+/tests/perfmodels/regression_based
+/tools/cbc2paje
+/tools/lp2paje
+/tools/starpu_calibrate_bus
+/tools/starpu_machine_display
+/tools/starpu_perfmodel_display
+/tools/starpu_regression_display
+/gcc-plugin/tests/scalar-tasks
+/gcc-plugin/tests/pointers
+/tests/datawizard/lazy_allocation
+/gcc-plugin/tests/pointer-tasks
+/gcc-plugin/tests/*.s
+/gcc-plugin/tests/base
+/gcc-plugin/tests/core
+/mpi/tests/insert_task_owner_data
+/mpi/examples/scatter_gather/mpi_scatter_gather
+/examples/top/hello_world_top
+/doc/starpu.aux
+/doc/starpu.cp
+/doc/starpu.cps
+/doc/starpu.fn
+/doc/starpu.fns
+/doc/starpu.html
+/doc/starpu.ky
+/doc/starpu.pdf
+/doc/starpu.pg
+/doc/starpu.toc
+/doc/starpu.tp
+/doc/starpu.tps
+/doc/starpu.vr
+/gcc-plugin/tests/register
+/tests/datawizard/acquire_cb_insert
+/tools/starpu_perfmodel_plot
+/gcc-plugin/tests/run-test
+/gcc-plugin/tests/register-errors
+/gcc-plugin/tests/acquire
+/gcc-plugin/tests/unregister
+/gcc-plugin/tests/lib-user
+/gcc-plugin/examples/matrix-mult

+ 4 - 0
AUTHORS

@@ -6,3 +6,7 @@ Sylvain Henry <sylvain.henry@inria.fr>
 Mehdi Juhoor <mjuhoor@gmail.com>
 Mehdi Juhoor <mjuhoor@gmail.com>
 François Tessier <francois.tessier@inria.fr>
 François Tessier <francois.tessier@inria.fr>
 Samuel Thibault <samuel.thibault@labri.fr>
 Samuel Thibault <samuel.thibault@labri.fr>
+William Braik <wbraik@gmail.com>
+Yann Courtois <yann.courtois33@gmail.com>
+Jean-Marie Couteyen <jm.couteyen@gmail.com>
+Anthony Roy <theanthony33@gmail.com>

+ 7 - 6
ChangeLog

@@ -1,7 +1,7 @@
-StarPU 0.5 (svn revision ????)
+StarPU 0.9 (svn revision 3721)
 ==============================================
 ==============================================
-The yet-more-stuff release
-  
+The extensions release
+
   * Provide the STARPU_REDUX data access mode
   * Provide the STARPU_REDUX data access mode
   * Externalize the scheduler API.
   * Externalize the scheduler API.
   * Add theoretical bound computation
   * Add theoretical bound computation
@@ -19,11 +19,12 @@ The yet-more-stuff release
   * Add mandelbrot OpenCL example
   * Add mandelbrot OpenCL example
   * Add cg example
   * Add cg example
   * Add stencil MPI example
   * Add stencil MPI example
+  * Initial support for CUDA4
 
 
 StarPU 0.4 (svn revision 2535)
 StarPU 0.4 (svn revision 2535)
 ==============================================
 ==============================================
 The API strengthening release
 The API strengthening release
-  
+
   * Major API improvements
   * Major API improvements
     - Provide the STARPU_SCRATCH data access mode
     - Provide the STARPU_SCRATCH data access mode
     - Rework data filter interface
     - Rework data filter interface
@@ -41,7 +42,7 @@ The API strengthening release
   * Provide a library to help accelerating MPI applications
   * Provide a library to help accelerating MPI applications
   * Improve data transfers overhead prediction
   * Improve data transfers overhead prediction
     - Transparently benchmark buses to generate performance models
     - Transparently benchmark buses to generate performance models
-    - Bind accelerator-controlling threads with respect to NUMA locality 
+    - Bind accelerator-controlling threads with respect to NUMA locality
   * Improve StarPU's portability
   * Improve StarPU's portability
     - Add OpenCL support
     - Add OpenCL support
     - Add support for Windows
     - Add support for Windows
@@ -63,7 +64,7 @@ The asynchronous heterogeneous multi-accelerator release
     - All data transfers use data requests now
     - All data transfers use data requests now
     - Implement asynchronous data transfers
     - Implement asynchronous data transfers
     - Implement prefetch mechanism
     - Implement prefetch mechanism
-    - Chain data requests to support GPU->RAM->GPU transfers 
+    - Chain data requests to support GPU->RAM->GPU transfers
   * Make it possible to bypass the scheduler and to assign a task to a specific
   * Make it possible to bypass the scheduler and to assign a task to a specific
     worker
     worker
   * Support restartable tasks to reinstanciate dependencies task graphs
   * Support restartable tasks to reinstanciate dependencies task graphs

+ 32 - 4
Makefile.am

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 #
-# Copyright (C) 2009, 2010  Université de Bordeaux 1
+# Copyright (C) 2009-2011  Université de Bordeaux 1
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 #
 #
 # StarPU is free software; you can redistribute it and/or modify
 # StarPU is free software; you can redistribute it and/or modify
@@ -21,8 +21,22 @@ SUBDIRS = src
 if USE_MPI
 if USE_MPI
 SUBDIRS += mpi
 SUBDIRS += mpi
 endif
 endif
+
+if BUILD_SOCL
+SUBDIRS += socl
+endif
+
 SUBDIRS += tools examples tests doc
 SUBDIRS += tools examples tests doc
 
 
+if COND_OPT
+SUBDIRS += tests/opt examples/opt
+endif
+
+
+if BUILD_GCC_PLUGIN
+SUBDIRS += gcc-plugin
+endif
+
 pkgconfigdir = $(libdir)/pkgconfig
 pkgconfigdir = $(libdir)/pkgconfig
 pkgconfig_DATA = libstarpu.pc
 pkgconfig_DATA = libstarpu.pc
 
 
@@ -42,10 +56,24 @@ include_HEADERS = 				\
 	include/starpu_expert.h			\
 	include/starpu_expert.h			\
 	include/starpu_profiling.h		\
 	include/starpu_profiling.h		\
 	include/starpu_bound.h			\
 	include/starpu_bound.h			\
-	include/starpu_scheduler.h
+	include/starpu_scheduler.h		\
+	include/starpu_top.h
+
+if BUILD_STARPU_TOP
+# Build the StarPU-Top tool in its own directory. The commands are chained
+# with && so that a failing cd (or qmake) aborts the rule instead of running
+# $(MAKE) from the current directory.
+all-local:
+	cd starpu-top && $(QMAKE) && $(MAKE)
+clean-local:
+	cd starpu-top && $(MAKE) clean
+# TODO: resources
+install-exec-local:
+	$(MKDIR_P) $(DESTDIR)$(bindir)
+	$(INSTALL_STRIP_PROGRAM) starpu-top/StarPU-Top $(DESTDIR)$(bindir)
+endif
 
 
+if STARPU_HAVE_WINDOWS
 txtdir = ${prefix}
 txtdir = ${prefix}
+else
+txtdir = ${docdir}
+endif
 txt_DATA = AUTHORS COPYING.LGPL README
 txt_DATA = AUTHORS COPYING.LGPL README
 EXTRA_DIST = AUTHORS COPYING.LGPL README
 EXTRA_DIST = AUTHORS COPYING.LGPL README
-
-

+ 28 - 41
README

@@ -59,6 +59,12 @@ advantage of their specificities in a portable fashion.
    units according to the machine topology. For more details on hwloc, see
    units according to the machine topology. For more details on hwloc, see
    http://www.open-mpi.org/projects/hwloc/ .
    http://www.open-mpi.org/projects/hwloc/ .
 
 
+ * To build the StarPU-Top tool the following are also required:
+   * libqt4 >= 4.7
+   * libqt4-network
+   * libqt4-opengl
+   * libqt4-sql
+
 ++=====================++
 ++=====================++
 || III. Getting StarPU ||
 || III. Getting StarPU ||
 ++=====================++
 ++=====================++
@@ -97,61 +103,42 @@ we provide MinGW-built binaries.  The build process produces libstarpu.dll,
 libstarpu.def, and libstarpu.lib, which should be enough to use it from e.g.
 libstarpu.def, and libstarpu.lib, which should be enough to use it from e.g.
 Microsoft Visual Studio.
 Microsoft Visual Studio.
 
 
-A few details need to be fixed when building StarPU on windows:
-
-- To get a .def file built, make sure that MSVC's lib.exe tool is in PATH.
-
-- Update the video drivers to the latest stable release available for your
-  hardware. Older ATI drivers (< 2.3) contain bugs that cause OpenCL support in
-  StarPU to hang or exhibit incorrect behaviour.
+Update the video drivers to the latest stable release available for your
+hardware. Old ATI drivers (< 2.3) contain bugs that cause OpenCL support in
+StarPU to hang or exhibit incorrect behaviour.
 
 
-- c:\cuda\include\host_defines.h has a bogus CUDARTAPI definition which makes
-  linking fail completely. Replace the first occurence of
+For details on the Windows build process, see the README.dev file in the
+subversion tree.
 
 
-    #define CUDARTAPI
-    
-  with
-    
-    #ifdef _WIN32
-    #define CUDARTAPI __stdcall
-    #else
-    #define CUDARTAPI
-    #endif
-
-  While at it, you can also comment the __cdecl definition to avoid spurious
-  warnings.
-
-- If you have a non-english version of windows, use
+++==================++
+|| V. Documentation ||
+++==================++
 
 
-    export LANG=C
+Texinfo documentation is available in doc/ . If LaTeX is available on the
+machine, a pdf can be generated by running
 
 
-  else libtool has troubles parsing the translated output of the toolchain.
+  $ make -C doc pdf
 
 
-- libtool is not able to find the libraries automatically, you need to make some
-  copies:
+If makeinfo is available on the machine, html pages can be generated by running
 
 
-    copy c:\cuda\lib\cuda.lib c:\cuda\lib\libcuda.lib
-    copy c:\cuda\lib\cudart.lib c:\cuda\lib\libcudart.lib
-    copy c:\cuda\lib\cublas.lib c:\cuda\lib\libcublas.lib
-    copy c:\cuda\lib\cufft.lib c:\cuda\lib\libcufft.lib
-    copy c:\cuda\lib\OpenCL.lib c:\cuda\lib\libOpenCL.lib
+  $ make -C doc html
 
 
-++===========++
-|| V. Trying ||
-++===========++
+++============++
+|| VI. Trying ||
+++============++
 
 
 Some examples ready to run are installed into $prefix/lib/starpu/{examples,mpi}
 Some examples ready to run are installed into $prefix/lib/starpu/{examples,mpi}
 
 
-++=============++
-|| VI. Upgrade ||
-++=============++
+++==============++
+|| VII. Upgrade ||
+++==============++
 
 
 To upgrade your source code from older version (there were quite a few
 To upgrade your source code from older version (there were quite a few
 renamings), use the tools/rename.sh script
 renamings), use the tools/rename.sh script
 
 
-++==============++
-|| VII. Contact ||
-++==============++
+++===============++
+|| VIII. Contact ||
+++===============++
 
 
 For any questions regarding StarPU, please contact the starpu-devel
 For any questions regarding StarPU, please contact the starpu-devel
 mailing-list at starpu-devel@lists.gforge.inria.fr .
 mailing-list at starpu-devel@lists.gforge.inria.fr .

+ 169 - 0
README.dev

@@ -0,0 +1,169 @@
+Installing StarPU on windows
+----------------------------
+
+If you are building from a tarball downloaded from the website, you can skip the
+cygwin part.
+
+1. Install cygwin
+
+http://cygwin.com/install.html
+
+Make sure the following packages are available:
+- (Devel)/subversion
+- (Devel)/libtool
+- (Devel)/gcc
+- (Devel)/make
+- your favorite editor (vi, emacs, ...)
+- (Devel)/gdb
+- (Archive)/zip
+- (Devel)/pkg-config
+
+2. Install mingw
+
+http://sourceforge.net/projects/mingw/
+
+3. Install hwloc (not mandatory)
+
+http://www.open-mpi.org/projects/hwloc
+
+4. Install Microsoft Visual C++ Studio Express
+
+   http://www.microsoft.com/express/Downloads
+
+   Add in your path the following directories.
+   (adjusting where necessary for the Installation location according to VC
+    version and on 64 and 32bit Windows versions)
+
+   On cygwin, with Visual C++ 2010 e.g.;
+
+   export PATH="/cygdrive/c/Program Files (x86)/Microsoft Visual Studio 10.0/Common7/IDE":$PATH
+   export PATH="/cygdrive/c/Program Files (x86)/Microsoft Visual Studio 10.0/VC/bin":$PATH
+
+   On MingW, with Visual C++ 2010, e.g.;
+
+   export PATH="/c/Program Files (x86)/Microsoft Visual Studio 10.0/Common7/IDE":$PATH
+   export PATH="/c/Program Files (x86)/Microsoft Visual Studio 10.0/VC/bin":$PATH
+
+   Try to call <lib.exe> and <link.exe> without any option to make sure these
+   dump their help output, else no .def or .lib file will be produced.
+
+5. Install GPU Drivers (not mandatory)
+
+  5.1 Install Cuda
+
+      http://developer.nvidia.com/object/cuda_3_2_downloads.html
+
+      You need to install at least the CUDA toolkit.
+
+      libtool is not able to find the libraries automatically, you
+      need to make some copies:
+
+      copy c:\cuda\lib\cuda.lib c:\cuda\lib\libcuda.lib
+      copy c:\cuda\lib\cudart.lib c:\cuda\lib\libcudart.lib
+      copy c:\cuda\lib\cublas.lib c:\cuda\lib\libcublas.lib
+      copy c:\cuda\lib\cufft.lib c:\cuda\lib\libcufft.lib
+      copy c:\cuda\lib\OpenCL.lib c:\cuda\lib\libOpenCL.lib
+
+      (and if the version of your CUDA driver is >= 3.2)
+
+      copy c:\cuda\lib\curand.lib c:\cuda\lib\libcurand.lib
+
+      Add the CUDA bin directory in your path
+
+      export PATH=/cygdrive/c/CUDA/bin:$PATH
+
+      Since we tell nvcc to build CUDA code with gcc instead of Visual studio,
+      a fix is needed: c:\cuda\include\host_defines.h has a bogus CUDARTAPI
+      definition which makes linking fail completely. Replace the first
+      occurrence of
+
+      #define CUDARTAPI
+
+      with
+
+      #ifdef _WIN32
+      #define CUDARTAPI __stdcall
+      #else
+      #define CUDARTAPI
+      #endif
+
+      While at it, you can also comment the __cdecl definition to avoid spurious
+      warnings.
+
+
+  5.2 Install OpenCL
+
+      http://developer.nvidia.com/object/opencl-download.html
+
+      You need to download the NVIDIA Drivers for your version of
+      Windows. Executing the file will extract all files in a given
+      directory. Then the driver installation will start; it will fail
+      if no compatible drivers can be found on your system.
+
+      Anyway, you should copy the *.dl_ files from the directory
+      (extraction path) into the bin directory of the CUDA installation
+      directory (the directory should be v3.2/bin/)
+
+  5.3 Install MsCompress
+
+      http://gnuwin32.sourceforge.net/packages/mscompress.htm
+
+      Go to the CUDA bin directory, uncompress the .dl_ files and rename
+      them to .dll files
+
+      cp /cygdrive/c/NVIDIA/DisplayDriver/190.89/International/*.dl_ .
+      for i in *.dl_ ; do /cygdrive/c/Program\ Files/GnuWin32/bin/msexpand.exe  $i ; mv ${i%_} ${i%_}l ; done
+
+If you are building from a tarball downloaded from the website, you can skip the
+autogen.sh part.
+
+6. Start autogen.sh from cygwin
+
+   cd starpu-trunk
+   ./autogen.sh
+
+7. Start a MinGW shell
+
+   /cygdrive/c/MinGW/msys/1.0/bin/sh.exe --login -i
+
+8. Configure, make, install from MinGW
+
+   If you have a non-english version of windows, use
+
+     export LANG=C
+
+   else libtool has troubles parsing the translated output of the toolchain.
+
+   cd starpu-trunk
+   mkdir build
+   cd build
+   ../configure --prefix=$PWD/target --disable-default-drand48 \
+        --with-hwloc=<HWLOC installation directory> \
+        --with-cuda-dir=<CUDA installation directory> \
+        --with-cuda-lib-dir=<CUDA installation directory>/lib/Win32 \
+	--with-opencl-dir=<CUDA installation directory>
+   make
+   make install
+
+   Also convert a couple of files to CRLF:
+
+   sed -e 's/$/'$'\015'/ < README > $PWD/target/README.txt
+   sed -e 's/$/'$'\015'/ < AUTHORS > $PWD/target/AUTHORS.txt
+   sed -e 's/$/'$'\015'/ < COPYING.LGPL > $PWD/target/COPYING.LGPL.txt
+
+9. If you want your StarPU installation to be standalone, you need to
+   copy the DLL files from hwloc, Cuda, and OpenCL into the StarPU
+   installation bin directory, as well as MinGW/bin/libpthread*dll
+
+   cp <CUDA directory>/bin/*dll target/bin
+   cp <HWLOC directory>/bin/*dll target/bin
+   cp /cygdrive/c/MinGW/bin/libpthread*dll target/bin
+
+   and set the StarPU bin directory in your path.
+
+   export PATH=<StarPU installation directory>/bin:$PATH
+
+
+Developers warning
+------------------
+Developer warnings are only enabled if the STARPU_DEVEL environment variable is set to a non-empty value.

+ 16 - 0
acinclude.m4

@@ -1,3 +1,19 @@
+dnl Copyright (C) Free Software Foundation, Inc.
+dnl
+dnl This program is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU General Public License as published by
+dnl the Free Software Foundation; either version 2 of the License, or
+dnl (at your option) any later version.
+dnl 
+dnl This program is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+dnl GNU General Public License for more details.
+dnl 
+dnl You should have received a copy of the GNU General Public License
+dnl along with this program; if not, write to the Free Software
+dnl Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+dnl
 dnl This test is taken from libgfortran
 dnl This test is taken from libgfortran
 
 
 dnl Check whether the target supports __sync_val_compare_and_swap.
 dnl Check whether the target supports __sync_val_compare_and_swap.

ファイルの差分が大きいため隠しています
+ 528 - 205
configure.ac


ファイルの差分が大きいため隠しています
+ 1098 - 630
doc/starpu.texi


+ 45 - 0
doc/tutorial/Makefile

@@ -0,0 +1,45 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009-2011  Université de Bordeaux 1
+# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+CFLAGS          +=      $$(pkg-config --cflags libstarpu)
+LDFLAGS         +=      $$(pkg-config --libs libstarpu)
+
+HAS_CUDA	=	$(shell pkg-config --libs libstarpu|grep -i cuda)
+NVCC		?=	nvcc
+HAS_OPENCL	=	$(shell pkg-config --libs libstarpu|grep -i opencl)
+
+# Compile CUDA sources with the compiler named by NVCC, so that overriding
+# NVCC affects compilation as well as the final link.
+%.o: %.cu
+	$(NVCC) $(CFLAGS) $< -c
+
+all: hello_world vector_scal
+
+VECTOR_SCAL_PREREQUISITES	=	vector_scal.o vector_scal_cpu.o 
+ifneq ($(strip $(HAS_CUDA)),)
+VECTOR_SCAL_PREREQUISITES	+=	vector_scal_cuda.o
+VECTOR_SCAL_COMPILER		=	$(NVCC)
+else
+VECTOR_SCAL_COMPILER		=	$(CC)
+endif
+ifneq ($(strip $(HAS_OPENCL)),)
+VECTOR_SCAL_PREREQUISITES += vector_scal_opencl.o
+endif
+
+vector_scal: $(VECTOR_SCAL_PREREQUISITES)
+	$(VECTOR_SCAL_COMPILER) $(LDFLAGS) $^ -o $@
+
+clean:
+	rm -f hello_world vector_scal *.o
+

+ 33 - 0
doc/tutorial/README

@@ -0,0 +1,33 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009-2011  Université de Bordeaux 1
+# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+
+Instructions on how to compile and run StarPU examples
+------------------------------------------------------
+
+% export STARPU_DIR=<directory where StarPU is installed>
+% export PKG_CONFIG_PATH=$STARPU_DIR/lib/pkgconfig:$PKG_CONFIG_PATH
+% export LD_LIBRARY_PATH=$STARPU_DIR/lib:$LD_LIBRARY_PATH
+
+% make hello_world
+% ./hello_world
+
+% make vector_scal
+% ./vector_scal
+
+% STARPU_NCPUS=0 ./vector_scal
+% STARPU_NCPUS=0 STARPU_NCUDA=0 ./vector_scal
+

+ 70 - 0
doc/tutorial/hello_world.c

@@ -0,0 +1,70 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+
+struct params {    /* arguments passed to the codelet through task->cl_arg */
+    int i;         /* printed with %i by cpu_func */
+    float f;       /* printed with %f by cpu_func */
+};
+
+/* CPU kernel of the codelet: prints the two values carried by cl_arg,
+ * which must point to a struct params. The buffers array is not used. */
+void cpu_func(void *buffers[], void *cl_arg)
+{
+    const struct params *args = cl_arg;
+    int i = args->i;
+    float f = args->f;
+
+    printf("Hello world (params = {%i, %f} )\n", i, f);
+}
+
+/* Codelet for the hello-world task: restricted to STARPU_CPU, implemented
+ * by cpu_func, and declaring zero data buffers. */
+starpu_codelet cl =
+{
+    .nbuffers = 0,
+    .cpu_func = cpu_func,
+    .where = STARPU_CPU
+};
+
+/* Callback attached to the task in main(); prints the opaque argument
+ * it was registered with. */
+void callback_func(void *callback_arg)
+{
+    /* %p is the only conversion defined for a void * argument; passing a
+     * pointer to %x is undefined behaviour. */
+    printf("Callback function (arg %p)\n", callback_arg);
+}
+
+/* Submits one synchronous "hello world" task to StarPU.
+ * Returns 0 on success, 1 if StarPU could not be initialized or the task
+ * could not be submitted. */
+int main(int argc, char **argv)
+{
+    /* initialize StarPU; give up if it reports an error */
+    if (starpu_init(NULL) != 0) {
+        return 1;
+    }
+
+    struct starpu_task *task = starpu_task_create();
+
+    task->cl = &cl; /* Pointer to the codelet defined above */
+
+    struct params params = { 1, 2.0f };
+    task->cl_arg = &params;
+    task->cl_arg_size = sizeof(params);
+
+    task->callback_func = callback_func;
+    /* callback_func receives a void *, so the integer tag is cast explicitly */
+    task->callback_arg = (void *) 0x42;
+
+    /* starpu_task_submit will be a blocking call */
+    task->synchronous = 1;
+
+    /* submit the task to StarPU and remember whether it was accepted */
+    int ret = starpu_task_submit(task);
+
+    /* terminate StarPU */
+    starpu_shutdown();
+
+    return ret == 0 ? 0 : 1;
+}

+ 124 - 0
doc/tutorial/vector_scal.c

@@ -0,0 +1,124 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This example demonstrates how to use StarPU to scale an array by a factor.
+ * It shows how to manipulate data with StarPU's data management library.
+ *  1- how to declare a piece of data to StarPU (starpu_vector_data_register)
+ *  2- how to describe which data are accessed by a task (task->buffers[0])
+ *  3- how a kernel can manipulate the data (buffers[0].vector.ptr)
+ */
+#include <starpu.h>
+#include <starpu_opencl.h>
+
+#define    NX    2048
+
+extern void scal_cpu_func(void *buffers[], void *_args);
+extern void scal_cuda_func(void *buffers[], void *_args);
+extern void scal_opencl_func(void *buffers[], void *_args);
+
+/* Codelet for the vector scaling task: it may run on CPU, CUDA or OpenCL
+ * workers and accesses a single data buffer. The CUDA and OpenCL entry
+ * points are only filled in when the matching STARPU_USE_* macro is
+ * defined. */
+static starpu_codelet cl = {
+    .where = STARPU_CPU | STARPU_CUDA | STARPU_OPENCL,
+    /* CPU implementation of the codelet */
+    .cpu_func = scal_cpu_func,
+#ifdef STARPU_USE_CUDA
+    /* CUDA implementation of the codelet */
+    .cuda_func = scal_cuda_func,
+#endif
+#ifdef STARPU_USE_OPENCL
+    /* OpenCL implementation of the codelet */
+    .opencl_func = scal_opencl_func,
+#endif
+    .nbuffers = 1
+};
+
+#ifdef STARPU_USE_OPENCL
+/* OpenCL program loaded from the kernel source file in main(); it has
+ * external linkage so that the OpenCL codelet implementation can refer
+ * to it */
+struct starpu_opencl_program programs;
+#endif
+
+/* Scales a vector of NX floats by a constant factor through one synchronous
+ * StarPU task, printing the first element before and after. Returns 0 on
+ * success, 1 if StarPU could not be initialized or the task could not be
+ * submitted. */
+int main(int argc, char **argv)
+{
+    /* We consider a vector of float that is initialized just as any other
+     * C data */
+    float vector[NX];
+    unsigned i;
+    int ret;
+    for (i = 0; i < NX; i++)
+        vector[i] = 1.0f;
+
+    fprintf(stderr, "BEFORE : First element was %f\n", vector[0]);
+
+    /* Initialize StarPU with default configuration */
+    ret = starpu_init(NULL);
+    if (ret != 0)
+    {
+        fprintf(stderr, "StarPU initialization failed (error %d)\n", ret);
+        return 1;
+    }
+
+#ifdef STARPU_USE_OPENCL
+    starpu_opencl_load_opencl_from_file("vector_scal_opencl_kernel.cl", &programs, NULL);
+#endif
+
+    /* Tell StarPU to associate the "vector" vector with the "vector_handle"
+     * identifier. When a task needs to access a piece of data, it should
+     * refer to the handle that is associated to it.
+     * In the case of the "vector" data interface:
+     *  - the first argument of the registration method is a pointer to the
+     *    handle that should describe the data
+     *  - the second argument is the memory node where the data (ie. "vector")
+     *    resides initially: 0 stands for an address in main memory, as
+     *    opposed to an address on a GPU for instance.
+     *  - the third argument is the address of the vector in RAM
+     *  - the fourth argument is the number of elements in the vector
+     *  - the fifth argument is the size of each element.
+     */
+    starpu_data_handle vector_handle;
+    starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector,
+                                NX, sizeof(vector[0]));
+
+    /* float literal: avoids an implicit double-to-float conversion */
+    float factor = 3.14f;
+
+    /* create a synchronous task: any call to starpu_task_submit will block
+     * until it is terminated */
+    struct starpu_task *task = starpu_task_create();
+    task->synchronous = 1;
+
+    task->cl = &cl;
+
+    /* the codelet manipulates one buffer in RW mode */
+    task->buffers[0].handle = vector_handle;
+    task->buffers[0].mode = STARPU_RW;
+
+    /* an argument is passed to the codelet, beware that this is a
+     * READ-ONLY buffer and that the codelet may be given a pointer to a
+     * COPY of the argument */
+    task->cl_arg = &factor;
+    task->cl_arg_size = sizeof(factor);
+
+    /* execute the task on any eligible computational resource; on failure
+     * we still fall through so that everything set up above is released */
+    ret = starpu_task_submit(task);
+    if (ret != 0)
+        fprintf(stderr, "Task submission failed (error %d)\n", ret);
+
+    /* StarPU does not need to manipulate the array anymore so we can stop
+     * monitoring it */
+    starpu_data_unregister(vector_handle);
+
+#ifdef STARPU_USE_OPENCL
+    starpu_opencl_unload_opencl(&programs);
+#endif
+
+    /* terminate StarPU, no task can be submitted after */
+    starpu_shutdown();
+
+    fprintf(stderr, "AFTER First element is %f\n", vector[0]);
+
+    return ret != 0;
+}

+ 50 - 0
doc/tutorial/vector_scal_cpu.c

@@ -0,0 +1,50 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+
+/* CPU kernel: multiplies every element of the vector found in buffers[0]
+ * by the float factor that cl_arg points to */
+void scal_cpu_func(void *buffers[], void *cl_arg)
+{
+    /*
+     * buffers[] mirrors task->buffers[]: since task->buffers[0].handle
+     * designates data registered with the vector "interface", the first
+     * entry handed to the codelet points to the structure describing that
+     * vector. It exposes the number of elements (nx), the location of the
+     * possibly migrated/replicated array (ptr) and the size of each
+     * element (elemsize).
+     */
+    starpu_vector_interface_t *vec = buffers[0];
+    const float *scale = cl_arg;
+
+    /* a vector may hold any element type, hence the cast of the pointer
+     * returned by the accessor to (float *) */
+    float *elems = (float *)STARPU_VECTOR_GET_PTR(vec);
+
+    /* number of elements to process */
+    unsigned count = STARPU_VECTOR_GET_NX(vec);
+    unsigned idx = 0;
+
+    /* scale the vector in place */
+    while (idx < count)
+    {
+        elems[idx] *= *scale;
+        idx++;
+    }
+}
+

+ 43 - 0
doc/tutorial/vector_scal_cuda.cu

@@ -0,0 +1,43 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <starpu_cuda.h>
+
+/* CUDA kernel: each thread scales at most one element of val; threads
+ * whose index falls at or beyond n do nothing */
+static __global__ void vector_mult_cuda(float *val, unsigned n, float factor)
+{
+        const unsigned idx = blockIdx.x*blockDim.x + threadIdx.x;
+
+        if (idx >= n)
+               return;
+
+        val[idx] *= factor;
+}
+
+/* CUDA implementation of the codelet: launches vector_mult_cuda over the
+ * vector described by buffers[0] and waits for the stream to drain */
+extern "C" void scal_cuda_func(void *buffers[], void *_args)
+{
+        const unsigned block_size = 64;
+        float *scale = (float *)_args;
+
+        /* number of elements, and pointer to the local copy of the vector */
+        unsigned count = STARPU_VECTOR_GET_NX(buffers[0]);
+        float *elems = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);
+
+        /* round up so that a partially filled last block is still launched */
+        unsigned grid_size = (count + block_size - 1) / block_size;
+
+        vector_mult_cuda<<<grid_size, block_size, 0, starpu_cuda_get_local_stream()>>>(elems, count, *scale);
+
+        cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}
+

+ 60 - 0
doc/tutorial/vector_scal_opencl.c

@@ -0,0 +1,60 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <starpu_opencl.h>
+
+extern struct starpu_opencl_program programs;
+
+/* OpenCL implementation of the codelet: scales the vector described by
+ * buffers[0] by the float factor that _args points to, using the
+ * "vector_mult_opencl" kernel from the program loaded into `programs`. */
+void scal_opencl_func(void *buffers[], void *_args)
+{
+    float *factor = _args;
+    int id, devid, err;
+    cl_kernel kernel;
+    cl_command_queue queue;
+    cl_event event;
+
+    /* length of the vector */
+    unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
+    /* OpenCL copy of the vector pointer */
+    cl_mem val = (cl_mem) STARPU_VECTOR_GET_PTR(buffers[0]);
+
+    /* nothing to scale; a zero global work size is also an error for
+     * clEnqueueNDRangeKernel in OpenCL 1.x */
+    if (n == 0)
+        return;
+
+    id = starpu_worker_get_id();
+    devid = starpu_worker_get_devid(id);
+
+    err = starpu_opencl_load_kernel(&kernel, &queue, &programs,
+                    "vector_mult_opencl", devid);   /* Name of the kernel in the OpenCL source file */
+    if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+    err = clSetKernelArg(kernel, 0, sizeof(val), &val);
+    err |= clSetKernelArg(kernel, 1, sizeof(n), &n);
+    err |= clSetKernelArg(kernel, 2, sizeof(*factor), factor);
+    if (err) STARPU_OPENCL_REPORT_ERROR(err);
+
+    {
+        /* one work-item per element: with a global size of 1 only the
+         * first element would be scaled. A NULL local size lets the
+         * OpenCL implementation choose the work-group size. */
+        size_t global = n;
+        err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, &event);
+        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+    }
+
+    /* wait for completion before reading the event and releasing it */
+    clFinish(queue);
+    starpu_opencl_collect_stats(event);
+    clReleaseEvent(event);
+
+    starpu_opencl_release_kernel(kernel);
+}

+ 25 - 0
doc/tutorial/vector_scal_opencl_kernel.cl

@@ -0,0 +1,25 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/* OpenCL kernel: the work-item with global id i scales val[i] by factor;
+ * work-items whose id is not below nx do nothing */
+__kernel void vector_mult_opencl(__global float* val, int nx, float factor)
+{
+        const int gid = get_global_id(0);
+
+        if (gid >= nx)
+                return;
+
+        val[gid] *= factor;
+}
+

+ 1 - 1
doc/vector_scal_c.texi

@@ -49,7 +49,7 @@ int main(int argc, char **argv)
 
 
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
         starpu_opencl_load_opencl_from_file(
         starpu_opencl_load_opencl_from_file(
-               "examples/basic_examples/vector_scal_opencl_kernel.cl", &programs);
+               "examples/basic_examples/vector_scal_opencl_kernel.cl", &programs, NULL);
 #endif
 #endif
 
 
     /* Tell StaPU to associate the "vector" vector with the "vector_handle"
     /* Tell StaPU to associate the "vector" vector with the "vector_handle"

+ 2 - 1
doc/vector_scal_cuda.texi

@@ -1,9 +1,10 @@
 #include <starpu.h>
 #include <starpu.h>
+#include <starpu_cuda.h>
 
 
 static __global__ void vector_mult_cuda(float *val, unsigned n,
 static __global__ void vector_mult_cuda(float *val, unsigned n,
                                         float factor)
                                         float factor)
 @{
 @{
-        unsigned i;
+        unsigned i =  blockIdx.x*blockDim.x + threadIdx.x;
         if (i < n)
         if (i < n)
                val[i] *= factor;
                val[i] *= factor;
 @}
 @}

+ 3 - 3
doc/vector_scal_opencl.texi

@@ -13,8 +13,8 @@ void scal_opencl_func(void *buffers[], void *_args)
 
 
     /* length of the vector */
     /* length of the vector */
     unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
     unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
-    /* local copy of the vector pointer */
-    float *val = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);
+    /* OpenCL copy of the vector pointer */
+    cl_mem val = (cl_mem)STARPU_VECTOR_GET_PTR(buffers[0]);
 
 
     id = starpu_worker_get_id();
     id = starpu_worker_get_id();
     devid = starpu_worker_get_devid(id);
     devid = starpu_worker_get_devid(id);
@@ -23,7 +23,7 @@ void scal_opencl_func(void *buffers[], void *_args)
                                     devid);
                                     devid);
     if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
     if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
 
-    err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &val);
+    err = clSetKernelArg(kernel, 0, sizeof(val), &val);
     err |= clSetKernelArg(kernel, 1, sizeof(n), &n);
     err |= clSetKernelArg(kernel, 1, sizeof(n), &n);
     err |= clSetKernelArg(kernel, 2, sizeof(*factor), factor);
     err |= clSetKernelArg(kernel, 2, sizeof(*factor), factor);
     if (err) STARPU_OPENCL_REPORT_ERROR(err);
     if (err) STARPU_OPENCL_REPORT_ERROR(err);

+ 1 - 0
examples/.gitignore

@@ -0,0 +1 @@
+/.deps

+ 197 - 235
examples/Makefile.am

@@ -1,7 +1,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 #
-# Copyright (C) 2009, 2010  Université de Bordeaux 1
-# Copyright (C) 2010  Centre National de la Recherche Scientifique
+# Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
+# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 #
 #
 # StarPU is free software; you can redistribute it and/or modify
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
 # it under the terms of the GNU Lesser General Public License as published by
@@ -14,24 +14,23 @@
 #
 #
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
 
-AUTOMAKE_OPTIONS = subdir-objects
-
-AM_CFLAGS = $(HWLOC_CFLAGS) -Wall
-LIBS = $(top_builddir)/src/libstarpu.la $(HWLOC_LIBS) @LIBS@
+AM_CFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
+LIBS = $(top_builddir)/src/libstarpu.la $(MAGMA_LIBS) $(HWLOC_LIBS) @LIBS@
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include
+AM_LDFLAGS = $(STARPU_CUDA_LDFLAGS) $(STARPU_OPENCL_LDFLAGS)
 
 
-TESTS	=	$(check_PROGRAMS)
+SUBDIRS = stencil
 
 
-SUBDIRS = stencil stencil_ctx
+if STARPU_USE_SOCL
+SUBDIRS += socl
+endif
 
 
 if STARPU_HAVE_FFTW
 if STARPU_HAVE_FFTW
-if STARPU_HAVE_FFTWL
+if STARPU_HAVE_FFTWF
 SUBDIRS += starpufft
 SUBDIRS += starpufft
 endif
 endif
 endif
 endif
 
 
-check_PROGRAMS =
-
 BUILT_SOURCES =
 BUILT_SOURCES =
 
 
 if STARPU_USE_OPENCL
 if STARPU_USE_OPENCL
@@ -40,7 +39,9 @@ endif
 
 
 EXTRA_DIST = 					\
 EXTRA_DIST = 					\
 	basic_examples/vector_scal_opencl_kernel.cl \
 	basic_examples/vector_scal_opencl_kernel.cl \
+	common/blas_model.c			\
 	spmv/spmv_cuda.cu			\
 	spmv/spmv_cuda.cu			\
+	spmv/spmv_opencl.cl			\
 	gordon/null_kernel_gordon.c		\
 	gordon/null_kernel_gordon.c		\
 	mult/xgemm.c				\
 	mult/xgemm.c				\
 	lu/xlu.c				\
 	lu/xlu.c				\
@@ -54,6 +55,7 @@ EXTRA_DIST = 					\
 	basic_examples/variable_kernels_opencl_kernel.cl	\
 	basic_examples/variable_kernels_opencl_kernel.cl	\
 	matvecmult/matvecmult_kernel.cl				\
 	matvecmult/matvecmult_kernel.cl				\
 	basic_examples/block_opencl_kernel.cl			\
 	basic_examples/block_opencl_kernel.cl			\
+	openmp/vector_scal.c			\
 	filters/fblock_opencl_kernel.cl
 	filters/fblock_opencl_kernel.cl
 
 
 CLEANFILES = 					\
 CLEANFILES = 					\
@@ -64,7 +66,7 @@ CLEANFILES += *.gcno *.gcda *.linkinfo
 
 
 if STARPU_USE_CUDA
 if STARPU_USE_CUDA
 
 
-NVCCFLAGS += --compiler-options -fno-strict-aliasing  $(HWLOC_CFLAGS) -I$(top_srcdir)/include/ -I$(top_builddir)/include/ -I$(top_srcdir)/examples/  -arch sm_13
+NVCCFLAGS += --compiler-options -fno-strict-aliasing  -I$(top_srcdir)/include/ -I$(top_builddir)/include/ -I$(top_srcdir)/examples/  $(HWLOC_CFLAGS) -arch sm_13
 
 
 .cu.o:
 .cu.o:
 	$(NVCC) $< -c -o $@ $(NVCCFLAGS)
 	$(NVCC) $< -c -o $@ $(NVCCFLAGS)
@@ -106,11 +108,6 @@ noinst_HEADERS = 				\
 	lu/xlu_kernels.h			\
 	lu/xlu_kernels.h			\
 	lu/float.h				\
 	lu/float.h				\
 	lu/double.h				\
 	lu/double.h				\
-	pi/pi.h					\
-	pi/SobolQRNG/sobol.h			\
-	pi/SobolQRNG/sobol_gold.h		\
-	pi/SobolQRNG/sobol_gpu.h		\
-	pi/SobolQRNG/sobol_primitives.h		\
 	cholesky/cholesky.h			\
 	cholesky/cholesky.h			\
 	common/blas_model.h			\
 	common/blas_model.h			\
 	common/blas.h				\
 	common/blas.h				\
@@ -122,22 +119,134 @@ noinst_HEADERS = 				\
 	ppm_downscaler/yuv_downscaler.h		\
 	ppm_downscaler/yuv_downscaler.h		\
 	spmv/matrix_market/mmio.h		\
 	spmv/matrix_market/mmio.h		\
 	spmv/matrix_market/mm_to_bcsr.h		\
 	spmv/matrix_market/mm_to_bcsr.h		\
-	spmv/dw_spmv.h				\
+	spmv/spmv.h				\
 	spmv/dw_block_spmv.h
 	spmv/dw_block_spmv.h
 
 
+#####################################
+# What to install and what to check #
+#####################################
 
 
-##################
-# Basic examples #
-##################
+STARPU_EXAMPLES	=
+TESTS		=	$(STARPU_EXAMPLES)
+
+if STARPU_HAVE_WINDOWS
+check_PROGRAMS	=	$(STARPU_EXAMPLES)
+else
+check_PROGRAMS	=	$(LOADER) $(STARPU_EXAMPLES)
+endif
+
+if !STARPU_HAVE_WINDOWS
+## test loader program
+LOADER			=	loader
+LOADER_BIN		=	$(abs_top_builddir)/examples/$(LOADER)
+loader_SOURCES		=	../tests/loader.c
+TESTS_ENVIRONMENT	=	$(LOADER_BIN)
+endif
 
 
 examplebin_PROGRAMS +=				\
 examplebin_PROGRAMS +=				\
-	basic_examples/hello_world
+	basic_examples/hello_world		\
+	basic_examples/vector_scal		\
+	basic_examples/mult			\
+	basic_examples/block			\
+	basic_examples/variable			\
+	filters/fvector				\
+	filters/fblock				\
+	filters/fmatrix				\
+	tag_example/tag_example			\
+	tag_example/tag_example3		\
+	tag_example/tag_example2		\
+	tag_example/tag_restartable		\
+	spmv/spmv				\
+	callback/callback			\
+	incrementer/incrementer			\
+	matvecmult/matvecmult			\
+	profiling/profiling			\
+	scheduler/dummy_sched			\
+	reductions/dot_product			\
+	reductions/minmax_reduction		\
+	mandelbrot/mandelbrot			\
+	ppm_downscaler/ppm_downscaler		\
+	ppm_downscaler/yuv_downscaler
 
 
-basic_examples_hello_world_SOURCES =		\
-	basic_examples/hello_world.c
+if STARPU_HAVE_F77_H
+examplebin_PROGRAMS +=				\
+	basic_examples/vector_scal_fortran
+endif
+
+if !NO_BLAS_LIB
+examplebin_PROGRAMS +=				\
+	axpy/axpy				\
+	mult/sgemm 				\
+	mult/dgemm				\
+	cholesky/cholesky_tag			\
+	cholesky/cholesky_tile_tag		\
+	cholesky/cholesky_grain_tag		\
+	cholesky/cholesky_implicit		\
+	lu/lu_example_float			\
+	lu/lu_example_double			\
+	lu/lu_implicit_example_float		\
+	lu/lu_implicit_example_double		\
+	heat/heat				\
+	cg/cg
+endif
 
 
+if ATLAS_BLAS_LIB
 examplebin_PROGRAMS +=				\
 examplebin_PROGRAMS +=				\
-	basic_examples/vector_scal
+	spmv/dw_block_spmv
+endif
+
+STARPU_EXAMPLES +=				\
+	basic_examples/hello_world		\
+	basic_examples/vector_scal		\
+	basic_examples/mult			\
+	basic_examples/block			\
+	basic_examples/variable			\
+	filters/fvector				\
+	filters/fblock				\
+	filters/fmatrix				\
+	tag_example/tag_example			\
+	tag_example/tag_example3		\
+	tag_example/tag_example2		\
+	tag_example/tag_restartable		\
+	spmv/spmv				\
+	callback/callback			\
+	incrementer/incrementer			\
+	matvecmult/matvecmult			\
+	profiling/profiling			\
+	scheduler/dummy_sched			\
+	reductions/dot_product			\
+	reductions/minmax_reduction
+
+if STARPU_HAVE_F77_H
+STARPU_EXAMPLES +=				\
+	basic_examples/vector_scal_fortran
+endif
+
+if !NO_BLAS_LIB
+STARPU_EXAMPLES +=				\
+	axpy/axpy				\
+	mult/sgemm 				\
+	mult/dgemm				\
+	cholesky/cholesky_tag			\
+	cholesky/cholesky_tile_tag		\
+	cholesky/cholesky_grain_tag		\
+	cholesky/cholesky_implicit		\
+	lu/lu_example_float			\
+	lu/lu_example_double			\
+	lu/lu_implicit_example_float		\
+	lu/lu_implicit_example_double		\
+	heat/heat				\
+	cg/cg
+endif
+
+if ATLAS_BLAS_LIB
+STARPU_EXAMPLES +=				\
+	spmv/dw_block_spmv
+endif
+
+##################
+# Basic examples #
+##################
 
 
 basic_examples_vector_scal_SOURCES =		\
 basic_examples_vector_scal_SOURCES =		\
 	basic_examples/vector_scal.c		\
 	basic_examples/vector_scal.c		\
@@ -156,9 +265,6 @@ nobase_STARPU_OPENCL_DATA_DATA += \
 endif
 endif
 
 
 if STARPU_HAVE_F77_H
 if STARPU_HAVE_F77_H
-examplebin_PROGRAMS +=				\
-	basic_examples/vector_scal_fortran
-
 basic_examples_vector_scal_fortran_SOURCES =	\
 basic_examples_vector_scal_fortran_SOURCES =	\
 	basic_examples/vector_scal_fortran.F	\
 	basic_examples/vector_scal_fortran.F	\
 	basic_examples/vector_scal_c.c		\
 	basic_examples/vector_scal_c.c		\
@@ -167,25 +273,15 @@ basic_examples_vector_scal_fortran_SOURCES =	\
 if STARPU_USE_CUDA
 if STARPU_USE_CUDA
 basic_examples_vector_scal_fortran_SOURCES +=	\
 basic_examples_vector_scal_fortran_SOURCES +=	\
 	basic_examples/vector_scal_cuda.cu
 	basic_examples/vector_scal_cuda.cu
+basic_examples_vector_scal_fortran_LDADD =	\
+	$(STARPU_CUDA_FORTRAN_LDFLAGS)
 endif
 endif
 endif
 endif
 
 
-examplebin_PROGRAMS +=				\
-	basic_examples/mult
-
-basic_examples_mult_SOURCES =			\
-	basic_examples/mult.c
-
 #################
 #################
 # block example #
 # block example #
 #################
 #################
 
 
-check_PROGRAMS +=				\
-	basic_examples/block
-
-examplebin_PROGRAMS +=				\
-	basic_examples/block
-
 basic_examples_block_SOURCES =			\
 basic_examples_block_SOURCES =			\
 	basic_examples/block.c			\
 	basic_examples/block.c			\
 	basic_examples/block_cpu.c
 	basic_examples/block_cpu.c
@@ -206,12 +302,6 @@ endif
 # Variable example #
 # Variable example #
 ####################
 ####################
 
 
-check_PROGRAMS +=				\
-	basic_examples/variable
-
-examplebin_PROGRAMS +=				\
-	basic_examples/variable
-
 basic_examples_variable_SOURCES =		\
 basic_examples_variable_SOURCES =		\
 	basic_examples/variable.c		\
 	basic_examples/variable.c		\
 	basic_examples/variable_kernels_cpu.c
 	basic_examples/variable_kernels_cpu.c
@@ -232,14 +322,6 @@ endif
 # Filters #
 # Filters #
 ###########
 ###########
 
 
-examplebin_PROGRAMS +=				\
-	filters/fvector				\
-	filters/fblock				\
-	filters/fmatrix
-
-filters_fvector_SOURCES =			\
-	filters/fvector.c
-
 filters_fblock_SOURCES =			\
 filters_fblock_SOURCES =			\
 	filters/fblock.c			\
 	filters/fblock.c			\
 	filters/fblock_cpu.c
 	filters/fblock_cpu.c
@@ -255,69 +337,17 @@ nobase_STARPU_OPENCL_DATA_DATA += \
 	filters/fblock_opencl_kernel.cl
 	filters/fblock_opencl_kernel.cl
 endif
 endif
 
 
-filters_fmatrix_SOURCES =			\
-	filters/fmatrix.c
-
-###################
-# PPM downscaling #
-###################
-
-examplebin_PROGRAMS +=				\
-	ppm_downscaler/ppm_downscaler
-
-ppm_downscaler_ppm_downscaler_SOURCES =		\
-	ppm_downscaler/ppm_downscaler.c
-
-examplebin_PROGRAMS +=				\
-	ppm_downscaler/yuv_downscaler
-
-ppm_downscaler_yuv_downscaler_SOURCES =		\
-	ppm_downscaler/yuv_downscaler.c
-
-######
-# Pi #
-######
-
-check_PROGRAMS +=				\
-	pi/pi_redux
-
-examplebin_PROGRAMS +=				\
-	pi/pi					\
-	pi/pi_redux
-
-pi_pi_SOURCES =					\
-	pi/pi.c					\
-	pi/SobolQRNG/sobol_gold.c		\
-	pi/SobolQRNG/sobol_primitives.c
-
-if STARPU_USE_CUDA
-pi_pi_SOURCES +=				\
-	pi/pi_kernel.cu				\
-	pi/SobolQRNG/sobol_gpu.cu
-endif
-
-pi_pi_redux_SOURCES =				\
-	pi/pi_redux.c
-
-if STARPU_USE_CUDA
-pi_pi_redux_SOURCES +=				\
-	pi/pi_redux_kernel.cu
-endif
-
-
 ################
 ################
 # AXPY example #
 # AXPY example #
 ################
 ################
 
 
 if !NO_BLAS_LIB
 if !NO_BLAS_LIB
-
-examplebin_PROGRAMS +=				\
-	axpy/axpy
-
 axpy_axpy_SOURCES =				\
 axpy_axpy_SOURCES =				\
 	axpy/axpy.c				\
 	axpy/axpy.c				\
 	common/blas.c
 	common/blas.c
 
 
+axpy_axpy_LDADD =				\
+	$(STARPU_BLAS_LDFLAGS)
 endif
 endif
 
 
 ################
 ################
@@ -326,18 +356,20 @@ endif
 
 
 if !NO_BLAS_LIB
 if !NO_BLAS_LIB
 
 
-examplebin_PROGRAMS += 				\
-	mult/sgemm 				\
-	mult/dgemm
-
 mult_sgemm_SOURCES = 				\
 mult_sgemm_SOURCES = 				\
 	mult/sgemm.c				\
 	mult/sgemm.c				\
 	common/blas.c
 	common/blas.c
 
 
+mult_sgemm_LDADD =				\
+	$(STARPU_BLAS_LDFLAGS)
+
 mult_dgemm_SOURCES = 				\
 mult_dgemm_SOURCES = 				\
 	mult/dgemm.c				\
 	mult/dgemm.c				\
 	common/blas.c
 	common/blas.c
 
 
+mult_dgemm_LDADD =				\
+	$(STARPU_BLAS_LDFLAGS)
+
 endif
 endif
 
 
 ####################
 ####################
@@ -346,36 +378,42 @@ endif
 
 
 if !NO_BLAS_LIB
 if !NO_BLAS_LIB
 
 
-examplebin_PROGRAMS += 				\
-	cholesky/cholesky_tag			\
-	cholesky/cholesky_tile_tag		\
-	cholesky/cholesky_grain_tag		\
-	cholesky/cholesky_implicit
-
 cholesky_cholesky_tag_SOURCES =			\
 cholesky_cholesky_tag_SOURCES =			\
 	cholesky/cholesky_tag.c			\
 	cholesky/cholesky_tag.c			\
 	cholesky/cholesky_models.c		\
 	cholesky/cholesky_models.c		\
 	cholesky/cholesky_kernels.c		\
 	cholesky/cholesky_kernels.c		\
 	common/blas.c
 	common/blas.c
 
 
+cholesky_cholesky_tag_LDADD =			\
+	$(STARPU_BLAS_LDFLAGS)
+
 cholesky_cholesky_tile_tag_SOURCES =		\
 cholesky_cholesky_tile_tag_SOURCES =		\
 	cholesky/cholesky_tile_tag.c		\
 	cholesky/cholesky_tile_tag.c		\
 	cholesky/cholesky_models.c		\
 	cholesky/cholesky_models.c		\
 	cholesky/cholesky_kernels.c		\
 	cholesky/cholesky_kernels.c		\
 	common/blas.c
 	common/blas.c
 
 
+cholesky_cholesky_tile_tag_LDADD =		\
+	$(STARPU_BLAS_LDFLAGS)
+
 cholesky_cholesky_grain_tag_SOURCES =		\
 cholesky_cholesky_grain_tag_SOURCES =		\
 	cholesky/cholesky_grain_tag.c		\
 	cholesky/cholesky_grain_tag.c		\
 	cholesky/cholesky_models.c		\
 	cholesky/cholesky_models.c		\
 	cholesky/cholesky_kernels.c		\
 	cholesky/cholesky_kernels.c		\
 	common/blas.c
 	common/blas.c
 
 
+cholesky_cholesky_grain_tag_LDADD =		\
+	$(STARPU_BLAS_LDFLAGS)
+
 cholesky_cholesky_implicit_SOURCES =		\
 cholesky_cholesky_implicit_SOURCES =		\
 	cholesky/cholesky_implicit.c		\
 	cholesky/cholesky_implicit.c		\
 	cholesky/cholesky_models.c		\
 	cholesky/cholesky_models.c		\
 	cholesky/cholesky_kernels.c		\
 	cholesky/cholesky_kernels.c		\
 	common/blas.c
 	common/blas.c
 
 
+cholesky_cholesky_implicit_LDADD =		\
+	$(STARPU_BLAS_LDFLAGS)
+
 endif
 endif
 
 
 ##############
 ##############
@@ -384,14 +422,6 @@ endif
 
 
 if !NO_BLAS_LIB
 if !NO_BLAS_LIB
 
 
-check_PROGRAMS +=				\
-	lu/lu_example_float			\
-	lu/lu_implicit_example_float
-
-examplebin_PROGRAMS += 				\
-	lu/lu_example_float			\
-	lu/lu_example_double
-
 lu_lu_example_float_SOURCES =			\
 lu_lu_example_float_SOURCES =			\
 	lu/lu_example_float.c			\
 	lu/lu_example_float.c			\
 	lu/slu.c				\
 	lu/slu.c				\
@@ -399,6 +429,9 @@ lu_lu_example_float_SOURCES =			\
 	lu/slu_kernels.c			\
 	lu/slu_kernels.c			\
 	common/blas.c
 	common/blas.c
 
 
+lu_lu_example_float_LDADD =			\
+	$(STARPU_BLAS_LDFLAGS)
+
 lu_lu_example_double_SOURCES =			\
 lu_lu_example_double_SOURCES =			\
 	lu/lu_example_double.c			\
 	lu/lu_example_double.c			\
 	lu/dlu.c				\
 	lu/dlu.c				\
@@ -406,9 +439,8 @@ lu_lu_example_double_SOURCES =			\
 	lu/dlu_kernels.c			\
 	lu/dlu_kernels.c			\
 	common/blas.c
 	common/blas.c
 
 
-examplebin_PROGRAMS += 				\
-	lu/lu_implicit_example_float		\
-	lu/lu_implicit_example_double
+lu_lu_example_double_LDADD =			\
+	$(STARPU_BLAS_LDFLAGS)
 
 
 lu_lu_implicit_example_float_SOURCES =		\
 lu_lu_implicit_example_float_SOURCES =		\
 	lu/lu_example_float.c			\
 	lu/lu_example_float.c			\
@@ -417,6 +449,9 @@ lu_lu_implicit_example_float_SOURCES =		\
 	lu/slu_kernels.c			\
 	lu/slu_kernels.c			\
 	common/blas.c
 	common/blas.c
 
 
+lu_lu_implicit_example_float_LDADD =		\
+	$(STARPU_BLAS_LDFLAGS)
+
 lu_lu_implicit_example_double_SOURCES =		\
 lu_lu_implicit_example_double_SOURCES =		\
 	lu/lu_example_double.c			\
 	lu/lu_example_double.c			\
 	lu/dlu_implicit.c			\
 	lu/dlu_implicit.c			\
@@ -424,6 +459,8 @@ lu_lu_implicit_example_double_SOURCES =		\
 	lu/dlu_kernels.c			\
 	lu/dlu_kernels.c			\
 	common/blas.c
 	common/blas.c
 
 
+lu_lu_implicit_example_double_LDADD =		\
+	$(STARPU_BLAS_LDFLAGS)
 endif
 endif
 
 
 ###########################
 ###########################
@@ -448,8 +485,6 @@ endif
 
 
 if !NO_BLAS_LIB
 if !NO_BLAS_LIB
 
 
-examplebin_PROGRAMS += heat/heat
-
 heat_heat_SOURCES =				\
 heat_heat_SOURCES =				\
 	heat/heat.c				\
 	heat/heat.c				\
 	heat/dw_factolu.c			\
 	heat/dw_factolu.c			\
@@ -462,6 +497,10 @@ heat_heat_SOURCES =				\
 	heat/dw_factolu_kernels.c		\
 	heat/dw_factolu_kernels.c		\
 	common/blas.c
 	common/blas.c
 
 
+heat_heat_LDADD =				\
+	$(STARPU_OPENGL_RENDER_LDFLAGS)		\
+	$(STARPU_BLAS_LDFLAGS)
+
 endif
 endif
 
 
 ##############
 ##############
@@ -470,8 +509,6 @@ endif
 
 
 if !NO_BLAS_LIB
 if !NO_BLAS_LIB
 
 
-examplebin_PROGRAMS += cg/cg
-
 cg_cg_SOURCES =					\
 cg_cg_SOURCES =					\
 	cg/cg.c					\
 	cg/cg.c					\
 	cg/cg_kernels.c				\
 	cg/cg_kernels.c				\
@@ -482,62 +519,33 @@ cg_cg_SOURCES +=				\
 	cg/cg_dot_kernel.cu
 	cg/cg_dot_kernel.cu
 endif
 endif
 
 
-endif
-
+cg_cg_LDADD =					\
+	$(STARPU_BLAS_LDFLAGS)
 
 
-
-################
-# Tag examples #
-################
-
-check_PROGRAMS +=			\
-	tag_example/tag_example			\
-	tag_example/tag_example3			\
-	tag_example/tag_example2	\
-	tag_example/tag_restartable
-
-examplebin_PROGRAMS +=			\
-	tag_example/tag_example			\
-	tag_example/tag_example3		\
-	tag_example/tag_example2	\
-	tag_example/tag_restartable
-
-tag_example_tag_example_SOURCES =		\
-	tag_example/tag_example.c
-
-tag_example_tag_example2_SOURCES =		\
-	tag_example/tag_example2.c
-
-tag_example_tag_example3_SOURCES =		\
-	tag_example/tag_example3.c
-
-tag_example_tag_restartable_SOURCES =		\
-	tag_example/tag_restartable.c
+endif
 
 
 ################
 ################
 # SpMV example #
 # SpMV example #
 ################
 ################
 
 
-examplebin_PROGRAMS += 				\
-	spmv/dw_spmv
-
-spmv_dw_spmv_SOURCES = 				\
-	spmv/dw_spmv.c
+spmv_spmv_SOURCES = 				\
+	spmv/spmv.c				\
+	spmv/spmv_kernels.c
 
 
 if STARPU_USE_CUDA
 if STARPU_USE_CUDA
-spmv_dw_spmv_SOURCES +=				\
+spmv_spmv_SOURCES +=				\
 	spmv/spmv_cuda.cu
 	spmv/spmv_cuda.cu
 endif
 endif
 
 
 if ATLAS_BLAS_LIB
 if ATLAS_BLAS_LIB
-examplebin_PROGRAMS += 				\
-	spmv/dw_block_spmv
-
 spmv_dw_block_spmv_SOURCES =			\
 spmv_dw_block_spmv_SOURCES =			\
 	spmv/dw_block_spmv.c			\
 	spmv/dw_block_spmv.c			\
 	spmv/dw_block_spmv_kernels.c		\
 	spmv/dw_block_spmv_kernels.c		\
 	spmv/matrix_market/mm_to_bcsr.c		\
 	spmv/matrix_market/mm_to_bcsr.c		\
 	spmv/matrix_market/mmio.c
 	spmv/matrix_market/mmio.c
+
+spmv_dw_block_spmv_LDADD =			\
+	$(STARPU_BLAS_LDFLAGS)
 endif
 endif
 
 
 #######################
 #######################
@@ -545,12 +553,6 @@ endif
 #######################
 #######################
 
 
 
 
-check_PROGRAMS +=				\
-	incrementer/incrementer
-
-examplebin_PROGRAMS +=				\
-	incrementer/incrementer
-
 incrementer_incrementer_SOURCES =	\
 incrementer_incrementer_SOURCES =	\
 	incrementer/incrementer.c
 	incrementer/incrementer.c
 if STARPU_USE_CUDA
 if STARPU_USE_CUDA
@@ -568,78 +570,38 @@ endif
 # matVecMult example #
 # matVecMult example #
 ######################
 ######################
 
 
-check_PROGRAMS +=				\
-	matvecmult/matvecmult
-
-examplebin_PROGRAMS +=				\
-	matvecmult/matvecmult
-
-matvecmult_matvecmult_SOURCES =	\
-	matvecmult/matvecmult.c
-
 if STARPU_USE_OPENCL
 if STARPU_USE_OPENCL
 nobase_STARPU_OPENCL_DATA_DATA += \
 nobase_STARPU_OPENCL_DATA_DATA += \
 	matvecmult/matvecmult_kernel.cl
 	matvecmult/matvecmult_kernel.cl
 endif
 endif
 
 
-#####################
-# profiling example #
-#####################
-
-check_PROGRAMS +=				\
-	profiling/profiling
-
-examplebin_PROGRAMS +=				\
-	profiling/profiling
-
-profiling_profiling_SOURCES =			\
-	profiling/profiling.c
-
-#####################
-# scheduler example #
-#####################
-
-check_PROGRAMS +=				\
-	scheduler/dummy_sched
-
-examplebin_PROGRAMS +=				\
-	scheduler/dummy_sched
-
-scheduler_dummy_sched_SOURCES =			\
-	scheduler/dummy_sched.c
-
 #######################
 #######################
 # dot_product example #
 # dot_product example #
 #######################
 #######################
 
 
-check_PROGRAMS +=				\
-	reductions/dot_product
-
-examplebin_PROGRAMS +=				\
-	reductions/dot_product
-
 reductions_dot_product_SOURCES =		\
 reductions_dot_product_SOURCES =		\
 	reductions/dot_product.c
 	reductions/dot_product.c
-
-#####################
-# Min/Max reduction #
-#####################
-
-check_PROGRAMS +=				\
-	reductions/minmax_reduction
-
-examplebin_PROGRAMS +=				\
-	reductions/minmax_reduction
-
-reductions_minmax_reduction_SOURCES =		\
-	reductions/minmax_reduction.c
+if STARPU_USE_CUDA
+reductions_dot_product_SOURCES +=		\
+	reductions/dot_product_kernels.cu
+endif
 
 
 ##################
 ##################
 # Mandelbrot Set #
 # Mandelbrot Set #
 ##################
 ##################
 
 
+mandelbrot_mandelbrot_CPPFLAGS = $(AM_CFLAGS) $(AM_CPPFLAGS)
+if HAVE_X11
+mandelbrot_mandelbrot_CPPFLAGS += $(X_CFLAGS)
+mandelbrot_mandelbrot_LDADD = $(X_PRE_LIBS) -lX11 $(X_LIBS) $(X_EXTRA_LIBS)
+endif
+
+################
+# Top Examples #
+################
+
 examplebin_PROGRAMS +=				\
 examplebin_PROGRAMS +=				\
-	mandelbrot/mandelbrot
+	top/hello_world_top
 
 
-mandelbrot_mandelbrot_SOURCES =			\
-	mandelbrot/mandelbrot.c
+top_hello_world_top_SOURCES =			\
+	top/hello_world_top.c

+ 5 - 7
examples/audio/starpu_audio_processing.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  *
@@ -30,7 +30,7 @@
 #include <cufft.h>
 #include <cufft.h>
 #endif
 #endif
 
 
-//#define SAVE_RAW	1
+/* #define SAVE_RAW	1 */
 
 
 #define DEFAULTINPUTFILE	"input.wav"
 #define DEFAULTINPUTFILE	"input.wav"
 #define DEFAULTOUTPUTFILE	"output.wav"
 #define DEFAULTOUTPUTFILE	"output.wav"
@@ -328,14 +328,14 @@ static void init_problem(void)
 	/* allocate a buffer to store the content of input file */
 	/* allocate a buffer to store the content of input file */
 	if (use_pin)
 	if (use_pin)
 	{
 	{
-		starpu_data_malloc_pinned_if_possible((void **)&A, length_data*sizeof(float));
+		starpu_malloc((void **)&A, length_data*sizeof(float));
 	}
 	}
 	else {
 	else {
 		A = malloc(length_data*sizeof(float));
 		A = malloc(length_data*sizeof(float));
 	}
 	}
 
 
 	/* allocate working buffer (this could be done online, but we'll keep it simple) */
 	/* allocate working buffer (this could be done online, but we'll keep it simple) */
-	//starpu_data_malloc_pinned_if_possible((void **)&outdata, length_data*sizeof(fftwf_complex));
+	/* starpu_data_malloc_pinned_if_possible((void **)&outdata, length_data*sizeof(fftwf_complex)); */
 
 
 	/* read input data into buffer "A" */
 	/* read input data into buffer "A" */
 	read_16bit_wav(infile, length_data, A, infile_raw);
 	read_16bit_wav(infile, length_data, A, infile_raw);
@@ -396,9 +396,7 @@ int main(int argc, char **argv)
 	struct starpu_data_filter f = 
 	struct starpu_data_filter f = 
 	{
 	{
 		.filter_func = starpu_block_filter_func_vector,
 		.filter_func = starpu_block_filter_func_vector,
-		.nchildren = niter,
-		.get_nchildren = NULL,
-		.get_child_ops = NULL
+		.nchildren = niter
 	};
 	};
 
 
 	starpu_data_partition(A_handle, &f);
 	starpu_data_partition(A_handle, &f);

+ 18 - 13
examples/axpy/axpy.c

@@ -1,8 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -36,6 +36,8 @@
 
 
 #define NBLOCKS	8
 #define NBLOCKS	8
 
 
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
 TYPE *vec_x, *vec_y;
 TYPE *vec_x, *vec_y;
 
 
 /* descriptors for StarPU */
 /* descriptors for StarPU */
@@ -93,21 +95,21 @@ int main(int argc, char **argv)
 		vec_a = malloc(N*sizeof(TYPE));
 		vec_a = malloc(N*sizeof(TYPE));
 		vec_b = malloc(N*sizeof(TYPE));
 		vec_b = malloc(N*sizeof(TYPE));
 	*/
 	*/
-	starpu_data_malloc_pinned_if_possible((void **)&vec_x, N*sizeof(TYPE));
+	starpu_malloc((void **)&vec_x, N*sizeof(TYPE));
 	assert(vec_x);
 	assert(vec_x);
 
 
-	starpu_data_malloc_pinned_if_possible((void **)&vec_y, N*sizeof(TYPE));
+	starpu_malloc((void **)&vec_y, N*sizeof(TYPE));
 	assert(vec_y);
 	assert(vec_y);
 
 
 	unsigned i;
 	unsigned i;
 	for (i = 0; i < N; i++)
 	for (i = 0; i < N; i++)
 	{
 	{
-		vec_x[i] = 1.0f;//(TYPE)starpu_drand48();
-		vec_y[i] = 4.0f;//(TYPE)starpu_drand48();
+		vec_x[i] = 1.0f; /*(TYPE)starpu_drand48(); */
+		vec_y[i] = 4.0f; /*(TYPE)starpu_drand48(); */
 	}
 	}
 
 
-	fprintf(stderr, "BEFORE x[0] = %2.2f\n", vec_x[0]);
-	fprintf(stderr, "BEFORE y[0] = %2.2f\n", vec_y[0]);
+	FPRINTF(stderr, "BEFORE x[0] = %2.2f\n", vec_x[0]);
+	FPRINTF(stderr, "BEFORE y[0] = %2.2f\n", vec_y[0]);
 
 
 	/* Declare the data to StarPU */
 	/* Declare the data to StarPU */
 	starpu_vector_data_register(&handle_x, 0, (uintptr_t)vec_x, N, sizeof(TYPE));
 	starpu_vector_data_register(&handle_x, 0, (uintptr_t)vec_x, N, sizeof(TYPE));
@@ -116,9 +118,7 @@ int main(int argc, char **argv)
 	/* Divide the vector into blocks */
 	/* Divide the vector into blocks */
 	struct starpu_data_filter block_filter = {
 	struct starpu_data_filter block_filter = {
 		.filter_func = starpu_block_filter_func_vector,
 		.filter_func = starpu_block_filter_func_vector,
-		.nchildren = NBLOCKS,
-		.get_nchildren = NULL,
-		.get_child_ops = NULL
+		.nchildren = NBLOCKS
 	};
 	};
 
 
 	starpu_data_partition(handle_x, &block_filter);
 	starpu_data_partition(handle_x, &block_filter);
@@ -151,16 +151,21 @@ int main(int argc, char **argv)
 
 
 	starpu_task_wait_for_all();
 	starpu_task_wait_for_all();
 
 
+	starpu_data_unpartition(handle_x, 0);
 	starpu_data_unpartition(handle_y, 0);
 	starpu_data_unpartition(handle_y, 0);
+	starpu_data_unregister(handle_x);
 	starpu_data_unregister(handle_y);
 	starpu_data_unregister(handle_y);
 
 
 	gettimeofday(&end, NULL);
 	gettimeofday(&end, NULL);
         double timing = (double)((end.tv_sec - start.tv_sec)*1000000 +
         double timing = (double)((end.tv_sec - start.tv_sec)*1000000 +
                                         (end.tv_usec - start.tv_usec));
                                         (end.tv_usec - start.tv_usec));
 
 
-	fprintf(stderr, "timing -> %2.2f us %2.2f MB/s\n", timing, 3*N*sizeof(TYPE)/timing);
+	FPRINTF(stderr, "timing -> %2.2f us %2.2f MB/s\n", timing, 3*N*sizeof(TYPE)/timing);
+
+	FPRINTF(stderr, "AFTER y[0] = %2.2f (ALPHA = %2.2f)\n", vec_y[0], alpha);
 
 
-	fprintf(stderr, "AFTER y[0] = %2.2f (ALPHA = %2.2f)\n", vec_y[0], alpha);
+	starpu_free((void *)vec_x);
+	starpu_free((void *)vec_y);
 
 
 	/* Stop StarPU */
 	/* Stop StarPU */
 	starpu_shutdown();
 	starpu_shutdown();

+ 13 - 10
examples/basic_examples/block.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -20,6 +20,8 @@
 #include <pthread.h>
 #include <pthread.h>
 #include <math.h>
 #include <math.h>
 
 
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
 extern void cpu_codelet(void *descr[], void *_args);
 extern void cpu_codelet(void *descr[], void *_args);
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 extern void cuda_codelet(void *descr[], void *_args);
 extern void cuda_codelet(void *descr[], void *_args);
@@ -52,24 +54,23 @@ int execute_on(uint32_t where, device_func func, float *block, int pnx, int pny,
         task->buffers[0].handle = block_handle;
         task->buffers[0].handle = block_handle;
         task->buffers[0].mode = STARPU_RW;
         task->buffers[0].mode = STARPU_RW;
 	task->cl_arg = &multiplier;
 	task->cl_arg = &multiplier;
+	task->cl_arg_size = sizeof(multiplier);
 
 
         int ret = starpu_task_submit(task);
         int ret = starpu_task_submit(task);
         if (STARPU_UNLIKELY(ret == -ENODEV)) {
         if (STARPU_UNLIKELY(ret == -ENODEV)) {
-                fprintf(stderr, "No worker may execute this task\n");
+                FPRINTF(stderr, "No worker may execute this task\n");
                 return 1;
                 return 1;
 	}
 	}
 
 
 	starpu_task_wait_for_all();
 	starpu_task_wait_for_all();
 
 
 	/* update the array in RAM */
 	/* update the array in RAM */
-        starpu_data_acquire(block_handle, STARPU_R);
+	starpu_data_unregister(block_handle);
 
 
         for(i=0 ; i<pnx*pny*pnz; i++) {
         for(i=0 ; i<pnx*pny*pnz; i++) {
-          fprintf(stderr, "%f ", block[i]);
+          FPRINTF(stderr, "%f ", block[i]);
         }
         }
-        fprintf(stderr, "\n");
-
-        starpu_data_release(block_handle);
+        FPRINTF(stderr, "\n");
 
 
         return 0;
         return 0;
 }
 }
@@ -98,7 +99,7 @@ int main(int argc, char **argv)
         ret = execute_on(STARPU_CPU, cpu_codelet, block, nx, ny, nz, 1.0);
         ret = execute_on(STARPU_CPU, cpu_codelet, block, nx, ny, nz, 1.0);
         if (!ret) multiplier *= 1.0;
         if (!ret) multiplier *= 1.0;
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
-        starpu_opencl_load_opencl_from_file("examples/basic_examples/block_opencl_kernel.cl", &opencl_code);
+        starpu_opencl_load_opencl_from_file("examples/basic_examples/block_opencl_kernel.cl", &opencl_code, NULL);
         ret = execute_on(STARPU_OPENCL, opencl_codelet, block, nx, ny, nz, 2.0);
         ret = execute_on(STARPU_OPENCL, opencl_codelet, block, nx, ny, nz, 2.0);
         if (!ret) multiplier *= 2.0;
         if (!ret) multiplier *= 2.0;
 #endif
 #endif
@@ -107,7 +108,7 @@ int main(int argc, char **argv)
         if (!ret) multiplier *= 3.0;
         if (!ret) multiplier *= 3.0;
 #endif
 #endif
 
 
-        // Check result is correct
+        /* Check result is correct */
         ret=1;
         ret=1;
         for(i=0 ; i<nx*ny*nz ; i++) {
         for(i=0 ; i<nx*ny*nz ; i++) {
           if (block[i] != (i+1) * multiplier) {
           if (block[i] != (i+1) * multiplier) {
@@ -116,7 +117,9 @@ int main(int argc, char **argv)
           }
           }
         }
         }
 
 
-        fprintf(stderr,"TEST %s\n", ret==1?"PASSED":"FAILED");
+        FPRINTF(stderr,"TEST %s\n", ret==1?"PASSED":"FAILED");
+	free(block);
+
         starpu_shutdown();
         starpu_shutdown();
 
 
 	return 0;
 	return 0;

+ 7 - 8
examples/basic_examples/block_opencl.c

@@ -25,8 +25,8 @@ void opencl_codelet(void *descr[], void *_args)
 	cl_kernel kernel;
 	cl_kernel kernel;
 	cl_command_queue queue;
 	cl_command_queue queue;
 	cl_event event;
 	cl_event event;
-	int id, devid, err, n;
-	float *block = (float *)STARPU_BLOCK_GET_PTR(descr[0]);
+	int id, devid, err;
+	cl_mem block = (cl_mem)STARPU_BLOCK_GET_PTR(descr[0]);
 	int nx = (int)STARPU_BLOCK_GET_NX(descr[0]);
 	int nx = (int)STARPU_BLOCK_GET_NX(descr[0]);
 	int ny = (int)STARPU_BLOCK_GET_NY(descr[0]);
 	int ny = (int)STARPU_BLOCK_GET_NY(descr[0]);
 	int nz = (int)STARPU_BLOCK_GET_NZ(descr[0]);
 	int nz = (int)STARPU_BLOCK_GET_NZ(descr[0]);
@@ -41,14 +41,13 @@ void opencl_codelet(void *descr[], void *_args)
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
 
 	err = 0;
 	err = 0;
-        n=0;
-	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &block);
-	err = clSetKernelArg(kernel, 1, sizeof(int), &nx);
-	err = clSetKernelArg(kernel, 2, sizeof(int), &ny);
-	err = clSetKernelArg(kernel, 3, sizeof(int), &nz);
+	err = clSetKernelArg(kernel, 0, sizeof(block), &block);
+	err = clSetKernelArg(kernel, 1, sizeof(nx), &nx);
+	err = clSetKernelArg(kernel, 2, sizeof(ny), &ny);
+	err = clSetKernelArg(kernel, 3, sizeof(nz), &nz);
 	err = clSetKernelArg(kernel, 4, sizeof(ldy), &ldy);
 	err = clSetKernelArg(kernel, 4, sizeof(ldy), &ldy);
 	err = clSetKernelArg(kernel, 5, sizeof(ldz), &ldz);
 	err = clSetKernelArg(kernel, 5, sizeof(ldz), &ldz);
-	err = clSetKernelArg(kernel, 6, sizeof(float), multiplier);
+	err = clSetKernelArg(kernel, 6, sizeof(*multiplier), multiplier);
         if (err) STARPU_OPENCL_REPORT_ERROR(err);
         if (err) STARPU_OPENCL_REPORT_ERROR(err);
 
 
 	{
 	{

+ 21 - 15
examples/basic_examples/hello_world.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010  Université de Bordeaux 1
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -31,12 +31,14 @@
 #include <stdint.h>
 #include <stdint.h>
 #include <starpu.h>
 #include <starpu.h>
 
 
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
 /* When the task is done, task->callback_func(task->callback_arg) is called. Any
 /* When the task is done, task->callback_func(task->callback_arg) is called. Any
  * callback function must have the prototype void (*)(void *).
  * callback function must have the prototype void (*)(void *).
  * NB: Callbacks are NOT allowed to perform potentially blocking operations */
  * NB: Callbacks are NOT allowed to perform potentially blocking operations */
 void callback_func(void *callback_arg)
 void callback_func(void *callback_arg)
 {
 {
-	printf("Callback function got argument %p\n", callback_arg);
+        FPRINTF(stdout, "Callback function got argument %p\n", callback_arg);
 }
 }
 
 
 /* Every implementation of a codelet must have this prototype, the first
 /* Every implementation of a codelet must have this prototype, the first
@@ -52,22 +54,16 @@ void cpu_func(void *buffers[], void *cl_arg)
 {
 {
 	struct params *params = cl_arg;
 	struct params *params = cl_arg;
 
 
-	printf("Hello world (params = {%i, %f} )\n", params->i, params->f);
+	FPRINTF(stdout, "Hello world (params = {%i, %f} )\n", params->i, params->f);
 }
 }
 
 
-starpu_codelet cl =
-{
-	/* this codelet may only be executed on a CPU, and its cpu
- 	 * implementation is function "cpu_func" */
-	.where = STARPU_CPU,
-	.cpu_func = cpu_func,
-	/* the codelet does not manipulate any data that is managed
-	 * by our DSM */
-	.nbuffers = 0
-};
+starpu_codelet cl;
 
 
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
+	struct starpu_task *task;
+	struct params params = {1, 2.0f};
+
 	/* initialize StarPU : passing a NULL argument means that we use
 	/* initialize StarPU : passing a NULL argument means that we use
  	* default configuration for the scheduling policies and the number of
  	* default configuration for the scheduling policies and the number of
 	* processors/accelerators */
 	* processors/accelerators */
@@ -76,7 +72,15 @@ int main(int argc, char **argv)
 	/* create a new task that is non-blocking by default : the task is not
 	/* create a new task that is non-blocking by default : the task is not
 	 * submitted to the scheduler until the starpu_task_submit function is
 	 * submitted to the scheduler until the starpu_task_submit function is
 	 * called */
 	 * called */
-	struct starpu_task *task = starpu_task_create();
+	task = starpu_task_create();
+
+	/* this codelet may only be executed on a CPU, and its cpu
+ 	 * implementation is function "cpu_func" */
+	cl.where = STARPU_CPU;
+	cl.cpu_func = cpu_func;
+	/* the codelet does not manipulate any data that is managed
+	 * by our DSM */
+	cl.nbuffers = 0;
 
 
 	/* the task uses codelet "cl" */
 	/* the task uses codelet "cl" */
 	task->cl = &cl;
 	task->cl = &cl;
@@ -89,7 +93,6 @@ int main(int argc, char **argv)
 	 * is read-only so that any modification is not passed to other copies
 	 * is read-only so that any modification is not passed to other copies
 	 * of the buffer.  For this reason, a buffer passed as a codelet
 	 * of the buffer.  For this reason, a buffer passed as a codelet
 	 * argument (cl_arg) is NOT a valid synchronization medium! */
 	 * argument (cl_arg) is NOT a valid synchronization medium! */
-	struct params params = { 1, 2.0f };
 	task->cl_arg = &params;
 	task->cl_arg = &params;
 	task->cl_arg_size = sizeof(params);
 	task->cl_arg_size = sizeof(params);
 		
 		
@@ -103,6 +106,9 @@ int main(int argc, char **argv)
 	
 	
 	/* submit the task to StarPU */
 	/* submit the task to StarPU */
 	starpu_task_submit(task);
 	starpu_task_submit(task);
+
+	/* destroy the task */
+	starpu_task_destroy(task);
 	
 	
 	/* terminate StarPU: statistics and other debug outputs are not
 	/* terminate StarPU: statistics and other debug outputs are not
 	 * guaranteed to be generated unless this function is called. Once it
 	 * guaranteed to be generated unless this function is called. Once it

+ 36 - 79
examples/basic_examples/mult.c

@@ -1,8 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -28,8 +28,7 @@
  *    monitoring data (starpu_data_unregister)
  *    monitoring data (starpu_data_unregister)
  *  - how to manipulate subsets of data (starpu_data_get_sub_data)
  *  - how to manipulate subsets of data (starpu_data_get_sub_data)
  *  - how to construct an autocalibrated performance model (starpu_perfmodel_t)
  *  - how to construct an autocalibrated performance model (starpu_perfmodel_t)
- *  - how to submit asynchronous tasks and how to use callback to handle task
- *    termination
+ *  - how to submit asynchronous tasks
  */
  */
 
 
 #include <string.h>
 #include <string.h>
@@ -44,11 +43,6 @@
 static float *A, *B, *C;
 static float *A, *B, *C;
 static starpu_data_handle A_handle, B_handle, C_handle;
 static starpu_data_handle A_handle, B_handle, C_handle;
 
 
-static pthread_mutex_t mutex;
-static pthread_cond_t cond;
-static unsigned taskcounter;
-static unsigned terminated = 0;
-
 static unsigned nslicesx = 4;
 static unsigned nslicesx = 4;
 static unsigned nslicesy = 4;
 static unsigned nslicesy = 4;
 static unsigned xdim = 1024;
 static unsigned xdim = 1024;
@@ -77,37 +71,11 @@ static unsigned zdim = 512;
 
 
  */
  */
 
 
-static void callback_func(void *arg)
-{
-	/* the argument is a pointer to a counter of the remaining tasks */
-	int *counterptr = arg;
-
-	/* counterptr points to a variable with the number of remaining tasks,
- 	 * when it reaches 0, all tasks are done */
-	int counter = STARPU_ATOMIC_ADD(counterptr, -1);
-	if (counter == 0)
-	{
-		/* IMPORTANT : note that we CANNOT call blocking operations
-		 * within callbacks as it may lead to a deadlock of StarPU.
-		 * starpu_data_unpartition is for instance called by the main
-		 * thread since it may cause /potentially/ blocking operations
-		 * such as memory transfers from a GPU to a CPU. */
-		
-		/* wake the application to notify the termination of all the
- 		 * tasks */
-		pthread_mutex_lock(&mutex);
-		terminated = 1;
-		pthread_cond_signal(&cond);
-		pthread_mutex_unlock(&mutex);
-	}
-}
-
 /*
 /*
  * The codelet is passed 3 matrices, the "descr" union-type field gives a
  * The codelet is passed 3 matrices, the "descr" union-type field gives a
  * description of the layout of those 3 matrices in the local memory (ie. RAM
  * description of the layout of those 3 matrices in the local memory (ie. RAM
  * in the case of CPU, GPU frame buffer in the case of GPU etc.). Since we have
  * in the case of CPU, GPU frame buffer in the case of GPU etc.). Since we have
- * registered data with the "blas" data interface, we manipulate the .blas
- * field of the descr[x] elements which are union types.
+ * registered data with the "matrix" data interface, we use the matrix macros.
  */
  */
 
 
 static void cpu_mult(void *descr[], __attribute__((unused))  void *arg)
 static void cpu_mult(void *descr[], __attribute__((unused))  void *arg)
@@ -218,18 +186,14 @@ static void partition_mult_data(void)
 	/* StarPU supplies some basic filters such as the partition of a matrix
 	/* StarPU supplies some basic filters such as the partition of a matrix
 	 * into blocks; note that we are using a FORTRAN ordering, so the
 	 * into blocks; note that we are using a FORTRAN ordering, so the
 	 * names of the filters are a bit misleading */
 	 * names of the filters are a bit misleading */
-	struct starpu_data_filter f = {
+	struct starpu_data_filter vert = {
 		.filter_func = starpu_vertical_block_filter_func,
 		.filter_func = starpu_vertical_block_filter_func,
-		.nchildren = nslicesx,
-		.get_nchildren = NULL,
-		.get_child_ops = NULL
+		.nchildren = nslicesx
 	};
 	};
 		
 		
-	struct starpu_data_filter f2 = {
+	struct starpu_data_filter horiz = {
 		.filter_func = starpu_block_filter_func,
 		.filter_func = starpu_block_filter_func,
-		.nchildren = nslicesy,
-		.get_nchildren = NULL,
-		.get_child_ops = NULL
+		.nchildren = nslicesy
 	};
 	};
 		
 		
 /*
 /*
@@ -269,17 +233,17 @@ static void partition_mult_data(void)
  *	enforce memory consistency.
  *	enforce memory consistency.
  */
  */
 
 
-	starpu_data_partition(B_handle, &f);
-	starpu_data_partition(A_handle, &f2);
+	starpu_data_partition(B_handle, &vert);
+	starpu_data_partition(A_handle, &horiz);
 
 
 	/* starpu_data_map_filters is a variable-arity function, the first argument
 	/* starpu_data_map_filters is a variable-arity function, the first argument
 	 * is the handle of the data to partition, the second argument is the
 	 * is the handle of the data to partition, the second argument is the
 	 * number of filters to apply recursively. Filters are applied in the
 	 * number of filters to apply recursively. Filters are applied in the
 	 * same order as the arguments.
 	 * same order as the arguments.
-	 * This would be equivalent to starpu_data_partition(C_handle, &f) and
-	 * then applying f2 on each sub-data (ie. each column of C)
+	 * This would be equivalent to starpu_data_partition(C_handle, &vert) and
+	 * then applying horiz on each sub-data (ie. each column of C)
 	 */
 	 */
-	starpu_data_map_filters(C_handle, 2, &f, &f2);
+	starpu_data_map_filters(C_handle, 2, &vert, &horiz);
 }
 }
 
 
 static struct starpu_perfmodel_t mult_perf_model = {
 static struct starpu_perfmodel_t mult_perf_model = {
@@ -287,28 +251,23 @@ static struct starpu_perfmodel_t mult_perf_model = {
 	.symbol = "mult_perf_model"
 	.symbol = "mult_perf_model"
 };
 };
 
 
+static starpu_codelet cl = {
+        /* we can only execute that kernel on a CPU yet */
+        .where = STARPU_CPU,
+        /* CPU implementation of the codelet */
+        .cpu_func = cpu_mult,
+        /* the codelet manipulates 3 buffers that are managed by the
+         * DSM */
+        .nbuffers = 3,
+        /* in case the scheduling policy may use performance models */
+        .model = &mult_perf_model
+};
+
 static void launch_tasks(void)
 static void launch_tasks(void)
 {
 {
 	/* partition the work into slices */
 	/* partition the work into slices */
 	unsigned taskx, tasky;
 	unsigned taskx, tasky;
 
 
-	/* the callback decrements this value every time a task is terminated
-	 * and notify the termination of the computation to the application
-	 * when the counter reaches 0 */
-	taskcounter = nslicesx * nslicesy;
-
-	starpu_codelet cl = {
-		/* we can only execute that kernel on a CPU yet */
-		.where = STARPU_CPU,
-		/* CPU implementation of the codelet */
-		.cpu_func = cpu_mult,
-		/* the codelet manipulates 3 buffers that are managed by the
- 		 * DSM */
-		.nbuffers = 3,
-		/* in case the scheduling policy may use performance models */
-		.model = &mult_perf_model
-	};
-
 	for (taskx = 0; taskx < nslicesx; taskx++) 
 	for (taskx = 0; taskx < nslicesx; taskx++) 
 	{
 	{
 		for (tasky = 0; tasky < nslicesy; tasky++)
 		for (tasky = 0; tasky < nslicesy; tasky++)
@@ -322,9 +281,6 @@ static void launch_tasks(void)
 			/* this task implements codelet "cl" */
 			/* this task implements codelet "cl" */
 			task->cl = &cl;
 			task->cl = &cl;
 
 
-			task->callback_func = callback_func;
-			task->callback_arg = &taskcounter;
-
 			/*
 			/*
 			 *              |---|---|---|---|
 			 *              |---|---|---|---|
 			 *              |   | * |   |   | B
 			 *              |   | * |   |   | B
@@ -371,9 +327,6 @@ static void launch_tasks(void)
 int main(__attribute__ ((unused)) int argc, 
 int main(__attribute__ ((unused)) int argc, 
 	 __attribute__ ((unused)) char **argv)
 	 __attribute__ ((unused)) char **argv)
 {
 {
-	pthread_mutex_init(&mutex, NULL);
-	pthread_cond_init(&cond, NULL);
-
 	/* start the runtime */
 	/* start the runtime */
 	starpu_init(NULL);
 	starpu_init(NULL);
 
 
@@ -387,26 +340,30 @@ int main(__attribute__ ((unused)) int argc,
 	/* submit all tasks in an asynchronous fashion */
 	/* submit all tasks in an asynchronous fashion */
 	launch_tasks();
 	launch_tasks();
 
 
-	/* the different tasks are asynchronous so we use a callback to get
-	 * notified of the termination of the computation */
-	pthread_mutex_lock(&mutex);
-	if (!terminated)
-		pthread_cond_wait(&cond, &mutex);
-	pthread_mutex_unlock(&mutex);
+	/* wait for termination */
+        starpu_task_wait_for_all();
 
 
 	/* remove the filters applied by means of starpu_data_map_filters; now
 	/* remove the filters applied by means of starpu_data_map_filters; now
  	 * it's not possible to manipulate a subset of C using starpu_data_get_sub_data until
  	 * it's not possible to manipulate a subset of C using starpu_data_get_sub_data until
 	 * starpu_data_map_filters is called again on C_handle.
 	 * starpu_data_map_filters is called again on C_handle.
 	 * The second argument is the memory node where the different subsets
 	 * The second argument is the memory node where the different subsets
 	 * should be reassembled, 0 = main memory (RAM) */
 	 * should be reassembled, 0 = main memory (RAM) */
+	starpu_data_unpartition(A_handle, 0);
+	starpu_data_unpartition(B_handle, 0);
 	starpu_data_unpartition(C_handle, 0);
 	starpu_data_unpartition(C_handle, 0);
 
 
 	/* stop monitoring matrix C : after this, it is not possible to pass C 
 	/* stop monitoring matrix C : after this, it is not possible to pass C 
 	 * (or any subset of C) as a codelet input/output. This also implements
 	 * (or any subset of C) as a codelet input/output. This also implements
 	 * a barrier so that the piece of data is put back into main memory in
 	 * a barrier so that the piece of data is put back into main memory in
 	 * case it was only available on a GPU for instance. */
 	 * case it was only available on a GPU for instance. */
+	starpu_data_unregister(A_handle);
+	starpu_data_unregister(B_handle);
 	starpu_data_unregister(C_handle);
 	starpu_data_unregister(C_handle);
-	
+
+	free(A);
+	free(B);
+	free(C);
+
 	starpu_shutdown();
 	starpu_shutdown();
 
 
 	return 0;
 	return 0;

+ 9 - 6
examples/basic_examples/variable.c

@@ -18,6 +18,8 @@
 #include <starpu.h>
 #include <starpu.h>
 #include <pthread.h>
 #include <pthread.h>
 
 
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
 static unsigned niter = 50000;
 static unsigned niter = 50000;
 
 
 extern void cpu_codelet(void *descr[], __attribute__ ((unused)) void *_args);
 extern void cpu_codelet(void *descr[], __attribute__ ((unused)) void *_args);
@@ -41,6 +43,9 @@ int main(int argc, char **argv)
 
 
 	starpu_init(NULL);
 	starpu_init(NULL);
 
 
+#ifdef STARPU_SLOW_MACHINE
+	niter /= 100;
+#endif
         if (argc == 2) niter = atoi(argv[1]);
         if (argc == 2) niter = atoi(argv[1]);
         foo = 0.0f;
         foo = 0.0f;
 
 
@@ -48,7 +53,7 @@ int main(int argc, char **argv)
                                       (uintptr_t)&foo, sizeof(float));
                                       (uintptr_t)&foo, sizeof(float));
 
 
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
-        starpu_opencl_load_opencl_from_file("examples/basic_examples/variable_kernels_opencl_kernel.cl", &opencl_program);
+        starpu_opencl_load_opencl_from_file("examples/basic_examples/variable_kernels_opencl_kernel.cl", &opencl_program, NULL);
 #endif
 #endif
 
 
 	cl.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL;
 	cl.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL;
@@ -77,7 +82,7 @@ int main(int argc, char **argv)
 		ret = starpu_task_submit(task);
 		ret = starpu_task_submit(task);
 		if (STARPU_UNLIKELY(ret == -ENODEV))
 		if (STARPU_UNLIKELY(ret == -ENODEV))
 		{
 		{
-			fprintf(stderr, "No worker may execute this task\n");
+			FPRINTF(stderr, "No worker may execute this task\n");
 			exit(0);
 			exit(0);
 		}
 		}
 	}
 	}
@@ -85,11 +90,9 @@ int main(int argc, char **argv)
 	starpu_task_wait_for_all();
 	starpu_task_wait_for_all();
 
 
 	/* update the array in RAM */
 	/* update the array in RAM */
-	starpu_data_acquire(float_array_handle, STARPU_R);
-
-	fprintf(stderr, "variable -> %f\n", foo);
+	starpu_data_unregister(float_array_handle);
 
 
-	starpu_data_release(float_array_handle);
+	FPRINTF(stderr, "variable -> %f\n", foo);
 
 
 	starpu_shutdown();
 	starpu_shutdown();
 
 

+ 2 - 2
examples/basic_examples/variable_kernels_opencl.c

@@ -21,7 +21,7 @@
 extern struct starpu_opencl_program opencl_program;
 extern struct starpu_opencl_program opencl_program;
 void opencl_codelet(void *descr[], void *_args)
 void opencl_codelet(void *descr[], void *_args)
 {
 {
-	float *val = (float *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	cl_mem val = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[0]);
 	cl_kernel kernel;
 	cl_kernel kernel;
 	cl_command_queue queue;
 	cl_command_queue queue;
 	cl_event event;
 	cl_event event;
@@ -34,7 +34,7 @@ void opencl_codelet(void *descr[], void *_args)
 	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
 
 	err = 0;
 	err = 0;
-	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &val);
+	err = clSetKernelArg(kernel, 0, sizeof(val), &val);
 	if (err) STARPU_OPENCL_REPORT_ERROR(err);
 	if (err) STARPU_OPENCL_REPORT_ERROR(err);
 
 
 	{
 	{

+ 10 - 5
examples/basic_examples/vector_scal.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -28,6 +28,7 @@
 #include <stdio.h>
 #include <stdio.h>
 
 
 #define	NX	2048
 #define	NX	2048
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
 
 
 extern void scal_cpu_func(void *buffers[], void *_args);
 extern void scal_cpu_func(void *buffers[], void *_args);
 extern void scal_cuda_func(void *buffers[], void *_args);
 extern void scal_cuda_func(void *buffers[], void *_args);
@@ -71,16 +72,17 @@ int main(int argc, char **argv)
 	float vector[NX];
 	float vector[NX];
 	unsigned i;
 	unsigned i;
 	for (i = 0; i < NX; i++)
 	for (i = 0; i < NX; i++)
-		vector[i] = 1.0f;
+                vector[i] = (i+1.0f);
 
 
-	fprintf(stderr, "BEFORE : First element was %f\n", vector[0]);
+	FPRINTF(stderr, "BEFORE: First element was %f\n", vector[0]);
+	FPRINTF(stderr, "BEFORE: Last element was %f\n", vector[NX-1]);
 
 
 	/* Initialize StarPU with default configuration */
 	/* Initialize StarPU with default configuration */
 	starpu_init(NULL);
 	starpu_init(NULL);
 
 
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
 	starpu_opencl_load_opencl_from_file("examples/basic_examples/vector_scal_opencl_kernel.cl",
 	starpu_opencl_load_opencl_from_file("examples/basic_examples/vector_scal_opencl_kernel.cl",
-					    &opencl_program);
+					    &opencl_program, NULL);
 #endif
 #endif
 
 
 	/* Tell StarPU to associate the "vector" vector with the "vector_handle"
 	/* Tell StarPU to associate the "vector" vector with the "vector_handle"
@@ -125,6 +127,8 @@ int main(int argc, char **argv)
  	 * monitoring it */
  	 * monitoring it */
 	starpu_data_unregister(vector_handle);
 	starpu_data_unregister(vector_handle);
 
 
+	starpu_task_destroy(task);
+
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
         starpu_opencl_unload_opencl(&opencl_program);
         starpu_opencl_unload_opencl(&opencl_program);
 #endif
 #endif
@@ -132,7 +136,8 @@ int main(int argc, char **argv)
 	/* terminate StarPU, no task can be submitted after */
 	/* terminate StarPU, no task can be submitted after */
 	starpu_shutdown();
 	starpu_shutdown();
 
 
-	fprintf(stderr, "AFTER First element is %f\n", vector[0]);
+	FPRINTF(stderr, "AFTER: First element is %f\n", vector[0]);
+	FPRINTF(stderr, "AFTER: Last element is %f\n", vector[NX-1]);
 
 
 	return 0;
 	return 0;
 }
 }

+ 2 - 2
examples/basic_examples/vector_scal_cuda.cu

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  * Copyright (C) 2010  Université de Bordeaux 1
  * Copyright (C) 2010  Université de Bordeaux 1
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -25,7 +25,7 @@
 static __global__ void vector_mult_cuda(float *val, unsigned n,
 static __global__ void vector_mult_cuda(float *val, unsigned n,
                                         float factor)
                                         float factor)
 {
 {
-        unsigned i = threadIdx.x;
+        unsigned i =  blockIdx.x*blockDim.x + threadIdx.x;
 
 
 	if (i < n)
 	if (i < n)
                val[i] *= factor;
                val[i] *= factor;

+ 3 - 3
examples/basic_examples/vector_scal_opencl.c

@@ -36,8 +36,8 @@ void scal_opencl_func(void *buffers[], void *_args)
 
 
 	/* length of the vector */
 	/* length of the vector */
 	unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
 	unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
-	/* local copy of the vector pointer */
-	float *val = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);
+	/* OpenCL copy of the vector pointer */
+	cl_mem val = (cl_mem)STARPU_VECTOR_GET_PTR(buffers[0]);
 
 
 	id = starpu_worker_get_id();
 	id = starpu_worker_get_id();
 	devid = starpu_worker_get_devid(id);
 	devid = starpu_worker_get_devid(id);
@@ -45,7 +45,7 @@ void scal_opencl_func(void *buffers[], void *_args)
 	err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, "vector_mult_opencl", devid);
 	err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, "vector_mult_opencl", devid);
 	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
 
-	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &val);
+	err = clSetKernelArg(kernel, 0, sizeof(val), &val);
 	err |= clSetKernelArg(kernel, 1, sizeof(n), &n);
 	err |= clSetKernelArg(kernel, 1, sizeof(n), &n);
 	err |= clSetKernelArg(kernel, 2, sizeof(*factor), factor);
 	err |= clSetKernelArg(kernel, 2, sizeof(*factor), factor);
 	if (err) STARPU_OPENCL_REPORT_ERROR(err);
 	if (err) STARPU_OPENCL_REPORT_ERROR(err);

+ 73 - 0
examples/callback/callback.c

@@ -0,0 +1,73 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <pthread.h>
+#include <sys/time.h>
+/* fprintf wrapper: prints only when the STARPU_SSILENT environment variable is not set. */
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+/* Data handle registered in main() and used by the tasks built in main() and callback_func(). */
+starpu_data_handle handle;
+/* CPU implementation of the codelet: adds 1 to the int that
+ * STARPU_VARIABLE_GET_PTR() returns for descr[0]. _args is not used. */
+void cpu_codelet(void *descr[], __attribute__ ((unused)) void *_args)
+{
+	int *val = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	/* Increment the value in place. */
+	*val += 1;
+}
+/* Codelet used by both tasks of this example: CPU only, one data buffer. */
+starpu_codelet cl =
+{
+	.where = STARPU_CPU,
+	.cpu_func = cpu_codelet,
+	.nbuffers = 1
+};
+
+/* Task callback: builds and submits one more task that runs the same codelet
+ * on the same handle in read-write mode.
+ *
+ * callback_arg is not used.
+ *
+ * The result of starpu_task_submit() is checked: when it reports -ENODEV the
+ * problem is printed and the program stops instead of carrying on as if the
+ * task had been queued. */
+void callback_func(void *callback_arg)
+{
+	int ret;
+	struct starpu_task *task;
+
+	(void)callback_arg;
+
+	task = starpu_task_create();
+	task->cl = &cl;
+	task->buffers[0].handle = handle;
+	task->buffers[0].mode = STARPU_RW;
+
+	ret = starpu_task_submit(task);
+	if (STARPU_UNLIKELY(ret == -ENODEV))
+	{
+		FPRINTF(stderr, "No worker may execute this task\n");
+		exit(0);
+	}
+}
+
+/* Registers the local int v as a StarPU variable, submits one task that runs
+ * the codelet on it with callback_func as completion callback, waits for all
+ * tasks, unregisters the data and prints v.
+ *
+ * Returns 0. When starpu_task_submit() reports -ENODEV the program prints a
+ * message and exits with status 0 without going further. */
+int main(int argc, char **argv)
+{
+	int v=40;
+	int ret;
+
+	starpu_init(NULL);
+	starpu_variable_data_register(&handle, 0, (uintptr_t)&v, sizeof(int));
+
+	struct starpu_task *task = starpu_task_create();
+	task->cl = &cl;
+	task->callback_func = callback_func;
+	task->callback_arg = NULL;
+	task->buffers[0].handle = handle;
+	task->buffers[0].mode = STARPU_RW;
+
+	/* Do not ignore a failed submission: without a worker able to run the
+	 * codelet there is nothing to wait for. */
+	ret = starpu_task_submit(task);
+	if (STARPU_UNLIKELY(ret == -ENODEV))
+	{
+		FPRINTF(stderr, "No worker may execute this task\n");
+		exit(0);
+	}
+
+	/* v is only read once the tasks are done and the handle is unregistered. */
+	starpu_task_wait_for_all();
+	starpu_data_unregister(handle);
+
+	FPRINTF(stderr, "v -> %d\n", v);
+
+	starpu_shutdown();
+
+	return 0;
+}

+ 21 - 22
examples/cg/cg.c

@@ -13,6 +13,7 @@
  *
  *
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
  */
+
 #include <math.h>
 #include <math.h>
 #include <assert.h>
 #include <assert.h>
 #include <sys/time.h>
 #include <sys/time.h>
@@ -24,6 +25,8 @@
 #include <cublas.h>
 #include <cublas.h>
 #endif
 #endif
 
 
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
 /*
 /*
  *	Conjugate Gradient
  *	Conjugate Gradient
  *
  *
@@ -92,23 +95,19 @@ extern starpu_codelet bzero_vector_cl;
 
 
 static void generate_random_problem(void)
 static void generate_random_problem(void)
 {
 {
-	srand48(0xdeadbeef);
-
 	int i, j;
 	int i, j;
 
 
-	starpu_data_malloc_pinned_if_possible((void **)&A, n*n*sizeof(TYPE));
-	starpu_data_malloc_pinned_if_possible((void **)&b, n*sizeof(TYPE));
-	starpu_data_malloc_pinned_if_possible((void **)&x, n*sizeof(TYPE));
+	starpu_malloc((void **)&A, n*n*sizeof(TYPE));
+	starpu_malloc((void **)&b, n*sizeof(TYPE));
+	starpu_malloc((void **)&x, n*sizeof(TYPE));
 	assert(A && b && x);
 	assert(A && b && x);
 
 
-	/* Create a random matrix (A) and two random vectors (x and b) */
 	for (j = 0; j < n; j++)
 	for (j = 0; j < n; j++)
 	{
 	{
 		b[j] = (TYPE)1.0;
 		b[j] = (TYPE)1.0;
 		x[j] = (TYPE)0.0;
 		x[j] = (TYPE)0.0;
 
 
 		/* We take the Hilbert matrix, which is not well conditioned but is positive definite: H(i,j) = 1/(1+i+j) */
 		/* We take the Hilbert matrix, which is not well conditioned but is positive definite: H(i,j) = 1/(1+i+j) */
-
 		for (i = 0; i < n; i++)
 		for (i = 0; i < n; i++)
 		{
 		{
 			A[n*j + i] = (TYPE)(1.0/(1.0+i+j));
 			A[n*j + i] = (TYPE)(1.0/(1.0+i+j));
@@ -116,9 +115,9 @@ static void generate_random_problem(void)
 	}
 	}
 
 
 	/* Internal vectors */
 	/* Internal vectors */
-	starpu_data_malloc_pinned_if_possible((void **)&r, n*sizeof(TYPE));
-	starpu_data_malloc_pinned_if_possible((void **)&d, n*sizeof(TYPE));
-	starpu_data_malloc_pinned_if_possible((void **)&q, n*sizeof(TYPE));
+	starpu_malloc((void **)&r, n*sizeof(TYPE));
+	starpu_malloc((void **)&d, n*sizeof(TYPE));
+	starpu_malloc((void **)&q, n*sizeof(TYPE));
 	assert(r && d && q);
 	assert(r && d && q);
 
 
 	memset(r, 0, n*sizeof(TYPE));
 	memset(r, 0, n*sizeof(TYPE));
@@ -205,12 +204,12 @@ static void display_vector(starpu_data_handle handle, TYPE *ptr)
 		starpu_data_acquire(starpu_data_get_sub_data(handle, 1, b), STARPU_R);
 		starpu_data_acquire(starpu_data_get_sub_data(handle, 1, b), STARPU_R);
 		for (ind = 0; ind < block_size; ind++)
 		for (ind = 0; ind < block_size; ind++)
 		{
 		{
-			fprintf(stderr, "%2.2e ", ptr[b*block_size + ind]);
+			FPRINTF(stderr, "%2.2e ", ptr[b*block_size + ind]);
 		}
 		}
-		fprintf(stderr, "| ");
+		FPRINTF(stderr, "| ");
 		starpu_data_release(starpu_data_get_sub_data(handle, 1, b));
 		starpu_data_release(starpu_data_get_sub_data(handle, 1, b));
 	}
 	}
-	fprintf(stderr, "\n");
+	FPRINTF(stderr, "\n");
 }
 }
 
 
 static void display_matrix(void)
 static void display_matrix(void)
@@ -220,9 +219,9 @@ static void display_matrix(void)
 	{
 	{
 		for (j = 0; j < n; j++)
 		for (j = 0; j < n; j++)
 		{
 		{
-			fprintf(stderr, "%2.2e ", A[j*n + i]);
+			FPRINTF(stderr, "%2.2e ", A[j*n + i]);
 		}
 		}
-		fprintf(stderr, "\n");
+		FPRINTF(stderr, "\n");
 	}
 	}
 }
 }
 #endif
 #endif
@@ -255,8 +254,8 @@ static void cg(void)
 	delta_0 = delta_new;
 	delta_0 = delta_new;
 	starpu_data_release(rtr_handle);
 	starpu_data_release(rtr_handle);
 
 
-	fprintf(stderr, "*************** INITIAL ************ \n");
-	fprintf(stderr, "Delta 0: %e\n", delta_new);
+	FPRINTF(stderr, "*************** INITIAL ************ \n");
+	FPRINTF(stderr, "Delta 0: %e\n", delta_new);
 
 
 	struct timeval start;
 	struct timeval start;
 	struct timeval end;
 	struct timeval end;
@@ -307,8 +306,8 @@ static void cg(void)
 		{
 		{
 			/* We here take the error as ||r||_2 / (n||b||_2) */
 			/* We here take the error as ||r||_2 / (n||b||_2) */
 			double error = sqrt(delta_new/delta_0)/(1.0*n);
 			double error = sqrt(delta_new/delta_0)/(1.0*n);
-			fprintf(stderr, "*****************************************\n");
-			fprintf(stderr, "iter %d DELTA %e - %e\n", i, delta_new, error);
+			FPRINTF(stderr, "*****************************************\n");
+			FPRINTF(stderr, "iter %d DELTA %e - %e\n", i, delta_new, error);
 		}
 		}
 
 
 		i++;
 		i++;
@@ -317,8 +316,8 @@ static void cg(void)
 	gettimeofday(&end, NULL);
 	gettimeofday(&end, NULL);
 
 
 	double timing = (double)(((double)end.tv_sec - (double)start.tv_sec)*10e6 + ((double)end.tv_usec - (double)start.tv_usec));
 	double timing = (double)(((double)end.tv_sec - (double)start.tv_sec)*10e6 + ((double)end.tv_usec - (double)start.tv_usec));
-	fprintf(stderr, "Total timing : %2.2f seconds\n", timing/10e6);
-	fprintf(stderr, "Seconds per iteration : %2.2e\n", timing/10e6/i);
+	FPRINTF(stderr, "Total timing : %2.2f seconds\n", timing/10e6);
+	FPRINTF(stderr, "Seconds per iteration : %2.2e\n", timing/10e6/i);
 }
 }
 
 
 static int check(void)
 static int check(void)
@@ -351,7 +350,7 @@ static void parse_args(int argc, char **argv)
 		}
 		}
 
 
 	        if (strcmp(argv[i], "-h") == 0) {
 	        if (strcmp(argv[i], "-h") == 0) {
-			fprintf(stderr, "usage: %s [-h] [-nblocks #blocks] [-n problem_size] [-no-reduction] [-maxiter i]\n", argv[0]);
+			FPRINTF(stderr, "usage: %s [-h] [-nblocks #blocks] [-n problem_size] [-no-reduction] [-maxiter i]\n", argv[0]);
 			exit(-1);
 			exit(-1);
 			continue;
 			continue;
 		}
 		}

+ 2 - 2
examples/cg/cg.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -82,4 +82,4 @@ void copy_handle(starpu_data_handle dst,
 		starpu_data_handle src,
 		starpu_data_handle src,
 		unsigned nblocks);
 		unsigned nblocks);
 
 
-#endif // __STARPU_EXAMPLE_CG_H__
+#endif /* __STARPU_EXAMPLE_CG_H__ */

+ 19 - 0
examples/cg/cg_dot_kernel.cu

@@ -126,3 +126,22 @@ extern "C" void dot_host(TYPE *x, TYPE *y, unsigned nelems, TYPE *dot)
 
 
 	cudaFree(per_block_sum);
 	cudaFree(per_block_sum);
 }
 }
+
+/* Kernel: sets x[0] .. x[nelems-1] to zero.
+ *
+ * Each thread starts at its global index and advances by the total number of
+ * launched threads (blockDim.x * gridDim.x), so every element below nelems is
+ * written by exactly one thread whatever the launch geometry is.
+ *
+ * nelems_per_thread is kept so the existing launch site still matches, but it
+ * is no longer used: taking it as the loop stride made each thread also
+ * rewrite elements that other threads already cover. */
+static __global__ void zero_vector_device(TYPE *x, unsigned nelems, unsigned nelems_per_thread)
+{
+	unsigned i;
+	unsigned first_i = blockDim.x * blockIdx.x + threadIdx.x;
+	unsigned stride = blockDim.x * gridDim.x;
+
+	(void)nelems_per_thread;
+
+	for (i = first_i; i < nelems; i += stride)
+		x[i] = 0.0;
+}
+
+/* Host-side launcher: zeroes the nelems elements of x by launching
+ * zero_vector_device on the stream returned by starpu_cuda_get_local_stream().
+ * No synchronisation is done here.
+ *
+ * At most 128 blocks of at most MAXTHREADSPERBLOCK threads are used. */
+extern "C" void zero_vector(TYPE *x, unsigned nelems)
+{
+	/* Nothing to clear; this also keeps nblocks from being 0, which would
+	 * divide by zero below and request an empty grid. */
+	if (nelems == 0)
+		return;
+
+	unsigned nblocks = STARPU_MIN(128, nelems);
+	unsigned nthread_per_block = STARPU_MIN(MAXTHREADSPERBLOCK, (nelems / nblocks));
+
+	unsigned nelems_per_thread = nelems / (nblocks * nthread_per_block);
+
+	zero_vector_device<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(x, nelems, nelems_per_thread);
+}

+ 9 - 4
examples/cg/cg_kernels.c

@@ -16,6 +16,7 @@
 
 
 #include "cg.h"
 #include "cg.h"
 #include <math.h>
 #include <math.h>
+#include <limits.h>
 
 
 #if 0
 #if 0
 static void print_vector_from_descr(unsigned nx, TYPE *v)
 static void print_vector_from_descr(unsigned nx, TYPE *v)
@@ -123,11 +124,14 @@ starpu_codelet accumulate_vector_cl = {
  */
  */
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
+extern void zero_vector(TYPE *x, unsigned nelems);
+
 static void bzero_variable_cuda(void *descr[], void *cl_arg)
 static void bzero_variable_cuda(void *descr[], void *cl_arg)
 {
 {
 	TYPE *v = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	TYPE *v = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
+
+	zero_vector(v, 1);
  
  
-	cublasscal (1, (TYPE)0.0, v, 1);
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 }
 #endif
 #endif
@@ -159,7 +163,8 @@ static void bzero_vector_cuda(void *descr[], void *cl_arg)
 	TYPE *v = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	TYPE *v = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
  
  
-	cublasscal (n, (TYPE)0.0, v, 1);
+	zero_vector(v, n);
+
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 }
 #endif
 #endif
@@ -578,8 +583,8 @@ static void copy_handle_cuda(void *descr[], void *cl_arg)
 	unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);
 	unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);
 	size_t elemsize = STARPU_VECTOR_GET_ELEMSIZE(descr[0]);
 	size_t elemsize = STARPU_VECTOR_GET_ELEMSIZE(descr[0]);
 
 
-	cudaMemcpy(dst, src, nx*elemsize, cudaMemcpyDeviceToDevice);
-	cudaThreadSynchronize();
+	cudaMemcpyAsync(dst, src, nx*elemsize, cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 }
 #endif
 #endif
 
 

+ 3 - 2
examples/cholesky/cholesky.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -31,6 +31,7 @@
 #include <common/blas.h>
 #include <common/blas.h>
 #include <starpu.h>
 #include <starpu.h>
 
 
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
 #define NMAXBLOCKS	32
 #define NMAXBLOCKS	32
 
 
 #define TAG11(k)	((starpu_tag_t)( (1ULL<<60) | (unsigned long long)(k)))
 #define TAG11(k)	((starpu_tag_t)( (1ULL<<60) | (unsigned long long)(k)))
@@ -112,4 +113,4 @@ static void __attribute__((unused)) parse_args(int argc, char **argv)
 	}
 	}
 }
 }
 
 
-#endif // __DW_CHOLESKY_H__
+#endif /* __DW_CHOLESKY_H__ */

+ 31 - 33
examples/cholesky/cholesky_grain_tag.c

@@ -2,7 +2,7 @@
  *
  *
  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -49,7 +49,7 @@ static starpu_codelet cl11 =
 
 
 static struct starpu_task * create_task_11(starpu_data_handle dataA, unsigned k, unsigned reclevel)
 static struct starpu_task * create_task_11(starpu_data_handle dataA, unsigned k, unsigned reclevel)
 {
 {
-//	printf("task 11 k = %d TAG = %llx\n", k, (TAG11(k)));
+/*	FPRINTF(stdout, "task 11 k = %d TAG = %llx\n", k, (TAG11(k))); */
 
 
 	struct starpu_task *task = create_task(TAG11_AUX(k, reclevel));
 	struct starpu_task *task = create_task(TAG11_AUX(k, reclevel));
 	
 	
@@ -121,7 +121,7 @@ static starpu_codelet cl22 =
 
 
 static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, unsigned j, unsigned reclevel)
 static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, unsigned j, unsigned reclevel)
 {
 {
-//	printf("task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22_AUX(k,i,j));
+/*	FPRINTF(stdout, "task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22_AUX(k,i,j)); */
 
 
 	struct starpu_task *task = create_task(TAG22_AUX(k, i, j, reclevel));
 	struct starpu_task *task = create_task(TAG22_AUX(k, i, j, reclevel));
 
 
@@ -173,17 +173,15 @@ static void cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 
 
 	starpu_data_set_sequential_consistency_flag(dataA, 0);
 	starpu_data_set_sequential_consistency_flag(dataA, 0);
 
 
-	struct starpu_data_filter f;
-		f.filter_func = starpu_vertical_block_filter_func;
-		f.nchildren = nblocks;
-		f.get_nchildren = NULL;
-		f.get_child_ops = NULL;
+	struct starpu_data_filter f = {
+		.filter_func = starpu_vertical_block_filter_func,
+		.nchildren = nblocks
+	};
 
 
-	struct starpu_data_filter f2;
-		f2.filter_func = starpu_block_filter_func;
-		f2.nchildren = nblocks;
-		f2.get_nchildren = NULL;
-		f2.get_child_ops = NULL;
+	struct starpu_data_filter f2 = {
+		.filter_func = starpu_block_filter_func,
+		.nchildren = nblocks
+	};
 
 
 	starpu_data_map_filters(dataA, 2, &f, &f2);
 	starpu_data_map_filters(dataA, 2, &f, &f2);
 
 
@@ -214,7 +212,7 @@ static void cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 	int ret = starpu_task_submit(entry_task);
 	int ret = starpu_task_submit(entry_task);
 	if (STARPU_UNLIKELY(ret == -ENODEV))
 	if (STARPU_UNLIKELY(ret == -ENODEV))
 	{
 	{
-		fprintf(stderr, "No worker may execute this task\n");
+		FPRINTF(stderr, "No worker may execute this task\n");
 		exit(-1);
 		exit(-1);
 	}
 	}
 
 
@@ -261,7 +259,7 @@ static void initialize_system(float **A, unsigned dim, unsigned pinned)
 
 
 	if (pinned)
 	if (pinned)
 	{
 	{
-		starpu_data_malloc_pinned_if_possible((void **)A, dim*dim*sizeof(float));
+		starpu_malloc((void **)A, dim*dim*sizeof(float));
 	} 
 	} 
 	else {
 	else {
 		*A = malloc(dim*dim*sizeof(float));
 		*A = malloc(dim*dim*sizeof(float));
@@ -280,11 +278,11 @@ void cholesky_grain(float *matA, unsigned size, unsigned ld, unsigned nblocks, u
 	gettimeofday(&end, NULL);
 	gettimeofday(&end, NULL);
 
 
 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
-	fprintf(stderr, "Computation took (in ms)\n");
-	printf("%2.2f\n", timing/1000);
+	FPRINTF(stderr, "Computation took (in ms)\n");
+	FPRINTF(stdout, "%2.2f\n", timing/1000);
 
 
 	double flop = (1.0f*size*size*size)/3.0f;
 	double flop = (1.0f*size*size*size)/3.0f;
-	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
 
 
 	starpu_helper_cublas_shutdown();
 	starpu_helper_cublas_shutdown();
 
 
@@ -311,26 +309,26 @@ int main(int argc, char **argv)
 		for (j = 0; j < size; j++)
 		for (j = 0; j < size; j++)
 		{
 		{
 			mat[j +i*size] = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
 			mat[j +i*size] = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
-			//mat[j +i*size] = ((i == j)?1.0f*size:0.0f);
+			/* mat[j +i*size] = ((i == j)?1.0f*size:0.0f); */
 		}
 		}
 	}
 	}
 
 
 
 
 #ifdef CHECK_OUTPUT
 #ifdef CHECK_OUTPUT
-	printf("Input :\n");
+	FPRINTF(stdout, "Input :\n");
 
 
 	for (j = 0; j < size; j++)
 	for (j = 0; j < size; j++)
 	{
 	{
 		for (i = 0; i < size; i++)
 		for (i = 0; i < size; i++)
 		{
 		{
 			if (i <= j) {
 			if (i <= j) {
-				printf("%2.2f\t", mat[j +i*size]);
+				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
 			}
 			}
 			else {
 			else {
-				printf(".\t");
+				FPRINTF(stdout, ".\t");
 			}
 			}
 		}
 		}
-		printf("\n");
+		FPRINTF(stdout, "\n");
 	}
 	}
 #endif
 #endif
 
 
@@ -338,43 +336,43 @@ int main(int argc, char **argv)
 	cholesky_grain(mat, size, size, nblocks, nbigblocks);
 	cholesky_grain(mat, size, size, nblocks, nbigblocks);
 
 
 #ifdef CHECK_OUTPUT
 #ifdef CHECK_OUTPUT
-	printf("Results :\n");
+	FPRINTF(stdout, "Results :\n");
 
 
 	for (j = 0; j < size; j++)
 	for (j = 0; j < size; j++)
 	{
 	{
 		for (i = 0; i < size; i++)
 		for (i = 0; i < size; i++)
 		{
 		{
 			if (i <= j) {
 			if (i <= j) {
-				printf("%2.2f\t", mat[j +i*size]);
+				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
 			}
 			}
 			else {
 			else {
-				printf(".\t");
-				mat[j+i*size] = 0.0f; // debug
+				FPRINTF(stdout, ".\t");
+				mat[j+i*size] = 0.0f; /* debug */
 			}
 			}
 		}
 		}
-		printf("\n");
+		FPRINTF(stdout, "\n");
 	}
 	}
 
 
-	fprintf(stderr, "compute explicit LLt ...\n");
+	FPRINTF(stderr, "compute explicit LLt ...\n");
 	float *test_mat = malloc(size*size*sizeof(float));
 	float *test_mat = malloc(size*size*sizeof(float));
 	STARPU_ASSERT(test_mat);
 	STARPU_ASSERT(test_mat);
 
 
 	SSYRK("L", "N", size, size, 1.0f, 
 	SSYRK("L", "N", size, size, 1.0f, 
 				mat, size, 0.0f, test_mat, size);
 				mat, size, 0.0f, test_mat, size);
 
 
-	fprintf(stderr, "comparing results ...\n");
+	FPRINTF(stderr, "comparing results ...\n");
 	for (j = 0; j < size; j++)
 	for (j = 0; j < size; j++)
 	{
 	{
 		for (i = 0; i < size; i++)
 		for (i = 0; i < size; i++)
 		{
 		{
 			if (i <= j) {
 			if (i <= j) {
-				printf("%2.2f\t", test_mat[j +i*size]);
+                                FPRINTF(stdout, "%2.2f\t", test_mat[j +i*size]);
 			}
 			}
 			else {
 			else {
-				printf(".\t");
+				FPRINTF(stdout, ".\t");
 			}
 			}
 		}
 		}
-		printf("\n");
+		FPRINTF(stdout, "\n");
 	}
 	}
 #endif
 #endif
 
 

+ 30 - 32
examples/cholesky/cholesky_implicit.c

@@ -126,13 +126,13 @@ static void _cholesky(starpu_data_handle dataA, unsigned nblocks)
 	gettimeofday(&end, NULL);
 	gettimeofday(&end, NULL);
 
 
 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
-	fprintf(stderr, "Computation took (in ms)\n");
-	printf("%2.2f\n", timing/1000);
+	FPRINTF(stderr, "Computation took (in ms)\n");
+	FPRINTF(stdout, "%2.2f\n", timing/1000);
 
 
 	unsigned long n = starpu_matrix_get_nx(dataA);
 	unsigned long n = starpu_matrix_get_nx(dataA);
 
 
 	double flop = (1.0f*n*n*n)/3.0f;
 	double flop = (1.0f*n*n*n)/3.0f;
-	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
 }
 }
 
 
 static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
@@ -143,17 +143,15 @@ static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 	 * one block is now determined by 2 unsigned (i,j) */
 	 * one block is now determined by 2 unsigned (i,j) */
 	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(float));
 	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(float));
 
 
-	struct starpu_data_filter f;
-		f.filter_func = starpu_vertical_block_filter_func;
-		f.nchildren = nblocks;
-		f.get_nchildren = NULL;
-		f.get_child_ops = NULL;
+	struct starpu_data_filter f = {
+		.filter_func = starpu_vertical_block_filter_func,
+		.nchildren = nblocks
+	};
 
 
-	struct starpu_data_filter f2;
-		f2.filter_func = starpu_block_filter_func;
-		f2.nchildren = nblocks;
-		f2.get_nchildren = NULL;
-		f2.get_child_ops = NULL;
+	struct starpu_data_filter f2 = {
+		.filter_func = starpu_block_filter_func,
+		.nchildren = nblocks
+	};
 
 
 	starpu_data_map_filters(dataA, 2, &f, &f2);
 	starpu_data_map_filters(dataA, 2, &f, &f2);
 
 
@@ -174,7 +172,7 @@ int main(int argc, char **argv)
 	starpu_helper_cublas_init();
 	starpu_helper_cublas_init();
 
 
 	float *mat;
 	float *mat;
-	starpu_data_malloc_pinned_if_possible((void **)&mat, (size_t)size*size*sizeof(float));
+	starpu_malloc((void **)&mat, (size_t)size*size*sizeof(float));
 
 
 	unsigned i,j;
 	unsigned i,j;
 	for (i = 0; i < size; i++)
 	for (i = 0; i < size; i++)
@@ -182,58 +180,58 @@ int main(int argc, char **argv)
 		for (j = 0; j < size; j++)
 		for (j = 0; j < size; j++)
 		{
 		{
 			mat[j +i*size] = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
 			mat[j +i*size] = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
-			//mat[j +i*size] = ((i == j)?1.0f*size:0.0f);
+			/* mat[j +i*size] = ((i == j)?1.0f*size:0.0f); */
 		}
 		}
 	}
 	}
 
 
-//#define PRINT_OUTPUT
+/* #define PRINT_OUTPUT */
 #ifdef PRINT_OUTPUT
 #ifdef PRINT_OUTPUT
-	printf("Input :\n");
+	FPRINTF(stdout, "Input :\n");
 
 
 	for (j = 0; j < size; j++)
 	for (j = 0; j < size; j++)
 	{
 	{
 		for (i = 0; i < size; i++)
 		for (i = 0; i < size; i++)
 		{
 		{
 			if (i <= j) {
 			if (i <= j) {
-				printf("%2.2f\t", mat[j +i*size]);
+				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
 			}
 			}
 			else {
 			else {
-				printf(".\t");
+				FPRINTF(stdout, ".\t");
 			}
 			}
 		}
 		}
-		printf("\n");
+		FPRINTF(stdout, "\n");
 	}
 	}
 #endif
 #endif
 
 
 	cholesky(mat, size, size, nblocks);
 	cholesky(mat, size, size, nblocks);
 
 
 #ifdef PRINT_OUTPUT
 #ifdef PRINT_OUTPUT
-	printf("Results :\n");
+	FPRINTF(stdout, "Results :\n");
 	for (j = 0; j < size; j++)
 	for (j = 0; j < size; j++)
 	{
 	{
 		for (i = 0; i < size; i++)
 		for (i = 0; i < size; i++)
 		{
 		{
 			if (i <= j) {
 			if (i <= j) {
-				printf("%2.2f\t", mat[j +i*size]);
+				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
 			}
 			}
 			else {
 			else {
-				printf(".\t");
-				mat[j+i*size] = 0.0f; // debug
+				FPRINTF(stdout, ".\t");
+				mat[j+i*size] = 0.0f; /* debug */
 			}
 			}
 		}
 		}
-		printf("\n");
+		FPRINTF(stdout, "\n");
 	}
 	}
 #endif
 #endif
 
 
 	if (check)
 	if (check)
 	{
 	{
-		fprintf(stderr, "compute explicit LLt ...\n");
+		FPRINTF(stderr, "compute explicit LLt ...\n");
 		for (j = 0; j < size; j++)
 		for (j = 0; j < size; j++)
 		{
 		{
 			for (i = 0; i < size; i++)
 			for (i = 0; i < size; i++)
 			{
 			{
 				if (i > j) {
 				if (i > j) {
-					mat[j+i*size] = 0.0f; // debug
+					mat[j+i*size] = 0.0f; /* debug */
 				}
 				}
 			}
 			}
 		}
 		}
@@ -243,20 +241,20 @@ int main(int argc, char **argv)
 		SSYRK("L", "N", size, size, 1.0f,
 		SSYRK("L", "N", size, size, 1.0f,
 					mat, size, 0.0f, test_mat, size);
 					mat, size, 0.0f, test_mat, size);
 	
 	
-		fprintf(stderr, "comparing results ...\n");
+		FPRINTF(stderr, "comparing results ...\n");
 #ifdef PRINT_OUTPUT
 #ifdef PRINT_OUTPUT
 		for (j = 0; j < size; j++)
 		for (j = 0; j < size; j++)
 		{
 		{
 			for (i = 0; i < size; i++)
 			for (i = 0; i < size; i++)
 			{
 			{
 				if (i <= j) {
 				if (i <= j) {
-					printf("%2.2f\t", test_mat[j +i*size]);
+					FPRINTF(stdout, "%2.2f\t", test_mat[j +i*size]);
 				}
 				}
 				else {
 				else {
-					printf(".\t");
+					FPRINTF(stdout, ".\t");
 				}
 				}
 			}
 			}
-			printf("\n");
+			FPRINTF(stdout, "\n");
 		}
 		}
 #endif
 #endif
 	
 	
@@ -268,7 +266,7 @@ int main(int argc, char **argv)
 	                                float orig = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
 	                                float orig = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
 	                                float err = abs(test_mat[j +i*size] - orig);
 	                                float err = abs(test_mat[j +i*size] - orig);
 	                                if (err > 0.00001) {
 	                                if (err > 0.00001) {
-	                                        fprintf(stderr, "Error[%d, %d] --> %2.2f != %2.2f (err %2.2f)\n", i, j, test_mat[j +i*size], orig, err);
+	                                        FPRINTF(stderr, "Error[%u, %u] --> %2.2f != %2.2f (err %2.2f)\n", i, j, test_mat[j +i*size], orig, err);
 	                                        assert(0);
 	                                        assert(0);
 	                                }
 	                                }
 	                        }
 	                        }

+ 26 - 9
examples/cholesky/cholesky_kernels.c

@@ -20,6 +20,10 @@
 #include "../common/blas.h"
 #include "../common/blas.h"
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 #include <starpu_cuda.h>
 #include <starpu_cuda.h>
+#ifdef STARPU_HAVE_MAGMA
+#include "magma.h"
+#include "magma_lapack.h"
+#endif
 #endif
 #endif
 
 
 /*
 /*
@@ -28,7 +32,7 @@
 
 
 static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, __attribute__((unused)) void *_args)
 static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, __attribute__((unused)) void *_args)
 {
 {
-	//printf("22\n");
+	/* printf("22\n"); */
 	float *left 	= (float *)STARPU_MATRIX_GET_PTR(descr[0]);
 	float *left 	= (float *)STARPU_MATRIX_GET_PTR(descr[0]);
 	float *right 	= (float *)STARPU_MATRIX_GET_PTR(descr[1]);
 	float *right 	= (float *)STARPU_MATRIX_GET_PTR(descr[1]);
 	float *center 	= (float *)STARPU_MATRIX_GET_PTR(descr[2]);
 	float *center 	= (float *)STARPU_MATRIX_GET_PTR(descr[2]);
@@ -88,7 +92,7 @@ void chol_cublas_codelet_update_u22(void *descr[], void *_args)
 {
 {
 	chol_common_cpu_codelet_update_u22(descr, 1, _args);
 	chol_common_cpu_codelet_update_u22(descr, 1, _args);
 }
 }
-#endif// STARPU_USE_CUDA
+#endif /* STARPU_USE_CUDA */
 
 
 /* 
 /* 
  * U21
  * U21
@@ -96,7 +100,7 @@ void chol_cublas_codelet_update_u22(void *descr[], void *_args)
 
 
 static inline void chol_common_codelet_update_u21(void *descr[], int s, __attribute__((unused)) void *_args)
 static inline void chol_common_codelet_update_u21(void *descr[], int s, __attribute__((unused)) void *_args)
 {
 {
-//	printf("21\n");
+/*	printf("21\n"); */
 	float *sub11;
 	float *sub11;
 	float *sub21;
 	float *sub21;
 
 
@@ -143,7 +147,7 @@ void chol_cublas_codelet_update_u21(void *descr[], void *_args)
 
 
 static inline void chol_common_codelet_update_u11(void *descr[], int s, __attribute__((unused)) void *_args) 
 static inline void chol_common_codelet_update_u11(void *descr[], int s, __attribute__((unused)) void *_args) 
 {
 {
-//	printf("11\n");
+/*	printf("11\n"); */
 	float *sub11;
 	float *sub11;
 
 
 	sub11 = (float *)STARPU_MATRIX_GET_PTR(descr[0]); 
 	sub11 = (float *)STARPU_MATRIX_GET_PTR(descr[0]); 
@@ -179,13 +183,27 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, __attrib
 			break;
 			break;
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 		case 1:
 		case 1:
+#ifdef STARPU_HAVE_MAGMA
 			{
 			{
+			int ret;
+			int info;
+			ret = magma_spotrf_gpu('L', nx, sub11, ld, &info);
+			if (ret != MAGMA_SUCCESS) {
+				fprintf(stderr, "Error in Magma: %d\n", ret);
+				STARPU_ABORT();
+			}
+			cudaError_t cures = cudaThreadSynchronize();
+			STARPU_ASSERT(!cures);
+			}
+#else
+			{
+
 			float *lambda11;
 			float *lambda11;
 			cudaHostAlloc((void **)&lambda11, sizeof(float), 0);
 			cudaHostAlloc((void **)&lambda11, sizeof(float), 0);
 
 
 			for (z = 0; z < nx; z++)
 			for (z = 0; z < nx; z++)
 			{
 			{
-
+				
 				cudaMemcpyAsync(lambda11, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
 				cudaMemcpyAsync(lambda11, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
 				cudaStreamSynchronize(starpu_cuda_get_local_stream());
 				cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 
@@ -193,7 +211,7 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, __attrib
 				
 				
 				*lambda11 = sqrt(*lambda11);
 				*lambda11 = sqrt(*lambda11);
 
 
-//				cublasSetVector(1, sizeof(float), lambda11, sizeof(float), &sub11[z+z*ld], sizeof(float));
+/*				cublasSetVector(1, sizeof(float), lambda11, sizeof(float), &sub11[z+z*ld], sizeof(float)); */
 				cudaMemcpyAsync(&sub11[z+z*ld], lambda11, sizeof(float), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
 				cudaMemcpyAsync(&sub11[z+z*ld], lambda11, sizeof(float), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
 
 
 				cublasSscal(nx - z - 1, 1.0f/(*lambda11), &sub11[(z+1)+z*ld], 1);
 				cublasSscal(nx - z - 1, 1.0f/(*lambda11), &sub11[(z+1)+z*ld], 1);
@@ -206,8 +224,7 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, __attrib
 			cudaStreamSynchronize(starpu_cuda_get_local_stream());
 			cudaStreamSynchronize(starpu_cuda_get_local_stream());
 			cudaFreeHost(lambda11);
 			cudaFreeHost(lambda11);
 			}
 			}
-		
-
+#endif
 			break;
 			break;
 #endif
 #endif
 		default:
 		default:
@@ -227,4 +244,4 @@ void chol_cublas_codelet_update_u11(void *descr[], void *_args)
 {
 {
 	chol_common_codelet_update_u11(descr, 1, _args);
 	chol_common_codelet_update_u11(descr, 1, _args);
 }
 }
-#endif// STARPU_USE_CUDA
+#endif/* STARPU_USE_CUDA */

+ 9 - 9
examples/cholesky/cholesky_models.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -26,7 +26,7 @@
 
 
 #include <starpu.h>
 #include <starpu.h>
 
 
-//#define USE_PERTURBATION	1
+/* #define USE_PERTURBATION	1 */
 
 
 #ifdef USE_PERTURBATION
 #ifdef USE_PERTURBATION
 #define PERTURBATE(a)	((starpu_drand48()*2.0f*(AMPL) + 1.0f - (AMPL))*(a))
 #define PERTURBATE(a)	((starpu_drand48()*2.0f*(AMPL) + 1.0f - (AMPL))*(a))
@@ -43,7 +43,7 @@ static double cpu_chol_task_11_cost(starpu_buffer_descr *descr)
 	double cost = (((double)(n)*n*n)/1000.0f*0.894/0.79176);
 	double cost = (((double)(n)*n*n)/1000.0f*0.894/0.79176);
 
 
 #ifdef STARPU_MODEL_DEBUG
 #ifdef STARPU_MODEL_DEBUG
-	printf("cpu_chol_task_11_cost n %d cost %e\n", n, cost);
+	FPRINTF(stdout, "cpu_chol_task_11_cost n %d cost %e\n", n, cost);
 #endif
 #endif
 
 
 	return PERTURBATE(cost);
 	return PERTURBATE(cost);
@@ -58,7 +58,7 @@ static double cuda_chol_task_11_cost(starpu_buffer_descr *descr)
 	double cost = (((double)(n)*n*n)/50.0f/10.75/5.088633/0.9883);
 	double cost = (((double)(n)*n*n)/50.0f/10.75/5.088633/0.9883);
 
 
 #ifdef STARPU_MODEL_DEBUG
 #ifdef STARPU_MODEL_DEBUG
-	printf("cuda_chol_task_11_cost n %d cost %e\n", n, cost);
+	FPRINTF(stdout, "cuda_chol_task_11_cost n %d cost %e\n", n, cost);
 #endif
 #endif
 
 
 	return PERTURBATE(cost);
 	return PERTURBATE(cost);
@@ -73,7 +73,7 @@ static double cpu_chol_task_21_cost(starpu_buffer_descr *descr)
 	double cost = (((double)(n)*n*n)/7706.674/0.95/0.9965);
 	double cost = (((double)(n)*n*n)/7706.674/0.95/0.9965);
 
 
 #ifdef STARPU_MODEL_DEBUG
 #ifdef STARPU_MODEL_DEBUG
-	printf("cpu_chol_task_21_cost n %d cost %e\n", n, cost);
+	FPRINTF(stdout, "cpu_chol_task_21_cost n %d cost %e\n", n, cost);
 #endif
 #endif
 
 
 	return PERTURBATE(cost);
 	return PERTURBATE(cost);
@@ -88,7 +88,7 @@ static double cuda_chol_task_21_cost(starpu_buffer_descr *descr)
 	double cost = (((double)(n)*n*n)/50.0f/10.75/87.29520);
 	double cost = (((double)(n)*n*n)/50.0f/10.75/87.29520);
 
 
 #ifdef STARPU_MODEL_DEBUG
 #ifdef STARPU_MODEL_DEBUG
-	printf("cuda_chol_task_21_cost n %d cost %e\n", n, cost);
+	FPRINTF(stdout, "cuda_chol_task_21_cost n %d cost %e\n", n, cost);
 #endif
 #endif
 
 
 	return PERTURBATE(cost);
 	return PERTURBATE(cost);
@@ -103,7 +103,7 @@ static double cpu_chol_task_22_cost(starpu_buffer_descr *descr)
 	double cost = (((double)(n)*n*n)/50.0f/10.75/8.0760);
 	double cost = (((double)(n)*n*n)/50.0f/10.75/8.0760);
 
 
 #ifdef STARPU_MODEL_DEBUG
 #ifdef STARPU_MODEL_DEBUG
-	printf("cpu_chol_task_22_cost n %d cost %e\n", n, cost);
+	FPRINTF(stdout, "cpu_chol_task_22_cost n %d cost %e\n", n, cost);
 #endif
 #endif
 
 
 	return PERTURBATE(cost);
 	return PERTURBATE(cost);
@@ -118,7 +118,7 @@ static double cuda_chol_task_22_cost(starpu_buffer_descr *descr)
 	double cost = (((double)(n)*n*n)/50.0f/10.75/76.30666);
 	double cost = (((double)(n)*n*n)/50.0f/10.75/76.30666);
 
 
 #ifdef STARPU_MODEL_DEBUG
 #ifdef STARPU_MODEL_DEBUG
-	printf("cuda_chol_task_22_cost n %d cost %e\n", n, cost);
+	FPRINTF(stdout, "cuda_chol_task_22_cost n %d cost %e\n", n, cost);
 #endif
 #endif
 
 
 	return PERTURBATE(cost);
 	return PERTURBATE(cost);

+ 34 - 36
examples/cholesky/cholesky_tag.c

@@ -2,7 +2,7 @@
  *
  *
  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -49,7 +49,7 @@ static starpu_codelet cl11 =
 
 
 static struct starpu_task * create_task_11(starpu_data_handle dataA, unsigned k)
 static struct starpu_task * create_task_11(starpu_data_handle dataA, unsigned k)
 {
 {
-//	printf("task 11 k = %d TAG = %llx\n", k, (TAG11(k)));
+/*	FPRINTF(stdout, "task 11 k = %d TAG = %llx\n", k, (TAG11(k))); */
 
 
 	struct starpu_task *task = create_task(TAG11(k));
 	struct starpu_task *task = create_task(TAG11(k));
 	
 	
@@ -108,7 +108,7 @@ static void create_task_21(starpu_data_handle dataA, unsigned k, unsigned j)
 
 
 	int ret = starpu_task_submit(task);
 	int ret = starpu_task_submit(task);
         if (STARPU_UNLIKELY(ret == -ENODEV)) {
         if (STARPU_UNLIKELY(ret == -ENODEV)) {
-                fprintf(stderr, "No worker may execute this task\n");
+                FPRINTF(stderr, "No worker may execute this task\n");
                 exit(0);
                 exit(0);
         }
         }
 
 
@@ -127,7 +127,7 @@ static starpu_codelet cl22 =
 
 
 static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, unsigned j)
 static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, unsigned j)
 {
 {
-//	printf("task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j));
+/*	FPRINTF(stdout, "task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j)); */
 
 
 	struct starpu_task *task = create_task(TAG22(k, i, j));
 	struct starpu_task *task = create_task(TAG22(k, i, j));
 
 
@@ -155,7 +155,7 @@ static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, uns
 
 
 	int ret = starpu_task_submit(task);
 	int ret = starpu_task_submit(task);
         if (STARPU_UNLIKELY(ret == -ENODEV)) {
         if (STARPU_UNLIKELY(ret == -ENODEV)) {
-                fprintf(stderr, "No worker may execute this task\n");
+                FPRINTF(stderr, "No worker may execute this task\n");
                 exit(0);
                 exit(0);
         }
         }
 }
 }
@@ -189,7 +189,7 @@ static void _cholesky(starpu_data_handle dataA, unsigned nblocks)
 		else {
 		else {
 			int ret = starpu_task_submit(task);
 			int ret = starpu_task_submit(task);
                         if (STARPU_UNLIKELY(ret == -ENODEV)) {
                         if (STARPU_UNLIKELY(ret == -ENODEV)) {
-                                fprintf(stderr, "No worker may execute this task\n");
+                                FPRINTF(stderr, "No worker may execute this task\n");
                                 exit(0);
                                 exit(0);
                         }
                         }
 
 
@@ -210,7 +210,7 @@ static void _cholesky(starpu_data_handle dataA, unsigned nblocks)
 	/* schedule the codelet */
 	/* schedule the codelet */
 	int ret = starpu_task_submit(entry_task);
 	int ret = starpu_task_submit(entry_task);
         if (STARPU_UNLIKELY(ret == -ENODEV)) {
         if (STARPU_UNLIKELY(ret == -ENODEV)) {
-                fprintf(stderr, "No worker may execute this task\n");
+                FPRINTF(stderr, "No worker may execute this task\n");
                 exit(0);
                 exit(0);
         }
         }
 
 
@@ -224,13 +224,13 @@ static void _cholesky(starpu_data_handle dataA, unsigned nblocks)
 
 
 
 
 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
-	fprintf(stderr, "Computation took (in ms)\n");
-	printf("%2.2f\n", timing/1000);
+	FPRINTF(stderr, "Computation took (in ms)\n");
+	FPRINTF(stdout, "%2.2f\n", timing/1000);
 
 
 	unsigned n = starpu_matrix_get_nx(dataA);
 	unsigned n = starpu_matrix_get_nx(dataA);
 
 
 	double flop = (1.0f*n*n*n)/3.0f;
 	double flop = (1.0f*n*n*n)/3.0f;
-	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
 }
 }
 
 
 static void initialize_system(float **A, unsigned dim, unsigned pinned)
 static void initialize_system(float **A, unsigned dim, unsigned pinned)
@@ -241,7 +241,7 @@ static void initialize_system(float **A, unsigned dim, unsigned pinned)
 
 
 	if (pinned)
 	if (pinned)
 	{
 	{
-		starpu_data_malloc_pinned_if_possible((void **)A, (size_t)dim*dim*sizeof(float));
+		starpu_malloc((void **)A, (size_t)dim*dim*sizeof(float));
 	} 
 	} 
 	else {
 	else {
 		*A = malloc(dim*dim*sizeof(float));
 		*A = malloc(dim*dim*sizeof(float));
@@ -258,17 +258,15 @@ static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 
 
 	starpu_data_set_sequential_consistency_flag(dataA, 0);
 	starpu_data_set_sequential_consistency_flag(dataA, 0);
 
 
-	struct starpu_data_filter f;
-		f.filter_func = starpu_vertical_block_filter_func;
-		f.nchildren = nblocks;
-		f.get_nchildren = NULL;
-		f.get_child_ops = NULL;
+	struct starpu_data_filter f = {
+		.filter_func = starpu_vertical_block_filter_func,
+		.nchildren = nblocks
+	};
 
 
-	struct starpu_data_filter f2;
-		f2.filter_func = starpu_block_filter_func;
-		f2.nchildren = nblocks;
-		f2.get_nchildren = NULL;
-		f2.get_child_ops = NULL;
+	struct starpu_data_filter f2 = {
+		.filter_func = starpu_block_filter_func,
+		.nchildren = nblocks
+	};
 
 
 	starpu_data_map_filters(dataA, 2, &f, &f2);
 	starpu_data_map_filters(dataA, 2, &f, &f2);
 
 
@@ -299,26 +297,26 @@ int main(int argc, char **argv)
 		for (j = 0; j < size; j++)
 		for (j = 0; j < size; j++)
 		{
 		{
 			mat[j +i*size] = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
 			mat[j +i*size] = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
-			//mat[j +i*size] = ((i == j)?1.0f*size:0.0f);
+			/* mat[j +i*size] = ((i == j)?1.0f*size:0.0f); */
 		}
 		}
 	}
 	}
 
 
 
 
 #ifdef CHECK_OUTPUT
 #ifdef CHECK_OUTPUT
-	printf("Input :\n");
+	FPRINTF(stdout, "Input :\n");
 
 
 	for (j = 0; j < size; j++)
 	for (j = 0; j < size; j++)
 	{
 	{
 		for (i = 0; i < size; i++)
 		for (i = 0; i < size; i++)
 		{
 		{
 			if (i <= j) {
 			if (i <= j) {
-				printf("%2.2f\t", mat[j +i*size]);
+				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
 			}
 			}
 			else {
 			else {
-				printf(".\t");
+				FPRINTF(stdout, ".\t");
 			}
 			}
 		}
 		}
-		printf("\n");
+		FPRINTF(stdout, "\n");
 	}
 	}
 #endif
 #endif
 
 
@@ -326,43 +324,43 @@ int main(int argc, char **argv)
 	cholesky(mat, size, size, nblocks);
 	cholesky(mat, size, size, nblocks);
 
 
 #ifdef CHECK_OUTPUT
 #ifdef CHECK_OUTPUT
-	printf("Results :\n");
+	FPRINTF(stdout, "Results :\n");
 
 
 	for (j = 0; j < size; j++)
 	for (j = 0; j < size; j++)
 	{
 	{
 		for (i = 0; i < size; i++)
 		for (i = 0; i < size; i++)
 		{
 		{
 			if (i <= j) {
 			if (i <= j) {
-				printf("%2.2f\t", mat[j +i*size]);
+				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
 			}
 			}
 			else {
 			else {
-				printf(".\t");
-				mat[j+i*size] = 0.0f; // debug
+				FPRINTF(stdout, ".\t");
+				mat[j+i*size] = 0.0f; /* debug */
 			}
 			}
 		}
 		}
-		printf("\n");
+		FPRINTF(stdout, "\n");
 	}
 	}
 
 
-	fprintf(stderr, "compute explicit LLt ...\n");
+	FPRINTF(stderr, "compute explicit LLt ...\n");
 	float *test_mat = malloc(size*size*sizeof(float));
 	float *test_mat = malloc(size*size*sizeof(float));
 	STARPU_ASSERT(test_mat);
 	STARPU_ASSERT(test_mat);
 
 
 	SSYRK("L", "N", size, size, 1.0f, 
 	SSYRK("L", "N", size, size, 1.0f, 
 				mat, size, 0.0f, test_mat, size);
 				mat, size, 0.0f, test_mat, size);
 
 
-	fprintf(stderr, "comparing results ...\n");
+	FPRINTF(stderr, "comparing results ...\n");
 	for (j = 0; j < size; j++)
 	for (j = 0; j < size; j++)
 	{
 	{
 		for (i = 0; i < size; i++)
 		for (i = 0; i < size; i++)
 		{
 		{
 			if (i <= j) {
 			if (i <= j) {
-				printf("%2.2f\t", test_mat[j +i*size]);
+				FPRINTF(stdout, "%2.2f\t", test_mat[j +i*size]);
 			}
 			}
 			else {
 			else {
-				printf(".\t");
+				FPRINTF(stdout, ".\t");
 			}
 			}
 		}
 		}
-		printf("\n");
+		FPRINTF(stdout, "\n");
 	}
 	}
 #endif
 #endif
 
 

+ 7 - 7
examples/cholesky/cholesky_tile_tag.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -59,7 +59,7 @@ static starpu_codelet cl11 =
 
 
 static struct starpu_task * create_task_11(unsigned k, unsigned nblocks)
 static struct starpu_task * create_task_11(unsigned k, unsigned nblocks)
 {
 {
-//	printf("task 11 k = %d TAG = %llx\n", k, (TAG11(k)));
+/*	FPRINTF(stdout, "task 11 k = %d TAG = %llx\n", k, (TAG11(k))); */
 
 
 	struct starpu_task *task = create_task(TAG11(k));
 	struct starpu_task *task = create_task(TAG11(k));
 	
 	
@@ -145,7 +145,7 @@ static starpu_codelet cl22 =
 
 
 static void create_task_22(unsigned k, unsigned i, unsigned j)
 static void create_task_22(unsigned k, unsigned i, unsigned j)
 {
 {
-//	printf("task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j));
+/*	FPRINTF(stdout, "task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j)); */
 
 
 	struct starpu_task *task = create_task(TAG22(k, i, j));
 	struct starpu_task *task = create_task(TAG22(k, i, j));
 
 
@@ -224,11 +224,11 @@ static void cholesky_no_stride(void)
 	gettimeofday(&end, NULL);
 	gettimeofday(&end, NULL);
 
 
 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
-	fprintf(stderr, "Computation took (in ms)\n");
-	printf("%2.2f\n", timing/1000);
+	FPRINTF(stderr, "Computation took (in ms)\n");
+	FPRINTF(stdout, "%2.2f\n", timing/1000);
 
 
 	double flop = (1.0f*size*size*size)/3.0f;
 	double flop = (1.0f*size*size*size)/3.0f;
-	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
 }
 }
 
 
 int main(int argc, char **argv)
 int main(int argc, char **argv)
@@ -239,7 +239,7 @@ int main(int argc, char **argv)
 	parse_args(argc, argv);
 	parse_args(argc, argv);
 	assert(nblocks <= NMAXBLOCKS);
 	assert(nblocks <= NMAXBLOCKS);
 
 
-	fprintf(stderr, "BLOCK SIZE = %d\n", size / nblocks);
+	FPRINTF(stderr, "BLOCK SIZE = %d\n", size / nblocks);
 
 
 	starpu_init(NULL);
 	starpu_init(NULL);
 
 

+ 2 - 2
examples/common/blas.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -154,4 +154,4 @@ extern void dswap_(const int *n, double *x, const int *incx, double *y, const in
 
 
 #endif
 #endif
 
 
-#endif // __BLAS_H__
+#endif /* __BLAS_H__ */

+ 3 - 3
examples/common/blas_model.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -37,11 +37,11 @@ double gemm_cost(starpu_buffer_descr *descr)
 	nyC = starpu_matrix_get_ny(descr[2].handle);
 	nyC = starpu_matrix_get_ny(descr[2].handle);
 	nxA = starpu_matrix_get_nx(descr[0].handle);
 	nxA = starpu_matrix_get_nx(descr[0].handle);
 
 
-//	printf("nxC %d nxC %d nxA %d\n", nxC, nyC, nxA);
+/*	printf("nxC %d nxC %d nxA %d\n", nxC, nyC, nxA); */
 
 
 	double cost = ((double)nxC)*((double)nyC)*((double)nxA/1000.0f/4.11f);
 	double cost = ((double)nxC)*((double)nyC)*((double)nxA/1000.0f/4.11f);
 
 
-//	printf("cost %e \n", cost);
+/*	printf("cost %e \n", cost); */
 
 
 	return cost;
 	return cost;
 }
 }

+ 2 - 2
examples/common/blas_model.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -54,4 +54,4 @@ static struct starpu_perfmodel_t starpu_dgemm_model_common = {
 	.type = STARPU_COMMON,
 	.type = STARPU_COMMON,
 };
 };
 
 
-#endif // __BLAS_MODEL_H__
+#endif /* __BLAS_MODEL_H__ */

+ 20 - 18
examples/filters/fblock.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -23,6 +23,8 @@
 #define NZ    3
 #define NZ    3
 #define PARTS 2
 #define PARTS 2
 
 
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
 extern void cpu_func(void *buffers[], void *cl_arg);
 extern void cpu_func(void *buffers[], void *cl_arg);
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
@@ -36,17 +38,17 @@ extern void opencl_func(void *buffers[], void *cl_arg);
 void print_block(int *block, int nx, int ny, int nz, unsigned ldy, unsigned ldz)
 void print_block(int *block, int nx, int ny, int nz, unsigned ldy, unsigned ldz)
 {
 {
         int i, j, k;
         int i, j, k;
-        fprintf(stderr, "block=%p nx=%d ny=%d nz=%d ldy=%d ldz=%d\n", block, nx, ny, nz, ldy, ldz);
+        FPRINTF(stderr, "block=%p nx=%d ny=%d nz=%d ldy=%u ldz=%u\n", block, nx, ny, nz, ldy, ldz);
         for(k=0 ; k<nz ; k++) {
         for(k=0 ; k<nz ; k++) {
                 for(j=0 ; j<ny ; j++) {
                 for(j=0 ; j<ny ; j++) {
                         for(i=0 ; i<nx ; i++) {
                         for(i=0 ; i<nx ; i++) {
-                                fprintf(stderr, "%2d ", block[(k*ldz)+(j*ldy)+i]);
+                                FPRINTF(stderr, "%2d ", block[(k*ldz)+(j*ldy)+i]);
                         }
                         }
-                        fprintf(stderr,"\n");
+                        FPRINTF(stderr,"\n");
                 }
                 }
-                fprintf(stderr,"\n");
+                FPRINTF(stderr,"\n");
         }
         }
-        fprintf(stderr,"\n");
+        FPRINTF(stderr,"\n");
 }
 }
 
 
 void print_data(starpu_data_handle block_handle)
 void print_data(starpu_data_handle block_handle)
@@ -96,30 +98,28 @@ int main(int argc, char **argv)
         starpu_init(NULL);
         starpu_init(NULL);
 
 
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
-        starpu_opencl_load_opencl_from_file("examples/filters/fblock_opencl_kernel.cl", &opencl_program);
+        starpu_opencl_load_opencl_from_file("examples/filters/fblock_opencl_kernel.cl", &opencl_program, NULL);
 #endif
 #endif
 
 
         /* Declare data to StarPU */
         /* Declare data to StarPU */
         starpu_block_data_register(&handle, 0, (uintptr_t)block, NX, NX*NY, NX, NY, NZ, sizeof(int));
         starpu_block_data_register(&handle, 0, (uintptr_t)block, NX, NX*NY, NX, NY, NZ, sizeof(int));
-        fprintf(stderr, "IN  Block\n");
+        FPRINTF(stderr, "IN  Block\n");
         print_data(handle);
         print_data(handle);
 
 
         /* Partition the block in PARTS sub-blocks */
         /* Partition the block in PARTS sub-blocks */
 	struct starpu_data_filter f =
 	struct starpu_data_filter f =
 	{
 	{
 		.filter_func = starpu_block_filter_func_block,
 		.filter_func = starpu_block_filter_func_block,
-		.nchildren = PARTS,
-		.get_nchildren = NULL,
-		.get_child_ops = NULL
+		.nchildren = PARTS
 	};
 	};
         starpu_data_partition(handle, &f);
         starpu_data_partition(handle, &f);
 
 
-        fprintf(stderr,"Nb of partitions : %d\n",starpu_data_get_nb_children(handle));
+        FPRINTF(stderr,"Nb of partitions : %d\n",starpu_data_get_nb_children(handle));
 
 
         for(i=0 ; i<starpu_data_get_nb_children(handle) ; i++)
         for(i=0 ; i<starpu_data_get_nb_children(handle) ; i++)
         {
         {
                 starpu_data_handle sblock = starpu_data_get_sub_data(handle, 1, i);
                 starpu_data_handle sblock = starpu_data_get_sub_data(handle, 1, i);
-                fprintf(stderr, "Sub block %d\n", i);
+                FPRINTF(stderr, "Sub block %d\n", i);
                 print_data(sblock);
                 print_data(sblock);
         }
         }
 
 
@@ -129,7 +129,7 @@ int main(int argc, char **argv)
                 int ret,multiplier=i;
                 int ret,multiplier=i;
                 struct starpu_task *task = starpu_task_create();
                 struct starpu_task *task = starpu_task_create();
 
 
-                fprintf(stderr,"Dealing with sub-block %d\n", i);
+                FPRINTF(stderr,"Dealing with sub-block %d\n", i);
                 task->cl = &cl;
                 task->cl = &cl;
                 task->synchronous = 1;
                 task->synchronous = 1;
                 task->callback_func = NULL;
                 task->callback_func = NULL;
@@ -139,9 +139,10 @@ int main(int argc, char **argv)
 
 
                 ret = starpu_task_submit(task);
                 ret = starpu_task_submit(task);
                 if (ret) {
                 if (ret) {
-                        fprintf(stderr, "Error when submitting task\n");
+                        FPRINTF(stderr, "Error when submitting task\n");
                         exit(ret);
                         exit(ret);
                 }
                 }
+		starpu_task_destroy(task);
         }
         }
 
 
         /* Unpartition the data, unregister it from StarPU and shutdown */
         /* Unpartition the data, unregister it from StarPU and shutdown */
@@ -150,10 +151,11 @@ int main(int argc, char **argv)
         starpu_data_unregister(handle);
         starpu_data_unregister(handle);
 
 
         /* Print result block */
         /* Print result block */
-        fprintf(stderr, "OUT Block\n");
+        FPRINTF(stderr, "OUT Block\n");
         print_block(block, NX, NY, NZ, NX, NX*NY);
         print_block(block, NX, NY, NZ, NX, NX*NY);
 
 
-	starpu_shutdown();
+	free(block);
 
 
+	starpu_shutdown();
 	return 0;
 	return 0;
 }
 }

+ 2 - 2
examples/filters/fblock_opencl.c

@@ -28,7 +28,7 @@ void opencl_func(void *buffers[], void *cl_arg)
 	cl_event event;
 	cl_event event;
 
 
         int *factor = cl_arg;
         int *factor = cl_arg;
-	int *block = (int *)STARPU_BLOCK_GET_PTR(buffers[0]);
+	cl_mem block = (cl_mem)STARPU_BLOCK_GET_PTR(buffers[0]);
 	int nx = (int)STARPU_BLOCK_GET_NX(buffers[0]);
 	int nx = (int)STARPU_BLOCK_GET_NX(buffers[0]);
 	int ny = (int)STARPU_BLOCK_GET_NY(buffers[0]);
 	int ny = (int)STARPU_BLOCK_GET_NY(buffers[0]);
 	int nz = (int)STARPU_BLOCK_GET_NZ(buffers[0]);
 	int nz = (int)STARPU_BLOCK_GET_NZ(buffers[0]);
@@ -42,7 +42,7 @@ void opencl_func(void *buffers[], void *cl_arg)
 	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
 
 	err = 0;
 	err = 0;
-	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &block);
+	err = clSetKernelArg(kernel, 0, sizeof(block), &block);
 	err = clSetKernelArg(kernel, 1, sizeof(nx), &nx);
 	err = clSetKernelArg(kernel, 1, sizeof(nx), &nx);
 	err = clSetKernelArg(kernel, 2, sizeof(ny), &ny);
 	err = clSetKernelArg(kernel, 2, sizeof(ny), &ny);
 	err = clSetKernelArg(kernel, 3, sizeof(nz), &nz);
 	err = clSetKernelArg(kernel, 3, sizeof(nz), &nz);

+ 13 - 12
examples/filters/fmatrix.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -20,6 +20,8 @@
 #define NY    4
 #define NY    4
 #define PARTS 2
 #define PARTS 2
 
 
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
 void cpu_func(void *buffers[], void *cl_arg)
 void cpu_func(void *buffers[], void *cl_arg)
 {
 {
         unsigned i, j;
         unsigned i, j;
@@ -43,15 +45,15 @@ int main(int argc, char **argv)
 	unsigned i, j, n=1;
 	unsigned i, j, n=1;
         int matrix[NX*NY];
         int matrix[NX*NY];
 
 
-        fprintf(stderr,"IN  Matrix: \n");
+        FPRINTF(stderr,"IN  Matrix: \n");
         for(j=0 ; j<NY ; j++) {
         for(j=0 ; j<NY ; j++) {
                 for(i=0 ; i<NX ; i++) {
                 for(i=0 ; i<NX ; i++) {
                         matrix[(j*NX)+i] = n++;
                         matrix[(j*NX)+i] = n++;
-                        fprintf(stderr, "%2d ", matrix[(j*NX)+i]);
+                        FPRINTF(stderr, "%2d ", matrix[(j*NX)+i]);
                 }
                 }
-                fprintf(stderr,"\n");
+                FPRINTF(stderr,"\n");
         }
         }
-        fprintf(stderr,"\n");
+        FPRINTF(stderr,"\n");
 
 
         starpu_data_handle handle;
         starpu_data_handle handle;
         starpu_codelet cl = {
         starpu_codelet cl = {
@@ -68,9 +70,7 @@ int main(int argc, char **argv)
 	struct starpu_data_filter f =
 	struct starpu_data_filter f =
 	{
 	{
 		.filter_func = starpu_block_filter_func,
 		.filter_func = starpu_block_filter_func,
-		.nchildren = PARTS,
-		.get_nchildren = NULL,
-		.get_child_ops = NULL
+		.nchildren = PARTS
 	};
 	};
 	starpu_data_partition(handle, &f);
 	starpu_data_partition(handle, &f);
 
 
@@ -86,6 +86,7 @@ int main(int argc, char **argv)
                 task->cl_arg = &factor;
                 task->cl_arg = &factor;
                 task->cl_arg_size = sizeof(factor);
                 task->cl_arg_size = sizeof(factor);
 		starpu_task_submit(task);
 		starpu_task_submit(task);
+		starpu_task_destroy(task);
 	}
 	}
 
 
         /* Unpartition the data, unregister it from StarPU and shutdown */
         /* Unpartition the data, unregister it from StarPU and shutdown */
@@ -94,14 +95,14 @@ int main(int argc, char **argv)
 	starpu_shutdown();
 	starpu_shutdown();
 
 
         /* Print result matrix */
         /* Print result matrix */
-        fprintf(stderr,"OUT Matrix: \n");
+        FPRINTF(stderr,"OUT Matrix: \n");
         for(j=0 ; j<NY ; j++) {
         for(j=0 ; j<NY ; j++) {
                 for(i=0 ; i<NX ; i++) {
                 for(i=0 ; i<NX ; i++) {
-                        fprintf(stderr, "%2d ", matrix[(j*NX)+i]);
+                        FPRINTF(stderr, "%2d ", matrix[(j*NX)+i]);
                 }
                 }
-                fprintf(stderr,"\n");
+                FPRINTF(stderr,"\n");
         }
         }
-        fprintf(stderr,"\n");
+        FPRINTF(stderr,"\n");
 
 
 	return 0;
 	return 0;
 }
 }

+ 11 - 10
examples/filters/fvector.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -19,6 +19,8 @@
 #define NX    21
 #define NX    21
 #define PARTS 3
 #define PARTS 3
 
 
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
 void cpu_func(void *buffers[], void *cl_arg)
 void cpu_func(void *buffers[], void *cl_arg)
 {
 {
         unsigned i;
         unsigned i;
@@ -47,9 +49,9 @@ int main(int argc, char **argv)
         };
         };
 
 
         for(i=0 ; i<NX ; i++) vector[i] = i;
         for(i=0 ; i<NX ; i++) vector[i] = i;
-        fprintf(stderr,"IN  Vector: ");
-        for(i=0 ; i<NX ; i++) fprintf(stderr, "%5d ", vector[i]);
-        fprintf(stderr,"\n");
+        FPRINTF(stderr,"IN  Vector: ");
+        for(i=0 ; i<NX ; i++) FPRINTF(stderr, "%5d ", vector[i]);
+        FPRINTF(stderr,"\n");
 
 
 	starpu_init(NULL);
 	starpu_init(NULL);
 
 
@@ -60,9 +62,7 @@ int main(int argc, char **argv)
 	struct starpu_data_filter f =
 	struct starpu_data_filter f =
 	{
 	{
 		.filter_func = starpu_block_filter_func_vector,
 		.filter_func = starpu_block_filter_func_vector,
-		.nchildren = PARTS,
-		.get_nchildren = NULL,
-		.get_child_ops = NULL
+		.nchildren = PARTS
 	};
 	};
 	starpu_data_partition(handle, &f);
 	starpu_data_partition(handle, &f);
 
 
@@ -81,15 +81,16 @@ int main(int argc, char **argv)
                 task->cl_arg_size = sizeof(factor);
                 task->cl_arg_size = sizeof(factor);
 
 
 		starpu_task_submit(task);
 		starpu_task_submit(task);
+		starpu_task_destroy(task);
 	}
 	}
 
 
 	starpu_data_unpartition(handle, 0);
 	starpu_data_unpartition(handle, 0);
         starpu_data_unregister(handle);
         starpu_data_unregister(handle);
 	starpu_shutdown();
 	starpu_shutdown();
 
 
-        fprintf(stderr,"OUT Vector: ");
-        for(i=0 ; i<NX ; i++) fprintf(stderr, "%5d ", vector[i]);
-        fprintf(stderr,"\n");
+        FPRINTF(stderr,"OUT Vector: ");
+        for(i=0 ; i<NX ; i++) FPRINTF(stderr, "%5d ", vector[i]);
+        FPRINTF(stderr,"\n");
 
 
 	return 0;
 	return 0;
 }
 }

+ 21 - 23
examples/heat/dw_factolu.c

@@ -1,8 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -624,12 +624,12 @@ void dw_codelet_facto(starpu_data_handle dataA, unsigned nblocks)
 	gettimeofday(&end, NULL);
 	gettimeofday(&end, NULL);
 
 
 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
-	fprintf(stderr, "Computation took (in ms)\n");
-	printf("%2.2f\n", timing/1000);
+	FPRINTF(stderr, "Computation took (in ms)\n");
+	FPRINTF(stdout, "%2.2f\n", timing/1000);
 
 
 	unsigned n = starpu_matrix_get_nx(dataA);
 	unsigned n = starpu_matrix_get_nx(dataA);
 	double flop = (2.0f*n*n*n)/3.0f;
 	double flop = (2.0f*n*n*n)/3.0f;
-	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
 }
 }
 
 
 void dw_codelet_facto_v2(starpu_data_handle dataA, unsigned nblocks)
 void dw_codelet_facto_v2(starpu_data_handle dataA, unsigned nblocks)
@@ -666,7 +666,7 @@ void dw_codelet_facto_v2(starpu_data_handle dataA, unsigned nblocks)
 	int ret = starpu_task_submit(task);
 	int ret = starpu_task_submit(task);
 	if (STARPU_UNLIKELY(ret == -ENODEV))
 	if (STARPU_UNLIKELY(ret == -ENODEV))
 	{
 	{
-		fprintf(stderr, "No worker may execute this task\n");
+		FPRINTF(stderr, "No worker may execute this task\n");
 		exit(0);
 		exit(0);
 	}
 	}
 
 
@@ -681,12 +681,12 @@ void dw_codelet_facto_v2(starpu_data_handle dataA, unsigned nblocks)
 	gettimeofday(&end, NULL);
 	gettimeofday(&end, NULL);
 
 
 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
-	fprintf(stderr, "Computation took (in ms)\n");
-	printf("%2.2f\n", timing/1000);
+	FPRINTF(stderr, "Computation took (in ms)\n");
+	FPRINTF(stdout, "%2.2f\n", timing/1000);
 
 
 	unsigned n = starpu_matrix_get_nx(dataA);
 	unsigned n = starpu_matrix_get_nx(dataA);
 	double flop = (2.0f*n*n*n)/3.0f;
 	double flop = (2.0f*n*n*n)/3.0f;
-	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
 }
 }
 
 
 void initialize_system(float **A, float **B, unsigned dim, unsigned pinned)
 void initialize_system(float **A, float **B, unsigned dim, unsigned pinned)
@@ -697,8 +697,8 @@ void initialize_system(float **A, float **B, unsigned dim, unsigned pinned)
 
 
 	if (pinned)
 	if (pinned)
 	{
 	{
-		starpu_data_malloc_pinned_if_possible((void **)A, (size_t)dim*dim*sizeof(float));
-		starpu_data_malloc_pinned_if_possible((void **)B, (size_t)dim*sizeof(float));
+		starpu_malloc((void **)A, (size_t)dim*dim*sizeof(float));
+		starpu_malloc((void **)B, (size_t)dim*sizeof(float));
 	} 
 	} 
 	else {
 	else {
 		*A = malloc((size_t)dim*dim*sizeof(float));
 		*A = malloc((size_t)dim*dim*sizeof(float));
@@ -714,7 +714,7 @@ void dw_factoLU(float *matA, unsigned size,
 {
 {
 
 
 #ifdef CHECK_RESULTS
 #ifdef CHECK_RESULTS
-	fprintf(stderr, "Checking results ...\n");
+	FPRINTF(stderr, "Checking results ...\n");
 	float *Asaved;
 	float *Asaved;
 	Asaved = malloc((size_t)ld*ld*sizeof(float));
 	Asaved = malloc((size_t)ld*ld*sizeof(float));
 
 
@@ -730,17 +730,15 @@ void dw_factoLU(float *matA, unsigned size,
 	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, 
 	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, 
 			size, size, sizeof(float));
 			size, size, sizeof(float));
 
 
-	struct starpu_data_filter f;
-		f.filter_func = starpu_vertical_block_filter_func;
-		f.nchildren = nblocks;
-		f.get_nchildren = NULL;
-		f.get_child_ops = NULL;
-
-	struct starpu_data_filter f2;
-		f2.filter_func = starpu_block_filter_func;
-		f2.nchildren = nblocks;
-		f2.get_nchildren = NULL;
-		f2.get_child_ops = NULL;
+	struct starpu_data_filter f = {
+		.filter_func = starpu_vertical_block_filter_func,
+		.nchildren = nblocks
+	};
+
+	struct starpu_data_filter f2 = {
+		.filter_func = starpu_block_filter_func,
+		.nchildren = nblocks
+	};
 
 
 	starpu_data_map_filters(dataA, 2, &f, &f2);
 	starpu_data_map_filters(dataA, 2, &f, &f2);
 
 

+ 47 - 45
examples/heat/dw_factolu.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -36,6 +36,8 @@
 
 
 #include "lu_kernels_model.h"
 #include "lu_kernels_model.h"
 
 
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
 #define BLAS3_FLOP(n1,n2,n3)    \
 #define BLAS3_FLOP(n1,n2,n3)    \
         (2*((uint64_t)n1)*((uint64_t)n2)*((uint64_t)n3))
         (2*((uint64_t)n1)*((uint64_t)n2)*((uint64_t)n3))
 
 
@@ -82,53 +84,53 @@ static void __attribute__ ((unused)) compare_A_LU(float *A, float *LU,
 
 
 #if 0
 #if 0
 	/* display L */
 	/* display L */
-	printf("(LU): \n");
+	FPRINTF(stdout, "(LU): \n");
 	for (j = 0; j < size; j++)
 	for (j = 0; j < size; j++)
 	{
 	{
 		for (i = 0; i < size; i++)
 		for (i = 0; i < size; i++)
 		{
 		{
-//			if (i <= j) {
-				printf("%2.2f\t", LU[j +i*size]);
-//			}
-//			else {
-//				printf(".\t");
-//			}
+/*			if (i <= j) { */
+				FPRINTF(stdout, "%2.2f\t", LU[j +i*size]);
+/*			}
+			else {
+				FPRINTF(stdout, ".\t");
+			} */
 		}
 		}
-		printf("\n");
+		FPRINTF(stdout, "\n");
 	}
 	}
 
 
 
 
 
 
 	/* display L */
 	/* display L */
-	printf("L: \n");
+	FPRINTF(stdout, "L: \n");
 	for (j = 0; j < size; j++)
 	for (j = 0; j < size; j++)
 	{
 	{
 		for (i = 0; i < size; i++)
 		for (i = 0; i < size; i++)
 		{
 		{
-//			if (i <= j) {
-				printf("%2.2f\t", L[j +i*size]);
-//			}
-//			else {
-//				printf(".\t");
-//			}
+/*			if (i <= j) { */
+				FPRINTF(stdout, "%2.2f\t", L[j +i*size]);
+/*			}
+			else {
+				FPRINTF(stdout, ".\t");
+			} */
 		}
 		}
-		printf("\n");
+		FPRINTF(stdout, "\n");
 	}
 	}
 
 
 	/* display U */
 	/* display U */
-	printf("U: \n");
+	FPRINTF(stdout, "U: \n");
 	for (j = 0; j < size; j++)
 	for (j = 0; j < size; j++)
 	{
 	{
 		for (i = 0; i < size; i++)
 		for (i = 0; i < size; i++)
 		{
 		{
-//			if (i <= j) {
-				printf("%2.2f\t", U[j +i*size]);
-//			}
-//			else {
-//				printf(".\t");
-//			}
+/*			if (i <= j) { */
+				FPRINTF(stdout, "%2.2f\t", U[j +i*size]);
+/*			}
+			else {
+				FPRINTF(stdout, ".\t");
+			} */
 		}
 		}
-		printf("\n");
+		FPRINTF(stdout, "\n");
 	}
 	}
 
 
 #endif
 #endif
@@ -148,42 +150,42 @@ static void __attribute__ ((unused)) compare_A_LU(float *A, float *LU,
 
 
 #if 0
 #if 0
 	/* display A */
 	/* display A */
-	printf("A: \n");
+	FPRINTF(stdout, "A: \n");
 	for (j = 0; j < size; j++)
 	for (j = 0; j < size; j++)
 	{
 	{
 		for (i = 0; i < size; i++)
 		for (i = 0; i < size; i++)
 		{
 		{
-	//		if (i <= j) {
-	      			printf("%2.2f\t", A[j +i*size]);
-	//		}
-	//		else {
-	//			printf(".\t");
-	//		}
+	/*		if (i <= j) { */
+	      			FPRINTF(stdout, "%2.2f\t", A[j +i*size]);
+	/*		}
+			else {
+				FPRINTF(stdout, ".\t");
+			} */
 		}
 		}
-		printf("\n");
+		FPRINTF(stdout, "\n");
 	}
 	}
 
 
 
 
 	/* display LU */
 	/* display LU */
-	printf("LU: \n");
+	FPRINTF(stdout, "LU: \n");
 	for (j = 0; j < size; j++)
 	for (j = 0; j < size; j++)
 	{
 	{
 		for (i = 0; i < size; i++)
 		for (i = 0; i < size; i++)
 		{
 		{
-	//		if (i <= j) {
-	      			printf("%2.2f\t", L[j +i*size]);
-	//		}
-	//		else {
-	//			printf(".\t");
-	//		}
+	/*		if (i <= j) { */
+	      			FPRINTF(stdout, "%2.2f\t", L[j +i*size]);
+	/*		}
+			else {
+				FPRINTF(stdout, ".\t");
+			} */
 		}
 		}
-		printf("\n");
+		FPRINTF(stdout, "\n");
 	}
 	}
 #endif
 #endif
 
 
-	printf("max error between A and L*U = %f \n", max_err);
+	FPRINTF(stdout, "max error between A and L*U = %f \n", max_err);
 }
 }
-#endif // CHECK_RESULTS
+#endif /* CHECK_RESULTS */
 
 
 void dw_cpu_codelet_update_u11(void **, void *);
 void dw_cpu_codelet_update_u11(void **, void *);
 void dw_cpu_codelet_update_u12(void **, void *);
 void dw_cpu_codelet_update_u12(void **, void *);
@@ -211,4 +213,4 @@ extern struct starpu_perfmodel_t model_12;
 extern struct starpu_perfmodel_t model_21;
 extern struct starpu_perfmodel_t model_21;
 extern struct starpu_perfmodel_t model_22;
 extern struct starpu_perfmodel_t model_22;
 
 
-#endif // __DW_FACTO_LU_H__
+#endif /* __DW_FACTO_LU_H__ */

+ 24 - 26
examples/heat/dw_factolu_grain.c

@@ -1,8 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -54,7 +54,7 @@ static starpu_codelet cl11 = {
 
 
 static struct starpu_task *create_task_11(starpu_data_handle dataA, unsigned k, unsigned tag_prefix)
 static struct starpu_task *create_task_11(starpu_data_handle dataA, unsigned k, unsigned tag_prefix)
 {
 {
-//	printf("task 11 k = %d TAG = %llx\n", k, (TAG11(k)));
+/*	FPRINTF(stdout, "task 11 k = %d TAG = %llx\n", k, (TAG11(k))); */
 
 
 	struct starpu_task *task = create_task(TAG11(k, tag_prefix));
 	struct starpu_task *task = create_task(TAG11(k, tag_prefix));
 
 
@@ -87,7 +87,7 @@ static starpu_codelet cl12 = {
 
 
 static void create_task_12(starpu_data_handle dataA, unsigned k, unsigned i, unsigned tag_prefix)
 static void create_task_12(starpu_data_handle dataA, unsigned k, unsigned i, unsigned tag_prefix)
 {
 {
-//	printf("task 12 k,i = %d,%d TAG = %llx\n", k,i, TAG12(k,i));
+/*	FPRINTF(stdout, "task 12 k,i = %d,%d TAG = %llx\n", k,i, TAG12(k,i)); */
 
 
 	struct starpu_task *task = create_task(TAG12(k, i, tag_prefix));
 	struct starpu_task *task = create_task(TAG12(k, i, tag_prefix));
 	
 	
@@ -163,7 +163,7 @@ static starpu_codelet cl22 = {
 
 
 static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, unsigned j, unsigned tag_prefix)
 static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, unsigned j, unsigned tag_prefix)
 {
 {
-//	printf("task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j));
+/*	FPRINTF(stdout, "task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j)); */
 
 
 	struct starpu_task *task = create_task(TAG22(k, i, j, tag_prefix));
 	struct starpu_task *task = create_task(TAG22(k, i, j, tag_prefix));
 
 
@@ -207,17 +207,15 @@ static void dw_factoLU_grain_inner(float *matA, unsigned size, unsigned inner_si
 	unsigned nblocks = size / blocksize;
 	unsigned nblocks = size / blocksize;
 	unsigned maxk = inner_size / blocksize;
 	unsigned maxk = inner_size / blocksize;
 
 
-	struct starpu_data_filter f;
-		f.filter_func = starpu_vertical_block_filter_func;
-		f.nchildren = nblocks;
-		f.get_nchildren = NULL;
-		f.get_child_ops = NULL;
+	struct starpu_data_filter f = {
+		.filter_func = starpu_vertical_block_filter_func,
+		.nchildren = nblocks
+	};
 
 
-	struct starpu_data_filter f2;
-		f2.filter_func = starpu_block_filter_func;
-		f2.nchildren = nblocks;
-		f2.get_nchildren = NULL;
-		f2.get_child_ops = NULL;
+	struct starpu_data_filter f2 = {
+		.filter_func = starpu_block_filter_func,
+		.nchildren = nblocks
+	};
 
 
 	starpu_data_map_filters(dataA, 2, &f, &f2);
 	starpu_data_map_filters(dataA, 2, &f, &f2);
 
 
@@ -262,7 +260,7 @@ static void dw_factoLU_grain_inner(float *matA, unsigned size, unsigned inner_si
 	int ret = starpu_task_submit(entry_task);
 	int ret = starpu_task_submit(entry_task);
 	if (STARPU_UNLIKELY(ret == -ENODEV))
 	if (STARPU_UNLIKELY(ret == -ENODEV))
 	{
 	{
-		fprintf(stderr, "No worker may execute this task\n");
+		FPRINTF(stderr, "No worker may execute this task\n");
 		exit(-1);
 		exit(-1);
 	}
 	}
 
 
@@ -299,13 +297,13 @@ static void dw_factoLU_grain_inner(float *matA, unsigned size, unsigned inner_si
 
 
 		float *newmatA = &matA[inner_size*(ld+1)];
 		float *newmatA = &matA[inner_size*(ld+1)];
 
 
-//		if (tag_prefix < 2)
-//		{
-//			dw_factoLU_grain_inner(newmatA, size-inner_size, (size-inner_size)/2, ld, blocksize/2, tag_prefix+1);
-//		}
-//		else {
+/*		if (tag_prefix < 2)
+		{
+			dw_factoLU_grain_inner(newmatA, size-inner_size, (size-inner_size)/2, ld, blocksize/2, tag_prefix+1);
+		}
+		else { */
 			dw_factoLU_grain_inner(newmatA, size-inner_size, size-inner_size, ld, blocksize/2, tag_prefix+1);
 			dw_factoLU_grain_inner(newmatA, size-inner_size, size-inner_size, ld, blocksize/2, tag_prefix+1);
-//		}
+/*		} */
 	}
 	}
 
 
 }
 }
@@ -314,7 +312,7 @@ void dw_factoLU_grain(float *matA, unsigned size, unsigned ld, unsigned nblocks,
 {
 {
 
 
 #ifdef CHECK_RESULTS
 #ifdef CHECK_RESULTS
-	fprintf(stderr, "Checking results ...\n");
+	FPRINTF(stderr, "Checking results ...\n");
 	float *Asaved;
 	float *Asaved;
 	Asaved = malloc(ld*ld*sizeof(float));
 	Asaved = malloc(ld*ld*sizeof(float));
 
 
@@ -333,12 +331,12 @@ void dw_factoLU_grain(float *matA, unsigned size, unsigned ld, unsigned nblocks,
 	gettimeofday(&end, NULL);
 	gettimeofday(&end, NULL);
 
 
 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
-	fprintf(stderr, "Computation took (in ms)\n");
-	printf("%2.2f\n", timing/1000);
+	FPRINTF(stderr, "Computation took (in ms)\n");
+	FPRINTF(stdout, "%2.2f\n", timing/1000);
 
 
 	unsigned n = size;
 	unsigned n = size;
 	double flop = (2.0f*n*n*n)/3.0f;
 	double flop = (2.0f*n*n*n)/3.0f;
-	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
 
 
 #ifdef CHECK_RESULTS
 #ifdef CHECK_RESULTS
 	compare_A_LU(Asaved, matA, size, ld);
 	compare_A_LU(Asaved, matA, size, ld);

+ 17 - 17
examples/heat/dw_factolu_kernels.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -33,7 +33,7 @@ void display_stat_heat(void)
 {
 {
 	unsigned nworkers = starpu_worker_get_count();
 	unsigned nworkers = starpu_worker_get_count();
 
 
-	fprintf(stderr, "STATS : \n");
+	FPRINTF(stderr, "STATS : \n");
 
 
 	unsigned worker;
 	unsigned worker;
 	for (worker = 0; worker < nworkers; worker++)
 	for (worker = 0; worker < nworkers; worker++)
@@ -49,7 +49,7 @@ void display_stat_heat(void)
 		count_22_total += count_22_per_worker[worker];
 		count_22_total += count_22_per_worker[worker];
 	}
 	}
 
 
-	fprintf(stderr, "\t11 (diagonal block LU)\n");
+	FPRINTF(stderr, "\t11 (diagonal block LU)\n");
 	for (worker = 0; worker < nworkers; worker++)
 	for (worker = 0; worker < nworkers; worker++)
 	{
 	{
 		if (count_total_per_worker[worker])
 		if (count_total_per_worker[worker])
@@ -57,11 +57,11 @@ void display_stat_heat(void)
 			char name[32];
 			char name[32];
 			starpu_worker_get_name(worker, name, 32);
 			starpu_worker_get_name(worker, name, 32);
 			
 			
-			fprintf(stderr, "\t\t%s -> %d / %d (%2.2f %%)\n", name, count_11_per_worker[worker], count_11_total, (100.0*count_11_per_worker[worker])/count_11_total);
+			FPRINTF(stderr, "\t\t%s -> %u / %u (%2.2f %%)\n", name, count_11_per_worker[worker], count_11_total, (100.0*count_11_per_worker[worker])/count_11_total);
 		}
 		}
 	}
 	}
 
 
-	fprintf(stderr, "\t12 (TRSM)\n");
+	FPRINTF(stderr, "\t12 (TRSM)\n");
 	for (worker = 0; worker < nworkers; worker++)
 	for (worker = 0; worker < nworkers; worker++)
 	{
 	{
 		if (count_total_per_worker[worker])
 		if (count_total_per_worker[worker])
@@ -69,12 +69,12 @@ void display_stat_heat(void)
 			char name[32];
 			char name[32];
 			starpu_worker_get_name(worker, name, 32);
 			starpu_worker_get_name(worker, name, 32);
 			
 			
-			fprintf(stderr, "\t\t%s -> %d / %d (%2.2f %%)\n", name, count_12_per_worker[worker], count_12_total, (100.0*count_12_per_worker[worker])/count_12_total);
+			FPRINTF(stderr, "\t\t%s -> %u / %u (%2.2f %%)\n", name, count_12_per_worker[worker], count_12_total, (100.0*count_12_per_worker[worker])/count_12_total);
 		}
 		}
 	}
 	}
 	
 	
 	
 	
-	fprintf(stderr, "\t21 (TRSM)\n");
+	FPRINTF(stderr, "\t21 (TRSM)\n");
 	for (worker = 0; worker < nworkers; worker++)
 	for (worker = 0; worker < nworkers; worker++)
 	{
 	{
 		if (count_total_per_worker[worker])
 		if (count_total_per_worker[worker])
@@ -82,11 +82,11 @@ void display_stat_heat(void)
 			char name[32];
 			char name[32];
 			starpu_worker_get_name(worker, name, 32);
 			starpu_worker_get_name(worker, name, 32);
 			
 			
-			fprintf(stderr, "\t\t%s -> %d / %d (%2.2f %%)\n", name, count_21_per_worker[worker], count_21_total, (100.0*count_21_per_worker[worker])/count_21_total);
+			FPRINTF(stderr, "\t\t%s -> %u / %u (%2.2f %%)\n", name, count_21_per_worker[worker], count_21_total, (100.0*count_21_per_worker[worker])/count_21_total);
 		}
 		}
 	}
 	}
 	
 	
-	fprintf(stderr, "\t22 (SGEMM)\n");
+	FPRINTF(stderr, "\t22 (SGEMM)\n");
 	for (worker = 0; worker < nworkers; worker++)
 	for (worker = 0; worker < nworkers; worker++)
 	{
 	{
 		if (count_total_per_worker[worker])
 		if (count_total_per_worker[worker])
@@ -94,7 +94,7 @@ void display_stat_heat(void)
 			char name[32];
 			char name[32];
 			starpu_worker_get_name(worker, name, 32);
 			starpu_worker_get_name(worker, name, 32);
 			
 			
-			fprintf(stderr, "\t\t%s -> %d / %d (%2.2f %%)\n", name, count_22_per_worker[worker], count_22_total, (100.0*count_22_per_worker[worker])/count_22_total);
+			FPRINTF(stderr, "\t\t%s -> %u / %u (%2.2f %%)\n", name, count_22_per_worker[worker], count_22_total, (100.0*count_22_per_worker[worker])/count_22_total);
 		}
 		}
 	}
 	}
 }
 }
@@ -162,7 +162,7 @@ void dw_cublas_codelet_update_u22(void *descr[], void *_args)
 	int id = starpu_worker_get_id();
 	int id = starpu_worker_get_id();
 	count_22_per_worker[id]++;
 	count_22_per_worker[id]++;
 }
 }
-#endif// STARPU_USE_CUDA
+#endif /* STARPU_USE_CUDA */
 
 
 /*
 /*
  * U12
  * U12
@@ -225,7 +225,7 @@ void dw_cublas_codelet_update_u12(void *descr[], void *_args)
 	int id = starpu_worker_get_id();
 	int id = starpu_worker_get_id();
 	count_12_per_worker[id]++;
 	count_12_per_worker[id]++;
 }
 }
-#endif // STARPU_USE_CUDA
+#endif /* STARPU_USE_CUDA */
 
 
 /* 
 /* 
  * U21
  * U21
@@ -298,12 +298,12 @@ static inline void debug_print(float *tab, unsigned ld, unsigned n)
 	{
 	{
 		for (i = 0; i < n; i++)
 		for (i = 0; i < n; i++)
 		{
 		{
-			fprintf(stderr, "%2.2f\t", tab[(size_t)j+(size_t)i*ld]);
+			FPRINTF(stderr, "%2.2f\t", tab[(size_t)j+(size_t)i*ld]);
 		}
 		}
-		fprintf(stderr, "\n");
+		FPRINTF(stderr, "\n");
 	}
 	}
 	
 	
-	fprintf(stderr, "\n");
+	FPRINTF(stderr, "\n");
 }
 }
 
 
 static inline void dw_common_codelet_update_u11(void *descr[], int s, __attribute__((unused)) void *_args) 
 static inline void dw_common_codelet_update_u11(void *descr[], int s, __attribute__((unused)) void *_args) 
@@ -378,4 +378,4 @@ void dw_cublas_codelet_update_u11(void *descr[], void *_args)
 	int id = starpu_worker_get_id();
 	int id = starpu_worker_get_id();
 	count_11_per_worker[id]++;
 	count_11_per_worker[id]++;
 }
 }
-#endif// STARPU_USE_CUDA
+#endif /* STARPU_USE_CUDA */

+ 18 - 20
examples/heat/dw_factolu_tag.c

@@ -1,8 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -56,7 +56,7 @@ static starpu_codelet cl11 = {
 
 
 static struct starpu_task *create_task_11(starpu_data_handle dataA, unsigned k)
 static struct starpu_task *create_task_11(starpu_data_handle dataA, unsigned k)
 {
 {
-//	printf("task 11 k = %d TAG = %llx\n", k, (TAG11(k)));
+/*	printf("task 11 k = %d TAG = %llx\n", k, (TAG11(k))); */
 
 
 	struct starpu_task *task = create_task(TAG11(k));
 	struct starpu_task *task = create_task(TAG11(k));
 
 
@@ -90,7 +90,7 @@ static starpu_codelet cl12 = {
 
 
 static void create_task_12(starpu_data_handle dataA, unsigned k, unsigned i)
 static void create_task_12(starpu_data_handle dataA, unsigned k, unsigned i)
 {
 {
-//	printf("task 12 k,i = %d,%d TAG = %llx\n", k,i, TAG12(k,i));
+/*	printf("task 12 k,i = %d,%d TAG = %llx\n", k,i, TAG12(k,i)); */
 
 
 	struct starpu_task *task = create_task(TAG12(k, i));
 	struct starpu_task *task = create_task(TAG12(k, i));
 	
 	
@@ -166,7 +166,7 @@ static starpu_codelet cl22 = {
 
 
 static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, unsigned j)
 static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, unsigned j)
 {
 {
-//	printf("task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j));
+/*	printf("task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j)); */
 
 
 	struct starpu_task *task = create_task(TAG22(k, i, j));
 	struct starpu_task *task = create_task(TAG22(k, i, j));
 
 
@@ -241,7 +241,7 @@ static void dw_codelet_facto_v3(starpu_data_handle dataA, unsigned nblocks)
 	int ret = starpu_task_submit(entry_task);
 	int ret = starpu_task_submit(entry_task);
 	if (STARPU_UNLIKELY(ret == -ENODEV))
 	if (STARPU_UNLIKELY(ret == -ENODEV))
 	{
 	{
-		fprintf(stderr, "No worker may execute this task\n");
+		FPRINTF(stderr, "No worker may execute this task\n");
 		exit(-1);
 		exit(-1);
 	}
 	}
 
 
@@ -253,19 +253,19 @@ static void dw_codelet_facto_v3(starpu_data_handle dataA, unsigned nblocks)
 	gettimeofday(&end, NULL);
 	gettimeofday(&end, NULL);
 
 
 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
-	fprintf(stderr, "Computation took (in ms)\n");
+	FPRINTF(stderr, "Computation took (in ms)\n");
 	printf("%2.2f\n", timing/1000);
 	printf("%2.2f\n", timing/1000);
 
 
 	unsigned n = starpu_matrix_get_nx(dataA);
 	unsigned n = starpu_matrix_get_nx(dataA);
 	double flop = (2.0f*n*n*n)/3.0f;
 	double flop = (2.0f*n*n*n)/3.0f;
-	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
 }
 }
 
 
 void dw_factoLU_tag(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned _no_prio)
 void dw_factoLU_tag(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned _no_prio)
 {
 {
 
 
 #ifdef CHECK_RESULTS
 #ifdef CHECK_RESULTS
-	fprintf(stderr, "Checking results ...\n");
+	FPRINTF(stderr, "Checking results ...\n");
 	float *Asaved;
 	float *Asaved;
 	Asaved = malloc((size_t)ld*ld*sizeof(float));
 	Asaved = malloc((size_t)ld*ld*sizeof(float));
 
 
@@ -280,17 +280,15 @@ void dw_factoLU_tag(float *matA, unsigned size, unsigned ld, unsigned nblocks, u
 	 * one block is now determined by 2 unsigned (i,j) */
 	 * one block is now determined by 2 unsigned (i,j) */
 	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(float));
 	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(float));
 
 
-	struct starpu_data_filter f;
-		f.filter_func = starpu_vertical_block_filter_func;
-		f.nchildren = nblocks;
-		f.get_nchildren = NULL;
-		f.get_child_ops = NULL;
-
-	struct starpu_data_filter f2;
-		f2.filter_func = starpu_block_filter_func;
-		f2.nchildren = nblocks;
-		f2.get_nchildren = NULL;
-		f2.get_child_ops = NULL;
+	struct starpu_data_filter f = {
+		.filter_func = starpu_vertical_block_filter_func,
+		.nchildren = nblocks
+	};
+
+	struct starpu_data_filter f2 = {
+		.filter_func = starpu_block_filter_func,
+		.nchildren = nblocks
+	};
 
 
 	starpu_data_map_filters(dataA, 2, &f, &f2);
 	starpu_data_map_filters(dataA, 2, &f, &f2);
 
 

+ 7 - 6
examples/heat/dw_sparse_cg.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -20,6 +20,7 @@
  */
  */
 
 
 #include "dw_sparse_cg.h"
 #include "dw_sparse_cg.h"
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
 
 
 static struct starpu_task *create_task(starpu_tag_t id)
 static struct starpu_task *create_task(starpu_tag_t id)
 {
 {
@@ -298,13 +299,13 @@ void iteration_cg(void *problem)
 {
 {
 	struct cg_problem *pb = problem;
 	struct cg_problem *pb = problem;
 
 
-	printf("i : %d (MAX %d)\n\tdelta_new %f (%f)\n", pb->i, MAXITER, pb->delta_new, sqrt(pb->delta_new / pb->size));
+	FPRINTF(stdout, "i : %d (MAX %d)\n\tdelta_new %f (%f)\n", pb->i, MAXITER, pb->delta_new, sqrt(pb->delta_new / pb->size));
 
 
 	if ((pb->i < MAXITER) && 
 	if ((pb->i < MAXITER) && 
 		(pb->delta_new > pb->epsilon) )
 		(pb->delta_new > pb->epsilon) )
 	{
 	{
 		if (pb->i % 1000 == 0)
 		if (pb->i % 1000 == 0)
-			printf("i : %d\n\tdelta_new %f (%f)\n", pb->i, pb->delta_new, sqrt(pb->delta_new / pb->size));
+			FPRINTF(stdout, "i : %d\n\tdelta_new %f (%f)\n", pb->i, pb->delta_new, sqrt(pb->delta_new / pb->size));
 
 
 		pb->i++;
 		pb->i++;
 
 
@@ -313,8 +314,8 @@ void iteration_cg(void *problem)
 	}
 	}
 	else {
 	else {
 		/* we may stop */
 		/* we may stop */
-		printf("We are done ... after %d iterations \n", pb->i - 1);
-		printf("i : %d\n\tdelta_new %2.5f\n", pb->i, pb->delta_new);
+		FPRINTF(stdout, "We are done ... after %d iterations \n", pb->i - 1);
+		FPRINTF(stdout, "i : %d\n\tdelta_new %2.5f\n", pb->i, pb->delta_new);
 		sem_post(pb->sem);
 		sem_post(pb->sem);
 	}
 	}
 }
 }
@@ -353,7 +354,7 @@ void conjugate_gradient(float *nzvalA, float *vecb, float *vecx, uint32_t nnz,
 		ptr_vecq[i] = 0.0f;
 		ptr_vecq[i] = 0.0f;
 	}
 	}
 
 
-	printf("nrow = %d \n", nrow);
+	FPRINTF(stdout, "nrow = %u \n", nrow);
 
 
 	/* and register them as well */
 	/* and register them as well */
 	starpu_vector_data_register(&ds_vecr, 0, (uintptr_t)ptr_vecr, nrow, sizeof(float));
 	starpu_vector_data_register(&ds_vecr, 0, (uintptr_t)ptr_vecr, nrow, sizeof(float));

+ 3 - 3
examples/heat/dw_sparse_cg.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -101,7 +101,7 @@ static void __attribute__ ((unused)) print_results(float *result, unsigned size)
 
 
 	for (i = 0; i < STARPU_MIN(size, 16); i++)
 	for (i = 0; i < STARPU_MIN(size, 16); i++)
 	{
 	{
-		printf("%d -> %f\n", i, result[i]);
+		printf("%u -> %f\n", i, result[i]);
 	}
 	}
 }
 }
 
 
@@ -134,4 +134,4 @@ void iteration_cg(void *problem);
 void conjugate_gradient(float *nzvalA, float *vecb, float *vecx, uint32_t nnz,
 void conjugate_gradient(float *nzvalA, float *vecb, float *vecx, uint32_t nnz,
 			unsigned nrow, uint32_t *colind, uint32_t *rowptr);
 			unsigned nrow, uint32_t *colind, uint32_t *rowptr);
 
 
-#endif // __DW_SPARSE_CG_H__
+#endif /* __DW_SPARSE_CG_H__ */

+ 0 - 4
examples/heat/dw_sparse_cg_kernels.c

@@ -64,10 +64,8 @@ void cpu_codelet_func_1(void *descr[], __attribute__((unused)) void *arg)
 	float *vecb = (float *)STARPU_VECTOR_GET_PTR(descr[3]);
 	float *vecb = (float *)STARPU_VECTOR_GET_PTR(descr[3]);
 
 
 
 
-	uint32_t nnz;
 	uint32_t nrow;
 	uint32_t nrow;
 
 
-	nnz = STARPU_CSR_GET_NNZ(descr[0]);
 	nrow = STARPU_CSR_GET_NROW(descr[0]);
 	nrow = STARPU_CSR_GET_NROW(descr[0]);
 
 
 	unsigned row;
 	unsigned row;
@@ -173,10 +171,8 @@ void cpu_codelet_func_4(void *descr[], __attribute__((unused)) void *arg)
 	float *vecd = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
 	float *vecd = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
 	float *vecq = (float *)STARPU_VECTOR_GET_PTR(descr[2]);
 	float *vecq = (float *)STARPU_VECTOR_GET_PTR(descr[2]);
 
 
-	uint32_t nnz;
 	uint32_t nrow;
 	uint32_t nrow;
 
 
-	nnz = STARPU_CSR_GET_NNZ(descr[0]);
 	nrow = STARPU_CSR_GET_NROW(descr[0]);
 	nrow = STARPU_CSR_GET_NROW(descr[0]);
 
 
 	unsigned row;
 	unsigned row;

+ 10 - 10
examples/heat/heat.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -321,7 +321,7 @@ static void solve_system(unsigned size, unsigned subsize, float *result, int *Re
 	/* solve the actual problem LU X = B */
 	/* solve the actual problem LU X = B */
         /* solve LX' = Y with X' = UX */
         /* solve LX' = Y with X' = UX */
         /* solve UX = X' */
         /* solve UX = X' */
-	fprintf(stderr, "Solving the problem ...\n");
+	FPRINTF(stderr, "Solving the problem ...\n");
 
 
 	float *savedB;
 	float *savedB;
 	float *LUB;
 	float *LUB;
@@ -360,10 +360,10 @@ static void solve_system(unsigned size, unsigned subsize, float *result, int *Re
 	
 	
 		/* check if LUB is close to the 0 vector */
 		/* check if LUB is close to the 0 vector */
 		int maxind = ISAMAX(subsize, LUB, 1);
 		int maxind = ISAMAX(subsize, LUB, 1);
-		fprintf(stderr, "max error (LUX - B) = %e\n",LUB[maxind - 1]);
+		FPRINTF(stderr, "max error (LUX - B) = %e\n",LUB[maxind - 1]);
 
 
 		float sum = SASUM(subsize, LUB, 1);
 		float sum = SASUM(subsize, LUB, 1);
-		fprintf(stderr,"avg. error %e\n", sum/subsize);
+		FPRINTF(stderr,"avg. error %e\n", sum/subsize);
 	
 	
 		free(LUB);
 		free(LUB);
 		free(savedB);
 		free(savedB);
@@ -494,10 +494,10 @@ static unsigned long build_neighbour_vector(unsigned long*neighbours, unsigned n
 				if ((former_theta + dtheta) >= 0 && (former_theta + dtheta) <= (int)ntheta )
 				if ((former_theta + dtheta) >= 0 && (former_theta + dtheta) <= (int)ntheta )
 				{
 				{
 					/* we got a possible neighbour */
 					/* we got a possible neighbour */
-					unsigned node = 
+					unsigned pnode = 
 						NODE_NUMBER((former_theta + dtheta), (former_thick + dthick));
 						NODE_NUMBER((former_theta + dtheta), (former_thick + dthick));
 
 
-					neighbours[nneighbours++] = TRANSLATEBACK(node);
+					neighbours[nneighbours++] = TRANSLATEBACK(pnode);
 				}
 				}
 			}
 			}
 		}
 		}
@@ -569,10 +569,10 @@ static void build_sparse_stiffness_matrix_B(point *pmesh, float *B, float *Bform
 
 
 		for (neighbour = 0; neighbour < nneighbours; neighbour++)
 		for (neighbour = 0; neighbour < nneighbours; neighbour++)
 		{
 		{
-			unsigned i = neighbours[neighbour]; 
-			if (i >= newsize)
+			unsigned n = neighbours[neighbour]; 
+			if (n >= newsize)
 			{
 			{
-				B[j] -= compute_A_value(TRANSLATE(i), TRANSLATE(j), pmesh)*Bformer[TRANSLATE(i)];
+				B[j] -= compute_A_value(TRANSLATE(n), TRANSLATE(j), pmesh)*Bformer[TRANSLATE(n)];
 			}
 			}
 		}
 		}
 	}
 	}
@@ -729,7 +729,7 @@ int main(int argc, char **argv)
 
 
 		build_dense_stiffness_matrix_A(pmesh, A, newsize, RefArray, RefArrayBack);
 		build_dense_stiffness_matrix_A(pmesh, A, newsize, RefArray, RefArrayBack);
 
 
-		fprintf(stderr, "Problem size : %dx%d (%dx%d) (%ld MB)\n", newsize, newsize, DIM, DIM, ((unsigned long)newsize*newsize*4UL)/(1024*1024));
+		FPRINTF(stderr, "Problem size : %ux%u (%ux%u) (%lu MB)\n", newsize, newsize, DIM, DIM, ((unsigned long)newsize*newsize*4UL)/(1024*1024));
 
 
 		STARPU_ASSERT(newsize % nblocks == 0);
 		STARPU_ASSERT(newsize % nblocks == 0);
 
 

+ 5 - 3
examples/heat/heat.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -24,7 +24,7 @@
 #include <assert.h>
 #include <assert.h>
 #include <math.h>
 #include <math.h>
 
 
-// needed for STARPU_OPENGL_RENDER
+/* needed for STARPU_OPENGL_RENDER */
 #include <starpu_config.h>
 #include <starpu_config.h>
 #include <starpu.h>
 #include <starpu.h>
 
 
@@ -36,6 +36,8 @@
 #include <GL/glut.h>
 #include <GL/glut.h>
 #endif
 #endif
 
 
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
 #define X	0
 #define X	0
 #define Y	1
 #define Y	1
 
 
@@ -66,4 +68,4 @@ void display_stat_heat(void);
 extern void opengl_render(unsigned _ntheta, unsigned _nthick, float *_result, point *_pmesh, int argc_, char **argv_);
 extern void opengl_render(unsigned _ntheta, unsigned _nthick, float *_result, point *_pmesh, int argc_, char **argv_);
 #endif
 #endif
 
 
-#endif // __HEAT_H__
+#endif /* __HEAT_H__ */

+ 6 - 6
examples/heat/heat_display.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -133,8 +133,8 @@ static void display(void)
 	float factor = 1.0/amplitude;
 	float factor = 1.0/amplitude;
 	glScalef (factor, factor, factor);      /* modeling transformation */
 	glScalef (factor, factor, factor);      /* modeling transformation */
 	gluLookAt (xcenter, ycenter, 30.0f, xcenter, ycenter, 0.0f, 0.0f, 1.0f, 0.0f);
 	gluLookAt (xcenter, ycenter, 30.0f, xcenter, ycenter, 0.0f, 0.0f, 1.0f, 0.0f);
-//	printf("factor %f\n", factor);
-	//   glRotatef(-0,0.0,0.0,0.0);
+/*	printf("factor %f\n", factor);
+	   glRotatef(-0,0.0,0.0,0.0); */
 	generate_graph();
 	generate_graph();
 	glFlush ();
 	glFlush ();
 }
 }
@@ -211,7 +211,7 @@ void find_limits(void)
 
 
 void opengl_render(unsigned _ntheta, unsigned _nthick, float *_result, point *_pmesh, int argc_, char **argv_)
 void opengl_render(unsigned _ntheta, unsigned _nthick, float *_result, point *_pmesh, int argc_, char **argv_)
 {
 {
-	fprintf(stderr, "OpenGL rendering ... \n");
+	FPRINTF(stderr, "OpenGL rendering ... \n");
 
 
 	ntheta = _ntheta;
 	ntheta = _ntheta;
 	nthick = _nthick;
 	nthick = _nthick;
@@ -236,4 +236,4 @@ void opengl_render(unsigned _ntheta, unsigned _nthick, float *_result, point *_p
 	glutReshapeFunc(reshape);
 	glutReshapeFunc(reshape);
 	glutMainLoop();
 	glutMainLoop();
 }
 }
-#endif // STARPU_OPENGL_RENDER
+#endif /* STARPU_OPENGL_RENDER */

+ 14 - 14
examples/heat/lu_kernels_model.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -26,7 +26,7 @@
  *	Number of flops of Gemm 
  *	Number of flops of Gemm 
  */
  */
 
 
-//#define USE_PERTURBATION	1
+/* #define USE_PERTURBATION	1 */
 
 
 
 
 #ifdef USE_PERTURBATION
 #ifdef USE_PERTURBATION
@@ -58,10 +58,10 @@ double task_12_cost(starpu_buffer_descr *descr)
 
 
 	n = starpu_matrix_get_nx(descr[0].handle);
 	n = starpu_matrix_get_nx(descr[0].handle);
 
 
-//	double cost = ((n*n*n)/1744.695);
+/*	double cost = ((n*n*n)/1744.695); */
 	double cost = ((n*n*n)/3210.80);
 	double cost = ((n*n*n)/3210.80);
 
 
-	//fprintf(stderr, "task 12 predicts %e\n", cost);
+	/* fprintf(stderr, "task 12 predicts %e\n", cost); */
 	return PERTURBATE(cost);
 	return PERTURBATE(cost);
 }
 }
 
 
@@ -72,10 +72,10 @@ double task_21_cost(starpu_buffer_descr *descr)
 
 
 	n = starpu_matrix_get_nx(descr[0].handle);
 	n = starpu_matrix_get_nx(descr[0].handle);
 
 
-//	double cost = ((n*n*n)/1744.695);
+/*	double cost = ((n*n*n)/1744.695); */
 	double cost = ((n*n*n)/3691.53);
 	double cost = ((n*n*n)/3691.53);
 
 
-	//fprintf(stderr, "task 12 predicts %e\n", cost);
+	/* fprintf(stderr, "task 12 predicts %e\n", cost); */
 	return PERTURBATE(cost);
 	return PERTURBATE(cost);
 }
 }
 
 
@@ -109,7 +109,7 @@ double task_11_cost_cuda(starpu_buffer_descr *descr)
 
 
 	double cost = ((n*n*n)/1853.7806);
 	double cost = ((n*n*n)/1853.7806);
 
 
-//	printf("CUDA task 11 ; predict %e\n", cost);
+/*	printf("CUDA task 11 ; predict %e\n", cost); */
 	return PERTURBATE(cost);
 	return PERTURBATE(cost);
 }
 }
 
 
@@ -121,7 +121,7 @@ double task_12_cost_cuda(starpu_buffer_descr *descr)
 
 
 	double cost = ((n*n*n)/42838.5718);
 	double cost = ((n*n*n)/42838.5718);
 
 
-//	printf("CUDA task 12 ; predict %e\n", cost);
+/*	printf("CUDA task 12 ; predict %e\n", cost); */
 	return PERTURBATE(cost);
 	return PERTURBATE(cost);
 }
 }
 
 
@@ -134,7 +134,7 @@ double task_21_cost_cuda(starpu_buffer_descr *descr)
 
 
 	double cost = ((n*n*n)/49208.667);
 	double cost = ((n*n*n)/49208.667);
 
 
-//	printf("CUDA task 21 ; predict %e\n", cost);
+/*	printf("CUDA task 21 ; predict %e\n", cost); */
 	return PERTURBATE(cost);
 	return PERTURBATE(cost);
 }
 }
 
 
@@ -150,7 +150,7 @@ double task_22_cost_cuda(starpu_buffer_descr *descr)
 
 
 	double cost = ((nx*ny*nz)/57523.560);
 	double cost = ((nx*ny*nz)/57523.560);
 
 
-//	printf("CUDA task 22 ; predict %e\n", cost);
+/*	printf("CUDA task 22 ; predict %e\n", cost); */
 	return PERTURBATE(cost);
 	return PERTURBATE(cost);
 }
 }
 
 
@@ -168,7 +168,7 @@ double task_11_cost_cpu(starpu_buffer_descr *descr)
 
 
 	double cost = ((n*n*n)/537.5);
 	double cost = ((n*n*n)/537.5);
 
 
-//	printf("CPU task 11 ; predict %e\n", cost);
+/*	printf("CPU task 11 ; predict %e\n", cost); */
 	return PERTURBATE(cost);
 	return PERTURBATE(cost);
 }
 }
 
 
@@ -180,7 +180,7 @@ double task_12_cost_cpu(starpu_buffer_descr *descr)
 
 
 	double cost = ((n*n*n)/6668.224);
 	double cost = ((n*n*n)/6668.224);
 
 
-//	printf("CPU task 12 ; predict %e\n", cost);
+/*	printf("CPU task 12 ; predict %e\n", cost); */
 	return PERTURBATE(cost);
 	return PERTURBATE(cost);
 }
 }
 
 
@@ -193,7 +193,7 @@ double task_21_cost_cpu(starpu_buffer_descr *descr)
 
 
 	double cost = ((n*n*n)/6793.8423);
 	double cost = ((n*n*n)/6793.8423);
 
 
-//	printf("CPU task 21 ; predict %e\n", cost);
+/*	printf("CPU task 21 ; predict %e\n", cost); */
 	return PERTURBATE(cost);
 	return PERTURBATE(cost);
 }
 }
 
 
@@ -209,7 +209,7 @@ double task_22_cost_cpu(starpu_buffer_descr *descr)
 
 
 	double cost = ((nx*ny*nz)/4203.0175);
 	double cost = ((nx*ny*nz)/4203.0175);
 
 
-//	printf("CPU task 22 ; predict %e\n", cost);
+/*	printf("CPU task 22 ; predict %e\n", cost); */
 	return PERTURBATE(cost);
 	return PERTURBATE(cost);
 }
 }
 
 

+ 2 - 2
examples/heat/lu_kernels_model.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -20,4 +20,4 @@
 
 
 #include <starpu.h>
 #include <starpu.h>
 
 
-#endif // __LU_KERNELS_MODEL_H__
+#endif /* __LU_KERNELS_MODEL_H__ */

+ 13 - 9
examples/incrementer/incrementer.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -20,6 +20,7 @@
 #include <sys/time.h>
 #include <sys/time.h>
 
 
 static unsigned niter = 50000;
 static unsigned niter = 50000;
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 extern void cuda_codelet(void *descr[], __attribute__ ((unused)) void *_args);
 extern void cuda_codelet(void *descr[], __attribute__ ((unused)) void *_args);
@@ -42,6 +43,9 @@ int main(int argc, char **argv)
 {
 {
 	starpu_init(NULL);
 	starpu_init(NULL);
 
 
+#ifdef STARPU_SLOW_MACHINE
+	niter /= 100;
+#endif
 	if (argc == 2)
 	if (argc == 2)
 		niter = atoi(argv[1]);
 		niter = atoi(argv[1]);
 
 
@@ -52,7 +56,7 @@ int main(int argc, char **argv)
 			(uintptr_t)&float_array, 4, sizeof(float));
 			(uintptr_t)&float_array, 4, sizeof(float));
 
 
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
-        starpu_opencl_load_opencl_from_file("examples/incrementer/incrementer_kernels_opencl_kernel.cl", &opencl_program);
+        starpu_opencl_load_opencl_from_file("examples/incrementer/incrementer_kernels_opencl_kernel.cl", &opencl_program, NULL);
 #endif
 #endif
 
 
 	starpu_codelet cl =
 	starpu_codelet cl =
@@ -88,7 +92,7 @@ int main(int argc, char **argv)
 		int ret = starpu_task_submit(task);
 		int ret = starpu_task_submit(task);
 		if (STARPU_UNLIKELY(ret == -ENODEV))
 		if (STARPU_UNLIKELY(ret == -ENODEV))
 		{
 		{
-			fprintf(stderr, "No worker may execute this task\n");
+			FPRINTF(stderr, "No worker may execute this task\n");
 			exit(0);
 			exit(0);
 		}
 		}
 	}
 	}
@@ -96,24 +100,24 @@ int main(int argc, char **argv)
 	starpu_task_wait_for_all();
 	starpu_task_wait_for_all();
 
 
 	/* update the array in RAM */
 	/* update the array in RAM */
-	starpu_data_acquire(float_array_handle, STARPU_R);
+	starpu_data_unregister(float_array_handle);
 
 
 	gettimeofday(&end, NULL);
 	gettimeofday(&end, NULL);
 
 
-	fprintf(stderr, "array -> %f, %f, %f, %f\n", float_array[0],
+	FPRINTF(stderr, "array -> %f, %f, %f, %f\n", float_array[0],
                 float_array[1], float_array[2], float_array[3]);
                 float_array[1], float_array[2], float_array[3]);
 
 
+	STARPU_ASSERT(float_array[0] == niter);
+
 	if (float_array[0] != float_array[1] + float_array[2] + float_array[3]) {
 	if (float_array[0] != float_array[1] + float_array[2] + float_array[3]) {
-		fprintf(stderr, "Incorrect result\n");
+		FPRINTF(stderr, "Incorrect result\n");
 		return 1;
 		return 1;
 	}
 	}
 
 
-	starpu_data_release(float_array_handle);
-
 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 +
 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 +
 					(end.tv_usec - start.tv_usec));
 					(end.tv_usec - start.tv_usec));
 
 
-	fprintf(stderr, "%d elems took %lf ms\n", niter, timing/1000);
+	FPRINTF(stderr, "%u elems took %f ms\n", niter, timing/1000);
 
 
 	starpu_shutdown();
 	starpu_shutdown();
 
 

+ 2 - 2
examples/incrementer/incrementer_kernels_opencl.c

@@ -21,7 +21,7 @@
 extern struct starpu_opencl_program opencl_program;
 extern struct starpu_opencl_program opencl_program;
 void opencl_codelet(void *descr[], void *_args)
 void opencl_codelet(void *descr[], void *_args)
 {
 {
-	float *val = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
+	cl_mem val = (cl_mem)STARPU_VECTOR_GET_PTR(descr[0]);
 	cl_kernel kernel;
 	cl_kernel kernel;
 	cl_command_queue queue;
 	cl_command_queue queue;
 	cl_event event;
 	cl_event event;
@@ -34,7 +34,7 @@ void opencl_codelet(void *descr[], void *_args)
 	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
 
 	err = 0;
 	err = 0;
-	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &val);
+	err = clSetKernelArg(kernel, 0, sizeof(val), &val);
 	if (err) STARPU_OPENCL_REPORT_ERROR(err);
 	if (err) STARPU_OPENCL_REPORT_ERROR(err);
 
 
 	{
 	{

+ 3 - 3
examples/lu/lu_example.c

@@ -137,7 +137,7 @@ void copy_matrix_into_blocks(void)
 	for (bj = 0; bj < nblocks; bj++)
 	for (bj = 0; bj < nblocks; bj++)
 	for (bi = 0; bi < nblocks; bi++)
 	for (bi = 0; bi < nblocks; bi++)
 	{
 	{
-		starpu_data_malloc_pinned_if_possible((void **)&A_blocks[bi+nblocks*bj], (size_t)blocksize*blocksize*sizeof(TYPE));
+		starpu_malloc((void **)&A_blocks[bi+nblocks*bj], (size_t)blocksize*blocksize*sizeof(TYPE));
 
 
 		for (j = 0; j < blocksize; j++)
 		for (j = 0; j < blocksize; j++)
 		for (i = 0; i < blocksize; i++)
 		for (i = 0; i < blocksize; i++)
@@ -151,7 +151,7 @@ void copy_matrix_into_blocks(void)
 static void init_matrix(void)
 static void init_matrix(void)
 {
 {
 	/* allocate matrix */
 	/* allocate matrix */
-	starpu_data_malloc_pinned_if_possible((void **)&A, (size_t)size*size*sizeof(TYPE));
+	starpu_malloc((void **)&A, (size_t)size*size*sizeof(TYPE));
 	STARPU_ASSERT(A);
 	STARPU_ASSERT(A);
 
 
 	starpu_srand48((long int)time(NULL));
 	starpu_srand48((long int)time(NULL));
@@ -341,7 +341,7 @@ int main(int argc, char **argv)
 		} else {
 		} else {
 			starpu_bound_compute(&min, NULL, 0);
 			starpu_bound_compute(&min, NULL, 0);
 			if (min != 0.)
 			if (min != 0.)
-				FPRINTF(stderr, "theoretical min: %lf ms\n", min);
+				FPRINTF(stderr, "theoretical min: %f ms\n", min);
 		}
 		}
 	}
 	}
 
 

+ 9 - 11
examples/lu/xlu.c

@@ -236,17 +236,15 @@ void STARPU_LU(lu_decomposition)(TYPE *matA, unsigned size, unsigned ld, unsigne
 	/* We already enforce deps by hand */
 	/* We already enforce deps by hand */
 	starpu_data_set_sequential_consistency_flag(dataA, 0);
 	starpu_data_set_sequential_consistency_flag(dataA, 0);
 
 
-	struct starpu_data_filter f;
-		f.filter_func = starpu_vertical_block_filter_func;
-		f.nchildren = nblocks;
-		f.get_nchildren = NULL;
-		f.get_child_ops = NULL;
-
-	struct starpu_data_filter f2;
-		f2.filter_func = starpu_block_filter_func;
-		f2.nchildren = nblocks;
-		f2.get_nchildren = NULL;
-		f2.get_child_ops = NULL;
+	struct starpu_data_filter f = {
+		.filter_func = starpu_vertical_block_filter_func,
+		.nchildren = nblocks
+	};
+
+	struct starpu_data_filter f2 = {
+		.filter_func = starpu_block_filter_func,
+		.nchildren = nblocks
+	};
 
 
 	starpu_data_map_filters(dataA, 2, &f, &f2);
 	starpu_data_map_filters(dataA, 2, &f, &f2);
 
 

+ 10 - 12
examples/lu/xlu_implicit.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  *
@@ -143,17 +143,15 @@ void STARPU_LU(lu_decomposition)(TYPE *matA, unsigned size, unsigned ld, unsigne
 	 * one block is now determined by 2 unsigned (i,j) */
 	 * one block is now determined by 2 unsigned (i,j) */
 	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(TYPE));
 	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(TYPE));
 	
 	
-	struct starpu_data_filter f;
-		f.filter_func = starpu_vertical_block_filter_func;
-		f.nchildren = nblocks;
-		f.get_nchildren = NULL;
-		f.get_child_ops = NULL;
-
-	struct starpu_data_filter f2;
-		f2.filter_func = starpu_block_filter_func;
-		f2.nchildren = nblocks;
-		f2.get_nchildren = NULL;
-		f2.get_child_ops = NULL;
+	struct starpu_data_filter f = {
+		.filter_func = starpu_vertical_block_filter_func,
+		.nchildren = nblocks
+	};
+
+	struct starpu_data_filter f2 = {
+		.filter_func = starpu_block_filter_func,
+		.nchildren = nblocks
+	};
 
 
 	starpu_data_map_filters(dataA, 2, &f, &f2);
 	starpu_data_map_filters(dataA, 2, &f, &f2);
 
 

+ 10 - 12
examples/lu/xlu_implicit_pivot.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  *
@@ -189,17 +189,15 @@ void STARPU_LU(lu_decomposition_pivot)(TYPE *matA, unsigned *ipiv, unsigned size
 	 * one block is now determined by 2 unsigned (i,j) */
 	 * one block is now determined by 2 unsigned (i,j) */
 	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(TYPE));
 	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(TYPE));
 
 
-	struct starpu_data_filter f;
-		f.filter_func = starpu_vertical_block_filter_func;
-		f.nchildren = nblocks;
-		f.get_nchildren = NULL;
-		f.get_child_ops = NULL;
-
-	struct starpu_data_filter f2;
-		f2.filter_func = starpu_block_filter_func;
-		f2.nchildren = nblocks;
-		f2.get_nchildren = NULL;
-		f2.get_child_ops = NULL;
+	struct starpu_data_filter f = {
+		.filter_func = starpu_vertical_block_filter_func,
+		.nchildren = nblocks
+	};
+
+	struct starpu_data_filter f2 = {
+		.filter_func = starpu_block_filter_func,
+		.nchildren = nblocks
+	};
 
 
 	starpu_data_map_filters(dataA, 2, &f, &f2);
 	starpu_data_map_filters(dataA, 2, &f, &f2);
 
 

+ 36 - 30
examples/mandelbrot/mandelbrot.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -29,14 +29,15 @@ int use_x11 = 1;
 #endif
 #endif
 
 
 int demo = 0;
 int demo = 0;
+static double demozoom = 0.05;
 
 
 /* NB: The X11 code is inspired from the http://locklessinc.com/articles/mandelbrot/ article */
 /* NB: The X11 code is inspired from the http://locklessinc.com/articles/mandelbrot/ article */
 
 
 static int nblocks = 20;
 static int nblocks = 20;
 static int height = 400;
 static int height = 400;
 static int width = 640;
 static int width = 640;
-static int maxIt = 20000; // max number of iteration in the Mandelbrot function
-static int niter = -1; // number of loops in case we don't use X11, -1 means infinite
+static int maxIt = 20000; /* max number of iteration in the Mandelbrot function */
+static int niter = -1; /* number of loops in case we don't use X11, -1 means infinite */
 static int use_spmd = 0;
 static int use_spmd = 0;
 
 
 static double leftX = -0.745;
 static double leftX = -0.745;
@@ -233,7 +234,7 @@ static void compute_block_opencl(void *descr[], void *cl_arg)
 {
 {
 	int iby, block_size;
 	int iby, block_size;
 	double stepX, stepY;
 	double stepX, stepY;
-	int *pcnt; // unused for CUDA tasks
+	int *pcnt; /* unused for CUDA tasks */
 	starpu_unpack_cl_args(cl_arg, &iby, &block_size, &stepX, &stepY, &pcnt);
 	starpu_unpack_cl_args(cl_arg, &iby, &block_size, &stepX, &stepY, &pcnt);
 
 
 	cl_mem data = (cl_mem)STARPU_VECTOR_GET_PTR(descr[0]);
 	cl_mem data = (cl_mem)STARPU_VECTOR_GET_PTR(descr[0]);
@@ -247,15 +248,15 @@ static void compute_block_opencl(void *descr[], void *cl_arg)
 
 
 	starpu_opencl_load_kernel(&kernel, &queue, &opencl_programs, "mandelbrot_kernel", devid);
 	starpu_opencl_load_kernel(&kernel, &queue, &opencl_programs, "mandelbrot_kernel", devid);
 
 
-	clSetKernelArg(kernel, 0, sizeof(cl_mem), &data);
-	clSetKernelArg(kernel, 1, sizeof(double), &leftX);
-	clSetKernelArg(kernel, 2, sizeof(double), &topY);
-	clSetKernelArg(kernel, 3, sizeof(double), &stepX);
-	clSetKernelArg(kernel, 4, sizeof(double), &stepY);
-	clSetKernelArg(kernel, 5, sizeof(int), &maxIt);
-	clSetKernelArg(kernel, 6, sizeof(int), &iby);
-	clSetKernelArg(kernel, 7, sizeof(int), &block_size);
-	clSetKernelArg(kernel, 8, sizeof(int), &width);
+	clSetKernelArg(kernel, 0, sizeof(data), &data);
+	clSetKernelArg(kernel, 1, sizeof(leftX), &leftX);
+	clSetKernelArg(kernel, 2, sizeof(topY), &topY);
+	clSetKernelArg(kernel, 3, sizeof(stepX), &stepX);
+	clSetKernelArg(kernel, 4, sizeof(stepY), &stepY);
+	clSetKernelArg(kernel, 5, sizeof(maxIt), &maxIt);
+	clSetKernelArg(kernel, 6, sizeof(iby), &iby);
+	clSetKernelArg(kernel, 7, sizeof(block_size), &block_size);
+	clSetKernelArg(kernel, 8, sizeof(width), &width);
 
 
 	unsigned dim = 16;
 	unsigned dim = 16;
 	size_t local[2] = {dim, 1};
 	size_t local[2] = {dim, 1};
@@ -278,7 +279,7 @@ static void compute_block(void *descr[], void *cl_arg)
 
 
 	int iby, block_size;
 	int iby, block_size;
 	double stepX, stepY;
 	double stepX, stepY;
-	int *pcnt; // unused for sequential tasks
+	int *pcnt; /* unused for sequential tasks */
 	starpu_unpack_cl_args(cl_arg, &iby, &block_size, &stepX, &stepY, &pcnt);
 	starpu_unpack_cl_args(cl_arg, &iby, &block_size, &stepX, &stepY, &pcnt);
 
 
 	unsigned *data = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
 	unsigned *data = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
@@ -291,7 +292,7 @@ static void compute_block(void *descr[], void *cl_arg)
 		{
 		{
 			double cx = leftX + ix * stepX;
 			double cx = leftX + ix * stepX;
 			double cy = topY - iy * stepY;
 			double cy = topY - iy * stepY;
-			// Z = X+I*Y
+			/* Z = X+I*Y */
 			double x = 0;
 			double x = 0;
 			double y = 0;
 			double y = 0;
 			int it;
 			int it;
@@ -300,13 +301,13 @@ static void compute_block(void *descr[], void *cl_arg)
 				double x2 = x*x;
 				double x2 = x*x;
 				double y2 = y*y;
 				double y2 = y*y;
 
 
-				// Stop iterations when |Z| > 2
+				/* Stop iterations when |Z| > 2 */
 				if (x2 + y2 > 4.0)
 				if (x2 + y2 > 4.0)
 					break;
 					break;
 
 
 				double twoxy = 2.0*x*y;
 				double twoxy = 2.0*x*y;
 
 
-				// Z = Z^2 + C
+				/* Z = Z^2 + C */
 				x = x2 - y2 + cx;
 				x = x2 - y2 + cx;
 				y = twoxy + cy;
 				y = twoxy + cy;
 			}
 			}
@@ -327,8 +328,8 @@ static void compute_block_spmd(void *descr[], void *cl_arg)
 
 
 	unsigned *data = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
 	unsigned *data = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
 
 
-	int ix, iy; // global coordinates
-	int local_iy; // current line
+	int ix, iy; /* global coordinates */
+	int local_iy; /* current line */
 
 
 	while (1)
 	while (1)
 	{
 	{
@@ -342,7 +343,7 @@ static void compute_block_spmd(void *descr[], void *cl_arg)
 		{
 		{
 			double cx = leftX + ix * stepX;
 			double cx = leftX + ix * stepX;
 			double cy = topY - iy * stepY;
 			double cy = topY - iy * stepY;
-			// Z = X+I*Y
+			/* Z = X+I*Y */
 			double x = 0;
 			double x = 0;
 			double y = 0;
 			double y = 0;
 			int it;
 			int it;
@@ -351,13 +352,13 @@ static void compute_block_spmd(void *descr[], void *cl_arg)
 				double x2 = x*x;
 				double x2 = x*x;
 				double y2 = y*y;
 				double y2 = y*y;
 
 
-				// Stop iterations when |Z| > 2
+				/* Stop iterations when |Z| > 2 */
 				if (x2 + y2 > 4.0)
 				if (x2 + y2 > 4.0)
 					break;
 					break;
 
 
 				double twoxy = 2.0*x*y;
 				double twoxy = 2.0*x*y;
 
 
-				// Z = Z^2 + C
+				/* Z = Z^2 + C */
 				x = x2 - y2 + cx;
 				x = x2 - y2 + cx;
 				y = twoxy + cy;
 				y = twoxy + cy;
 			}
 			}
@@ -396,7 +397,7 @@ static void parse_args(int argc, char **argv)
 	int i;
 	int i;
 	for (i = 1; i < argc; i++) {
 	for (i = 1; i < argc; i++) {
 		if (strcmp(argv[i], "-h") == 0) {
 		if (strcmp(argv[i], "-h") == 0) {
-			fprintf(stderr, "Usage: %s [-h] [ -width 800] [-height 600] [-nblocks 16] [-no-x11] [-pos leftx:rightx:bottomy:topy] [-niter 1000] [-spmd]\n", argv[0]);
+			fprintf(stderr, "Usage: %s [-h] [ -width 800] [-height 600] [-nblocks 16] [-no-x11] [-pos leftx:rightx:bottomy:topy] [-niter 1000] [-spmd] [-demo] [-demozoom 0.2]\n", argv[0]);
 			exit(-1);
 			exit(-1);
 		}
 		}
 
 
@@ -434,6 +435,11 @@ static void parse_args(int argc, char **argv)
 
 
 		}
 		}
 
 
+		if (strcmp(argv[i], "-demozoom") == 0) {
+			char *argptr;
+			demozoom = strtof(argv[++i], &argptr);
+		}
+
 		if (strcmp(argv[i], "-no-x11") == 0) {
 		if (strcmp(argv[i], "-no-x11") == 0) {
 #ifdef STARPU_HAVE_X11
 #ifdef STARPU_HAVE_X11
 			use_x11 = 0;
 			use_x11 = 0;
@@ -461,7 +467,7 @@ int main(int argc, char **argv)
 	starpu_init(&conf);
 	starpu_init(&conf);
 
 
 	unsigned *buffer;
 	unsigned *buffer;
-	starpu_data_malloc_pinned_if_possible((void **)&buffer, height*width*sizeof(unsigned));
+	starpu_malloc((void **)&buffer, height*width*sizeof(unsigned));
 
 
 #ifdef STARPU_HAVE_X11
 #ifdef STARPU_HAVE_X11
 	if (use_x11)
 	if (use_x11)
@@ -472,7 +478,7 @@ int main(int argc, char **argv)
 	STARPU_ASSERT((height % nblocks) == 0);
 	STARPU_ASSERT((height % nblocks) == 0);
 
 
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
-	starpu_opencl_load_opencl_from_string(mandelbrot_opencl_src, &opencl_programs);
+	starpu_opencl_load_opencl_from_string(mandelbrot_opencl_src, &opencl_programs, NULL);
 #endif
 #endif
 
 
 	starpu_data_handle block_handles[nblocks];
 	starpu_data_handle block_handles[nblocks];
@@ -520,24 +526,24 @@ int main(int argc, char **argv)
 
 
 		for (iby = 0; iby < nblocks; iby++)
 		for (iby = 0; iby < nblocks; iby++)
 		{
 		{
-			starpu_data_acquire(block_handles[iby], STARPU_R);
 #ifdef STARPU_HAVE_X11
 #ifdef STARPU_HAVE_X11
 			if (use_x11)
 			if (use_x11)
 			{
 			{
+				starpu_data_acquire(block_handles[iby], STARPU_R);
 				XPutImage(dpy, win, gc, bitmap,
 				XPutImage(dpy, win, gc, bitmap,
 					0, iby*block_size,
 					0, iby*block_size,
 					0, iby*block_size,
 					0, iby*block_size,
 					width, block_size);
 					width, block_size);
+				starpu_data_release(block_handles[iby]);
 			}
 			}
 #endif
 #endif
-			starpu_data_release(block_handles[iby]);
 		}
 		}
 
 
 
 
 		if (demo)
 		if (demo)
 		{
 		{
 			/* Zoom in */
 			/* Zoom in */
-			double zoom_factor = 0.05;
+			double zoom_factor = demozoom;
 			double widthX = rightX - leftX;
 			double widthX = rightX - leftX;
 			double heightY = topY - bottomY;
 			double heightY = topY - bottomY;
 
 
@@ -554,7 +560,7 @@ int main(int argc, char **argv)
 				gettimeofday(&end, NULL);
 				gettimeofday(&end, NULL);
 				double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
 				double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
 
 
-				fprintf(stderr, "Time to generate %d frames : %f s\n", iter, timing/1000000.0);
+				fprintf(stderr, "Time to generate %u frames : %f s\n", iter, timing/1000000.0);
 				fprintf(stderr, "Average FPS: %f\n", ((double)iter*1e+6)/timing);
 				fprintf(stderr, "Average FPS: %f\n", ((double)iter*1e+6)/timing);
 
 
 				/* Reset counters */
 				/* Reset counters */
@@ -583,7 +589,7 @@ int main(int argc, char **argv)
 	for (iby = 0; iby < nblocks; iby++)
 	for (iby = 0; iby < nblocks; iby++)
 		starpu_data_unregister(block_handles[iby]);
 		starpu_data_unregister(block_handles[iby]);
 
 
-//	starpu_data_free_pinned_if_possible(buffer);
+/*	starpu_data_free_pinned_if_possible(buffer); */
 
 
 	starpu_shutdown();
 	starpu_shutdown();
 
 

+ 36 - 30
examples/matvecmult/matvecmult.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -20,6 +20,8 @@
 #include <pthread.h>
 #include <pthread.h>
 #include <math.h>
 #include <math.h>
 
 
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
 struct starpu_opencl_program opencl_code;
 struct starpu_opencl_program opencl_code;
 void opencl_codelet(void *descr[], __attribute__ ((unused)) void *_args)
 void opencl_codelet(void *descr[], __attribute__ ((unused)) void *_args)
@@ -27,9 +29,9 @@ void opencl_codelet(void *descr[], __attribute__ ((unused)) void *_args)
 	cl_kernel kernel;
 	cl_kernel kernel;
 	cl_command_queue queue;
 	cl_command_queue queue;
 	int id, devid, err, n;
 	int id, devid, err, n;
-	float *matrix = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
-	float *vector = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
-	float *mult = (float *)STARPU_VECTOR_GET_PTR(descr[2]);
+	cl_mem matrix = (cl_mem)STARPU_MATRIX_GET_PTR(descr[0]);
+	cl_mem vector = (cl_mem)STARPU_VECTOR_GET_PTR(descr[1]);
+	cl_mem mult = (cl_mem)STARPU_VECTOR_GET_PTR(descr[2]);
 	int nx = STARPU_MATRIX_GET_NX(descr[0]);
 	int nx = STARPU_MATRIX_GET_NX(descr[0]);
 	int ny = STARPU_MATRIX_GET_NY(descr[0]);
 	int ny = STARPU_MATRIX_GET_NY(descr[0]);
 	cl_event event;
 	cl_event event;
@@ -41,11 +43,11 @@ void opencl_codelet(void *descr[], __attribute__ ((unused)) void *_args)
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
 
         n=0;
         n=0;
-        err = clSetKernelArg(kernel, n++, sizeof(cl_mem), &matrix);
-        err |= clSetKernelArg(kernel, n++, sizeof(cl_mem), &vector);
-        err |= clSetKernelArg(kernel, n++, sizeof(int), (void*)&nx);
-        err |= clSetKernelArg(kernel, n++, sizeof(int), (void*)&ny);
-        err |= clSetKernelArg(kernel, n++, sizeof(cl_mem), &mult);
+        err = clSetKernelArg(kernel, n++, sizeof(matrix), &matrix);
+        err |= clSetKernelArg(kernel, n++, sizeof(vector), &vector);
+        err |= clSetKernelArg(kernel, n++, sizeof(nx), (void*)&nx);
+        err |= clSetKernelArg(kernel, n++, sizeof(ny), (void*)&ny);
+	err |= clSetKernelArg(kernel, n++, sizeof(mult), &mult);
         if (err) STARPU_OPENCL_REPORT_ERROR(err);
         if (err) STARPU_OPENCL_REPORT_ERROR(err);
 
 
 	{
 	{
@@ -73,9 +75,9 @@ void fillArray(float* pfData, int iSize) {
 void printArray(float* pfData, int iSize) {
 void printArray(float* pfData, int iSize) {
     int i;
     int i;
     for (i = 0; i < iSize; ++i) {
     for (i = 0; i < iSize; ++i) {
-            fprintf(stderr, "%f ", pfData[i]);
+            FPRINTF(stderr, "%f ", pfData[i]);
     }
     }
-    fprintf(stderr, "\n");
+    FPRINTF(stderr, "\n");
 }
 }
 
 
 void matVecMult(const float *matrix, const float *vector, int width, int height, float *mult) {
 void matVecMult(const float *matrix, const float *vector, int width, int height, float *mult) {
@@ -121,8 +123,8 @@ int main(int argc, char **argv)
                 .nopencl = 1,
                 .nopencl = 1,
 	};
 	};
 
 
-        //int width=1100;
-        //int height=244021;
+        /* int width=1100; */
+        /* int height=244021; */
         int width=20;
         int width=20;
         int height=4;
         int height=4;
 
 
@@ -131,8 +133,14 @@ int main(int argc, char **argv)
         unsigned int mem_size_matrix, mem_size_vector, mem_size_mult;
         unsigned int mem_size_matrix, mem_size_vector, mem_size_mult;
 
 
 	starpu_data_handle matrix_handle, vector_handle, mult_handle;
 	starpu_data_handle matrix_handle, vector_handle, mult_handle;
+	int ret, submit;
 
 
-        starpu_init(&conf);
+        ret = starpu_init(&conf);
+	if (STARPU_UNLIKELY(ret == -ENODEV)) {
+                FPRINTF(stderr, "This application requires an OpenCL worker.\n");
+		starpu_shutdown();
+		exit(0);
+	}
 
 
         mem_size_matrix = width * height * sizeof(float);
         mem_size_matrix = width * height * sizeof(float);
         matrix = (float*)malloc(mem_size_matrix);
         matrix = (float*)malloc(mem_size_matrix);
@@ -157,7 +165,7 @@ int main(int argc, char **argv)
 	starpu_vector_data_register(&mult_handle, 0, (uintptr_t)mult, height, sizeof(float));
 	starpu_vector_data_register(&mult_handle, 0, (uintptr_t)mult, height, sizeof(float));
 
 
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
-        starpu_opencl_load_opencl_from_file("examples/matvecmult/matvecmult_kernel.cl", &opencl_code);
+        starpu_opencl_load_opencl_from_file("examples/matvecmult/matvecmult_kernel.cl", &opencl_code, NULL);
 #endif
 #endif
 
 
 	cl.where = STARPU_OPENCL;
 	cl.where = STARPU_OPENCL;
@@ -177,30 +185,28 @@ int main(int argc, char **argv)
         task->buffers[2].handle = mult_handle;
         task->buffers[2].handle = mult_handle;
         task->buffers[2].mode = STARPU_RW;
         task->buffers[2].mode = STARPU_RW;
 
 
-        int ret = starpu_task_submit(task);
-        if (STARPU_UNLIKELY(ret == -ENODEV)) {
-                fprintf(stderr, "No worker may execute this task. This application requires an OpenCL worker.\n");
-                exit(0);
+        submit = starpu_task_submit(task);
+        if (STARPU_UNLIKELY(submit == -ENODEV)) {
+                FPRINTF(stderr, "No worker may execute this task. This application requires an OpenCL worker.\n");
+	}
+	else {
+		starpu_task_wait_for_all();
 	}
 	}
 
 
-	starpu_task_wait_for_all();
+	starpu_data_unregister(matrix_handle);
+	starpu_data_unregister(vector_handle);
+	starpu_data_unregister(mult_handle);
 
 
-	/* update the array in RAM */
-        starpu_data_acquire(matrix_handle, STARPU_R);
-        starpu_data_acquire(vector_handle, STARPU_R);
-        starpu_data_acquire(mult_handle, STARPU_R);
+        if (STARPU_LIKELY(submit != -ENODEV)) {
+		int res = compareL2fe(correctResult, mult, height, 1e-6f);
+		FPRINTF(stdout, "TEST %s\n\n", (res == 0) ? "PASSED" : "FAILED !!!");
+	}
 
 
-        int res = compareL2fe(correctResult, mult, height, 1e-6f);
-        printf("TEST %s\n\n", (res == 0) ? "PASSED" : "FAILED !!!");
 #if 0
 #if 0
         printArray(matrix, width*height);
         printArray(matrix, width*height);
         printArray(vector, width);
         printArray(vector, width);
         printArray(mult, height);
         printArray(mult, height);
 #endif
 #endif
-        starpu_data_release(matrix_handle);
-        starpu_data_release(vector_handle);
-        starpu_data_release(mult_handle);
-
         starpu_shutdown();
         starpu_shutdown();
 
 
 	return 0;
 	return 0;

+ 26 - 22
examples/mult/xgemm.c

@@ -2,7 +2,7 @@
  *
  *
  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -42,6 +42,8 @@ static unsigned check = 0;
 static TYPE *A, *B, *C;
 static TYPE *A, *B, *C;
 static starpu_data_handle A_handle, B_handle, C_handle;
 static starpu_data_handle A_handle, B_handle, C_handle;
 
 
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
 static void check_output(void)
 static void check_output(void)
 {
 {
 	/* compute C = C - AB */
 	/* compute C = C - AB */
@@ -52,14 +54,14 @@ static void check_output(void)
 	err = CPU_ASUM(xdim*ydim, C, 1);
 	err = CPU_ASUM(xdim*ydim, C, 1);
 
 
 	if (err < xdim*ydim*0.001) {
 	if (err < xdim*ydim*0.001) {
-		fprintf(stderr, "Results are OK\n");
+		FPRINTF(stderr, "Results are OK\n");
 	}
 	}
 	else {
 	else {
 		int max;
 		int max;
 		max = CPU_IAMAX(xdim*ydim, C, 1);
 		max = CPU_IAMAX(xdim*ydim, C, 1);
 
 
-		fprintf(stderr, "There were errors ... err = %f\n", err);
-		fprintf(stderr, "Max error : %e\n", C[max]);
+		FPRINTF(stderr, "There were errors ... err = %f\n", err);
+		FPRINTF(stderr, "Max error : %e\n", C[max]);
 	}
 	}
 }
 }
 
 
@@ -67,9 +69,9 @@ static void init_problem_data(void)
 {
 {
 	unsigned i,j;
 	unsigned i,j;
 
 
-	starpu_data_malloc_pinned_if_possible((void **)&A, zdim*ydim*sizeof(TYPE));
-	starpu_data_malloc_pinned_if_possible((void **)&B, xdim*zdim*sizeof(TYPE));
-	starpu_data_malloc_pinned_if_possible((void **)&C, xdim*ydim*sizeof(TYPE));
+	starpu_malloc((void **)&A, zdim*ydim*sizeof(TYPE));
+	starpu_malloc((void **)&B, xdim*zdim*sizeof(TYPE));
+	starpu_malloc((void **)&C, xdim*ydim*sizeof(TYPE));
 
 
 	/* fill the A and B matrices */
 	/* fill the A and B matrices */
 	for (j=0; j < ydim; j++) {
 	for (j=0; j < ydim; j++) {
@@ -100,20 +102,20 @@ static void partition_mult_data(void)
 	starpu_matrix_data_register(&C_handle, 0, (uintptr_t)C, 
 	starpu_matrix_data_register(&C_handle, 0, (uintptr_t)C, 
 		ydim, ydim, xdim, sizeof(TYPE));
 		ydim, ydim, xdim, sizeof(TYPE));
 
 
-	struct starpu_data_filter f;
-	memset(&f, 0, sizeof(f));
-	f.filter_func = starpu_vertical_block_filter_func;
-	f.nchildren = nslicesx;
+	struct starpu_data_filter vert;
+	memset(&vert, 0, sizeof(vert));
+	vert.filter_func = starpu_vertical_block_filter_func;
+	vert.nchildren = nslicesx;
 		
 		
-	struct starpu_data_filter f2;
-	memset(&f2, 0, sizeof(f2));
-	f2.filter_func = starpu_block_filter_func;
-	f2.nchildren = nslicesy;
+	struct starpu_data_filter horiz;
+	memset(&horiz, 0, sizeof(horiz));
+	horiz.filter_func = starpu_block_filter_func;
+	horiz.nchildren = nslicesy;
 		
 		
-	starpu_data_partition(B_handle, &f);
-	starpu_data_partition(A_handle, &f2);
+	starpu_data_partition(B_handle, &vert);
+	starpu_data_partition(A_handle, &horiz);
 
 
-	starpu_data_map_filters(C_handle, 2, &f, &f2);
+	starpu_data_map_filters(C_handle, 2, &vert, &horiz);
 }
 }
 
 
 static void mult_kernel_common(void *descr[], int type)
 static void mult_kernel_common(void *descr[], int type)
@@ -145,10 +147,12 @@ static void mult_kernel_common(void *descr[], int type)
 			int block_size = (nyC + worker_size - 1)/worker_size;
 			int block_size = (nyC + worker_size - 1)/worker_size;
 			int new_nyC = STARPU_MIN(nyC, block_size*(rank+1)) - block_size*rank;
 			int new_nyC = STARPU_MIN(nyC, block_size*(rank+1)) - block_size*rank;
 
 
-			TYPE *new_subA = &subA[block_size*rank];
+			STARPU_ASSERT(nyC = STARPU_MATRIX_GET_NY(descr[1]));
+
+			TYPE *new_subB = &subB[block_size*rank];
 			TYPE *new_subC = &subC[block_size*rank];
 			TYPE *new_subC = &subC[block_size*rank];
 
 
-			CPU_GEMM("N", "N", nxC, new_nyC, nyA, (TYPE)1.0, new_subA, ldA, subB, ldB, (TYPE)0.0, new_subC, ldC);
+			CPU_GEMM("N", "N", nxC, new_nyC, nyA, (TYPE)1.0, subA, ldA, new_subB, ldB, (TYPE)0.0, new_subC, ldC);
 		}
 		}
 	}
 	}
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
@@ -282,11 +286,11 @@ int main(int argc, char **argv)
 	gettimeofday(&end, NULL);
 	gettimeofday(&end, NULL);
 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
 
 
-	fprintf(stderr, "Time: %2.2f ms\n", timing/1000.0);
+	FPRINTF(stderr, "Time: %2.2f ms\n", timing/1000.0);
 
 
 	double flops = 2.0*((unsigned long)niter)*((unsigned long)xdim)
 	double flops = 2.0*((unsigned long)niter)*((unsigned long)xdim)
 				*((unsigned long)ydim)*((unsigned long)zdim);
 				*((unsigned long)ydim)*((unsigned long)zdim);
-	fprintf(stderr, "GFlop/s: %.2f\n", flops/timing/1000.0);
+	FPRINTF(stderr, "GFlop/s: %.2f\n", flops/timing/1000.0);
 
 
 	starpu_data_unpartition(C_handle, 0);
 	starpu_data_unpartition(C_handle, 0);
 	starpu_data_unregister(C_handle);
 	starpu_data_unregister(C_handle);

+ 105 - 0
examples/openmp/vector_scal.c

@@ -0,0 +1,105 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/* gcc build:
+
+   gcc -fopenmp vector_scal.c -o vector_scal $(pkg-config --cflags libstarpu) $(pkg-config --libs libstarpu)
+
+ */
+
+#include <starpu.h>
+#include <stdio.h>
+#include <limits.h>
+
+/* Number of float elements in the vector scaled by this example. */
+#define	NX	2048
+/* fprintf() wrapper that prints nothing when the STARPU_SSILENT environment
+ * variable is set (getenv() is re-evaluated on every call).  Relies on the GNU
+ * named variadic macro syntax ("args ..." / "##args"). */
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
+/* CPU implementation of the vector scaling codelet.
+ *
+ * buffers[0] is the StarPU vector interface of the data to scale and _args
+ * points to the float scaling factor.  Every element is multiplied in place
+ * inside an OpenMP parallel loop whose thread count is the size of the
+ * combined worker reported by StarPU. */
+void scal_cpu_func(void *buffers[], void *_args) {
+	starpu_vector_interface_t *vec = buffers[0];
+	float *scale = _args;
+	unsigned count = STARPU_VECTOR_GET_NX(vec);
+	float *elems = (float *)STARPU_VECTOR_GET_PTR(vec);
+	unsigned idx;
+
+	FPRINTF(stderr, "running task with %d CPUs.\n", starpu_combined_worker_get_size());
+
+#pragma omp parallel for num_threads(starpu_combined_worker_get_size())
+	for (idx = 0; idx < count; idx++)
+		elems[idx] = elems[idx] * *scale;
+}
+
+/* History-based performance model attached to the scaling codelet; it is
+ * identified by the symbol "vector_scale_parallel". */
+static struct starpu_perfmodel_t vector_scal_model = {
+	.symbol = "vector_scale_parallel",
+	.type = STARPU_HISTORY_BASED
+};
+
+/* Codelet descriptor for the scaling task: scal_cpu_func is its CPU
+ * implementation, it accesses a single buffer and uses vector_scal_model as
+ * its performance model.
+ * NOTE(review): STARPU_FORKJOIN with max_parallelism = INT_MAX presumably
+ * lets the task run on a combined worker of any size -- confirm against the
+ * StarPU parallel task documentation. */
+static starpu_codelet cl = {
+	.cpu_func = scal_cpu_func,
+	.where = STARPU_CPU,
+	.nbuffers = 1,
+	.model = &vector_scal_model,
+	.type = STARPU_FORKJOIN,
+	.max_parallelism = INT_MAX,
+};
+
+/* Scales an NX-element vector by 3.14 through one synchronous StarPU task that
+ * runs the OpenMP codelet `cl`, printing the first and last elements before
+ * and after the computation.
+ *
+ * Returns 0 on success and 1 when StarPU cannot be initialised or the task
+ * cannot be submitted (in which case no "AFTER" values are printed, since the
+ * vector was not processed). */
+int main(int argc, char **argv)
+{
+	struct starpu_conf conf;
+	float vector[NX];
+	unsigned i;
+	int ret;
+
+	/* The command line is not used by this example. */
+	(void)argc;
+	(void)argv;
+
+	for (i = 0; i < NX; i++)
+		vector[i] = (i+1.0f);
+
+	FPRINTF(stderr, "BEFORE: First element was %f\n", vector[0]);
+	FPRINTF(stderr, "BEFORE: Last element was %f\n", vector[NX-1]);
+
+	starpu_conf_init(&conf);
+
+	/* Most OpenMP implementations do not support concurrent parallel
+	 * sections, so only create one big worker */
+	conf.single_combined_worker = 1;
+
+	/* Do not go on with an unusable runtime. */
+	ret = starpu_init(&conf);
+	if (ret != 0) {
+		FPRINTF(stderr, "starpu_init failed with error %d\n", ret);
+		return 1;
+	}
+
+	starpu_data_handle vector_handle;
+	starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector, NX, sizeof(vector[0]));
+
+	float factor = 3.14f;
+
+	struct starpu_task *task = starpu_task_create();
+	/* Synchronous task: starpu_task_submit() is expected to return only
+	 * once the task has been executed. */
+	task->synchronous = 1;
+
+	task->cl = &cl;
+
+	task->buffers[0].handle = vector_handle;
+	task->buffers[0].mode = STARPU_RW;
+	task->cl_arg = &factor;
+	task->cl_arg_size = sizeof(factor);
+
+	ret = starpu_task_submit(task);
+	if (ret != 0)
+		FPRINTF(stderr, "starpu_task_submit failed with error %d\n", ret);
+
+	/* Release the StarPU resources on both the success and failure paths. */
+	starpu_data_unregister(vector_handle);
+
+	/* NOTE(review): starpu_task_create() may already flag the task for
+	 * automatic destruction after execution -- confirm that this explicit
+	 * destroy cannot free the task twice. */
+	starpu_task_destroy(task);
+
+	/* terminate StarPU, no task can be submitted after */
+	starpu_shutdown();
+
+	if (ret != 0)
+		return 1;
+
+	FPRINTF(stderr, "AFTER: First element is %f\n", vector[0]);
+	FPRINTF(stderr, "AFTER: Last element is %f\n", vector[NX-1]);
+
+	return 0;
+}

+ 78 - 0
examples/opt/Makefile.am

@@ -0,0 +1,78 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
+# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+AM_CFLAGS = $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
+LIBS = $(top_builddir)/src/libstarpu.la $(HWLOC_LIBS) @LIBS@
+AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include
+AM_LDFLAGS = $(STARPU_CUDA_LDFLAGS) $(STARPU_OPENCL_LDFLAGS)
+
+if STARPU_USE_CUDA
+
+NVCCFLAGS += --compiler-options -fno-strict-aliasing  -I$(top_srcdir)/include/ -I$(top_builddir)/include/ -I$(top_srcdir)/examples/ $(HWLOC_CFLAGS) -arch sm_13
+
+.cu.o:
+	$(NVCC) $< -c -o $@ $(NVCCFLAGS)
+
+endif
+
+TESTS	=	$(check_PROGRAMS)
+
+check_PROGRAMS =
+
+examplebindir = $(libdir)/starpu/examples/
+
+examplebin_PROGRAMS =
+
+noinst_HEADERS = 				\
+	pi/SobolQRNG/sobol.h			\
+	pi/SobolQRNG/sobol_gold.h		\
+	pi/SobolQRNG/sobol_gpu.h		\
+	pi/SobolQRNG/sobol_primitives.h
+
+######
+# Pi #
+######
+
+check_PROGRAMS +=				\
+	pi/pi					\
+	pi/pi_redux
+
+examplebin_PROGRAMS +=				\
+	pi/pi					\
+	pi/pi_redux
+
+pi_pi_SOURCES =					\
+	pi/pi.c					\
+	pi/SobolQRNG/sobol_gold.c		\
+	pi/SobolQRNG/sobol_primitives.c
+
+if STARPU_USE_CUDA
+pi_pi_SOURCES +=				\
+	pi/pi_kernel.cu				\
+	pi/SobolQRNG/sobol_gpu.cu
+endif
+
+pi_pi_redux_SOURCES =				\
+	pi/pi_redux.c
+
+if STARPU_USE_CUDA
+pi_pi_redux_SOURCES +=				\
+	pi/pi_redux_kernel.cu
+pi_pi_redux_LDADD =				\
+	$(STARPU_CURAND_LDFLAGS)
+endif
+
+

ファイルの差分が大きいため隠しています
+ 50 - 0
examples/opt/pi/SobolQRNG/CforCUDA_SDK_license.txt


+ 60 - 0
examples/opt/pi/SobolQRNG/sobol.h

@@ -0,0 +1,60 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+/*
+ * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual property and 
+ * proprietary rights in and to this software and related documentation and 
+ * any modifications thereto.  Any use, reproduction, disclosure, or distribution 
+ * of this software and related documentation without an express license 
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ * 
+ */
+ 
+ /*
+ * Portions Copyright (c) 1993-2009 NVIDIA Corporation.  All rights reserved.
+ * Portions Copyright (c) 2009 Mike Giles, Oxford University.  All rights reserved.
+ * Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe.  All rights reserved.
+ *
+ * Sobol Quasi-random Number Generator example
+ *
+ * Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
+ * http://people.maths.ox.ac.uk/~gilesm/
+ *
+ * and C code developed by Stephen Joe, University of Waikato, New Zealand
+ * and Frances Kuo, University of New South Wales, Australia
+ * http://web.maths.unsw.edu.au/~fkuo/sobol/
+ *
+ * For theoretical background see:
+ *
+ * P. Bratley and B.L. Fox.
+ * Implementing Sobol's quasirandom sequence generator
+ * http://portal.acm.org/citation.cfm?id=42288
+ * ACM Trans. on Math. Software, 14(1):88-100, 1988
+ *
+ * S. Joe and F. Kuo.
+ * Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
+ * http://portal.acm.org/citation.cfm?id=641879
+ * ACM Trans. on Math. Software, 29(1):49-57, 2003
+ */
+
+#ifndef SOBOL_H
+#define SOBOL_H
+
+/* Number of direction vectors is fixed to 32 */
+#define n_directions 32
+
+#endif

+ 141 - 0
examples/opt/pi/SobolQRNG/sobol_gold.c

@@ -0,0 +1,141 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+/*
+ * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual property and 
+ * proprietary rights in and to this software and related documentation and 
+ * any modifications thereto.  Any use, reproduction, disclosure, or distribution 
+ * of this software and related documentation without an express license 
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ * 
+ */
+ 
+ /*
+ * Portions Copyright (c) 1993-2009 NVIDIA Corporation.  All rights reserved.
+ * Portions Copyright (c) 2009 Mike Giles, Oxford University.  All rights reserved.
+ * Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe.  All rights reserved.
+ *
+ * Sobol Quasi-random Number Generator example
+ *
+ * Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
+ * http://people.maths.ox.ac.uk/~gilesm/
+ *
+ * and C code developed by Stephen Joe, University of Waikato, New Zealand
+ * and Frances Kuo, University of New South Wales, Australia
+ * http://web.maths.unsw.edu.au/~fkuo/sobol/
+ *
+ * For theoretical background see:
+ *
+ * P. Bratley and B.L. Fox.
+ * Implementing Sobol's quasirandom sequence generator
+ * http://portal.acm.org/citation.cfm?id=42288
+ * ACM Trans. on Math. Software, 14(1):88-100, 1988
+ *
+ * S. Joe and F. Kuo.
+ * Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
+ * http://portal.acm.org/citation.cfm?id=641879
+ * ACM Trans. on Math. Software, 29(1):49-57, 2003
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <strings.h>
+
+#include "sobol.h"
+#include "sobol_gold.h"
+#include "sobol_primitives.h"
+
+#define k_2powneg32 2.3283064E-10F
+
+/* Create the direction numbers, based on the primitive polynomials. */
+/* Create the direction numbers, based on the primitive polynomials.
+ *
+ * n_dimensions: number of Sobol dimensions to initialise.
+ * directions:   output array; n_directions consecutive entries are written
+ *               for each dimension, so it must hold at least
+ *               n_dimensions * n_directions unsigned ints.
+ *
+ * Dimension 0 uses m = 1 for every direction; the other dimensions take their
+ * degree, coefficients and initial m values from sobol_primitives[dim]. */
+void initSobolDirectionVectors(int n_dimensions, unsigned int *directions)
+{
+    unsigned int *v = directions;
+
+    int dim;
+    for (dim = 0 ; dim < n_dimensions ; dim++)
+    {
+        /* First dimension is a special case */
+        if (dim == 0)
+        {
+            int i;
+            for (i = 0 ; i < n_directions ; i++)
+            {
+                /* All m's are 1.  The constant is unsigned because the
+                   first iteration shifts it into bit 31, which would be
+                   undefined behaviour on a signed int. */
+                v[i] = 1U << (31 - i);
+            }
+        }
+        else
+        {
+            int d = sobol_primitives[dim].degree;
+            /* The first direction numbers (up to the degree of the polynomial) 
+               are simply v[i] = m[i] / 2^i (stored in Q0.32 format) */
+            int i;
+            for (i = 0 ; i < d ; i++)
+            {
+                /* Shift as unsigned so that reaching bit 31 is well defined
+                   whatever the declared type of m[] is. */
+                v[i] = (unsigned int)sobol_primitives[dim].m[i] << (31 - i);
+            }
+            /* The remaining direction numbers are computed as described in
+               the Bratley and Fox paper. */
+            /* v[i] = a[1]v[i-1] ^ a[2]v[i-2] ^ ... ^ a[d-1]v[i-d+1] ^ v[i-d] ^ v[i-d]/2^d */
+            for (i = d ; i < n_directions ; i++)
+            {
+                /* First do the v[i-d] ^ v[i-d]/2^d part */
+                v[i] = v[i - d] ^ (v[i - d] >> d);
+                /* Now do the a[1]v[i-1] ^ a[2]v[i-2] ^ ... part
+                   Note that the coefficients a[] are zero or one and for compactness in
+                   the input tables they are stored as bits of a single integer. To extract
+                   the relevant bit we use right shift and mask with 1.
+                   For example, for a 10 degree polynomial there are ten useful bits in a,
+                   so to get a[2] we need to right shift 7 times (to get the 8th bit into
+                   the LSB) and then mask with 1. */
+                int j;
+                for (j = 1 ; j < d ; j++)
+                {
+                    v[i] ^= (((sobol_primitives[dim].a >> (d - 1 - j)) & 1) * v[i - j]);
+                }
+            }
+        }
+        /* Move on to the slot of the next dimension. */
+        v += n_directions;
+    }
+}
+
+/* Reference model for generating Sobol numbers on the host */
+/* Reference model for generating Sobol numbers on the host.
+ *
+ * n_vectors:    number of points generated per dimension.
+ * n_dimensions: number of dimensions.
+ * directions:   direction numbers, n_directions entries per dimension (as
+ *               filled in by initSobolDirectionVectors()).
+ * output:       receives n_vectors * n_dimensions floats; point i of
+ *               dimension d is stored at output[i + n_vectors * d].
+ *
+ * Nothing is written when n_vectors is zero or negative. */
+void sobolCPU(int n_vectors, int n_dimensions, unsigned int *directions, float *output)
+{
+    unsigned int *v = directions;
+
+    /* Without this guard the unconditional x[0] store below would write to
+       index 0 (or a negative index) of a buffer that is meant to be empty. */
+    if (n_vectors <= 0)
+        return;
+
+    int d;
+    for (d = 0 ; d < n_dimensions ; d++)
+    {
+        unsigned int X = 0;
+        /* x[0] is zero (in all dimensions) */
+        output[n_vectors * d] = 0.0f;
+        int i;
+        for (i = 1 ; i < n_vectors ; i++)
+        {
+            /* x[i] = x[i-1] ^ v[c]
+                where c is the index of the rightmost zero bit in i
+                minus 1 (since C arrays count from zero)
+               In the Bratley and Fox paper this is equation (**) */
+            X ^= v[ffs(~(i - 1)) - 1];
+            /* Scale the Q0.32 integer by 2^-32 to obtain a float. */
+            output[i + n_vectors * d] = (float)X * k_2powneg32;
+        }
+        /* Direction numbers of the next dimension. */
+        v += n_directions;
+    }
+}

+ 61 - 0
examples/opt/pi/SobolQRNG/sobol_gold.h

@@ -0,0 +1,61 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+/*
+ * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual property and 
+ * proprietary rights in and to this software and related documentation and 
+ * any modifications thereto.  Any use, reproduction, disclosure, or distribution 
+ * of this software and related documentation without an express license 
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ * 
+ */
+ 
+ /*
+ * Portions Copyright (c) 1993-2009 NVIDIA Corporation.  All rights reserved.
+ * Portions Copyright (c) 2009 Mike Giles, Oxford University.  All rights reserved.
+ * Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe.  All rights reserved.
+ *
+ * Sobol Quasi-random Number Generator example
+ *
+ * Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
+ * http://people.maths.ox.ac.uk/~gilesm/
+ *
+ * and C code developed by Stephen Joe, University of Waikato, New Zealand
+ * and Frances Kuo, University of New South Wales, Australia
+ * http://web.maths.unsw.edu.au/~fkuo/sobol/
+ *
+ * For theoretical background see:
+ *
+ * P. Bratley and B.L. Fox.
+ * Implementing Sobol's quasirandom sequence generator
+ * http://portal.acm.org/citation.cfm?id=42288
+ * ACM Trans. on Math. Software, 14(1):88-100, 1988
+ *
+ * S. Joe and F. Kuo.
+ * Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
+ * http://portal.acm.org/citation.cfm?id=641879
+ * ACM Trans. on Math. Software, 29(1):49-57, 2003
+ *
+ */
+
+#ifndef SOBOL_GOLD_H
+#define SOBOL_GOLD_H
+
+void initSobolDirectionVectors(int n_dimensions, unsigned int *directions);
+void sobolCPU(int n_vectors, int n_dimensions, unsigned int *directions, float *output);
+
+#endif

+ 170 - 0
examples/opt/pi/SobolQRNG/sobol_gpu.cu

@@ -0,0 +1,170 @@
+/*
+ * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual property and 
+ * proprietary rights in and to this software and related documentation and 
+ * any modifications thereto.  Any use, reproduction, disclosure, or distribution 
+ * of this software and related documentation without an express license 
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ * 
+ */
+ 
+ /*
+ * Portions Copyright (c) 1993-2009 NVIDIA Corporation.  All rights reserved.
+ * Portions Copyright (c) 2009 Mike Giles, Oxford University.  All rights reserved.
+ * Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe.  All rights reserved.
+ *
+ * Sobol Quasi-random Number Generator example
+ *
+ * Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
+ * http://people.maths.ox.ac.uk/~gilesm/
+ *
+ * and C code developed by Stephen Joe, University of Waikato, New Zealand
+ * and Frances Kuo, University of New South Wales, Australia
+ * http://web.maths.unsw.edu.au/~fkuo/sobol/
+ *
+ * For theoretical background see:
+ *
+ * P. Bratley and B.L. Fox.
+ * Implementing Sobol's quasirandom sequence generator
+ * http://portal.acm.org/citation.cfm?id=42288
+ * ACM Trans. on Math. Software, 14(1):88-100, 1988
+ *
+ * S. Joe and F. Kuo.
+ * Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
+ * http://portal.acm.org/citation.cfm?id=641879
+ * ACM Trans. on Math. Software, 29(1):49-57, 2003
+ *
+ */
+
+#include "sobol.h"
+#include "sobol_gpu.h"
+#include <starpu.h>
+#include <starpu_cuda.h>
+
+#define k_2powneg32 2.3283064E-10F
+
+__global__ void sobolGPU_kernel(unsigned n_vectors, unsigned n_dimensions, unsigned *d_directions, float *d_output)
+{   /* Generate Sobol points on the device.
+     *   n_vectors    number of points per dimension
+     *   n_dimensions not referenced in the kernel body; the dimension is
+     *                taken from blockIdx.y
+     *   d_directions direction numbers, n_directions entries per dimension
+     *   d_output     receives n_vectors floats per dimension, dimension
+     *                after dimension
+     * The threads sharing a blockIdx.y walk the points of that dimension
+     * with a step of gridDim.x * blockDim.x.
+     * NOTE(review): the code uses __ffs(stride) - 1 as log2(stride) and
+     * stride - 1 as a bit mask, which presumably requires the stride to be
+     * a power of two - confirm against the launcher. */
+    __shared__ unsigned int v[n_directions];
+
+    // Offset into the correct dimension as specified by the
+    // block y coordinate
+    d_directions = d_directions + n_directions * blockIdx.y;
+    d_output = d_output +  n_vectors * blockIdx.y;
+
+    // Copy the direction numbers for this dimension into shared
+    // memory - there are only 32 direction numbers so only the
+    // first 32 (n_directions) threads need participate.
+    if (threadIdx.x < n_directions)
+    {
+	    v[threadIdx.x] = d_directions[threadIdx.x];
+    }
+    __syncthreads(); // v[] must be completely filled before any thread reads it
+
+    // Set initial index (i.e. which vector this thread is
+    // computing first) and stride (i.e. step to the next vector
+    // for this thread)
+    int i0     = threadIdx.x + blockIdx.x * blockDim.x;
+    int stride = gridDim.x * blockDim.x;
+
+    // Get the gray code of the index
+    // c.f. Numerical Recipes in C, chapter 20
+    // http://www.nrbook.com/a/bookcpdf/c20-2.pdf
+    unsigned int g = i0 ^ (i0 >> 1);
+
+    // Initialisation for first point x[i0]
+    // In the Bratley and Fox paper this is equation (*), where
+    // we are computing the value for x[n] without knowing the
+    // value of x[n-1].
+    unsigned int X = 0;
+    unsigned int mask;
+    for (unsigned int k = 0 ; k < __ffs(stride) - 1 ; k++)
+    {
+        // We want X ^= g_k * v[k], where g_k is one or zero.
+        // We do this by setting a mask with all bits equal to
+        // g_k. In reality we keep shifting g so that g_k is the
+        // LSB of g. This way we avoid multiplication.
+        mask = - (g & 1);
+        X ^= mask & v[k];
+        g = g >> 1;
+    }
+    if (i0 < n_vectors) // threads beyond the last point store nothing
+    {
+        d_output[i0] = (float)X * k_2powneg32;
+    }
+
+    // Now do rest of points, using the stride
+    // Here we want to generate x[i] from x[i-stride] where we
+    // don't have any of the x in between, therefore we have to
+    // revisit the equation (**), this is easiest with an example
+    // so assume stride is 16.
+    // From x[n] to x[n+16] there will be:
+    //   8 changes in the first bit
+    //   4 changes in the second bit
+    //   2 changes in the third bit
+    //   1 change in the fourth
+    //   1 change in one of the remaining bits
+    //
+    // What this means is that in the equation:
+    //   x[n+1] = x[n] ^ v[p]
+    //   x[n+2] = x[n+1] ^ v[q] = x[n] ^ v[p] ^ v[q]
+    //   ...
+    // We will apply xor with v[1] eight times, v[2] four times,
+    // v[3] twice, v[4] once and one other direction number once.
+    // Since two xors cancel out, we can skip even applications
+    // and just apply xor with v[4] (i.e. log2(16)) and with
+    // the current applicable direction number.
+    // Note that all these indices count from 1, so we need to
+    // subtract 1 from them all to account for C arrays counting
+    // from zero.
+    unsigned int v_log2stridem1 = v[__ffs(stride) - 2];
+    unsigned int v_stridemask = stride - 1;
+    for (unsigned int i = i0 + stride ; i < n_vectors ; i += stride)
+    {
+        // x[i] = x[i-stride] ^ v[b] ^ v[c]
+        //  where b is log2(stride) minus 1 for C array indexing
+        //  where c is the index of the rightmost zero bit in i,
+        //  not including the bottom log2(stride) bits, minus 1
+        //  for C array indexing
+        // In the Bratley and Fox paper this is equation (**)
+        X ^= v_log2stridem1 ^ v[__ffs(~((i - stride) | v_stridemask)) - 1];
+        d_output[i] = (float)X * k_2powneg32;
+    }
+}
+
+// Host-side launcher of sobolGPU_kernel on the StarPU local CUDA stream.
+//   n_vectors    number of points to generate in each dimension
+//   n_dimensions number of dimensions (one row of blocks per dimension)
+//   d_directions device pointer to the direction numbers
+//   d_output     device pointer receiving n_vectors * n_dimensions floats
+// The launch is asynchronous; nothing is launched when n_vectors or
+// n_dimensions is not positive.
+extern "C"
+void sobolGPU(int n_vectors, int n_dimensions, unsigned int *d_directions, float *d_output)
+{
+    const int threadsperblock = 64;
+
+    // Nothing to generate. This also keeps the division below away from
+    // zero and avoids launching an empty grid.
+    if (n_vectors <= 0 || n_dimensions <= 0)
+    {
+        return;
+    }
+
+    // Set up the execution configuration
+    dim3 dimGrid;
+    dim3 dimBlock;
+
+    // This implementation of the generator outputs all the draws for
+    // one dimension in a contiguous region of memory, followed by the
+    // next dimension and so on.
+    // Therefore all threads within a block will be processing different
+    // vectors from the same dimension. As a result we want the total
+    // number of blocks to be a multiple of the number of dimensions.
+    dimGrid.y = n_dimensions;
+
+    // If the number of dimensions is large then we will set the number
+    // of blocks to equal the number of dimensions (i.e. dimGrid.x = 1)
+    // but if the number of dimensions is small (e.g. less than 32) then
+    // we'll partition the vectors across blocks (as well as threads).
+    // We also need to cap the dimGrid.x where the number of vectors
+    // is too small to be partitioned.
+    unsigned int wanted_blocks = 1 + 31 / n_dimensions;
+    if (wanted_blocks > (unsigned int)(n_vectors / threadsperblock))
+    {
+        wanted_blocks = (n_vectors + threadsperblock - 1) / threadsperblock;
+    }
+
+    // The kernel derives log2(stride) from __ffs(stride) and uses
+    // stride - 1 as a bit mask, so gridDim.x * blockDim.x has to be a
+    // power of two: round the block count up to the next power of two.
+    // Surplus threads are harmless, the kernel checks against n_vectors.
+    dimGrid.x = 1;
+    while (dimGrid.x < wanted_blocks)
+    {
+        dimGrid.x *= 2;
+    }
+
+    // Fix the number of threads
+    dimBlock.x = threadsperblock;
+
+    // Execute GPU kernel
+    sobolGPU_kernel<<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>>(n_vectors, n_dimensions, d_directions, d_output);
+}

+ 61 - 0
examples/opt/pi/SobolQRNG/sobol_gpu.h

@@ -0,0 +1,61 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+/*
+ * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual property and 
+ * proprietary rights in and to this software and related documentation and 
+ * any modifications thereto.  Any use, reproduction, disclosure, or distribution 
+ * of this software and related documentation without an express license 
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ * 
+ */
+ 
+ /*
+ * Portions Copyright (c) 1993-2009 NVIDIA Corporation.  All rights reserved.
+ * Portions Copyright (c) 2009 Mike Giles, Oxford University.  All rights reserved.
+ * Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe.  All rights reserved.
+ *
+ * Sobol Quasi-random Number Generator example
+ *
+ * Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
+ * http://people.maths.ox.ac.uk/~gilesm/
+ *
+ * and C code developed by Stephen Joe, University of Waikato, New Zealand
+ * and Frances Kuo, University of New South Wales, Australia
+ * http://web.maths.unsw.edu.au/~fkuo/sobol/
+ *
+ * For theoretical background see:
+ *
+ * P. Bratley and B.L. Fox.
+ * Implementing Sobol's quasirandom sequence generator
+ * http://portal.acm.org/citation.cfm?id=42288
+ * ACM Trans. on Math. Software, 14(1):88-100, 1988
+ *
+ * S. Joe and F. Kuo.
+ * Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
+ * http://portal.acm.org/citation.cfm?id=641879
+ * ACM Trans. on Math. Software, 29(1):49-57, 2003
+ *
+ */
+
+#ifndef SOBOL_GPU_H
+#define SOBOL_GPU_H
+
+extern "C"
+void sobolGPU(int n_vectors, int n_dimensions, unsigned int *d_directions, float *d_output);
+
+#endif

ファイルの差分が大きいため隠しています
+ 10271 - 0
examples/opt/pi/SobolQRNG/sobol_primitives.c


+ 75 - 0
examples/opt/pi/SobolQRNG/sobol_primitives.h

@@ -0,0 +1,75 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+/*
+ * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual property and 
+ * proprietary rights in and to this software and related documentation and 
+ * any modifications thereto.  Any use, reproduction, disclosure, or distribution 
+ * of this software and related documentation without an express license 
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ * 
+ */
+ 
+ /*
+ * Portions Copyright (c) 1993-2009 NVIDIA Corporation.  All rights reserved.
+ * Portions Copyright (c) 2009 Mike Giles, Oxford University.  All rights reserved.
+ * Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe.  All rights reserved.
+ *
+ * Sobol Quasi-random Number Generator example
+ *
+ * Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
+ * http://people.maths.ox.ac.uk/~gilesm/
+ *
+ * and C code developed by Stephen Joe, University of Waikato, New Zealand
+ * and Frances Kuo, University of New South Wales, Australia
+ * http://web.maths.unsw.edu.au/~fkuo/sobol/
+ *
+ * For theoretical background see:
+ *
+ * P. Bratley and B.L. Fox.
+ * Implementing Sobol's quasirandom sequence generator
+ * http://portal.acm.org/citation.cfm?id=42288
+ * ACM Trans. on Math. Software, 14(1):88-100, 1988
+ *
+ * S. Joe and F. Kuo.
+ * Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
+ * http://portal.acm.org/citation.cfm?id=641879
+ * ACM Trans. on Math. Software, 29(1):49-57, 2003
+ *
+ */
+
+#ifndef SOBOL_PRIMITIVES_H
+#define SOBOL_PRIMITIVES_H
+
+#define max_m 17
+
+/* Each primitive is stored as a struct where
+   dimension is the dimension number of the polynomial (unused)
+   degree is the degree of the polynomial
+   a is a binary word representing the coefficients: each coefficient is
+     zero or one and they are packed as the bits of this single integer
+   m is the array of m values (room for max_m entries) */
+struct primitive
+{
+    unsigned int dimension;
+    unsigned int degree;
+    unsigned int a;
+    unsigned int m[max_m];
+};
+
+extern const struct primitive sobol_primitives[];
+
+#endif

+ 175 - 0
examples/opt/pi/pi.c

@@ -0,0 +1,175 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "SobolQRNG/sobol.h"
+#include "SobolQRNG/sobol_gold.h"
+#include "pi.h"
+#include <sys/time.h>
+
+#ifdef STARPU_USE_CUDA
+void cuda_kernel(void **descr, void *cl_arg);
+#endif
+
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
+/* default value */
+static unsigned ntasks = 1024;
+
+/* CPU implementation of the Monte-Carlo task.
+ *   descr[0]  vector of Sobol direction numbers (read)
+ *   descr[1]  slice of the counter array; its first entry receives the
+ *             number of shots, out of NSHOT_PER_TASK, with x*x + y*y <= 1
+ *   cl_arg    unused
+ */
+static void cpu_kernel(void *descr[], void *cl_arg)
+{
+	unsigned *directions = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
+	unsigned nx = NSHOT_PER_TASK;
+
+	/* sobolCPU() generates n_vectors numbers for each dimension. The
+	 * integer division may round down, in which case fewer than 2*nx
+	 * numbers are produced. */
+	unsigned n_vectors = 2*nx/n_dimensions;
+	unsigned n_generated = n_vectors*n_dimensions;
+
+	TYPE *random_numbers = malloc(2*nx*sizeof(TYPE));
+	STARPU_ASSERT(random_numbers);
+	sobolCPU(n_vectors, n_dimensions, directions, random_numbers);
+
+	unsigned i;
+
+	/* Give the entries the generator did not reach a defined value, so
+	 * that the loop below never reads uninitialized heap memory. */
+	for (i = n_generated; i < 2*nx; i++)
+		random_numbers[i] = 0;
+
+	/* First half of the buffer holds the x coordinates, second half the y */
+	TYPE *random_numbers_x = &random_numbers[0];
+	TYPE *random_numbers_y = &random_numbers[nx];
+
+	unsigned current_cnt = 0;
+
+	for (i = 0; i < nx; i++)
+	{
+		TYPE x = random_numbers_x[i];
+		TYPE y = random_numbers_y[i];
+
+		TYPE dist = (x*x + y*y);
+
+		unsigned success = (dist <= 1.0);
+		current_cnt += success;
+	}
+
+	unsigned *cnt = (unsigned *)STARPU_VECTOR_GET_PTR(descr[1]);
+	*cnt = current_cnt;
+
+	free(random_numbers);
+}
+
+/* Parse the command line: "-ntasks <n>" sets the number of tasks to submit,
+ * any other argument is ignored. A missing value, or one that is not a
+ * positive number fitting in an unsigned, is reported on stderr and
+ * terminates the program. */
+static void parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++) {
+		if (strcmp(argv[i], "-ntasks") == 0) {
+			/* argv[argc] is NULL: make sure a value follows */
+			if (i + 1 >= argc) {
+				fprintf(stderr, "%s: -ntasks requires a value\n", argv[0]);
+				exit(EXIT_FAILURE);
+			}
+
+			char *argptr;
+			long value = strtol(argv[++i], &argptr, 10);
+
+			/* Reject empty or trailing garbage, non-positive values
+			 * and values that do not survive the conversion to
+			 * the unsigned counter */
+			if (argptr == argv[i] || *argptr != '\0' || value <= 0
+				|| (long)(unsigned)value != value) {
+				fprintf(stderr, "%s: invalid number of tasks '%s'\n", argv[0], argv[i]);
+				exit(EXIT_FAILURE);
+			}
+
+			ntasks = (unsigned)value;
+		}
+	}
+}
+
+/* Approximate Pi with a Monte-Carlo method: ntasks tasks each draw
+ * NSHOT_PER_TASK quasi-random points and count those with x*x + y*y <= 1.
+ * The per-task counts are gathered in cnt_array and summed on the host.
+ * Always returns 0. */
+int main(int argc, char **argv)
+{
+	unsigned i;
+
+	parse_args(argc, argv);
+
+	starpu_init(NULL);
+
+	/* Initialize the random number generator */
+	unsigned *sobol_qrng_directions = malloc(n_dimensions*n_directions*sizeof(unsigned));
+	STARPU_ASSERT(sobol_qrng_directions);
+
+	initSobolDirectionVectors(n_dimensions, sobol_qrng_directions);
+
+	/* Any worker may use that array now */
+	starpu_data_handle sobol_qrng_direction_handle;
+	starpu_vector_data_register(&sobol_qrng_direction_handle, 0,
+		(uintptr_t)sobol_qrng_directions, n_dimensions*n_directions, sizeof(unsigned));
+
+	unsigned *cnt_array = malloc(ntasks*sizeof(unsigned));
+	STARPU_ASSERT(cnt_array);
+	starpu_data_handle cnt_array_handle;
+	starpu_vector_data_register(&cnt_array_handle, 0, (uintptr_t)cnt_array, ntasks, sizeof(unsigned));
+
+	/* Use a write-through policy : when the data is modified on an
+	 * accelerator, we know that it will only be modified once and be
+	 * accessed by the CPU later on */
+	starpu_data_set_wt_mask(cnt_array_handle, (1<<0));
+
+	/* One child per task, so that each task writes its own counter */
+	struct starpu_data_filter f = {
+		.filter_func = starpu_block_filter_func_vector,
+		.nchildren = ntasks
+	};
+	
+	starpu_data_partition(cnt_array_handle, &f);
+
+	static struct starpu_perfmodel_t model = {
+		.type = STARPU_HISTORY_BASED,
+		.symbol = "monte_carlo_pi"
+	};
+
+	struct starpu_codelet_t cl = {
+		.where = STARPU_CPU|STARPU_CUDA,
+		.cpu_func = cpu_kernel,
+#ifdef STARPU_USE_CUDA
+		.cuda_func = cuda_kernel,
+#endif
+		.nbuffers = 2,
+		.model = &model
+	};
+
+	struct timeval start;
+	struct timeval end;
+
+	gettimeofday(&start, NULL);
+
+	for (i = 0; i < ntasks; i++)
+	{
+		struct starpu_task *task = starpu_task_create();
+
+		task->cl = &cl;
+
+		STARPU_ASSERT(starpu_data_get_sub_data(cnt_array_handle, 1, i));
+
+		task->buffers[0].handle = sobol_qrng_direction_handle;
+		task->buffers[0].mode   = STARPU_R;
+		task->buffers[1].handle = starpu_data_get_sub_data(cnt_array_handle, 1, i);
+		task->buffers[1].mode   = STARPU_W;
+
+		int ret = starpu_task_submit(task);
+		STARPU_ASSERT(!ret);
+	}
+
+	starpu_task_wait_for_all();
+
+	/* Get the cnt_array back in main memory */
+	starpu_data_unpartition(cnt_array_handle, 0);
+	starpu_data_unregister(cnt_array_handle);
+
+	/* Count the total number of entries */
+	unsigned long total_cnt = 0;
+	for (i = 0; i < ntasks; i++)
+		total_cnt += cnt_array[i];
+
+	gettimeofday(&end, NULL);
+
+	/* Release what was registered and allocated above; this is done after
+	 * the second timestamp so that the measured interval is unchanged. */
+	starpu_data_unregister(sobol_qrng_direction_handle);
+	free(sobol_qrng_directions);
+	free(cnt_array);
+
+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+
+	unsigned long total_shot_cnt = ntasks * NSHOT_PER_TASK;
+
+	/* Total surface : Pi * r^ 2 = Pi*1^2, total square surface : 2^2 = 4, probability to impact the disk: pi/4 */
+	/* Both counters are unsigned long, hence %lu */
+	FPRINTF(stderr, "Pi approximation : %f (%lu / %lu)\n", ((TYPE)total_cnt*4)/(total_shot_cnt), total_cnt, total_shot_cnt);
+	FPRINTF(stderr, "Total time : %f ms\n", timing/1000.0);
+	FPRINTF(stderr, "Speed : %f GShot/s\n", total_shot_cnt/(1e3*timing));
+
+	if (!getenv("STARPU_SSILENT")) starpu_display_codelet_stats(&cl);
+
+	starpu_shutdown();
+
+	return 0;
+}

+ 33 - 0
examples/opt/pi/pi.h

@@ -0,0 +1,33 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __PI_H__
+#define __PI_H__
+
+#include <starpu.h>
+#include <starpu_cuda.h>
+#include <stdio.h>
+
+#define NSHOT_PER_TASK	(16*1024*1024ULL)
+
+#define TYPE	float
+
+/* extern "C" void cuda_kernel(void *descr[], void *cl_arg); */
+
+static int n_dimensions = 100;
+
+#endif /* __PI_H__ */

+ 150 - 0
examples/opt/pi/pi_kernel.cu

@@ -0,0 +1,150 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "SobolQRNG/sobol_gpu.h"
+#include "pi.h"
+#include <starpu_cuda.h>
+
+#define MAXNBLOCKS	128
+#define MAXTHREADSPERBLOCK	256
+
+static __global__ void monte_carlo(TYPE *random_numbers_x, TYPE *random_numbers_y,
+						unsigned n, unsigned *output_cnt)
+{	/* Count, for each block, how many of the n points
+	 * (random_numbers_x[i], random_numbers_y[i]) satisfy x*x + y*y <= 1,
+	 * and store that count in output_cnt[blockIdx.x].
+	 * scnt[] is indexed by threadIdx.x, so the block must not hold more
+	 * than MAXTHREADSPERBLOCK threads. */
+	__shared__ unsigned scnt[MAXTHREADSPERBLOCK];
+
+	/* Index of this thread within the whole grid */
+	const int tid = threadIdx.x + blockIdx.x*blockDim.x;
+
+	/* Total number of threads in the grid, used as the loop step below */
+	const int nthreads = gridDim.x * blockDim.x;
+
+	/* Blank the shared mem buffer */
+	if (threadIdx.x < MAXTHREADSPERBLOCK)
+		scnt[threadIdx.x] = 0;
+
+	__syncthreads();
+	int ind;
+	for (ind = tid; ind < n; ind += nthreads)
+	{ 
+		TYPE x = random_numbers_x[ind];
+		TYPE y = random_numbers_y[ind];
+		TYPE dist = (x*x + y*y);
+
+		unsigned success = (dist <= 1.0f)?1:0;
+
+		/* Each thread only touches its own slot here */
+		scnt[threadIdx.x] += success;
+
+	}
+
+	__syncthreads();
+
+	/* Perform a reduction to compute the sum on each thread within that block */
+
+	/* NB: We assume that the number of threads per block is a power of 2 ! */
+	unsigned s;
+	for (s = blockDim.x/2; s!=0; s>>=1)
+	{
+		if (threadIdx.x < s)
+			scnt[threadIdx.x] += scnt[threadIdx.x + s];
+
+		__syncthreads();
+	}
+
+	/* report the number of successful shots in the block */
+	if (threadIdx.x == 0)
+		output_cnt[blockIdx.x] = scnt[0];
+
+	__syncthreads();
+}
+
+/* Add up the blockDim.x entries of output_cnt and store the sum in *cnt.
+ * accumulator[] is indexed by threadIdx.x, so the kernel expects at most
+ * MAXNBLOCKS threads, and the halving reduction expects their number to be a
+ * power of 2. The result is written by thread 0 only. */
+static __global__ void sum_per_block_cnt(unsigned *output_cnt, unsigned *cnt)
+{
+	__shared__ unsigned accumulator[MAXNBLOCKS];
+
+	/* Load the values from global mem: each thread fetches its own entry
+	 * instead of every thread copying the whole array */
+	accumulator[threadIdx.x] = output_cnt[threadIdx.x];
+
+	__syncthreads();
+
+	/* Perform a reduction in shared memory */
+	unsigned s;
+	for (s = blockDim.x/2; s!=0; s>>=1)
+	{
+		if (threadIdx.x < s)
+			accumulator[threadIdx.x] += accumulator[threadIdx.x + s];
+
+		__syncthreads();
+	}
+
+	/* Save the result in global memory */
+	if (threadIdx.x == 0)
+		*cnt = accumulator[0];
+}
+
+/* CUDA implementation of the Monte-Carlo task.
+ *   descr[0]  vector of Sobol direction numbers (read)
+ *   descr[1]  slice of the counter array; its first entry receives the
+ *             number of shots, out of NSHOT_PER_TASK, with x*x + y*y <= 1
+ *   cl_arg    unused
+ * The function returns once the work queued on the local stream is done.
+ */
+extern "C" void cuda_kernel(void *descr[], void *cl_arg)
+{
+	cudaError_t cures;
+
+	unsigned *directions = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
+	unsigned nx = NSHOT_PER_TASK;
+
+	/* sobolGPU() generates n_vectors numbers for each dimension. The
+	 * integer division may round down, in which case fewer than 2*nx
+	 * numbers are produced. */
+	unsigned n_vectors = 2*nx/n_dimensions;
+	unsigned n_generated = n_vectors*n_dimensions;
+
+	/* Generate Random numbers */
+	float *random_numbers = NULL;
+	cures = cudaMalloc((void **)&random_numbers, 2*nx*sizeof(float));
+	if (cures)
+		STARPU_CUDA_REPORT_ERROR(cures);
+	STARPU_ASSERT(random_numbers);
+
+	/* Give the entries the generator does not reach a defined value, so
+	 * that monte_carlo never reads uninitialized device memory. */
+	if (n_generated < 2*nx)
+	{
+		cures = cudaMemset(random_numbers + n_generated, 0, (2*nx - n_generated)*sizeof(float));
+		if (cures)
+			STARPU_CUDA_REPORT_ERROR(cures);
+	}
+	
+	sobolGPU(n_vectors, n_dimensions, directions, random_numbers);
+	cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
+	if (cures)
+		STARPU_CUDA_REPORT_ERROR(cures);
+
+	/* First half of the buffer holds the x coordinates, second half the y */
+	TYPE *random_numbers_x = &random_numbers[0];
+	TYPE *random_numbers_y = &random_numbers[nx];
+
+	unsigned *cnt = (unsigned *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	/* How many blocks do we use ? */ 
+	unsigned nblocks = 128; // TODO
+
+	STARPU_ASSERT(nblocks <= MAXNBLOCKS);
+	
+	unsigned *per_block_cnt = NULL;
+	cures = cudaMalloc((void **)&per_block_cnt, nblocks*sizeof(unsigned));
+	if (cures)
+		STARPU_CUDA_REPORT_ERROR(cures);
+
+	STARPU_ASSERT((nx % nblocks) == 0);
+
+	/* How many threads per block ? At most 256, but no more threads than
+	 * there are entries to process per block. */
+	unsigned nthread_per_block = STARPU_MIN(MAXTHREADSPERBLOCK, (nx / nblocks));
+
+	/* each entry of per_block_cnt contains the number of successful shots
+	 * in the corresponding block. */
+	monte_carlo<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(random_numbers_x, random_numbers_y, nx, per_block_cnt);
+
+	/* Note that we do not synchronize between kernel calls because there is an implicit serialization */
+
+	/* compute the total number of successful shots by adding the elements
+	 * of the per_block_cnt array */
+	sum_per_block_cnt<<<1, nblocks, 0, starpu_cuda_get_local_stream()>>>(per_block_cnt, cnt);
+	cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
+	if (cures)
+		STARPU_CUDA_REPORT_ERROR(cures);
+
+	cudaFree(per_block_cnt);
+	cudaFree(random_numbers);
+}

+ 362 - 0
examples/opt/pi/pi_redux.c

@@ -0,0 +1,362 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include <starpu_config.h>
+
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+#define PI	3.14159265358979323846
+
+#if defined(STARPU_USE_CUDA) && !defined(STARPU_HAVE_CURAND)
+#warning CURAND is required to run that example on CUDA devices
+#endif
+
+#ifdef STARPU_HAVE_CURAND
+#include <cuda.h>
+#include <curand.h>
+#include <starpu_cuda.h>
+#endif
+
+#define NSHOT_PER_TASK	(1024*1024)
+
+/* default value */
+static unsigned long ntasks = 1024;
+static unsigned long ntasks_warmup = 0;
+
+static unsigned use_redux = 1;
+static unsigned do_warmup = 0;
+
+/*
+ *	Initialization of the Random Number Generators (RNG)
+ */
+
+#ifdef STARPU_HAVE_CURAND
+/* RNG for the CURAND library */
+static curandGenerator_t curandgens[STARPU_NMAXWORKERS];
+#endif 
+
+/* state for the erand48 function : note the huge padding to avoid false-sharing */
+#define PADDING	1024
+static unsigned short xsubi[STARPU_NMAXWORKERS*PADDING];
+static struct drand48_data randbuffer[STARPU_NMAXWORKERS*PADDING];
+
+/* Function to initialize the random number generator in the current worker.
+ * main() runs it on the workers through starpu_execute_on_each_worker().
+ * A CPU worker seeds its own slot of randbuffer and xsubi with its worker id.
+ * A CUDA worker, only when CURAND is available, creates a CURAND generator
+ * and seeds it with its worker id. Any other worker type, including a CUDA
+ * worker in a build without CURAND, reaches STARPU_ABORT().
+ * arg is unused. */
+static void init_rng(void *arg __attribute__((unused)))
+{
+#ifdef STARPU_HAVE_CURAND
+	curandStatus_t res;
+#endif
+
+	int workerid = starpu_worker_get_id();
+
+	switch (starpu_worker_get_type(workerid)) {
+		case STARPU_CPU_WORKER:
+			/* create a seed */
+			starpu_srand48_r((long int)workerid, &randbuffer[PADDING*workerid]);
+
+			/* the three words of state handed to starpu_erand48_r by pi_func_cpu */
+			xsubi[0 + PADDING*workerid] = (unsigned short)workerid;
+			xsubi[1 + PADDING*workerid] = (unsigned short)workerid;
+			xsubi[2 + PADDING*workerid] = (unsigned short)workerid;
+			break;
+#ifdef STARPU_HAVE_CURAND
+		case STARPU_CUDA_WORKER:
+
+			/* Create a RNG */
+			res = curandCreateGenerator(&curandgens[workerid],
+						CURAND_RNG_PSEUDO_DEFAULT);
+			STARPU_ASSERT(res == CURAND_STATUS_SUCCESS);
+
+			/* Seed it with worker's id */
+			res = curandSetPseudoRandomGeneratorSeed(curandgens[workerid],
+							(unsigned long long)workerid);
+			STARPU_ASSERT(res == CURAND_STATUS_SUCCESS);
+			break;
+#endif
+		default:
+			STARPU_ABORT();
+			break;
+	}
+}
+
+/* Parse the command line.
+ *   -ntasks n   number of tasks to submit
+ *   -noredux    do not use reductions on the shot counter
+ *   -warmup     submit a few warmup tasks first
+ *   -h          print the usage and exit
+ * Unknown arguments are ignored. A missing, non-numeric or negative value
+ * for -ntasks prints the usage and exits. */
+static void parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++) {
+		if (strcmp(argv[i], "-ntasks") == 0) {
+			char *argptr;
+			long value = -1;
+
+			/* argv[argc] is NULL: only read the value if there is one */
+			if (i + 1 < argc) {
+				value = strtol(argv[++i], &argptr, 10);
+				if (argptr == argv[i] || *argptr != '\0')
+					value = -1;
+			}
+
+			if (value < 0) {
+				fprintf(stderr, "Usage: %s [-ntasks n] [-noredux] [-warmup] [-h]\n", argv[0]);
+				exit(-1);
+			}
+
+			ntasks = (unsigned long)value;
+		}
+		else if (strcmp(argv[i], "-noredux") == 0) {
+			use_redux = 0;
+		}
+		else if (strcmp(argv[i], "-warmup") == 0) {
+			do_warmup = 1;
+			ntasks_warmup = 8; /* arbitrary number of warmup tasks */
+		}
+		else if (strcmp(argv[i], "-h") == 0) {
+			fprintf(stderr, "Usage: %s [-ntasks n] [-noredux] [-warmup] [-h]\n", argv[0]);
+			exit(-1);
+		}
+	}
+}
+
+/*
+ *	Monte-carlo kernel
+ */
+
+/* CPU flavour of the Monte-Carlo kernel: draws NSHOT_PER_TASK points with
+ * the calling worker's generator state and adds the number of points with
+ * x*x + y*y < 1 to the counter behind descr[1]. descr[0] is not accessed. */
+static void pi_func_cpu(void *descr[], void *cl_arg __attribute__ ((unused)))
+{
+	const int me = starpu_worker_get_id();
+
+	/* Generator state owned by this worker */
+	unsigned short *seed48 = xsubi + PADDING*me;
+	struct drand48_data *rng_state = randbuffer + PADDING*me;
+
+	unsigned long hits = 0;
+	int shot = 0;
+
+	while (shot < NSHOT_PER_TASK)
+	{
+		double u, v;
+
+		/* x is drawn before y */
+		starpu_erand48_r(seed48, rng_state, &u);
+		starpu_erand48_r(seed48, rng_state, &v);
+
+		/* rescale both draws with 2*r - 1 */
+		const double px = (2.0*u - 1.0);
+		const double py = (2.0*v - 1.0);
+
+		const double sq_norm = px*px + py*py;
+		if (sq_norm < 1.0)
+			hits++;
+
+		shot++;
+	}
+
+	/* Accumulate this task's hits into the counter */
+	unsigned long *total = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[1]);
+	*total = *total + hits;
+}
+
+extern void pi_redux_cuda_kernel(float *x, float *y, unsigned n, unsigned long *shot_cnt);
+
+#ifdef STARPU_HAVE_CURAND
+/* CUDA flavour of the Monte-Carlo kernel: fills the scratchpad behind
+ * descr[0] with 2*NSHOT_PER_TASK uniform numbers from this worker's CURAND
+ * generator, then hands both halves to pi_redux_cuda_kernel together with
+ * the counter behind descr[1]. */
+static void pi_func_cuda(void *descr[], void *cl_arg __attribute__ ((unused)))
+{
+	const int me = starpu_worker_get_id();
+
+	/* CURAND is a bit silly: it assumes that any error is fatal. Calling
+	 * cudaGetLastError resets the last error value; the value itself is
+	 * deliberately not reported. */
+	cudaError_t stale_error = cudaGetLastError();
+	(void)stale_error;
+
+	/* Fill the scratchpad with random numbers. Note that both the x and
+	 * y arrays are stored in the same vector. */
+	float *xy = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
+	curandStatus_t status = curandGenerateUniform(curandgens[me], xy, 2*NSHOT_PER_TASK);
+	STARPU_ASSERT(status == CURAND_STATUS_SUCCESS);
+
+	/* x values come first, y values start NSHOT_PER_TASK entries later */
+	unsigned long *hits = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[1]);
+	pi_redux_cuda_kernel(&xy[0], &xy[NSHOT_PER_TASK], NSHOT_PER_TASK, hits);
+}
+#endif
+
+static struct starpu_codelet_t pi_cl = { /* codelet of the Monte-Carlo tasks submitted by main() */
+	.where =
+#ifdef STARPU_HAVE_CURAND
+		STARPU_CUDA|
+#endif
+		STARPU_CPU,
+	.cpu_func = pi_func_cpu,
+#ifdef STARPU_HAVE_CURAND
+	.cuda_func = pi_func_cuda,
+#endif
+	.nbuffers = 2, /* buffers[0]: x/y scratchpad, buffers[1]: shot counter */
+	.model = NULL /* no performance model attached */
+};
+
+/*
+ *	Codelets to implement reduction
+ */
+
+/* CPU initialisation codelet of the reduction: zeroes the unsigned long
+ * behind descr[0]. cl_arg is not used. */
+static void init_cpu_func(void *descr[], void *cl_arg)
+{
+        unsigned long *counter;
+
+        counter = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[0]);
+        counter[0] = 0;
+}
+
+#ifdef STARPU_HAVE_CURAND
+/* CUDA initialisation codelet of the reduction: zeroes the unsigned long
+ * behind descr[0] with cudaMemset and then calls cudaThreadSynchronize().
+ * cl_arg is not used. */
+static void init_cuda_func(void *descr[], void *cl_arg)
+{
+        unsigned long *d_counter;
+
+        d_counter = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[0]);
+
+        cudaMemset(d_counter, 0, sizeof(unsigned long));
+        cudaThreadSynchronize();
+}
+#endif
+
+static struct starpu_codelet_t init_codelet = { /* passed as init codelet to starpu_data_set_reduction_methods() in main() */
+	.where =
+#ifdef STARPU_HAVE_CURAND
+		STARPU_CUDA|
+#endif
+		STARPU_CPU,
+        .cpu_func = init_cpu_func,
+#ifdef STARPU_HAVE_CURAND
+        .cuda_func = init_cuda_func,
+#endif
+        .nbuffers = 1 /* descr[0]: the counter to reset */
+};
+
+#ifdef STARPU_HAVE_CURAND
+/* Dummy implementation of the addition of two unsigned longs in CUDA:
+ * both operands are copied to the host, added there, and the sum is copied
+ * back over the first one.
+ *   descr[0]  accumulator (read and written)
+ *   descr[1]  value to add (read)
+ *   cl_arg    unused */
+static void redux_cuda_func(void *descr[], void *cl_arg)
+{
+	unsigned long *d_a = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	unsigned long *d_b = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[1]);
+
+	unsigned long h_a, h_b;
+	
+	cudaMemcpy(&h_a, d_a, sizeof(h_a), cudaMemcpyDeviceToHost);
+	cudaMemcpy(&h_b, d_b, sizeof(h_b), cudaMemcpyDeviceToHost);
+
+	h_a += h_b;
+
+	cudaMemcpy(d_a, &h_a, sizeof(h_a), cudaMemcpyHostToDevice);
+}
+#endif
+
+/* CPU reduction codelet: adds the unsigned long behind descr[1] to the one
+ * behind descr[0]. cl_arg is unused. */
+static void redux_cpu_func(void *descr[], void *cl_arg)
+{
+	unsigned long *a = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	unsigned long *b = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[1]);
+
+	*a = *a + *b;
+}
+
+static struct starpu_codelet_t redux_codelet = { /* passed as reduction codelet to starpu_data_set_reduction_methods() in main() */
+	.where =
+#ifdef STARPU_HAVE_CURAND
+		STARPU_CUDA|
+#endif
+		STARPU_CPU,
+	.cpu_func = redux_cpu_func,
+#ifdef STARPU_HAVE_CURAND
+	.cuda_func = redux_cuda_func,
+#endif
+	.nbuffers = 2 /* descr[0]: accumulator, descr[1]: value added to it */
+};
+
+/*
+ *	Main program
+ */
+
+int main(int argc, char **argv)
+{
+	unsigned i;
+
+	parse_args(argc, argv);
+
+	starpu_init(NULL);
+
+	/* Launch a Random Number Generator (RNG) on each worker */
+	starpu_execute_on_each_worker(init_rng, NULL, STARPU_CPU|STARPU_CUDA);
+
+	/* Create a scratchpad data */
+	starpu_data_handle xy_scratchpad_handle;
+	starpu_vector_data_register(&xy_scratchpad_handle, -1, (uintptr_t)NULL,
+		2*NSHOT_PER_TASK, sizeof(float));
+
+	/* Create a variable that will be used to count the number of shots
+	 * that actually hit the unit circle when shooting randomly in
+	 * [-1,1]^2. */
+	unsigned long shot_cnt = 0;
+	starpu_data_handle shot_cnt_handle;
+	starpu_variable_data_register(&shot_cnt_handle, 0,
+			(uintptr_t)&shot_cnt, sizeof(shot_cnt));
+
+	starpu_data_set_reduction_methods(shot_cnt_handle,
+					&redux_codelet, &init_codelet);
+
+	struct timeval start;
+	struct timeval end;
+
+	for (i = 0; i < ntasks_warmup; i++)
+	{
+		struct starpu_task *task = starpu_task_create();
+
+		task->cl = &pi_cl;
+
+		task->buffers[0].handle = xy_scratchpad_handle;
+		task->buffers[0].mode   = STARPU_SCRATCH;
+		task->buffers[1].handle = shot_cnt_handle;
+		task->buffers[1].mode   = use_redux?STARPU_REDUX:STARPU_RW;
+
+		int ret = starpu_task_submit(task);
+		STARPU_ASSERT(!ret);
+	}
+
+
+	gettimeofday(&start, NULL);
+
+	for (i = 0; i < ntasks; i++)
+	{
+		struct starpu_task *task = starpu_task_create();
+
+		task->cl = &pi_cl;
+
+		task->buffers[0].handle = xy_scratchpad_handle;
+		task->buffers[0].mode   = STARPU_SCRATCH;
+		task->buffers[1].handle = shot_cnt_handle;
+		task->buffers[1].mode   = use_redux?STARPU_REDUX:STARPU_RW;
+
+		int ret = starpu_task_submit(task);
+		STARPU_ASSERT(!ret);
+	}
+
+	starpu_data_unregister(shot_cnt_handle);
+
+	gettimeofday(&end, NULL);
+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	/* Total surface : Pi * r^ 2 = Pi*1^2, total square surface : 2^2 = 4,
+	 * probability to impact the disk: pi/4 */
+	unsigned long total = (ntasks + ntasks_warmup)*NSHOT_PER_TASK;
+	double pi_approx = ((double)shot_cnt*4.0)/total;
+
+	FPRINTF(stderr, "Reductions? %s\n", use_redux?"yes":"no");
+	FPRINTF(stderr, "Pi approximation : %f (%ld / %ld)\n", pi_approx, shot_cnt, total);
+	FPRINTF(stderr, "Error %e \n", pi_approx - PI);
+	FPRINTF(stderr, "Total time : %f ms\n", timing/1000.0);
+	FPRINTF(stderr, "Speed : %f GShot/s\n", total/(1e3*timing));
+
+	starpu_shutdown();
+
+	if (abs(pi_approx - PI) > 1.0)
+		return 1;
+
+	return 0;
+}

+ 128 - 0
examples/opt/pi/pi_redux_kernel.cu

@@ -0,0 +1,128 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <starpu_cuda.h>
+
+#define MAXNBLOCKS	128
+#define MAXTHREADSPERBLOCK	256
+
+/*
+ * Count how many of the n points (x[i], y[i]), rescaled to 2*v - 1 on each
+ * axis, lie within distance 1 of the origin. One partial count per block is
+ * stored in output_cnt[blockIdx.x].
+ *
+ * The launch must not use more than MAXTHREADSPERBLOCK threads per block
+ * (this is the size of the shared buffer), and the in-block reduction assumes
+ * that blockDim.x is a power of 2.
+ */
+static __global__ void monte_carlo(float *x, float *y, unsigned n, unsigned long *output_cnt)
+{
+	__shared__ unsigned scnt[MAXTHREADSPERBLOCK];
+
+	/* Indices are unsigned like n, so that the loop below neither mixes
+	 * signed and unsigned operands nor overflows a signed integer. */
+	const unsigned tid = threadIdx.x + blockIdx.x*blockDim.x;
+	const unsigned nthreads = gridDim.x * blockDim.x;
+
+	/* Each thread counts its own successful shots in a private variable
+	 * and publishes the result in shared memory only once. */
+	unsigned local_cnt = 0;
+	unsigned ind;
+	for (ind = tid; ind < n; ind += nthreads)
+	{
+		float xval = (2.0f * x[ind] - 1.0f);
+		float yval = (2.0f * y[ind] - 1.0f);
+		float dist = (xval*xval + yval*yval);
+
+		/* Do we have a successful shot ? */
+		if (dist <= 1.0f)
+			local_cnt++;
+	}
+
+	scnt[threadIdx.x] = local_cnt;
+
+	__syncthreads();
+
+	/* Perform a reduction to compute the sum on each thread within that block */
+
+	/* NB: We assume that the number of threads per block is a power of 2 ! */
+	unsigned s;
+	for (s = blockDim.x/2; s!=0; s>>=1)
+	{
+		if (threadIdx.x < s)
+			scnt[threadIdx.x] += scnt[threadIdx.x + s];
+
+		__syncthreads();
+	}
+
+	/* report the number of successful shots in the block */
+	if (threadIdx.x == 0)
+		output_cnt[blockIdx.x] = scnt[0];
+}
+
+/*
+ * Add the blockDim.x first entries of output_cnt to *cnt.
+ *
+ * This kernel is meant to be launched with a single block of at most
+ * MAXNBLOCKS threads (size of the shared buffer), and the reduction assumes
+ * that blockDim.x is a power of 2.
+ */
+static __global__ void sum_per_block_cnt(unsigned long *output_cnt, unsigned long *cnt)
+{
+	__shared__ unsigned long accumulator[MAXNBLOCKS];
+
+	/* Load the values from global mem: one entry per thread is enough,
+	 * there is no need for every thread to copy the whole array. */
+	accumulator[threadIdx.x] = output_cnt[threadIdx.x];
+
+	__syncthreads();
+
+	/* Perform a reduction in shared memory */
+	unsigned s;
+	for (s = blockDim.x/2; s!=0; s>>=1)
+	{
+		if (threadIdx.x < s)
+			accumulator[threadIdx.x] += accumulator[threadIdx.x + s];
+
+		__syncthreads();
+	}
+
+	/* Save the result in global memory: the total is accumulated into
+	 * *cnt, whose previous value is kept. */
+	if (threadIdx.x == 0)
+		*cnt = *cnt + accumulator[0];
+}
+
+/*
+ * Host-side entry point: count how many of the n points (x[i], y[i]) are
+ * successful shots and add that number to *shot_cnt.
+ *
+ * x, y and shot_cnt are passed as-is to the kernels. n must be a multiple of
+ * the number of blocks (checked with STARPU_ASSERT). The function only
+ * returns once the kernels have completed on StarPU's local stream.
+ */
+extern "C" void pi_redux_cuda_kernel(float *x, float *y, unsigned n, unsigned long *shot_cnt)
+{
+	cudaError_t cures;
+	cudaStream_t stream = starpu_cuda_get_local_stream();
+
+	/* How many blocks do we use ? */ 
+	unsigned nblocks = 128; // TODO
+	STARPU_ASSERT(nblocks <= MAXNBLOCKS);
+	STARPU_ASSERT((n % nblocks) == 0);
+	
+	/* Temporary buffer: a failed allocation must not go unnoticed, the
+	 * kernels would otherwise write through an invalid pointer. */
+	unsigned long *per_block_cnt;
+	cures = cudaMalloc((void **)&per_block_cnt, nblocks*sizeof(unsigned long));
+	if (cures)
+		STARPU_CUDA_REPORT_ERROR(cures);
+
+	/* How many threads per block ? At most 256, but no more threads than
+	 * there are entries to process per block. */
+	unsigned nthread_per_block = STARPU_MIN(MAXTHREADSPERBLOCK, (n / nblocks));
+
+	/* each entry of per_block_cnt contains the number of successful shots
+	 * in the corresponding block. */
+	monte_carlo<<<nblocks, nthread_per_block, 0, stream>>>(x, y, n, per_block_cnt);
+
+	/* Note that we do not synchronize between kernel calls because there is an implicit serialization */
+
+	/* compute the total number of successful shots by adding the elements
+	 * of the per_block_cnt array */
+	sum_per_block_cnt<<<1, nblocks, 0, stream>>>(per_block_cnt, shot_cnt);
+	cures = cudaStreamSynchronize(stream);
+	if (cures)
+		STARPU_CUDA_REPORT_ERROR(cures);
+
+	cudaFree(per_block_cnt);
+}

+ 3 - 3
examples/ppm_downscaler/ppm_downscaler.c

@@ -76,7 +76,7 @@ struct ppm_image *file_to_ppm(char *filename)
 	unsigned i;
 	unsigned i;
 	for (i = 0; i < ppm->ncols*ppm->nlines; i++)
 	for (i = 0; i < ppm->ncols*ppm->nlines; i++)
 	{
 	{
-//		fprintf(stderr, "READ (index %d) -> r %d g %d b %d\n", i, ppm->data[i].r, ppm->data[i].g, ppm->data[i].b);
+/*		fprintf(stderr, "READ (index %d) -> r %d g %d b %d\n", i, ppm->data[i].r, ppm->data[i].g, ppm->data[i].b); */
 	}
 	}
 
 
 	fclose(file);
 	fclose(file);
@@ -136,7 +136,7 @@ void dummy_downscale(struct ppm_image *input_ppm, struct ppm_image *output_ppm)
 				{
 				{
 					unsigned index = (big_col + i)+(big_line + j)*input_ppm->ncols;
 					unsigned index = (big_col + i)+(big_line + j)*input_ppm->ncols;
 
 
-//					fprintf(stderr, "(col %d, line %d) i %d j %d index %d -> r %d g %d b %d\n", col, line, i, j, index, in[index].r, in[index].g, in[index].b);
+/*					fprintf(stderr, "(col %d, line %d) i %d j %d index %d -> r %d g %d b %d\n", col, line, i, j, index, in[index].r, in[index].g, in[index].b); */
 
 
 					sum_r += (unsigned)in[index].r;
 					sum_r += (unsigned)in[index].r;
 					sum_g += (unsigned)in[index].g;
 					sum_g += (unsigned)in[index].g;
@@ -148,7 +148,7 @@ void dummy_downscale(struct ppm_image *input_ppm, struct ppm_image *output_ppm)
 			out[col + line*output_ppm->ncols].g = (unsigned char)(sum_g/(FACTOR*FACTOR));
 			out[col + line*output_ppm->ncols].g = (unsigned char)(sum_g/(FACTOR*FACTOR));
 			out[col + line*output_ppm->ncols].b = (unsigned char)(sum_b/(FACTOR*FACTOR));
 			out[col + line*output_ppm->ncols].b = (unsigned char)(sum_b/(FACTOR*FACTOR));
 
 
-//			fprintf(stderr, "col %d line %d -> sum_r = %d out -> %d\n", col, line, sum_r, out[col + line*FACTOR].r);
+/*			fprintf(stderr, "col %d line %d -> sum_r = %d out -> %d\n", col, line, sum_r, out[col + line*FACTOR].r); */
 	
 	
 		}
 		}
 	}
 	}

+ 7 - 11
examples/ppm_downscaler/yuv_downscaler.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  *
@@ -92,16 +92,12 @@ static struct starpu_codelet_t ds_codelet = {
 /* each block contains BLOCK_HEIGHT consecutive lines */
 /* each block contains BLOCK_HEIGHT consecutive lines */
 static struct starpu_data_filter filter_y = {
 static struct starpu_data_filter filter_y = {
 	.filter_func = starpu_block_filter_func,
 	.filter_func = starpu_block_filter_func,
-	.nchildren= HEIGHT/BLOCK_HEIGHT,
-	.get_nchildren = NULL,
-	.get_child_ops = NULL
+	.nchildren= HEIGHT/BLOCK_HEIGHT
 };
 };
 	
 	
 static struct starpu_data_filter filter_uv = {
 static struct starpu_data_filter filter_uv = {
 	.filter_func = starpu_block_filter_func,
 	.filter_func = starpu_block_filter_func,
-	.nchildren = (HEIGHT/2)/BLOCK_HEIGHT,
-	.get_nchildren = NULL,
-	.get_child_ops = NULL
+	.nchildren = (HEIGHT/2)/BLOCK_HEIGHT
 };
 };
 
 
 int main(int argc, char **argv)
 int main(int argc, char **argv)
@@ -111,7 +107,7 @@ int main(int argc, char **argv)
 	
 	
 	parse_args(argc, argv);
 	parse_args(argc, argv);
 
 
-//	fprintf(stderr, "Reading input file ...\n");
+/*	fprintf(stderr, "Reading input file ...\n"); */
 
 
 	/* how many frames ? */
 	/* how many frames ? */
 	struct stat stbuf;
 	struct stat stbuf;
@@ -120,7 +116,7 @@ int main(int argc, char **argv)
 
 
 	unsigned nframes = filesize/FRAMESIZE; 
 	unsigned nframes = filesize/FRAMESIZE; 
 
 
-//	fprintf(stderr, "filesize %lx (FRAME SIZE %lx NEW SIZE %lx); nframes %d\n", filesize, FRAMESIZE, NEW_FRAMESIZE, nframes);
+/*	fprintf(stderr, "filesize %lx (FRAME SIZE %lx NEW SIZE %lx); nframes %d\n", filesize, FRAMESIZE, NEW_FRAMESIZE, nframes); */
 	assert((filesize % sizeof(struct yuv_frame)) == 0);
 	assert((filesize % sizeof(struct yuv_frame)) == 0);
 
 
 	/* fetch input data */
 	/* fetch input data */
@@ -134,7 +130,7 @@ int main(int argc, char **argv)
 	FILE *f_out = fopen(filename_out, "w+");
 	FILE *f_out = fopen(filename_out, "w+");
 	assert(f_out);
 	assert(f_out);
 
 
-//	fprintf(stderr, "Alloc output file ...\n");
+/*	fprintf(stderr, "Alloc output file ...\n"); */
 	struct yuv_new_frame *yuv_out_buffer = calloc(nframes, NEW_FRAMESIZE);
 	struct yuv_new_frame *yuv_out_buffer = calloc(nframes, NEW_FRAMESIZE);
 	assert(yuv_out_buffer);
 	assert(yuv_out_buffer);
 
 
@@ -199,7 +195,7 @@ int main(int argc, char **argv)
 
 
 	unsigned ntasks = (nblocks_y + 2*nblocks_uv)*nframes;
 	unsigned ntasks = (nblocks_y + 2*nblocks_uv)*nframes;
 
 
-	fprintf(stderr, "Start computation: there will be %d tasks for %d frames\n", ntasks, nframes);
+	fprintf(stderr, "Start computation: there will be %u tasks for %u frames\n", ntasks, nframes);
 	gettimeofday(&start, NULL);
 	gettimeofday(&start, NULL);
 
 
 	/* do the computation */
 	/* do the computation */

+ 10 - 8
examples/profiling/profiling.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -20,6 +20,8 @@
 #include <assert.h>
 #include <assert.h>
 #include <unistd.h>
 #include <unistd.h>
 
 
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
 static unsigned niter = 500;
 static unsigned niter = 500;
 
 
 void sleep_codelet(__attribute__ ((unused)) void *descr[],
 void sleep_codelet(__attribute__ ((unused)) void *descr[],
@@ -70,7 +72,7 @@ int main(int argc, char **argv)
 		int ret = starpu_task_submit(task);
 		int ret = starpu_task_submit(task);
 		if (STARPU_UNLIKELY(ret == -ENODEV))
 		if (STARPU_UNLIKELY(ret == -ENODEV))
 		{
 		{
-			fprintf(stderr, "No worker may execute this task\n");
+			FPRINTF(stderr, "No worker may execute this task\n");
 			exit(0);
 			exit(0);
 		}
 		}
 	}
 	}
@@ -97,8 +99,8 @@ int main(int argc, char **argv)
 
 
 	free(tasks);
 	free(tasks);
 
 
-	fprintf(stderr, "Avg. delay : %2.2lf us\n", (delay_sum)/niter);
-	fprintf(stderr, "Avg. length : %2.2lf us\n", (length_sum)/niter);
+	FPRINTF(stderr, "Avg. delay : %2.2lf us\n", (delay_sum)/niter);
+	FPRINTF(stderr, "Avg. length : %2.2lf us\n", (length_sum)/niter);
 
 
 	/* Display the occupancy of all workers during the test */
 	/* Display the occupancy of all workers during the test */
 	int worker;
 	int worker;
@@ -117,10 +119,10 @@ int main(int argc, char **argv)
 
 
 		char workername[128];
 		char workername[128];
 		starpu_worker_get_name(worker, workername, 128);
 		starpu_worker_get_name(worker, workername, 128);
-		fprintf(stderr, "Worker %s:\n", workername);
-		fprintf(stderr, "\ttotal time : %.2lf ms\n", total_time*1e-3);
-		fprintf(stderr, "\texec time  : %.2lf ms (%.2f %%)\n", executing_time*1e-3, executing_ratio);
-		fprintf(stderr, "\tblocked time  : %.2lf ms (%.2f %%)\n", sleeping_time*1e-3, sleeping_ratio);
+		FPRINTF(stderr, "Worker %s:\n", workername);
+		FPRINTF(stderr, "\ttotal time : %.2lf ms\n", total_time*1e-3);
+		FPRINTF(stderr, "\texec time  : %.2lf ms (%.2f %%)\n", executing_time*1e-3, executing_ratio);
+		FPRINTF(stderr, "\tblocked time  : %.2lf ms (%.2f %%)\n", sleeping_time*1e-3, sleeping_ratio);
 	}
 	}
 
 
 	starpu_shutdown();
 	starpu_shutdown();

+ 36 - 11
examples/reductions/dot_product.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -22,13 +22,15 @@
 #include <cublas.h>
 #include <cublas.h>
 #endif
 #endif
 
 
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
 static float *x;
 static float *x;
 static float *y;
 static float *y;
 static starpu_data_handle *x_handles;
 static starpu_data_handle *x_handles;
 static starpu_data_handle *y_handles;
 static starpu_data_handle *y_handles;
 
 
 static unsigned nblocks = 4096;
 static unsigned nblocks = 4096;
-static unsigned entries_per_bock = 1024;
+static unsigned entries_per_block = 1024;
 
 
 #define DOT_TYPE double
 #define DOT_TYPE double
 
 
@@ -75,9 +77,16 @@ void redux_cpu_func(void *descr[], void *cl_arg)
 	*dota = *dota + *dotb;
 	*dota = *dota + *dotb;
 }
 }
 
 
+#ifdef STARPU_USE_CUDA
+extern void redux_cuda_func(void *descr[], void *_args);
+#endif
+
 static struct starpu_codelet_t redux_codelet = {
 static struct starpu_codelet_t redux_codelet = {
-	.where = STARPU_CPU,
+	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_func = redux_cpu_func,
 	.cpu_func = redux_cpu_func,
+#ifdef STARPU_USE_CUDA
+	.cuda_func = redux_cuda_func,
+#endif
 	.nbuffers = 2
 	.nbuffers = 2
 };
 };
 
 
@@ -118,11 +127,11 @@ void dot_cuda_func(void *descr[], void *cl_arg)
 
 
 	cudaMemcpy(&current_dot, dot, sizeof(DOT_TYPE), cudaMemcpyDeviceToHost);
 	cudaMemcpy(&current_dot, dot, sizeof(DOT_TYPE), cudaMemcpyDeviceToHost);
 
 
-	int ret = cudaThreadSynchronize();
+	cudaThreadSynchronize();
 
 
 	local_dot = (DOT_TYPE)cublasSdot(n, local_x, 1, local_y, 1);
 	local_dot = (DOT_TYPE)cublasSdot(n, local_x, 1, local_y, 1);
 
 
-	//fprintf(stderr, "current_dot %f local dot %f -> %f\n", current_dot, local_dot, current_dot + local_dot);
+	/* FPRINTF(stderr, "current_dot %f local dot %f -> %f\n", current_dot, local_dot, current_dot + local_dot); */
 	current_dot += local_dot;
 	current_dot += local_dot;
 
 
 	cudaThreadSynchronize();
 	cudaThreadSynchronize();
@@ -146,15 +155,13 @@ static struct starpu_codelet_t dot_codelet = {
  *	Tasks initialization
  *	Tasks initialization
  */
  */
 
 
-extern void starpu_data_end_reduction_mode(starpu_data_handle handle);
-
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
 	starpu_init(NULL);
 	starpu_init(NULL);
 
 
 	starpu_helper_cublas_init();
 	starpu_helper_cublas_init();
 
 
-	unsigned long nelems = nblocks*entries_per_bock;
+	unsigned long nelems = nblocks*entries_per_block;
 	size_t size = nelems*sizeof(float);
 	size_t size = nelems*sizeof(float);
 
 
 	x = malloc(size);
 	x = malloc(size);
@@ -182,9 +189,9 @@ int main(int argc, char **argv)
 	for (block = 0; block < nblocks; block++)
 	for (block = 0; block < nblocks; block++)
 	{
 	{
 		starpu_vector_data_register(&x_handles[block], 0,
 		starpu_vector_data_register(&x_handles[block], 0,
-			(uintptr_t)&x[entries_per_bock*block], entries_per_bock, sizeof(float));
+			(uintptr_t)&x[entries_per_block*block], entries_per_block, sizeof(float));
 		starpu_vector_data_register(&y_handles[block], 0,
 		starpu_vector_data_register(&y_handles[block], 0,
-			(uintptr_t)&y[entries_per_bock*block], entries_per_bock, sizeof(float));
+			(uintptr_t)&y[entries_per_block*block], entries_per_block, sizeof(float));
 	}
 	}
 
 
 	starpu_variable_data_register(&dot_handle, 0, (uintptr_t)&dot, sizeof(DOT_TYPE));
 	starpu_variable_data_register(&dot_handle, 0, (uintptr_t)&dot, sizeof(DOT_TYPE));
@@ -199,6 +206,7 @@ int main(int argc, char **argv)
 		struct starpu_task *task = starpu_task_create();
 		struct starpu_task *task = starpu_task_create();
 
 
 		task->cl = &dot_codelet;
 		task->cl = &dot_codelet;
+		task->destroy = 1;
 
 
 		task->buffers[0].handle = x_handles[block];
 		task->buffers[0].handle = x_handles[block];
 		task->buffers[0].mode = STARPU_R;
 		task->buffers[0].mode = STARPU_R;
@@ -208,16 +216,33 @@ int main(int argc, char **argv)
 		task->buffers[2].mode = STARPU_REDUX;
 		task->buffers[2].mode = STARPU_REDUX;
 
 
 		int ret = starpu_task_submit(task);
 		int ret = starpu_task_submit(task);
+		if (ret == -ENODEV) goto enodev;
 		STARPU_ASSERT(!ret);
 		STARPU_ASSERT(!ret);
 	}
 	}
 
 
+	for (block = 0; block < nblocks; block++)
+	{
+		starpu_data_unregister(x_handles[block]);
+		starpu_data_unregister(y_handles[block]);
+	}
 	starpu_data_unregister(dot_handle);
 	starpu_data_unregister(dot_handle);
 
 
-	fprintf(stderr, "Reference : %e vs. %e (Delta %e)\n", reference_dot, dot, reference_dot - dot);
+	FPRINTF(stderr, "Reference : %e vs. %e (Delta %e)\n", reference_dot, dot, reference_dot - dot);
 
 
 	starpu_helper_cublas_shutdown();
 	starpu_helper_cublas_shutdown();
 
 
 	starpu_shutdown();
 	starpu_shutdown();
 
 
+	free(x);
+	free(y);
+	free(x_handles);
+	free(y_handles);
+
 	return 0;
 	return 0;
+
+enodev:
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
+	return 77;
 }
 }

+ 9 - 3
examples/reductions/minmax_reduction.c

@@ -22,6 +22,8 @@
 static unsigned nblocks = 8192;
 static unsigned nblocks = 8192;
 static unsigned entries_per_bock = 1024;
 static unsigned entries_per_bock = 1024;
 
 
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
 #define TYPE		double
 #define TYPE		double
 #define TYPE_MAX	DBL_MAX
 #define TYPE_MAX	DBL_MAX
 #define TYPE_MIN	DBL_MIN
 #define TYPE_MIN	DBL_MIN
@@ -171,15 +173,19 @@ int main(int argc, char **argv)
 		if (ret)
 		if (ret)
 		{
 		{
 			STARPU_ASSERT(ret == -ENODEV);
 			STARPU_ASSERT(ret == -ENODEV);
-			fprintf(stderr, "This test can only run on CPUs, but there are no CPU workers (this is not a bug).\n");
+			FPRINTF(stderr, "This test can only run on CPUs, but there are no CPU workers (this is not a bug).\n");
 			return 0;
 			return 0;
 		}
 		}
 	}
 	}
 
 
+	for (block = 0; block < nblocks; block++)
+	{
+		starpu_data_unregister(x_handles[block]);
+	}
 	starpu_data_unregister(minmax_handle);
 	starpu_data_unregister(minmax_handle);
 
 
-	fprintf(stderr, "Min : %e\n", minmax[0]);
-	fprintf(stderr, "Max : %e\n", minmax[1]);
+	FPRINTF(stderr, "Min : %e\n", minmax[0]);
+	FPRINTF(stderr, "Max : %e\n", minmax[1]);
 
 
 	STARPU_ASSERT(minmax[0] <= minmax[1]);
 	STARPU_ASSERT(minmax[0] <= minmax[1]);
 
 

+ 12 - 6
examples/scheduler/dummy_sched.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2011  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -19,6 +19,7 @@
 #include <starpu.h>
 #include <starpu.h>
 
 
 #define NTASKS	32000
 #define NTASKS	32000
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
 
 
 struct starpu_task_list sched_list;
 struct starpu_task_list sched_list;
 
 
@@ -38,7 +39,7 @@ static void init_dummy_sched(struct starpu_machine_topology_s *topology,
 	for (workerid = 0; workerid < topology->nworkers; workerid++)
 	for (workerid = 0; workerid < topology->nworkers; workerid++)
 		starpu_worker_set_sched_condition(workerid, &sched_cond, &sched_mutex);
 		starpu_worker_set_sched_condition(workerid, &sched_cond, &sched_mutex);
 
 
-	fprintf(stderr, "Initialising Dummy scheduler\n");
+	FPRINTF(stderr, "Initialising Dummy scheduler\n");
 }
 }
 
 
 static void deinit_dummy_sched(struct starpu_machine_topology_s *topology,
 static void deinit_dummy_sched(struct starpu_machine_topology_s *topology,
@@ -49,7 +50,7 @@ static void deinit_dummy_sched(struct starpu_machine_topology_s *topology,
 	pthread_cond_destroy(&sched_cond);
 	pthread_cond_destroy(&sched_cond);
 	pthread_mutex_destroy(&sched_mutex);
 	pthread_mutex_destroy(&sched_mutex);
 
 
-	fprintf(stderr, "Destroying Dummy scheduler\n");
+	FPRINTF(stderr, "Destroying Dummy scheduler\n");
 }
 }
 
 
 static int push_task_dummy(struct starpu_task *task)
 static int push_task_dummy(struct starpu_task *task)
@@ -80,7 +81,6 @@ static struct starpu_sched_policy_s dummy_sched_policy = {
 	.init_sched = init_dummy_sched,
 	.init_sched = init_dummy_sched,
 	.deinit_sched = deinit_dummy_sched,
 	.deinit_sched = deinit_dummy_sched,
 	.push_task = push_task_dummy,
 	.push_task = push_task_dummy,
-	.push_prio_task = NULL,
 	.pop_task = pop_task_dummy,
 	.pop_task = pop_task_dummy,
 	.post_exec_hook = NULL,
 	.post_exec_hook = NULL,
 	.pop_every_task = NULL,
 	.pop_every_task = NULL,
@@ -118,10 +118,16 @@ static starpu_codelet dummy_codelet =
 
 
 int main(int argc, char **argv)
 int main(int argc, char **argv)
 {
 {
+	int ntasks = NTASKS;
+
 	starpu_init(&conf);
 	starpu_init(&conf);
 
 
+#ifdef STARPU_SLOW_MACHINE
+	ntasks /= 100;
+#endif
+
 	unsigned i;
 	unsigned i;
-	for (i = 0; i < NTASKS; i++)
+	for (i = 0; i < ntasks; i++)
 	{
 	{
 		struct starpu_task *task = starpu_task_create();
 		struct starpu_task *task = starpu_task_create();
 	
 	

+ 51 - 0
examples/socl/Makefile.am

@@ -0,0 +1,51 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
+# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+AM_CFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
+LIBS = $(top_builddir)/socl/src/libsocl.la
+AM_CPPFLAGS = -I$(top_srcdir)/socl/include/ 
+AM_LDFLAGS = $(STARPU_CUDA_LDFLAGS) $(STARPU_OPENCL_LDFLAGS)
+
+
+SOCL_EXAMPLES	=
+TESTS		=	$(SOCL_EXAMPLES)
+
+# The programs run by "make check" are the SOCL examples listed below
+# (STARPU_EXAMPLES is not defined in this Makefile).
+check_PROGRAMS	=	$(SOCL_EXAMPLES)
+
+examplebindir = $(libdir)/starpu/examples/socl/
+examplebin_PROGRAMS =
+
+
+examplebin_PROGRAMS +=				\
+	basic/basic		\
+	mandelbrot/mandelbrot		\
+	clinfo/clinfo
+
+
+SOCL_EXAMPLES +=				\
+	basic/basic		\
+	mandelbrot/mandelbrot		\
+	clinfo/clinfo
+
+basic_basic_SOURCES = basic/basic.c
+clinfo_clinfo_SOURCES = clinfo/clinfo.c
+mandelbrot_mandelbrot_SOURCES = mandelbrot/mandelbrot.c
+
+mandelbrot_mandelbrot_CPPFLAGS = $(AM_CFLAGS) $(AM_CPPFLAGS)
+# X11 is optional: only add its flags and libraries when configure found it
+if HAVE_X11
+mandelbrot_mandelbrot_CPPFLAGS += $(X_CFLAGS)
+mandelbrot_mandelbrot_LDADD = $(X_PRE_LIBS) -lX11 $(X_LIBS) $(X_EXTRA_LIBS)
+endif

+ 211 - 0
examples/socl/basic/basic.c

@@ -0,0 +1,211 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010,2011 University of Bordeaux
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <CL/cl.h>
+
+#define error(...) do { fprintf(stderr, "Error: " __VA_ARGS__); exit(EXIT_FAILURE); } while(0)
+#define check(err, str) do { if(err != CL_SUCCESS) { fprintf(stderr, "OpenCL Error (%d): %s\n",err, str); exit(EXIT_FAILURE); }} while(0)
+
+#ifdef UNUSED
+#elif defined(__GNUC__)
+# define UNUSED(x) UNUSED_ ## x __attribute__((unused))
+#else
+# define UNUSED(x) x
+#endif
+
+#define SIZE 1024
+#define TYPE float
+#define REALSIZE (SIZE * sizeof(TYPE))
+
+const char * kernel_src = "__kernel void add(__global float*s1, __global float*s2, __global float*d) { \
+   size_t x = get_global_id(0);\
+   size_t y = get_global_id(1);\
+   size_t w = get_global_size(0); \
+   int idx = y*w+x; \
+   d[idx] = s1[idx] + s2[idx];\
+}";
+
+
+
+/*
+ * Minimal OpenCL example: add two vectors of SIZE floats on the first GPU
+ * device found and print the result.
+ *
+ * Exits with status 0 when no OpenCL platform is available, and with
+ * EXIT_FAILURE on any OpenCL error or when no GPU device is found.
+ */
+int main(int UNUSED(argc), char** UNUSED(argv)) {
+   cl_platform_id platforms[15];
+   /* Initialised so that a failing query cannot leave garbage behind */
+   cl_uint num_platforms = 0;
+   cl_device_id devices[15];
+   cl_uint num_devices = 0;
+   const cl_uint max_platforms = sizeof(platforms)/sizeof(platforms[0]);
+   const cl_uint max_devices = sizeof(devices)/sizeof(devices[0]);
+   cl_context context;
+   cl_program program;
+   cl_kernel kernel;
+   cl_mem s1m, s2m, dm;
+   cl_command_queue cq;
+   cl_int err;
+
+   TYPE s1[SIZE],s2[SIZE],d[SIZE];
+
+   {
+      int i;
+      for (i=0; i<SIZE; i++) {
+         s1[i] = 2.0;
+         s2[i] = 7.0;
+         d[i] = 98.0;
+      }
+   }
+
+   printf("Querying platform...\n");
+   err = clGetPlatformIDs(0, NULL, &num_platforms);
+   if (num_platforms == 0) {
+      printf("No OpenCL platform found. If you use SOCL, this could mean StarPU wasn't configured for OpenCL. Try disabling CUDA support in StarPU (export STARPU_NCUDA=0).\n");
+      exit(0);
+   }
+   /* Only the platforms that fit in the array are retrieved below: never
+    * iterate past the end of platforms[] */
+   if (num_platforms > max_platforms)
+      num_platforms = max_platforms;
+   err = clGetPlatformIDs(max_platforms, platforms, NULL);
+   check(err, "clGetPlatformIDs");
+
+   printf("Querying devices...\n");
+   unsigned int platform_idx;
+   for (platform_idx=0; platform_idx<num_platforms; platform_idx++) {
+      err = clGetDeviceIDs(platforms[platform_idx], CL_DEVICE_TYPE_GPU, max_devices, devices, &num_devices);
+      /* A platform without any GPU is not an error: try the next one */
+      if (err == CL_DEVICE_NOT_FOUND) {
+         num_devices = 0;
+         continue;
+      }
+      check(err, "clGetDeviceIDs");
+      if (num_devices != 0)
+         break;
+   }
+   if (num_devices == 0)
+      error("No OpenCL device found\n");
+   /* num_devices is the number of available devices, which may exceed what
+    * was actually stored in devices[] */
+   if (num_devices > max_devices)
+      num_devices = max_devices;
+
+   printf("Creating context...\n");
+   cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[platform_idx], 0};
+   context = clCreateContext(properties, num_devices, devices, NULL, NULL, &err);
+   check(err, "clCreateContext");
+
+   printf("Creating program...\n");
+   program = clCreateProgramWithSource(context, 1, &kernel_src, NULL, &err);
+   check(err, "clCreateProgram");
+
+   printf("Building program...\n");
+   err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
+   check(err, "clBuildProgram");
+
+   printf("Creating kernel...\n");
+   kernel = clCreateKernel(program, "add", &err);
+   check(err, "clCreateKernel");
+
+   printf("Creating buffers...\n");
+   s1m = clCreateBuffer(context, CL_MEM_READ_WRITE, REALSIZE, NULL, &err);
+   check(err, "clCreateBuffer s1");
+   s2m = clCreateBuffer(context, CL_MEM_READ_ONLY, REALSIZE, NULL, &err);
+   check(err, "clCreateBuffer s2");
+   dm = clCreateBuffer(context, CL_MEM_WRITE_ONLY, REALSIZE, NULL, &err);
+   check(err, "clCreateBuffer d");
+
+   printf("Creating command queue...\n");
+   cl_event eventW1, eventW2, eventK, eventR;
+
+#ifdef PROFILING
+   cq = clCreateCommandQueue(context, devices[0], CL_QUEUE_PROFILING_ENABLE, &err);
+#else
+   cq = clCreateCommandQueue(context, devices[0], 0, &err);
+#endif
+   check(err, "clCreateCommandQueue");
+
+   printf("Enqueueing WriteBuffers...\n");
+   err = clEnqueueWriteBuffer(cq, s1m, CL_FALSE, 0, REALSIZE, s1, 0, NULL, &eventW1);
+   check(err, "clEnqueueWriteBuffer s1");
+   err = clEnqueueWriteBuffer(cq, s2m, CL_FALSE, 0, REALSIZE, s2, 0, NULL, &eventW2);
+   check(err, "clEnqueueWriteBuffer s2");
+
+   printf("Setting kernel arguments...\n");
+   err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &s1m);
+   check(err, "clSetKernelArg 0");
+   err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &s2m);
+   check(err, "clSetKernelArg 1");
+   err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &dm);
+   check(err, "clSetKernelArg 2");
+
+   printf("Enqueueing NDRangeKernel...\n");
+   size_t local[3] = {16, 1, 1};
+   size_t global[3] = {1024, 1, 1};
+   /* The kernel must not start before both input buffers are written */
+   cl_event deps[] = {eventW1,eventW2};
+   err = clEnqueueNDRangeKernel(cq, kernel, 3, NULL, global, local, 2, deps, &eventK);
+   check(err, "clEnqueueNDRangeKernel");
+
+   printf("Enqueueing ReadBuffer...\n");
+   err = clEnqueueReadBuffer(cq, dm, CL_FALSE, 0, REALSIZE, d, 0, NULL, &eventR);
+   check(err, "clEnqueueReadBuffer");
+
+   /* The read above is non-blocking: d is only valid once the queue has
+    * been drained */
+   err = clFinish(cq);
+   check(err, "clFinish");
+
+   {
+      int i;
+      for (i=0; i<SIZE; i++) {
+        printf("%f ", d[i]);
+      }
+      printf("\n");
+   }
+
+#ifdef PROFILING
+   /* Print the time elapsed between the start and the end of a command.
+    * cl_ulong is a 64-bit type, hence the cast to unsigned long long. */
+   #define DURATION(event,label) do { \
+      cl_ulong t0,t1; \
+      err = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &t0, NULL);\
+      check(err, "clGetEventProfilingInfo");\
+      err = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &t1, NULL);\
+      check(err, "clGetEventProfilingInfo");\
+      printf("Profiling %s: %llu nanoseconds\n", label, (unsigned long long)(t1-t0));\
+   } while (0)
+
+   DURATION(eventW1, "first buffer writing");
+   DURATION(eventW2, "second buffer writing");
+   DURATION(eventK, "kernel execution");
+   DURATION(eventR, "result buffer reading");
+#endif
+
+   
+   printf("Releasing events...\n");
+   err = clReleaseEvent(eventW1);
+   err |= clReleaseEvent(eventW2);
+   err |= clReleaseEvent(eventK);
+   err |= clReleaseEvent(eventR);
+   check(err, "clReleaseEvent");
+
+   printf("Releasing command queue...\n");
+   err = clReleaseCommandQueue(cq);
+   check(err, "clReleaseCommandQueue");
+
+   printf("Releasing buffers...\n");
+   err = clReleaseMemObject(s1m);
+   check(err, "clReleaseMemObject s1");
+   err = clReleaseMemObject(s2m);
+   check(err, "clReleaseMemObject s2");
+   err = clReleaseMemObject(dm);
+   check(err, "clReleaseMemObject d");
+
+   printf("Releasing kernel...\n");
+   err = clReleaseKernel(kernel);
+   check(err, "clReleaseKernel");
+
+   printf("Releasing program...\n");
+   err = clReleaseProgram(program);
+   check(err, "clReleaseProgram");
+
+   printf("Releasing context...\n");
+   err = clReleaseContext(context);
+   check(err, "clReleaseContext");
+
+   return 0;
+}

+ 299 - 0
examples/socl/clinfo/clinfo.c

@@ -0,0 +1,299 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010,2011 University of Bordeaux
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <CL/cl.h>
+
+/*
+ * Abort the program if an OpenCL call did not succeed.
+ *
+ *   err  - status code to compare against CL_SUCCESS
+ *   name - text identifying the failing call, echoed in the error message
+ *
+ * On failure prints "ERROR: <name> (<err>)" to stderr and exits with
+ * status 1; on success returns without doing anything.
+ *
+ * The function is static: in C99 and later a file-scope definition marked
+ * only `inline` does not provide an external definition, so any call the
+ * compiler chooses not to inline would be left unresolved at link time.
+ */
+static inline
+void
+checkErr(cl_int err, const char * name) {
+    if (err != CL_SUCCESS) {
+        fprintf(stderr, "ERROR: %s (%d)\n", name, err);
+        exit(1);
+    }
+}
+
+/*
+ * Allocate a zero-initialised array of `count` elements of `size` bytes each.
+ * If the allocation fails, print an error mentioning `what` and exit(1).
+ * At least one element is always requested, so the result is never NULL.
+ */
+static void *
+allocOrDie(size_t count, size_t size, const char * what) {
+    void *mem = calloc(count ? count : 1, size);
+    if (mem == NULL) {
+        fprintf(stderr, "ERROR: out of memory (%s)\n", what);
+        exit(1);
+    }
+    return mem;
+}
+
+/*
+ * Return a string-valued platform attribute in a freshly allocated,
+ * NUL-terminated buffer; the caller must free() it.
+ * The required size is queried first, so long values (e.g. extension lists)
+ * do not fail with CL_INVALID_VALUE. `what` labels any error via checkErr().
+ */
+static char *
+getPlatformString(cl_platform_id platform, cl_platform_info param, const char * what) {
+    size_t len = 0;
+    char *val;
+    cl_int err;
+
+    err = clGetPlatformInfo(platform, param, 0, NULL, &len);
+    checkErr(err, what);
+
+    // One extra zeroed byte guarantees termination whatever the driver writes
+    val = allocOrDie(len + 1, sizeof(char), what);
+    if (len > 0) {
+        err = clGetPlatformInfo(platform, param, len, val, NULL);
+        checkErr(err, what);
+    }
+    return val;
+}
+
+/*
+ * Return a string-valued device attribute in a freshly allocated,
+ * NUL-terminated buffer; the caller must free() it.
+ * Works like getPlatformString(): size query first, then the actual read.
+ */
+static char *
+getDeviceString(cl_device_id device, cl_device_info param, const char * what) {
+    size_t len = 0;
+    char *val;
+    cl_int err;
+
+    err = clGetDeviceInfo(device, param, 0, NULL, &len);
+    checkErr(err, what);
+
+    // One extra zeroed byte guarantees termination whatever the driver writes
+    val = allocOrDie(len + 1, sizeof(char), what);
+    if (len > 0) {
+        err = clGetDeviceInfo(device, param, len, val, NULL);
+        checkErr(err, what);
+    }
+    return val;
+}
+
+/*
+ * Print a summary of every OpenCL platform, then for each platform the
+ * properties of each of its devices. Any failing OpenCL call terminates the
+ * program through checkErr(). Returns 0 on success.
+ */
+int
+main(void) {
+   cl_int err;
+   cl_uint num_platforms;
+   cl_platform_id *platforms = NULL;
+
+   // Platform info
+   err = clGetPlatformIDs(0, NULL, &num_platforms);
+   checkErr(err, "Unable to get platform count");
+
+   // Asking for zero entries with a non-NULL array is an error, so only fetch a non-empty list
+   if (num_platforms > 0) {
+      platforms = allocOrDie(num_platforms, sizeof(cl_platform_id), "platform list");
+      err = clGetPlatformIDs(num_platforms, platforms, NULL);
+      checkErr(err, "Unable to get platform list");
+   }
+
+// Print one string attribute of platforms[i] using the printf format `str`
+#define GET_PLATFORM_STRING(CL_P,str) { \
+   char *val = getPlatformString(platforms[i], CL_P, "clGetPlatformInfo(" #CL_P ")"); \
+   printf(str, val); \
+   free(val); \
+}
+
+   // Iterate over platforms
+   printf("Number of platforms:\t\t\t\t %u\n", num_platforms);
+
+   {
+      unsigned int i;
+      for (i=0; i<num_platforms; i++) {
+         GET_PLATFORM_STRING(CL_PLATFORM_PROFILE, "  Plaform Profile:\t\t\t\t %s\n")
+         GET_PLATFORM_STRING(CL_PLATFORM_VERSION, "  Plaform Version:\t\t\t\t %s\n")
+         GET_PLATFORM_STRING(CL_PLATFORM_NAME, "  Plaform Name:\t\t\t\t\t %s\n")
+         GET_PLATFORM_STRING(CL_PLATFORM_VENDOR, "  Plaform Vendor:\t\t\t\t %s\n")
+         GET_PLATFORM_STRING(CL_PLATFORM_EXTENSIONS, "  Plaform Extensions:\t\t\t %s\n")
+      }
+   }
+
+   printf("\n\n");
+
+   // Now iterate over each platform and its devices
+   {
+      unsigned int i;
+      for (i=0; i<num_platforms; i++) {
+         cl_device_id * devices = NULL;
+         cl_uint num_devices = 0;
+
+         GET_PLATFORM_STRING(CL_PLATFORM_NAME, "  Plaform Name:\t\t\t\t\t %s\n")
+
+         err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
+         if (err == CL_DEVICE_NOT_FOUND) {
+            // A platform without devices is shown as an empty list instead of aborting
+            num_devices = 0;
+         } else {
+            checkErr(err, "clGetDeviceIds(CL_DEVICE_TYPE_ALL)");
+         }
+
+         if (num_devices > 0) {
+            devices = allocOrDie(num_devices, sizeof(cl_device_id), "device list");
+            err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
+            checkErr(err, "clGetDeviceIds(CL_DEVICE_TYPE_ALL)");
+         }
+
+         printf("  Number of devices:\t\t\t\t %u\n", num_devices);
+         {
+            unsigned int j;
+            for (j=0; j<num_devices; j++) {
+               cl_device_type dev_type;
+               printf("\n  DEVICE %u\n", j);
+
+               err = clGetDeviceInfo(devices[j], CL_DEVICE_TYPE, sizeof(dev_type), &dev_type, NULL);
+               checkErr(err, "clGetDeviceInfo(CL_DEVICE_TYPE)");
+
+               // The device type is a bitfield: report every flag that is set
+               printf("  Device Type:\t\t\t\t\t ");
+               if (dev_type & CL_DEVICE_TYPE_ACCELERATOR)
+                  printf("CL_DEVICE_TYPE_ACCELERATOR ");
+               if (dev_type & CL_DEVICE_TYPE_CPU)
+                  printf("CL_DEVICE_TYPE_CPU ");
+               if (dev_type & CL_DEVICE_TYPE_GPU)
+                  printf("CL_DEVICE_TYPE_GPU ");
+               if (dev_type & CL_DEVICE_TYPE_DEFAULT)
+                  printf("CL_DEVICE_TYPE_DEFAULT ");
+
+               printf("\n");
+
+               {
+                  cl_uint vendor_id;
+                  err = clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR_ID, sizeof(vendor_id), &vendor_id, NULL);
+                  checkErr(err, "clGetDeviceInfo(CL_DEVICE_VENDOR_ID)");
+                  printf("  Device ID:\t\t\t\t\t %u\n", vendor_id);
+               }
+               {
+                  cl_uint units;
+                  err = clGetDeviceInfo(devices[j], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(units), &units, NULL);
+                  checkErr(err, "clGetDeviceInfo(CL_DEVICE_MAX_COMPUTE_UNITS)");
+                  printf("  Max compute units:\t\t\t\t %u\n", units);
+               }
+
+               {
+                  cl_uint dims;
+                  size_t *sizes;
+                  err = clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(dims), &dims, NULL);
+                  checkErr(err, "clGetDeviceInfo(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS)");
+                  printf("  Max work item dimensions:\t\t\t %u\n", dims);
+
+                  sizes = allocOrDie(dims, sizeof(size_t), "work item sizes");
+                  err = clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*dims, sizes, NULL);
+                  checkErr(err, "clGetDeviceInfo(CL_DEVICE_MAX_WORK_ITEM_SIZES)");
+
+                  {
+                     unsigned int k;
+                     printf("    Max work items:\t\t\t\t (");
+                     for (k=0; k<dims; k++) {
+                        printf("%u", (unsigned int)sizes[k]);
+                        if (k != dims-1)
+                           printf(",");
+                     }
+                     printf(")\n");
+                  }
+
+                  free(sizes);
+               }
+
+// The macros below query one attribute of devices[j] and print it with the
+// printf format `str`; they rely on `err`, `devices` and `j` being in scope.
+
+#define GET_SIZET(CL_D,str) { \
+   size_t val; \
+   err = clGetDeviceInfo(devices[j], CL_D, sizeof(val), &val, NULL); \
+   checkErr(err, "clGetDeviceInfo(" #CL_D ")"); \
+   printf(str, (unsigned int)val); \
+}
+
+#define GET_STRING(CL_D,str) { \
+   char *val = getDeviceString(devices[j], CL_D, "clGetDeviceInfo(" #CL_D ")"); \
+   printf(str, val); \
+   free(val); \
+}
+
+#define GET_UINT(CL_D,str) { \
+   cl_uint val; \
+   err = clGetDeviceInfo(devices[j], CL_D, sizeof(val), &val, NULL); \
+   checkErr(err, "clGetDeviceInfo(" #CL_D ")"); \
+   printf(str, val); \
+}
+
+/* cl_ulong is 64 bits wide everywhere, so print it as unsigned long long (%llu) */
+#define GET_ULONG(CL_D,str) { \
+   cl_ulong val; \
+   err = clGetDeviceInfo(devices[j], CL_D, sizeof(val), &val, NULL); \
+   checkErr(err, "clGetDeviceInfo(" #CL_D ")"); \
+   printf(str, (unsigned long long)val); \
+}
+
+#define GET_BOOL(CL_D,str) { \
+   cl_bool val; \
+   err = clGetDeviceInfo(devices[j], CL_D, sizeof(val), &val, NULL); \
+   checkErr(err, "clGetDeviceInfo(" #CL_D ")"); \
+   printf(str, (val == CL_TRUE ? "Yes" : "No")); \
+}
+
+#define GET_BOOL_CUSTOM(CL_D,str,t,f) { \
+   cl_bool val; \
+   err = clGetDeviceInfo(devices[j], CL_D, sizeof(val), &val, NULL); \
+   checkErr(err, "clGetDeviceInfo(" #CL_D ")"); \
+   printf(str, (val == CL_TRUE ? t : f)); \
+}
+
+/* Test for a non-zero mask result: comparing it with CL_TRUE (1) only matches bit 0 */
+#define GET_BITSET_AND(TYPE,CL_D,test,str) { \
+   TYPE val; \
+   err = clGetDeviceInfo(devices[j], CL_D, sizeof(val), &val, NULL); \
+   checkErr(err, "clGetDeviceInfo(" #CL_D ")"); \
+   printf(str, ((val & (test)) != 0 ? "Yes" : "No")); \
+}
+
+               GET_SIZET(CL_DEVICE_MAX_WORK_GROUP_SIZE, "  Max work group size:\t\t\t\t %u\n")
+
+               GET_UINT(CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, "  Preferred vector width char:\t\t\t %u\n")
+               GET_UINT(CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, "  Preferred vector width short:\t\t\t %u\n")
+               GET_UINT(CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, "  Preferred vector width int:\t\t\t %u\n")
+               GET_UINT(CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, "  Preferred vector width long:\t\t\t %u\n")
+               GET_UINT(CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, "  Preferred vector width float:\t\t\t %u\n")
+               GET_UINT(CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, "  Preferred vector width double:\t\t %u\n")
+               GET_UINT(CL_DEVICE_MAX_CLOCK_FREQUENCY, "  Max clock frequency:\t\t\t\t %uMHz\n")
+               GET_UINT(CL_DEVICE_ADDRESS_BITS, "  Address bits:\t\t\t\t\t %ubits\n")
+               GET_ULONG(CL_DEVICE_MAX_MEM_ALLOC_SIZE, "  Max memory allocation:\t\t\t %llu bytes\n")
+
+               GET_BOOL(CL_DEVICE_IMAGE_SUPPORT, "  Image support:\t\t\t\t %s\n")
+
+               GET_SIZET(CL_DEVICE_MAX_PARAMETER_SIZE, "  Max size of kernel argument:\t\t\t %u\n")
+               GET_UINT(CL_DEVICE_MEM_BASE_ADDR_ALIGN, "  Alignment of base addres:\t\t\t %u bits\n")
+               GET_UINT(CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, "  Minimum alignment for any datatype:\t\t %u bytes\n")
+
+               printf("  Single precision floating point capability\n");
+               GET_BITSET_AND(cl_device_fp_config,CL_DEVICE_SINGLE_FP_CONFIG, CL_FP_DENORM, "    Denorms:\t\t\t\t\t %s\n")
+               GET_BITSET_AND(cl_device_fp_config,CL_DEVICE_SINGLE_FP_CONFIG, CL_FP_INF_NAN, "    Quiet NaNs:\t\t\t\t\t %s\n")
+               GET_BITSET_AND(cl_device_fp_config,CL_DEVICE_SINGLE_FP_CONFIG, CL_FP_ROUND_TO_NEAREST, "    Round to nearest even:\t\t\t %s\n")
+               GET_BITSET_AND(cl_device_fp_config,CL_DEVICE_SINGLE_FP_CONFIG, CL_FP_ROUND_TO_ZERO, "    Round to zero:\t\t\t\t %s\n")
+               GET_BITSET_AND(cl_device_fp_config,CL_DEVICE_SINGLE_FP_CONFIG, CL_FP_ROUND_TO_INF, "    Round to +ve and infinity:\t\t\t %s\n")
+               GET_BITSET_AND(cl_device_fp_config,CL_DEVICE_SINGLE_FP_CONFIG, CL_FP_FMA, "    IEEE754-2008 fused multiply-add:\t\t %s\n")
+
+               {
+                  cl_device_mem_cache_type cache;
+                  err = clGetDeviceInfo(devices[j], CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, sizeof(cache), &cache, NULL);
+                  checkErr(err, "clGetDeviceInfo(CL_DEVICE_GLOBAL_MEM_CACHE_TYPE)");
+                  printf("  Cache type:\t\t\t\t\t ");
+                  switch (cache) {
+                     case CL_NONE:
+                        printf("None\n");
+                        break;
+                     case CL_READ_ONLY_CACHE:
+                        printf("Read only\n");
+                        break;
+                     case CL_READ_WRITE_CACHE:
+                        printf("Read/Write\n");
+                        break;
+                     default:
+                        // Keep the output line terminated for values this tool does not know
+                        printf("Unknown\n");
+                        break;
+                  }
+               }
+
+               GET_UINT(CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, "  Cache line size:\t\t\t\t %u bytes\n")
+               GET_ULONG(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, "  Cache size:\t\t\t\t\t %llu bytes\n")
+               GET_ULONG(CL_DEVICE_GLOBAL_MEM_SIZE, "  Global memory size:\t\t\t\t %llu bytes\n")
+               GET_ULONG(CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, "  Constant buffer size:\t\t\t\t %llu bytes\n")
+               GET_UINT(CL_DEVICE_MAX_CONSTANT_ARGS, "  Max number of constant args:\t\t\t %u\n")
+
+               {
+                  cl_device_local_mem_type cache;
+                  err = clGetDeviceInfo(devices[j], CL_DEVICE_LOCAL_MEM_TYPE, sizeof(cache), &cache, NULL);
+                  checkErr(err, "clGetDeviceInfo(CL_DEVICE_LOCAL_MEM_TYPE)");
+                  printf("  Local memory type:\t\t\t\t ");
+                  switch (cache) {
+                     case CL_LOCAL:
+                        printf("Local\n");
+                        break;
+                     case CL_GLOBAL:
+                        printf("Global\n");
+                        break;
+                     default:
+                        // Keep the output line terminated for values this tool does not know
+                        printf("Unknown\n");
+                        break;
+                  }
+               }
+
+               GET_ULONG(CL_DEVICE_LOCAL_MEM_SIZE, "  Local memory size:\t\t\t\t %llu bytes\n")
+               GET_SIZET(CL_DEVICE_PROFILING_TIMER_RESOLUTION, "  Profiling timer resolution:\t\t\t %u\n")
+               GET_BOOL_CUSTOM(CL_DEVICE_ENDIAN_LITTLE, "  Device endianess:\t\t\t\t %s\n", "Little", "Big")
+               GET_BOOL(CL_DEVICE_AVAILABLE, "  Available:\t\t\t\t\t %s\n")
+               GET_BOOL(CL_DEVICE_COMPILER_AVAILABLE, "  Compiler available:\t\t\t\t %s\n")
+
+               printf("  Execution capabilities:\t\t\t\t \n");
+               GET_BITSET_AND(cl_device_exec_capabilities, CL_DEVICE_EXECUTION_CAPABILITIES, CL_EXEC_KERNEL, "  Execute OpenCL kernels:\t\t\t %s\n")
+               GET_BITSET_AND(cl_device_exec_capabilities, CL_DEVICE_EXECUTION_CAPABILITIES, CL_EXEC_NATIVE_KERNEL, "  Execute native kernels:\t\t\t %s\n")
+
+               printf("  Queue properties:\t\t\t\t\n ");
+               GET_BITSET_AND(cl_command_queue_properties, CL_DEVICE_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, "   Out-of-Order:\t\t\t\t %s\n")
+               GET_BITSET_AND(cl_command_queue_properties, CL_DEVICE_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, "    Profiling:\t\t\t\t\t %s\n")
+
+
+               GET_STRING(CL_DEVICE_NAME, "  Name:\t\t\t\t\t\t %s\n")
+               GET_STRING(CL_DEVICE_VENDOR, "  Vendor:\t\t\t\t\t %s\n")
+               GET_STRING(CL_DRIVER_VERSION, "  Driver version:\t\t\t\t %s\n")
+               GET_STRING(CL_DEVICE_PROFILE, "  Profile:\t\t\t\t\t %s\n")
+               GET_STRING(CL_DEVICE_VERSION, "  Version:\t\t\t\t\t %s\n")
+               GET_STRING(CL_DEVICE_EXTENSIONS, "  Extensions:\t\t\t\t\t %s\n")
+
+               printf("\n");
+            }
+         }
+
+         free(devices);
+      }
+   }
+
+   free(platforms);
+
+   return 0;
+}

+ 0 - 0
examples/socl/mandelbrot/mandelbrot.c


この差分においてかなりの量のファイルが変更されているため、一部のファイルを表示していません