14 年前 · 03b6b6a55b
--- a/.dir-locals.el
+++ b/.dir-locals.el
@@ -0,0 +1,4 @@
 
																+;; Hey Emacs, use the ugly style!
															
 
																+
															
 
																+((c-mode . ((c-file-style . "linux")
															
 
																+	    (indent-tabs-mode . t))))
															
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,187 @@
 
																+/configure
															
 
																+/config.log
															
 
																+/config.status
															
 
																+/autom4te.cache
															
 
																+/libtool
															
 
																+/libstarpu.pc
															
 
																+/aclocal.m4
															
 
																+/build-aux
															
 
																+/GPATH
															
 
																+/GRTAGS
															
 
																+/GTAGS
															
 
																+/config.cache
															
 
																+/doc/starpu.info
															
 
																+*~
															
 
																+,*
															
 
																+Makefile
															
 
																+Makefile.in
															
 
																+.libs
															
 
																+.deps
															
 
																+*.o
															
 
																+*.lo
															
 
																+*.la
															
 
																+*.swp
															
 
																+.dirstamp
															
 
																+stamp-h[0-9]*
															
 
																+starpu.log
															
 
																+/gcc-plugin/src/starpu-gcc-config.h
															
 
																+/gcc-plugin/tests/*.c.[0-9]*.*
															
 
																+/tests/datawizard/handle_to_pointer
															
 
																+/tests/datawizard/data_lookup
															
 
																+/doc/stamp-vti
															
 
																+/doc/version.texi
															
 
																+/examples/basic_examples/block
															
 
																+/examples/basic_examples/hello_world
															
 
																+/examples/basic_examples/mult
															
 
																+/examples/basic_examples/variable
															
 
																+/examples/basic_examples/vector_scal
															
 
																+/examples/callback/callback
															
 
																+/examples/filters/fblock
															
 
																+/examples/filters/fmatrix
															
 
																+/examples/filters/fvector
															
 
																+/examples/incrementer/incrementer
															
 
																+/examples/mandelbrot/mandelbrot
															
 
																+/examples/matvecmult/matvecmult
															
 
																+/examples/pi/pi
															
 
																+/examples/pi/pi_redux
															
 
																+/examples/ppm_downscaler/ppm_downscaler
															
 
																+/examples/ppm_downscaler/yuv_downscaler
															
 
																+/examples/profiling/profiling
															
 
																+/examples/reductions/dot_product
															
 
																+/examples/reductions/minmax_reduction
															
 
																+/examples/scheduler/dummy_sched
															
 
																+/examples/spmv/dw_spmv
															
 
																+/examples/spmv/spmv
															
 
																+/examples/stencil/stencil
															
 
																+/examples/tag_example/tag_example
															
 
																+/examples/tag_example/tag_example2
															
 
																+/examples/tag_example/tag_example3
															
 
																+/examples/tag_example/tag_restartable
															
 
																+/mpi/examples/stencil/stencil5
															
 
																+/mpi/tests/block_interface
															
 
																+/mpi/tests/block_interface_pinned
															
 
																+/mpi/tests/insert_task
															
 
																+/mpi/tests/insert_task_block
															
 
																+/mpi/tests/insert_task_cache
															
 
																+/mpi/tests/insert_task_owner
															
 
																+/mpi/tests/insert_task_owner2
															
 
																+/mpi/tests/mpi_detached_tag
															
 
																+/mpi/tests/mpi_irecv
															
 
																+/mpi/tests/mpi_irecv_detached
															
 
																+/mpi/tests/mpi_isend
															
 
																+/mpi/tests/mpi_isend_detached
															
 
																+/mpi/tests/mpi_test
															
 
																+/mpi/tests/multiple_send
															
 
																+/mpi/tests/pingpong
															
 
																+/mpi/tests/ring
															
 
																+/mpi/tests/ring_async
															
 
																+/mpi/tests/ring_async_implicit
															
 
																+/tests/core/declare_deps_after_submission
															
 
																+/tests/core/declare_deps_after_submission_synchronous
															
 
																+/tests/core/declare_deps_in_callback
															
 
																+/tests/core/empty_task
															
 
																+/tests/core/empty_task_chain
															
 
																+/tests/core/empty_task_sync_point
															
 
																+/tests/core/empty_task_sync_point_tasks
															
 
																+/tests/core/execute_on_a_specific_worker
															
 
																+/tests/core/get_current_task
															
 
																+/tests/core/insert_task
															
 
																+/tests/core/multithreaded
															
 
																+/tests/core/multithreaded_init
															
 
																+/tests/core/regenerate
															
 
																+/tests/core/restart
															
 
																+/tests/core/starpu_task_wait
															
 
																+/tests/core/starpu_task_wait_for_all
															
 
																+/tests/core/static_restartable
															
 
																+/tests/core/static_restartable_tag
															
 
																+/tests/core/static_restartable_using_initializer
															
 
																+/tests/core/subgraph_repeat
															
 
																+/tests/core/subgraph_repeat_regenerate
															
 
																+/tests/core/tag_wait_api
															
 
																+/tests/core/task_wait_api
															
 
																+/tests/core/wait_all_regenerable_tasks
															
 
																+/tests/datawizard/acquire_cb
															
 
																+/tests/datawizard/acquire_release
															
 
																+/tests/datawizard/acquire_release2
															
 
																+/tests/datawizard/critical_section_with_void_interface
															
 
																+/tests/datawizard/data_implicit_deps
															
 
																+/tests/datawizard/data_invalidation
															
 
																+/tests/datawizard/dining_philosophers
															
 
																+/tests/datawizard/dsm_stress
															
 
																+/tests/datawizard/increment_redux
															
 
																+/tests/datawizard/increment_redux_v2
															
 
																+/tests/datawizard/lazy_allocation.c
															
 
																+/tests/datawizard/manual_reduction
															
 
																+/tests/datawizard/mpi_like
															
 
																+/tests/datawizard/mpi_like_async
															
 
																+/tests/datawizard/readers_and_writers
															
 
																+/tests/datawizard/reclaim
															
 
																+/tests/datawizard/scratch
															
 
																+/tests/datawizard/sync_and_notify_data
															
 
																+/tests/datawizard/sync_and_notify_data_implicit
															
 
																+/tests/datawizard/sync_with_data_with_mem
															
 
																+/tests/datawizard/sync_with_data_with_mem_non_blocking
															
 
																+/tests/datawizard/sync_with_data_with_mem_non_blocking_implicit
															
 
																+/tests/datawizard/unpartition
															
 
																+/tests/datawizard/user_interaction_implicit
															
 
																+/tests/datawizard/write_only_tmp_buffer
															
 
																+/tests/errorcheck/invalid_blocking_calls
															
 
																+/tests/errorcheck/invalid_tasks
															
 
																+/tests/errorcheck/starpu_init_noworker
															
 
																+/tests/helper/cublas_init
															
 
																+/tests/helper/execute_on_all
															
 
																+/tests/helper/pinned_memory
															
 
																+/tests/helper/starpu_create_sync_task
															
 
																+/tests/helper/starpu_data_cpy
															
 
																+/tests/microbenchs/async_tasks_overhead
															
 
																+/tests/microbenchs/display_structures_size
															
 
																+/tests/microbenchs/local_pingpong
															
 
																+/tests/microbenchs/prefetch_data_on_node
															
 
																+/tests/microbenchs/redundant_buffer
															
 
																+/tests/microbenchs/sync_tasks_overhead
															
 
																+/tests/microbenchs/tasks_overhead
															
 
																+/tests/overlap/overlap
															
 
																+/tests/parallel_tasks/explicit_combined_worker
															
 
																+/tests/parallel_tasks/parallel_kernels
															
 
																+/tests/parallel_tasks/parallel_kernels_spmd
															
 
																+/tests/parallel_tasks/spmd_pgreedy
															
 
																+/tests/perfmodels/non_linear_regression_based
															
 
																+/tests/perfmodels/regression_based
															
 
																+/tools/cbc2paje
															
 
																+/tools/lp2paje
															
 
																+/tools/starpu_calibrate_bus
															
 
																+/tools/starpu_machine_display
															
 
																+/tools/starpu_perfmodel_display
															
 
																+/tools/starpu_regression_display
															
 
																+/gcc-plugin/tests/scalar-tasks
															
 
																+/gcc-plugin/tests/pointers
															
 
																+/tests/datawizard/lazy_allocation
															
 
																+/gcc-plugin/tests/pointer-tasks
															
 
																+/gcc-plugin/tests/*.s
															
 
																+/gcc-plugin/tests/base
															
 
																+/gcc-plugin/tests/core
															
 
																+/mpi/tests/insert_task_owner_data
															
 
																+/mpi/examples/scatter_gather/mpi_scatter_gather
															
 
																+/examples/top/hello_world_top
															
 
																+/doc/starpu.aux
															
 
																+/doc/starpu.cp
															
 
																+/doc/starpu.cps
															
 
																+/doc/starpu.fn
															
 
																+/doc/starpu.fns
															
 
																+/doc/starpu.html
															
 
																+/doc/starpu.ky
															
 
																+/doc/starpu.pdf
															
 
																+/doc/starpu.pg
															
 
																+/doc/starpu.toc
															
 
																+/doc/starpu.tp
															
 
																+/doc/starpu.tps
															
 
																+/doc/starpu.vr
															
 
																+/gcc-plugin/tests/register
															
 
																+/tests/datawizard/acquire_cb_insert
															
 
																+/tools/starpu_perfmodel_plot
															
 
																+/gcc-plugin/tests/run-test
															
 
																+/gcc-plugin/tests/register-errors
															
 
																+/gcc-plugin/tests/acquire
															
 
																+/gcc-plugin/tests/unregister
															
 
																+/gcc-plugin/tests/lib-user
															
 
																+/gcc-plugin/examples/matrix-mult
															
--- a/AUTHORS
+++ b/AUTHORS
@@ -6,3 +6,7 @@ Sylvain Henry <sylvain.henry@inria.fr>
 
																 Mehdi Juhoor <mjuhoor@gmail.com>
															
 
																 François Tessier <francois.tessier@inria.fr>
															
 
																 Samuel Thibault <samuel.thibault@labri.fr>
															
 
																+William Braik <wbraik@gmail.com>
															
 
																+Yann Courtois <yann.courtois33@gmail.com>
															
 
																+Jean-Marie Couteyen <jm.couteyen@gmail.com>
															
 
																+Anthony Roy <theanthony33@gmail.com>
															
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,7 +1,7 @@
 
																-StarPU 0.5 (svn revision ????)
															
 
																+StarPU 0.9 (svn revision 3721)
															
 
																 ==============================================
															
 
																-The yet-more-stuff release
															
 
																-  
															
 
																+The extensions release
															
 
																+
															
 
																   * Provide the STARPU_REDUX data access mode
															
 
																   * Externalize the scheduler API.
															
 
																   * Add theoretical bound computation
															
@@ -19,11 +19,12 @@ The yet-more-stuff release
 
																   * Add mandelbrot OpenCL example
															
 
																   * Add cg example
															
 
																   * Add stencil MPI example
															
 
																+  * Initial support for CUDA4
															
 
																 StarPU 0.4 (svn revision 2535)
															
 
																 ==============================================
															
 
																 The API strengthening release
															
 
																-  
															
 
																+
															
 
																   * Major API improvements
															
 
																     - Provide the STARPU_SCRATCH data access mode
															
 
																     - Rework data filter interface
															
@@ -41,7 +42,7 @@ The API strengthening release
 
																   * Provide a library to help accelerating MPI applications
															
 
																   * Improve data transfers overhead prediction
															
 
																     - Transparently benchmark buses to generate performance models
															
 
																-    - Bind accelerator-controlling threads with respect to NUMA locality 
															
 
																+    - Bind accelerator-controlling threads with respect to NUMA locality
															
 
																   * Improve StarPU's portability
															
 
																     - Add OpenCL support
															
 
																     - Add support for Windows
															
@@ -63,7 +64,7 @@ The asynchronous heterogeneous multi-accelerator release
 
																     - All data transfers use data requests now
															
 
																     - Implement asynchronous data transfers
															
 
																     - Implement prefetch mechanism
															
 
																-    - Chain data requests to support GPU->RAM->GPU transfers 
															
 
																+    - Chain data requests to support GPU->RAM->GPU transfers
															
 
																   * Make it possible to bypass the scheduler and to assign a task to a specific
															
 
																     worker
															
 
																   * Support restartable tasks to reinstanciate dependencies task graphs
															
--- a/Makefile.am
+++ b/Makefile.am
@@ -1,6 +1,6 @@
 
																 # StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																 #
															
 
																-# Copyright (C) 2009, 2010  Université de Bordeaux 1
															
 
																+# Copyright (C) 2009-2011  Université de Bordeaux 1
															
 
																 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																 #
															
 
																 # StarPU is free software; you can redistribute it and/or modify
															
@@ -21,8 +21,22 @@ SUBDIRS = src
 
																 if USE_MPI
															
 
																 SUBDIRS += mpi
															
 
																 endif
															
 
																+
															
 
																+if BUILD_SOCL
															
 
																+SUBDIRS += socl
															
 
																+endif
															
 
																+
															
 
																 SUBDIRS += tools examples tests doc
															
 
																+if COND_OPT
															
 
																+SUBDIRS += tests/opt examples/opt
															
 
																+endif
															
 
																+
															
 
																+
															
 
																+if BUILD_GCC_PLUGIN
															
 
																+SUBDIRS += gcc-plugin
															
 
																+endif
															
 
																+
															
 
																 pkgconfigdir = $(libdir)/pkgconfig
															
 
																 pkgconfig_DATA = libstarpu.pc
															
@@ -42,10 +56,24 @@ include_HEADERS = 				\
 
																 	include/starpu_expert.h			\
															
 
																 	include/starpu_profiling.h		\
															
 
																 	include/starpu_bound.h			\
															
 
																-	include/starpu_scheduler.h
															
 
																+	include/starpu_scheduler.h		\
															
 
																+	include/starpu_top.h
															
 
																+
															
 
																+if BUILD_STARPU_TOP
															
 
																+all-local:
															
 
																+	cd starpu-top ; $(QMAKE) ; $(MAKE)
															
 
																+clean-local:
															
 
																+	cd starpu-top ; $(MAKE) clean
															
 
																+# TODO: resources
															
 
																+install-exec-local:
															
 
																+	$(MKDIR_P) $(DESTDIR)$(bindir)
															
 
																+	$(INSTALL_STRIP_PROGRAM) starpu-top/StarPU-Top $(DESTDIR)$(bindir)
															
 
																+endif
															
 
																+if STARPU_HAVE_WINDOWS
															
 
																 txtdir = ${prefix}
															
 
																+else
															
 
																+txtdir = ${docdir}
															
 
																+endif
															
 
																 txt_DATA = AUTHORS COPYING.LGPL README
															
 
																 EXTRA_DIST = AUTHORS COPYING.LGPL README
															
 
																-
															
 
																-
															
--- a/README
+++ b/README
@@ -59,6 +59,12 @@ advantage of their specificities in a portable fashion.
 
																    units according to the machine topology. For more details on hwloc, see
															
 
																    http://www.open-mpi.org/projects/hwloc/ .
															
 
																+ * To build the StarPU-Top tool the following are also required:
															
 
																+   * libqt4 >= 4.7
															
 
																+   * libqt4-network
															
 
																+   * libqt4-opengl
															
 
																+   * libqt4-sql
															
 
																+
															
 
																 ++=====================++
															
 
																 || III. Getting StarPU ||
															
 
																 ++=====================++
															
@@ -97,61 +103,42 @@ we provide MinGW-built binaries.  The build process produces libstarpu.dll,
 
																 libstarpu.def, and libstarpu.lib, which should be enough to use it from e.g.
															
 
																 Microsoft Visual Studio.
															
 
																-A few details need to be fixed when building StarPU on windows:
															
 
																-
															
 
																-- To get a .def file built, make sure that MSVC's lib.exe tool is in PATH.
															
 
																-
															
 
																-- Update the video drivers to the latest stable release available for your
															
 
																-  hardware. Older ATI drivers (< 2.3) contain bugs that cause OpenCL support in
															
 
																-  StarPU to hang or exhibit incorrect behaviour.
															
 
																+Update the video drivers to the latest stable release available for your
															
 
																+hardware. Old ATI drivers (< 2.3) contain bugs that cause OpenCL support in
															
 
																+StarPU to hang or exhibit incorrect behaviour.
															
 
																-- c:\cuda\include\host_defines.h has a bogus CUDARTAPI definition which makes
															
 
																-  linking fail completely. Replace the first occurence of
															
 
																+For details on the Windows build process, see the README.dev file in the
															
 
																+subversion tree.
															
 
																-    #define CUDARTAPI
															
 
																-    
															
 
																-  with
															
 
																-    
															
 
																-    #ifdef _WIN32
															
 
																-    #define CUDARTAPI __stdcall
															
 
																-    #else
															
 
																-    #define CUDARTAPI
															
 
																-    #endif
															
 
																-
															
 
																-  While at it, you can also comment the __cdecl definition to avoid spurious
															
 
																-  warnings.
															
 
																-
															
 
																-- If you have a non-english version of windows, use
															
 
																+++==================++
															
 
																+|| V. Documentation ||
															
 
																+++==================++
															
 
																-    export LANG=C
															
 
																+Texinfo documentation is available in doc/ . If LaTeX is available on the
															
 
																+machine, a pdf can be generated by running
															
 
																-  else libtool has troubles parsing the translated output of the toolchain.
															
 
																+  $ make -C doc pdf
															
 
																-- libtool is not able to find the libraries automatically, you need to make some
															
 
																-  copies:
															
 
																+If makeinfo is available on the machine, html pages can be generated by running
															
 
																-    copy c:\cuda\lib\cuda.lib c:\cuda\lib\libcuda.lib
															
 
																-    copy c:\cuda\lib\cudart.lib c:\cuda\lib\libcudart.lib
															
 
																-    copy c:\cuda\lib\cublas.lib c:\cuda\lib\libcublas.lib
															
 
																-    copy c:\cuda\lib\cufft.lib c:\cuda\lib\libcufft.lib
															
 
																-    copy c:\cuda\lib\OpenCL.lib c:\cuda\lib\libOpenCL.lib
															
 
																+  $ make -C doc html
															
 
																-++===========++
															
 
																-|| V. Trying ||
															
 
																-++===========++
															
 
																+++============++
															
 
																+|| VI. Trying ||
															
 
																+++============++
															
 
																 Some examples ready to run are installed into $prefix/lib/starpu/{examples,mpi}
															
 
																-++=============++
															
 
																-|| VI. Upgrade ||
															
 
																-++=============++
															
 
																+++==============++
															
 
																+|| VII. Upgrade ||
															
 
																+++==============++
															
 
																 To upgrade your source code from older version (there were quite a few
															
 
																 renamings), use the tools/rename.sh script
															
 
																-++==============++
															
 
																-|| VII. Contact ||
															
 
																-++==============++
															
 
																+++===============++
															
 
																+|| VIII. Contact ||
															
 
																+++===============++
															
 
																 For any questions regarding StarPU, please contact the starpu-devel
															
 
																 mailing-list at starpu-devel@lists.gforge.inria.fr .
															
--- a/README.dev
+++ b/README.dev
@@ -0,0 +1,169 @@
 
																+Installing StarPU on windows
															
 
																+----------------------------
															
 
																+
															
 
																+If you are building from a tarball downloaded from the website, you can skip the
															
 
																+cygwin part.
															
 
																+
															
 
																+1. Install cygwin
															
 
																+
															
 
																+http://cygwin.com/install.html
															
 
																+
															
 
																+Make sure the following packages are available:
															
 
																+- (Devel)/subversion
															
 
																+- (Devel)/libtool
															
 
																+- (Devel)/gcc
															
 
																+- (Devel)/make
															
 
																+- your favorite editor (vi, emacs, ...)
															
 
																+- (Devel)/gdb
															
 
																+- (Archive)/zip
															
 
																+- (Devel)/pkg-config
															
 
																+
															
 
																+2. Install mingw
															
 
																+
															
 
																+http://sourceforge.net/projects/mingw/
															
 
																+
															
 
																+3. Install hwloc (not mandatory)
															
 
																+
															
 
																+http://www.open-mpi.org/projects/hwloc
															
 
																+
															
 
																+4. Install Microsoft Visual C++ Studio Express
															
 
																+
															
 
																+   http://www.microsoft.com/express/Downloads
															
 
																+
															
 
																+   Add in your path the following directories.
															
 
																+   (adjusting where necessary for the Installation location according to VC
															
 
																+    version and on 64 and 32bit Windows versions)
															
 
																+
															
 
																+   On cygwin, with Visual C++ 2010 e.g.;
															
 
																+
															
 
																+   export PATH="/cygdrive/c/Program Files (x86)/Microsoft Visual Studio 10.0/Common7/IDE":$PATH
															
 
																+   export PATH="/cygdrive/c/Program Files (x86)/Microsoft Visual Studio 10.0/VC/bin":$PATH
															
 
																+
															
 
																+   On MingW, with Visual C++ 2010, e.g.;
															
 
																+
															
 
																+   export PATH="/c/Program Files (x86)/Microsoft Visual Studio 10.0/Common7/IDE":$PATH
															
 
																+   export PATH="/c/Program Files (x86)/Microsoft Visual Studio 10.0/VC/bin":$PATH
															
 
																+
															
 
																+   Try to call <lib.exe> and <link.exe> without any option to make sure these
															
 
																+   dump their help output, else no .def or .lib file will be produced.
															
 
																+
															
 
																+5. Install GPU Drivers (not mandatory)
															
 
																+
															
 
																+  5.1 Install Cuda
															
 
																+
															
 
																+      http://developer.nvidia.com/object/cuda_3_2_downloads.html
															
 
																+
															
 
																+      You need to install at least the CUDA toolkit.
															
 
																+
															
 
																+      libtool is not able to find the libraries automatically, you
															
 
																+      need to make some copies:
															
 
																+
															
 
																+      copy c:\cuda\lib\cuda.lib c:\cuda\lib\libcuda.lib
															
 
																+      copy c:\cuda\lib\cudart.lib c:\cuda\lib\libcudart.lib
															
 
																+      copy c:\cuda\lib\cublas.lib c:\cuda\lib\libcublas.lib
															
 
																+      copy c:\cuda\lib\cufft.lib c:\cuda\lib\libcufft.lib
															
 
																+      copy c:\cuda\lib\OpenCL.lib c:\cuda\lib\libOpenCL.lib
															
 
																+
															
 
																+      (and if the version of your CUDA driver is >= 3.2)
															
 
																+
															
 
																+      copy c:\cuda\lib\curand.lib c:\cuda\lib\libcurand.lib
															
 
																+
															
 
																+      Add the CUDA bin directory in your path
															
 
																+
															
 
																+      export PATH=/cygdrive/c/CUDA/bin:$PATH
															
 
																+
															
 
																+      Since we tell nvcc to build CUDA code with gcc instead of Visual studio,
															
 
																+      a fix is needed: c:\cuda\include\host_defines.h has a bogus CUDARTAPI
															
 
																+      definition which makes linking fail completely. Replace the first
															
 
																+      occurrence of
															
 
																+
															
 
																+      #define CUDARTAPI
															
 
																+
															
 
																+      with
															
 
																+
															
 
																+      #ifdef _WIN32
															
 
																+      #define CUDARTAPI __stdcall
															
 
																+      #else
															
 
																+      #define CUDARTAPI
															
 
																+      #endif
															
 
																+
															
 
																+      While at it, you can also comment the __cdecl definition to avoid spurious
															
 
																+      warnings.
															
 
																+
															
 
																+
															
 
																+  5.2 Install OpenCL
															
 
																+
															
 
																+      http://developer.nvidia.com/object/opencl-download.html
															
 
																+
															
 
																+      You need to download the NVIDIA Drivers for your version of
															
 
																+      Windows. Executing the file will extract all files in a given
															
 
																+      directory. Then the driver installation will start; it will fail
															
 
																+      if no compatible drivers can be found on your system.
															
 
																+
															
 
																+      Anyway, you should copy the *.dl_ files from the directory
															
 
																+      (extraction path) in the bin directory of the CUDA installation
															
 
																+      directory (the directory should be v3.2/bin/)
															
 
																+
															
 
																+  5.3 Install MsCompress
															
 
																+
															
 
																+      http://gnuwin32.sourceforge.net/packages/mscompress.htm
															
 
																+
															
 
																+      Go to the CUDA bin directory, uncompress the .dl_ files and rename
															
 
																+      them to .dll files
															
 
																+
															
 
																+      cp /cygdrive/c/NVIDIA/DisplayDriver/190.89/International/*.dl_ .
															
 
																+      for i in *.dl_ ; do /cygdrive/c/Program\ Files/GnuWin32/bin/msexpand.exe  $i ; mv ${i%_} ${i%_}l ; done
															
 
																+
															
 
																+If you are building from a tarball downloaded from the website, you can skip the
															
 
																+autogen.sh part.
															
 
																+
															
 
																+6. Start autogen.sh from cygwin
															
 
																+
															
 
																+   cd starpu-trunk
															
 
																+   ./autogen.sh
															
 
																+
															
 
																+7. Start a MinGW shell
															
 
																+
															
 
																+   /cygdrive/c/MinGW/msys/1.0/bin/sh.exe --login -i
															
 
																+
															
 
																+8. Configure, make, install from MinGW
															
 
																+
															
 
																+   If you have a non-English version of Windows, use
															
 
																+
															
 
																+     export LANG=C
															
 
																+
															
 
																+   otherwise libtool has trouble parsing the translated output of the toolchain.
															
 
																+
															
 
																+   cd starpu-trunk
															
 
																+   mkdir build
															
 
																+   cd build
															
 
																+   ../configure --prefix=$PWD/target --disable-default-drand48 \
															
 
																+        --with-hwloc=<HWLOC installation directory> \
															
 
																+        --with-cuda-dir=<CUDA installation directory> \
															
 
																+        --with-cuda-lib-dir=<CUDA installation directory>/lib/Win32 \
															
 
																+	--with-opencl-dir=<CUDA installation directory>
															
 
																+   make
															
 
																+   make install
															
 
																+
															
 
																+   Also convert a couple of files to CRLF:
															
 
																+
															
 
																+   sed -e 's/$/'$'\015'/ < README > $PWD/target/README.txt
															
 
																+   sed -e 's/$/'$'\015'/ < AUTHORS > $PWD/target/AUTHORS.txt
															
 
																+   sed -e 's/$/'$'\015'/ < COPYING.LGPL > $PWD/target/COPYING.LGPL.txt
															
 
																+
															
 
																+9. If you want your StarPU installation to be standalone, you need to
															
 
																+   copy the DLL files from hwloc, Cuda, and OpenCL into the StarPU
															
 
																+   installation bin directory, as well as MinGW/bin/libpthread*dll
															
 
																+
															
 
																+   cp <CUDA directory>/bin/*dll target/bin
															
 
																+   cp <HWLOC directory>/bin/*dll target/bin
															
 
																+   cp /cygdrive/c/MinGW/bin/libpthread*dll target/bin
															
 
																+
															
 
																+   and set the StarPU bin directory in your path.
															
 
																+
															
 
																+   export PATH=<StarPU installation directory>/bin:$PATH
															
 
																+
															
 
																+
															
 
																+Developers warning
															
 
																+------------------
															
 
																+Developer warnings are only enabled if the STARPU_DEVEL environment variable is set to a non-empty value.
															
--- a/acinclude.m4
+++ b/acinclude.m4
@@ -1,3 +1,19 @@
 
																+dnl Copyright (C) Free Software Foundation, Inc.
															
 
																+dnl
															
 
																+dnl This program is free software; you can redistribute it and/or modify
															
 
																+dnl it under the terms of the GNU General Public License as published by
															
 
																+dnl the Free Software Foundation; either version 2 of the License, or
															
 
																+dnl (at your option) any later version.
															
 
																+dnl 
															
 
																+dnl This program is distributed in the hope that it will be useful,
															
 
																+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
															
 
																+dnl GNU General Public License for more details.
															
 
																+dnl 
															
 
																+dnl You should have received a copy of the GNU General Public License
															
 
																+dnl along with this program; if not, write to the Free Software
															
 
																+dnl Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
															
 
																+dnl
															
 
																 dnl This test is taken from libgfortran
															
 
																 dnl Check whether the target supports __sync_val_compare_and_swap.
															
--- a/configure.ac
+++ b/configure.ac
--- a/doc/starpu.texi
+++ b/doc/starpu.texi
--- a/doc/tutorial/Makefile
+++ b/doc/tutorial/Makefile
@@ -0,0 +1,45 @@
 
																+# StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+#
															
 
																+# Copyright (C) 2009-2011  Université de Bordeaux 1
															
 
																+# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																+#
															
 
																+# StarPU is free software; you can redistribute it and/or modify
															
 
																+# it under the terms of the GNU Lesser General Public License as published by
															
 
																+# the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+# your option) any later version.
															
 
																+#
															
 
																+# StarPU is distributed in the hope that it will be useful, but
															
 
																+# WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+#
															
 
																+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+
															
 
																+# Compile and link flags come from StarPU's pkg-config file; the $$(...) form
															
 
																+# leaves the pkg-config call to the shell that runs each recipe.
															
 
																+CFLAGS          +=      $$(pkg-config --cflags libstarpu)
															
 
																+LDFLAGS         +=      $$(pkg-config --libs libstarpu)
															
 
																+
															
 
																+# HAS_CUDA / HAS_OPENCL are non-empty when the StarPU link flags mention them.
															
 
																+HAS_CUDA	=	$(shell pkg-config --libs libstarpu|grep -i cuda)
															
 
																+NVCC		?=	nvcc
															
 
																+HAS_OPENCL	=	$(shell pkg-config --libs libstarpu|grep -i opencl)
															
 
																+
															
 
																+# Use $(NVCC) so that overriding NVCC affects compilation as well as linking.
															
 
																+%.o: %.cu
															
 
																+	$(NVCC) $(CFLAGS) $< -c
															
 
																+
															
 
																+# all and clean do not produce files of the same name.
															
 
																+.PHONY: all clean
															
 
																+
															
 
																+all: hello_world vector_scal
															
 
																+
															
 
																+VECTOR_SCAL_PREREQUISITES	=	vector_scal.o vector_scal_cpu.o 
															
 
																+ifneq ($(strip $(HAS_CUDA)),)
															
 
																+VECTOR_SCAL_PREREQUISITES	+=	vector_scal_cuda.o
															
 
																+VECTOR_SCAL_COMPILER		=	$(NVCC)
															
 
																+else
															
 
																+VECTOR_SCAL_COMPILER		=	$(CC)
															
 
																+endif
															
 
																+ifneq ($(strip $(HAS_OPENCL)),)
															
 
																+VECTOR_SCAL_PREREQUISITES += vector_scal_opencl.o
															
 
																+endif
															
 
																+
															
 
																+vector_scal: $(VECTOR_SCAL_PREREQUISITES)
															
 
																+	$(VECTOR_SCAL_COMPILER) $(LDFLAGS) $^ -o $@
															
 
																+
															
 
																+clean:
															
 
																+	rm -f hello_world vector_scal *.o
															
 
																+
															
--- a/doc/tutorial/README
+++ b/doc/tutorial/README
@@ -0,0 +1,33 @@
 
																+# StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+#
															
 
																+# Copyright (C) 2009-2011  Université de Bordeaux 1
															
 
																+# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																+#
															
 
																+# StarPU is free software; you can redistribute it and/or modify
															
 
																+# it under the terms of the GNU Lesser General Public License as published by
															
 
																+# the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+# your option) any later version.
															
 
																+#
															
 
																+# StarPU is distributed in the hope that it will be useful, but
															
 
																+# WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+#
															
 
																+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+
															
 
																+
															
 
																+Instructions on how to compile and run StarPU examples
															
 
																+------------------------------------------------------
															
 
																+
															
 
																+% export STARPU_DIR=<directory where StarPU is installed>
															
 
																+% export PKG_CONFIG_PATH=$STARPU_DIR/lib/pkgconfig:$PKG_CONFIG_PATH
															
 
																+% export LD_LIBRARY_PATH=$STARPU_DIR/lib:$LD_LIBRARY_PATH
															
 
																+
															
 
																+% make hello_world
															
 
																+% ./hello_world
															
 
																+
															
 
																+% make vector_scal
															
 
																+% ./vector_scal
															
 
																+
															
 
																+% STARPU_NCPUS=0 ./vector_scal
															
 
																+% STARPU_NCPUS=0 STARPU_NCUDA=0 ./vector_scal
															
 
																+
															
--- a/doc/tutorial/hello_world.c
+++ b/doc/tutorial/hello_world.c
@@ -0,0 +1,70 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011  Université de Bordeaux 1
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include <starpu.h>
															
 
																+
															
 
																+struct params { /* Arguments handed to the codelet through task->cl_arg */
															
 
																+    int i;
															
 
																+    float f;
															
 
																+};
															
 
																+
															
 
																+void cpu_func(void *buffers[], void *cl_arg) /* CPU kernel: prints the struct params found at cl_arg; buffers is not used */
															
 
																+{
															
 
																+    struct params *params = cl_arg;
															
 
																+
															
 
																+    printf("Hello world (params = {%i, %f} )\n", params->i, params->f);
															
 
																+}
															
 
																+
															
 
																+starpu_codelet cl = /* Codelet with only a CPU implementation and no data buffers */
															
 
																+{
															
 
																+    .where = STARPU_CPU,
															
 
																+    .cpu_func = cpu_func,
															
 
																+    .nbuffers = 0
															
 
																+};
															
 
																+
															
 
																+/* Task callback: prints the opaque pointer that was stored in
															
 
																+ * task->callback_arg. The argument is a void *, so it must be printed
															
 
																+ * with %p; %x expects an unsigned int. */
															
 
																+void callback_func(void *callback_arg)
															
 
																+{
															
 
																+    printf("Callback function (arg %p)\n", callback_arg);
															
 
																+}
															
 
																+
															
 
																+/* Submit a single synchronous task running the hello-world codelet,
															
 
																+ * then shut StarPU down. */
															
 
																+int main(int argc, char **argv)
															
 
																+{
															
 
																+    /* initialize StarPU */
															
 
																+    starpu_init(NULL);
															
 
																+
															
 
																+    struct starpu_task *task = starpu_task_create();
															
 
																+
															
 
																+    task->cl = &cl; /* Pointer to the codelet defined above */
															
 
																+
															
 
																+    /* params lives on this stack frame; that is fine here because the
															
 
																+     * task is synchronous and submission blocks until it has run */
															
 
																+    struct params params = { 1, 2.0f };
															
 
																+    task->cl_arg = &params;
															
 
																+    task->cl_arg_size = sizeof(params);
															
 
																+
															
 
																+    task->callback_func = callback_func;
															
 
																+    /* the callback argument is an opaque pointer: cast the integer tag
															
 
																+     * explicitly instead of relying on an implicit int-to-pointer
															
 
																+     * conversion */
															
 
																+    task->callback_arg = (void *) 0x42;
															
 
																+
															
 
																+    /* starpu_task_submit will be a blocking call */
															
 
																+    task->synchronous = 1;
															
 
																+
															
 
																+    /* submit the task to StarPU */
															
 
																+    starpu_task_submit(task);
															
 
																+
															
 
																+    /* terminate StarPU */
															
 
																+    starpu_shutdown();
															
 
																+
															
 
																+    return 0;
															
 
																+}
															
--- a/doc/tutorial/vector_scal.c
+++ b/doc/tutorial/vector_scal.c
@@ -0,0 +1,124 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011  Université de Bordeaux 1
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+/*
															
 
																+ * This example demonstrates how to use StarPU to scale an array by a factor.
															
 
																+ * It shows how to manipulate data with StarPU's data management library.
															
 
																+ *  1- how to declare a piece of data to StarPU (starpu_vector_data_register)
															
 
																+ *  2- how to describe which data are accessed by a task (task->buffers[0])
															
 
																+ *  3- how a kernel can manipulate the data (buffers[0].vector.ptr)
															
 
																+ */
															
 
																+#include <starpu.h>
															
 
																+#include <starpu_opencl.h>
															
 
																+
															
 
																+#define    NX    2048
															
 
																+
															
 
																+extern void scal_cpu_func(void *buffers[], void *_args);
															
 
																+extern void scal_cuda_func(void *buffers[], void *_args);
															
 
																+extern void scal_opencl_func(void *buffers[], void *_args);
															
 
																+
															
 
																+static starpu_codelet cl = { /* Vector-scaling codelet operating on one buffer */
															
 
																+    .where = STARPU_CPU | STARPU_CUDA | STARPU_OPENCL,
															
 
																+    /* CPU implementation of the codelet */
															
 
																+    .cpu_func = scal_cpu_func,
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+    /* CUDA implementation of the codelet */
															
 
																+    .cuda_func = scal_cuda_func,
															
 
																+#endif
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+    /* OpenCL implementation of the codelet */
															
 
																+    .opencl_func = scal_opencl_func,
															
 
																+#endif
															
 
																+    .nbuffers = 1
															
 
																+};
															
 
																+
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+struct starpu_opencl_program programs;
															
 
																+#endif
															
 
																+
															
 
																+int main(int argc, char **argv) /* Scales a vector of NX floats through one synchronous StarPU task */
															
 
																+{
															
 
																+    /* We consider a vector of float that is initialized just like any
															
 
																+      * other C data */
															
 
																+    float vector[NX];
															
 
																+    unsigned i;
															
 
																+    for (i = 0; i < NX; i++)
															
 
																+        vector[i] = 1.0f;
															
 
																+
															
 
																+    fprintf(stderr, "BEFORE : First element was %f\n", vector[0]);
															
 
																+
															
 
																+    /* Initialize StarPU with default configuration */
															
 
																+    starpu_init(NULL);
															
 
																+
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+        starpu_opencl_load_opencl_from_file("vector_scal_opencl_kernel.cl", &programs, NULL);
															
 
																+#endif
															
 
																+
															
 
																+    /* Tell StarPU to associate the "vector" vector with the "vector_handle"
															
 
																+     * identifier. When a task needs to access a piece of data, it should
															
 
																+     * refer to the handle that is associated to it.
															
 
																+     * In the case of the "vector" data interface:
															
 
																+     *  - the first argument of the registration method is a pointer to the
															
 
																+     *    handle that should describe the data
															
 
																+     *  - the second argument is the memory node where the data (ie. "vector")
															
 
																+     *    resides initially: 0 stands for an address in main memory, as
															
 
																+     *    opposed to an address on a GPU for instance.
															
 
																+     *  - the third argument is the address of the vector in RAM
															
 
																+     *  - the fourth argument is the number of elements in the vector
															
 
																+     *  - the fifth argument is the size of each element.
															
 
																+     */
															
 
																+    starpu_data_handle vector_handle;
															
 
																+    starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector,
															
 
																+                                NX, sizeof(vector[0]));
															
 
																+
															
 
																+    float factor = 3.14;
															
 
																+
															
 
																+    /* create a synchronous task: any call to starpu_task_submit will block
															
 
																+      * until it is terminated */
															
 
																+    struct starpu_task *task = starpu_task_create();
															
 
																+    task->synchronous = 1;
															
 
																+
															
 
																+    task->cl = &cl;
															
 
																+
															
 
																+    /* the codelet manipulates one buffer in RW mode */
															
 
																+    task->buffers[0].handle = vector_handle;
															
 
																+    task->buffers[0].mode = STARPU_RW;
															
 
																+
															
 
																+    /* an argument is passed to the codelet, beware that this is a
															
 
																+     * READ-ONLY buffer and that the codelet may be given a pointer to a
															
 
																+     * COPY of the argument */
															
 
																+    task->cl_arg = &factor;
															
 
																+    task->cl_arg_size = sizeof(factor);
															
 
																+
															
 
																+    /* execute the task on any eligible computational resource */
															
 
																+    starpu_task_submit(task);
															
 
																+
															
 
																+    /* StarPU does not need to manipulate the array anymore so we can stop
															
 
																+      * monitoring it */
															
 
																+    starpu_data_unregister(vector_handle);
															
 
																+
															
 
																+#ifdef STARPU_USE_OPENCL
															
 
																+    starpu_opencl_unload_opencl(&programs);
															
 
																+#endif
															
 
																+
															
 
																+    /* terminate StarPU, no task can be submitted after */
															
 
																+    starpu_shutdown();
															
 
																+
															
 
																+    fprintf(stderr, "AFTER First element is %f\n", vector[0]);
															
 
																+
															
 
																+    return 0;
															
 
																+}
															
--- a/doc/tutorial/vector_scal_cpu.c
+++ b/doc/tutorial/vector_scal_cpu.c
@@ -0,0 +1,50 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011  Université de Bordeaux 1
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include <starpu.h>
															
 
																+
															
 
																+/* This kernel takes a buffer and scales it by a constant factor */
															
 
																+void scal_cpu_func(void *buffers[], void *cl_arg)
															
 
																+{
															
 
																+    unsigned i;
															
 
																+    float *factor = cl_arg; /* scaling factor, read through the codelet argument */
															
 
																+
															
 
																+    /*
															
 
																+     * The "buffers" array matches the task->buffers array: for instance
															
 
																+     * task->buffers[0].handle is a handle that corresponds to a data with
															
 
																+     * vector "interface", so that the first entry of the array in the
															
 
																+     * codelet is a pointer to a structure describing such a vector (ie.
															
 
																+     * struct starpu_vector_interface_s *). Here, we therefore manipulate
															
 
																+     * the buffers[0] element as a vector: nx gives the number of elements
															
 
																+     * in the array, ptr gives the location of the array (that was possibly
															
 
																+     * migrated/replicated), and elemsize gives the size of each element.
															
 
																+     */
															
 
																+    starpu_vector_interface_t *vector = buffers[0];
															
 
																+
															
 
																+    /* length of the vector */
															
 
																+    unsigned n = STARPU_VECTOR_GET_NX(vector);
															
 
																+
															
 
																+    /* get a pointer to the local copy of the vector: note that we have to
															
 
																+     * cast it to (float *) since a vector could contain any type of
															
 
																+     * elements, so that the .ptr field is actually a uintptr_t */
															
 
																+    float *val = (float *)STARPU_VECTOR_GET_PTR(vector);
															
 
																+
															
 
																+    /* scale the vector in place */
															
 
																+    for (i = 0; i < n; i++)
															
 
																+        val[i] *= *factor;
															
 
																+}
															
 
																+
															
--- a/doc/tutorial/vector_scal_cuda.cu
+++ b/doc/tutorial/vector_scal_cuda.cu
@@ -0,0 +1,43 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011  Université de Bordeaux 1
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include <starpu.h>
															
 
																+#include <starpu_cuda.h>
															
 
																+
															
 
																+static __global__ void vector_mult_cuda(float *val, unsigned n, float factor) /* multiplies val[i] by factor, one element per thread */
															
 
																+{
															
 
																+        unsigned i =  blockIdx.x*blockDim.x + threadIdx.x;
															
 
																+        if (i < n) /* threads whose index falls past the end of the vector do nothing */
															
 
																+               val[i] *= factor;
															
 
																+}
															
 
																+
															
 
																+extern "C" void scal_cuda_func(void *buffers[], void *_args) /* CUDA implementation: launches vector_mult_cuda on buffers[0] */
															
 
																+{
															
 
																+        float *factor = (float *)_args;
															
 
																+
															
 
																+        /* length of the vector */
															
 
																+        unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
															
 
																+        /* local copy of the vector pointer */
															
 
																+        float *val = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);
															
 
																+        unsigned threads_per_block = 64;
															
 
																+        unsigned nblocks = (n + threads_per_block-1) / threads_per_block; /* ceil(n / threads_per_block) */
															
 
																+
															
 
																+        vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>(val, n, *factor);
															
 
																+
															
 
																+        cudaStreamSynchronize(starpu_cuda_get_local_stream()); /* wait for the stream the kernel was launched on */
															
 
																+}
															
 
																+
															
--- a/doc/tutorial/vector_scal_opencl.c
+++ b/doc/tutorial/vector_scal_opencl.c
@@ -0,0 +1,60 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011  Université de Bordeaux 1
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include <starpu.h>
															
 
																+#include <starpu_opencl.h>
															
 
																+
															
 
																+extern struct starpu_opencl_program programs;
															
 
																+/* OpenCL codelet: multiplies every element of the vector in buffers[0] by the float pointed to by _args. */
															
 
																+void scal_opencl_func(void *buffers[], void *_args)
															
 
																+{
															
 
																+    float *factor = _args;
															
 
																+    int id, devid, err;
															
 
																+    cl_kernel kernel;
															
 
																+    cl_command_queue queue;
															
 
																+    cl_event event;
															
 
																+
															
 
																+    /* length of the vector */
															
 
																+    unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
															
 
																+    /* OpenCL copy of the vector pointer */
															
 
																+    cl_mem val = (cl_mem) STARPU_VECTOR_GET_PTR(buffers[0]);
															
 
																+
															
 
																+    id = starpu_worker_get_id();
															
 
																+    devid = starpu_worker_get_devid(id);
															
 
																+
															
 
																+    err = starpu_opencl_load_kernel(&kernel, &queue, &programs,
															
 
																+                    "vector_mult_opencl", devid);   /* Name of the __kernel function in the .cl program */
															
 
																+    if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+
															
 
																+    err = clSetKernelArg(kernel, 0, sizeof(val), &val);
															
 
																+    err |= clSetKernelArg(kernel, 1, sizeof(n), &n);
															
 
																+    err |= clSetKernelArg(kernel, 2, sizeof(*factor), factor);
															
 
																+    if (err) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+    /* One work-item per vector element; the kernel itself guards i < nx. */
															
 
																+    {
															
 
																+        size_t global=n;
															
 
																+        size_t local=1;   /* a work-group size of 1 divides any global size */
															
 
																+        err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
															
 
																+        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																+    }
															
 
																+
															
 
																+    clFinish(queue);
															
 
																+    starpu_opencl_collect_stats(event);
															
 
																+    clReleaseEvent(event);
															
 
																+
															
 
																+    starpu_opencl_release_kernel(kernel);
															
 
																+}
															
--- a/doc/tutorial/vector_scal_opencl_kernel.cl
+++ b/doc/tutorial/vector_scal_opencl_kernel.cl
@@ -0,0 +1,25 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011  Université de Bordeaux 1
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+/* OpenCL kernel: the work-item with global id i multiplies val[i] by factor; work-items with i >= nx do nothing. */
															
 
																+__kernel void vector_mult_opencl(__global float* val, int nx, float factor)
															
 
																+{
															
 
																+        const int i = get_global_id(0);
															
 
																+        if (i < nx) {
															
 
																+                val[i] *= factor;
															
 
																+        }
															
 
																+}
															
 
																+
															
--- a/doc/vector_scal_c.texi
+++ b/doc/vector_scal_c.texi
@@ -49,7 +49,7 @@ int main(int argc, char **argv)
 
																 #ifdef STARPU_USE_OPENCL
															
 
																         starpu_opencl_load_opencl_from_file(
															
 
																-               "examples/basic_examples/vector_scal_opencl_kernel.cl", &programs);
															
 
																+               "examples/basic_examples/vector_scal_opencl_kernel.cl", &programs, NULL);
															
 
																 #endif
															
 
																     /* Tell StaPU to associate the "vector" vector with the "vector_handle"
															
--- a/doc/vector_scal_cuda.texi
+++ b/doc/vector_scal_cuda.texi
@@ -1,9 +1,10 @@
 
																 #include <starpu.h>
															
 
																+#include <starpu_cuda.h>
															
 
																 static __global__ void vector_mult_cuda(float *val, unsigned n,
															
 
																                                         float factor)
															
 
																 @{
															
 
																-        unsigned i;
															
 
																+        unsigned i =  blockIdx.x*blockDim.x + threadIdx.x;
															
 
																         if (i < n)
															
 
																                val[i] *= factor;
															
 
																 @}
															
--- a/doc/vector_scal_opencl.texi
+++ b/doc/vector_scal_opencl.texi
@@ -13,8 +13,8 @@ void scal_opencl_func(void *buffers[], void *_args)
 
																     /* length of the vector */
															
 
																     unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
															
 
																-    /* local copy of the vector pointer */
															
 
																-    float *val = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);
															
 
																+    /* OpenCL copy of the vector pointer */
															
 
																+    cl_mem val = (cl_mem)STARPU_VECTOR_GET_PTR(buffers[0]);
															
 
																     id = starpu_worker_get_id();
															
 
																     devid = starpu_worker_get_devid(id);
															
@@ -23,7 +23,7 @@ void scal_opencl_func(void *buffers[], void *_args)
 
																                                     devid);
															
 
																     if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																-    err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &val);
															
 
																+    err = clSetKernelArg(kernel, 0, sizeof(val), &val);
															
 
																     err |= clSetKernelArg(kernel, 1, sizeof(n), &n);
															
 
																     err |= clSetKernelArg(kernel, 2, sizeof(*factor), factor);
															
 
																     if (err) STARPU_OPENCL_REPORT_ERROR(err);
															
--- a/examples/.gitignore
+++ b/examples/.gitignore
@@ -0,0 +1 @@
 
																+/.deps
															
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -1,7 +1,7 @@
 
																 # StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																 #
															
 
																-# Copyright (C) 2009, 2010  Université de Bordeaux 1
															
 
																-# Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+# Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
															
 
																+# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																 #
															
 
																 # StarPU is free software; you can redistribute it and/or modify
															
 
																 # it under the terms of the GNU Lesser General Public License as published by
															
@@ -14,24 +14,23 @@
 
																 #
															
 
																 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																-AUTOMAKE_OPTIONS = subdir-objects
															
 
																-
															
 
																-AM_CFLAGS = $(HWLOC_CFLAGS) -Wall
															
 
																-LIBS = $(top_builddir)/src/libstarpu.la $(HWLOC_LIBS) @LIBS@
															
 
																+AM_CFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
															
 
																+LIBS = $(top_builddir)/src/libstarpu.la $(MAGMA_LIBS) $(HWLOC_LIBS) @LIBS@
															
 
																 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include
															
 
																+AM_LDFLAGS = $(STARPU_CUDA_LDFLAGS) $(STARPU_OPENCL_LDFLAGS)
															
 
																-TESTS	=	$(check_PROGRAMS)
															
 
																+SUBDIRS = stencil
															
 
																-SUBDIRS = stencil stencil_ctx
															
 
																+if STARPU_USE_SOCL
															
 
																+SUBDIRS += socl
															
 
																+endif
															
 
																 if STARPU_HAVE_FFTW
															
 
																-if STARPU_HAVE_FFTWL
															
 
																+if STARPU_HAVE_FFTWF
															
 
																 SUBDIRS += starpufft
															
 
																 endif
															
 
																 endif
															
 
																-check_PROGRAMS =
															
 
																-
															
 
																 BUILT_SOURCES =
															
 
																 if STARPU_USE_OPENCL
															
@@ -40,7 +39,9 @@ endif
 
																 EXTRA_DIST = 					\
															
 
																 	basic_examples/vector_scal_opencl_kernel.cl \
															
 
																+	common/blas_model.c			\
															
 
																 	spmv/spmv_cuda.cu			\
															
 
																+	spmv/spmv_opencl.cl			\
															
 
																 	gordon/null_kernel_gordon.c		\
															
 
																 	mult/xgemm.c				\
															
 
																 	lu/xlu.c				\
															
@@ -54,6 +55,7 @@ EXTRA_DIST = 					\
 
																 	basic_examples/variable_kernels_opencl_kernel.cl	\
															
 
																 	matvecmult/matvecmult_kernel.cl				\
															
 
																 	basic_examples/block_opencl_kernel.cl			\
															
 
																+	openmp/vector_scal.c			\
															
 
																 	filters/fblock_opencl_kernel.cl
															
 
																 CLEANFILES = 					\
															
@@ -64,7 +66,7 @@ CLEANFILES += *.gcno *.gcda *.linkinfo
 
																 if STARPU_USE_CUDA
															
 
																-NVCCFLAGS += --compiler-options -fno-strict-aliasing  $(HWLOC_CFLAGS) -I$(top_srcdir)/include/ -I$(top_builddir)/include/ -I$(top_srcdir)/examples/  -arch sm_13
															
 
																+NVCCFLAGS += --compiler-options -fno-strict-aliasing  -I$(top_srcdir)/include/ -I$(top_builddir)/include/ -I$(top_srcdir)/examples/  $(HWLOC_CFLAGS) -arch sm_13
															
 
																 .cu.o:
															
 
																 	$(NVCC) $< -c -o $@ $(NVCCFLAGS)
															
@@ -106,11 +108,6 @@ noinst_HEADERS = 				\
 
																 	lu/xlu_kernels.h			\
															
 
																 	lu/float.h				\
															
 
																 	lu/double.h				\
															
 
																-	pi/pi.h					\
															
 
																-	pi/SobolQRNG/sobol.h			\
															
 
																-	pi/SobolQRNG/sobol_gold.h		\
															
 
																-	pi/SobolQRNG/sobol_gpu.h		\
															
 
																-	pi/SobolQRNG/sobol_primitives.h		\
															
 
																 	cholesky/cholesky.h			\
															
 
																 	common/blas_model.h			\
															
 
																 	common/blas.h				\
															
@@ -122,22 +119,134 @@ noinst_HEADERS = 				\
 
																 	ppm_downscaler/yuv_downscaler.h		\
															
 
																 	spmv/matrix_market/mmio.h		\
															
 
																 	spmv/matrix_market/mm_to_bcsr.h		\
															
 
																-	spmv/dw_spmv.h				\
															
 
																+	spmv/spmv.h				\
															
 
																 	spmv/dw_block_spmv.h
															
 
																+#####################################
															
 
																+# What to install and what to check #
															
 
																+#####################################
															
 
																-##################
															
 
																-# Basic examples #
															
 
																-##################
															
 
																+STARPU_EXAMPLES	=
															
 
																+TESTS		=	$(STARPU_EXAMPLES)
															
 
																+
															
 
																+if STARPU_HAVE_WINDOWS
															
 
																+check_PROGRAMS	=	$(STARPU_EXAMPLES)
															
 
																+else
															
 
																+check_PROGRAMS	=	$(LOADER) $(STARPU_EXAMPLES)
															
 
																+endif
															
 
																+
															
 
																+if !STARPU_HAVE_WINDOWS
															
 
																+## test loader program
															
 
																+LOADER			=	loader
															
 
																+LOADER_BIN		=	$(abs_top_builddir)/examples/$(LOADER)
															
 
																+loader_SOURCES		=	../tests/loader.c
															
 
																+TESTS_ENVIRONMENT	=	$(LOADER_BIN)
															
 
																+endif
															
 
																 examplebin_PROGRAMS +=				\
															
 
																-	basic_examples/hello_world
															
 
																+	basic_examples/hello_world		\
															
 
																+	basic_examples/vector_scal		\
															
 
																+	basic_examples/mult			\
															
 
																+	basic_examples/block			\
															
 
																+	basic_examples/variable			\
															
 
																+	filters/fvector				\
															
 
																+	filters/fblock				\
															
 
																+	filters/fmatrix				\
															
 
																+	tag_example/tag_example			\
															
 
																+	tag_example/tag_example3		\
															
 
																+	tag_example/tag_example2		\
															
 
																+	tag_example/tag_restartable		\
															
 
																+	spmv/spmv				\
															
 
																+	callback/callback			\
															
 
																+	incrementer/incrementer			\
															
 
																+	matvecmult/matvecmult			\
															
 
																+	profiling/profiling			\
															
 
																+	scheduler/dummy_sched			\
															
 
																+	reductions/dot_product			\
															
 
																+	reductions/minmax_reduction		\
															
 
																+	mandelbrot/mandelbrot			\
															
 
																+	ppm_downscaler/ppm_downscaler		\
															
 
																+	ppm_downscaler/yuv_downscaler
															
 
																-basic_examples_hello_world_SOURCES =		\
															
 
																-	basic_examples/hello_world.c
															
 
																+if STARPU_HAVE_F77_H
															
 
																+examplebin_PROGRAMS +=				\
															
 
																+	basic_examples/vector_scal_fortran
															
 
																+endif
															
 
																+
															
 
																+if !NO_BLAS_LIB
															
 
																+examplebin_PROGRAMS +=				\
															
 
																+	axpy/axpy				\
															
 
																+	mult/sgemm 				\
															
 
																+	mult/dgemm				\
															
 
																+	cholesky/cholesky_tag			\
															
 
																+	cholesky/cholesky_tile_tag		\
															
 
																+	cholesky/cholesky_grain_tag		\
															
 
																+	cholesky/cholesky_implicit		\
															
 
																+	lu/lu_example_float			\
															
 
																+	lu/lu_example_double			\
															
 
																+	lu/lu_implicit_example_float		\
															
 
																+	lu/lu_implicit_example_double		\
															
 
																+	heat/heat				\
															
 
																+	cg/cg
															
 
																+endif
															
 
																+if ATLAS_BLAS_LIB
															
 
																 examplebin_PROGRAMS +=				\
															
 
																-	basic_examples/vector_scal
															
 
																+	spmv/dw_block_spmv
															
 
																+endif
															
 
																+
															
 
																+STARPU_EXAMPLES +=				\
															
 
																+	basic_examples/hello_world		\
															
 
																+	basic_examples/vector_scal		\
															
 
																+	basic_examples/mult			\
															
 
																+	basic_examples/block			\
															
 
																+	basic_examples/variable			\
															
 
																+	filters/fvector				\
															
 
																+	filters/fblock				\
															
 
																+	filters/fmatrix				\
															
 
																+	tag_example/tag_example			\
															
 
																+	tag_example/tag_example3		\
															
 
																+	tag_example/tag_example2		\
															
 
																+	tag_example/tag_restartable		\
															
 
																+	spmv/spmv				\
															
 
																+	callback/callback			\
															
 
																+	incrementer/incrementer			\
															
 
																+	matvecmult/matvecmult			\
															
 
																+	profiling/profiling			\
															
 
																+	scheduler/dummy_sched			\
															
 
																+	reductions/dot_product			\
															
 
																+	reductions/minmax_reduction
															
 
																+
															
 
																+if STARPU_HAVE_F77_H
															
 
																+STARPU_EXAMPLES +=				\
															
 
																+	basic_examples/vector_scal_fortran
															
 
																+endif
															
 
																+
															
 
																+if !NO_BLAS_LIB
															
 
																+STARPU_EXAMPLES +=				\
															
 
																+	axpy/axpy				\
															
 
																+	mult/sgemm 				\
															
 
																+	mult/dgemm				\
															
 
																+	cholesky/cholesky_tag			\
															
 
																+	cholesky/cholesky_tile_tag		\
															
 
																+	cholesky/cholesky_grain_tag		\
															
 
																+	cholesky/cholesky_implicit		\
															
 
																+	lu/lu_example_float			\
															
 
																+	lu/lu_example_double			\
															
 
																+	lu/lu_implicit_example_float		\
															
 
																+	lu/lu_implicit_example_double		\
															
 
																+	heat/heat				\
															
 
																+	cg/cg
															
 
																+endif
															
 
																+
															
 
																+if ATLAS_BLAS_LIB
															
 
																+STARPU_EXAMPLES +=				\
															
 
																+	spmv/dw_block_spmv
															
 
																+endif
															
 
																+
															
 
																+##################
															
 
																+# Basic examples #
															
 
																+##################
															
 
																 basic_examples_vector_scal_SOURCES =		\
															
 
																 	basic_examples/vector_scal.c		\
															
@@ -156,9 +265,6 @@ nobase_STARPU_OPENCL_DATA_DATA += \
 
																 endif
															
 
																 if STARPU_HAVE_F77_H
															
 
																-examplebin_PROGRAMS +=				\
															
 
																-	basic_examples/vector_scal_fortran
															
 
																-
															
 
																 basic_examples_vector_scal_fortran_SOURCES =	\
															
 
																 	basic_examples/vector_scal_fortran.F	\
															
 
																 	basic_examples/vector_scal_c.c		\
															
@@ -167,25 +273,15 @@ basic_examples_vector_scal_fortran_SOURCES =	\
 
																 if STARPU_USE_CUDA
															
 
																 basic_examples_vector_scal_fortran_SOURCES +=	\
															
 
																 	basic_examples/vector_scal_cuda.cu
															
 
																+basic_examples_vector_scal_fortran_LDADD =	\
															
 
																+	$(STARPU_CUDA_FORTRAN_LDFLAGS)
															
 
																 endif
															
 
																 endif
															
 
																-examplebin_PROGRAMS +=				\
															
 
																-	basic_examples/mult
															
 
																-
															
 
																-basic_examples_mult_SOURCES =			\
															
 
																-	basic_examples/mult.c
															
 
																-
															
 
																 #################
															
 
																 # block example #
															
 
																 #################
															
 
																-check_PROGRAMS +=				\
															
 
																-	basic_examples/block
															
 
																-
															
 
																-examplebin_PROGRAMS +=				\
															
 
																-	basic_examples/block
															
 
																-
															
 
																 basic_examples_block_SOURCES =			\
															
 
																 	basic_examples/block.c			\
															
 
																 	basic_examples/block_cpu.c
															
@@ -206,12 +302,6 @@ endif
 
																 # Variable example #
															
 
																 ####################
															
 
																-check_PROGRAMS +=				\
															
 
																-	basic_examples/variable
															
 
																-
															
 
																-examplebin_PROGRAMS +=				\
															
 
																-	basic_examples/variable
															
 
																-
															
 
																 basic_examples_variable_SOURCES =		\
															
 
																 	basic_examples/variable.c		\
															
 
																 	basic_examples/variable_kernels_cpu.c
															
@@ -232,14 +322,6 @@ endif
 
																 # Filters #
															
 
																 ###########
															
 
																-examplebin_PROGRAMS +=				\
															
 
																-	filters/fvector				\
															
 
																-	filters/fblock				\
															
 
																-	filters/fmatrix
															
 
																-
															
 
																-filters_fvector_SOURCES =			\
															
 
																-	filters/fvector.c
															
 
																-
															
 
																 filters_fblock_SOURCES =			\
															
 
																 	filters/fblock.c			\
															
 
																 	filters/fblock_cpu.c
															
@@ -255,69 +337,17 @@ nobase_STARPU_OPENCL_DATA_DATA += \
 
																 	filters/fblock_opencl_kernel.cl
															
 
																 endif
															
 
																-filters_fmatrix_SOURCES =			\
															
 
																-	filters/fmatrix.c
															
 
																-
															
 
																-###################
															
 
																-# PPM downscaling #
															
 
																-###################
															
 
																-
															
 
																-examplebin_PROGRAMS +=				\
															
 
																-	ppm_downscaler/ppm_downscaler
															
 
																-
															
 
																-ppm_downscaler_ppm_downscaler_SOURCES =		\
															
 
																-	ppm_downscaler/ppm_downscaler.c
															
 
																-
															
 
																-examplebin_PROGRAMS +=				\
															
 
																-	ppm_downscaler/yuv_downscaler
															
 
																-
															
 
																-ppm_downscaler_yuv_downscaler_SOURCES =		\
															
 
																-	ppm_downscaler/yuv_downscaler.c
															
 
																-
															
 
																-######
															
 
																-# Pi #
															
 
																-######
															
 
																-
															
 
																-check_PROGRAMS +=				\
															
 
																-	pi/pi_redux
															
 
																-
															
 
																-examplebin_PROGRAMS +=				\
															
 
																-	pi/pi					\
															
 
																-	pi/pi_redux
															
 
																-
															
 
																-pi_pi_SOURCES =					\
															
 
																-	pi/pi.c					\
															
 
																-	pi/SobolQRNG/sobol_gold.c		\
															
 
																-	pi/SobolQRNG/sobol_primitives.c
															
 
																-
															
 
																-if STARPU_USE_CUDA
															
 
																-pi_pi_SOURCES +=				\
															
 
																-	pi/pi_kernel.cu				\
															
 
																-	pi/SobolQRNG/sobol_gpu.cu
															
 
																-endif
															
 
																-
															
 
																-pi_pi_redux_SOURCES =				\
															
 
																-	pi/pi_redux.c
															
 
																-
															
 
																-if STARPU_USE_CUDA
															
 
																-pi_pi_redux_SOURCES +=				\
															
 
																-	pi/pi_redux_kernel.cu
															
 
																-endif
															
 
																-
															
 
																-
															
 
																 ################
															
 
																 # AXPY example #
															
 
																 ################
															
 
																 if !NO_BLAS_LIB
															
 
																-
															
 
																-examplebin_PROGRAMS +=				\
															
 
																-	axpy/axpy
															
 
																-
															
 
																 axpy_axpy_SOURCES =				\
															
 
																 	axpy/axpy.c				\
															
 
																 	common/blas.c
															
 
																+axpy_axpy_LDADD =				\
															
 
																+	$(STARPU_BLAS_LDFLAGS)
															
 
																 endif
															
 
																 ################
															
@@ -326,18 +356,20 @@ endif
 
																 if !NO_BLAS_LIB
															
 
																-examplebin_PROGRAMS += 				\
															
 
																-	mult/sgemm 				\
															
 
																-	mult/dgemm
															
 
																-
															
 
																 mult_sgemm_SOURCES = 				\
															
 
																 	mult/sgemm.c				\
															
 
																 	common/blas.c
															
 
																+mult_sgemm_LDADD =				\
															
 
																+	$(STARPU_BLAS_LDFLAGS)
															
 
																+
															
 
																 mult_dgemm_SOURCES = 				\
															
 
																 	mult/dgemm.c				\
															
 
																 	common/blas.c
															
 
																+mult_dgemm_LDADD =				\
															
 
																+	$(STARPU_BLAS_LDFLAGS)
															
 
																+
															
 
																 endif
															
 
																 ####################
															
@@ -346,36 +378,42 @@ endif
 
																 if !NO_BLAS_LIB
															
 
																-examplebin_PROGRAMS += 				\
															
 
																-	cholesky/cholesky_tag			\
															
 
																-	cholesky/cholesky_tile_tag		\
															
 
																-	cholesky/cholesky_grain_tag		\
															
 
																-	cholesky/cholesky_implicit
															
 
																-
															
 
																 cholesky_cholesky_tag_SOURCES =			\
															
 
																 	cholesky/cholesky_tag.c			\
															
 
																 	cholesky/cholesky_models.c		\
															
 
																 	cholesky/cholesky_kernels.c		\
															
 
																 	common/blas.c
															
 
																+cholesky_cholesky_tag_LDADD =			\
															
 
																+	$(STARPU_BLAS_LDFLAGS)
															
 
																+
															
 
																 cholesky_cholesky_tile_tag_SOURCES =		\
															
 
																 	cholesky/cholesky_tile_tag.c		\
															
 
																 	cholesky/cholesky_models.c		\
															
 
																 	cholesky/cholesky_kernels.c		\
															
 
																 	common/blas.c
															
 
																+cholesky_cholesky_tile_tag_LDADD =		\
															
 
																+	$(STARPU_BLAS_LDFLAGS)
															
 
																+
															
 
																 cholesky_cholesky_grain_tag_SOURCES =		\
															
 
																 	cholesky/cholesky_grain_tag.c		\
															
 
																 	cholesky/cholesky_models.c		\
															
 
																 	cholesky/cholesky_kernels.c		\
															
 
																 	common/blas.c
															
 
																+cholesky_cholesky_grain_tag_LDADD =		\
															
 
																+	$(STARPU_BLAS_LDFLAGS)
															
 
																+
															
 
																 cholesky_cholesky_implicit_SOURCES =		\
															
 
																 	cholesky/cholesky_implicit.c		\
															
 
																 	cholesky/cholesky_models.c		\
															
 
																 	cholesky/cholesky_kernels.c		\
															
 
																 	common/blas.c
															
 
																+cholesky_cholesky_implicit_LDADD =		\
															
 
																+	$(STARPU_BLAS_LDFLAGS)
															
 
																+
															
 
																 endif
															
 
																 ##############
															
@@ -384,14 +422,6 @@ endif
 
																 if !NO_BLAS_LIB
															
 
																-check_PROGRAMS +=				\
															
 
																-	lu/lu_example_float			\
															
 
																-	lu/lu_implicit_example_float
															
 
																-
															
 
																-examplebin_PROGRAMS += 				\
															
 
																-	lu/lu_example_float			\
															
 
																-	lu/lu_example_double
															
 
																-
															
 
																 lu_lu_example_float_SOURCES =			\
															
 
																 	lu/lu_example_float.c			\
															
 
																 	lu/slu.c				\
															
@@ -399,6 +429,9 @@ lu_lu_example_float_SOURCES =			\
 
																 	lu/slu_kernels.c			\
															
 
																 	common/blas.c
															
 
																+lu_lu_example_float_LDADD =			\
															
 
																+	$(STARPU_BLAS_LDFLAGS)
															
 
																+
															
 
																 lu_lu_example_double_SOURCES =			\
															
 
																 	lu/lu_example_double.c			\
															
 
																 	lu/dlu.c				\
															
@@ -406,9 +439,8 @@ lu_lu_example_double_SOURCES =			\
 
																 	lu/dlu_kernels.c			\
															
 
																 	common/blas.c
															
 
																-examplebin_PROGRAMS += 				\
															
 
																-	lu/lu_implicit_example_float		\
															
 
																-	lu/lu_implicit_example_double
															
 
																+lu_lu_example_double_LDADD =			\
															
 
																+	$(STARPU_BLAS_LDFLAGS)
															
 
																 lu_lu_implicit_example_float_SOURCES =		\
															
 
																 	lu/lu_example_float.c			\
															
@@ -417,6 +449,9 @@ lu_lu_implicit_example_float_SOURCES =		\
 
																 	lu/slu_kernels.c			\
															
 
																 	common/blas.c
															
 
																+lu_lu_implicit_example_float_LDADD =		\
															
 
																+	$(STARPU_BLAS_LDFLAGS)
															
 
																+
															
 
																 lu_lu_implicit_example_double_SOURCES =		\
															
 
																 	lu/lu_example_double.c			\
															
 
																 	lu/dlu_implicit.c			\
															
@@ -424,6 +459,8 @@ lu_lu_implicit_example_double_SOURCES =		\
 
																 	lu/dlu_kernels.c			\
															
 
																 	common/blas.c
															
 
																+lu_lu_implicit_example_double_LDADD =		\
															
 
																+	$(STARPU_BLAS_LDFLAGS)
															
 
																 endif
															
 
																 ###########################
															
@@ -448,8 +485,6 @@ endif
 
																 if !NO_BLAS_LIB
															
 
																-examplebin_PROGRAMS += heat/heat
															
 
																-
															
 
																 heat_heat_SOURCES =				\
															
 
																 	heat/heat.c				\
															
 
																 	heat/dw_factolu.c			\
															
@@ -462,6 +497,10 @@ heat_heat_SOURCES =				\
 
																 	heat/dw_factolu_kernels.c		\
															
 
																 	common/blas.c
															
 
																+heat_heat_LDADD =				\
															
 
																+	$(STARPU_OPENGL_RENDER_LDFLAGS)		\
															
 
																+	$(STARPU_BLAS_LDFLAGS)
															
 
																+
															
 
																 endif
															
 
																 ##############
															
@@ -470,8 +509,6 @@ endif
 
																 if !NO_BLAS_LIB
															
 
																-examplebin_PROGRAMS += cg/cg
															
 
																-
															
 
																 cg_cg_SOURCES =					\
															
 
																 	cg/cg.c					\
															
 
																 	cg/cg_kernels.c				\
															
@@ -482,62 +519,33 @@ cg_cg_SOURCES +=				\
 
																 	cg/cg_dot_kernel.cu
															
 
																 endif
															
 
																-endif
															
 
																-
															
 
																+cg_cg_LDADD =					\
															
 
																+	$(STARPU_BLAS_LDFLAGS)
															
 
																-
															
 
																-################
															
 
																-# Tag examples #
															
 
																-################
															
 
																-
															
 
																-check_PROGRAMS +=			\
															
 
																-	tag_example/tag_example			\
															
 
																-	tag_example/tag_example3			\
															
 
																-	tag_example/tag_example2	\
															
 
																-	tag_example/tag_restartable
															
 
																-
															
 
																-examplebin_PROGRAMS +=			\
															
 
																-	tag_example/tag_example			\
															
 
																-	tag_example/tag_example3		\
															
 
																-	tag_example/tag_example2	\
															
 
																-	tag_example/tag_restartable
															
 
																-
															
 
																-tag_example_tag_example_SOURCES =		\
															
 
																-	tag_example/tag_example.c
															
 
																-
															
 
																-tag_example_tag_example2_SOURCES =		\
															
 
																-	tag_example/tag_example2.c
															
 
																-
															
 
																-tag_example_tag_example3_SOURCES =		\
															
 
																-	tag_example/tag_example3.c
															
 
																-
															
 
																-tag_example_tag_restartable_SOURCES =		\
															
 
																-	tag_example/tag_restartable.c
															
 
																+endif
															
 
																 ################
															
 
																 # SpMV example #
															
 
																 ################
															
 
																-examplebin_PROGRAMS += 				\
															
 
																-	spmv/dw_spmv
															
 
																-
															
 
																-spmv_dw_spmv_SOURCES = 				\
															
 
																-	spmv/dw_spmv.c
															
 
																+spmv_spmv_SOURCES = 				\
															
 
																+	spmv/spmv.c				\
															
 
																+	spmv/spmv_kernels.c
															
 
																 if STARPU_USE_CUDA
															
 
																-spmv_dw_spmv_SOURCES +=				\
															
 
																+spmv_spmv_SOURCES +=				\
															
 
																 	spmv/spmv_cuda.cu
															
 
																 endif
															
 
																 if ATLAS_BLAS_LIB
															
 
																-examplebin_PROGRAMS += 				\
															
 
																-	spmv/dw_block_spmv
															
 
																-
															
 
																 spmv_dw_block_spmv_SOURCES =			\
															
 
																 	spmv/dw_block_spmv.c			\
															
 
																 	spmv/dw_block_spmv_kernels.c		\
															
 
																 	spmv/matrix_market/mm_to_bcsr.c		\
															
 
																 	spmv/matrix_market/mmio.c
															
 
																+
															
 
																+spmv_dw_block_spmv_LDADD =			\
															
 
																+	$(STARPU_BLAS_LDFLAGS)
															
 
																 endif
															
 
																 #######################
															
@@ -545,12 +553,6 @@ endif
 
																 #######################
															
 
																-check_PROGRAMS +=				\
															
 
																-	incrementer/incrementer
															
 
																-
															
 
																-examplebin_PROGRAMS +=				\
															
 
																-	incrementer/incrementer
															
 
																-
															
 
																 incrementer_incrementer_SOURCES =	\
															
 
																 	incrementer/incrementer.c
															
 
																 if STARPU_USE_CUDA
															
@@ -568,78 +570,38 @@ endif
 
																 # matVecMult example #
															
 
																 ######################
															
 
																-check_PROGRAMS +=				\
															
 
																-	matvecmult/matvecmult
															
 
																-
															
 
																-examplebin_PROGRAMS +=				\
															
 
																-	matvecmult/matvecmult
															
 
																-
															
 
																-matvecmult_matvecmult_SOURCES =	\
															
 
																-	matvecmult/matvecmult.c
															
 
																-
															
 
																 if STARPU_USE_OPENCL
															
 
																 nobase_STARPU_OPENCL_DATA_DATA += \
															
 
																 	matvecmult/matvecmult_kernel.cl
															
 
																 endif
															
 
																-#####################
															
 
																-# profiling example #
															
 
																-#####################
															
 
																-
															
 
																-check_PROGRAMS +=				\
															
 
																-	profiling/profiling
															
 
																-
															
 
																-examplebin_PROGRAMS +=				\
															
 
																-	profiling/profiling
															
 
																-
															
 
																-profiling_profiling_SOURCES =			\
															
 
																-	profiling/profiling.c
															
 
																-
															
 
																-#####################
															
 
																-# scheduler example #
															
 
																-#####################
															
 
																-
															
 
																-check_PROGRAMS +=				\
															
 
																-	scheduler/dummy_sched
															
 
																-
															
 
																-examplebin_PROGRAMS +=				\
															
 
																-	scheduler/dummy_sched
															
 
																-
															
 
																-scheduler_dummy_sched_SOURCES =			\
															
 
																-	scheduler/dummy_sched.c
															
 
																-
															
 
																 #######################
															
 
																 # dot_product example #
															
 
																 #######################
															
 
																-check_PROGRAMS +=				\
															
 
																-	reductions/dot_product
															
 
																-
															
 
																-examplebin_PROGRAMS +=				\
															
 
																-	reductions/dot_product
															
 
																-
															
 
																 reductions_dot_product_SOURCES =		\
															
 
																 	reductions/dot_product.c
															
 
																-
															
 
																-#####################
															
 
																-# Min/Max reduction #
															
 
																-#####################
															
 
																-
															
 
																-check_PROGRAMS +=				\
															
 
																-	reductions/minmax_reduction
															
 
																-
															
 
																-examplebin_PROGRAMS +=				\
															
 
																-	reductions/minmax_reduction
															
 
																-
															
 
																-reductions_minmax_reduction_SOURCES =		\
															
 
																-	reductions/minmax_reduction.c
															
 
																+if STARPU_USE_CUDA
															
 
																+reductions_dot_product_SOURCES +=		\
															
 
																+	reductions/dot_product_kernels.cu
															
 
																+endif
															
 
																 ##################
															
 
																 # Mandelbrot Set #
															
 
																 ##################
															
 
																+mandelbrot_mandelbrot_CPPFLAGS = $(AM_CFLAGS) $(AM_CPPFLAGS)
															
 
																+if HAVE_X11
															
 
																+mandelbrot_mandelbrot_CPPFLAGS += $(X_CFLAGS)
															
 
																+mandelbrot_mandelbrot_LDADD = $(X_PRE_LIBS) -lX11 $(X_LIBS) $(X_EXTRA_LIBS)
															
 
																+endif
															
 
																+
															
 
																+################
															
 
																+# Top Examples #
															
 
																+################
															
 
																+
															
 
																 examplebin_PROGRAMS +=				\
															
 
																-	mandelbrot/mandelbrot
															
 
																+	top/hello_world_top
															
 
																-mandelbrot_mandelbrot_SOURCES =			\
															
 
																-	mandelbrot/mandelbrot.c
															
 
																+top_hello_world_top_SOURCES =			\
															
 
																+	top/hello_world_top.c
															
--- a/examples/audio/starpu_audio_processing.c
+++ b/examples/audio/starpu_audio_processing.c
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2010  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010-2011  Université de Bordeaux 1
															
 
																  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
															
 
																  * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																  *
															
@@ -30,7 +30,7 @@
 
																 #include <cufft.h>
															
 
																 #endif
															
 
																-//#define SAVE_RAW	1
															
 
																+/* #define SAVE_RAW	1 */
															
 
																 #define DEFAULTINPUTFILE	"input.wav"
															
 
																 #define DEFAULTOUTPUTFILE	"output.wav"
															
@@ -328,14 +328,14 @@ static void init_problem(void)
 
																 	/* allocate a buffer to store the content of input file */
															
 
																 	if (use_pin)
															
 
																 	{
															
 
																-		starpu_data_malloc_pinned_if_possible((void **)&A, length_data*sizeof(float));
															
 
																+		starpu_malloc((void **)&A, length_data*sizeof(float));
															
 
																 	}
															
 
																 	else {
															
 
																 		A = malloc(length_data*sizeof(float));
															
 
																 	}
															
 
																 	/* allocate working buffer (this could be done online, but we'll keep it simple) */
															
 
																-	//starpu_data_malloc_pinned_if_possible((void **)&outdata, length_data*sizeof(fftwf_complex));
															
 
																+	/* starpu_data_malloc_pinned_if_possible((void **)&outdata, length_data*sizeof(fftwf_complex)); */
															
 
																 	/* read input data into buffer "A" */
															
 
																 	read_16bit_wav(infile, length_data, A, infile_raw);
															
@@ -396,9 +396,7 @@ int main(int argc, char **argv)
 
																 	struct starpu_data_filter f = 
															
 
																 	{
															
 
																 		.filter_func = starpu_block_filter_func_vector,
															
 
																-		.nchildren = niter,
															
 
																-		.get_nchildren = NULL,
															
 
																-		.get_child_ops = NULL
															
 
																+		.nchildren = niter
															
 
																 	};
															
 
																 	starpu_data_partition(A_handle, &f);
															
--- a/examples/axpy/axpy.c
+++ b/examples/axpy/axpy.c
@@ -1,8 +1,8 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2009, 2010  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
															
 
																  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
															
 
																- * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -36,6 +36,8 @@
 
																 #define NBLOCKS	8
															
 
																+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
															
 
																+
															
 
																 TYPE *vec_x, *vec_y;
															
 
																 /* descriptors for StarPU */
															
@@ -93,21 +95,21 @@ int main(int argc, char **argv)
 
																 		vec_a = malloc(N*sizeof(TYPE));
															
 
																 		vec_b = malloc(N*sizeof(TYPE));
															
 
																 	*/
															
 
																-	starpu_data_malloc_pinned_if_possible((void **)&vec_x, N*sizeof(TYPE));
															
 
																+	starpu_malloc((void **)&vec_x, N*sizeof(TYPE));
															
 
																 	assert(vec_x);
															
 
																-	starpu_data_malloc_pinned_if_possible((void **)&vec_y, N*sizeof(TYPE));
															
 
																+	starpu_malloc((void **)&vec_y, N*sizeof(TYPE));
															
 
																 	assert(vec_y);
															
 
																 	unsigned i;
															
 
																 	for (i = 0; i < N; i++)
															
 
																 	{
															
 
																-		vec_x[i] = 1.0f;//(TYPE)starpu_drand48();
															
 
																-		vec_y[i] = 4.0f;//(TYPE)starpu_drand48();
															
 
																+		vec_x[i] = 1.0f; /*(TYPE)starpu_drand48(); */
															
 
																+		vec_y[i] = 4.0f; /*(TYPE)starpu_drand48(); */
															
 
																 	}
															
 
																-	fprintf(stderr, "BEFORE x[0] = %2.2f\n", vec_x[0]);
															
 
																-	fprintf(stderr, "BEFORE y[0] = %2.2f\n", vec_y[0]);
															
 
																+	FPRINTF(stderr, "BEFORE x[0] = %2.2f\n", vec_x[0]);
															
 
																+	FPRINTF(stderr, "BEFORE y[0] = %2.2f\n", vec_y[0]);
															
 
																 	/* Declare the data to StarPU */
															
 
																 	starpu_vector_data_register(&handle_x, 0, (uintptr_t)vec_x, N, sizeof(TYPE));
															
@@ -116,9 +118,7 @@ int main(int argc, char **argv)
 
																 	/* Divide the vector into blocks */
															
 
																 	struct starpu_data_filter block_filter = {
															
 
																 		.filter_func = starpu_block_filter_func_vector,
															
 
																-		.nchildren = NBLOCKS,
															
 
																-		.get_nchildren = NULL,
															
 
																-		.get_child_ops = NULL
															
 
																+		.nchildren = NBLOCKS
															
 
																 	};
															
 
																 	starpu_data_partition(handle_x, &block_filter);
															
@@ -151,16 +151,21 @@ int main(int argc, char **argv)
 
																 	starpu_task_wait_for_all();
															
 
																+	starpu_data_unpartition(handle_x, 0);
															
 
																 	starpu_data_unpartition(handle_y, 0);
															
 
																+	starpu_data_unregister(handle_x);
															
 
																 	starpu_data_unregister(handle_y);
															
 
																 	gettimeofday(&end, NULL);
															
 
																         double timing = (double)((end.tv_sec - start.tv_sec)*1000000 +
															
 
																                                         (end.tv_usec - start.tv_usec));
															
 
																-	fprintf(stderr, "timing -> %2.2f us %2.2f MB/s\n", timing, 3*N*sizeof(TYPE)/timing);
															
 
																+	FPRINTF(stderr, "timing -> %2.2f us %2.2f MB/s\n", timing, 3*N*sizeof(TYPE)/timing);
															
 
																+
															
 
																+	FPRINTF(stderr, "AFTER y[0] = %2.2f (ALPHA = %2.2f)\n", vec_y[0], alpha);
															
 
																-	fprintf(stderr, "AFTER y[0] = %2.2f (ALPHA = %2.2f)\n", vec_y[0], alpha);
															
 
																+	starpu_free((void *)vec_x);
															
 
																+	starpu_free((void *)vec_y);
															
 
																 	/* Stop StarPU */
															
 
																 	starpu_shutdown();
															
--- a/examples/basic_examples/block.c
+++ b/examples/basic_examples/block.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2010, 2011  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -20,6 +20,8 @@
 
																 #include <pthread.h>
															
 
																 #include <math.h>
															
 
																+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
															
 
																+
															
 
																 extern void cpu_codelet(void *descr[], void *_args);
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 extern void cuda_codelet(void *descr[], void *_args);
															
@@ -52,24 +54,23 @@ int execute_on(uint32_t where, device_func func, float *block, int pnx, int pny,
 
																         task->buffers[0].handle = block_handle;
															
 
																         task->buffers[0].mode = STARPU_RW;
															
 
																 	task->cl_arg = &multiplier;
															
 
																+	task->cl_arg_size = sizeof(multiplier);
															
 
																         int ret = starpu_task_submit(task);
															
 
																         if (STARPU_UNLIKELY(ret == -ENODEV)) {
															
 
																-                fprintf(stderr, "No worker may execute this task\n");
															
 
																+                FPRINTF(stderr, "No worker may execute this task\n");
															
 
																                 return 1;
															
 
																 	}
															
 
																 	starpu_task_wait_for_all();
															
 
																 	/* update the array in RAM */
															
 
																-        starpu_data_acquire(block_handle, STARPU_R);
															
 
																+	starpu_data_unregister(block_handle);
															
 
																         for(i=0 ; i<pnx*pny*pnz; i++) {
															
 
																-          fprintf(stderr, "%f ", block[i]);
															
 
																+          FPRINTF(stderr, "%f ", block[i]);
															
 
																         }
															
 
																-        fprintf(stderr, "\n");
															
 
																-
															
 
																-        starpu_data_release(block_handle);
															
 
																+        FPRINTF(stderr, "\n");
															
 
																         return 0;
															
 
																 }
															
@@ -98,7 +99,7 @@ int main(int argc, char **argv)
 
																         ret = execute_on(STARPU_CPU, cpu_codelet, block, nx, ny, nz, 1.0);
															
 
																         if (!ret) multiplier *= 1.0;
															
 
																 #ifdef STARPU_USE_OPENCL
															
 
																-        starpu_opencl_load_opencl_from_file("examples/basic_examples/block_opencl_kernel.cl", &opencl_code);
															
 
																+        starpu_opencl_load_opencl_from_file("examples/basic_examples/block_opencl_kernel.cl", &opencl_code, NULL);
															
 
																         ret = execute_on(STARPU_OPENCL, opencl_codelet, block, nx, ny, nz, 2.0);
															
 
																         if (!ret) multiplier *= 2.0;
															
 
																 #endif
															
@@ -107,7 +108,7 @@ int main(int argc, char **argv)
 
																         if (!ret) multiplier *= 3.0;
															
 
																 #endif
															
 
																-        // Check result is correct
															
 
																+        /* Check result is correct */
															
 
																         ret=1;
															
 
																         for(i=0 ; i<nx*ny*nz ; i++) {
															
 
																           if (block[i] != (i+1) * multiplier) {
															
@@ -116,7 +117,9 @@ int main(int argc, char **argv)
 
																           }
															
 
																         }
															
 
																-        fprintf(stderr,"TEST %s\n", ret==1?"PASSED":"FAILED");
															
 
																+        FPRINTF(stderr,"TEST %s\n", ret==1?"PASSED":"FAILED");
															
 
																+	free(block);
															
 
																+
															
 
																         starpu_shutdown();
															
 
																 	return 0;
															
--- a/examples/basic_examples/block_opencl.c
+++ b/examples/basic_examples/block_opencl.c
@@ -25,8 +25,8 @@ void opencl_codelet(void *descr[], void *_args)
 
																 	cl_kernel kernel;
															
 
																 	cl_command_queue queue;
															
 
																 	cl_event event;
															
 
																-	int id, devid, err, n;
															
 
																-	float *block = (float *)STARPU_BLOCK_GET_PTR(descr[0]);
															
 
																+	int id, devid, err;
															
 
																+	cl_mem block = (cl_mem)STARPU_BLOCK_GET_PTR(descr[0]);
															
 
																 	int nx = (int)STARPU_BLOCK_GET_NX(descr[0]);
															
 
																 	int ny = (int)STARPU_BLOCK_GET_NY(descr[0]);
															
 
																 	int nz = (int)STARPU_BLOCK_GET_NZ(descr[0]);
															
@@ -41,14 +41,13 @@ void opencl_codelet(void *descr[], void *_args)
 
																         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																 	err = 0;
															
 
																-        n=0;
															
 
																-	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &block);
															
 
																-	err = clSetKernelArg(kernel, 1, sizeof(int), &nx);
															
 
																-	err = clSetKernelArg(kernel, 2, sizeof(int), &ny);
															
 
																-	err = clSetKernelArg(kernel, 3, sizeof(int), &nz);
															
 
																+	err = clSetKernelArg(kernel, 0, sizeof(block), &block);
															
 
																+	err = clSetKernelArg(kernel, 1, sizeof(nx), &nx);
															
 
																+	err = clSetKernelArg(kernel, 2, sizeof(ny), &ny);
															
 
																+	err = clSetKernelArg(kernel, 3, sizeof(nz), &nz);
															
 
																 	err = clSetKernelArg(kernel, 4, sizeof(ldy), &ldy);
															
 
																 	err = clSetKernelArg(kernel, 5, sizeof(ldz), &ldz);
															
 
																-	err = clSetKernelArg(kernel, 6, sizeof(float), multiplier);
															
 
																+	err = clSetKernelArg(kernel, 6, sizeof(*multiplier), multiplier);
															
 
																         if (err) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																 	{
															
--- a/examples/basic_examples/hello_world.c
+++ b/examples/basic_examples/hello_world.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2010  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -31,12 +31,14 @@
 
																 #include <stdint.h>
															
 
																 #include <starpu.h>
															
 
																+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
															
 
																+
															
 
																 /* When the task is done, task->callback_func(task->callback_arg) is called. Any
															
 
																  * callback function must have the prototype void (*)(void *).
															
 
																  * NB: Callback are NOT allowed to perform potentially blocking operations */
															
 
																 void callback_func(void *callback_arg)
															
 
																 {
															
 
																-	printf("Callback function got argument %p\n", callback_arg);
															
 
																+        FPRINTF(stdout, "Callback function got argument %p\n", callback_arg);
															
 
																 }
															
 
																 /* Every implementation of a codelet must have this prototype, the first
															
@@ -52,22 +54,16 @@ void cpu_func(void *buffers[], void *cl_arg)
 
																 {
															
 
																 	struct params *params = cl_arg;
															
 
																-	printf("Hello world (params = {%i, %f} )\n", params->i, params->f);
															
 
																+	FPRINTF(stdout, "Hello world (params = {%i, %f} )\n", params->i, params->f);
															
 
																 }
															
 
																-starpu_codelet cl =
															
 
																-{
															
 
																-	/* this codelet may only be executed on a CPU, and its cpu
															
 
																- 	 * implementation is function "cpu_func" */
															
 
																-	.where = STARPU_CPU,
															
 
																-	.cpu_func = cpu_func,
															
 
																-	/* the codelet does not manipulate any data that is managed
															
 
																-	 * by our DSM */
															
 
																-	.nbuffers = 0
															
 
																-};
															
 
																+starpu_codelet cl;
															
 
																 int main(int argc, char **argv)
															
 
																 {
															
 
																+	struct starpu_task *task;
															
 
																+	struct params params = {1, 2.0f};
															
 
																+
															
 
																 	/* initialize StarPU : passing a NULL argument means that we use
															
 
																  	* default configuration for the scheduling policies and the number of
															
 
																 	* processors/accelerators */
															
@@ -76,7 +72,15 @@ int main(int argc, char **argv)
 
																 	/* create a new task that is non-blocking by default : the task is not
															
 
																 	 * submitted to the scheduler until the starpu_task_submit function is
															
 
																 	 * called */
															
 
																-	struct starpu_task *task = starpu_task_create();
															
 
																+	task = starpu_task_create();
															
 
																+
															
 
																+	/* this codelet may only be executed on a CPU, and its cpu
															
 
																+ 	 * implementation is function "cpu_func" */
															
 
																+	cl.where = STARPU_CPU;
															
 
																+	cl.cpu_func = cpu_func;
															
 
																+	/* the codelet does not manipulate any data that is managed
															
 
																+	 * by our DSM */
															
 
																+	cl.nbuffers = 0;
															
 
																 	/* the task uses codelet "cl" */
															
 
																 	task->cl = &cl;
															
@@ -89,7 +93,6 @@ int main(int argc, char **argv)
 
																 	 * is read-only so that any modification is not passed to other copies
															
 
																 	 * of the buffer.  For this reason, a buffer passed as a codelet
															
 
																 	 * argument (cl_arg) is NOT a valid synchronization medium! */
															
 
																-	struct params params = { 1, 2.0f };
															
 
																 	task->cl_arg = &params;
															
 
																 	task->cl_arg_size = sizeof(params);
															
@@ -103,6 +106,9 @@ int main(int argc, char **argv)
 
																 	/* submit the task to StarPU */
															
 
																 	starpu_task_submit(task);
															
 
																+
															
 
																+	/* destroy the task */
															
 
																+	starpu_task_destroy(task);
															
 
																 	/* terminate StarPU: statistics and other debug outputs are not
															
 
																 	 * guaranteed to be generated unless this function is called. Once it
															
--- a/examples/basic_examples/mult.c
+++ b/examples/basic_examples/mult.c
@@ -1,8 +1,8 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2010  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010-2011  Université de Bordeaux 1
															
 
																  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
															
 
																- * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -28,8 +28,7 @@
 
																  *    monitoring data (starpu_data_unregister)
															
 
																  *  - how to manipulate subsets of data (starpu_data_get_sub_data)
															
 
																  *  - how to construct an autocalibrated performance model (starpu_perfmodel_t)
															
 
																- *  - how to submit asynchronous tasks and how to use callback to handle task
															
 
																- *    termination
															
 
																+ *  - how to submit asynchronous tasks
															
 
																  */
															
 
																 #include <string.h>
															
@@ -44,11 +43,6 @@
 
																 static float *A, *B, *C;
															
 
																 static starpu_data_handle A_handle, B_handle, C_handle;
															
 
																-static pthread_mutex_t mutex;
															
 
																-static pthread_cond_t cond;
															
 
																-static unsigned taskcounter;
															
 
																-static unsigned terminated = 0;
															
 
																-
															
 
																 static unsigned nslicesx = 4;
															
 
																 static unsigned nslicesy = 4;
															
 
																 static unsigned xdim = 1024;
															
@@ -77,37 +71,11 @@ static unsigned zdim = 512;
 
																  */
															
 
																-static void callback_func(void *arg)
															
 
																-{
															
 
																-	/* the argument is a pointer to a counter of the remaining tasks */
															
 
																-	int *counterptr = arg;
															
 
																-
															
 
																-	/* counterptr points to a variable with the number of remaining tasks,
															
 
																- 	 * when it reaches 0, all tasks are done */
															
 
																-	int counter = STARPU_ATOMIC_ADD(counterptr, -1);
															
 
																-	if (counter == 0)
															
 
																-	{
															
 
																-		/* IMPORTANT : note that we CANNOT call blocking operations
															
 
																-		 * within callbacks as it may lead to a deadlock of StarPU.
															
 
																-		 * starpu_data_unpartition is for instance called by the main
															
 
																-		 * thread since it may cause /potentially/ blocking operations
															
 
																-		 * such as memory transfers from a GPU to a CPU. */
															
 
																-		
															
 
																-		/* wake the application to notify the termination of all the
															
 
																- 		 * tasks */
															
 
																-		pthread_mutex_lock(&mutex);
															
 
																-		terminated = 1;
															
 
																-		pthread_cond_signal(&cond);
															
 
																-		pthread_mutex_unlock(&mutex);
															
 
																-	}
															
 
																-}
															
 
																-
															
 
																 /*
															
 
																  * The codelet is passed 3 matrices, the "descr" union-type field gives a
															
 
																  * description of the layout of those 3 matrices in the local memory (ie. RAM
															
 
																  * in the case of CPU, GPU frame buffer in the case of GPU etc.). Since we have
															
 
																- * registered data with the "blas" data interface, we manipulate the .blas
															
 
																- * field of the descr[x] elements which are union types.
															
 
																+ * registered data with the "matrix" data interface, we use the matrix macros.
															
 
																  */
															
 
																 static void cpu_mult(void *descr[], __attribute__((unused))  void *arg)
															
@@ -218,18 +186,14 @@ static void partition_mult_data(void)
 
																 	/* StarPU supplies some basic filters such as the partition of a matrix
															
 
																 	 * into blocks, note that we are using a FORTRAN ordering so that the
															
 
																 	 * name of the filters are a bit misleading */
															
 
																-	struct starpu_data_filter f = {
															
 
																+	struct starpu_data_filter vert = {
															
 
																 		.filter_func = starpu_vertical_block_filter_func,
															
 
																-		.nchildren = nslicesx,
															
 
																-		.get_nchildren = NULL,
															
 
																-		.get_child_ops = NULL
															
 
																+		.nchildren = nslicesx
															
 
																 	};
															
 
																-	struct starpu_data_filter f2 = {
															
 
																+	struct starpu_data_filter horiz = {
															
 
																 		.filter_func = starpu_block_filter_func,
															
 
																-		.nchildren = nslicesy,
															
 
																-		.get_nchildren = NULL,
															
 
																-		.get_child_ops = NULL
															
 
																+		.nchildren = nslicesy
															
 
																 	};
															
 
																 /*
															
@@ -269,17 +233,17 @@ static void partition_mult_data(void)
 
																  *	enforce memory consistency.
															
 
																  */
															
 
																-	starpu_data_partition(B_handle, &f);
															
 
																-	starpu_data_partition(A_handle, &f2);
															
 
																+	starpu_data_partition(B_handle, &vert);
															
 
																+	starpu_data_partition(A_handle, &horiz);
															
 
																 	/* starpu_data_map_filters is a variable-arity function, the first argument
															
 
																 	 * is the handle of the data to partition, the second argument is the
															
 
																 	 * number of filters to apply recursively. Filters are applied in the
															
 
																 	 * same order as the arguments.
															
 
																-	 * This would be equivalent to starpu_data_partition(C_handle, &f) and
															
 
																-	 * then applying f2 on each sub-data (ie. each column of C)
															
 
																+	 * This would be equivalent to starpu_data_partition(C_handle, &vert) and
															
 
																+	 * then applying horiz on each sub-data (ie. each column of C)
															
 
																 	 */
															
 
																-	starpu_data_map_filters(C_handle, 2, &f, &f2);
															
 
																+	starpu_data_map_filters(C_handle, 2, &vert, &horiz);
															
 
																 }
															
 
																 static struct starpu_perfmodel_t mult_perf_model = {
															
@@ -287,28 +251,23 @@ static struct starpu_perfmodel_t mult_perf_model = {
 
																 	.symbol = "mult_perf_model"
															
 
																 };
															
 
																+static starpu_codelet cl = {
															
 
																+        /* we can only execute that kernel on a CPU yet */
															
 
																+        .where = STARPU_CPU,
															
 
																+        /* CPU implementation of the codelet */
															
 
																+        .cpu_func = cpu_mult,
															
 
																+        /* the codelet manipulates 3 buffers that are managed by the
															
 
																+         * DSM */
															
 
																+        .nbuffers = 3,
															
 
																+        /* in case the scheduling policy may use performance models */
															
 
																+        .model = &mult_perf_model
															
 
																+};
															
 
																+
															
 
																 static void launch_tasks(void)
															
 
																 {
															
 
																 	/* partition the work into slices */
															
 
																 	unsigned taskx, tasky;
															
 
																-	/* the callback decrements this value every time a task is terminated
															
 
																-	 * and notify the termination of the computation to the application
															
 
																-	 * when the counter reaches 0 */
															
 
																-	taskcounter = nslicesx * nslicesy;
															
 
																-
															
 
																-	starpu_codelet cl = {
															
 
																-		/* we can only execute that kernel on a CPU yet */
															
 
																-		.where = STARPU_CPU,
															
 
																-		/* CPU implementation of the codelet */
															
 
																-		.cpu_func = cpu_mult,
															
 
																-		/* the codelet manipulates 3 buffers that are managed by the
															
 
																- 		 * DSM */
															
 
																-		.nbuffers = 3,
															
 
																-		/* in case the scheduling policy may use performance models */
															
 
																-		.model = &mult_perf_model
															
 
																-	};
															
 
																-
															
 
																 	for (taskx = 0; taskx < nslicesx; taskx++) 
															
 
																 	{
															
 
																 		for (tasky = 0; tasky < nslicesy; tasky++)
															
@@ -322,9 +281,6 @@ static void launch_tasks(void)
 
																 			/* this task implements codelet "cl" */
															
 
																 			task->cl = &cl;
															
 
																-			task->callback_func = callback_func;
															
 
																-			task->callback_arg = &taskcounter;
															
 
																-
															
 
																 			/*
															
 
																 			 *              |---|---|---|---|
															
 
																 			 *              |   | * |   |   | B
															
@@ -371,9 +327,6 @@ static void launch_tasks(void)
 
																 int main(__attribute__ ((unused)) int argc, 
															
 
																 	 __attribute__ ((unused)) char **argv)
															
 
																 {
															
 
																-	pthread_mutex_init(&mutex, NULL);
															
 
																-	pthread_cond_init(&cond, NULL);
															
 
																-
															
 
																 	/* start the runtime */
															
 
																 	starpu_init(NULL);
															
@@ -387,26 +340,30 @@ int main(__attribute__ ((unused)) int argc,
 
																 	/* submit all tasks in an asynchronous fashion */
															
 
																 	launch_tasks();
															
 
																-	/* the different tasks are asynchronous so we use a callback to get
															
 
																-	 * notified of the termination of the computation */
															
 
																-	pthread_mutex_lock(&mutex);
															
 
																-	if (!terminated)
															
 
																-		pthread_cond_wait(&cond, &mutex);
															
 
																-	pthread_mutex_unlock(&mutex);
															
 
																+	/* wait for termination */
															
 
																+        starpu_task_wait_for_all();
															
 
																 	/* remove the filters applied by the means of starpu_data_map_filters; now
															
 
																  	 * it's not possible to manipulate a subset of C using starpu_data_get_sub_data until
															
 
																 	 * starpu_data_map_filters is called again on C_handle.
															
 
																 	 * The second argument is the memory node where the different subsets
															
 
																 	 * should be reassembled, 0 = main memory (RAM) */
															
 
																+	starpu_data_unpartition(A_handle, 0);
															
 
																+	starpu_data_unpartition(B_handle, 0);
															
 
																 	starpu_data_unpartition(C_handle, 0);
															
 
																 	/* stop monitoring matrix C : after this, it is not possible to pass C 
															
 
																 	 * (or any subset of C) as a codelet input/output. This also implements
															
 
																 	 * a barrier so that the piece of data is put back into main memory in
															
 
																 	 * case it was only available on a GPU for instance. */
															
 
																+	starpu_data_unregister(A_handle);
															
 
																+	starpu_data_unregister(B_handle);
															
 
																 	starpu_data_unregister(C_handle);
															
 
																-	
															
 
																+
															
 
																+	free(A);
															
 
																+	free(B);
															
 
																+	free(C);
															
 
																+
															
 
																 	starpu_shutdown();
															
 
																 	return 0;
															
--- a/examples/basic_examples/variable.c
+++ b/examples/basic_examples/variable.c
@@ -18,6 +18,8 @@
 
																 #include <starpu.h>
															
 
																 #include <pthread.h>
															
 
																+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
															
 
																+
															
 
																 static unsigned niter = 50000;
															
 
																 extern void cpu_codelet(void *descr[], __attribute__ ((unused)) void *_args);
															
@@ -41,6 +43,9 @@ int main(int argc, char **argv)
 
																 	starpu_init(NULL);
															
 
																+#ifdef STARPU_SLOW_MACHINE
															
 
																+	niter /= 100;
															
 
																+#endif
															
 
																         if (argc == 2) niter = atoi(argv[1]);
															
 
																         foo = 0.0f;
															
@@ -48,7 +53,7 @@ int main(int argc, char **argv)
 
																                                       (uintptr_t)&foo, sizeof(float));
															
 
																 #ifdef STARPU_USE_OPENCL
															
 
																-        starpu_opencl_load_opencl_from_file("examples/basic_examples/variable_kernels_opencl_kernel.cl", &opencl_program);
															
 
																+        starpu_opencl_load_opencl_from_file("examples/basic_examples/variable_kernels_opencl_kernel.cl", &opencl_program, NULL);
															
 
																 #endif
															
 
																 	cl.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL;
															
@@ -77,7 +82,7 @@ int main(int argc, char **argv)
 
																 		ret = starpu_task_submit(task);
															
 
																 		if (STARPU_UNLIKELY(ret == -ENODEV))
															
 
																 		{
															
 
																-			fprintf(stderr, "No worker may execute this task\n");
															
 
																+			FPRINTF(stderr, "No worker may execute this task\n");
															
 
																 			exit(0);
															
 
																 		}
															
 
																 	}
															
@@ -85,11 +90,9 @@ int main(int argc, char **argv)
 
																 	starpu_task_wait_for_all();
															
 
																 	/* update the array in RAM */
															
 
																-	starpu_data_acquire(float_array_handle, STARPU_R);
															
 
																-
															
 
																-	fprintf(stderr, "variable -> %f\n", foo);
															
 
																+	starpu_data_unregister(float_array_handle);
															
 
																-	starpu_data_release(float_array_handle);
															
 
																+	FPRINTF(stderr, "variable -> %f\n", foo);
															
 
																 	starpu_shutdown();
															
--- a/examples/basic_examples/variable_kernels_opencl.c
+++ b/examples/basic_examples/variable_kernels_opencl.c
@@ -21,7 +21,7 @@
 
																 extern struct starpu_opencl_program opencl_program;
															
 
																 void opencl_codelet(void *descr[], void *_args)
															
 
																 {
															
 
																-	float *val = (float *)STARPU_VARIABLE_GET_PTR(descr[0]);
															
 
																+	cl_mem val = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[0]);
															
 
																 	cl_kernel kernel;
															
 
																 	cl_command_queue queue;
															
 
																 	cl_event event;
															
@@ -34,7 +34,7 @@ void opencl_codelet(void *descr[], void *_args)
 
																 	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																 	err = 0;
															
 
																-	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &val);
															
 
																+	err = clSetKernelArg(kernel, 0, sizeof(val), &val);
															
 
																 	if (err) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																 	{
															
--- a/examples/basic_examples/vector_scal.c
+++ b/examples/basic_examples/vector_scal.c
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																  * Copyright (C) 2010, 2011  Université de Bordeaux 1
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
@@ -28,6 +28,7 @@
 
																 #include <stdio.h>
															
 
																 #define	NX	2048
															
 
																+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
															
 
																 extern void scal_cpu_func(void *buffers[], void *_args);
															
 
																 extern void scal_cuda_func(void *buffers[], void *_args);
															
@@ -71,16 +72,17 @@ int main(int argc, char **argv)
 
																 	float vector[NX];
															
 
																 	unsigned i;
															
 
																 	for (i = 0; i < NX; i++)
															
 
																-		vector[i] = 1.0f;
															
 
																+                vector[i] = (i+1.0f);
															
 
																-	fprintf(stderr, "BEFORE : First element was %f\n", vector[0]);
															
 
																+	FPRINTF(stderr, "BEFORE: First element was %f\n", vector[0]);
															
 
																+	FPRINTF(stderr, "BEFORE: Last element was %f\n", vector[NX-1]);
															
 
																 	/* Initialize StarPU with default configuration */
															
 
																 	starpu_init(NULL);
															
 
																 #ifdef STARPU_USE_OPENCL
															
 
																 	starpu_opencl_load_opencl_from_file("examples/basic_examples/vector_scal_opencl_kernel.cl",
															
 
																-					    &opencl_program);
															
 
																+					    &opencl_program, NULL);
															
 
																 #endif
															
 
																 	/* Tell StaPU to associate the "vector" vector with the "vector_handle"
															
@@ -125,6 +127,8 @@ int main(int argc, char **argv)
 
																  	 * monitoring it */
															
 
																 	starpu_data_unregister(vector_handle);
															
 
																+	starpu_task_destroy(task);
															
 
																+
															
 
																 #ifdef STARPU_USE_OPENCL
															
 
																         starpu_opencl_unload_opencl(&opencl_program);
															
 
																 #endif
															
@@ -132,7 +136,8 @@ int main(int argc, char **argv)
 
																 	/* terminate StarPU, no task can be submitted after */
															
 
																 	starpu_shutdown();
															
 
																-	fprintf(stderr, "AFTER First element is %f\n", vector[0]);
															
 
																+	FPRINTF(stderr, "AFTER: First element is %f\n", vector[0]);
															
 
																+	FPRINTF(stderr, "AFTER: Last element is %f\n", vector[NX-1]);
															
 
																 	return 0;
															
 
																 }
															
--- a/examples/basic_examples/vector_scal_cuda.cu
+++ b/examples/basic_examples/vector_scal_cuda.cu
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																  * Copyright (C) 2010  Université de Bordeaux 1
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
@@ -25,7 +25,7 @@
 
																 static __global__ void vector_mult_cuda(float *val, unsigned n,
															
 
																                                         float factor)
															
 
																 {
															
 
																-        unsigned i = threadIdx.x;
															
 
																+        unsigned i =  blockIdx.x*blockDim.x + threadIdx.x;
															
 
																 	if (i < n)
															
 
																                val[i] *= factor;
															
--- a/examples/basic_examples/vector_scal_opencl.c
+++ b/examples/basic_examples/vector_scal_opencl.c
@@ -36,8 +36,8 @@ void scal_opencl_func(void *buffers[], void *_args)
 
																 	/* length of the vector */
															
 
																 	unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
															
 
																-	/* local copy of the vector pointer */
															
 
																-	float *val = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);
															
 
																+	/* OpenCL copy of the vector pointer */
															
 
																+	cl_mem val = (cl_mem)STARPU_VECTOR_GET_PTR(buffers[0]);
															
 
																 	id = starpu_worker_get_id();
															
 
																 	devid = starpu_worker_get_devid(id);
															
@@ -45,7 +45,7 @@ void scal_opencl_func(void *buffers[], void *_args)
 
																 	err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, "vector_mult_opencl", devid);
															
 
																 	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																-	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &val);
															
 
																+	err = clSetKernelArg(kernel, 0, sizeof(val), &val);
															
 
																 	err |= clSetKernelArg(kernel, 1, sizeof(n), &n);
															
 
																 	err |= clSetKernelArg(kernel, 2, sizeof(*factor), factor);
															
 
																 	if (err) STARPU_OPENCL_REPORT_ERROR(err);
															
--- a/examples/callback/callback.c
+++ b/examples/callback/callback.c
@@ -0,0 +1,73 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include <starpu.h>
															
 
																+#include <pthread.h>
															
 
																+#include <sys/time.h>
															
 
																+
															
 
																+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
															
 
																+
															
 
																+starpu_data_handle handle;
															
 
																+
															
 
																+void cpu_codelet(void *descr[], __attribute__ ((unused)) void *_args)
															
 
																+{
															
 
																+	int *val = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
															
 
																+
															
 
																+	*val += 1;
															
 
																+}
															
 
																+
															
 
																+starpu_codelet cl =
															
 
																+{
															
 
																+	.where = STARPU_CPU,
															
 
																+	.cpu_func = cpu_codelet,
															
 
																+	.nbuffers = 1
															
 
																+};
															
 
																+
															
 
																+void callback_func(void *callback_arg)
															
 
																+{
															
 
																+	struct starpu_task *task = starpu_task_create();
															
 
																+	task->cl = &cl;
															
 
																+	task->buffers[0].handle = handle;
															
 
																+	task->buffers[0].mode = STARPU_RW;
															
 
																+	starpu_task_submit(task);
															
 
																+}
															
 
																+
															
 
																+int main(int argc, char **argv)
															
 
																+{
															
 
																+	int v=40;
															
 
																+
															
 
																+	starpu_init(NULL);
															
 
																+	starpu_variable_data_register(&handle, 0, (uintptr_t)&v, sizeof(int));
															
 
																+
															
 
																+	struct starpu_task *task = starpu_task_create();
															
 
																+	task->cl = &cl;
															
 
																+	task->callback_func = callback_func;
															
 
																+	task->callback_arg = NULL;
															
 
																+	task->buffers[0].handle = handle;
															
 
																+	task->buffers[0].mode = STARPU_RW;
															
 
																+
															
 
																+	starpu_task_submit(task);
															
 
																+
															
 
																+	starpu_task_wait_for_all();
															
 
																+	starpu_data_unregister(handle);
															
 
																+
															
 
																+	FPRINTF(stderr, "v -> %d\n", v);
															
 
																+
															
 
																+	starpu_shutdown();
															
 
																+
															
 
																+	return 0;
															
 
																+}
															
--- a/examples/cg/cg.c
+++ b/examples/cg/cg.c
@@ -13,6 +13,7 @@
 
																  *
															
 
																  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																  */
															
 
																+
															
 
																 #include <math.h>
															
 
																 #include <assert.h>
															
 
																 #include <sys/time.h>
															
@@ -24,6 +25,8 @@
 
																 #include <cublas.h>
															
 
																 #endif
															
 
																+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
															
 
																+
															
 
																 /*
															
 
																  *	Conjugate Gradient
															
 
																  *
															
@@ -92,23 +95,19 @@ extern starpu_codelet bzero_vector_cl;
 
																 static void generate_random_problem(void)
															
 
																 {
															
 
																-	srand48(0xdeadbeef);
															
 
																-
															
 
																 	int i, j;
															
 
																-	starpu_data_malloc_pinned_if_possible((void **)&A, n*n*sizeof(TYPE));
															
 
																-	starpu_data_malloc_pinned_if_possible((void **)&b, n*sizeof(TYPE));
															
 
																-	starpu_data_malloc_pinned_if_possible((void **)&x, n*sizeof(TYPE));
															
 
																+	starpu_malloc((void **)&A, n*n*sizeof(TYPE));
															
 
																+	starpu_malloc((void **)&b, n*sizeof(TYPE));
															
 
																+	starpu_malloc((void **)&x, n*sizeof(TYPE));
															
 
																 	assert(A && b && x);
															
 
																-	/* Create a random matrix (A) and two random vectors (x and b) */
															
 
																 	for (j = 0; j < n; j++)
															
 
																 	{
															
 
																 		b[j] = (TYPE)1.0;
															
 
																 		x[j] = (TYPE)0.0;
															
 
																 		/* We take Hilbert matrix that is not well conditionned but definite positive: H(i,j) = 1/(1+i+j) */
															
 
																-
															
 
																 		for (i = 0; i < n; i++)
															
 
																 		{
															
 
																 			A[n*j + i] = (TYPE)(1.0/(1.0+i+j));
															
@@ -116,9 +115,9 @@ static void generate_random_problem(void)
 
																 	}
															
 
																 	/* Internal vectors */
															
 
																-	starpu_data_malloc_pinned_if_possible((void **)&r, n*sizeof(TYPE));
															
 
																-	starpu_data_malloc_pinned_if_possible((void **)&d, n*sizeof(TYPE));
															
 
																-	starpu_data_malloc_pinned_if_possible((void **)&q, n*sizeof(TYPE));
															
 
																+	starpu_malloc((void **)&r, n*sizeof(TYPE));
															
 
																+	starpu_malloc((void **)&d, n*sizeof(TYPE));
															
 
																+	starpu_malloc((void **)&q, n*sizeof(TYPE));
															
 
																 	assert(r && d && q);
															
 
																 	memset(r, 0, n*sizeof(TYPE));
															
@@ -205,12 +204,12 @@ static void display_vector(starpu_data_handle handle, TYPE *ptr)
 
																 		starpu_data_acquire(starpu_data_get_sub_data(handle, 1, b), STARPU_R);
															
 
																 		for (ind = 0; ind < block_size; ind++)
															
 
																 		{
															
 
																-			fprintf(stderr, "%2.2e ", ptr[b*block_size + ind]);
															
 
																+			FPRINTF(stderr, "%2.2e ", ptr[b*block_size + ind]);
															
 
																 		}
															
 
																-		fprintf(stderr, "| ");
															
 
																+		FPRINTF(stderr, "| ");
															
 
																 		starpu_data_release(starpu_data_get_sub_data(handle, 1, b));
															
 
																 	}
															
 
																-	fprintf(stderr, "\n");
															
 
																+	FPRINTF(stderr, "\n");
															
 
																 }
															
 
																 static void display_matrix(void)
															
@@ -220,9 +219,9 @@ static void display_matrix(void)
 
																 	{
															
 
																 		for (j = 0; j < n; j++)
															
 
																 		{
															
 
																-			fprintf(stderr, "%2.2e ", A[j*n + i]);
															
 
																+			FPRINTF(stderr, "%2.2e ", A[j*n + i]);
															
 
																 		}
															
 
																-		fprintf(stderr, "\n");
															
 
																+		FPRINTF(stderr, "\n");
															
 
																 	}
															
 
																 }
															
 
																 #endif
															
@@ -255,8 +254,8 @@ static void cg(void)
 
																 	delta_0 = delta_new;
															
 
																 	starpu_data_release(rtr_handle);
															
 
																-	fprintf(stderr, "*************** INITIAL ************ \n");
															
 
																-	fprintf(stderr, "Delta 0: %e\n", delta_new);
															
 
																+	FPRINTF(stderr, "*************** INITIAL ************ \n");
															
 
																+	FPRINTF(stderr, "Delta 0: %e\n", delta_new);
															
 
																 	struct timeval start;
															
 
																 	struct timeval end;
															
@@ -307,8 +306,8 @@ static void cg(void)
 
																 		{
															
 
																 			/* We here take the error as ||r||_2 / (n||b||_2) */
															
 
																 			double error = sqrt(delta_new/delta_0)/(1.0*n);
															
 
																-			fprintf(stderr, "*****************************************\n");
															
 
																-			fprintf(stderr, "iter %d DELTA %e - %e\n", i, delta_new, error);
															
 
																+			FPRINTF(stderr, "*****************************************\n");
															
 
																+			FPRINTF(stderr, "iter %d DELTA %e - %e\n", i, delta_new, error);
															
 
																 		}
															
 
																 		i++;
															
@@ -317,8 +316,8 @@ static void cg(void)
 
																 	gettimeofday(&end, NULL);
															
 
																 	double timing = (double)(((double)end.tv_sec - (double)start.tv_sec)*10e6 + ((double)end.tv_usec - (double)start.tv_usec));
															
 
																-	fprintf(stderr, "Total timing : %2.2f seconds\n", timing/10e6);
															
 
																-	fprintf(stderr, "Seconds per iteration : %2.2e\n", timing/10e6/i);
															
 
																+	FPRINTF(stderr, "Total timing : %2.2f seconds\n", timing/10e6);
															
 
																+	FPRINTF(stderr, "Seconds per iteration : %2.2e\n", timing/10e6/i);
															
 
																 }
															
 
																 static int check(void)
															
@@ -351,7 +350,7 @@ static void parse_args(int argc, char **argv)
 
																 		}
															
 
																 	        if (strcmp(argv[i], "-h") == 0) {
															
 
																-			fprintf(stderr, "usage: %s [-h] [-nblocks #blocks] [-n problem_size] [-no-reduction] [-maxiter i]\n", argv[0]);
															
 
																+			FPRINTF(stderr, "usage: %s [-h] [-nblocks #blocks] [-n problem_size] [-no-reduction] [-maxiter i]\n", argv[0]);
															
 
																 			exit(-1);
															
 
																 			continue;
															
 
																 		}
															
--- a/examples/cg/cg.h
+++ b/examples/cg/cg.h
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2010  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010-2011  Université de Bordeaux 1
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -82,4 +82,4 @@ void copy_handle(starpu_data_handle dst,
 
																 		starpu_data_handle src,
															
 
																 		unsigned nblocks);
															
 
																-#endif // __STARPU_EXAMPLE_CG_H__
															
 
																+#endif /* __STARPU_EXAMPLE_CG_H__ */
															
--- a/examples/cg/cg_dot_kernel.cu
+++ b/examples/cg/cg_dot_kernel.cu
@@ -126,3 +126,22 @@ extern "C" void dot_host(TYPE *x, TYPE *y, unsigned nelems, TYPE *dot)
 
																 	cudaFree(per_block_sum);
															
 
																 }
															
 
																+
															
 
																+static __global__ void zero_vector_device(TYPE *x, unsigned nelems, unsigned nelems_per_thread)
															
 
																+{
															
 
																+	unsigned i;
															
 
																+	unsigned first_i = blockDim.x * blockIdx.x + threadIdx.x;
															
 
																+
															
 
																+	for (i = first_i; i < nelems; i += nelems_per_thread)
															
 
																+		x[i] = 0.0;
															
 
																+}
															
 
																+
															
 
																+extern "C" void zero_vector(TYPE *x, unsigned nelems)
															
 
																+{
															
 
																+	unsigned nblocks = STARPU_MIN(128, nelems);
															
 
																+	unsigned nthread_per_block = STARPU_MIN(MAXTHREADSPERBLOCK, (nelems / nblocks));
															
 
																+
															
 
																+	unsigned nelems_per_thread = nelems / (nblocks * nthread_per_block);
															
 
																+
															
 
																+	zero_vector_device<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(x, nelems, nelems_per_thread);
															
 
																+}
															
--- a/examples/cg/cg_kernels.c
+++ b/examples/cg/cg_kernels.c
@@ -16,6 +16,7 @@
 
																 #include "cg.h"
															
 
																 #include <math.h>
															
 
																+#include <limits.h>
															
 
																 #if 0
															
 
																 static void print_vector_from_descr(unsigned nx, TYPE *v)
															
@@ -123,11 +124,14 @@ starpu_codelet accumulate_vector_cl = {
 
																  */
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																+extern void zero_vector(TYPE *x, unsigned nelems);
															
 
																+
															
 
																 static void bzero_variable_cuda(void *descr[], void *cl_arg)
															
 
																 {
															
 
																 	TYPE *v = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
															
 
																+
															
 
																+	zero_vector(v, 1);
															
 
																-	cublasscal (1, (TYPE)0.0, v, 1);
															
 
																 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																 }
															
 
																 #endif
															
@@ -159,7 +163,8 @@ static void bzero_vector_cuda(void *descr[], void *cl_arg)
 
																 	TYPE *v = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
															
 
																 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
															
 
																-	cublasscal (n, (TYPE)0.0, v, 1);
															
 
																+	zero_vector(v, n);
															
 
																+
															
 
																 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																 }
															
 
																 #endif
															
@@ -578,8 +583,8 @@ static void copy_handle_cuda(void *descr[], void *cl_arg)
 
																 	unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);
															
 
																 	size_t elemsize = STARPU_VECTOR_GET_ELEMSIZE(descr[0]);
															
 
																-	cudaMemcpy(dst, src, nx*elemsize, cudaMemcpyDeviceToDevice);
															
 
																-	cudaThreadSynchronize();
															
 
																+	cudaMemcpyAsync(dst, src, nx*elemsize, cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
															
 
																+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																 }
															
 
																 #endif
															
--- a/examples/cholesky/cholesky.h
+++ b/examples/cholesky/cholesky.h
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -31,6 +31,7 @@
 
																 #include <common/blas.h>
															
 
																 #include <starpu.h>
															
 
																+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
															
 
																 #define NMAXBLOCKS	32
															
 
																 #define TAG11(k)	((starpu_tag_t)( (1ULL<<60) | (unsigned long long)(k)))
															
@@ -112,4 +113,4 @@ static void __attribute__((unused)) parse_args(int argc, char **argv)
 
																 	}
															
 
																 }
															
 
																-#endif // __DW_CHOLESKY_H__
															
 
																+#endif /* __DW_CHOLESKY_H__ */
															
--- a/examples/cholesky/cholesky_grain_tag.c
+++ b/examples/cholesky/cholesky_grain_tag.c
@@ -2,7 +2,7 @@
 
																  *
															
 
																  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
															
 
																  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
															
 
																- * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -49,7 +49,7 @@ static starpu_codelet cl11 =
 
																 static struct starpu_task * create_task_11(starpu_data_handle dataA, unsigned k, unsigned reclevel)
															
 
																 {
															
 
																-//	printf("task 11 k = %d TAG = %llx\n", k, (TAG11(k)));
															
 
																+/*	FPRINTF(stdout, "task 11 k = %d TAG = %llx\n", k, (TAG11(k))); */
															
 
																 	struct starpu_task *task = create_task(TAG11_AUX(k, reclevel));
															
@@ -121,7 +121,7 @@ static starpu_codelet cl22 =
 
																 static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, unsigned j, unsigned reclevel)
															
 
																 {
															
 
																-//	printf("task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22_AUX(k,i,j));
															
 
																+/*	FPRINTF(stdout, "task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22_AUX(k,i,j)); */
															
 
																 	struct starpu_task *task = create_task(TAG22_AUX(k, i, j, reclevel));
															
@@ -173,17 +173,15 @@ static void cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 
																 	starpu_data_set_sequential_consistency_flag(dataA, 0);
															
 
																-	struct starpu_data_filter f;
															
 
																-		f.filter_func = starpu_vertical_block_filter_func;
															
 
																-		f.nchildren = nblocks;
															
 
																-		f.get_nchildren = NULL;
															
 
																-		f.get_child_ops = NULL;
															
 
																+	struct starpu_data_filter f = {
															
 
																+		.filter_func = starpu_vertical_block_filter_func,
															
 
																+		.nchildren = nblocks
															
 
																+	};
															
 
																-	struct starpu_data_filter f2;
															
 
																-		f2.filter_func = starpu_block_filter_func;
															
 
																-		f2.nchildren = nblocks;
															
 
																-		f2.get_nchildren = NULL;
															
 
																-		f2.get_child_ops = NULL;
															
 
																+	struct starpu_data_filter f2 = {
															
 
																+		.filter_func = starpu_block_filter_func,
															
 
																+		.nchildren = nblocks
															
 
																+	};
															
 
																 	starpu_data_map_filters(dataA, 2, &f, &f2);
															
@@ -214,7 +212,7 @@ static void cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 
																 	int ret = starpu_task_submit(entry_task);
															
 
																 	if (STARPU_UNLIKELY(ret == -ENODEV))
															
 
																 	{
															
 
																-		fprintf(stderr, "No worker may execute this task\n");
															
 
																+		FPRINTF(stderr, "No worker may execute this task\n");
															
 
																 		exit(-1);
															
 
																 	}
															
@@ -261,7 +259,7 @@ static void initialize_system(float **A, unsigned dim, unsigned pinned)
 
																 	if (pinned)
															
 
																 	{
															
 
																-		starpu_data_malloc_pinned_if_possible((void **)A, dim*dim*sizeof(float));
															
 
																+		starpu_malloc((void **)A, dim*dim*sizeof(float));
															
 
																 	} 
															
 
																 	else {
															
 
																 		*A = malloc(dim*dim*sizeof(float));
															
@@ -280,11 +278,11 @@ void cholesky_grain(float *matA, unsigned size, unsigned ld, unsigned nblocks, u
 
																 	gettimeofday(&end, NULL);
															
 
																 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
															
 
																-	fprintf(stderr, "Computation took (in ms)\n");
															
 
																-	printf("%2.2f\n", timing/1000);
															
 
																+	FPRINTF(stderr, "Computation took (in ms)\n");
															
 
																+	FPRINTF(stdout, "%2.2f\n", timing/1000);
															
 
																 	double flop = (1.0f*size*size*size)/3.0f;
															
 
																-	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
															
 
																+	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
															
 
																 	starpu_helper_cublas_shutdown();
															
@@ -311,26 +309,26 @@ int main(int argc, char **argv)
 
																 		for (j = 0; j < size; j++)
															
 
																 		{
															
 
																 			mat[j +i*size] = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
															
 
																-			//mat[j +i*size] = ((i == j)?1.0f*size:0.0f);
															
 
																+			/* mat[j +i*size] = ((i == j)?1.0f*size:0.0f); */
															
 
																 		}
															
 
																 	}
															
 
																 #ifdef CHECK_OUTPUT
															
 
																-	printf("Input :\n");
															
 
																+	FPRINTF(stdout, "Input :\n");
															
 
																 	for (j = 0; j < size; j++)
															
 
																 	{
															
 
																 		for (i = 0; i < size; i++)
															
 
																 		{
															
 
																 			if (i <= j) {
															
 
																-				printf("%2.2f\t", mat[j +i*size]);
															
 
																+				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
															
 
																 			}
															
 
																 			else {
															
 
																-				printf(".\t");
															
 
																+				FPRINTF(stdout, ".\t");
															
 
																 			}
															
 
																 		}
															
 
																-		printf("\n");
															
 
																+		FPRINTF(stdout, "\n");
															
 
																 	}
															
 
																 #endif
															
@@ -338,43 +336,43 @@ int main(int argc, char **argv)
 
																 	cholesky_grain(mat, size, size, nblocks, nbigblocks);
															
 
																 #ifdef CHECK_OUTPUT
															
 
																-	printf("Results :\n");
															
 
																+	FPRINTF(stdout, "Results :\n");
															
 
																 	for (j = 0; j < size; j++)
															
 
																 	{
															
 
																 		for (i = 0; i < size; i++)
															
 
																 		{
															
 
																 			if (i <= j) {
															
 
																-				printf("%2.2f\t", mat[j +i*size]);
															
 
																+				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
															
 
																 			}
															
 
																 			else {
															
 
																-				printf(".\t");
															
 
																-				mat[j+i*size] = 0.0f; // debug
															
 
																+				FPRINTF(stdout, ".\t");
															
 
																+				mat[j+i*size] = 0.0f; /* debug */
															
 
																 			}
															
 
																 		}
															
 
																-		printf("\n");
															
 
																+		FPRINTF(stdout, "\n");
															
 
																 	}
															
 
																-	fprintf(stderr, "compute explicit LLt ...\n");
															
 
																+	FPRINTF(stderr, "compute explicit LLt ...\n");
															
 
																 	float *test_mat = malloc(size*size*sizeof(float));
															
 
																 	STARPU_ASSERT(test_mat);
															
 
																 	SSYRK("L", "N", size, size, 1.0f, 
															
 
																 				mat, size, 0.0f, test_mat, size);
															
 
																-	fprintf(stderr, "comparing results ...\n");
															
 
																+	FPRINTF(stderr, "comparing results ...\n");
															
 
																 	for (j = 0; j < size; j++)
															
 
																 	{
															
 
																 		for (i = 0; i < size; i++)
															
 
																 		{
															
 
																 			if (i <= j) {
															
 
																-				printf("%2.2f\t", test_mat[j +i*size]);
															
 
																+                                FPRINTF(stdout, "%2.2f\t", test_mat[j +i*size]);
															
 
																 			}
															
 
																 			else {
															
 
																-				printf(".\t");
															
 
																+				FPRINTF(stdout, ".\t");
															
 
																 			}
															
 
																 		}
															
 
																-		printf("\n");
															
 
																+		FPRINTF(stdout, "\n");
															
 
																 	}
															
 
																 #endif
															
--- a/examples/cholesky/cholesky_implicit.c
+++ b/examples/cholesky/cholesky_implicit.c
@@ -126,13 +126,13 @@ static void _cholesky(starpu_data_handle dataA, unsigned nblocks)
 
																 	gettimeofday(&end, NULL);
															
 
																 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
															
 
																-	fprintf(stderr, "Computation took (in ms)\n");
															
 
																-	printf("%2.2f\n", timing/1000);
															
 
																+	FPRINTF(stderr, "Computation took (in ms)\n");
															
 
																+	FPRINTF(stdout, "%2.2f\n", timing/1000);
															
 
																 	unsigned long n = starpu_matrix_get_nx(dataA);
															
 
																 	double flop = (1.0f*n*n*n)/3.0f;
															
 
																-	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
															
 
																+	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
															
 
																 }
															
 
																 static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
															
@@ -143,17 +143,15 @@ static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 
																 	 * one block is now determined by 2 unsigned (i,j) */
															
 
																 	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(float));
															
 
																-	struct starpu_data_filter f;
															
 
																-		f.filter_func = starpu_vertical_block_filter_func;
															
 
																-		f.nchildren = nblocks;
															
 
																-		f.get_nchildren = NULL;
															
 
																-		f.get_child_ops = NULL;
															
 
																+	struct starpu_data_filter f = {
															
 
																+		.filter_func = starpu_vertical_block_filter_func,
															
 
																+		.nchildren = nblocks
															
 
																+	};
															
 
																-	struct starpu_data_filter f2;
															
 
																-		f2.filter_func = starpu_block_filter_func;
															
 
																-		f2.nchildren = nblocks;
															
 
																-		f2.get_nchildren = NULL;
															
 
																-		f2.get_child_ops = NULL;
															
 
																+	struct starpu_data_filter f2 = {
															
 
																+		.filter_func = starpu_block_filter_func,
															
 
																+		.nchildren = nblocks
															
 
																+	};
															
 
																 	starpu_data_map_filters(dataA, 2, &f, &f2);
															
@@ -174,7 +172,7 @@ int main(int argc, char **argv)
 
																 	starpu_helper_cublas_init();
															
 
																 	float *mat;
															
 
																-	starpu_data_malloc_pinned_if_possible((void **)&mat, (size_t)size*size*sizeof(float));
															
 
																+	starpu_malloc((void **)&mat, (size_t)size*size*sizeof(float));
															
 
																 	unsigned i,j;
															
 
																 	for (i = 0; i < size; i++)
															
@@ -182,58 +180,58 @@ int main(int argc, char **argv)
 
																 		for (j = 0; j < size; j++)
															
 
																 		{
															
 
																 			mat[j +i*size] = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
															
 
																-			//mat[j +i*size] = ((i == j)?1.0f*size:0.0f);
															
 
																+			/* mat[j +i*size] = ((i == j)?1.0f*size:0.0f); */
															
 
																 		}
															
 
																 	}
															
 
																-//#define PRINT_OUTPUT
															
 
																+/* #define PRINT_OUTPUT */
															
 
																 #ifdef PRINT_OUTPUT
															
 
																-	printf("Input :\n");
															
 
																+	FPRINTF(stdout, "Input :\n");
															
 
																 	for (j = 0; j < size; j++)
															
 
																 	{
															
 
																 		for (i = 0; i < size; i++)
															
 
																 		{
															
 
																 			if (i <= j) {
															
 
																-				printf("%2.2f\t", mat[j +i*size]);
															
 
																+				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
															
 
																 			}
															
 
																 			else {
															
 
																-				printf(".\t");
															
 
																+				FPRINTF(stdout, ".\t");
															
 
																 			}
															
 
																 		}
															
 
																-		printf("\n");
															
 
																+		FPRINTF(stdout, "\n");
															
 
																 	}
															
 
																 #endif
															
 
																 	cholesky(mat, size, size, nblocks);
															
 
																 #ifdef PRINT_OUTPUT
															
 
																-	printf("Results :\n");
															
 
																+	FPRINTF(stdout, "Results :\n");
															
 
																 	for (j = 0; j < size; j++)
															
 
																 	{
															
 
																 		for (i = 0; i < size; i++)
															
 
																 		{
															
 
																 			if (i <= j) {
															
 
																-				printf("%2.2f\t", mat[j +i*size]);
															
 
																+				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
															
 
																 			}
															
 
																 			else {
															
 
																-				printf(".\t");
															
 
																-				mat[j+i*size] = 0.0f; // debug
															
 
																+				FPRINTF(stdout, ".\t");
															
 
																+				mat[j+i*size] = 0.0f; /* debug */
															
 
																 			}
															
 
																 		}
															
 
																-		printf("\n");
															
 
																+		FPRINTF(stdout, "\n");
															
 
																 	}
															
 
																 #endif
															
 
																 	if (check)
															
 
																 	{
															
 
																-		fprintf(stderr, "compute explicit LLt ...\n");
															
 
																+		FPRINTF(stderr, "compute explicit LLt ...\n");
															
 
																 		for (j = 0; j < size; j++)
															
 
																 		{
															
 
																 			for (i = 0; i < size; i++)
															
 
																 			{
															
 
																 				if (i > j) {
															
 
																-					mat[j+i*size] = 0.0f; // debug
															
 
																+					mat[j+i*size] = 0.0f; /* debug */
															
 
																 				}
															
 
																 			}
															
 
																 		}
															
@@ -243,20 +241,20 @@ int main(int argc, char **argv)
 
																 		SSYRK("L", "N", size, size, 1.0f,
															
 
																 					mat, size, 0.0f, test_mat, size);
															
 
																-		fprintf(stderr, "comparing results ...\n");
															
 
																+		FPRINTF(stderr, "comparing results ...\n");
															
 
																 #ifdef PRINT_OUTPUT
															
 
																 		for (j = 0; j < size; j++)
															
 
																 		{
															
 
																 			for (i = 0; i < size; i++)
															
 
																 			{
															
 
																 				if (i <= j) {
															
 
																-					printf("%2.2f\t", test_mat[j +i*size]);
															
 
																+					FPRINTF(stdout, "%2.2f\t", test_mat[j +i*size]);
															
 
																 				}
															
 
																 				else {
															
 
																-					printf(".\t");
															
 
																+					FPRINTF(stdout, ".\t");
															
 
																 				}
															
 
																 			}
															
 
																-			printf("\n");
															
 
																+			FPRINTF(stdout, "\n");
															
 
																 		}
															
 
																 #endif
															
@@ -268,7 +266,7 @@ int main(int argc, char **argv)
 
																 	                                float orig = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
															
 
																 	                                float err = abs(test_mat[j +i*size] - orig);
															
 
																 	                                if (err > 0.00001) {
															
 
																-	                                        fprintf(stderr, "Error[%d, %d] --> %2.2f != %2.2f (err %2.2f)\n", i, j, test_mat[j +i*size], orig, err);
															
 
																+	                                        FPRINTF(stderr, "Error[%u, %u] --> %2.2f != %2.2f (err %2.2f)\n", i, j, test_mat[j +i*size], orig, err);
															
 
																 	                                        assert(0);
															
 
																 	                                }
															
 
																 	                        }
															
--- a/examples/cholesky/cholesky_kernels.c
+++ b/examples/cholesky/cholesky_kernels.c
@@ -20,6 +20,10 @@
 
																 #include "../common/blas.h"
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 #include <starpu_cuda.h>
															
 
																+#ifdef STARPU_HAVE_MAGMA
															
 
																+#include "magma.h"
															
 
																+#include "magma_lapack.h"
															
 
																+#endif
															
 
																 #endif
															
 
																 /*
															
@@ -28,7 +32,7 @@
 
																 static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, __attribute__((unused)) void *_args)
															
 
																 {
															
 
																-	//printf("22\n");
															
 
																+	/* printf("22\n"); */
															
 
																 	float *left 	= (float *)STARPU_MATRIX_GET_PTR(descr[0]);
															
 
																 	float *right 	= (float *)STARPU_MATRIX_GET_PTR(descr[1]);
															
 
																 	float *center 	= (float *)STARPU_MATRIX_GET_PTR(descr[2]);
															
@@ -88,7 +92,7 @@ void chol_cublas_codelet_update_u22(void *descr[], void *_args)
 
																 {
															
 
																 	chol_common_cpu_codelet_update_u22(descr, 1, _args);
															
 
																 }
															
 
																-#endif// STARPU_USE_CUDA
															
 
																+#endif /* STARPU_USE_CUDA */
															
 
																 /* 
															
 
																  * U21
															
@@ -96,7 +100,7 @@ void chol_cublas_codelet_update_u22(void *descr[], void *_args)
 
																 static inline void chol_common_codelet_update_u21(void *descr[], int s, __attribute__((unused)) void *_args)
															
 
																 {
															
 
																-//	printf("21\n");
															
 
																+/*	printf("21\n"); */
															
 
																 	float *sub11;
															
 
																 	float *sub21;
															
@@ -143,7 +147,7 @@ void chol_cublas_codelet_update_u21(void *descr[], void *_args)
 
																 static inline void chol_common_codelet_update_u11(void *descr[], int s, __attribute__((unused)) void *_args) 
															
 
																 {
															
 
																-//	printf("11\n");
															
 
																+/*	printf("11\n"); */
															
 
																 	float *sub11;
															
 
																 	sub11 = (float *)STARPU_MATRIX_GET_PTR(descr[0]); 
															
@@ -179,13 +183,27 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, __attrib
 
																 			break;
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 		case 1:
															
 
																+#ifdef STARPU_HAVE_MAGMA
															
 
																 			{
															
 
																+			int ret;
															
 
																+			int info;
															
 
																+			ret = magma_spotrf_gpu('L', nx, sub11, ld, &info);
															
 
																+			if (ret != MAGMA_SUCCESS) {
															
 
																+				fprintf(stderr, "Error in Magma: %d\n", ret);
															
 
																+				STARPU_ABORT();
															
 
																+			}
															
 
																+			cudaError_t cures = cudaThreadSynchronize();
															
 
																+			STARPU_ASSERT(!cures);
															
 
																+			}
															
 
																+#else
															
 
																+			{
															
 
																+
															
 
																 			float *lambda11;
															
 
																 			cudaHostAlloc((void **)&lambda11, sizeof(float), 0);
															
 
																 			for (z = 0; z < nx; z++)
															
 
																 			{
															
 
																-
															
 
																+				
															
 
																 				cudaMemcpyAsync(lambda11, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
															
 
																 				cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
@@ -193,7 +211,7 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, __attrib
 
																 				*lambda11 = sqrt(*lambda11);
															
 
																-//				cublasSetVector(1, sizeof(float), lambda11, sizeof(float), &sub11[z+z*ld], sizeof(float));
															
 
																+/*				cublasSetVector(1, sizeof(float), lambda11, sizeof(float), &sub11[z+z*ld], sizeof(float)); */
															
 
																 				cudaMemcpyAsync(&sub11[z+z*ld], lambda11, sizeof(float), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
															
 
																 				cublasSscal(nx - z - 1, 1.0f/(*lambda11), &sub11[(z+1)+z*ld], 1);
															
@@ -206,8 +224,7 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, __attrib
 
																 			cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																 			cudaFreeHost(lambda11);
															
 
																 			}
															
 
																-		
															
 
																-
															
 
																+#endif
															
 
																 			break;
															
 
																 #endif
															
 
																 		default:
															
@@ -227,4 +244,4 @@ void chol_cublas_codelet_update_u11(void *descr[], void *_args)
 
																 {
															
 
																 	chol_common_codelet_update_u11(descr, 1, _args);
															
 
																 }
															
 
																-#endif// STARPU_USE_CUDA
															
 
																+#endif/* STARPU_USE_CUDA */
															
--- a/examples/cholesky/cholesky_models.c
+++ b/examples/cholesky/cholesky_models.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2009, 2010  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -26,7 +26,7 @@
 
																 #include <starpu.h>
															
 
																-//#define USE_PERTURBATION	1
															
 
																+/* #define USE_PERTURBATION	1 */
															
 
																 #ifdef USE_PERTURBATION
															
 
																 #define PERTURBATE(a)	((starpu_drand48()*2.0f*(AMPL) + 1.0f - (AMPL))*(a))
															
@@ -43,7 +43,7 @@ static double cpu_chol_task_11_cost(starpu_buffer_descr *descr)
 
																 	double cost = (((double)(n)*n*n)/1000.0f*0.894/0.79176);
															
 
																 #ifdef STARPU_MODEL_DEBUG
															
 
																-	printf("cpu_chol_task_11_cost n %d cost %e\n", n, cost);
															
 
																+	FPRINTF(stdout, "cpu_chol_task_11_cost n %d cost %e\n", n, cost);
															
 
																 #endif
															
 
																 	return PERTURBATE(cost);
															
@@ -58,7 +58,7 @@ static double cuda_chol_task_11_cost(starpu_buffer_descr *descr)
 
																 	double cost = (((double)(n)*n*n)/50.0f/10.75/5.088633/0.9883);
															
 
																 #ifdef STARPU_MODEL_DEBUG
															
 
																-	printf("cuda_chol_task_11_cost n %d cost %e\n", n, cost);
															
 
																+	FPRINTF(stdout, "cuda_chol_task_11_cost n %d cost %e\n", n, cost);
															
 
																 #endif
															
 
																 	return PERTURBATE(cost);
															
@@ -73,7 +73,7 @@ static double cpu_chol_task_21_cost(starpu_buffer_descr *descr)
 
																 	double cost = (((double)(n)*n*n)/7706.674/0.95/0.9965);
															
 
																 #ifdef STARPU_MODEL_DEBUG
															
 
																-	printf("cpu_chol_task_21_cost n %d cost %e\n", n, cost);
															
 
																+	FPRINTF(stdout, "cpu_chol_task_21_cost n %d cost %e\n", n, cost);
															
 
																 #endif
															
 
																 	return PERTURBATE(cost);
															
@@ -88,7 +88,7 @@ static double cuda_chol_task_21_cost(starpu_buffer_descr *descr)
 
																 	double cost = (((double)(n)*n*n)/50.0f/10.75/87.29520);
															
 
																 #ifdef STARPU_MODEL_DEBUG
															
 
																-	printf("cuda_chol_task_21_cost n %d cost %e\n", n, cost);
															
 
																+	FPRINTF(stdout, "cuda_chol_task_21_cost n %d cost %e\n", n, cost);
															
 
																 #endif
															
 
																 	return PERTURBATE(cost);
															
@@ -103,7 +103,7 @@ static double cpu_chol_task_22_cost(starpu_buffer_descr *descr)
 
																 	double cost = (((double)(n)*n*n)/50.0f/10.75/8.0760);
															
 
																 #ifdef STARPU_MODEL_DEBUG
															
 
																-	printf("cpu_chol_task_22_cost n %d cost %e\n", n, cost);
															
 
																+	FPRINTF(stdout, "cpu_chol_task_22_cost n %d cost %e\n", n, cost);
															
 
																 #endif
															
 
																 	return PERTURBATE(cost);
															
@@ -118,7 +118,7 @@ static double cuda_chol_task_22_cost(starpu_buffer_descr *descr)
 
																 	double cost = (((double)(n)*n*n)/50.0f/10.75/76.30666);
															
 
																 #ifdef STARPU_MODEL_DEBUG
															
 
																-	printf("cuda_chol_task_22_cost n %d cost %e\n", n, cost);
															
 
																+	FPRINTF(stdout, "cuda_chol_task_22_cost n %d cost %e\n", n, cost);
															
 
																 #endif
															
 
																 	return PERTURBATE(cost);
															
--- a/examples/cholesky/cholesky_tag.c
+++ b/examples/cholesky/cholesky_tag.c
@@ -2,7 +2,7 @@
 
																  *
															
 
																  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
															
 
																  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
															
 
																- * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -49,7 +49,7 @@ static starpu_codelet cl11 =
 
																 static struct starpu_task * create_task_11(starpu_data_handle dataA, unsigned k)
															
 
																 {
															
 
																-//	printf("task 11 k = %d TAG = %llx\n", k, (TAG11(k)));
															
 
																+/*	FPRINTF(stdout, "task 11 k = %d TAG = %llx\n", k, (TAG11(k))); */
															
 
																 	struct starpu_task *task = create_task(TAG11(k));
															
@@ -108,7 +108,7 @@ static void create_task_21(starpu_data_handle dataA, unsigned k, unsigned j)
 
																 	int ret = starpu_task_submit(task);
															
 
																         if (STARPU_UNLIKELY(ret == -ENODEV)) {
															
 
																-                fprintf(stderr, "No worker may execute this task\n");
															
 
																+                FPRINTF(stderr, "No worker may execute this task\n");
															
 
																                 exit(0);
															
 
																         }
															
@@ -127,7 +127,7 @@ static starpu_codelet cl22 =
 
																 static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, unsigned j)
															
 
																 {
															
 
																-//	printf("task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j));
															
 
																+/*	FPRINTF(stdout, "task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j)); */
															
 
																 	struct starpu_task *task = create_task(TAG22(k, i, j));
															
@@ -155,7 +155,7 @@ static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, uns
 
																 	int ret = starpu_task_submit(task);
															
 
																         if (STARPU_UNLIKELY(ret == -ENODEV)) {
															
 
																-                fprintf(stderr, "No worker may execute this task\n");
															
 
																+                FPRINTF(stderr, "No worker may execute this task\n");
															
 
																                 exit(0);
															
 
																         }
															
 
																 }
															
@@ -189,7 +189,7 @@ static void _cholesky(starpu_data_handle dataA, unsigned nblocks)
 
																 		else {
															
 
																 			int ret = starpu_task_submit(task);
															
 
																                         if (STARPU_UNLIKELY(ret == -ENODEV)) {
															
 
																-                                fprintf(stderr, "No worker may execute this task\n");
															
 
																+                                FPRINTF(stderr, "No worker may execute this task\n");
															
 
																                                 exit(0);
															
 
																                         }
															
@@ -210,7 +210,7 @@ static void _cholesky(starpu_data_handle dataA, unsigned nblocks)
 
																 	/* schedule the codelet */
															
 
																 	int ret = starpu_task_submit(entry_task);
															
 
																         if (STARPU_UNLIKELY(ret == -ENODEV)) {
															
 
																-                fprintf(stderr, "No worker may execute this task\n");
															
 
																+                FPRINTF(stderr, "No worker may execute this task\n");
															
 
																                 exit(0);
															
 
																         }
															
@@ -224,13 +224,13 @@ static void _cholesky(starpu_data_handle dataA, unsigned nblocks)
 
																 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
															
 
																-	fprintf(stderr, "Computation took (in ms)\n");
															
 
																-	printf("%2.2f\n", timing/1000);
															
 
																+	FPRINTF(stderr, "Computation took (in ms)\n");
															
 
																+	FPRINTF(stdout, "%2.2f\n", timing/1000);
															
 
																 	unsigned n = starpu_matrix_get_nx(dataA);
															
 
																 	double flop = (1.0f*n*n*n)/3.0f;
															
 
																-	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
															
 
																+	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
															
 
																 }
															
 
																 static void initialize_system(float **A, unsigned dim, unsigned pinned)
															
@@ -241,7 +241,7 @@ static void initialize_system(float **A, unsigned dim, unsigned pinned)
 
																 	if (pinned)
															
 
																 	{
															
 
																-		starpu_data_malloc_pinned_if_possible((void **)A, (size_t)dim*dim*sizeof(float));
															
 
																+		starpu_malloc((void **)A, (size_t)dim*dim*sizeof(float));
															
 
																 	} 
															
 
																 	else {
															
 
																 		*A = malloc(dim*dim*sizeof(float));
															
@@ -258,17 +258,15 @@ static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 
																 	starpu_data_set_sequential_consistency_flag(dataA, 0);
															
 
																-	struct starpu_data_filter f;
															
 
																-		f.filter_func = starpu_vertical_block_filter_func;
															
 
																-		f.nchildren = nblocks;
															
 
																-		f.get_nchildren = NULL;
															
 
																-		f.get_child_ops = NULL;
															
 
																+	struct starpu_data_filter f = {
															
 
																+		.filter_func = starpu_vertical_block_filter_func,
															
 
																+		.nchildren = nblocks
															
 
																+	};
															
 
																-	struct starpu_data_filter f2;
															
 
																-		f2.filter_func = starpu_block_filter_func;
															
 
																-		f2.nchildren = nblocks;
															
 
																-		f2.get_nchildren = NULL;
															
 
																-		f2.get_child_ops = NULL;
															
 
																+	struct starpu_data_filter f2 = {
															
 
																+		.filter_func = starpu_block_filter_func,
															
 
																+		.nchildren = nblocks
															
 
																+	};
															
 
																 	starpu_data_map_filters(dataA, 2, &f, &f2);
															
@@ -299,26 +297,26 @@ int main(int argc, char **argv)
 
																 		for (j = 0; j < size; j++)
															
 
																 		{
															
 
																 			mat[j +i*size] = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
															
 
																-			//mat[j +i*size] = ((i == j)?1.0f*size:0.0f);
															
 
																+			/* mat[j +i*size] = ((i == j)?1.0f*size:0.0f); */
															
 
																 		}
															
 
																 	}
															
 
																 #ifdef CHECK_OUTPUT
															
 
																-	printf("Input :\n");
															
 
																+	FPRINTF(stdout, "Input :\n");
															
 
																 	for (j = 0; j < size; j++)
															
 
																 	{
															
 
																 		for (i = 0; i < size; i++)
															
 
																 		{
															
 
																 			if (i <= j) {
															
 
																-				printf("%2.2f\t", mat[j +i*size]);
															
 
																+				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
															
 
																 			}
															
 
																 			else {
															
 
																-				printf(".\t");
															
 
																+				FPRINTF(stdout, ".\t");
															
 
																 			}
															
 
																 		}
															
 
																-		printf("\n");
															
 
																+		FPRINTF(stdout, "\n");
															
 
																 	}
															
 
																 #endif
															
@@ -326,43 +324,43 @@ int main(int argc, char **argv)
 
																 	cholesky(mat, size, size, nblocks);
															
 
																 #ifdef CHECK_OUTPUT
															
 
																-	printf("Results :\n");
															
 
																+	FPRINTF(stdout, "Results :\n");
															
 
																 	for (j = 0; j < size; j++)
															
 
																 	{
															
 
																 		for (i = 0; i < size; i++)
															
 
																 		{
															
 
																 			if (i <= j) {
															
 
																-				printf("%2.2f\t", mat[j +i*size]);
															
 
																+				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
															
 
																 			}
															
 
																 			else {
															
 
																-				printf(".\t");
															
 
																-				mat[j+i*size] = 0.0f; // debug
															
 
																+				FPRINTF(stdout, ".\t");
															
 
																+				mat[j+i*size] = 0.0f; /* debug */
															
 
																 			}
															
 
																 		}
															
 
																-		printf("\n");
															
 
																+		FPRINTF(stdout, "\n");
															
 
																 	}
															
 
																-	fprintf(stderr, "compute explicit LLt ...\n");
															
 
																+	FPRINTF(stderr, "compute explicit LLt ...\n");
															
 
																 	float *test_mat = malloc(size*size*sizeof(float));
															
 
																 	STARPU_ASSERT(test_mat);
															
 
																 	SSYRK("L", "N", size, size, 1.0f, 
															
 
																 				mat, size, 0.0f, test_mat, size);
															
 
																-	fprintf(stderr, "comparing results ...\n");
															
 
																+	FPRINTF(stderr, "comparing results ...\n");
															
 
																 	for (j = 0; j < size; j++)
															
 
																 	{
															
 
																 		for (i = 0; i < size; i++)
															
 
																 		{
															
 
																 			if (i <= j) {
															
 
																-				printf("%2.2f\t", test_mat[j +i*size]);
															
 
																+				FPRINTF(stdout, "%2.2f\t", test_mat[j +i*size]);
															
 
																 			}
															
 
																 			else {
															
 
																-				printf(".\t");
															
 
																+				FPRINTF(stdout, ".\t");
															
 
																 			}
															
 
																 		}
															
 
																-		printf("\n");
															
 
																+		FPRINTF(stdout, "\n");
															
 
																 	}
															
 
																 #endif
															
--- a/examples/cholesky/cholesky_tile_tag.c
+++ b/examples/cholesky/cholesky_tile_tag.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -59,7 +59,7 @@ static starpu_codelet cl11 =
 
																 static struct starpu_task * create_task_11(unsigned k, unsigned nblocks)
															
 
																 {
															
 
																-//	printf("task 11 k = %d TAG = %llx\n", k, (TAG11(k)));
															
 
																+/*	FPRINTF(stdout, "task 11 k = %d TAG = %llx\n", k, (TAG11(k))); */
															
 
																 	struct starpu_task *task = create_task(TAG11(k));
															
@@ -145,7 +145,7 @@ static starpu_codelet cl22 =
 
																 static void create_task_22(unsigned k, unsigned i, unsigned j)
															
 
																 {
															
 
																-//	printf("task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j));
															
 
																+/*	FPRINTF(stdout, "task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j)); */
															
 
																 	struct starpu_task *task = create_task(TAG22(k, i, j));
															
@@ -224,11 +224,11 @@ static void cholesky_no_stride(void)
 
																 	gettimeofday(&end, NULL);
															
 
																 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
															
 
																-	fprintf(stderr, "Computation took (in ms)\n");
															
 
																-	printf("%2.2f\n", timing/1000);
															
 
																+	FPRINTF(stderr, "Computation took (in ms)\n");
															
 
																+	FPRINTF(stdout, "%2.2f\n", timing/1000);
															
 
																 	double flop = (1.0f*size*size*size)/3.0f;
															
 
																-	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
															
 
																+	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
															
 
																 }
															
 
																 int main(int argc, char **argv)
															
@@ -239,7 +239,7 @@ int main(int argc, char **argv)
 
																 	parse_args(argc, argv);
															
 
																 	assert(nblocks <= NMAXBLOCKS);
															
 
																-	fprintf(stderr, "BLOCK SIZE = %d\n", size / nblocks);
															
 
																+	FPRINTF(stderr, "BLOCK SIZE = %d\n", size / nblocks);
															
 
																 	starpu_init(NULL);
															
--- a/examples/common/blas.h
+++ b/examples/common/blas.h
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2009, 2010  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
															
 
																  * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
@@ -154,4 +154,4 @@ extern void dswap_(const int *n, double *x, const int *incx, double *y, const in
 
																 #endif
															
 
																-#endif // __BLAS_H__
															
 
																+#endif /* __BLAS_H__ */
															
--- a/examples/common/blas_model.c
+++ b/examples/common/blas_model.c
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2009, 2010  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
															
 
																  * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
@@ -37,11 +37,11 @@ double gemm_cost(starpu_buffer_descr *descr)
 
																 	nyC = starpu_matrix_get_ny(descr[2].handle);
															
 
																 	nxA = starpu_matrix_get_nx(descr[0].handle);
															
 
																-//	printf("nxC %d nxC %d nxA %d\n", nxC, nyC, nxA);
															
 
																+/*	printf("nxC %d nxC %d nxA %d\n", nxC, nyC, nxA); */
															
 
																 	double cost = ((double)nxC)*((double)nyC)*((double)nxA/1000.0f/4.11f);
															
 
																-//	printf("cost %e \n", cost);
															
 
																+/*	printf("cost %e \n", cost); */
															
 
																 	return cost;
															
 
																 }
															
--- a/examples/common/blas_model.h
+++ b/examples/common/blas_model.h
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2009, 2010  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
															
 
																  * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
@@ -54,4 +54,4 @@ static struct starpu_perfmodel_t starpu_dgemm_model_common = {
 
																 	.type = STARPU_COMMON,
															
 
																 };
															
 
																-#endif // __BLAS_MODEL_H__
															
 
																+#endif /* __BLAS_MODEL_H__ */
															
--- a/examples/filters/fblock.c
+++ b/examples/filters/fblock.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2010  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010-2011  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -23,6 +23,8 @@
 
																 #define NZ    3
															
 
																 #define PARTS 2
															
 
																+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
															
 
																+
															
 
																 extern void cpu_func(void *buffers[], void *cl_arg);
															
 
																 #ifdef STARPU_USE_CUDA
															
@@ -36,17 +38,17 @@ extern void opencl_func(void *buffers[], void *cl_arg);
 
																 void print_block(int *block, int nx, int ny, int nz, unsigned ldy, unsigned ldz)
															
 
																 {
															
 
																         int i, j, k;
															
 
																-        fprintf(stderr, "block=%p nx=%d ny=%d nz=%d ldy=%d ldz=%d\n", block, nx, ny, nz, ldy, ldz);
															
 
																+        FPRINTF(stderr, "block=%p nx=%d ny=%d nz=%d ldy=%u ldz=%u\n", block, nx, ny, nz, ldy, ldz);
															
 
																         for(k=0 ; k<nz ; k++) {
															
 
																                 for(j=0 ; j<ny ; j++) {
															
 
																                         for(i=0 ; i<nx ; i++) {
															
 
																-                                fprintf(stderr, "%2d ", block[(k*ldz)+(j*ldy)+i]);
															
 
																+                                FPRINTF(stderr, "%2d ", block[(k*ldz)+(j*ldy)+i]);
															
 
																                         }
															
 
																-                        fprintf(stderr,"\n");
															
 
																+                        FPRINTF(stderr,"\n");
															
 
																                 }
															
 
																-                fprintf(stderr,"\n");
															
 
																+                FPRINTF(stderr,"\n");
															
 
																         }
															
 
																-        fprintf(stderr,"\n");
															
 
																+        FPRINTF(stderr,"\n");
															
 
																 }
															
 
																 void print_data(starpu_data_handle block_handle)
															
@@ -96,30 +98,28 @@ int main(int argc, char **argv)
 
																         starpu_init(NULL);
															
 
																 #ifdef STARPU_USE_OPENCL
															
 
																-        starpu_opencl_load_opencl_from_file("examples/filters/fblock_opencl_kernel.cl", &opencl_program);
															
 
																+        starpu_opencl_load_opencl_from_file("examples/filters/fblock_opencl_kernel.cl", &opencl_program, NULL);
															
 
																 #endif
															
 
																         /* Declare data to StarPU */
															
 
																         starpu_block_data_register(&handle, 0, (uintptr_t)block, NX, NX*NY, NX, NY, NZ, sizeof(int));
															
 
																-        fprintf(stderr, "IN  Block\n");
															
 
																+        FPRINTF(stderr, "IN  Block\n");
															
 
																         print_data(handle);
															
 
																         /* Partition the block in PARTS sub-blocks */
															
 
																 	struct starpu_data_filter f =
															
 
																 	{
															
 
																 		.filter_func = starpu_block_filter_func_block,
															
 
																-		.nchildren = PARTS,
															
 
																-		.get_nchildren = NULL,
															
 
																-		.get_child_ops = NULL
															
 
																+		.nchildren = PARTS
															
 
																 	};
															
 
																         starpu_data_partition(handle, &f);
															
 
																-        fprintf(stderr,"Nb of partitions : %d\n",starpu_data_get_nb_children(handle));
															
 
																+        FPRINTF(stderr,"Nb of partitions : %d\n",starpu_data_get_nb_children(handle));
															
 
																         for(i=0 ; i<starpu_data_get_nb_children(handle) ; i++)
															
 
																         {
															
 
																                 starpu_data_handle sblock = starpu_data_get_sub_data(handle, 1, i);
															
 
																-                fprintf(stderr, "Sub block %d\n", i);
															
 
																+                FPRINTF(stderr, "Sub block %d\n", i);
															
 
																                 print_data(sblock);
															
 
																         }
															
@@ -129,7 +129,7 @@ int main(int argc, char **argv)
 
																                 int ret,multiplier=i;
															
 
																                 struct starpu_task *task = starpu_task_create();
															
 
																-                fprintf(stderr,"Dealing with sub-block %d\n", i);
															
 
																+                FPRINTF(stderr,"Dealing with sub-block %d\n", i);
															
 
																                 task->cl = &cl;
															
 
																                 task->synchronous = 1;
															
 
																                 task->callback_func = NULL;
															
@@ -139,9 +139,10 @@ int main(int argc, char **argv)
 
																                 ret = starpu_task_submit(task);
															
 
																                 if (ret) {
															
 
																-                        fprintf(stderr, "Error when submitting task\n");
															
 
																+                        FPRINTF(stderr, "Error when submitting task\n");
															
 
																                         exit(ret);
															
 
																                 }
															
 
																+		starpu_task_destroy(task);
															
 
																         }
															
 
																         /* Unpartition the data, unregister it from StarPU and shutdown */
															
@@ -150,10 +151,11 @@ int main(int argc, char **argv)
 
																         starpu_data_unregister(handle);
															
 
																         /* Print result block */
															
 
																-        fprintf(stderr, "OUT Block\n");
															
 
																+        FPRINTF(stderr, "OUT Block\n");
															
 
																         print_block(block, NX, NY, NZ, NX, NX*NY);
															
 
																-	starpu_shutdown();
															
 
																+	free(block);
															
 
																+	starpu_shutdown();
															
 
																 	return 0;
															
 
																 }
															
--- a/examples/filters/fblock_opencl.c
+++ b/examples/filters/fblock_opencl.c
@@ -28,7 +28,7 @@ void opencl_func(void *buffers[], void *cl_arg)
 
																 	cl_event event;
															
 
																         int *factor = cl_arg;
															
 
																-	int *block = (int *)STARPU_BLOCK_GET_PTR(buffers[0]);
															
 
																+	cl_mem block = (cl_mem)STARPU_BLOCK_GET_PTR(buffers[0]);
															
 
																 	int nx = (int)STARPU_BLOCK_GET_NX(buffers[0]);
															
 
																 	int ny = (int)STARPU_BLOCK_GET_NY(buffers[0]);
															
 
																 	int nz = (int)STARPU_BLOCK_GET_NZ(buffers[0]);
															
@@ -42,7 +42,7 @@ void opencl_func(void *buffers[], void *cl_arg)
 
																 	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																 	err = 0;
															
 
																-	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &block);
															
 
																+	err = clSetKernelArg(kernel, 0, sizeof(block), &block);
															
 
																 	err = clSetKernelArg(kernel, 1, sizeof(nx), &nx);
															
 
																 	err = clSetKernelArg(kernel, 2, sizeof(ny), &ny);
															
 
																 	err = clSetKernelArg(kernel, 3, sizeof(nz), &nz);
															
--- a/examples/filters/fmatrix.c
+++ b/examples/filters/fmatrix.c
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -20,6 +20,8 @@
 
																 #define NY    4
															
 
																 #define PARTS 2
															
 
																+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
															
 
																+
															
 
																 void cpu_func(void *buffers[], void *cl_arg)
															
 
																 {
															
 
																         unsigned i, j;
															
@@ -43,15 +45,15 @@ int main(int argc, char **argv)
 
																 	unsigned i, j, n=1;
															
 
																         int matrix[NX*NY];
															
 
																-        fprintf(stderr,"IN  Matrix: \n");
															
 
																+        FPRINTF(stderr,"IN  Matrix: \n");
															
 
																         for(j=0 ; j<NY ; j++) {
															
 
																                 for(i=0 ; i<NX ; i++) {
															
 
																                         matrix[(j*NX)+i] = n++;
															
 
																-                        fprintf(stderr, "%2d ", matrix[(j*NX)+i]);
															
 
																+                        FPRINTF(stderr, "%2d ", matrix[(j*NX)+i]);
															
 
																                 }
															
 
																-                fprintf(stderr,"\n");
															
 
																+                FPRINTF(stderr,"\n");
															
 
																         }
															
 
																-        fprintf(stderr,"\n");
															
 
																+        FPRINTF(stderr,"\n");
															
 
																         starpu_data_handle handle;
															
 
																         starpu_codelet cl = {
															
@@ -68,9 +70,7 @@ int main(int argc, char **argv)
 
																 	struct starpu_data_filter f =
															
 
																 	{
															
 
																 		.filter_func = starpu_block_filter_func,
															
 
																-		.nchildren = PARTS,
															
 
																-		.get_nchildren = NULL,
															
 
																-		.get_child_ops = NULL
															
 
																+		.nchildren = PARTS
															
 
																 	};
															
 
																 	starpu_data_partition(handle, &f);
															
@@ -86,6 +86,7 @@ int main(int argc, char **argv)
 
																                 task->cl_arg = &factor;
															
 
																                 task->cl_arg_size = sizeof(factor);
															
 
																 		starpu_task_submit(task);
															
 
																+		starpu_task_destroy(task);
															
 
																 	}
															
 
																         /* Unpartition the data, unregister it from StarPU and shutdown */
															
@@ -94,14 +95,14 @@ int main(int argc, char **argv)
 
																 	starpu_shutdown();
															
 
																         /* Print result matrix */
															
 
																-        fprintf(stderr,"OUT Matrix: \n");
															
 
																+        FPRINTF(stderr,"OUT Matrix: \n");
															
 
																         for(j=0 ; j<NY ; j++) {
															
 
																                 for(i=0 ; i<NX ; i++) {
															
 
																-                        fprintf(stderr, "%2d ", matrix[(j*NX)+i]);
															
 
																+                        FPRINTF(stderr, "%2d ", matrix[(j*NX)+i]);
															
 
																                 }
															
 
																-                fprintf(stderr,"\n");
															
 
																+                FPRINTF(stderr,"\n");
															
 
																         }
															
 
																-        fprintf(stderr,"\n");
															
 
																+        FPRINTF(stderr,"\n");
															
 
																 	return 0;
															
 
																 }
															
--- a/examples/filters/fvector.c
+++ b/examples/filters/fvector.c
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -19,6 +19,8 @@
 
																 #define NX    21
															
 
																 #define PARTS 3
															
 
																+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
															
 
																+
															
 
																 void cpu_func(void *buffers[], void *cl_arg)
															
 
																 {
															
 
																         unsigned i;
															
@@ -47,9 +49,9 @@ int main(int argc, char **argv)
 
																         };
															
 
																         for(i=0 ; i<NX ; i++) vector[i] = i;
															
 
																-        fprintf(stderr,"IN  Vector: ");
															
 
																-        for(i=0 ; i<NX ; i++) fprintf(stderr, "%5d ", vector[i]);
															
 
																-        fprintf(stderr,"\n");
															
 
																+        FPRINTF(stderr,"IN  Vector: ");
															
 
																+        for(i=0 ; i<NX ; i++) FPRINTF(stderr, "%5d ", vector[i]);
															
 
																+        FPRINTF(stderr,"\n");
															
 
																 	starpu_init(NULL);
															
@@ -60,9 +62,7 @@ int main(int argc, char **argv)
 
																 	struct starpu_data_filter f =
															
 
																 	{
															
 
																 		.filter_func = starpu_block_filter_func_vector,
															
 
																-		.nchildren = PARTS,
															
 
																-		.get_nchildren = NULL,
															
 
																-		.get_child_ops = NULL
															
 
																+		.nchildren = PARTS
															
 
																 	};
															
 
																 	starpu_data_partition(handle, &f);
															
@@ -81,15 +81,16 @@ int main(int argc, char **argv)
 
																                 task->cl_arg_size = sizeof(factor);
															
 
																 		starpu_task_submit(task);
															
 
																+		starpu_task_destroy(task);
															
 
																 	}
															
 
																 	starpu_data_unpartition(handle, 0);
															
 
																         starpu_data_unregister(handle);
															
 
																 	starpu_shutdown();
															
 
																-        fprintf(stderr,"OUT Vector: ");
															
 
																-        for(i=0 ; i<NX ; i++) fprintf(stderr, "%5d ", vector[i]);
															
 
																-        fprintf(stderr,"\n");
															
 
																+        FPRINTF(stderr,"OUT Vector: ");
															
 
																+        for(i=0 ; i<NX ; i++) FPRINTF(stderr, "%5d ", vector[i]);
															
 
																+        FPRINTF(stderr,"\n");
															
 
																 	return 0;
															
 
																 }
															
--- a/examples/heat/dw_factolu.c
+++ b/examples/heat/dw_factolu.c
@@ -1,8 +1,8 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2009, 2010  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2009-2011  Université de Bordeaux 1
															
 
																  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
															
 
																- * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -624,12 +624,12 @@ void dw_codelet_facto(starpu_data_handle dataA, unsigned nblocks)
 
																 	gettimeofday(&end, NULL);
															
 
																 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
															
 
																-	fprintf(stderr, "Computation took (in ms)\n");
															
 
																-	printf("%2.2f\n", timing/1000);
															
 
																+	FPRINTF(stderr, "Computation took (in ms)\n");
															
 
																+	FPRINTF(stdout, "%2.2f\n", timing/1000);
															
 
																 	unsigned n = starpu_matrix_get_nx(dataA);
															
 
																 	double flop = (2.0f*n*n*n)/3.0f;
															
 
																-	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
															
 
																+	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
															
 
																 }
															
 
																 void dw_codelet_facto_v2(starpu_data_handle dataA, unsigned nblocks)
															
@@ -666,7 +666,7 @@ void dw_codelet_facto_v2(starpu_data_handle dataA, unsigned nblocks)
 
																 	int ret = starpu_task_submit(task);
															
 
																 	if (STARPU_UNLIKELY(ret == -ENODEV))
															
 
																 	{
															
 
																-		fprintf(stderr, "No worker may execute this task\n");
															
 
																+		FPRINTF(stderr, "No worker may execute this task\n");
															
 
																 		exit(0);
															
 
																 	}
															
@@ -681,12 +681,12 @@ void dw_codelet_facto_v2(starpu_data_handle dataA, unsigned nblocks)
 
																 	gettimeofday(&end, NULL);
															
 
																 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
															
 
																-	fprintf(stderr, "Computation took (in ms)\n");
															
 
																-	printf("%2.2f\n", timing/1000);
															
 
																+	FPRINTF(stderr, "Computation took (in ms)\n");
															
 
																+	FPRINTF(stdout, "%2.2f\n", timing/1000);
															
 
																 	unsigned n = starpu_matrix_get_nx(dataA);
															
 
																 	double flop = (2.0f*n*n*n)/3.0f;
															
 
																-	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
															
 
																+	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
															
 
																 }
															
 
																 void initialize_system(float **A, float **B, unsigned dim, unsigned pinned)
															
@@ -697,8 +697,8 @@ void initialize_system(float **A, float **B, unsigned dim, unsigned pinned)
 
																 	if (pinned)
															
 
																 	{
															
 
																-		starpu_data_malloc_pinned_if_possible((void **)A, (size_t)dim*dim*sizeof(float));
															
 
																-		starpu_data_malloc_pinned_if_possible((void **)B, (size_t)dim*sizeof(float));
															
 
																+		starpu_malloc((void **)A, (size_t)dim*dim*sizeof(float));
															
 
																+		starpu_malloc((void **)B, (size_t)dim*sizeof(float));
															
 
																 	} 
															
 
																 	else {
															
 
																 		*A = malloc((size_t)dim*dim*sizeof(float));
															
@@ -714,7 +714,7 @@ void dw_factoLU(float *matA, unsigned size,
 
																 {
															
 
																 #ifdef CHECK_RESULTS
															
 
																-	fprintf(stderr, "Checking results ...\n");
															
 
																+	FPRINTF(stderr, "Checking results ...\n");
															
 
																 	float *Asaved;
															
 
																 	Asaved = malloc((size_t)ld*ld*sizeof(float));
															
@@ -730,17 +730,15 @@ void dw_factoLU(float *matA, unsigned size,
 
																 	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, 
															
 
																 			size, size, sizeof(float));
															
 
																-	struct starpu_data_filter f;
															
 
																-		f.filter_func = starpu_vertical_block_filter_func;
															
 
																-		f.nchildren = nblocks;
															
 
																-		f.get_nchildren = NULL;
															
 
																-		f.get_child_ops = NULL;
															
 
																-
															
 
																-	struct starpu_data_filter f2;
															
 
																-		f2.filter_func = starpu_block_filter_func;
															
 
																-		f2.nchildren = nblocks;
															
 
																-		f2.get_nchildren = NULL;
															
 
																-		f2.get_child_ops = NULL;
															
 
																+	struct starpu_data_filter f = {
															
 
																+		.filter_func = starpu_vertical_block_filter_func,
															
 
																+		.nchildren = nblocks
															
 
																+	};
															
 
																+
															
 
																+	struct starpu_data_filter f2 = {
															
 
																+		.filter_func = starpu_block_filter_func,
															
 
																+		.nchildren = nblocks
															
 
																+	};
															
 
																 	starpu_data_map_filters(dataA, 2, &f, &f2);
															
--- a/examples/heat/dw_factolu.h
+++ b/examples/heat/dw_factolu.h
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2009, 2010  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -36,6 +36,8 @@
 
																 #include "lu_kernels_model.h"
															
 
																+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
															
 
																+
															
 
																 #define BLAS3_FLOP(n1,n2,n3)    \
															
 
																         (2*((uint64_t)n1)*((uint64_t)n2)*((uint64_t)n3))
															
@@ -82,53 +84,53 @@ static void __attribute__ ((unused)) compare_A_LU(float *A, float *LU,
 
																 #if 0
															
 
																 	/* display L */
															
 
																-	printf("(LU): \n");
															
 
																+	FPRINTF(stdout, "(LU): \n");
															
 
																 	for (j = 0; j < size; j++)
															
 
																 	{
															
 
																 		for (i = 0; i < size; i++)
															
 
																 		{
															
 
																-//			if (i <= j) {
															
 
																-				printf("%2.2f\t", LU[j +i*size]);
															
 
																-//			}
															
 
																-//			else {
															
 
																-//				printf(".\t");
															
 
																-//			}
															
 
																+/*			if (i <= j) { */
															
 
																+				FPRINTF(stdout, "%2.2f\t", LU[j +i*size]);
															
 
																+/*			}
															
 
																+			else {
															
 
																+				FPRINTF(stdout, ".\t");
															
 
																+			} */
															
 
																 		}
															
 
																-		printf("\n");
															
 
																+		FPRINTF(stdout, "\n");
															
 
																 	}
															
 
																 	/* display L */
															
 
																-	printf("L: \n");
															
 
																+	FPRINTF(stdout, "L: \n");
															
 
																 	for (j = 0; j < size; j++)
															
 
																 	{
															
 
																 		for (i = 0; i < size; i++)
															
 
																 		{
															
 
																-//			if (i <= j) {
															
 
																-				printf("%2.2f\t", L[j +i*size]);
															
 
																-//			}
															
 
																-//			else {
															
 
																-//				printf(".\t");
															
 
																-//			}
															
 
																+/*			if (i <= j) { */
															
 
																+				FPRINTF(stdout, "%2.2f\t", L[j +i*size]);
															
 
																+/*			}
															
 
																+			else {
															
 
																+				FPRINTF(stdout, ".\t");
															
 
																+			} */
															
 
																 		}
															
 
																-		printf("\n");
															
 
																+		FPRINTF(stdout, "\n");
															
 
																 	}
															
 
																 	/* display U */
															
 
																-	printf("U: \n");
															
 
																+	FPRINTF(stdout, "U: \n");
															
 
																 	for (j = 0; j < size; j++)
															
 
																 	{
															
 
																 		for (i = 0; i < size; i++)
															
 
																 		{
															
 
																-//			if (i <= j) {
															
 
																-				printf("%2.2f\t", U[j +i*size]);
															
 
																-//			}
															
 
																-//			else {
															
 
																-//				printf(".\t");
															
 
																-//			}
															
 
																+/*			if (i <= j) { */
															
 
																+				FPRINTF(stdout, "%2.2f\t", U[j +i*size]);
															
 
																+/*			}
															
 
																+			else {
															
 
																+				FPRINTF(stdout, ".\t");
															
 
																+			} */
															
 
																 		}
															
 
																-		printf("\n");
															
 
																+		FPRINTF(stdout, "\n");
															
 
																 	}
															
 
																 #endif
															
@@ -148,42 +150,42 @@ static void __attribute__ ((unused)) compare_A_LU(float *A, float *LU,
 
																 #if 0
															
 
																 	/* display A */
															
 
																-	printf("A: \n");
															
 
																+	FPRINTF(stdout, "A: \n");
															
 
																 	for (j = 0; j < size; j++)
															
 
																 	{
															
 
																 		for (i = 0; i < size; i++)
															
 
																 		{
															
 
																-	//		if (i <= j) {
															
 
																-	      			printf("%2.2f\t", A[j +i*size]);
															
 
																-	//		}
															
 
																-	//		else {
															
 
																-	//			printf(".\t");
															
 
																-	//		}
															
 
																+	/*		if (i <= j) { */
															
 
																+	      			FPRINTF(stdout, "%2.2f\t", A[j +i*size]);
															
 
																+	/*		}
															
 
																+			else {
															
 
																+				FPRINTF(stdout, ".\t");
															
 
																+			} */
															
 
																 		}
															
 
																-		printf("\n");
															
 
																+		FPRINTF(stdout, "\n");
															
 
																 	}
															
 
																 	/* display LU */
															
 
																-	printf("LU: \n");
															
 
																+	FPRINTF(stdout, "LU: \n");
															
 
																 	for (j = 0; j < size; j++)
															
 
																 	{
															
 
																 		for (i = 0; i < size; i++)
															
 
																 		{
															
 
																-	//		if (i <= j) {
															
 
																-	      			printf("%2.2f\t", L[j +i*size]);
															
 
																-	//		}
															
 
																-	//		else {
															
 
																-	//			printf(".\t");
															
 
																-	//		}
															
 
																+	/*		if (i <= j) { */
															
 
																+	      			FPRINTF(stdout, "%2.2f\t", L[j +i*size]);
															
 
																+	/*		}
															
 
																+			else {
															
 
																+				FPRINTF(stdout, ".\t");
															
 
																+			} */
															
 
																 		}
															
 
																-		printf("\n");
															
 
																+		FPRINTF(stdout, "\n");
															
 
																 	}
															
 
																 #endif
															
 
																-	printf("max error between A and L*U = %f \n", max_err);
															
 
																+	FPRINTF(stdout, "max error between A and L*U = %f \n", max_err);
															
 
																 }
															
 
																-#endif // CHECK_RESULTS
															
 
																+#endif /* CHECK_RESULTS */
															
 
																 void dw_cpu_codelet_update_u11(void **, void *);
															
 
																 void dw_cpu_codelet_update_u12(void **, void *);
															
@@ -211,4 +213,4 @@ extern struct starpu_perfmodel_t model_12;
 
																 extern struct starpu_perfmodel_t model_21;
															
 
																 extern struct starpu_perfmodel_t model_22;
															
 
																-#endif // __DW_FACTO_LU_H__
															
 
																+#endif /* __DW_FACTO_LU_H__ */
															
--- a/examples/heat/dw_factolu_grain.c
+++ b/examples/heat/dw_factolu_grain.c
@@ -1,8 +1,8 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2009, 2010  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
															
 
																  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
															
 
																- * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -54,7 +54,7 @@ static starpu_codelet cl11 = {
 
																 static struct starpu_task *create_task_11(starpu_data_handle dataA, unsigned k, unsigned tag_prefix)
															
 
																 {
															
 
																-//	printf("task 11 k = %d TAG = %llx\n", k, (TAG11(k)));
															
 
																+/*	FPRINTF(stdout, "task 11 k = %d TAG = %llx\n", k, (TAG11(k))); */
															
 
																 	struct starpu_task *task = create_task(TAG11(k, tag_prefix));
															
@@ -87,7 +87,7 @@ static starpu_codelet cl12 = {
 
																 static void create_task_12(starpu_data_handle dataA, unsigned k, unsigned i, unsigned tag_prefix)
															
 
																 {
															
 
																-//	printf("task 12 k,i = %d,%d TAG = %llx\n", k,i, TAG12(k,i));
															
 
																+/*	FPRINTF(stdout, "task 12 k,i = %d,%d TAG = %llx\n", k,i, TAG12(k,i)); */
															
 
																 	struct starpu_task *task = create_task(TAG12(k, i, tag_prefix));
															
@@ -163,7 +163,7 @@ static starpu_codelet cl22 = {
 
																 static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, unsigned j, unsigned tag_prefix)
															
 
																 {
															
 
																-//	printf("task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j));
															
 
																+/*	FPRINTF(stdout, "task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j)); */
															
 
																 	struct starpu_task *task = create_task(TAG22(k, i, j, tag_prefix));
															
@@ -207,17 +207,15 @@ static void dw_factoLU_grain_inner(float *matA, unsigned size, unsigned inner_si
 
																 	unsigned nblocks = size / blocksize;
															
 
																 	unsigned maxk = inner_size / blocksize;
															
 
																-	struct starpu_data_filter f;
															
 
																-		f.filter_func = starpu_vertical_block_filter_func;
															
 
																-		f.nchildren = nblocks;
															
 
																-		f.get_nchildren = NULL;
															
 
																-		f.get_child_ops = NULL;
															
 
																+	struct starpu_data_filter f = {
															
 
																+		.filter_func = starpu_vertical_block_filter_func,
															
 
																+		.nchildren = nblocks
															
 
																+	};
															
 
																-	struct starpu_data_filter f2;
															
 
																-		f2.filter_func = starpu_block_filter_func;
															
 
																-		f2.nchildren = nblocks;
															
 
																-		f2.get_nchildren = NULL;
															
 
																-		f2.get_child_ops = NULL;
															
 
																+	struct starpu_data_filter f2 = {
															
 
																+		.filter_func = starpu_block_filter_func,
															
 
																+		.nchildren = nblocks
															
 
																+	};
															
 
																 	starpu_data_map_filters(dataA, 2, &f, &f2);
															
@@ -262,7 +260,7 @@ static void dw_factoLU_grain_inner(float *matA, unsigned size, unsigned inner_si
 
																 	int ret = starpu_task_submit(entry_task);
															
 
																 	if (STARPU_UNLIKELY(ret == -ENODEV))
															
 
																 	{
															
 
																-		fprintf(stderr, "No worker may execute this task\n");
															
 
																+		FPRINTF(stderr, "No worker may execute this task\n");
															
 
																 		exit(-1);
															
 
																 	}
															
@@ -299,13 +297,13 @@ static void dw_factoLU_grain_inner(float *matA, unsigned size, unsigned inner_si
 
																 		float *newmatA = &matA[inner_size*(ld+1)];
															
 
																-//		if (tag_prefix < 2)
															
 
																-//		{
															
 
																-//			dw_factoLU_grain_inner(newmatA, size-inner_size, (size-inner_size)/2, ld, blocksize/2, tag_prefix+1);
															
 
																-//		}
															
 
																-//		else {
															
 
																+/*		if (tag_prefix < 2)
															
 
																+		{
															
 
																+			dw_factoLU_grain_inner(newmatA, size-inner_size, (size-inner_size)/2, ld, blocksize/2, tag_prefix+1);
															
 
																+		}
															
 
																+		else { */
															
 
																 			dw_factoLU_grain_inner(newmatA, size-inner_size, size-inner_size, ld, blocksize/2, tag_prefix+1);
															
 
																-//		}
															
 
																+/*		} */
															
 
																 	}
															
 
																 }
															
@@ -314,7 +312,7 @@ void dw_factoLU_grain(float *matA, unsigned size, unsigned ld, unsigned nblocks,
 
																 {
															
 
																 #ifdef CHECK_RESULTS
															
 
																-	fprintf(stderr, "Checking results ...\n");
															
 
																+	FPRINTF(stderr, "Checking results ...\n");
															
 
																 	float *Asaved;
															
 
																 	Asaved = malloc(ld*ld*sizeof(float));
															
@@ -333,12 +331,12 @@ void dw_factoLU_grain(float *matA, unsigned size, unsigned ld, unsigned nblocks,
 
																 	gettimeofday(&end, NULL);
															
 
																 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
															
 
																-	fprintf(stderr, "Computation took (in ms)\n");
															
 
																-	printf("%2.2f\n", timing/1000);
															
 
																+	FPRINTF(stderr, "Computation took (in ms)\n");
															
 
																+	FPRINTF(stdout, "%2.2f\n", timing/1000);
															
 
																 	unsigned n = size;
															
 
																 	double flop = (2.0f*n*n*n)/3.0f;
															
 
																-	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
															
 
																+	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
															
 
																 #ifdef CHECK_RESULTS
															
 
																 	compare_A_LU(Asaved, matA, size, ld);
															
--- a/examples/heat/dw_factolu_kernels.c
+++ b/examples/heat/dw_factolu_kernels.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2009, 2010  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -33,7 +33,7 @@ void display_stat_heat(void)
 
																 {
															
 
																 	unsigned nworkers = starpu_worker_get_count();
															
 
																-	fprintf(stderr, "STATS : \n");
															
 
																+	FPRINTF(stderr, "STATS : \n");
															
 
																 	unsigned worker;
															
 
																 	for (worker = 0; worker < nworkers; worker++)
															
@@ -49,7 +49,7 @@ void display_stat_heat(void)
 
																 		count_22_total += count_22_per_worker[worker];
															
 
																 	}
															
 
																-	fprintf(stderr, "\t11 (diagonal block LU)\n");
															
 
																+	FPRINTF(stderr, "\t11 (diagonal block LU)\n");
															
 
																 	for (worker = 0; worker < nworkers; worker++)
															
 
																 	{
															
 
																 		if (count_total_per_worker[worker])
															
@@ -57,11 +57,11 @@ void display_stat_heat(void)
 
																 			char name[32];
															
 
																 			starpu_worker_get_name(worker, name, 32);
															
 
																-			fprintf(stderr, "\t\t%s -> %d / %d (%2.2f %%)\n", name, count_11_per_worker[worker], count_11_total, (100.0*count_11_per_worker[worker])/count_11_total);
															
 
																+			FPRINTF(stderr, "\t\t%s -> %u / %u (%2.2f %%)\n", name, count_11_per_worker[worker], count_11_total, (100.0*count_11_per_worker[worker])/count_11_total);
															
 
																 		}
															
 
																 	}
															
 
																-	fprintf(stderr, "\t12 (TRSM)\n");
															
 
																+	FPRINTF(stderr, "\t12 (TRSM)\n");
															
 
																 	for (worker = 0; worker < nworkers; worker++)
															
 
																 	{
															
 
																 		if (count_total_per_worker[worker])
															
@@ -69,12 +69,12 @@ void display_stat_heat(void)
 
																 			char name[32];
															
 
																 			starpu_worker_get_name(worker, name, 32);
															
 
																-			fprintf(stderr, "\t\t%s -> %d / %d (%2.2f %%)\n", name, count_12_per_worker[worker], count_12_total, (100.0*count_12_per_worker[worker])/count_12_total);
															
 
																+			FPRINTF(stderr, "\t\t%s -> %u / %u (%2.2f %%)\n", name, count_12_per_worker[worker], count_12_total, (100.0*count_12_per_worker[worker])/count_12_total);
															
 
																 		}
															
 
																 	}
															
 
																-	fprintf(stderr, "\t21 (TRSM)\n");
															
 
																+	FPRINTF(stderr, "\t21 (TRSM)\n");
															
 
																 	for (worker = 0; worker < nworkers; worker++)
															
 
																 	{
															
 
																 		if (count_total_per_worker[worker])
															
@@ -82,11 +82,11 @@ void display_stat_heat(void)
 
																 			char name[32];
															
 
																 			starpu_worker_get_name(worker, name, 32);
															
 
																-			fprintf(stderr, "\t\t%s -> %d / %d (%2.2f %%)\n", name, count_21_per_worker[worker], count_21_total, (100.0*count_21_per_worker[worker])/count_21_total);
															
 
																+			FPRINTF(stderr, "\t\t%s -> %u / %u (%2.2f %%)\n", name, count_21_per_worker[worker], count_21_total, (100.0*count_21_per_worker[worker])/count_21_total);
															
 
																 		}
															
 
																 	}
															
 
																-	fprintf(stderr, "\t22 (SGEMM)\n");
															
 
																+	FPRINTF(stderr, "\t22 (SGEMM)\n");
															
 
																 	for (worker = 0; worker < nworkers; worker++)
															
 
																 	{
															
 
																 		if (count_total_per_worker[worker])
															
@@ -94,7 +94,7 @@ void display_stat_heat(void)
 
																 			char name[32];
															
 
																 			starpu_worker_get_name(worker, name, 32);
															
 
																-			fprintf(stderr, "\t\t%s -> %d / %d (%2.2f %%)\n", name, count_22_per_worker[worker], count_22_total, (100.0*count_22_per_worker[worker])/count_22_total);
															
 
																+			FPRINTF(stderr, "\t\t%s -> %u / %u (%2.2f %%)\n", name, count_22_per_worker[worker], count_22_total, (100.0*count_22_per_worker[worker])/count_22_total);
															
 
																 		}
															
 
																 	}
															
 
																 }
															
@@ -162,7 +162,7 @@ void dw_cublas_codelet_update_u22(void *descr[], void *_args)
 
																 	int id = starpu_worker_get_id();
															
 
																 	count_22_per_worker[id]++;
															
 
																 }
															
 
																-#endif// STARPU_USE_CUDA
															
 
																+#endif /* STARPU_USE_CUDA */
															
 
																 /*
															
 
																  * U12
															
@@ -225,7 +225,7 @@ void dw_cublas_codelet_update_u12(void *descr[], void *_args)
 
																 	int id = starpu_worker_get_id();
															
 
																 	count_12_per_worker[id]++;
															
 
																 }
															
 
																-#endif // STARPU_USE_CUDA
															
 
																+#endif /* STARPU_USE_CUDA */
															
 
																 /* 
															
 
																  * U21
															
@@ -298,12 +298,12 @@ static inline void debug_print(float *tab, unsigned ld, unsigned n)
 
																 	{
															
 
																 		for (i = 0; i < n; i++)
															
 
																 		{
															
 
																-			fprintf(stderr, "%2.2f\t", tab[(size_t)j+(size_t)i*ld]);
															
 
																+			FPRINTF(stderr, "%2.2f\t", tab[(size_t)j+(size_t)i*ld]);
															
 
																 		}
															
 
																-		fprintf(stderr, "\n");
															
 
																+		FPRINTF(stderr, "\n");
															
 
																 	}
															
 
																-	fprintf(stderr, "\n");
															
 
																+	FPRINTF(stderr, "\n");
															
 
																 }
															
 
																 static inline void dw_common_codelet_update_u11(void *descr[], int s, __attribute__((unused)) void *_args) 
															
@@ -378,4 +378,4 @@ void dw_cublas_codelet_update_u11(void *descr[], void *_args)
 
																 	int id = starpu_worker_get_id();
															
 
																 	count_11_per_worker[id]++;
															
 
																 }
															
 
																-#endif// STARPU_USE_CUDA
															
 
																+#endif /* STARPU_USE_CUDA */
															
--- a/examples/heat/dw_factolu_tag.c
+++ b/examples/heat/dw_factolu_tag.c
@@ -1,8 +1,8 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2009, 2010  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
															
 
																  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
															
 
																- * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -56,7 +56,7 @@ static starpu_codelet cl11 = {
 
																 static struct starpu_task *create_task_11(starpu_data_handle dataA, unsigned k)
															
 
																 {
															
 
																-//	printf("task 11 k = %d TAG = %llx\n", k, (TAG11(k)));
															
 
																+/*	printf("task 11 k = %d TAG = %llx\n", k, (TAG11(k))); */
															
 
																 	struct starpu_task *task = create_task(TAG11(k));
															
@@ -90,7 +90,7 @@ static starpu_codelet cl12 = {
 
																 static void create_task_12(starpu_data_handle dataA, unsigned k, unsigned i)
															
 
																 {
															
 
																-//	printf("task 12 k,i = %d,%d TAG = %llx\n", k,i, TAG12(k,i));
															
 
																+/*	printf("task 12 k,i = %d,%d TAG = %llx\n", k,i, TAG12(k,i)); */
															
 
																 	struct starpu_task *task = create_task(TAG12(k, i));
															
@@ -166,7 +166,7 @@ static starpu_codelet cl22 = {
 
																 static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, unsigned j)
															
 
																 {
															
 
																-//	printf("task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j));
															
 
																+/*	printf("task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j)); */
															
 
																 	struct starpu_task *task = create_task(TAG22(k, i, j));
															
@@ -241,7 +241,7 @@ static void dw_codelet_facto_v3(starpu_data_handle dataA, unsigned nblocks)
 
																 	int ret = starpu_task_submit(entry_task);
															
 
																 	if (STARPU_UNLIKELY(ret == -ENODEV))
															
 
																 	{
															
 
																-		fprintf(stderr, "No worker may execute this task\n");
															
 
																+		FPRINTF(stderr, "No worker may execute this task\n");
															
 
																 		exit(-1);
															
 
																 	}
															
@@ -253,19 +253,19 @@ static void dw_codelet_facto_v3(starpu_data_handle dataA, unsigned nblocks)
 
																 	gettimeofday(&end, NULL);
															
 
																 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
															
 
																-	fprintf(stderr, "Computation took (in ms)\n");
															
 
																+	FPRINTF(stderr, "Computation took (in ms)\n");
															
 
																 	printf("%2.2f\n", timing/1000);
															
 
																 	unsigned n = starpu_matrix_get_nx(dataA);
															
 
																 	double flop = (2.0f*n*n*n)/3.0f;
															
 
																-	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
															
 
																+	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
															
 
																 }
															
 
																 void dw_factoLU_tag(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned _no_prio)
															
 
																 {
															
 
																 #ifdef CHECK_RESULTS
															
 
																-	fprintf(stderr, "Checking results ...\n");
															
 
																+	FPRINTF(stderr, "Checking results ...\n");
															
 
																 	float *Asaved;
															
 
																 	Asaved = malloc((size_t)ld*ld*sizeof(float));
															
@@ -280,17 +280,15 @@ void dw_factoLU_tag(float *matA, unsigned size, unsigned ld, unsigned nblocks, u
 
																 	 * one block is now determined by 2 unsigned (i,j) */
															
 
																 	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(float));
															
 
																-	struct starpu_data_filter f;
															
 
																-		f.filter_func = starpu_vertical_block_filter_func;
															
 
																-		f.nchildren = nblocks;
															
 
																-		f.get_nchildren = NULL;
															
 
																-		f.get_child_ops = NULL;
															
 
																-
															
 
																-	struct starpu_data_filter f2;
															
 
																-		f2.filter_func = starpu_block_filter_func;
															
 
																-		f2.nchildren = nblocks;
															
 
																-		f2.get_nchildren = NULL;
															
 
																-		f2.get_child_ops = NULL;
															
 
																+	struct starpu_data_filter f = {
															
 
																+		.filter_func = starpu_vertical_block_filter_func,
															
 
																+		.nchildren = nblocks
															
 
																+	};
															
 
																+
															
 
																+	struct starpu_data_filter f2 = {
															
 
																+		.filter_func = starpu_block_filter_func,
															
 
																+		.nchildren = nblocks
															
 
																+	};
															
 
																 	starpu_data_map_filters(dataA, 2, &f, &f2);
															
--- a/examples/heat/dw_sparse_cg.c
+++ b/examples/heat/dw_sparse_cg.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -20,6 +20,7 @@
 
																  */
															
 
																 #include "dw_sparse_cg.h"
															
 
																+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
															
 
																 static struct starpu_task *create_task(starpu_tag_t id)
															
 
																 {
															
@@ -298,13 +299,13 @@ void iteration_cg(void *problem)
 
																 {
															
 
																 	struct cg_problem *pb = problem;
															
 
																-	printf("i : %d (MAX %d)\n\tdelta_new %f (%f)\n", pb->i, MAXITER, pb->delta_new, sqrt(pb->delta_new / pb->size));
															
 
																+	FPRINTF(stdout, "i : %d (MAX %d)\n\tdelta_new %f (%f)\n", pb->i, MAXITER, pb->delta_new, sqrt(pb->delta_new / pb->size));
															
 
																 	if ((pb->i < MAXITER) && 
															
 
																 		(pb->delta_new > pb->epsilon) )
															
 
																 	{
															
 
																 		if (pb->i % 1000 == 0)
															
 
																-			printf("i : %d\n\tdelta_new %f (%f)\n", pb->i, pb->delta_new, sqrt(pb->delta_new / pb->size));
															
 
																+			FPRINTF(stdout, "i : %d\n\tdelta_new %f (%f)\n", pb->i, pb->delta_new, sqrt(pb->delta_new / pb->size));
															
 
																 		pb->i++;
															
@@ -313,8 +314,8 @@ void iteration_cg(void *problem)
 
																 	}
															
 
																 	else {
															
 
																 		/* we may stop */
															
 
																-		printf("We are done ... after %d iterations \n", pb->i - 1);
															
 
																-		printf("i : %d\n\tdelta_new %2.5f\n", pb->i, pb->delta_new);
															
 
																+		FPRINTF(stdout, "We are done ... after %d iterations \n", pb->i - 1);
															
 
																+		FPRINTF(stdout, "i : %d\n\tdelta_new %2.5f\n", pb->i, pb->delta_new);
															
 
																 		sem_post(pb->sem);
															
 
																 	}
															
 
																 }
															
@@ -353,7 +354,7 @@ void conjugate_gradient(float *nzvalA, float *vecb, float *vecx, uint32_t nnz,
 
																 		ptr_vecq[i] = 0.0f;
															
 
																 	}
															
 
																-	printf("nrow = %d \n", nrow);
															
 
																+	FPRINTF(stdout, "nrow = %u \n", nrow);
															
 
																 	/* and register them as well */
															
 
																 	starpu_vector_data_register(&ds_vecr, 0, (uintptr_t)ptr_vecr, nrow, sizeof(float));
															
--- a/examples/heat/dw_sparse_cg.h
+++ b/examples/heat/dw_sparse_cg.h
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2009, 2010  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
															
 
																  * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
@@ -101,7 +101,7 @@ static void __attribute__ ((unused)) print_results(float *result, unsigned size)
 
																 	for (i = 0; i < STARPU_MIN(size, 16); i++)
															
 
																 	{
															
 
																-		printf("%d -> %f\n", i, result[i]);
															
 
																+		printf("%u -> %f\n", i, result[i]);
															
 
																 	}
															
 
																 }
															
@@ -134,4 +134,4 @@ void iteration_cg(void *problem);
 
																 void conjugate_gradient(float *nzvalA, float *vecb, float *vecx, uint32_t nnz,
															
 
																 			unsigned nrow, uint32_t *colind, uint32_t *rowptr);
															
 
																-#endif // __DW_SPARSE_CG_H__
															
 
																+#endif /* __DW_SPARSE_CG_H__ */
															
--- a/examples/heat/dw_sparse_cg_kernels.c
+++ b/examples/heat/dw_sparse_cg_kernels.c
@@ -64,10 +64,8 @@ void cpu_codelet_func_1(void *descr[], __attribute__((unused)) void *arg)
 
																 	float *vecb = (float *)STARPU_VECTOR_GET_PTR(descr[3]);
															
 
																-	uint32_t nnz;
															
 
																 	uint32_t nrow;
															
 
																-	nnz = STARPU_CSR_GET_NNZ(descr[0]);
															
 
																 	nrow = STARPU_CSR_GET_NROW(descr[0]);
															
 
																 	unsigned row;
															
@@ -173,10 +171,8 @@ void cpu_codelet_func_4(void *descr[], __attribute__((unused)) void *arg)
 
																 	float *vecd = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
															
 
																 	float *vecq = (float *)STARPU_VECTOR_GET_PTR(descr[2]);
															
 
																-	uint32_t nnz;
															
 
																 	uint32_t nrow;
															
 
																-	nnz = STARPU_CSR_GET_NNZ(descr[0]);
															
 
																 	nrow = STARPU_CSR_GET_NROW(descr[0]);
															
 
																 	unsigned row;
															
--- a/examples/heat/heat.c
+++ b/examples/heat/heat.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2009, 2010  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -321,7 +321,7 @@ static void solve_system(unsigned size, unsigned subsize, float *result, int *Re
 
																 	/* solve the actual problem LU X = B */
															
 
																         /* solve LX' = Y with X' = UX */
															
 
																         /* solve UX = X' */
															
 
																-	fprintf(stderr, "Solving the problem ...\n");
															
 
																+	FPRINTF(stderr, "Solving the problem ...\n");
															
 
																 	float *savedB;
															
 
																 	float *LUB;
															
@@ -360,10 +360,10 @@ static void solve_system(unsigned size, unsigned subsize, float *result, int *Re
 
																 		/* check if LUB is close to the 0 vector */
															
 
																 		int maxind = ISAMAX(subsize, LUB, 1);
															
 
																-		fprintf(stderr, "max error (LUX - B) = %e\n",LUB[maxind - 1]);
															
 
																+		FPRINTF(stderr, "max error (LUX - B) = %e\n",LUB[maxind - 1]);
															
 
																 		float sum = SASUM(subsize, LUB, 1);
															
 
																-		fprintf(stderr,"avg. error %e\n", sum/subsize);
															
 
																+		FPRINTF(stderr,"avg. error %e\n", sum/subsize);
															
 
																 		free(LUB);
															
 
																 		free(savedB);
															
@@ -494,10 +494,10 @@ static unsigned long build_neighbour_vector(unsigned long*neighbours, unsigned n
 
																 				if ((former_theta + dtheta) >= 0 && (former_theta + dtheta) <= (int)ntheta )
															
 
																 				{
															
 
																 					/* we got a possible neighbour */
															
 
																-					unsigned node = 
															
 
																+					unsigned pnode = 
															
 
																 						NODE_NUMBER((former_theta + dtheta), (former_thick + dthick));
															
 
																-					neighbours[nneighbours++] = TRANSLATEBACK(node);
															
 
																+					neighbours[nneighbours++] = TRANSLATEBACK(pnode);
															
 
																 				}
															
 
																 			}
															
 
																 		}
															
@@ -569,10 +569,10 @@ static void build_sparse_stiffness_matrix_B(point *pmesh, float *B, float *Bform
 
																 		for (neighbour = 0; neighbour < nneighbours; neighbour++)
															
 
																 		{
															
 
																-			unsigned i = neighbours[neighbour]; 
															
 
																-			if (i >= newsize)
															
 
																+			unsigned n = neighbours[neighbour]; 
															
 
																+			if (n >= newsize)
															
 
																 			{
															
 
																-				B[j] -= compute_A_value(TRANSLATE(i), TRANSLATE(j), pmesh)*Bformer[TRANSLATE(i)];
															
 
																+				B[j] -= compute_A_value(TRANSLATE(n), TRANSLATE(j), pmesh)*Bformer[TRANSLATE(n)];
															
 
																 			}
															
 
																 		}
															
 
																 	}
															
@@ -729,7 +729,7 @@ int main(int argc, char **argv)
 
																 		build_dense_stiffness_matrix_A(pmesh, A, newsize, RefArray, RefArrayBack);
															
 
																-		fprintf(stderr, "Problem size : %dx%d (%dx%d) (%ld MB)\n", newsize, newsize, DIM, DIM, ((unsigned long)newsize*newsize*4UL)/(1024*1024));
															
 
																+		FPRINTF(stderr, "Problem size : %ux%u (%ux%u) (%lu MB)\n", newsize, newsize, DIM, DIM, ((unsigned long)newsize*newsize*4UL)/(1024*1024));
															
 
																 		STARPU_ASSERT(newsize % nblocks == 0);
															
--- a/examples/heat/heat.h
+++ b/examples/heat/heat.h
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -24,7 +24,7 @@
 
																 #include <assert.h>
															
 
																 #include <math.h>
															
 
																-// needed for STARPU_OPENGL_RENDER
															
 
																+/* needed for STARPU_OPENGL_RENDER */
															
 
																 #include <starpu_config.h>
															
 
																 #include <starpu.h>
															
@@ -36,6 +36,8 @@
 
																 #include <GL/glut.h>
															
 
																 #endif
															
 
																+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
															
 
																+
															
 
																 #define X	0
															
 
																 #define Y	1
															
@@ -66,4 +68,4 @@ void display_stat_heat(void);
 
																 extern void opengl_render(unsigned _ntheta, unsigned _nthick, float *_result, point *_pmesh, int argc_, char **argv_);
															
 
																 #endif
															
 
																-#endif // __HEAT_H__
															
 
																+#endif /* __HEAT_H__ */
															
--- a/examples/heat/heat_display.c
+++ b/examples/heat/heat_display.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2009, 2010  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -133,8 +133,8 @@ static void display(void)
 
																 	float factor = 1.0/amplitude;
															
 
																 	glScalef (factor, factor, factor);      /* modeling transformation */
															
 
																 	gluLookAt (xcenter, ycenter, 30.0f, xcenter, ycenter, 0.0f, 0.0f, 1.0f, 0.0f);
															
 
																-//	printf("factor %f\n", factor);
															
 
																-	//   glRotatef(-0,0.0,0.0,0.0);
															
 
																+/*	printf("factor %f\n", factor);
															
 
																+	   glRotatef(-0,0.0,0.0,0.0); */
															
 
																 	generate_graph();
															
 
																 	glFlush ();
															
 
																 }
															
@@ -211,7 +211,7 @@ void find_limits(void)
 
																 void opengl_render(unsigned _ntheta, unsigned _nthick, float *_result, point *_pmesh, int argc_, char **argv_)
															
 
																 {
															
 
																-	fprintf(stderr, "OpenGL rendering ... \n");
															
 
																+	FPRINTF(stderr, "OpenGL rendering ... \n");
															
 
																 	ntheta = _ntheta;
															
 
																 	nthick = _nthick;
															
@@ -236,4 +236,4 @@ void opengl_render(unsigned _ntheta, unsigned _nthick, float *_result, point *_p
 
																 	glutReshapeFunc(reshape);
															
 
																 	glutMainLoop();
															
 
																 }
															
 
																-#endif // STARPU_OPENGL_RENDER
															
 
																+#endif /* STARPU_OPENGL_RENDER */
															
--- a/examples/heat/lu_kernels_model.c
+++ b/examples/heat/lu_kernels_model.c
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2009, 2010  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
															
 
																  * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
@@ -26,7 +26,7 @@
 
																  *	Number of flops of Gemm 
															
 
																  */
															
 
																-//#define USE_PERTURBATION	1
															
 
																+/* #define USE_PERTURBATION	1 */
															
 
																 #ifdef USE_PERTURBATION
															
@@ -58,10 +58,10 @@ double task_12_cost(starpu_buffer_descr *descr)
 
																 	n = starpu_matrix_get_nx(descr[0].handle);
															
 
																-//	double cost = ((n*n*n)/1744.695);
															
 
																+/*	double cost = ((n*n*n)/1744.695); */
															
 
																 	double cost = ((n*n*n)/3210.80);
															
 
																-	//fprintf(stderr, "task 12 predicts %e\n", cost);
															
 
																+	/* fprintf(stderr, "task 12 predicts %e\n", cost); */
															
 
																 	return PERTURBATE(cost);
															
 
																 }
															
@@ -72,10 +72,10 @@ double task_21_cost(starpu_buffer_descr *descr)
 
																 	n = starpu_matrix_get_nx(descr[0].handle);
															
 
																-//	double cost = ((n*n*n)/1744.695);
															
 
																+/*	double cost = ((n*n*n)/1744.695); */
															
 
																 	double cost = ((n*n*n)/3691.53);
															
 
																-	//fprintf(stderr, "task 12 predicts %e\n", cost);
															
 
																+	/* fprintf(stderr, "task 12 predicts %e\n", cost); */
															
 
																 	return PERTURBATE(cost);
															
 
																 }
															
@@ -109,7 +109,7 @@ double task_11_cost_cuda(starpu_buffer_descr *descr)
 
																 	double cost = ((n*n*n)/1853.7806);
															
 
																-//	printf("CUDA task 11 ; predict %e\n", cost);
															
 
																+/*	printf("CUDA task 11 ; predict %e\n", cost); */
															
 
																 	return PERTURBATE(cost);
															
 
																 }
															
@@ -121,7 +121,7 @@ double task_12_cost_cuda(starpu_buffer_descr *descr)
 
																 	double cost = ((n*n*n)/42838.5718);
															
 
																-//	printf("CUDA task 12 ; predict %e\n", cost);
															
 
																+/*	printf("CUDA task 12 ; predict %e\n", cost); */
															
 
																 	return PERTURBATE(cost);
															
 
																 }
															
@@ -134,7 +134,7 @@ double task_21_cost_cuda(starpu_buffer_descr *descr)
 
																 	double cost = ((n*n*n)/49208.667);
															
 
																-//	printf("CUDA task 21 ; predict %e\n", cost);
															
 
																+/*	printf("CUDA task 21 ; predict %e\n", cost); */
															
 
																 	return PERTURBATE(cost);
															
 
																 }
															
@@ -150,7 +150,7 @@ double task_22_cost_cuda(starpu_buffer_descr *descr)
 
																 	double cost = ((nx*ny*nz)/57523.560);
															
 
																-//	printf("CUDA task 22 ; predict %e\n", cost);
															
 
																+/*	printf("CUDA task 22 ; predict %e\n", cost); */
															
 
																 	return PERTURBATE(cost);
															
 
																 }
															
@@ -168,7 +168,7 @@ double task_11_cost_cpu(starpu_buffer_descr *descr)
 
																 	double cost = ((n*n*n)/537.5);
															
 
																-//	printf("CPU task 11 ; predict %e\n", cost);
															
 
																+/*	printf("CPU task 11 ; predict %e\n", cost); */
															
 
																 	return PERTURBATE(cost);
															
 
																 }
															
@@ -180,7 +180,7 @@ double task_12_cost_cpu(starpu_buffer_descr *descr)
 
																 	double cost = ((n*n*n)/6668.224);
															
 
																-//	printf("CPU task 12 ; predict %e\n", cost);
															
 
																+/*	printf("CPU task 12 ; predict %e\n", cost); */
															
 
																 	return PERTURBATE(cost);
															
 
																 }
															
@@ -193,7 +193,7 @@ double task_21_cost_cpu(starpu_buffer_descr *descr)
 
																 	double cost = ((n*n*n)/6793.8423);
															
 
																-//	printf("CPU task 21 ; predict %e\n", cost);
															
 
																+/*	printf("CPU task 21 ; predict %e\n", cost); */
															
 
																 	return PERTURBATE(cost);
															
 
																 }
															
@@ -209,7 +209,7 @@ double task_22_cost_cpu(starpu_buffer_descr *descr)
 
																 	double cost = ((nx*ny*nz)/4203.0175);
															
 
																-//	printf("CPU task 22 ; predict %e\n", cost);
															
 
																+/*	printf("CPU task 22 ; predict %e\n", cost); */
															
 
																 	return PERTURBATE(cost);
															
 
																 }
															
--- a/examples/heat/lu_kernels_model.h
+++ b/examples/heat/lu_kernels_model.h
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2009, 2010  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
															
 
																  * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
@@ -20,4 +20,4 @@
 
																 #include <starpu.h>
															
 
																-#endif // __LU_KERNELS_MODEL_H__
															
 
																+#endif /* __LU_KERNELS_MODEL_H__ */
															
--- a/examples/incrementer/incrementer.c
+++ b/examples/incrementer/incrementer.c
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2009, 2010  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
															
 
																  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
@@ -20,6 +20,7 @@
 
																 #include <sys/time.h>
															
 
																 static unsigned niter = 50000;
															
 
																+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 extern void cuda_codelet(void *descr[], __attribute__ ((unused)) void *_args);
															
@@ -42,6 +43,9 @@ int main(int argc, char **argv)
 
																 {
															
 
																 	starpu_init(NULL);
															
 
																+#ifdef STARPU_SLOW_MACHINE
															
 
																+	niter /= 100;
															
 
																+#endif
															
 
																 	if (argc == 2)
															
 
																 		niter = atoi(argv[1]);
															
@@ -52,7 +56,7 @@ int main(int argc, char **argv)
 
																 			(uintptr_t)&float_array, 4, sizeof(float));
															
 
																 #ifdef STARPU_USE_OPENCL
															
 
																-        starpu_opencl_load_opencl_from_file("examples/incrementer/incrementer_kernels_opencl_kernel.cl", &opencl_program);
															
 
																+        starpu_opencl_load_opencl_from_file("examples/incrementer/incrementer_kernels_opencl_kernel.cl", &opencl_program, NULL);
															
 
																 #endif
															
 
																 	starpu_codelet cl =
															
@@ -88,7 +92,7 @@ int main(int argc, char **argv)
 
																 		int ret = starpu_task_submit(task);
															
 
																 		if (STARPU_UNLIKELY(ret == -ENODEV))
															
 
																 		{
															
 
																-			fprintf(stderr, "No worker may execute this task\n");
															
 
																+			FPRINTF(stderr, "No worker may execute this task\n");
															
 
																 			exit(0);
															
 
																 		}
															
 
																 	}
															
@@ -96,24 +100,24 @@ int main(int argc, char **argv)
 
																 	starpu_task_wait_for_all();
															
 
																 	/* update the array in RAM */
															
 
																-	starpu_data_acquire(float_array_handle, STARPU_R);
															
 
																+	starpu_data_unregister(float_array_handle);
															
 
																 	gettimeofday(&end, NULL);
															
 
																-	fprintf(stderr, "array -> %f, %f, %f, %f\n", float_array[0],
															
 
																+	FPRINTF(stderr, "array -> %f, %f, %f, %f\n", float_array[0],
															
 
																                 float_array[1], float_array[2], float_array[3]);
															
 
																+	STARPU_ASSERT(float_array[0] == niter);
															
 
																+
															
 
																 	if (float_array[0] != float_array[1] + float_array[2] + float_array[3]) {
															
 
																-		fprintf(stderr, "Incorrect result\n");
															
 
																+		FPRINTF(stderr, "Incorrect result\n");
															
 
																 		return 1;
															
 
																 	}
															
 
																-	starpu_data_release(float_array_handle);
															
 
																-
															
 
																 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 +
															
 
																 					(end.tv_usec - start.tv_usec));
															
 
																-	fprintf(stderr, "%d elems took %lf ms\n", niter, timing/1000);
															
 
																+	FPRINTF(stderr, "%u elems took %f ms\n", niter, timing/1000);
															
 
																 	starpu_shutdown();
															
--- a/examples/incrementer/incrementer_kernels_opencl.c
+++ b/examples/incrementer/incrementer_kernels_opencl.c
@@ -21,7 +21,7 @@
 
																 extern struct starpu_opencl_program opencl_program;
															
 
																 void opencl_codelet(void *descr[], void *_args)
															
 
																 {
															
 
																-	float *val = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
															
 
																+	cl_mem val = (cl_mem)STARPU_VECTOR_GET_PTR(descr[0]);
															
 
																 	cl_kernel kernel;
															
 
																 	cl_command_queue queue;
															
 
																 	cl_event event;
															
@@ -34,7 +34,7 @@ void opencl_codelet(void *descr[], void *_args)
 
																 	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																 	err = 0;
															
 
																-	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &val);
															
 
																+	err = clSetKernelArg(kernel, 0, sizeof(val), &val);
															
 
																 	if (err) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																 	{
															
--- a/examples/lu/lu_example.c
+++ b/examples/lu/lu_example.c
@@ -137,7 +137,7 @@ void copy_matrix_into_blocks(void)
 
																 	for (bj = 0; bj < nblocks; bj++)
															
 
																 	for (bi = 0; bi < nblocks; bi++)
															
 
																 	{
															
 
																-		starpu_data_malloc_pinned_if_possible((void **)&A_blocks[bi+nblocks*bj], (size_t)blocksize*blocksize*sizeof(TYPE));
															
 
																+		starpu_malloc((void **)&A_blocks[bi+nblocks*bj], (size_t)blocksize*blocksize*sizeof(TYPE));
															
 
																 		for (j = 0; j < blocksize; j++)
															
 
																 		for (i = 0; i < blocksize; i++)
															
@@ -151,7 +151,7 @@ void copy_matrix_into_blocks(void)
 
																 static void init_matrix(void)
															
 
																 {
															
 
																 	/* allocate matrix */
															
 
																-	starpu_data_malloc_pinned_if_possible((void **)&A, (size_t)size*size*sizeof(TYPE));
															
 
																+	starpu_malloc((void **)&A, (size_t)size*size*sizeof(TYPE));
															
 
																 	STARPU_ASSERT(A);
															
 
																 	starpu_srand48((long int)time(NULL));
															
@@ -341,7 +341,7 @@ int main(int argc, char **argv)
 
																 		} else {
															
 
																 			starpu_bound_compute(&min, NULL, 0);
															
 
																 			if (min != 0.)
															
 
																-				FPRINTF(stderr, "theoretical min: %lf ms\n", min);
															
 
																+				FPRINTF(stderr, "theoretical min: %f ms\n", min);
															
 
																 		}
															
 
																 	}
															
--- a/examples/lu/xlu.c
+++ b/examples/lu/xlu.c
@@ -236,17 +236,15 @@ void STARPU_LU(lu_decomposition)(TYPE *matA, unsigned size, unsigned ld, unsigne
 
																 	/* We already enforce deps by hand */
															
 
																 	starpu_data_set_sequential_consistency_flag(dataA, 0);
															
 
																-	struct starpu_data_filter f;
															
 
																-		f.filter_func = starpu_vertical_block_filter_func;
															
 
																-		f.nchildren = nblocks;
															
 
																-		f.get_nchildren = NULL;
															
 
																-		f.get_child_ops = NULL;
															
 
																-
															
 
																-	struct starpu_data_filter f2;
															
 
																-		f2.filter_func = starpu_block_filter_func;
															
 
																-		f2.nchildren = nblocks;
															
 
																-		f2.get_nchildren = NULL;
															
 
																-		f2.get_child_ops = NULL;
															
 
																+	struct starpu_data_filter f = {
															
 
																+		.filter_func = starpu_vertical_block_filter_func,
															
 
																+		.nchildren = nblocks
															
 
																+	};
															
 
																+
															
 
																+	struct starpu_data_filter f2 = {
															
 
																+		.filter_func = starpu_block_filter_func,
															
 
																+		.nchildren = nblocks
															
 
																+	};
															
 
																 	starpu_data_map_filters(dataA, 2, &f, &f2);
															
--- a/examples/lu/xlu_implicit.c
+++ b/examples/lu/xlu_implicit.c
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2010  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010-2011  Université de Bordeaux 1
															
 
																  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
															
 
																  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																  *
															
@@ -143,17 +143,15 @@ void STARPU_LU(lu_decomposition)(TYPE *matA, unsigned size, unsigned ld, unsigne
 
																 	 * one block is now determined by 2 unsigned (i,j) */
															
 
																 	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(TYPE));
															
 
																-	struct starpu_data_filter f;
															
 
																-		f.filter_func = starpu_vertical_block_filter_func;
															
 
																-		f.nchildren = nblocks;
															
 
																-		f.get_nchildren = NULL;
															
 
																-		f.get_child_ops = NULL;
															
 
																-
															
 
																-	struct starpu_data_filter f2;
															
 
																-		f2.filter_func = starpu_block_filter_func;
															
 
																-		f2.nchildren = nblocks;
															
 
																-		f2.get_nchildren = NULL;
															
 
																-		f2.get_child_ops = NULL;
															
 
																+	struct starpu_data_filter f = {
															
 
																+		.filter_func = starpu_vertical_block_filter_func,
															
 
																+		.nchildren = nblocks
															
 
																+	};
															
 
																+
															
 
																+	struct starpu_data_filter f2 = {
															
 
																+		.filter_func = starpu_block_filter_func,
															
 
																+		.nchildren = nblocks
															
 
																+	};
															
 
																 	starpu_data_map_filters(dataA, 2, &f, &f2);
															
--- a/examples/lu/xlu_implicit_pivot.c
+++ b/examples/lu/xlu_implicit_pivot.c
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2010  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010-2011  Université de Bordeaux 1
															
 
																  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
															
 
																  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																  *
															
@@ -189,17 +189,15 @@ void STARPU_LU(lu_decomposition_pivot)(TYPE *matA, unsigned *ipiv, unsigned size
 
																 	 * one block is now determined by 2 unsigned (i,j) */
															
 
																 	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(TYPE));
															
 
																-	struct starpu_data_filter f;
															
 
																-		f.filter_func = starpu_vertical_block_filter_func;
															
 
																-		f.nchildren = nblocks;
															
 
																-		f.get_nchildren = NULL;
															
 
																-		f.get_child_ops = NULL;
															
 
																-
															
 
																-	struct starpu_data_filter f2;
															
 
																-		f2.filter_func = starpu_block_filter_func;
															
 
																-		f2.nchildren = nblocks;
															
 
																-		f2.get_nchildren = NULL;
															
 
																-		f2.get_child_ops = NULL;
															
 
																+	struct starpu_data_filter f = {
															
 
																+		.filter_func = starpu_vertical_block_filter_func,
															
 
																+		.nchildren = nblocks
															
 
																+	};
															
 
																+
															
 
																+	struct starpu_data_filter f2 = {
															
 
																+		.filter_func = starpu_block_filter_func,
															
 
																+		.nchildren = nblocks
															
 
																+	};
															
 
																 	starpu_data_map_filters(dataA, 2, &f, &f2);
															
--- a/examples/mandelbrot/mandelbrot.c
+++ b/examples/mandelbrot/mandelbrot.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2010, 2011  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -29,14 +29,15 @@ int use_x11 = 1;
 
																 #endif
															
 
																 int demo = 0;
															
 
																+static double demozoom = 0.05;
															
 
																 /* NB: The X11 code is inspired from the http://locklessinc.com/articles/mandelbrot/ article */
															
 
																 static int nblocks = 20;
															
 
																 static int height = 400;
															
 
																 static int width = 640;
															
 
																-static int maxIt = 20000; // max number of iteration in the Mandelbrot function
															
 
																-static int niter = -1; // number of loops in case we don't use X11, -1 means infinite
															
 
																+static int maxIt = 20000; /* max number of iteration in the Mandelbrot function */
															
 
																+static int niter = -1; /* number of loops in case we don't use X11, -1 means infinite */
															
 
																 static int use_spmd = 0;
															
 
																 static double leftX = -0.745;
															
@@ -233,7 +234,7 @@ static void compute_block_opencl(void *descr[], void *cl_arg)
 
																 {
															
 
																 	int iby, block_size;
															
 
																 	double stepX, stepY;
															
 
																-	int *pcnt; // unused for CUDA tasks
															
 
																+	int *pcnt; /* unused for CUDA tasks */
															
 
																 	starpu_unpack_cl_args(cl_arg, &iby, &block_size, &stepX, &stepY, &pcnt);
															
 
																 	cl_mem data = (cl_mem)STARPU_VECTOR_GET_PTR(descr[0]);
															
@@ -247,15 +248,15 @@ static void compute_block_opencl(void *descr[], void *cl_arg)
 
																 	starpu_opencl_load_kernel(&kernel, &queue, &opencl_programs, "mandelbrot_kernel", devid);
															
 
																-	clSetKernelArg(kernel, 0, sizeof(cl_mem), &data);
															
 
																-	clSetKernelArg(kernel, 1, sizeof(double), &leftX);
															
 
																-	clSetKernelArg(kernel, 2, sizeof(double), &topY);
															
 
																-	clSetKernelArg(kernel, 3, sizeof(double), &stepX);
															
 
																-	clSetKernelArg(kernel, 4, sizeof(double), &stepY);
															
 
																-	clSetKernelArg(kernel, 5, sizeof(int), &maxIt);
															
 
																-	clSetKernelArg(kernel, 6, sizeof(int), &iby);
															
 
																-	clSetKernelArg(kernel, 7, sizeof(int), &block_size);
															
 
																-	clSetKernelArg(kernel, 8, sizeof(int), &width);
															
 
																+	clSetKernelArg(kernel, 0, sizeof(data), &data);
															
 
																+	clSetKernelArg(kernel, 1, sizeof(leftX), &leftX);
															
 
																+	clSetKernelArg(kernel, 2, sizeof(topY), &topY);
															
 
																+	clSetKernelArg(kernel, 3, sizeof(stepX), &stepX);
															
 
																+	clSetKernelArg(kernel, 4, sizeof(stepY), &stepY);
															
 
																+	clSetKernelArg(kernel, 5, sizeof(maxIt), &maxIt);
															
 
																+	clSetKernelArg(kernel, 6, sizeof(iby), &iby);
															
 
																+	clSetKernelArg(kernel, 7, sizeof(block_size), &block_size);
															
 
																+	clSetKernelArg(kernel, 8, sizeof(width), &width);
															
 
																 	unsigned dim = 16;
															
 
																 	size_t local[2] = {dim, 1};
															
@@ -278,7 +279,7 @@ static void compute_block(void *descr[], void *cl_arg)
 
																 	int iby, block_size;
															
 
																 	double stepX, stepY;
															
 
																-	int *pcnt; // unused for sequential tasks
															
 
																+	int *pcnt; /* unused for sequential tasks */
															
 
																 	starpu_unpack_cl_args(cl_arg, &iby, &block_size, &stepX, &stepY, &pcnt);
															
 
																 	unsigned *data = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
															
@@ -291,7 +292,7 @@ static void compute_block(void *descr[], void *cl_arg)
 
																 		{
															
 
																 			double cx = leftX + ix * stepX;
															
 
																 			double cy = topY - iy * stepY;
															
 
																-			// Z = X+I*Y
															
 
																+			/* Z = X+I*Y */
															
 
																 			double x = 0;
															
 
																 			double y = 0;
															
 
																 			int it;
															
@@ -300,13 +301,13 @@ static void compute_block(void *descr[], void *cl_arg)
 
																 				double x2 = x*x;
															
 
																 				double y2 = y*y;
															
 
																-				// Stop iterations when |Z| > 2
															
 
																+				/* Stop iterations when |Z| > 2 */
															
 
																 				if (x2 + y2 > 4.0)
															
 
																 					break;
															
 
																 				double twoxy = 2.0*x*y;
															
 
																-				// Z = Z^2 + C
															
 
																+				/* Z = Z^2 + C */
															
 
																 				x = x2 - y2 + cx;
															
 
																 				y = twoxy + cy;
															
 
																 			}
															
@@ -327,8 +328,8 @@ static void compute_block_spmd(void *descr[], void *cl_arg)
 
																 	unsigned *data = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
															
 
																-	int ix, iy; // global coordinates
															
 
																-	int local_iy; // current line
															
 
																+	int ix, iy; /* global coordinates */
															
 
																+	int local_iy; /* current line */
															
 
																 	while (1)
															
 
																 	{
															
@@ -342,7 +343,7 @@ static void compute_block_spmd(void *descr[], void *cl_arg)
 
																 		{
															
 
																 			double cx = leftX + ix * stepX;
															
 
																 			double cy = topY - iy * stepY;
															
 
																-			// Z = X+I*Y
															
 
																+			/* Z = X+I*Y */
															
 
																 			double x = 0;
															
 
																 			double y = 0;
															
 
																 			int it;
															
@@ -351,13 +352,13 @@ static void compute_block_spmd(void *descr[], void *cl_arg)
 
																 				double x2 = x*x;
															
 
																 				double y2 = y*y;
															
 
																-				// Stop iterations when |Z| > 2
															
 
																+				/* Stop iterations when |Z| > 2 */
															
 
																 				if (x2 + y2 > 4.0)
															
 
																 					break;
															
 
																 				double twoxy = 2.0*x*y;
															
 
																-				// Z = Z^2 + C
															
 
																+				/* Z = Z^2 + C */
															
 
																 				x = x2 - y2 + cx;
															
 
																 				y = twoxy + cy;
															
 
																 			}
															
@@ -396,7 +397,7 @@ static void parse_args(int argc, char **argv)
 
																 	int i;
															
 
																 	for (i = 1; i < argc; i++) {
															
 
																 		if (strcmp(argv[i], "-h") == 0) {
															
 
																-			fprintf(stderr, "Usage: %s [-h] [ -width 800] [-height 600] [-nblocks 16] [-no-x11] [-pos leftx:rightx:bottomy:topy] [-niter 1000] [-spmd]\n", argv[0]);
															
 
																+			fprintf(stderr, "Usage: %s [-h] [ -width 800] [-height 600] [-nblocks 16] [-no-x11] [-pos leftx:rightx:bottomy:topy] [-niter 1000] [-spmd] [-demo] [-demozoom 0.2]\n", argv[0]);
															
 
																 			exit(-1);
															
 
																 		}
															
@@ -434,6 +435,11 @@ static void parse_args(int argc, char **argv)
 
																 		}
															
 
																+		if (strcmp(argv[i], "-demozoom") == 0) {
															
 
																+			char *argptr;
															
 
																+			demozoom = strtof(argv[++i], &argptr);
															
 
																+		}
															
 
																+
															
 
																 		if (strcmp(argv[i], "-no-x11") == 0) {
															
 
																 #ifdef STARPU_HAVE_X11
															
 
																 			use_x11 = 0;
															
@@ -461,7 +467,7 @@ int main(int argc, char **argv)
 
																 	starpu_init(&conf);
															
 
																 	unsigned *buffer;
															
 
																-	starpu_data_malloc_pinned_if_possible((void **)&buffer, height*width*sizeof(unsigned));
															
 
																+	starpu_malloc((void **)&buffer, height*width*sizeof(unsigned));
															
 
																 #ifdef STARPU_HAVE_X11
															
 
																 	if (use_x11)
															
@@ -472,7 +478,7 @@ int main(int argc, char **argv)
 
																 	STARPU_ASSERT((height % nblocks) == 0);
															
 
																 #ifdef STARPU_USE_OPENCL
															
 
																-	starpu_opencl_load_opencl_from_string(mandelbrot_opencl_src, &opencl_programs);
															
 
																+	starpu_opencl_load_opencl_from_string(mandelbrot_opencl_src, &opencl_programs, NULL);
															
 
																 #endif
															
 
																 	starpu_data_handle block_handles[nblocks];
															
@@ -520,24 +526,24 @@ int main(int argc, char **argv)
 
																 		for (iby = 0; iby < nblocks; iby++)
															
 
																 		{
															
 
																-			starpu_data_acquire(block_handles[iby], STARPU_R);
															
 
																 #ifdef STARPU_HAVE_X11
															
 
																 			if (use_x11)
															
 
																 			{
															
 
																+				starpu_data_acquire(block_handles[iby], STARPU_R);
															
 
																 				XPutImage(dpy, win, gc, bitmap,
															
 
																 					0, iby*block_size,
															
 
																 					0, iby*block_size,
															
 
																 					width, block_size);
															
 
																+				starpu_data_release(block_handles[iby]);
															
 
																 			}
															
 
																 #endif
															
 
																-			starpu_data_release(block_handles[iby]);
															
 
																 		}
															
 
																 		if (demo)
															
 
																 		{
															
 
																 			/* Zoom in */
															
 
																-			double zoom_factor = 0.05;
															
 
																+			double zoom_factor = demozoom;
															
 
																 			double widthX = rightX - leftX;
															
 
																 			double heightY = topY - bottomY;
															
@@ -554,7 +560,7 @@ int main(int argc, char **argv)
 
																 				gettimeofday(&end, NULL);
															
 
																 				double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
															
 
																-				fprintf(stderr, "Time to generate %d frames : %f s\n", iter, timing/1000000.0);
															
 
																+				fprintf(stderr, "Time to generate %u frames : %f s\n", iter, timing/1000000.0);
															
 
																 				fprintf(stderr, "Average FPS: %f\n", ((double)iter*1e+6)/timing);
															
 
																 				/* Reset counters */
															
@@ -583,7 +589,7 @@ int main(int argc, char **argv)
 
																 	for (iby = 0; iby < nblocks; iby++)
															
 
																 		starpu_data_unregister(block_handles[iby]);
															
 
																-//	starpu_data_free_pinned_if_possible(buffer);
															
 
																+/*	starpu_data_free_pinned_if_possible(buffer); */
															
 
																 	starpu_shutdown();
															
--- a/examples/matvecmult/matvecmult.c
+++ b/examples/matvecmult/matvecmult.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2010, 2011  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -20,6 +20,8 @@
 
																 #include <pthread.h>
															
 
																 #include <math.h>
															
 
																+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
															
 
																+
															
 
																 #ifdef STARPU_USE_OPENCL
															
 
																 struct starpu_opencl_program opencl_code;
															
 
																 void opencl_codelet(void *descr[], __attribute__ ((unused)) void *_args)
															
@@ -27,9 +29,9 @@ void opencl_codelet(void *descr[], __attribute__ ((unused)) void *_args)
 
																 	cl_kernel kernel;
															
 
																 	cl_command_queue queue;
															
 
																 	int id, devid, err, n;
															
 
																-	float *matrix = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
															
 
																-	float *vector = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
															
 
																-	float *mult = (float *)STARPU_VECTOR_GET_PTR(descr[2]);
															
 
																+	cl_mem matrix = (cl_mem)STARPU_MATRIX_GET_PTR(descr[0]);
															
 
																+	cl_mem vector = (cl_mem)STARPU_VECTOR_GET_PTR(descr[1]);
															
 
																+	cl_mem mult = (cl_mem)STARPU_VECTOR_GET_PTR(descr[2]);
															
 
																 	int nx = STARPU_MATRIX_GET_NX(descr[0]);
															
 
																 	int ny = STARPU_MATRIX_GET_NY(descr[0]);
															
 
																 	cl_event event;
															
@@ -41,11 +43,11 @@ void opencl_codelet(void *descr[], __attribute__ ((unused)) void *_args)
 
																         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																         n=0;
															
 
																-        err = clSetKernelArg(kernel, n++, sizeof(cl_mem), &matrix);
															
 
																-        err |= clSetKernelArg(kernel, n++, sizeof(cl_mem), &vector);
															
 
																-        err |= clSetKernelArg(kernel, n++, sizeof(int), (void*)&nx);
															
 
																-        err |= clSetKernelArg(kernel, n++, sizeof(int), (void*)&ny);
															
 
																-        err |= clSetKernelArg(kernel, n++, sizeof(cl_mem), &mult);
															
 
																+        err = clSetKernelArg(kernel, n++, sizeof(matrix), &matrix);
															
 
																+        err |= clSetKernelArg(kernel, n++, sizeof(vector), &vector);
															
 
																+        err |= clSetKernelArg(kernel, n++, sizeof(nx), (void*)&nx);
															
 
																+        err |= clSetKernelArg(kernel, n++, sizeof(ny), (void*)&ny);
															
 
																+	err |= clSetKernelArg(kernel, n++, sizeof(mult), &mult);
															
 
																         if (err) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																 	{
															
@@ -73,9 +75,9 @@ void fillArray(float* pfData, int iSize) {
 
																 void printArray(float* pfData, int iSize) {
															
 
																     int i;
															
 
																     for (i = 0; i < iSize; ++i) {
															
 
																-            fprintf(stderr, "%f ", pfData[i]);
															
 
																+            FPRINTF(stderr, "%f ", pfData[i]);
															
 
																     }
															
 
																-    fprintf(stderr, "\n");
															
 
																+    FPRINTF(stderr, "\n");
															
 
																 }
															
 
																 void matVecMult(const float *matrix, const float *vector, int width, int height, float *mult) {
															
@@ -121,8 +123,8 @@ int main(int argc, char **argv)
 
																                 .nopencl = 1,
															
 
																 	};
															
 
																-        //int width=1100;
															
 
																-        //int height=244021;
															
 
																+        /* int width=1100; */
															
 
																+        /* int height=244021; */
															
 
																         int width=20;
															
 
																         int height=4;
															
@@ -131,8 +133,14 @@ int main(int argc, char **argv)
 
																         unsigned int mem_size_matrix, mem_size_vector, mem_size_mult;
															
 
																 	starpu_data_handle matrix_handle, vector_handle, mult_handle;
															
 
																+	int ret, submit;
															
 
																-        starpu_init(&conf);
															
 
																+        ret = starpu_init(&conf);
															
 
																+	if (STARPU_UNLIKELY(ret == -ENODEV)) {
															
 
																+                FPRINTF(stderr, "This application requires an OpenCL worker.\n");
															
 
																+		starpu_shutdown();
															
 
																+		exit(0);
															
 
																+	}
															
 
																         mem_size_matrix = width * height * sizeof(float);
															
 
																         matrix = (float*)malloc(mem_size_matrix);
															
@@ -157,7 +165,7 @@ int main(int argc, char **argv)
 
																 	starpu_vector_data_register(&mult_handle, 0, (uintptr_t)mult, height, sizeof(float));
															
 
																 #ifdef STARPU_USE_OPENCL
															
 
																-        starpu_opencl_load_opencl_from_file("examples/matvecmult/matvecmult_kernel.cl", &opencl_code);
															
 
																+        starpu_opencl_load_opencl_from_file("examples/matvecmult/matvecmult_kernel.cl", &opencl_code, NULL);
															
 
																 #endif
															
 
																 	cl.where = STARPU_OPENCL;
															
@@ -177,30 +185,28 @@ int main(int argc, char **argv)
 
																         task->buffers[2].handle = mult_handle;
															
 
																         task->buffers[2].mode = STARPU_RW;
															
 
																-        int ret = starpu_task_submit(task);
															
 
																-        if (STARPU_UNLIKELY(ret == -ENODEV)) {
															
 
																-                fprintf(stderr, "No worker may execute this task. This application requires an OpenCL worker.\n");
															
 
																-                exit(0);
															
 
																+        submit = starpu_task_submit(task);
															
 
																+        if (STARPU_UNLIKELY(submit == -ENODEV)) {
															
 
																+                FPRINTF(stderr, "No worker may execute this task. This application requires an OpenCL worker.\n");
															
 
																+	}
															
 
																+	else {
															
 
																+		starpu_task_wait_for_all();
															
 
																 	}
															
 
																-	starpu_task_wait_for_all();
															
 
																+	starpu_data_unregister(matrix_handle);
															
 
																+	starpu_data_unregister(vector_handle);
															
 
																+	starpu_data_unregister(mult_handle);
															
 
																-	/* update the array in RAM */
															
 
																-        starpu_data_acquire(matrix_handle, STARPU_R);
															
 
																-        starpu_data_acquire(vector_handle, STARPU_R);
															
 
																-        starpu_data_acquire(mult_handle, STARPU_R);
															
 
																+        if (STARPU_LIKELY(submit != -ENODEV)) {
															
 
																+		int res = compareL2fe(correctResult, mult, height, 1e-6f);
															
 
																+		FPRINTF(stdout, "TEST %s\n\n", (res == 0) ? "PASSED" : "FAILED !!!");
															
 
																+	}
															
 
																-        int res = compareL2fe(correctResult, mult, height, 1e-6f);
															
 
																-        printf("TEST %s\n\n", (res == 0) ? "PASSED" : "FAILED !!!");
															
 
																 #if 0
															
 
																         printArray(matrix, width*height);
															
 
																         printArray(vector, width);
															
 
																         printArray(mult, height);
															
 
																 #endif
															
 
																-        starpu_data_release(matrix_handle);
															
 
																-        starpu_data_release(vector_handle);
															
 
																-        starpu_data_release(mult_handle);
															
 
																-
															
 
																         starpu_shutdown();
															
 
																 	return 0;
															
--- a/examples/mult/xgemm.c
+++ b/examples/mult/xgemm.c
@@ -2,7 +2,7 @@
 
																  *
															
 
																  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
															
 
																  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
															
 
																- * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -42,6 +42,8 @@ static unsigned check = 0;
 
																 static TYPE *A, *B, *C;
															
 
																 static starpu_data_handle A_handle, B_handle, C_handle;
															
 
																+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
															
 
																+
															
 
																 static void check_output(void)
															
 
																 {
															
 
																 	/* compute C = C - AB */
															
@@ -52,14 +54,14 @@ static void check_output(void)
 
																 	err = CPU_ASUM(xdim*ydim, C, 1);
															
 
																 	if (err < xdim*ydim*0.001) {
															
 
																-		fprintf(stderr, "Results are OK\n");
															
 
																+		FPRINTF(stderr, "Results are OK\n");
															
 
																 	}
															
 
																 	else {
															
 
																 		int max;
															
 
																 		max = CPU_IAMAX(xdim*ydim, C, 1);
															
 
																-		fprintf(stderr, "There were errors ... err = %f\n", err);
															
 
																-		fprintf(stderr, "Max error : %e\n", C[max]);
															
 
																+		FPRINTF(stderr, "There were errors ... err = %f\n", err);
															
 
																+		FPRINTF(stderr, "Max error : %e\n", C[max]);
															
 
																 	}
															
 
																 }
															
@@ -67,9 +69,9 @@ static void init_problem_data(void)
 
																 {
															
 
																 	unsigned i,j;
															
 
																-	starpu_data_malloc_pinned_if_possible((void **)&A, zdim*ydim*sizeof(TYPE));
															
 
																-	starpu_data_malloc_pinned_if_possible((void **)&B, xdim*zdim*sizeof(TYPE));
															
 
																-	starpu_data_malloc_pinned_if_possible((void **)&C, xdim*ydim*sizeof(TYPE));
															
 
																+	starpu_malloc((void **)&A, zdim*ydim*sizeof(TYPE));
															
 
																+	starpu_malloc((void **)&B, xdim*zdim*sizeof(TYPE));
															
 
																+	starpu_malloc((void **)&C, xdim*ydim*sizeof(TYPE));
															
 
																 	/* fill the A and B matrices */
															
 
																 	for (j=0; j < ydim; j++) {
															
@@ -100,20 +102,20 @@ static void partition_mult_data(void)
 
																 	starpu_matrix_data_register(&C_handle, 0, (uintptr_t)C, 
															
 
																 		ydim, ydim, xdim, sizeof(TYPE));
															
 
																-	struct starpu_data_filter f;
															
 
																-	memset(&f, 0, sizeof(f));
															
 
																-	f.filter_func = starpu_vertical_block_filter_func;
															
 
																-	f.nchildren = nslicesx;
															
 
																+	struct starpu_data_filter vert;
															
 
																+	memset(&vert, 0, sizeof(vert));
															
 
																+	vert.filter_func = starpu_vertical_block_filter_func;
															
 
																+	vert.nchildren = nslicesx;
															
 
																-	struct starpu_data_filter f2;
															
 
																-	memset(&f2, 0, sizeof(f2));
															
 
																-	f2.filter_func = starpu_block_filter_func;
															
 
																-	f2.nchildren = nslicesy;
															
 
																+	struct starpu_data_filter horiz;
															
 
																+	memset(&horiz, 0, sizeof(horiz));
															
 
																+	horiz.filter_func = starpu_block_filter_func;
															
 
																+	horiz.nchildren = nslicesy;
															
 
																-	starpu_data_partition(B_handle, &f);
															
 
																-	starpu_data_partition(A_handle, &f2);
															
 
																+	starpu_data_partition(B_handle, &vert);
															
 
																+	starpu_data_partition(A_handle, &horiz);
															
 
																-	starpu_data_map_filters(C_handle, 2, &f, &f2);
															
 
																+	starpu_data_map_filters(C_handle, 2, &vert, &horiz);
															
 
																 }
															
 
																 static void mult_kernel_common(void *descr[], int type)
															
@@ -145,10 +147,12 @@ static void mult_kernel_common(void *descr[], int type)
 
																 			int block_size = (nyC + worker_size - 1)/worker_size;
															
 
																 			int new_nyC = STARPU_MIN(nyC, block_size*(rank+1)) - block_size*rank;
															
 
																-			TYPE *new_subA = &subA[block_size*rank];
															
 
																+			STARPU_ASSERT(nyC = STARPU_MATRIX_GET_NY(descr[1]));
															
 
																+
															
 
																+			TYPE *new_subB = &subB[block_size*rank];
															
 
																 			TYPE *new_subC = &subC[block_size*rank];
															
 
																-			CPU_GEMM("N", "N", nxC, new_nyC, nyA, (TYPE)1.0, new_subA, ldA, subB, ldB, (TYPE)0.0, new_subC, ldC);
															
 
																+			CPU_GEMM("N", "N", nxC, new_nyC, nyA, (TYPE)1.0, subA, ldA, new_subB, ldB, (TYPE)0.0, new_subC, ldC);
															
 
																 		}
															
 
																 	}
															
 
																 #ifdef STARPU_USE_CUDA
															
@@ -282,11 +286,11 @@ int main(int argc, char **argv)
 
																 	gettimeofday(&end, NULL);
															
 
																 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
															
 
																-	fprintf(stderr, "Time: %2.2f ms\n", timing/1000.0);
															
 
																+	FPRINTF(stderr, "Time: %2.2f ms\n", timing/1000.0);
															
 
																 	double flops = 2.0*((unsigned long)niter)*((unsigned long)xdim)
															
 
																 				*((unsigned long)ydim)*((unsigned long)zdim);
															
 
																-	fprintf(stderr, "GFlop/s: %.2f\n", flops/timing/1000.0);
															
 
																+	FPRINTF(stderr, "GFlop/s: %.2f\n", flops/timing/1000.0);
															
 
																 	starpu_data_unpartition(C_handle, 0);
															
 
																 	starpu_data_unregister(C_handle);
															
--- a/examples/openmp/vector_scal.c
+++ b/examples/openmp/vector_scal.c
@@ -0,0 +1,105 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011  Université de Bordeaux 1
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+/* gcc build:
															
 
																+
															
 
																+   gcc -fopenmp vector_scal.c -o vector_scal $(pkg-config --cflags libstarpu) $(pkg-config --libs libstarpu)
															
 
																+
															
 
																+ */
															
 
																+
															
 
																+#include <starpu.h>
															
 
																+#include <stdio.h>
															
 
																+#include <limits.h>
															
 
																+
															
 
																+#define	NX	2048
															
 
																+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
															
 
																+
															
 
																+void scal_cpu_func(void *buffers[], void *_args) {
															
 
																+	unsigned i;
															
 
																+	float *factor = _args;
															
 
																+	starpu_vector_interface_t *vector = buffers[0];
															
 
																+	unsigned n = STARPU_VECTOR_GET_NX(vector);
															
 
																+	float *val = (float *)STARPU_VECTOR_GET_PTR(vector);
															
 
																+
															
 
																+	FPRINTF(stderr, "running task with %d CPUs.\n", starpu_combined_worker_get_size());
															
 
																+
															
 
																+#pragma omp parallel for num_threads(starpu_combined_worker_get_size())
															
 
																+	for (i = 0; i < n; i++)
															
 
																+		val[i] *= *factor;
															
 
																+}
															
 
																+
															
 
																+static struct starpu_perfmodel_t vector_scal_model = {
															
 
																+	.type = STARPU_HISTORY_BASED,
															
 
																+	.symbol = "vector_scale_parallel"
															
 
																+};
															
 
																+
															
 
																+static starpu_codelet cl = {
															
 
																+	.where = STARPU_CPU,
															
 
																+	.type = STARPU_FORKJOIN,
															
 
																+	.max_parallelism = INT_MAX,
															
 
																+	.cpu_func = scal_cpu_func,
															
 
																+	.nbuffers = 1,
															
 
																+	.model = &vector_scal_model,
															
 
																+};
															
 
																+
															
 
																+int main(int argc, char **argv)
															
 
																+{
															
 
																+	struct starpu_conf conf;
															
 
																+	float vector[NX];
															
 
																+	unsigned i;
															
 
																+	for (i = 0; i < NX; i++)
															
 
																+                vector[i] = (i+1.0f);
															
 
																+
															
 
																+	FPRINTF(stderr, "BEFORE: First element was %f\n", vector[0]);
															
 
																+	FPRINTF(stderr, "BEFORE: Last element was %f\n", vector[NX-1]);
															
 
																+
															
 
																+	starpu_conf_init(&conf);
															
 
																+
															
 
																+	/* Most OpenMP implementations do not support concurrent parallel
															
 
																+	 * sections, so only create one big worker */
															
 
																+	conf.single_combined_worker = 1;
															
 
																+
															
 
																+	starpu_init(&conf);
															
 
																+
															
 
																+	starpu_data_handle vector_handle;
															
 
																+	starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector, NX, sizeof(vector[0]));
															
 
																+
															
 
																+	float factor = 3.14;
															
 
																+
															
 
																+	struct starpu_task *task = starpu_task_create();
															
 
																+	task->synchronous = 1;
															
 
																+
															
 
																+	task->cl = &cl;
															
 
																+
															
 
																+	task->buffers[0].handle = vector_handle;
															
 
																+	task->buffers[0].mode = STARPU_RW;
															
 
																+	task->cl_arg = &factor;
															
 
																+	task->cl_arg_size = sizeof(factor);
															
 
																+
															
 
																+	starpu_task_submit(task);
															
 
																+	starpu_data_unregister(vector_handle);
															
 
																+
															
 
																+	starpu_task_destroy(task);
															
 
																+
															
 
																+	/* terminate StarPU, no task can be submitted after */
															
 
																+	starpu_shutdown();
															
 
																+
															
 
																+	FPRINTF(stderr, "AFTER: First element is %f\n", vector[0]);
															
 
																+	FPRINTF(stderr, "AFTER: Last element is %f\n", vector[NX-1]);
															
 
																+
															
 
																+	return 0;
															
 
																+}
															
--- a/examples/opt/Makefile.am
+++ b/examples/opt/Makefile.am
@@ -0,0 +1,78 @@
 
																+# StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+#
															
 
																+# Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
															
 
																+# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																+#
															
 
																+# StarPU is free software; you can redistribute it and/or modify
															
 
																+# it under the terms of the GNU Lesser General Public License as published by
															
 
																+# the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+# your option) any later version.
															
 
																+#
															
 
																+# StarPU is distributed in the hope that it will be useful, but
															
 
																+# WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+#
															
 
																+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+
															
 
																+AM_CFLAGS = $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
															
 
																+LIBS = $(top_builddir)/src/libstarpu.la $(HWLOC_LIBS) @LIBS@
															
 
																+AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include
															
 
																+AM_LDFLAGS = $(STARPU_CUDA_LDFLAGS) $(STARPU_OPENCL_LDFLAGS)
															
 
																+
															
 
																+if STARPU_USE_CUDA
															
 
																+
															
 
																+NVCCFLAGS += --compiler-options -fno-strict-aliasing  -I$(top_srcdir)/include/ -I$(top_builddir)/include/ -I$(top_srcdir)/examples/ $(HWLOC_CFLAGS) -arch sm_13
															
 
																+
															
 
																+.cu.o:
															
 
																+	$(NVCC) $< -c -o $@ $(NVCCFLAGS)
															
 
																+
															
 
																+endif
															
 
																+
															
 
																+TESTS	=	$(check_PROGRAMS)
															
 
																+
															
 
																+check_PROGRAMS =
															
 
																+
															
 
																+examplebindir = $(libdir)/starpu/examples/
															
 
																+
															
 
																+examplebin_PROGRAMS =
															
 
																+
															
 
																+noinst_HEADERS = 				\
															
 
																+	pi/SobolQRNG/sobol.h			\
															
 
																+	pi/SobolQRNG/sobol_gold.h		\
															
 
																+	pi/SobolQRNG/sobol_gpu.h		\
															
 
																+	pi/SobolQRNG/sobol_primitives.h
															
 
																+
															
 
																+######
															
 
																+# Pi #
															
 
																+######
															
 
																+
															
 
																+check_PROGRAMS +=				\
															
 
																+	pi/pi					\
															
 
																+	pi/pi_redux
															
 
																+
															
 
																+examplebin_PROGRAMS +=				\
															
 
																+	pi/pi					\
															
 
																+	pi/pi_redux
															
 
																+
															
 
																+pi_pi_SOURCES =					\
															
 
																+	pi/pi.c					\
															
 
																+	pi/SobolQRNG/sobol_gold.c		\
															
 
																+	pi/SobolQRNG/sobol_primitives.c
															
 
																+
															
 
																+if STARPU_USE_CUDA
															
 
																+pi_pi_SOURCES +=				\
															
 
																+	pi/pi_kernel.cu				\
															
 
																+	pi/SobolQRNG/sobol_gpu.cu
															
 
																+endif
															
 
																+
															
 
																+pi_pi_redux_SOURCES =				\
															
 
																+	pi/pi_redux.c
															
 
																+
															
 
																+if STARPU_USE_CUDA
															
 
																+pi_pi_redux_SOURCES +=				\
															
 
																+	pi/pi_redux_kernel.cu
															
 
																+pi_pi_redux_LDADD =				\
															
 
																+	$(STARPU_CURAND_LDFLAGS)
															
 
																+endif
															
 
																+
															
 
																+
															
--- a/examples/opt/pi/SobolQRNG/CforCUDA_SDK_license.txt
+++ b/examples/opt/pi/SobolQRNG/CforCUDA_SDK_license.txt
--- a/examples/opt/pi/SobolQRNG/sobol.h
+++ b/examples/opt/pi/SobolQRNG/sobol.h
@@ -0,0 +1,60 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2010  Université de Bordeaux 1
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+/*
															
 
																+ * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
															
 
																+ *
															
 
																+ * NVIDIA Corporation and its licensors retain all intellectual property and 
															
 
																+ * proprietary rights in and to this software and related documentation and 
															
 
																+ * any modifications thereto.  Any use, reproduction, disclosure, or distribution 
															
 
																+ * of this software and related documentation without an express license 
															
 
																+ * agreement from NVIDIA Corporation is strictly prohibited.
															
 
																+ * 
															
 
																+ */
															
 
																+ 
															
 
																+ /*
															
 
																+ * Portions Copyright (c) 1993-2009 NVIDIA Corporation.  All rights reserved.
															
 
																+ * Portions Copyright (c) 2009 Mike Giles, Oxford University.  All rights reserved.
															
 
																+ * Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe.  All rights reserved.
															
 
																+ *
															
 
																+ * Sobol Quasi-random Number Generator example
															
 
																+ *
															
 
																+ * Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
															
 
																+ * http://people.maths.ox.ac.uk/~gilesm/
															
 
																+ *
															
 
																+ * and C code developed by Stephen Joe, University of Waikato, New Zealand
															
 
																+ * and Frances Kuo, University of New South Wales, Australia
															
 
																+ * http://web.maths.unsw.edu.au/~fkuo/sobol/
															
 
																+ *
															
 
																+ * For theoretical background see:
															
 
																+ *
															
 
																+ * P. Bratley and B.L. Fox.
															
 
																+ * Implementing Sobol's quasirandom sequence generator
															
 
																+ * http://portal.acm.org/citation.cfm?id=42288
															
 
																+ * ACM Trans. on Math. Software, 14(1):88-100, 1988
															
 
																+ *
															
 
																+ * S. Joe and F. Kuo.
															
 
																+ * Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
															
 
																+ * http://portal.acm.org/citation.cfm?id=641879
															
 
																+ * ACM Trans. on Math. Software, 29(1):49-57, 2003
															
 
																+ */
															
 
																+
															
 
																+#ifndef SOBOL_H
															
 
																+#define SOBOL_H
															
 
																+
															
 
																+/* Number of direction vectors is fixed to 32 */
															
 
																+#define n_directions 32
															
 
																+
															
 
																+#endif
															
--- a/examples/opt/pi/SobolQRNG/sobol_gold.c
+++ b/examples/opt/pi/SobolQRNG/sobol_gold.c
@@ -0,0 +1,141 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2010-2011  Université de Bordeaux 1
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+/*
															
 
																+ * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
															
 
																+ *
															
 
																+ * NVIDIA Corporation and its licensors retain all intellectual property and 
															
 
																+ * proprietary rights in and to this software and related documentation and 
															
 
																+ * any modifications thereto.  Any use, reproduction, disclosure, or distribution 
															
 
																+ * of this software and related documentation without an express license 
															
 
																+ * agreement from NVIDIA Corporation is strictly prohibited.
															
 
																+ * 
															
 
																+ */
															
 
																+ 
															
 
																+ /*
															
 
																+ * Portions Copyright (c) 1993-2009 NVIDIA Corporation.  All rights reserved.
															
 
																+ * Portions Copyright (c) 2009 Mike Giles, Oxford University.  All rights reserved.
															
 
																+ * Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe.  All rights reserved.
															
 
																+ *
															
 
																+ * Sobol Quasi-random Number Generator example
															
 
																+ *
															
 
																+ * Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
															
 
																+ * http://people.maths.ox.ac.uk/~gilesm/
															
 
																+ *
															
 
																+ * and C code developed by Stephen Joe, University of Waikato, New Zealand
															
 
																+ * and Frances Kuo, University of New South Wales, Australia
															
 
																+ * http://web.maths.unsw.edu.au/~fkuo/sobol/
															
 
																+ *
															
 
																+ * For theoretical background see:
															
 
																+ *
															
 
																+ * P. Bratley and B.L. Fox.
															
 
																+ * Implementing Sobol's quasirandom sequence generator
															
 
																+ * http://portal.acm.org/citation.cfm?id=42288
															
 
																+ * ACM Trans. on Math. Software, 14(1):88-100, 1988
															
 
																+ *
															
 
																+ * S. Joe and F. Kuo.
															
 
																+ * Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
															
 
																+ * http://portal.acm.org/citation.cfm?id=641879
															
 
																+ * ACM Trans. on Math. Software, 29(1):49-57, 2003
															
 
																+ */
															
 
																+
															
 
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <strings.h>

#include "sobol.h"
#include "sobol_gold.h"
#include "sobol_primitives.h"
															
 
																+
															
 
																+#define k_2powneg32 2.3283064E-10F
															
 
																+
															
 
																+/* Create the direction numbers, based on the primitive polynomials. */
															
 
																+void initSobolDirectionVectors(int n_dimensions, unsigned int *directions)
															
 
																+{
															
 
																+    unsigned int *v = directions;
															
 
																+
															
 
																+    int dim;
															
 
																+    for (dim = 0 ; dim < n_dimensions ; dim++)
															
 
																+    {
															
 
																+        /* First dimension is a special case */
															
 
																+        if (dim == 0)
															
 
																+        {
															
 
																+            int i;
															
 
																+            for (i = 0 ; i < n_directions ; i++)
															
 
																+            {
															
 
																+                /* All m's are 1 */
															
 
																+                v[i] = 1 << (31 - i);
															
 
																+            }
															
 
																+        }
															
 
																+        else
															
 
																+        {
															
 
																+            int d = sobol_primitives[dim].degree;
															
 
																+            /* The first direction numbers (up to the degree of the polynomial) 
															
 
																+               are simply v[i] = m[i] / 2^i (stored in Q0.32 format) */
															
 
																+            int i;
															
 
																+            for (i = 0 ; i < d ; i++)
															
 
																+            {
															
 
																+                v[i] = sobol_primitives[dim].m[i] << (31 - i);
															
 
																+            }
															
 
																+            /* The remaining direction numbers are computed as described in
															
 
																+               the Bratley and Fox paper. */
															
 
																+            /* v[i] = a[1]v[i-1] ^ a[2]v[i-2] ^ ... ^ a[v-1]v[i-d+1] ^ v[i-d] ^ v[i-d]/2^d */
															
 
																+            for (i = d ; i < n_directions ; i++)
															
 
																+            {
															
 
																+                /* First do the v[i-d] ^ v[i-d]/2^d part */
															
 
																+                v[i] = v[i - d] ^ (v[i - d] >> d);
															
 
																+                /* Now do the a[1]v[i-1] ^ a[2]v[i-2] ^ ... part
															
 
																+                   Note that the coefficients a[] are zero or one and for compactness in
															
 
																+                   the input tables they are stored as bits of a single integer. To extract
															
 
																+                   the relevant bit we use right shift and mask with 1.
															
 
																+                   For example, for a 10 degree polynomial there are ten useful bits in a,
															
 
																+                   so to get a[2] we need to right shift 7 times (to get the 8th bit into
															
 
																+                   the LSB) and then mask with 1. */
															
 
																+                int j;
															
 
																+                for (j = 1 ; j < d ; j++)
															
 
																+                {
															
 
																+                    v[i] ^= (((sobol_primitives[dim].a >> (d - 1 - j)) & 1) * v[i - j]);
															
 
																+                }
															
 
																+            }
															
 
																+        }
															
 
																+        v += n_directions;
															
 
																+    }
															
 
																+}
															
 
																+
															
 
																+/* Reference model for generating Sobol numbers on the host */
															
 
																+void sobolCPU(int n_vectors, int n_dimensions, unsigned int *directions, float *output)
															
 
																+{
															
 
																+    unsigned int *v = directions;
															
 
																+
															
 
																+    int d;
															
 
																+    for (d = 0 ; d < n_dimensions ; d++)
															
 
																+    {
															
 
																+        unsigned int X = 0;
															
 
																+        /* x[0] is zero (in all dimensions) */
															
 
																+        output[n_vectors * d] = 0.0;        
															
 
																+        int i;
															
 
																+        for (i = 1 ; i < n_vectors ; i++)
															
 
																+        {
															
 
																+            /* x[i] = x[i-1] ^ v[c]
															
 
																+                where c is the index of the rightmost zero bit in i
															
 
																+                minus 1 (since C arrays count from zero)
															
 
																+               In the Bratley and Fox paper this is equation (**) */
															
 
																+            X ^= v[ffs(~(i - 1)) - 1];
															
 
																+            output[i + n_vectors * d] = (float)X * k_2powneg32;
															
 
																+        }
															
 
																+        v += n_directions;
															
 
																+    }
															
 
																+}
															
--- a/examples/opt/pi/SobolQRNG/sobol_gold.h
+++ b/examples/opt/pi/SobolQRNG/sobol_gold.h
@@ -0,0 +1,61 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2010  Université de Bordeaux 1
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+/*
															
 
																+ * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
															
 
																+ *
															
 
																+ * NVIDIA Corporation and its licensors retain all intellectual property and 
															
 
																+ * proprietary rights in and to this software and related documentation and 
															
 
																+ * any modifications thereto.  Any use, reproduction, disclosure, or distribution 
															
 
																+ * of this software and related documentation without an express license 
															
 
																+ * agreement from NVIDIA Corporation is strictly prohibited.
															
 
																+ * 
															
 
																+ */
															
 
																+ 
															
 
																+ /*
															
 
																+ * Portions Copyright (c) 1993-2009 NVIDIA Corporation.  All rights reserved.
															
 
																+ * Portions Copyright (c) 2009 Mike Giles, Oxford University.  All rights reserved.
															
 
																+ * Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe.  All rights reserved.
															
 
																+ *
															
 
																+ * Sobol Quasi-random Number Generator example
															
 
																+ *
															
 
																+ * Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
															
 
																+ * http://people.maths.ox.ac.uk/~gilesm/
															
 
																+ *
															
 
																+ * and C code developed by Stephen Joe, University of Waikato, New Zealand
															
 
																+ * and Frances Kuo, University of New South Wales, Australia
															
 
																+ * http://web.maths.unsw.edu.au/~fkuo/sobol/
															
 
																+ *
															
 
																+ * For theoretical background see:
															
 
																+ *
															
 
																+ * P. Bratley and B.L. Fox.
															
 
																+ * Implementing Sobol's quasirandom sequence generator
															
 
																+ * http://portal.acm.org/citation.cfm?id=42288
															
 
																+ * ACM Trans. on Math. Software, 14(1):88-100, 1988
															
 
																+ *
															
 
																+ * S. Joe and F. Kuo.
															
 
																+ * Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
															
 
																+ * http://portal.acm.org/citation.cfm?id=641879
															
 
																+ * ACM Trans. on Math. Software, 29(1):49-57, 2003
															
 
																+ *
															
 
																+ */
															
 
																+
															
 
																+#ifndef SOBOL_GOLD_H
															
 
																+#define SOBOL_GOLD_H
															
 
																+
															
 
																+void initSobolDirectionVectors(int n_dimensions, unsigned int *directions);
															
 
																+void sobolCPU(int n_vectors, int n_dimensions, unsigned int *directions, float *output);
															
 
																+
															
 
																+#endif
															
--- a/examples/opt/pi/SobolQRNG/sobol_gpu.cu
+++ b/examples/opt/pi/SobolQRNG/sobol_gpu.cu
@@ -0,0 +1,170 @@
 
																+/*
															
 
																+ * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
															
 
																+ *
															
 
																+ * NVIDIA Corporation and its licensors retain all intellectual property and 
															
 
																+ * proprietary rights in and to this software and related documentation and 
															
 
																+ * any modifications thereto.  Any use, reproduction, disclosure, or distribution 
															
 
																+ * of this software and related documentation without an express license 
															
 
																+ * agreement from NVIDIA Corporation is strictly prohibited.
															
 
																+ * 
															
 
																+ */
															
 
																+ 
															
 
																+ /*
															
 
																+ * Portions Copyright (c) 1993-2009 NVIDIA Corporation.  All rights reserved.
															
 
																+ * Portions Copyright (c) 2009 Mike Giles, Oxford University.  All rights reserved.
															
 
																+ * Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe.  All rights reserved.
															
 
																+ *
															
 
																+ * Sobol Quasi-random Number Generator example
															
 
																+ *
															
 
																+ * Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
															
 
																+ * http://people.maths.ox.ac.uk/~gilesm/
															
 
																+ *
															
 
																+ * and C code developed by Stephen Joe, University of Waikato, New Zealand
															
 
																+ * and Frances Kuo, University of New South Wales, Australia
															
 
																+ * http://web.maths.unsw.edu.au/~fkuo/sobol/
															
 
																+ *
															
 
																+ * For theoretical background see:
															
 
																+ *
															
 
																+ * P. Bratley and B.L. Fox.
															
 
																+ * Implementing Sobol's quasirandom sequence generator
															
 
																+ * http://portal.acm.org/citation.cfm?id=42288
															
 
																+ * ACM Trans. on Math. Software, 14(1):88-100, 1988
															
 
																+ *
															
 
																+ * S. Joe and F. Kuo.
															
 
																+ * Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
															
 
																+ * http://portal.acm.org/citation.cfm?id=641879
															
 
																+ * ACM Trans. on Math. Software, 29(1):49-57, 2003
															
 
																+ *
															
 
																+ */
															
 
																+
															
 
																+#include "sobol.h"
															
 
																+#include "sobol_gpu.h"
															
 
																+#include <starpu.h>
															
 
																+#include <starpu_cuda.h>
															
 
																+
															
 
																+#define k_2powneg32 2.3283064E-10F
															
 
																+
															
 
																+__global__ void sobolGPU_kernel(unsigned n_vectors, unsigned n_dimensions, unsigned *d_directions, float *d_output)
															
 
																+{
															
 
																+    __shared__ unsigned int v[n_directions];
															
 
																+
															
 
																+    // Offset into the correct dimension as specified by the
															
 
																+    // block y coordinate
															
 
																+    d_directions = d_directions + n_directions * blockIdx.y;
															
 
																+    d_output = d_output +  n_vectors * blockIdx.y;
															
 
																+
															
 
																+    // Copy the direction numbers for this dimension into shared
															
 
																+    // memory - there are only 32 direction numbers so only the
															
 
																+    // first 32 (n_directions) threads need participate.
															
 
																+    if (threadIdx.x < n_directions)
															
 
																+    {
															
 
																+	    v[threadIdx.x] = d_directions[threadIdx.x];
															
 
																+    }
															
 
																+    __syncthreads();
															
 
																+
															
 
																+    // Set initial index (i.e. which vector this thread is
															
 
																+    // computing first) and stride (i.e. step to the next vector
															
 
																+    // for this thread)
															
 
																+    int i0     = threadIdx.x + blockIdx.x * blockDim.x;
															
 
																+    int stride = gridDim.x * blockDim.x;
															
 
																+
															
 
																+    // Get the gray code of the index
															
 
																+    // c.f. Numerical Recipes in C, chapter 20
															
 
																+    // http://www.nrbook.com/a/bookcpdf/c20-2.pdf
															
 
																+    unsigned int g = i0 ^ (i0 >> 1);
															
 
																+
															
 
																+    // Initialisation for first point x[i0]
															
 
																+    // In the Bratley and Fox paper this is equation (*), where
															
 
																+    // we are computing the value for x[n] without knowing the
															
 
																+    // value of x[n-1].
															
 
																+    unsigned int X = 0;
															
 
																+    unsigned int mask;
															
 
																+    for (unsigned int k = 0 ; k < __ffs(stride) - 1 ; k++)
															
 
																+    {
															
 
																+        // We want X ^= g_k * v[k], where g_k is one or zero.
															
 
																+        // We do this by setting a mask with all bits equal to
															
 
																+        // g_k. In reality we keep shifting g so that g_k is the
															
 
																+        // LSB of g. This way we avoid multiplication.
															
 
																+        mask = - (g & 1);
															
 
																+        X ^= mask & v[k];
															
 
																+        g = g >> 1;
															
 
																+    }
															
 
																+    if (i0 < n_vectors)
															
 
																+    {
															
 
																+        d_output[i0] = (float)X * k_2powneg32;
															
 
																+    }
															
 
																+
															
 
																+    // Now do rest of points, using the stride
															
 
																+    // Here we want to generate x[i] from x[i-stride] where we
															
 
																+    // don't have any of the x in between, therefore we have to
															
 
																+    // revisit the equation (**), this is easiest with an example
															
 
																+    // so assume stride is 16.
															
 
																+    // From x[n] to x[n+16] there will be:
															
 
																+    //   8 changes in the first bit
															
 
																+    //   4 changes in the second bit
															
 
																+    //   2 changes in the third bit
															
 
																+    //   1 change in the fourth
															
 
																+    //   1 change in one of the remaining bits
															
 
																+    //
															
 
																+    // What this means is that in the equation:
															
 
																+    //   x[n+1] = x[n] ^ v[p]
															
 
																+    //   x[n+2] = x[n+1] ^ v[q] = x[n] ^ v[p] ^ v[q]
															
 
																+    //   ...
															
 
																+    // We will apply xor with v[1] eight times, v[2] four times,
															
 
																+    // v[3] twice, v[4] once and one other direction number once.
															
 
																+    // Since two xors cancel out, we can skip even applications
															
 
																+    // and just apply xor with v[4] (i.e. log2(16)) and with
															
 
																+    // the current applicable direction number.
															
 
																+    // Note that all these indices count from 1, so we need to
															
 
																+    // subtract 1 from them all to account for C arrays counting
															
 
																+    // from zero.
															
 
																+    unsigned int v_log2stridem1 = v[__ffs(stride) - 2];
															
 
																+    unsigned int v_stridemask = stride - 1;
															
 
																+    for (unsigned int i = i0 + stride ; i < n_vectors ; i += stride)
															
 
																+    {
															
 
																+        // x[i] = x[i-stride] ^ v[b] ^ v[c]
															
 
																+        //  where b is log2(stride) minus 1 for C array indexing
															
 
																+        //  where c is the index of the rightmost zero bit in i,
															
 
																+        //  not including the bottom log2(stride) bits, minus 1
															
 
																+        //  for C array indexing
															
 
																+        // In the Bratley and Fox paper this is equation (**)
															
 
																+        X ^= v_log2stridem1 ^ v[__ffs(~((i - stride) | v_stridemask)) - 1];
															
 
																+        d_output[i] = (float)X * k_2powneg32;
															
 
																+    }
															
 
																+}
															
 
																+
															
 
																+extern "C"
															
 
																+void sobolGPU(int n_vectors, int n_dimensions, unsigned int *d_directions, float *d_output)
															
 
																+{
															
 
																+    const int threadsperblock = 64;
															
 
																+
															
 
																+    // Set up the execution configuration
															
 
																+    dim3 dimGrid;
															
 
																+    dim3 dimBlock;
															
 
																+
															
 
																+    // This implementation of the generator outputs all the draws for
															
 
																+    // one dimension in a contiguous region of memory, followed by the
															
 
																+    // next dimension and so on.
															
 
																+    // Therefore all threads within a block will be processing different
															
 
																+    // vectors from the same dimension. As a result we want the total
															
 
																+    // number of blocks to be a multiple of the number of dimensions.
															
 
																+    dimGrid.y = n_dimensions;
															
 
																+
															
 
																+    // If the number of dimensions is large then we will set the number
															
 
																+    // of blocks to equal the number of dimensions (i.e. dimGrid.x = 1)
															
 
																+    // but if the number of dimensions is small (e.g. less than 32) then
															
 
																+    // we'll partition the vectors across blocks (as well as threads).
															
 
																+    // We also need to cap the dimGrid.x where the number of vectors
															
 
																+    // is too small to be partitioned.
															
 
																+    dimGrid.x = 1 + 31 / n_dimensions;
															
 
																+    if (dimGrid.x > (unsigned int)(n_vectors / threadsperblock))
															
 
																+    {
															
 
																+        dimGrid.x = (n_vectors + threadsperblock - 1) / threadsperblock;
															
 
																+    }
															
 
																+    
															
 
																+    // Fix the number of threads
															
 
																+    dimBlock.x = threadsperblock;
															
 
																+
															
 
																+    // Execute GPU kernel
															
 
																+    sobolGPU_kernel<<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>>(n_vectors, n_dimensions, d_directions, d_output);
															
 
																+}
															
--- a/examples/opt/pi/SobolQRNG/sobol_gpu.h
+++ b/examples/opt/pi/SobolQRNG/sobol_gpu.h
@@ -0,0 +1,61 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2010  Université de Bordeaux 1
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+/*
															
 
																+ * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
															
 
																+ *
															
 
																+ * NVIDIA Corporation and its licensors retain all intellectual property and 
															
 
																+ * proprietary rights in and to this software and related documentation and 
															
 
																+ * any modifications thereto.  Any use, reproduction, disclosure, or distribution 
															
 
																+ * of this software and related documentation without an express license 
															
 
																+ * agreement from NVIDIA Corporation is strictly prohibited.
															
 
																+ * 
															
 
																+ */
															
 
																+ 
															
 
																+ /*
															
 
																+ * Portions Copyright (c) 1993-2009 NVIDIA Corporation.  All rights reserved.
															
 
																+ * Portions Copyright (c) 2009 Mike Giles, Oxford University.  All rights reserved.
															
 
																+ * Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe.  All rights reserved.
															
 
																+ *
															
 
																+ * Sobol Quasi-random Number Generator example
															
 
																+ *
															
 
																+ * Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
															
 
																+ * http://people.maths.ox.ac.uk/~gilesm/
															
 
																+ *
															
 
																+ * and C code developed by Stephen Joe, University of Waikato, New Zealand
															
 
																+ * and Frances Kuo, University of New South Wales, Australia
															
 
																+ * http://web.maths.unsw.edu.au/~fkuo/sobol/
															
 
																+ *
															
 
																+ * For theoretical background see:
															
 
																+ *
															
 
																+ * P. Bratley and B.L. Fox.
															
 
																+ * Implementing Sobol's quasirandom sequence generator
															
 
																+ * http://portal.acm.org/citation.cfm?id=42288
															
 
																+ * ACM Trans. on Math. Software, 14(1):88-100, 1988
															
 
																+ *
															
 
																+ * S. Joe and F. Kuo.
															
 
																+ * Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
															
 
																+ * http://portal.acm.org/citation.cfm?id=641879
															
 
																+ * ACM Trans. on Math. Software, 29(1):49-57, 2003
															
 
																+ *
															
 
																+ */
															
 
																+
															
 
																+#ifndef SOBOL_GPU_H
															
 
																+#define SOBOL_GPU_H
															
 
																+
															
 
																/*
																 * Launch the Sobol quasi-random number generator on the GPU.
																 *
																 *   n_vectors    - number of vectors to generate
																 *   n_dimensions - number of dimensions per vector
																 *   d_directions - direction numbers (presumably a device pointer, as the
																 *                  d_ prefix suggests -- confirm against the caller)
																 *   d_output     - buffer receiving the generated numbers (same remark)
																 *
																 * The linkage specification is only emitted for C++ compilers so that this
																 * header can also be included from plain C sources.
																 */
																#ifdef __cplusplus
																extern "C"
																#endif
																void sobolGPU(int n_vectors, int n_dimensions, unsigned int *d_directions, float *d_output);
															
 
																+
															
 
																+#endif
															
--- a/examples/opt/pi/SobolQRNG/sobol_primitives.c
+++ b/examples/opt/pi/SobolQRNG/sobol_primitives.c
--- a/examples/opt/pi/SobolQRNG/sobol_primitives.h
+++ b/examples/opt/pi/SobolQRNG/sobol_primitives.h
@@ -0,0 +1,75 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2010  Université de Bordeaux 1
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+/*
															
 
																+ * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
															
 
																+ *
															
 
																+ * NVIDIA Corporation and its licensors retain all intellectual property and 
															
 
																+ * proprietary rights in and to this software and related documentation and 
															
 
																+ * any modifications thereto.  Any use, reproduction, disclosure, or distribution 
															
 
																+ * of this software and related documentation without an express license 
															
 
																+ * agreement from NVIDIA Corporation is strictly prohibited.
															
 
																+ * 
															
 
																+ */
															
 
																+ 
															
 
																+ /*
															
 
																+ * Portions Copyright (c) 1993-2009 NVIDIA Corporation.  All rights reserved.
															
 
																+ * Portions Copyright (c) 2009 Mike Giles, Oxford University.  All rights reserved.
															
 
																+ * Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe.  All rights reserved.
															
 
																+ *
															
 
																+ * Sobol Quasi-random Number Generator example
															
 
																+ *
															
 
																+ * Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
															
 
																+ * http://people.maths.ox.ac.uk/~gilesm/
															
 
																+ *
															
 
																+ * and C code developed by Stephen Joe, University of Waikato, New Zealand
															
 
																+ * and Frances Kuo, University of New South Wales, Australia
															
 
																+ * http://web.maths.unsw.edu.au/~fkuo/sobol/
															
 
																+ *
															
 
																+ * For theoretical background see:
															
 
																+ *
															
 
																+ * P. Bratley and B.L. Fox.
															
 
																+ * Implementing Sobol's quasirandom sequence generator
															
 
																+ * http://portal.acm.org/citation.cfm?id=42288
															
 
																+ * ACM Trans. on Math. Software, 14(1):88-100, 1988
															
 
																+ *
															
 
																+ * S. Joe and F. Kuo.
															
 
																+ * Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
															
 
																+ * http://portal.acm.org/citation.cfm?id=641879
															
 
																+ * ACM Trans. on Math. Software, 29(1):49-57, 2003
															
 
																+ *
															
 
																+ */
															
 
																+
															
 
																+#ifndef SOBOL_PRIMITIVES_H
															
 
																+#define SOBOL_PRIMITIVES_H
															
 
																+
															
 
																/* Number of m values stored with each primitive polynomial. */
																#define max_m 17

																/*
																 * Description of one primitive polynomial used to build the Sobol
																 * direction numbers.
																 */
																struct primitive
																{
																    unsigned dimension;   /* dimension number of the polynomial (unused) */
																    unsigned degree;      /* degree of the polynomial */
																    unsigned a;           /* binary word representing the coefficients */
																    unsigned m[max_m];    /* the array of m values */
																};
															
 
																+
															
 
																+extern const struct primitive sobol_primitives[];
															
 
																+
															
 
																+#endif
															
--- a/examples/opt/pi/pi.c
+++ b/examples/opt/pi/pi.c
@@ -0,0 +1,175 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2010-2011  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
															
 
																+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include "SobolQRNG/sobol.h"
															
 
																+#include "SobolQRNG/sobol_gold.h"
															
 
																+#include "pi.h"
															
 
																+#include <sys/time.h>
															
 
																+
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+void cuda_kernel(void **descr, void *cl_arg);
															
 
																+#endif
															
 
																+
															
 
																+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
															
 
																+
															
 
																+/* default value */
															
 
																+static unsigned ntasks = 1024;
															
 
																+
															
 
																+static void cpu_kernel(void *descr[], void *cl_arg)
															
 
																+{
															
 
																+	unsigned *directions = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
															
 
																+	unsigned nx = NSHOT_PER_TASK;
															
 
																+
															
 
																+	TYPE *random_numbers = malloc(2*nx*sizeof(TYPE));
															
 
																+	sobolCPU(2*nx/n_dimensions, n_dimensions, directions, random_numbers);
															
 
																+
															
 
																+	TYPE *random_numbers_x = &random_numbers[0];
															
 
																+	TYPE *random_numbers_y = &random_numbers[nx];
															
 
																+
															
 
																+	unsigned current_cnt = 0;
															
 
																+
															
 
																+	unsigned i;
															
 
																+	for (i = 0; i < nx; i++)
															
 
																+	{
															
 
																+		TYPE x = random_numbers_x[i];
															
 
																+		TYPE y = random_numbers_y[i];
															
 
																+
															
 
																+		TYPE dist = (x*x + y*y);
															
 
																+
															
 
																+		unsigned success = (dist <= 1.0);
															
 
																+		current_cnt += success;
															
 
																+	}
															
 
																+
															
 
																+	unsigned *cnt = (unsigned *)STARPU_VECTOR_GET_PTR(descr[1]);
															
 
																+	*cnt = current_cnt;
															
 
																+
															
 
																+	free(random_numbers);
															
 
																+}
															
 
																+
															
 
																+static void parse_args(int argc, char **argv)
															
 
																+{
															
 
																+	int i;
															
 
																+	for (i = 1; i < argc; i++) {
															
 
																+		if (strcmp(argv[i], "-ntasks") == 0) {
															
 
																+			char *argptr;
															
 
																+			ntasks = strtol(argv[++i], &argptr, 10);
															
 
																+		}
															
 
																+	}
															
 
																+}
															
 
																+
															
 
																+int main(int argc, char **argv)
															
 
																+{
															
 
																+	unsigned i;
															
 
																+
															
 
																+	parse_args(argc, argv);
															
 
																+
															
 
																+	starpu_init(NULL);
															
 
																+
															
 
																+	/* Initialize the random number generator */
															
 
																+	unsigned *sobol_qrng_directions = malloc(n_dimensions*n_directions*sizeof(unsigned));
															
 
																+	STARPU_ASSERT(sobol_qrng_directions);
															
 
																+
															
 
																+	initSobolDirectionVectors(n_dimensions, sobol_qrng_directions);
															
 
																+
															
 
																+	/* Any worker may use that array now */
															
 
																+	starpu_data_handle sobol_qrng_direction_handle;
															
 
																+	starpu_vector_data_register(&sobol_qrng_direction_handle, 0,
															
 
																+		(uintptr_t)sobol_qrng_directions, n_dimensions*n_directions, sizeof(unsigned));
															
 
																+
															
 
																+	unsigned *cnt_array = malloc(ntasks*sizeof(unsigned));
															
 
																+	STARPU_ASSERT(cnt_array);
															
 
																+	starpu_data_handle cnt_array_handle;
															
 
																+	starpu_vector_data_register(&cnt_array_handle, 0, (uintptr_t)cnt_array, ntasks, sizeof(unsigned));
															
 
																+
															
 
																+	/* Use a write-through policy : when the data is modified on an
															
 
																+	 * accelerator, we know that it will only be modified once and be
															
 
																+	 * accessed by the CPU later on */
															
 
																+	starpu_data_set_wt_mask(cnt_array_handle, (1<<0));
															
 
																+
															
 
																+	struct starpu_data_filter f = {
															
 
																+		.filter_func = starpu_block_filter_func_vector,
															
 
																+		.nchildren = ntasks
															
 
																+	};
															
 
																+	
															
 
																+	starpu_data_partition(cnt_array_handle, &f);
															
 
																+
															
 
																+	static struct starpu_perfmodel_t model = {
															
 
																+		.type = STARPU_HISTORY_BASED,
															
 
																+		.symbol = "monte_carlo_pi"
															
 
																+	};
															
 
																+
															
 
																+	struct starpu_codelet_t cl = {
															
 
																+		.where = STARPU_CPU|STARPU_CUDA,
															
 
																+		.cpu_func = cpu_kernel,
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+		.cuda_func = cuda_kernel,
															
 
																+#endif
															
 
																+		.nbuffers = 2,
															
 
																+		.model = &model
															
 
																+	};
															
 
																+
															
 
																+	struct timeval start;
															
 
																+	struct timeval end;
															
 
																+
															
 
																+	gettimeofday(&start, NULL);
															
 
																+
															
 
																+	for (i = 0; i < ntasks; i++)
															
 
																+	{
															
 
																+		struct starpu_task *task = starpu_task_create();
															
 
																+
															
 
																+		task->cl = &cl;
															
 
																+
															
 
																+		STARPU_ASSERT(starpu_data_get_sub_data(cnt_array_handle, 1, i));
															
 
																+
															
 
																+		task->buffers[0].handle = sobol_qrng_direction_handle;
															
 
																+		task->buffers[0].mode   = STARPU_R;
															
 
																+		task->buffers[1].handle = starpu_data_get_sub_data(cnt_array_handle, 1, i);
															
 
																+		task->buffers[1].mode   = STARPU_W;
															
 
																+
															
 
																+		int ret = starpu_task_submit(task);
															
 
																+		STARPU_ASSERT(!ret);
															
 
																+	}
															
 
																+
															
 
																+	starpu_task_wait_for_all();
															
 
																+
															
 
																+	/* Get the cnt_array back in main memory */
															
 
																+	starpu_data_unpartition(cnt_array_handle, 0);
															
 
																+	starpu_data_unregister(cnt_array_handle);
															
 
																+
															
 
																+	/* Count the total number of entries */
															
 
																+	unsigned long total_cnt = 0;
															
 
																+	for (i = 0; i < ntasks; i++)
															
 
																+		total_cnt += cnt_array[i];
															
 
																+
															
 
																+	gettimeofday(&end, NULL);
															
 
																+
															
 
																+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
															
 
																+
															
 
																+	unsigned long total_shot_cnt = ntasks * NSHOT_PER_TASK;
															
 
																+
															
 
																+	/* Total surface : Pi * r^ 2 = Pi*1^2, total square surface : 2^2 = 4, probability to impact the disk: pi/4 */
															
 
																+	FPRINTF(stderr, "Pi approximation : %f (%ld / %ld)\n", ((TYPE)total_cnt*4)/(total_shot_cnt), total_cnt, total_shot_cnt);
															
 
																+	FPRINTF(stderr, "Total time : %f ms\n", timing/1000.0);
															
 
																+	FPRINTF(stderr, "Speed : %f GShot/s\n", total_shot_cnt/(1e3*timing));
															
 
																+
															
 
																+	if (!getenv("STARPU_SSILENT")) starpu_display_codelet_stats(&cl);
															
 
																+
															
 
																+	starpu_shutdown();
															
 
																+
															
 
																+	return 0;
															
 
																+}
															
--- a/examples/opt/pi/pi.h
+++ b/examples/opt/pi/pi.h
@@ -0,0 +1,33 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2010  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#ifndef __PI_H__
															
 
																+#define __PI_H__
															
 
																+
															
 
																+#include <starpu.h>
															
 
																+#include <starpu_cuda.h>
															
 
																+#include <stdio.h>
															
 
																+
															
 
																/* Number of (x, y) shots evaluated by a single task. */
																#define NSHOT_PER_TASK	(16*1024*1024ULL)

																/* Floating-point type used for the shot coordinates. */
																#define TYPE	float

																/* extern "C" void cuda_kernel(void *descr[], void *cl_arg); */

																/* Number of dimensions requested from the Sobol generator. This header
																 * gives every translation unit its own copy, so the value is const to
																 * guarantee that all of them keep seeing the same number. */
																static const int n_dimensions = 100;
															
 
																+
															
 
																+#endif /* __PI_H__ */
															
--- a/examples/opt/pi/pi_kernel.cu
+++ b/examples/opt/pi/pi_kernel.cu
@@ -0,0 +1,150 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2010  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include "SobolQRNG/sobol_gpu.h"
															
 
																+#include "pi.h"
															
 
																+#include <starpu_cuda.h>
															
 
																+
															
 
																+#define MAXNBLOCKS	128
															
 
																+#define MAXTHREADSPERBLOCK	256
															
 
																+
															
 
																+static __global__ void monte_carlo(TYPE *random_numbers_x, TYPE *random_numbers_y,
															
 
																+						unsigned n, unsigned *output_cnt)
															
 
																+{
															
 
																+	__shared__ unsigned scnt[MAXTHREADSPERBLOCK];
															
 
																+
															
 
																+	/* Do we have a successful shot ? */
															
 
																+	const int tid = threadIdx.x + blockIdx.x*blockDim.x;
															
 
																+
															
 
																+	const int nthreads = gridDim.x * blockDim.x;
															
 
																+
															
 
																+	/* Blank the shared mem buffer */
															
 
																+	if (threadIdx.x < MAXTHREADSPERBLOCK)
															
 
																+		scnt[threadIdx.x] = 0;
															
 
																+
															
 
																+	__syncthreads();
															
 
																+	int ind;
															
 
																+	for (ind = tid; ind < n; ind += nthreads)
															
 
																+	{ 
															
 
																+		TYPE x = random_numbers_x[ind];
															
 
																+		TYPE y = random_numbers_y[ind];
															
 
																+		TYPE dist = (x*x + y*y);
															
 
																+
															
 
																+		unsigned success = (dist <= 1.0f)?1:0;
															
 
																+
															
 
																+		scnt[threadIdx.x] += success;
															
 
																+
															
 
																+	}
															
 
																+
															
 
																+	__syncthreads();
															
 
																+
															
 
																+	/* Perform a reduction to compute the sum on each thread within that block */
															
 
																+
															
 
																+	/* NB: We assume that the number of threads per block is a power of 2 ! */
															
 
																+	unsigned s;
															
 
																+	for (s = blockDim.x/2; s!=0; s>>=1)
															
 
																+	{
															
 
																+		if (threadIdx.x < s)
															
 
																+			scnt[threadIdx.x] += scnt[threadIdx.x + s];
															
 
																+
															
 
																+		__syncthreads();
															
 
																+	}
															
 
																+
															
 
																+	/* report the number of successful shots in the block */
															
 
																+	if (threadIdx.x == 0)
															
 
																+		output_cnt[blockIdx.x] = scnt[0];
															
 
																+
															
 
																+	__syncthreads();
															
 
																+}
															
 
																+
															
 
																+static __global__ void sum_per_block_cnt(unsigned *output_cnt, unsigned *cnt)
															
 
																+{
															
 
																+	__shared__ unsigned accumulator[MAXNBLOCKS];
															
 
																+
															
 
																+	unsigned i;
															
 
																+
															
 
																+	/* Load the values from global mem */
															
 
																+	for (i = 0; i < blockDim.x; i++)
															
 
																+		accumulator[i] = output_cnt[i];
															
 
																+
															
 
																+	__syncthreads();
															
 
																+
															
 
																+	/* Perform a reduction in shared memory */
															
 
																+	unsigned s;
															
 
																+	for (s = blockDim.x/2; s!=0; s>>=1)
															
 
																+	{
															
 
																+		if (threadIdx.x < s)
															
 
																+			accumulator[threadIdx.x] += accumulator[threadIdx.x + s];
															
 
																+
															
 
																+		__syncthreads();
															
 
																+	}
															
 
																+
															
 
																+	/* Save the result in global memory */
															
 
																+	if (threadIdx.x == 0)
															
 
																+		*cnt = accumulator[0];
															
 
																+}
															
 
																+
															
 
																+extern "C" void cuda_kernel(void *descr[], void *cl_arg)
															
 
																+{
															
 
																+	cudaError_t cures;
															
 
																+
															
 
																+	unsigned *directions = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
															
 
																+	unsigned nx = NSHOT_PER_TASK;
															
 
																+
															
 
																+	/* Generate Random numbers */
															
 
																+	float *random_numbers;
															
 
																+	cudaMalloc((void **)&random_numbers, 2*nx*sizeof(float));
															
 
																+	STARPU_ASSERT(random_numbers);
															
 
																+	
															
 
																+	sobolGPU(2*nx/n_dimensions, n_dimensions, directions, random_numbers);
															
 
																+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																+
															
 
																+	TYPE *random_numbers_x = &random_numbers[0];
															
 
																+	TYPE *random_numbers_y = &random_numbers[nx];
															
 
																+
															
 
																+	unsigned *cnt = (unsigned *)STARPU_VECTOR_GET_PTR(descr[1]);
															
 
																+
															
 
																+	/* How many blocks do we use ? */ 
															
 
																+	unsigned nblocks = 128; // TODO
															
 
																+
															
 
																+	STARPU_ASSERT(nblocks <= MAXNBLOCKS);
															
 
																+	
															
 
																+	unsigned *per_block_cnt;
															
 
																+	cudaMalloc((void **)&per_block_cnt, nblocks*sizeof(unsigned));
															
 
																+
															
 
																+	STARPU_ASSERT((nx % nblocks) == 0);
															
 
																+
															
 
																+	/* How many threads per block ? At most 256, but no more threads than
															
 
																+	 * there are entries to process per block. */
															
 
																+	unsigned nthread_per_block = STARPU_MIN(MAXTHREADSPERBLOCK, (nx / nblocks));
															
 
																+
															
 
																+	/* each entry of per_block_cnt contains the number of successful shots
															
 
																+	 * in the corresponding block. */
															
 
																+	monte_carlo<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(random_numbers_x, random_numbers_y, nx, per_block_cnt);
															
 
																+
															
 
																+	/* Note that we do not synchronize between kernel calls because there is an implicit serialization */
															
 
																+
															
 
																+	/* compute the total number of successful shots by adding the elements
															
 
																+	 * of the per_block_cnt array */
															
 
																+	sum_per_block_cnt<<<1, nblocks, 0, starpu_cuda_get_local_stream()>>>(per_block_cnt, cnt);
															
 
																+	cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
															
 
																+	if (cures)
															
 
																+		STARPU_CUDA_REPORT_ERROR(cures);
															
 
																+
															
 
																+	cudaFree(per_block_cnt);
															
 
																+	cudaFree(random_numbers);
															
 
																+}
															
--- a/examples/opt/pi/pi_redux.c
+++ b/examples/opt/pi/pi_redux.c
@@ -0,0 +1,362 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2010-2011  Université de Bordeaux 1
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include <starpu.h>
																+#include <starpu_config.h>
																+#include <math.h>
																+#include <stdlib.h>
																+#include <string.h>
																+#include <sys/time.h>
															
 
																+
															
 
																+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
															
 
																+#define PI	3.14159265358979323846
															
 
																+
															
 
																+#if defined(STARPU_USE_CUDA) && !defined(STARPU_HAVE_CURAND)
															
 
																+#warning CURAND is required to run that example on CUDA devices
															
 
																+#endif
															
 
																+
															
 
																+#ifdef STARPU_HAVE_CURAND
															
 
																+#include <cuda.h>
															
 
																+#include <curand.h>
															
 
																+#include <starpu_cuda.h>
															
 
																+#endif
															
 
																+
															
 
																+#define NSHOT_PER_TASK	(1024*1024)
															
 
																+
															
 
																+/* Run parameters with their default values; parse_args() may override them. */
															
 
																+static unsigned long ntasks = 1024; /* tasks submitted in the timed loop of main() (-ntasks n) */
															
 
																+static unsigned long ntasks_warmup = 0; /* tasks submitted before the timer starts; set to 8 by -warmup */
															
 
																+
															
 
																+static unsigned use_redux = 1; /* 1: counter accessed in STARPU_REDUX mode, 0 (-noredux): STARPU_RW */
															
 
																+static unsigned do_warmup = 0; /* set by -warmup; main() only reads ntasks_warmup */
															
 
																+
															
 
																+/*
															
 
																+ *	Initialization of the Random Number Generators (RNG)
															
 
																+ */
															
 
																+
															
 
																+#ifdef STARPU_HAVE_CURAND
															
 
																+/* RNG for the CURAND library: one generator per worker, indexed by worker id
																+ * (created and seeded in init_rng for CUDA workers) */
															
 
																+static curandGenerator_t curandgens[STARPU_NMAXWORKERS];
															
 
																+#endif 
															
 
																+
															
 
																+/* State for the erand48 function. Worker w uses the entries starting at index
																+ * PADDING*w of each array (three unsigned shorts in xsubi); the huge padding
																+ * between workers is there to avoid false-sharing. */
															
 
																+#define PADDING	1024
															
 
																+static unsigned short xsubi[STARPU_NMAXWORKERS*PADDING]; /* seeded with the worker id in init_rng */
															
 
																+static struct drand48_data randbuffer[STARPU_NMAXWORKERS*PADDING]; /* initialised by starpu_srand48_r in init_rng */
															
 
																+
															
 
																+/* Set up the random number generator of the calling worker.
																+ * A CPU worker gets its erand48 state seeded with its worker id; a CUDA
																+ * worker (CURAND builds only) gets a CURAND generator created and seeded
																+ * with its worker id. Any other kind of worker aborts the program. */
																+static void init_rng(void *arg __attribute__((unused)))
																+{
																+	const int workerid = starpu_worker_get_id();
																+
																+	switch (starpu_worker_get_type(workerid)) {
																+	case STARPU_CPU_WORKER: {
																+		/* first entry of this worker's padded slot */
																+		unsigned short *state = &xsubi[PADDING*workerid];
																+		unsigned k;
																+
																+		/* create a seed */
																+		starpu_srand48_r((long int)workerid, &randbuffer[PADDING*workerid]);
																+
																+		for (k = 0; k < 3; k++)
																+			state[k] = (unsigned short)workerid;
																+		break;
																+	}
																+#ifdef STARPU_HAVE_CURAND
																+	case STARPU_CUDA_WORKER: {
																+		curandGenerator_t *gen = &curandgens[workerid];
																+		curandStatus_t status;
																+
																+		/* Create a RNG */
																+		status = curandCreateGenerator(gen, CURAND_RNG_PSEUDO_DEFAULT);
																+		STARPU_ASSERT(status == CURAND_STATUS_SUCCESS);
																+
																+		/* Seed it with worker's id */
																+		status = curandSetPseudoRandomGeneratorSeed(*gen,
																+						(unsigned long long)workerid);
																+		STARPU_ASSERT(status == CURAND_STATUS_SUCCESS);
																+		break;
																+	}
																+#endif
																+	default:
																+		STARPU_ABORT();
																+		break;
																+	}
																+}
															
 
																+
															
 
																+/* Print the command-line synopsis on stderr and exit with status -1. */
																+static void parse_args_usage(const char *progname)
																+{
																+	fprintf(stderr, "Usage: %s [-ntasks n] [-noredux] [-warmup] [-h]\n", progname);
																+	exit(-1);
																+}
																+
																+/*
																+ * Parse the command line and update the run parameters:
																+ *   -ntasks n : number of tasks of the timed run (stored in ntasks)
																+ *   -noredux  : clear use_redux
																+ *   -warmup   : set do_warmup and request 8 warmup tasks
																+ *   -h        : print the usage and exit
																+ * A missing, non-numeric or negative value after -ntasks prints the usage
																+ * and exits, instead of reading argv[argc] or storing a wrapped value.
																+ * Unknown arguments are ignored.
																+ */
																+static void parse_args(int argc, char **argv)
																+{
																+	int i;
																+
																+	for (i = 1; i < argc; i++) {
																+		const char *opt = argv[i];
																+
																+		if (strcmp(opt, "-ntasks") == 0) {
																+			char *end;
																+			long value;
																+
																+			/* the option needs a value: do not run past argv */
																+			if (i + 1 >= argc)
																+				parse_args_usage(argv[0]);
																+
																+			value = strtol(argv[++i], &end, 10);
																+			if (end == argv[i] || value < 0)
																+				parse_args_usage(argv[0]);
																+
																+			ntasks = (unsigned long)value;
																+		} else if (strcmp(opt, "-noredux") == 0) {
																+			use_redux = 0;
																+		} else if (strcmp(opt, "-warmup") == 0) {
																+			do_warmup = 1;
																+			ntasks_warmup = 8; /* arbitrary number of warmup tasks */
																+		} else if (strcmp(opt, "-h") == 0) {
																+			parse_args_usage(argv[0]);
																+		}
																+	}
																+}
															
 
																+
															
 
																+/*
															
 
																+ *	Monte-carlo kernel
															
 
																+ */
															
 
																+
															
 
																+/* CPU implementation of the Monte-Carlo codelet: draws NSHOT_PER_TASK points
																+ * with the calling worker's erand48 state, maps both coordinates through
																+ * 2*r - 1, and adds the number of points whose squared distance to the
																+ * origin is below 1.0 to the counter found in descr[1]. */
																+static void pi_func_cpu(void *descr[], void *cl_arg __attribute__ ((unused)))
																+{
																+	const int workerid = starpu_worker_get_id();
																+
																+	/* per-worker RNG state (see init_rng) */
																+	unsigned short *seed16 = &xsubi[PADDING*workerid];
																+	struct drand48_data *state = &randbuffer[PADDING*workerid];
																+
																+	unsigned long *cnt = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[1]);
																+	unsigned long hits = 0;
																+	int shot;
																+
																+	for (shot = 0; shot < NSHOT_PER_TASK; shot++)
																+	{
																+		double rx, ry, x, y;
																+
																+		starpu_erand48_r(seed16, state, &rx);
																+		starpu_erand48_r(seed16, state, &ry);
																+
																+		x = 2.0*rx - 1.0;
																+		y = 2.0*ry - 1.0;
																+
																+		if (x*x + y*y < 1.0)
																+			hits++;
																+	}
																+
																+	/* Put the contribution of that task into the counter */
																+	*cnt += hits;
																+}
															
 
																+
															
 
																+extern void pi_redux_cuda_kernel(float *x, float *y, unsigned n, unsigned long *shot_cnt);
															
 
																+
															
 
																+#ifdef STARPU_HAVE_CURAND
															
 
																+/* CUDA implementation of the Monte-Carlo codelet: fills the scratchpad in
																+ * descr[0] with 2*NSHOT_PER_TASK uniform random floats from this worker's
																+ * CURAND generator, then hands both halves to pi_redux_cuda_kernel together
																+ * with the counter found in descr[1]. */
																+static void pi_func_cuda(void *descr[], void *cl_arg __attribute__ ((unused)))
																+{
																+	const int workerid = starpu_worker_get_id();
																+	float *scratchpad_xy = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
																+	unsigned long *shot_cnt = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[1]);
																+	curandStatus_t status;
																+
																+	/* CURAND is a bit silly: it assumes that any error is fatal. Calling
																+	 * cudaGetLastError resets the last error value; the value itself is
																+	 * deliberately ignored. */
																+	(void)cudaGetLastError();
																+
																+	/* Fill the scratchpad with random numbers. Note that both x and y
																+	 * arrays are stored in the same vector: x first, then y. */
																+	status = curandGenerateUniform(curandgens[workerid], scratchpad_xy, 2*NSHOT_PER_TASK);
																+	STARPU_ASSERT(status == CURAND_STATUS_SUCCESS);
																+
																+	pi_redux_cuda_kernel(&scratchpad_xy[0], &scratchpad_xy[NSHOT_PER_TASK],
																+			     NSHOT_PER_TASK, shot_cnt);
																+}
															
 
																+#endif
															
 
																+
															
 
																+/* Monte-Carlo codelet: two buffers (scratchpad, shot counter), runs on CPUs
																+ * and, when CURAND is available, on CUDA devices. No performance model. */
																+static struct starpu_codelet_t pi_cl = {
																+#ifdef STARPU_HAVE_CURAND
																+	.where = STARPU_CUDA|STARPU_CPU,
																+	.cuda_func = pi_func_cuda,
																+#else
																+	.where = STARPU_CPU,
																+#endif
																+	.cpu_func = pi_func_cpu,
																+	.nbuffers = 2,
																+	.model = NULL
																+};
															
 
																+
															
 
																+/*
															
 
																+ *	Codelets to implement reduction
															
 
																+ */
															
 
																+
															
 
																+/* CPU side of the reduction "init" codelet: resets the unsigned long found
																+ * in descr[0] to zero. cl_arg is not used. */
																+static void init_cpu_func(void *descr[], void *cl_arg)
																+{
																+	unsigned long *counter = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[0]);
																+
																+	counter[0] = 0;
																+}
															
 
																+
															
 
																+#ifdef STARPU_HAVE_CURAND
															
 
																+/* CUDA side of the reduction "init" codelet: zeroes the unsigned long
																+ * pointed to by descr[0] with cudaMemset and waits for completion.
																+ * cl_arg is not used. CUDA failures are reported through
																+ * STARPU_CUDA_REPORT_ERROR instead of being silently dropped. */
																+static void init_cuda_func(void *descr[], void *cl_arg)
																+{
																+	unsigned long *val = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[0]);
																+	cudaError_t cures;
																+
																+	cures = cudaMemset(val, 0, sizeof(unsigned long));
																+	if (cures)
																+		STARPU_CUDA_REPORT_ERROR(cures);
																+
																+	cures = cudaThreadSynchronize();
																+	if (cures)
																+		STARPU_CUDA_REPORT_ERROR(cures);
																+}
															
 
																+#endif
															
 
																+
															
 
																+/* Codelet that initialises one counter (single buffer); passed to
																+ * starpu_data_set_reduction_methods in main. */
																+static struct starpu_codelet_t init_codelet = {
																+#ifdef STARPU_HAVE_CURAND
																+	.where = STARPU_CUDA|STARPU_CPU,
																+	.cuda_func = init_cuda_func,
																+#else
																+	.where = STARPU_CPU,
																+#endif
																+	.cpu_func = init_cpu_func,
																+	.nbuffers = 1
																+};
															
 
																+
															
 
																+#ifdef STARPU_HAVE_CURAND
															
 
																+/* Dummy implementation of the addition of two unsigned longs in CUDA:
																+ * both operands (descr[0] and descr[1]) are copied to the host, added
																+ * there, and the sum is copied back into descr[0]. cl_arg is not used.
																+ * Every copy is checked, so a failed transfer is reported through
																+ * STARPU_CUDA_REPORT_ERROR rather than adding uninitialised host values. */
																+static void redux_cuda_func(void *descr[], void *cl_arg)
																+{
																+	unsigned long *d_a = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[0]);
																+	unsigned long *d_b = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[1]);
																+
																+	unsigned long h_a = 0, h_b = 0;
																+	cudaError_t cures;
																+
																+	cures = cudaMemcpy(&h_a, d_a, sizeof(h_a), cudaMemcpyDeviceToHost);
																+	if (cures)
																+		STARPU_CUDA_REPORT_ERROR(cures);
																+
																+	cures = cudaMemcpy(&h_b, d_b, sizeof(h_b), cudaMemcpyDeviceToHost);
																+	if (cures)
																+		STARPU_CUDA_REPORT_ERROR(cures);
																+
																+	h_a += h_b;
																+
																+	cures = cudaMemcpy(d_a, &h_a, sizeof(h_a), cudaMemcpyHostToDevice);
																+	if (cures)
																+		STARPU_CUDA_REPORT_ERROR(cures);
																+}
															
 
																+#endif
															
 
																+
															
 
																+/* CPU side of the reduction codelet: adds the unsigned long found in
																+ * descr[1] to the one found in descr[0]. cl_arg is not used. */
																+static void redux_cpu_func(void *descr[], void *cl_arg)
																+{
																+	unsigned long *a = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[0]);
																+	const unsigned long *b = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[1]);
																+
																+	*a += *b;
																+}
															
 
																+
															
 
																+/* Codelet that merges two counters (two buffers, result in the first);
																+ * passed to starpu_data_set_reduction_methods in main. */
																+static struct starpu_codelet_t redux_codelet = {
																+#ifdef STARPU_HAVE_CURAND
																+	.where = STARPU_CUDA|STARPU_CPU,
																+	.cuda_func = redux_cuda_func,
																+#else
																+	.where = STARPU_CPU,
																+#endif
																+	.cpu_func = redux_cpu_func,
																+	.nbuffers = 2
																+};
															
 
																+
															
 
																+/*
															
 
																+ *	Main program
															
 
																+ */
															
 
																+
															
 
																+int main(int argc, char **argv)
															
 
																+{
															
 
																+	unsigned i;
															
 
																+
															
 
																+	parse_args(argc, argv);
															
 
																+
															
 
																+	starpu_init(NULL);
															
 
																+
															
 
																+	/* Launch a Random Number Generator (RNG) on each worker */
															
 
																+	starpu_execute_on_each_worker(init_rng, NULL, STARPU_CPU|STARPU_CUDA);
															
 
																+
															
 
																+	/* Create a scratchpad data */
															
 
																+	starpu_data_handle xy_scratchpad_handle;
															
 
																+	starpu_vector_data_register(&xy_scratchpad_handle, -1, (uintptr_t)NULL,
															
 
																+		2*NSHOT_PER_TASK, sizeof(float));
															
 
																+
															
 
																+	/* Create a variable that will be used to count the number of shots
															
 
																+	 * that actually hit the unit circle when shooting randomly in
															
 
																+	 * [-1,1]^2. */
															
 
																+	unsigned long shot_cnt = 0;
															
 
																+	starpu_data_handle shot_cnt_handle;
															
 
																+	starpu_variable_data_register(&shot_cnt_handle, 0,
															
 
																+			(uintptr_t)&shot_cnt, sizeof(shot_cnt));
															
 
																+
															
 
																+	starpu_data_set_reduction_methods(shot_cnt_handle,
															
 
																+					&redux_codelet, &init_codelet);
															
 
																+
															
 
																+	struct timeval start;
															
 
																+	struct timeval end;
															
 
																+
															
 
																+	for (i = 0; i < ntasks_warmup; i++)
															
 
																+	{
															
 
																+		struct starpu_task *task = starpu_task_create();
															
 
																+
															
 
																+		task->cl = &pi_cl;
															
 
																+
															
 
																+		task->buffers[0].handle = xy_scratchpad_handle;
															
 
																+		task->buffers[0].mode   = STARPU_SCRATCH;
															
 
																+		task->buffers[1].handle = shot_cnt_handle;
															
 
																+		task->buffers[1].mode   = use_redux?STARPU_REDUX:STARPU_RW;
															
 
																+
															
 
																+		int ret = starpu_task_submit(task);
															
 
																+		STARPU_ASSERT(!ret);
															
 
																+	}
															
 
																+
															
 
																+
															
 
																+	gettimeofday(&start, NULL);
															
 
																+
															
 
																+	for (i = 0; i < ntasks; i++)
															
 
																+	{
															
 
																+		struct starpu_task *task = starpu_task_create();
															
 
																+
															
 
																+		task->cl = &pi_cl;
															
 
																+
															
 
																+		task->buffers[0].handle = xy_scratchpad_handle;
															
 
																+		task->buffers[0].mode   = STARPU_SCRATCH;
															
 
																+		task->buffers[1].handle = shot_cnt_handle;
															
 
																+		task->buffers[1].mode   = use_redux?STARPU_REDUX:STARPU_RW;
															
 
																+
															
 
																+		int ret = starpu_task_submit(task);
															
 
																+		STARPU_ASSERT(!ret);
															
 
																+	}
															
 
																+
															
 
																+	starpu_data_unregister(shot_cnt_handle);
															
 
																+
															
 
																+	gettimeofday(&end, NULL);
															
 
																+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
															
 
																+	/* Total surface : Pi * r^ 2 = Pi*1^2, total square surface : 2^2 = 4,
															
 
																+	 * probability to impact the disk: pi/4 */
															
 
																+	unsigned long total = (ntasks + ntasks_warmup)*NSHOT_PER_TASK;
															
 
																+	double pi_approx = ((double)shot_cnt*4.0)/total;
															
 
																+
															
 
																+	FPRINTF(stderr, "Reductions? %s\n", use_redux?"yes":"no");
															
 
																+	FPRINTF(stderr, "Pi approximation : %f (%ld / %ld)\n", pi_approx, shot_cnt, total);
															
 
																+	FPRINTF(stderr, "Error %e \n", pi_approx - PI);
															
 
																+	FPRINTF(stderr, "Total time : %f ms\n", timing/1000.0);
															
 
																+	FPRINTF(stderr, "Speed : %f GShot/s\n", total/(1e3*timing));
															
 
																+
															
 
																+	starpu_shutdown();
															
 
																+
															
 
																+	if (abs(pi_approx - PI) > 1.0)
															
 
																+		return 1;
															
 
																+
															
 
																+	return 0;
															
 
																+}
															
--- a/examples/opt/pi/pi_redux_kernel.cu
+++ b/examples/opt/pi/pi_redux_kernel.cu
@@ -0,0 +1,128 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2010  Université de Bordeaux 1
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include <starpu.h>
															
 
																+#include <starpu_cuda.h>
															
 
																+
															
 
																+#define MAXNBLOCKS	128
															
 
																+#define MAXTHREADSPERBLOCK	256
															
 
																+
															
 
																+/* Count the shots that fall inside the unit disk.
																+ * Thread t of the grid handles entries t, t + T, t + 2T, ... of x and y
																+ * (T = gridDim.x * blockDim.x), accumulating its hits in its own slot of a
																+ * shared array; the slots of a block are then summed pairwise and thread 0
																+ * stores the block total in output_cnt[blockIdx.x]. */
																+static __global__ void monte_carlo(float *x, float *y, unsigned n, unsigned long *output_cnt)
																+{
																+	__shared__ unsigned scnt[MAXTHREADSPERBLOCK];
																+
																+	const int first = threadIdx.x + blockIdx.x*blockDim.x;
																+	const int stride = gridDim.x * blockDim.x;
																+	int ind;
																+	unsigned long half;
																+
																+	/* Blank the shared mem buffer */
																+	if (threadIdx.x < MAXTHREADSPERBLOCK)
																+		scnt[threadIdx.x] = 0;
																+
																+	__syncthreads();
																+
																+	/* Do we have a successful shot ? */
																+	for (ind = first; ind < n; ind += stride)
																+	{
																+		const float xval = (2.0f * x[ind] - 1.0f);
																+		const float yval = (2.0f * y[ind] - 1.0f);
																+		const float dist = (xval*xval + yval*yval);
																+
																+		if (dist <= 1.0f)
																+			scnt[threadIdx.x] += 1;
																+	}
																+
																+	__syncthreads();
																+
																+	/* Sum the per-thread counts of the block into scnt[0].
																+	 * NB: We assume that the number of threads per block is a power of 2 ! */
																+	half = blockDim.x/2;
																+	while (half != 0)
																+	{
																+		if (threadIdx.x < half)
																+			scnt[threadIdx.x] += scnt[threadIdx.x + half];
																+
																+		__syncthreads();
																+		half >>= 1;
																+	}
																+
																+	/* report the number of successful shots in the block */
																+	if (threadIdx.x == 0)
																+		output_cnt[blockIdx.x] = scnt[0];
																+
																+	__syncthreads();
																+}
															
 
																+
															
 
																+/* Add the blockDim.x first entries of output_cnt to *cnt.
																+ * The entries are staged in shared memory, one per thread, summed pairwise,
																+ * and thread 0 adds the total to *cnt. The shared array has MAXNBLOCKS
																+ * entries, so blockDim.x must not exceed MAXNBLOCKS.
																+ * NOTE(review): like monte_carlo, the halving loop only covers every entry
																+ * when blockDim.x is a power of 2 -- confirm at the call site. */
																+static __global__ void sum_per_block_cnt(unsigned long *output_cnt, unsigned long *cnt)
																+{
																+	__shared__ unsigned long accumulator[MAXNBLOCKS];
																+
																+	/* Load the values from global mem: each thread fetches its own entry
																+	 * instead of every thread copying the whole array. */
																+	accumulator[threadIdx.x] = output_cnt[threadIdx.x];
																+
																+	__syncthreads();
																+
																+	/* Perform a reduction in shared memory */
																+	unsigned s;
																+	for (s = blockDim.x/2; s!=0; s>>=1)
																+	{
																+		if (threadIdx.x < s)
																+			accumulator[threadIdx.x] += accumulator[threadIdx.x + s];
																+
																+		__syncthreads();
																+	}
																+
																+	/* Save the result in global memory */
																+	if (threadIdx.x == 0)
																+		*cnt = *cnt + accumulator[0];
																+}
															
 
																+
															
 
																+/* Host entry point called from the CUDA codelet: counts how many of the n
																+ * points (x[i], y[i]) fall inside the unit disk and adds that number to
																+ * *shot_cnt. Both kernels are launched on StarPU's local stream and the
																+ * function returns once that stream has been synchronized.
																+ * n must be a multiple of the number of blocks (checked by STARPU_ASSERT).
																+ * CUDA failures, including the temporary allocation, are reported through
																+ * STARPU_CUDA_REPORT_ERROR. */
																+extern "C" void pi_redux_cuda_kernel(float *x, float *y, unsigned n, unsigned long *shot_cnt)
																+{
																+	cudaError_t cures;
																+	cudaStream_t stream = starpu_cuda_get_local_stream();
																+
																+	/* How many blocks do we use ? */
																+	unsigned nblocks = 128; // TODO
																+	STARPU_ASSERT(nblocks <= MAXNBLOCKS);
																+	STARPU_ASSERT((n % nblocks) == 0);
																+
																+	/* one partial count per block; do not launch on a failed allocation */
																+	unsigned long *per_block_cnt;
																+	cures = cudaMalloc((void **)&per_block_cnt, nblocks*sizeof(unsigned long));
																+	if (cures)
																+		STARPU_CUDA_REPORT_ERROR(cures);
																+
																+	/* How many threads per block ? At most 256, but no more threads than
																+	 * there are entries to process per block. */
																+	unsigned nthread_per_block = STARPU_MIN(MAXTHREADSPERBLOCK, (n / nblocks));
																+
																+	/* each entry of per_block_cnt contains the number of successful shots
																+	 * in the corresponding block. */
																+	monte_carlo<<<nblocks, nthread_per_block, 0, stream>>>(x, y, n, per_block_cnt);
																+
																+	/* Note that we do not synchronize between kernel calls because there is an implicit serialization */
																+
																+	/* compute the total number of successful shots by adding the elements
																+	 * of the per_block_cnt array */
																+	sum_per_block_cnt<<<1, nblocks, 0, stream>>>(per_block_cnt, shot_cnt);
																+	cures = cudaStreamSynchronize(stream);
																+	if (cures)
																+		STARPU_CUDA_REPORT_ERROR(cures);
																+
																+	cures = cudaFree(per_block_cnt);
																+	if (cures)
																+		STARPU_CUDA_REPORT_ERROR(cures);
																+}
															
--- a/examples/ppm_downscaler/ppm_downscaler.c
+++ b/examples/ppm_downscaler/ppm_downscaler.c
@@ -76,7 +76,7 @@ struct ppm_image *file_to_ppm(char *filename)
 
																 	unsigned i;
															
 
																 	for (i = 0; i < ppm->ncols*ppm->nlines; i++)
															
 
																 	{
															
 
																-//		fprintf(stderr, "READ (index %d) -> r %d g %d b %d\n", i, ppm->data[i].r, ppm->data[i].g, ppm->data[i].b);
															
 
																+/*		fprintf(stderr, "READ (index %d) -> r %d g %d b %d\n", i, ppm->data[i].r, ppm->data[i].g, ppm->data[i].b); */
															
 
																 	}
															
 
																 	fclose(file);
															
@@ -136,7 +136,7 @@ void dummy_downscale(struct ppm_image *input_ppm, struct ppm_image *output_ppm)
 
																 				{
															
 
																 					unsigned index = (big_col + i)+(big_line + j)*input_ppm->ncols;
															
 
																-//					fprintf(stderr, "(col %d, line %d) i %d j %d index %d -> r %d g %d b %d\n", col, line, i, j, index, in[index].r, in[index].g, in[index].b);
															
 
																+/*					fprintf(stderr, "(col %d, line %d) i %d j %d index %d -> r %d g %d b %d\n", col, line, i, j, index, in[index].r, in[index].g, in[index].b); */
															
 
																 					sum_r += (unsigned)in[index].r;
															
 
																 					sum_g += (unsigned)in[index].g;
															
@@ -148,7 +148,7 @@ void dummy_downscale(struct ppm_image *input_ppm, struct ppm_image *output_ppm)
 
																 			out[col + line*output_ppm->ncols].g = (unsigned char)(sum_g/(FACTOR*FACTOR));
															
 
																 			out[col + line*output_ppm->ncols].b = (unsigned char)(sum_b/(FACTOR*FACTOR));
															
 
																-//			fprintf(stderr, "col %d line %d -> sum_r = %d out -> %d\n", col, line, sum_r, out[col + line*FACTOR].r);
															
 
																+/*			fprintf(stderr, "col %d line %d -> sum_r = %d out -> %d\n", col, line, sum_r, out[col + line*FACTOR].r); */
															
 
																 		}
															
 
																 	}
															
--- a/examples/ppm_downscaler/yuv_downscaler.c
+++ b/examples/ppm_downscaler/yuv_downscaler.c
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2010  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010-2011  Université de Bordeaux 1
															
 
																  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
															
 
																  * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																  *
															
@@ -92,16 +92,12 @@ static struct starpu_codelet_t ds_codelet = {
 
																 /* each block contains BLOCK_HEIGHT consecutive lines */
															
 
																 static struct starpu_data_filter filter_y = {
															
 
																 	.filter_func = starpu_block_filter_func,
															
 
																-	.nchildren= HEIGHT/BLOCK_HEIGHT,
															
 
																-	.get_nchildren = NULL,
															
 
																-	.get_child_ops = NULL
															
 
																+	.nchildren= HEIGHT/BLOCK_HEIGHT
															
 
																 };
															
 
																 static struct starpu_data_filter filter_uv = {
															
 
																 	.filter_func = starpu_block_filter_func,
															
 
																-	.nchildren = (HEIGHT/2)/BLOCK_HEIGHT,
															
 
																-	.get_nchildren = NULL,
															
 
																-	.get_child_ops = NULL
															
 
																+	.nchildren = (HEIGHT/2)/BLOCK_HEIGHT
															
 
																 };
															
 
																 int main(int argc, char **argv)
															
@@ -111,7 +107,7 @@ int main(int argc, char **argv)
 
																 	parse_args(argc, argv);
															
 
																-//	fprintf(stderr, "Reading input file ...\n");
															
 
																+/*	fprintf(stderr, "Reading input file ...\n"); */
															
 
																 	/* how many frames ? */
															
 
																 	struct stat stbuf;
															
@@ -120,7 +116,7 @@ int main(int argc, char **argv)
 
																 	unsigned nframes = filesize/FRAMESIZE; 
															
 
																-//	fprintf(stderr, "filesize %lx (FRAME SIZE %lx NEW SIZE %lx); nframes %d\n", filesize, FRAMESIZE, NEW_FRAMESIZE, nframes);
															
 
																+/*	fprintf(stderr, "filesize %lx (FRAME SIZE %lx NEW SIZE %lx); nframes %d\n", filesize, FRAMESIZE, NEW_FRAMESIZE, nframes); */
															
 
																 	assert((filesize % sizeof(struct yuv_frame)) == 0);
															
 
																 	/* fetch input data */
															
@@ -134,7 +130,7 @@ int main(int argc, char **argv)
 
																 	FILE *f_out = fopen(filename_out, "w+");
															
 
																 	assert(f_out);
															
 
																-//	fprintf(stderr, "Alloc output file ...\n");
															
 
																+/*	fprintf(stderr, "Alloc output file ...\n"); */
															
 
																 	struct yuv_new_frame *yuv_out_buffer = calloc(nframes, NEW_FRAMESIZE);
															
 
																 	assert(yuv_out_buffer);
															
@@ -199,7 +195,7 @@ int main(int argc, char **argv)
 
																 	unsigned ntasks = (nblocks_y + 2*nblocks_uv)*nframes;
															
 
																-	fprintf(stderr, "Start computation: there will be %d tasks for %d frames\n", ntasks, nframes);
															
 
																+	fprintf(stderr, "Start computation: there will be %u tasks for %u frames\n", ntasks, nframes);
															
 
																 	gettimeofday(&start, NULL);
															
 
																 	/* do the computation */
															
--- a/examples/profiling/profiling.c
+++ b/examples/profiling/profiling.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2010, 2011  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -20,6 +20,8 @@
 
																 #include <assert.h>
															
 
																 #include <unistd.h>
															
 
																+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
															
 
																+
															
 
																 static unsigned niter = 500;
															
 
																 void sleep_codelet(__attribute__ ((unused)) void *descr[],
															
@@ -70,7 +72,7 @@ int main(int argc, char **argv)
 
																 		int ret = starpu_task_submit(task);
															
 
																 		if (STARPU_UNLIKELY(ret == -ENODEV))
															
 
																 		{
															
 
																-			fprintf(stderr, "No worker may execute this task\n");
															
 
																+			FPRINTF(stderr, "No worker may execute this task\n");
															
 
																 			exit(0);
															
 
																 		}
															
 
																 	}
															
@@ -97,8 +99,8 @@ int main(int argc, char **argv)
 
																 	free(tasks);
															
 
																-	fprintf(stderr, "Avg. delay : %2.2lf us\n", (delay_sum)/niter);
															
 
																-	fprintf(stderr, "Avg. length : %2.2lf us\n", (length_sum)/niter);
															
 
																+	FPRINTF(stderr, "Avg. delay : %2.2lf us\n", (delay_sum)/niter);
															
 
																+	FPRINTF(stderr, "Avg. length : %2.2lf us\n", (length_sum)/niter);
															
 
																 	/* Display the occupancy of all workers during the test */
															
 
																 	int worker;
															
@@ -117,10 +119,10 @@ int main(int argc, char **argv)
 
																 		char workername[128];
															
 
																 		starpu_worker_get_name(worker, workername, 128);
															
 
																-		fprintf(stderr, "Worker %s:\n", workername);
															
 
																-		fprintf(stderr, "\ttotal time : %.2lf ms\n", total_time*1e-3);
															
 
																-		fprintf(stderr, "\texec time  : %.2lf ms (%.2f %%)\n", executing_time*1e-3, executing_ratio);
															
 
																-		fprintf(stderr, "\tblocked time  : %.2lf ms (%.2f %%)\n", sleeping_time*1e-3, sleeping_ratio);
															
 
																+		FPRINTF(stderr, "Worker %s:\n", workername);
															
 
																+		FPRINTF(stderr, "\ttotal time : %.2lf ms\n", total_time*1e-3);
															
 
																+		FPRINTF(stderr, "\texec time  : %.2lf ms (%.2f %%)\n", executing_time*1e-3, executing_ratio);
															
 
																+		FPRINTF(stderr, "\tblocked time  : %.2lf ms (%.2f %%)\n", sleeping_time*1e-3, sleeping_ratio);
															
 
																 	}
															
 
																 	starpu_shutdown();
															
--- a/examples/reductions/dot_product.c
+++ b/examples/reductions/dot_product.c
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2010  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010-2011  Université de Bordeaux 1
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -22,13 +22,15 @@
 
																 #include <cublas.h>
															
 
																 #endif
															
 
																+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
															
 
																+
															
 
																 static float *x;
															
 
																 static float *y;
															
 
																 static starpu_data_handle *x_handles;
															
 
																 static starpu_data_handle *y_handles;
															
 
																 static unsigned nblocks = 4096;
															
 
																-static unsigned entries_per_bock = 1024;
															
 
																+static unsigned entries_per_block = 1024;
															
 
																 #define DOT_TYPE double
															
@@ -75,9 +77,16 @@ void redux_cpu_func(void *descr[], void *cl_arg)
 
																 	*dota = *dota + *dotb;
															
 
																 }
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+extern void redux_cuda_func(void *descr[], void *_args);
															
 
																+#endif
															
 
																+
															
 
																 static struct starpu_codelet_t redux_codelet = {
															
 
																-	.where = STARPU_CPU,
															
 
																+	.where = STARPU_CPU|STARPU_CUDA,
															
 
																 	.cpu_func = redux_cpu_func,
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+	.cuda_func = redux_cuda_func,
															
 
																+#endif
															
 
																 	.nbuffers = 2
															
 
																 };
															
@@ -118,11 +127,11 @@ void dot_cuda_func(void *descr[], void *cl_arg)
 
																 	cudaMemcpy(&current_dot, dot, sizeof(DOT_TYPE), cudaMemcpyDeviceToHost);
															
 
																-	int ret = cudaThreadSynchronize();
															
 
																+	cudaThreadSynchronize();
															
 
																 	local_dot = (DOT_TYPE)cublasSdot(n, local_x, 1, local_y, 1);
															
 
																-	//fprintf(stderr, "current_dot %f local dot %f -> %f\n", current_dot, local_dot, current_dot + local_dot);
															
 
																+	/* FPRINTF(stderr, "current_dot %f local dot %f -> %f\n", current_dot, local_dot, current_dot + local_dot); */
															
 
																 	current_dot += local_dot;
															
 
																 	cudaThreadSynchronize();
															
@@ -146,15 +155,13 @@ static struct starpu_codelet_t dot_codelet = {
 
																  *	Tasks initialization
															
 
																  */
															
 
																-extern void starpu_data_end_reduction_mode(starpu_data_handle handle);
															
 
																-
															
 
																 int main(int argc, char **argv)
															
 
																 {
															
 
																 	starpu_init(NULL);
															
 
																 	starpu_helper_cublas_init();
															
 
																-	unsigned long nelems = nblocks*entries_per_bock;
															
 
																+	unsigned long nelems = nblocks*entries_per_block;
															
 
																 	size_t size = nelems*sizeof(float);
															
 
																 	x = malloc(size);
															
@@ -182,9 +189,9 @@ int main(int argc, char **argv)
 
																 	for (block = 0; block < nblocks; block++)
															
 
																 	{
															
 
																 		starpu_vector_data_register(&x_handles[block], 0,
															
 
																-			(uintptr_t)&x[entries_per_bock*block], entries_per_bock, sizeof(float));
															
 
																+			(uintptr_t)&x[entries_per_block*block], entries_per_block, sizeof(float));
															
 
																 		starpu_vector_data_register(&y_handles[block], 0,
															
 
																-			(uintptr_t)&y[entries_per_bock*block], entries_per_bock, sizeof(float));
															
 
																+			(uintptr_t)&y[entries_per_block*block], entries_per_block, sizeof(float));
															
 
																 	}
															
 
																 	starpu_variable_data_register(&dot_handle, 0, (uintptr_t)&dot, sizeof(DOT_TYPE));
															
@@ -199,6 +206,7 @@ int main(int argc, char **argv)
 
																 		struct starpu_task *task = starpu_task_create();
															
 
																 		task->cl = &dot_codelet;
															
 
																+		task->destroy = 1;
															
 
																 		task->buffers[0].handle = x_handles[block];
															
 
																 		task->buffers[0].mode = STARPU_R;
															
@@ -208,16 +216,33 @@ int main(int argc, char **argv)
 
																 		task->buffers[2].mode = STARPU_REDUX;
															
 
																 		int ret = starpu_task_submit(task);
															
 
																+		if (ret == -ENODEV) goto enodev;
															
 
																 		STARPU_ASSERT(!ret);
															
 
																 	}
															
 
																+	for (block = 0; block < nblocks; block++)
															
 
																+	{
															
 
																+		starpu_data_unregister(x_handles[block]);
															
 
																+		starpu_data_unregister(y_handles[block]);
															
 
																+	}
															
 
																 	starpu_data_unregister(dot_handle);
															
 
																-	fprintf(stderr, "Reference : %e vs. %e (Delta %e)\n", reference_dot, dot, reference_dot - dot);
															
 
																+	FPRINTF(stderr, "Reference : %e vs. %e (Delta %e)\n", reference_dot, dot, reference_dot - dot);
															
 
																 	starpu_helper_cublas_shutdown();
															
 
																 	starpu_shutdown();
															
 
																+	free(x);
															
 
																+	free(y);
															
 
																+	free(x_handles);
															
 
																+	free(y_handles);
															
 
																+
															
 
																 	return 0;
															
 
																+
															
 
																+enodev:
															
 
																+	fprintf(stderr, "WARNING: No one can execute this task\n");
															
 
																+	/* yes, we do not perform the computation but we did detect that no one
															
 
																+ 	 * could perform the kernel, so this is not an error from StarPU */
															
 
																+	return 77;
															
 
																 }
															
--- a/examples/reductions/minmax_reduction.c
+++ b/examples/reductions/minmax_reduction.c
@@ -22,6 +22,8 @@
 
																 static unsigned nblocks = 8192;
															
 
																 static unsigned entries_per_bock = 1024;
															
 
																+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
															
 
																+
															
 
																 #define TYPE		double
															
 
																 #define TYPE_MAX	DBL_MAX
															
 
																 #define TYPE_MIN	DBL_MIN
															
@@ -171,15 +173,19 @@ int main(int argc, char **argv)
 
																 		if (ret)
															
 
																 		{
															
 
																 			STARPU_ASSERT(ret == -ENODEV);
															
 
																-			fprintf(stderr, "This test can only run on CPUs, but there are no CPU workers (this is not a bug).\n");
															
 
																+			FPRINTF(stderr, "This test can only run on CPUs, but there are no CPU workers (this is not a bug).\n");
															
 
																 			return 0;
															
 
																 		}
															
 
																 	}
															
 
																+	for (block = 0; block < nblocks; block++)
															
 
																+	{
															
 
																+		starpu_data_unregister(x_handles[block]);
															
 
																+	}
															
 
																 	starpu_data_unregister(minmax_handle);
															
 
																-	fprintf(stderr, "Min : %e\n", minmax[0]);
															
 
																-	fprintf(stderr, "Max : %e\n", minmax[1]);
															
 
																+	FPRINTF(stderr, "Min : %e\n", minmax[0]);
															
 
																+	FPRINTF(stderr, "Max : %e\n", minmax[1]);
															
 
																 	STARPU_ASSERT(minmax[0] <= minmax[1]);
															
--- a/examples/scheduler/dummy_sched.c
+++ b/examples/scheduler/dummy_sched.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2010  Université de Bordeaux 1
															
 
																- * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ * Copyright (C) 2010-2011  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010-2011  Centre National de la Recherche Scientifique
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -19,6 +19,7 @@
 
																 #include <starpu.h>
															
 
																 #define NTASKS	32000
															
 
																+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
															
 
																 struct starpu_task_list sched_list;
															
@@ -38,7 +39,7 @@ static void init_dummy_sched(struct starpu_machine_topology_s *topology,
 
																 	for (workerid = 0; workerid < topology->nworkers; workerid++)
															
 
																 		starpu_worker_set_sched_condition(workerid, &sched_cond, &sched_mutex);
															
 
																-	fprintf(stderr, "Initialising Dummy scheduler\n");
															
 
																+	FPRINTF(stderr, "Initialising Dummy scheduler\n");
															
 
																 }
															
 
																 static void deinit_dummy_sched(struct starpu_machine_topology_s *topology,
															
@@ -49,7 +50,7 @@ static void deinit_dummy_sched(struct starpu_machine_topology_s *topology,
 
																 	pthread_cond_destroy(&sched_cond);
															
 
																 	pthread_mutex_destroy(&sched_mutex);
															
 
																-	fprintf(stderr, "Destroying Dummy scheduler\n");
															
 
																+	FPRINTF(stderr, "Destroying Dummy scheduler\n");
															
 
																 }
															
 
																 static int push_task_dummy(struct starpu_task *task)
															
@@ -80,7 +81,6 @@ static struct starpu_sched_policy_s dummy_sched_policy = {
 
																 	.init_sched = init_dummy_sched,
															
 
																 	.deinit_sched = deinit_dummy_sched,
															
 
																 	.push_task = push_task_dummy,
															
 
																-	.push_prio_task = NULL,
															
 
																 	.pop_task = pop_task_dummy,
															
 
																 	.post_exec_hook = NULL,
															
 
																 	.pop_every_task = NULL,
															
@@ -118,10 +118,16 @@ static starpu_codelet dummy_codelet =
 
																 int main(int argc, char **argv)
															
 
																 {
															
 
																+	int ntasks = NTASKS;
															
 
																+
															
 
																 	starpu_init(&conf);
															
 
																+#ifdef STARPU_SLOW_MACHINE
															
 
																+	ntasks /= 100;
															
 
																+#endif
															
 
																+
															
 
																 	unsigned i;
															
 
																-	for (i = 0; i < NTASKS; i++)
															
 
																+	for (i = 0; i < ntasks; i++)
															
 
																 	{
															
 
																 		struct starpu_task *task = starpu_task_create();
															
--- a/examples/socl/Makefile.am
+++ b/examples/socl/Makefile.am
@@ -0,0 +1,51 @@
 
																+# StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+#
															
 
																+# Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
															
 
																+# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																+#
															
 
																+# StarPU is free software; you can redistribute it and/or modify
															
 
																+# it under the terms of the GNU Lesser General Public License as published by
															
 
																+# the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+# your option) any later version.
															
 
																+#
															
 
																+# StarPU is distributed in the hope that it will be useful, but
															
 
																+# WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+#
															
 
																+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+
															
 
																+AM_CFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
															
 
																+LIBS = $(top_builddir)/socl/src/libsocl.la
															
 
																+AM_CPPFLAGS = -I$(top_srcdir)/socl/include/ 
															
 
																+AM_LDFLAGS = $(STARPU_CUDA_LDFLAGS) $(STARPU_OPENCL_LDFLAGS)
															
 
																+
															
 
																+
															
 
																+SOCL_EXAMPLES	=
															
 
																+TESTS		=	$(SOCL_EXAMPLES)
															
 
																+
															
 
																+check_PROGRAMS	=	$(SOCL_EXAMPLES)
															
 
																+
															
 
																+examplebindir = $(libdir)/starpu/examples/socl/
															
 
																+examplebin_PROGRAMS =
															
 
																+
															
 
																+
															
 
																+examplebin_PROGRAMS +=				\
															
 
																+	basic/basic		\
															
 
																+	mandelbrot/mandelbrot		\
															
 
																+	clinfo/clinfo
															
 
																+
															
 
																+
															
 
																+SOCL_EXAMPLES +=				\
															
 
																+	basic/basic		\
															
 
																+	mandelbrot/mandelbrot		\
															
 
																+	clinfo/clinfo
															
 
																+
															
 
																+basic_basic_SOURCES = basic/basic.c
															
 
																+clinfo_clinfo_SOURCES = clinfo/clinfo.c
															
 
																+mandelbrot_mandelbrot_SOURCES = mandelbrot/mandelbrot.c
															
 
																+
															
 
																+mandelbrot_mandelbrot_CPPFLAGS = $(AM_CFLAGS) $(AM_CPPFLAGS)
															
 
																+if HAVE_X11
															
 
																+mandelbrot_mandelbrot_CPPFLAGS += $(X_CFLAGS)
															
 
																+mandelbrot_mandelbrot_LDADD = $(X_PRE_LIBS) -lX11 $(X_LIBS) $(X_EXTRA_LIBS)
															
 
																+endif
															
--- a/examples/socl/basic/basic.c
+++ b/examples/socl/basic/basic.c
@@ -0,0 +1,211 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2010,2011 University of Bordeaux
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+
															
 
																+#include <stdio.h>
															
 
																+#include <stdlib.h>
															
 
																+#include <string.h>
															
 
																+#include <unistd.h>
															
 
																+
															
 
																+#include <CL/cl.h>
															
 
																+
															
 
																+#define error(...) do { fprintf(stderr, "Error: " __VA_ARGS__); exit(EXIT_FAILURE); } while(0)
															
 
																+#define check(err, str) do { cl_int check_err_ = (err); if (check_err_ != CL_SUCCESS) { fprintf(stderr, "OpenCL Error (%d): %s\n", check_err_, (str)); exit(EXIT_FAILURE); }} while(0) /* exit with a diagnostic unless err is CL_SUCCESS; err is evaluated exactly once */
															
 
																+
															
 
																+#ifdef UNUSED
															
 
																+#elif defined(__GNUC__)
															
 
																+# define UNUSED(x) UNUSED_ ## x __attribute__((unused))
															
 
																+#else
															
 
																+# define UNUSED(x) x
															
 
																+#endif
															
 
																+
															
 
																+#define SIZE 1024
															
 
																+#define TYPE float
															
 
																+#define REALSIZE (SIZE * sizeof(TYPE))
															
 
																+
															
 
																+const char * kernel_src = "__kernel void add(__global float*s1, __global float*s2, __global float*d) { \
															
 
																+   size_t x = get_global_id(0);\
															
 
																+   size_t y = get_global_id(1);\
															
 
																+   size_t w = get_global_size(0); \
															
 
																+   int idx = y*w+x; \
															
 
																+   d[idx] = s1[idx] + s2[idx];\
															
 
																+}";
															
 
																+
															
 
																+
															
 
																+
															
 
																+int main(int UNUSED(argc), char** UNUSED(argv)) {
															
 
																+   cl_platform_id platforms[15];
															
 
																+   cl_uint num_platforms;
															
 
																+   cl_device_id devices[15];
															
 
																+   cl_uint num_devices;
															
 
																+   cl_context context;
															
 
																+   cl_program program;
															
 
																+   cl_kernel kernel;
															
 
																+   cl_mem s1m, s2m, dm;
															
 
																+   cl_command_queue cq;
															
 
																+   cl_int err;
															
 
																+
															
 
																+   TYPE s1[SIZE],s2[SIZE],d[SIZE];
															
 
																+
															
 
																+   {
															
 
																+      int i;
															
 
																+      for (i=0; i<SIZE; i++) {
															
 
																+         s1[i] = 2.0;
															
 
																+         s2[i] = 7.0;
															
 
																+         d[i] = 98.0;
															
 
																+      }
															
 
																+   }
															
 
																+
															
 
																+   printf("Querying platform...\n");
															
 
																+   err = clGetPlatformIDs(0, NULL, &num_platforms);
															
 
																+   if (num_platforms == 0) {
															
 
																+      printf("No OpenCL platform found. If you use SOCL, this could mean StarPU wasn't configured for OpenCL. Try disabling CUDA support in StarPU (export STARPU_NCUDA=0).\n");
															
 
																+      exit(0);
															
 
																+   }
															
 
																+   err = clGetPlatformIDs(sizeof(platforms)/sizeof(cl_platform_id), platforms, NULL);
															
 
																+   check(err, "clGetPlatformIDs");
															
 
																+
															
 
																+   printf("Querying devices...\n");
															
 
																+   unsigned int platform_idx;
															
 
																+   for (platform_idx=0; platform_idx<num_platforms; platform_idx++) {
															
 
																+      err = clGetDeviceIDs(platforms[platform_idx], CL_DEVICE_TYPE_GPU, sizeof(devices)/sizeof(cl_device_id), devices, &num_devices);
															
 
																+      check(err, "clGetDeviceIDs");
															
 
																+      if (num_devices != 0)
															
 
																+         break;
															
 
																+   }
															
 
																+   if (num_devices == 0)
															
 
																+      error("No OpenCL device found\n");
															
 
																+
															
 
																+   printf("Creating context...\n");
															
 
																+   cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[platform_idx], 0};
															
 
																+   context = clCreateContext(properties, num_devices, devices, NULL, NULL, &err);
															
 
																+   check(err, "clCreateContext");
															
 
																+
															
 
																+   printf("Creating program...\n");
															
 
																+   program = clCreateProgramWithSource(context, 1, &kernel_src, NULL, &err);
															
 
																+   check(err, "clCreateProgram");
															
 
																+
															
 
																+   printf("Building program...\n");
															
 
																+   err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
															
 
																+   check(err, "clBuildProgram");
															
 
																+
															
 
																+   printf("Creating kernel...\n");
															
 
																+   kernel = clCreateKernel(program, "add", &err);
															
 
																+   check(err, "clCreateKernel");
															
 
																+
															
 
																+   printf("Creating buffers...\n");
															
 
																+   s1m = clCreateBuffer(context, CL_MEM_READ_WRITE, REALSIZE, NULL, &err);
															
 
																+   check(err, "clCreateBuffer s1");
															
 
																+   s2m = clCreateBuffer(context, CL_MEM_READ_ONLY, REALSIZE, NULL, &err);
															
 
																+   check(err, "clCreateBuffer s2");
															
 
																+   dm = clCreateBuffer(context, CL_MEM_WRITE_ONLY, REALSIZE, NULL, &err);
															
 
																+   check(err, "clCreateBuffer d");
															
 
																+
															
 
																+   printf("Creating command queue...\n");
															
 
																+   cl_event eventW1, eventW2, eventK, eventR;
															
 
																+
															
 
																+#ifdef PROFILING
															
 
																+   cq = clCreateCommandQueue(context, devices[0], CL_QUEUE_PROFILING_ENABLE, &err);
															
 
																+#else
															
 
																+   cq = clCreateCommandQueue(context, devices[0], 0, &err);
															
 
																+#endif
															
 
																+   check(err, "clCreateCommandQueue");
															
 
																+
															
 
																+   printf("Enqueueing WriteBuffers...\n");
															
 
																+   err = clEnqueueWriteBuffer(cq, s1m, CL_FALSE, 0, REALSIZE, s1, 0, NULL, &eventW1);
															
 
																+   check(err, "clEnqueueWriteBuffer s1");
															
 
																+   err = clEnqueueWriteBuffer(cq, s2m, CL_FALSE, 0, REALSIZE, s2, 0, NULL, &eventW2);
															
 
																+   check(err, "clEnqueueWriteBuffer s2");
															
 
																+
															
 
																+   printf("Setting kernel arguments...\n");
															
 
																+   err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &s1m);
															
 
																+   check(err, "clSetKernelArg 0");
															
 
																+   err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &s2m);
															
 
																+   check(err, "clSetKernelArg 1");
															
 
																+   err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &dm);
															
 
																+   check(err, "clSetKernelArg 2");
															
 
																+
															
 
																+   printf("Enqueueing NDRangeKernel...\n");
															
 
																+   size_t local[3] = {16, 1, 1};
															
 
																+   size_t global[3] = {1024, 1, 1};
															
 
																+   cl_event deps[] = {eventW1,eventW2};
															
 
																+   err = clEnqueueNDRangeKernel(cq, kernel, 3, NULL, global, local, 2, deps, &eventK);
															
 
																+   check(err, "clEnqueueNDRangeKernel");
															
 
																+
															
 
																+   printf("Enqueueing ReadBuffer...\n");
															
 
																+   err = clEnqueueReadBuffer(cq, dm, CL_FALSE, 0, REALSIZE, d, 0, NULL, &eventR);
															
 
																+   check(err, "clEnqueueReadBuffer");
															
 
																+
															
 
																+   clFinish(cq);
															
 
																+
															
 
																+   {
															
 
																+      int i;
															
 
																+      for (i=0; i<SIZE; i++) {
															
 
																+        printf("%f ", d[i]);
															
 
																+      }
															
 
																+      printf("\n");
															
 
																+   }
															
 
																+
															
 
																+#ifdef PROFILING
															
 
																+   #define DURATION(event,label) do { \
															
 
																+      cl_ulong t0,t1; \
															
 
																+      err = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &t0, NULL);\
															
 
																+      check(err, "clGetEventProfilingInfo");\
															
 
																+      err = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &t1, NULL);\
															
 
																+      check(err, "clGetEventProfilingInfo");\
															
 
																+      printf("Profiling %s: %lu nanoseconds\n", label, t1-t0);\
															
 
																+   } while (0);
															
 
																+
															
 
																+   DURATION(eventW1, "first buffer writing");
															
 
																+   DURATION(eventW2, "second buffer writing");
															
 
																+   DURATION(eventK, "kernel execution");
															
 
																+   DURATION(eventR, "result buffer reading");
															
 
																+#endif
															
 
																+
															
 
																+   
															
 
																+   printf("Releasing events...\n");
															
 
																+   err = clReleaseEvent(eventW1);
															
 
																+   err |= clReleaseEvent(eventW2);
															
 
																+   err |= clReleaseEvent(eventK);
															
 
																+   err |= clReleaseEvent(eventR);
															
 
																+   check(err, "clReleaseCommandQueue");
															
 
																+
															
 
																+   printf("Releasing command queue...\n");
															
 
																+   err = clReleaseCommandQueue(cq);
															
 
																+   check(err, "clReleaseCommandQueue");
															
 
																+
															
 
																+   printf("Releasing buffers...\n");
															
 
																+   err = clReleaseMemObject(s1m);
															
 
																+   check(err, "clReleaseMemObject s1");
															
 
																+   err = clReleaseMemObject(s2m);
															
 
																+   check(err, "clReleaseMemObject s2");
															
 
																+   err = clReleaseMemObject(dm);
															
 
																+   check(err, "clReleaseMemObject d");
															
 
																+
															
 
																+   printf("Releasing kernel...\n");
															
 
																+   err = clReleaseKernel(kernel);
															
 
																+   check(err, "clReleaseKernel");
															
 
																+
															
 
																+   printf("Releasing program...\n");
															
 
																+   err = clReleaseProgram(program);
															
 
																+   check(err, "clReleaseProgram");
															
 
																+
															
 
																+   printf("Releasing context...\n");
															
 
																+   err = clReleaseContext(context);
															
 
																+   check(err, "clReleaseContext");
															
 
																+
															
 
																+   return 0;
															
 
																+}
															
--- a/examples/socl/clinfo/clinfo.c
+++ b/examples/socl/clinfo/clinfo.c
@@ -0,0 +1,299 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2010,2011 University of Bordeaux
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+
															
 
																+#include <stdio.h>
															
 
																+#include <stdlib.h>
															
 
																+
															
 
																+#include <CL/cl.h>
															
 
																+
															
 
																+inline 
															
 
																+void 
															
 
																+checkErr(cl_int err, const char * name) {
															
 
																+    if (err != CL_SUCCESS) {
															
 
																+        fprintf(stderr, "ERROR: %s (%d)\n", name, err);
															
 
																+        exit(1);
															
 
																+    }
															
 
																+}
															
 
																+
															
 
																+int
															
 
																+main(void) {
															
 
																+   cl_int err;
															
 
																+   cl_uint num_platforms;
															
 
																+   cl_platform_id *platforms;
															
 
																+
															
 
																+   // Plaform info
															
 
																+   err = clGetPlatformIDs(0, NULL, &num_platforms);
															
 
																+   checkErr(err, "Unable to get platform count");
															
 
																+
															
 
																+   platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id)*num_platforms);
															
 
																+   err = clGetPlatformIDs(num_platforms, platforms, NULL);
															
 
																+   checkErr(err, "Unable to get platform list");
															
 
																+   
															
 
																+   
															
 
																+   // Iteratate over platforms
															
 
																+   printf("Number of platforms:\t\t\t\t %d\n", num_platforms);
															
 
																+
															
 
																+   {
															
 
																+      unsigned int i;
															
 
																+      for (i=0; i<num_platforms; i++) {
															
 
																+         char str[256];
															
 
																+         err = clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, sizeof(str), &str, NULL);
															
 
																+         checkErr(err, "clGetPlatformInfo(CL_PLATFORM_PROFILE)");
															
 
																+         printf("  Plaform Profile:\t\t\t\t %s\n", str);    
															
 
																+
															
 
																+         err= clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, sizeof(str), &str, NULL);
															
 
																+         checkErr(err, "clGetPlatformInfo(CL_PLATFORM_VERSION)");
															
 
																+         printf("  Plaform Version:\t\t\t\t %s\n", str);    
															
 
																+
															
 
																+         err = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, sizeof(str), &str, NULL);
															
 
																+         checkErr(err, "clGetPlatformInfo(CL_PLATFORM_NAME)");
															
 
																+         printf("  Plaform Name:\t\t\t\t\t %s\n", str);    
															
 
																+
															
 
																+         err = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(str), &str, NULL);
															
 
																+         checkErr(err, "clGetPlatformInfo(CL_PLATFORM_VENDOR)");
															
 
																+         printf("  Plaform Vendor:\t\t\t\t %s\n", str);    
															
 
																+
															
 
																+         err = clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, sizeof(str), &str, NULL);
															
 
																+         checkErr(err, "clGetPlatformInfo(CL_PLATFORM_EXTENSIONS)");
															
 
																+         printf("  Plaform Extensions:\t\t\t %s\n", str);    
															
 
																+      }
															
 
																+   }
															
 
																+
															
 
																+   printf("\n\n");
															
 
																+
															
 
																+   // Now Iteratate over each platform and its devices
															
 
																+   {
															
 
																+      unsigned int i;
															
 
																+      for (i=0; i<num_platforms; i++) {
															
 
																+         char str[256];
															
 
																+         cl_device_id * devices;
															
 
																+         cl_uint num_devices;
															
 
																+
															
 
																+         err = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, sizeof(str), &str, NULL);
															
 
																+         checkErr(err, "clGetPlatformInfo(CL_PLATFORM_NAME)");
															
 
																+         printf("  Plaform Name:\t\t\t\t\t %s\n", str);    
															
 
																+
															
 
																+         err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
															
 
																+         checkErr(err, "clGetDeviceIds(CL_DEVICE_TYPE_ALL)");
															
 
																+         devices = (cl_device_id*)malloc(sizeof(cl_device_id)*num_devices);
															
 
																+         
															
 
																+         err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
															
 
																+         checkErr(err, "clGetDeviceIds(CL_DEVICE_TYPE_ALL)");
															
 
																+
															
 
																+         printf("  Number of devices:\t\t\t\t %d\n", num_devices);
															
 
																+         {
															
 
																+            unsigned int j;
															
 
																+            for (j=0; j<num_devices; j++) {
															
 
																+               cl_device_type dev_type;
															
 
																+               printf("\n  DEVICE %d\n", j);
															
 
																+               
															
 
																+               err = clGetDeviceInfo(devices[j], CL_DEVICE_TYPE, sizeof(dev_type), &dev_type, NULL);
															
 
																+               checkErr(err, "clGetDeviceInfo(CL_DEVICE_TYPE)");
															
 
																+
															
 
																+               printf("  Device Type:\t\t\t\t\t ");
															
 
																+               if (dev_type & CL_DEVICE_TYPE_ACCELERATOR)
															
 
																+                  printf("CL_DEVICE_TYPE_ACCELERATOR ");
															
 
																+               else if (dev_type & CL_DEVICE_TYPE_CPU)
															
 
																+                  printf("CL_DEVICE_TYPE_CPU ");
															
 
																+               else if (dev_type & CL_DEVICE_TYPE_GPU)
															
 
																+                  printf("CL_DEVICE_TYPE_GPU ");
															
 
																+               else if (dev_type & CL_DEVICE_TYPE_DEFAULT)
															
 
																+                  printf("CL_DEVICE_TYPE_DEFAULT ");
															
 
																+
															
 
																+               printf("\n");
															
 
																+
															
 
																+               {
															
 
																+                  cl_uint vendor_id;
															
 
																+                  err = clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR_ID, sizeof(vendor_id), &vendor_id, NULL);
															
 
																+                  checkErr(err, "clGetDeviceInfo(CL_DEVICE_VENDOR_ID)");
															
 
																+                  printf("  Device ID:\t\t\t\t\t %d\n", vendor_id); 
															
 
																+               }
															
 
																+               {
															
 
																+                  cl_uint units;
															
 
																+                  err = clGetDeviceInfo(devices[j], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(units), &units, NULL);
															
 
																+                  checkErr(err, "clGetDeviceInfo(CL_DEVICE_MAX_COMPUTE_UNITS)");
															
 
																+                  printf("  Max compute units:\t\t\t\t %d\n", units); 
															
 
																+               }
															
 
																+
															
 
																+               {
															
 
																+                  cl_uint dims;
															
 
																+                  size_t *sizes;
															
 
																+                  err = clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(dims), &dims, NULL);
															
 
																+                  checkErr(err, "clGetDeviceInfo(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS)");
															
 
																+                  printf("  Max work item dimensions:\t\t\t %d\n", dims); 
															
 
																+
															
 
																+                  sizes = (size_t*)malloc(dims * sizeof(size_t));
															
 
																+                  err = clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*dims, sizes, NULL);
															
 
																+                  checkErr(err, "clGetDeviceInfo(CL_DEVICE_MAX_WORK_ITEM_SIZES)");
															
 
																+                  printf("  Max work item dimensions:\t\t\t %d\n", dims); 
															
 
																+
															
 
																+                  {
															
 
																+                     unsigned int k;
															
 
																+                     printf("    Max work items:\t\t\t\t (");
															
 
																+                     for (k=0; k<dims; k++) {
															
 
																+                        printf("%u", (unsigned int)sizes[k]);
															
 
																+                        if (k != dims-1)
															
 
																+                           printf(",");
															
 
																+                     }
															
 
																+                     printf(")\n");
															
 
																+                  }
															
 
																+               }
															
 
																+
															
 
																+#define GET_SIZET(CL_D,str) { \
															
 
																+   size_t val; \
															
 
																+   err = clGetDeviceInfo(devices[j], CL_D, sizeof(val), &val, NULL); \
															
 
																+   checkErr(err, "clGetDeviceInfo(" #CL_D ")"); \
															
 
																+   printf(str, (unsigned int)val); \
															
 
																+}
															
 
																+
															
 
																+#define GET_STRING(CL_D,str,size) { \
															
 
																+   char val[size]; \
															
 
																+   err = clGetDeviceInfo(devices[j], CL_D, sizeof(val), &val, NULL); \
															
 
																+   checkErr(err, "clGetDeviceInfo(" #CL_D ")"); \
															
 
																+   printf(str, val); \
															
 
																+}
															
 
																+
															
 
																+#define GET_UINT(CL_D,str) { \
															
 
																+   cl_uint val; \
															
 
																+   err = clGetDeviceInfo(devices[j], CL_D, sizeof(val), &val, NULL); \
															
 
																+   checkErr(err, "clGetDeviceInfo(" #CL_D ")"); \
															
 
																+   printf(str, val); \
															
 
																+}
															
 
																+
															
 
																+#define GET_ULONG(CL_D,str) { \
															
 
																+   cl_ulong val; \
															
 
																+   err = clGetDeviceInfo(devices[j], CL_D, sizeof(val), &val, NULL); \
															
 
																+   checkErr(err, "clGetDeviceInfo(" #CL_D ")"); \
															
 
																+   printf(str, val); \
															
 
																+}
															
 
																+
															
 
																+#define GET_BOOL(CL_D,str) { \
															
 
																+   cl_bool val; \
															
 
																+   err = clGetDeviceInfo(devices[j], CL_D, sizeof(val), &val, NULL); \
															
 
																+   checkErr(err, "clGetDeviceInfo(" #CL_D ")"); \
															
 
																+   printf(str, (val == CL_TRUE ? "Yes" : "No")); \
															
 
																+}
															
 
																+
															
 
																+#define GET_BOOL_CUSTOM(CL_D,str,t,f) { \
															
 
																+   cl_bool val; \
															
 
																+   err = clGetDeviceInfo(devices[j], CL_D, sizeof(val), &val, NULL); \
															
 
																+   checkErr(err, "clGetDeviceInfo(" #CL_D ")"); \
															
 
																+   printf(str, (val == CL_TRUE ? t : f)); \
															
 
																+}
															
 
																+
															
 
																+#define GET_BITSET_AND(TYPE,CL_D,test,str) { \
															
 
																+   TYPE val; \
															
 
																+   err = clGetDeviceInfo(devices[j], CL_D, sizeof(val), &val, NULL); \
															
 
																+   checkErr(err, "clGetDeviceInfo(" #CL_D ")"); \
															
 
																+   printf(str, ((val & test) == CL_TRUE ? "Yes" : "No")); \
															
 
																+}
															
 
																+      
															
 
																+               GET_SIZET(CL_DEVICE_MAX_WORK_GROUP_SIZE, "  Max work group size:\t\t\t\t %u\n")
															
 
																+               
															
 
																+               GET_UINT(CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, "  Preferred vector width char:\t\t\t %u\n")
															
 
																+               GET_UINT(CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, "  Preferred vector width short:\t\t\t %u\n")
															
 
																+               GET_UINT(CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, "  Preferred vector width int:\t\t\t %u\n")
															
 
																+               GET_UINT(CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, "  Preferred vector width long:\t\t\t %u\n")
															
 
																+               GET_UINT(CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, "  Preferred vector width float:\t\t\t %u\n")
															
 
																+               GET_UINT(CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, "  Preferred vector width double:\t\t %u\n")
															
 
																+               GET_UINT(CL_DEVICE_MAX_CLOCK_FREQUENCY, "  Max clock frequency:\t\t\t\t %uMHz\n")
															
 
																+               GET_UINT(CL_DEVICE_ADDRESS_BITS, "  Address bits:\t\t\t\t\t %ubits\n")
															
 
																+               GET_ULONG(CL_DEVICE_MAX_MEM_ALLOC_SIZE, "  Max memory allocation:\t\t\t %lu bytes\n")
															
 
																+
															
 
																+               GET_BOOL(CL_DEVICE_IMAGE_SUPPORT, "  Image support:\t\t\t\t %s\n")
															
 
																+
															
 
																+               GET_SIZET(CL_DEVICE_MAX_PARAMETER_SIZE, "  Max size of kernel argument:\t\t\t %u\n")
															
 
																+               GET_UINT(CL_DEVICE_MEM_BASE_ADDR_ALIGN, "  Alignment of base addres:\t\t\t %u bits\n")
															
 
																+               GET_UINT(CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, "  Minimum alignment for any datatype:\t\t %u bytes\n")
															
 
																+
															
 
																+               printf("  Single precision floating point capability\n");
															
 
																+               GET_BITSET_AND(cl_device_fp_config,CL_DEVICE_SINGLE_FP_CONFIG, CL_FP_DENORM, "    Denorms:\t\t\t\t\t %s\n")
															
 
																+               GET_BITSET_AND(cl_device_fp_config,CL_DEVICE_SINGLE_FP_CONFIG, CL_FP_INF_NAN, "    Quiet NaNs:\t\t\t\t\t %s\n")
															
 
																+               GET_BITSET_AND(cl_device_fp_config,CL_DEVICE_SINGLE_FP_CONFIG, CL_FP_ROUND_TO_NEAREST, "    Round to nearest even:\t\t\t %s\n")
															
 
																+               GET_BITSET_AND(cl_device_fp_config,CL_DEVICE_SINGLE_FP_CONFIG, CL_FP_ROUND_TO_ZERO, "    Round to zero:\t\t\t\t %s\n")
															
 
																+               GET_BITSET_AND(cl_device_fp_config,CL_DEVICE_SINGLE_FP_CONFIG, CL_FP_ROUND_TO_INF, "    Round to +ve and infinity:\t\t\t %s\n")
															
 
																+               GET_BITSET_AND(cl_device_fp_config,CL_DEVICE_SINGLE_FP_CONFIG, CL_FP_FMA, "    IEEE754-2008 fused multiply-add:\t\t %s\n")
															
 
																+
															
 
																+               {
															
 
																+                  cl_device_mem_cache_type cache;
															
 
																+                  err = clGetDeviceInfo(devices[j], CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, sizeof(cache), &cache, NULL);
															
 
																+                  checkErr(err, "clGetDeviceInfo(CL_DEVICE_GLOBAL_MEM_CACHE_TYPE)");
															
 
																+                  printf("  Cache type:\t\t\t\t\t ");
															
 
																+                  switch (cache) {
															
 
																+                     case CL_NONE:
															
 
																+                        printf("None\n");
															
 
																+                        break;
															
 
																+                     case CL_READ_ONLY_CACHE:
															
 
																+                        printf("Read only\n");
															
 
																+                        break;
															
 
																+                     case CL_READ_WRITE_CACHE:
															
 
																+                        printf("Read/Write\n");
															
 
																+                        break;
															
 
																+                  }
															
 
																+               }
															
 
																+
															
 
																+               GET_UINT(CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, "  Cache line size:\t\t\t\t %u bytes\n")
															
 
																+               GET_ULONG(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, "  Cache size:\t\t\t\t\t %lu bytes\n")
															
 
																+               GET_ULONG(CL_DEVICE_GLOBAL_MEM_SIZE, "  Global memory size:\t\t\t\t %lu bytes\n")
															
 
																+               GET_ULONG(CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, "  Constant buffer size:\t\t\t\t %lu bytes\n")
															
 
																+               GET_UINT(CL_DEVICE_MAX_CONSTANT_ARGS, "  Max number of constant args:\t\t\t %u\n")
															
 
																+
															
 
																+               {
															
 
																+                  cl_device_local_mem_type cache;
															
 
																+                  err = clGetDeviceInfo(devices[j], CL_DEVICE_LOCAL_MEM_TYPE, sizeof(cache), &cache, NULL);
															
 
																+                  checkErr(err, "clGetDeviceInfo(CL_DEVICE_LOCAL_MEM_TYPE)");
															
 
																+                  printf("  Local memory type:\t\t\t\t ");
															
 
																+                  switch (cache) {
															
 
																+                     case CL_LOCAL:
															
 
																+                        printf("Local\n");
															
 
																+                        break;
															
 
																+                     case CL_GLOBAL:
															
 
																+                        printf("Global\n");
															
 
																+                        break;
															
 
																+                  }
															
 
																+               }
															
 
																+
															
 
																+               GET_ULONG(CL_DEVICE_LOCAL_MEM_SIZE, "  Local memory size:\t\t\t\t %lu bytes\n")
															
 
																+               GET_SIZET(CL_DEVICE_PROFILING_TIMER_RESOLUTION, "  Profiling timer resolution:\t\t\t %u\n")
															
 
																+               GET_BOOL_CUSTOM(CL_DEVICE_ENDIAN_LITTLE, "  Device endianess:\t\t\t\t %s\n", "Little", "Big")
															
 
																+               GET_BOOL(CL_DEVICE_AVAILABLE, "  Available:\t\t\t\t\t %s\n")
															
 
																+               GET_BOOL(CL_DEVICE_COMPILER_AVAILABLE, "  Compiler available:\t\t\t\t %s\n")
															
 
																+
															
 
																+               printf("  Execution capabilities:\t\t\t\t \n");
															
 
																+               GET_BITSET_AND(cl_device_exec_capabilities, CL_DEVICE_EXECUTION_CAPABILITIES, CL_EXEC_KERNEL, "  Execute OpenCL kernels:\t\t\t %s\n")
															
 
																+               GET_BITSET_AND(cl_device_exec_capabilities, CL_DEVICE_EXECUTION_CAPABILITIES, CL_EXEC_NATIVE_KERNEL, "  Execute native kernels:\t\t\t %s\n")
															
 
																+
															
 
																+               printf("  Queue properties:\t\t\t\t\n ");
															
 
																+               GET_BITSET_AND(cl_command_queue_properties, CL_DEVICE_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, "   Out-of-Order:\t\t\t\t %s\n")
															
 
																+               GET_BITSET_AND(cl_command_queue_properties, CL_DEVICE_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, "    Profiling:\t\t\t\t\t %s\n")
															
 
																+
															
 
																+
															
 
																+               GET_STRING(CL_DEVICE_NAME, "  Name:\t\t\t\t\t\t %s\n", 256);
															
 
																+               GET_STRING(CL_DEVICE_VENDOR, "  Vendor:\t\t\t\t\t %s\n", 256);
															
 
																+               GET_STRING(CL_DRIVER_VERSION, "  Driver version:\t\t\t\t %s\n", 10);
															
 
																+               GET_STRING(CL_DEVICE_PROFILE, "  Profile:\t\t\t\t\t %s\n", 30);
															
 
																+               GET_STRING(CL_DEVICE_VERSION, "  Version:\t\t\t\t\t %s\n", 50);
															
 
																+               GET_STRING(CL_DEVICE_EXTENSIONS, "  Extensions:\t\t\t\t\t %s\n", 4096);
															
 
																+            
															
 
																+               printf("\n");
															
 
																+            }
															
 
																+         }
															
 
																+      }
															
 
																+   }
															
 
																+
															
 
																+   return 0;
															
 
																+}
															
--- a/examples/socl/mandelbrot/mandelbrot.c
+++ b/examples/socl/mandelbrot/mandelbrot.c