14 anni fa · 03b6b6a55b
--- a/.dir-locals.el
+++ b/.dir-locals.el
@@ -0,0 +1,4 @@
 
				+;; Hey Emacs, use the ugly style!
			
 
				+
			
 
				+((c-mode . ((c-file-style . "linux")
			
 
				+	    (indent-tabs-mode . t))))
			
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,187 @@
 
				+/configure
			
 
				+/config.log
			
 
				+/config.status
			
 
				+/autom4te.cache
			
 
				+/libtool
			
 
				+/libstarpu.pc
			
 
				+/aclocal.m4
			
 
				+/build-aux
			
 
				+/GPATH
			
 
				+/GRTAGS
			
 
				+/GTAGS
			
 
				+/config.cache
			
 
				+/doc/starpu.info
			
 
				+*~
			
 
				+,*
			
 
				+Makefile
			
 
				+Makefile.in
			
 
				+.libs
			
 
				+.deps
			
 
				+*.o
			
 
				+*.lo
			
 
				+*.la
			
 
				+*.swp
			
 
				+.dirstamp
			
 
				+stamp-h[0-9]*
			
 
				+starpu.log
			
 
				+/gcc-plugin/src/starpu-gcc-config.h
			
 
				+/gcc-plugin/tests/*.c.[0-9]*.*
			
 
				+/tests/datawizard/handle_to_pointer
			
 
				+/tests/datawizard/data_lookup
			
 
				+/doc/stamp-vti
			
 
				+/doc/version.texi
			
 
				+/examples/basic_examples/block
			
 
				+/examples/basic_examples/hello_world
			
 
				+/examples/basic_examples/mult
			
 
				+/examples/basic_examples/variable
			
 
				+/examples/basic_examples/vector_scal
			
 
				+/examples/callback/callback
			
 
				+/examples/filters/fblock
			
 
				+/examples/filters/fmatrix
			
 
				+/examples/filters/fvector
			
 
				+/examples/incrementer/incrementer
			
 
				+/examples/mandelbrot/mandelbrot
			
 
				+/examples/matvecmult/matvecmult
			
 
				+/examples/pi/pi
			
 
				+/examples/pi/pi_redux
			
 
				+/examples/ppm_downscaler/ppm_downscaler
			
 
				+/examples/ppm_downscaler/yuv_downscaler
			
 
				+/examples/profiling/profiling
			
 
				+/examples/reductions/dot_product
			
 
				+/examples/reductions/minmax_reduction
			
 
				+/examples/scheduler/dummy_sched
			
 
				+/examples/spmv/dw_spmv
			
 
				+/examples/spmv/spmv
			
 
				+/examples/stencil/stencil
			
 
				+/examples/tag_example/tag_example
			
 
				+/examples/tag_example/tag_example2
			
 
				+/examples/tag_example/tag_example3
			
 
				+/examples/tag_example/tag_restartable
			
 
				+/mpi/examples/stencil/stencil5
			
 
				+/mpi/tests/block_interface
			
 
				+/mpi/tests/block_interface_pinned
			
 
				+/mpi/tests/insert_task
			
 
				+/mpi/tests/insert_task_block
			
 
				+/mpi/tests/insert_task_cache
			
 
				+/mpi/tests/insert_task_owner
			
 
				+/mpi/tests/insert_task_owner2
			
 
				+/mpi/tests/mpi_detached_tag
			
 
				+/mpi/tests/mpi_irecv
			
 
				+/mpi/tests/mpi_irecv_detached
			
 
				+/mpi/tests/mpi_isend
			
 
				+/mpi/tests/mpi_isend_detached
			
 
				+/mpi/tests/mpi_test
			
 
				+/mpi/tests/multiple_send
			
 
				+/mpi/tests/pingpong
			
 
				+/mpi/tests/ring
			
 
				+/mpi/tests/ring_async
			
 
				+/mpi/tests/ring_async_implicit
			
 
				+/tests/core/declare_deps_after_submission
			
 
				+/tests/core/declare_deps_after_submission_synchronous
			
 
				+/tests/core/declare_deps_in_callback
			
 
				+/tests/core/empty_task
			
 
				+/tests/core/empty_task_chain
			
 
				+/tests/core/empty_task_sync_point
			
 
				+/tests/core/empty_task_sync_point_tasks
			
 
				+/tests/core/execute_on_a_specific_worker
			
 
				+/tests/core/get_current_task
			
 
				+/tests/core/insert_task
			
 
				+/tests/core/multithreaded
			
 
				+/tests/core/multithreaded_init
			
 
				+/tests/core/regenerate
			
 
				+/tests/core/restart
			
 
				+/tests/core/starpu_task_wait
			
 
				+/tests/core/starpu_task_wait_for_all
			
 
				+/tests/core/static_restartable
			
 
				+/tests/core/static_restartable_tag
			
 
				+/tests/core/static_restartable_using_initializer
			
 
				+/tests/core/subgraph_repeat
			
 
				+/tests/core/subgraph_repeat_regenerate
			
 
				+/tests/core/tag_wait_api
			
 
				+/tests/core/task_wait_api
			
 
				+/tests/core/wait_all_regenerable_tasks
			
 
				+/tests/datawizard/acquire_cb
			
 
				+/tests/datawizard/acquire_release
			
 
				+/tests/datawizard/acquire_release2
			
 
				+/tests/datawizard/critical_section_with_void_interface
			
 
				+/tests/datawizard/data_implicit_deps
			
 
				+/tests/datawizard/data_invalidation
			
 
				+/tests/datawizard/dining_philosophers
			
 
				+/tests/datawizard/dsm_stress
			
 
				+/tests/datawizard/increment_redux
			
 
				+/tests/datawizard/increment_redux_v2
			
 
				+/tests/datawizard/lazy_allocation.c
			
 
				+/tests/datawizard/manual_reduction
			
 
				+/tests/datawizard/mpi_like
			
 
				+/tests/datawizard/mpi_like_async
			
 
				+/tests/datawizard/readers_and_writers
			
 
				+/tests/datawizard/reclaim
			
 
				+/tests/datawizard/scratch
			
 
				+/tests/datawizard/sync_and_notify_data
			
 
				+/tests/datawizard/sync_and_notify_data_implicit
			
 
				+/tests/datawizard/sync_with_data_with_mem
			
 
				+/tests/datawizard/sync_with_data_with_mem_non_blocking
			
 
				+/tests/datawizard/sync_with_data_with_mem_non_blocking_implicit
			
 
				+/tests/datawizard/unpartition
			
 
				+/tests/datawizard/user_interaction_implicit
			
 
				+/tests/datawizard/write_only_tmp_buffer
			
 
				+/tests/errorcheck/invalid_blocking_calls
			
 
				+/tests/errorcheck/invalid_tasks
			
 
				+/tests/errorcheck/starpu_init_noworker
			
 
				+/tests/helper/cublas_init
			
 
				+/tests/helper/execute_on_all
			
 
				+/tests/helper/pinned_memory
			
 
				+/tests/helper/starpu_create_sync_task
			
 
				+/tests/helper/starpu_data_cpy
			
 
				+/tests/microbenchs/async_tasks_overhead
			
 
				+/tests/microbenchs/display_structures_size
			
 
				+/tests/microbenchs/local_pingpong
			
 
				+/tests/microbenchs/prefetch_data_on_node
			
 
				+/tests/microbenchs/redundant_buffer
			
 
				+/tests/microbenchs/sync_tasks_overhead
			
 
				+/tests/microbenchs/tasks_overhead
			
 
				+/tests/overlap/overlap
			
 
				+/tests/parallel_tasks/explicit_combined_worker
			
 
				+/tests/parallel_tasks/parallel_kernels
			
 
				+/tests/parallel_tasks/parallel_kernels_spmd
			
 
				+/tests/parallel_tasks/spmd_pgreedy
			
 
				+/tests/perfmodels/non_linear_regression_based
			
 
				+/tests/perfmodels/regression_based
			
 
				+/tools/cbc2paje
			
 
				+/tools/lp2paje
			
 
				+/tools/starpu_calibrate_bus
			
 
				+/tools/starpu_machine_display
			
 
				+/tools/starpu_perfmodel_display
			
 
				+/tools/starpu_regression_display
			
 
				+/gcc-plugin/tests/scalar-tasks
			
 
				+/gcc-plugin/tests/pointers
			
 
				+/tests/datawizard/lazy_allocation
			
 
				+/gcc-plugin/tests/pointer-tasks
			
 
				+/gcc-plugin/tests/*.s
			
 
				+/gcc-plugin/tests/base
			
 
				+/gcc-plugin/tests/core
			
 
				+/mpi/tests/insert_task_owner_data
			
 
				+/mpi/examples/scatter_gather/mpi_scatter_gather
			
 
				+/examples/top/hello_world_top
			
 
				+/doc/starpu.aux
			
 
				+/doc/starpu.cp
			
 
				+/doc/starpu.cps
			
 
				+/doc/starpu.fn
			
 
				+/doc/starpu.fns
			
 
				+/doc/starpu.html
			
 
				+/doc/starpu.ky
			
 
				+/doc/starpu.pdf
			
 
				+/doc/starpu.pg
			
 
				+/doc/starpu.toc
			
 
				+/doc/starpu.tp
			
 
				+/doc/starpu.tps
			
 
				+/doc/starpu.vr
			
 
				+/gcc-plugin/tests/register
			
 
				+/tests/datawizard/acquire_cb_insert
			
 
				+/tools/starpu_perfmodel_plot
			
 
				+/gcc-plugin/tests/run-test
			
 
				+/gcc-plugin/tests/register-errors
			
 
				+/gcc-plugin/tests/acquire
			
 
				+/gcc-plugin/tests/unregister
			
 
				+/gcc-plugin/tests/lib-user
			
 
				+/gcc-plugin/examples/matrix-mult
			
--- a/AUTHORS
+++ b/AUTHORS
@@ -6,3 +6,7 @@ Sylvain Henry <sylvain.henry@inria.fr>
 
				 Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				 François Tessier <francois.tessier@inria.fr>
			
 
				 Samuel Thibault <samuel.thibault@labri.fr>
			
 
				+William Braik <wbraik@gmail.com>
			
 
				+Yann Courtois <yann.courtois33@gmail.com>
			
 
				+Jean-Marie Couteyen <jm.couteyen@gmail.com>
			
 
				+Anthony Roy <theanthony33@gmail.com>
			
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,7 +1,7 @@
 
				-StarPU 0.5 (svn revision ????)
			
 
				+StarPU 0.9 (svn revision 3721)
			
 
				 ==============================================
			
 
				-The yet-more-stuff release
			
 
				-  
			
 
				+The extensions release
			
 
				+
			
 
				   * Provide the STARPU_REDUX data access mode
			
 
				   * Externalize the scheduler API.
			
 
				   * Add theoretical bound computation
			
@@ -19,11 +19,12 @@ The yet-more-stuff release
 
				   * Add mandelbrot OpenCL example
			
 
				   * Add cg example
			
 
				   * Add stencil MPI example
			
 
				+  * Initial support for CUDA4
			
 
				 
			
 
				 StarPU 0.4 (svn revision 2535)
			
 
				 ==============================================
			
 
				 The API strengthening release
			
 
				-  
			
 
				+
			
 
				   * Major API improvements
			
 
				     - Provide the STARPU_SCRATCH data access mode
			
 
				     - Rework data filter interface
			
@@ -41,7 +42,7 @@ The API strengthening release
 
				   * Provide a library to help accelerating MPI applications
			
 
				   * Improve data transfers overhead prediction
			
 
				     - Transparently benchmark buses to generate performance models
			
 
				-    - Bind accelerator-controlling threads with respect to NUMA locality 
			
 
				+    - Bind accelerator-controlling threads with respect to NUMA locality
			
 
				   * Improve StarPU's portability
			
 
				     - Add OpenCL support
			
 
				     - Add support for Windows
			
@@ -63,7 +64,7 @@ The asynchronous heterogeneous multi-accelerator release
 
				     - All data transfers use data requests now
			
 
				     - Implement asynchronous data transfers
			
 
				     - Implement prefetch mechanism
			
 
				-    - Chain data requests to support GPU->RAM->GPU transfers 
			
 
				+    - Chain data requests to support GPU->RAM->GPU transfers
			
 
				   * Make it possible to bypass the scheduler and to assign a task to a specific
			
 
				     worker
			
 
				   * Support restartable tasks to reinstanciate dependencies task graphs
			
--- a/Makefile.am
+++ b/Makefile.am
@@ -1,6 +1,6 @@
 
				 # StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				 #
			
 
				-# Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				+# Copyright (C) 2009-2011  Université de Bordeaux 1
			
 
				 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				 #
			
 
				 # StarPU is free software; you can redistribute it and/or modify
			
@@ -21,8 +21,22 @@ SUBDIRS = src
 
				 if USE_MPI
			
 
				 SUBDIRS += mpi
			
 
				 endif
			
 
				+
			
 
				+if BUILD_SOCL
			
 
				+SUBDIRS += socl
			
 
				+endif
			
 
				+
			
 
				 SUBDIRS += tools examples tests doc
			
 
				 
			
 
				+if COND_OPT
			
 
				+SUBDIRS += tests/opt examples/opt
			
 
				+endif
			
 
				+
			
 
				+
			
 
				+if BUILD_GCC_PLUGIN
			
 
				+SUBDIRS += gcc-plugin
			
 
				+endif
			
 
				+
			
 
				 pkgconfigdir = $(libdir)/pkgconfig
			
 
				 pkgconfig_DATA = libstarpu.pc
			
 
				 
			
@@ -42,10 +56,24 @@ include_HEADERS = 				\
 
				 	include/starpu_expert.h			\
			
 
				 	include/starpu_profiling.h		\
			
 
				 	include/starpu_bound.h			\
			
 
				-	include/starpu_scheduler.h
			
 
				+	include/starpu_scheduler.h		\
			
 
				+	include/starpu_top.h
			
 
				+
			
 
				+if BUILD_STARPU_TOP
			
 
				+all-local:
			
 
				+	cd starpu-top && $(QMAKE) && $(MAKE)
			
 
				+clean-local:
			
 
				+	cd starpu-top && $(MAKE) clean
			
 
				+# TODO: resources
			
 
				+install-exec-local:
			
 
				+	$(MKDIR_P) $(DESTDIR)$(bindir)
			
 
				+	$(INSTALL_STRIP_PROGRAM) starpu-top/StarPU-Top $(DESTDIR)$(bindir)
			
 
				+endif
			
 
				 
			
 
				+if STARPU_HAVE_WINDOWS
			
 
				 txtdir = ${prefix}
			
 
				+else
			
 
				+txtdir = ${docdir}
			
 
				+endif
			
 
				 txt_DATA = AUTHORS COPYING.LGPL README
			
 
				 EXTRA_DIST = AUTHORS COPYING.LGPL README
			
 
				-
			
 
				-
			
--- a/README
+++ b/README
@@ -59,6 +59,12 @@ advantage of their specificities in a portable fashion.
 
				    units according to the machine topology. For more details on hwloc, see
			
 
				    http://www.open-mpi.org/projects/hwloc/ .
			
 
				 
			
 
				+ * To build the StarPU-Top tool the following are also required:
			
 
				+   * libqt4 >= 4.7
			
 
				+   * libqt4-network
			
 
				+   * libqt4-opengl
			
 
				+   * libqt4-sql
			
 
				+
			
 
				 ++=====================++
			
 
				 || III. Getting StarPU ||
			
 
				 ++=====================++
			
@@ -97,61 +103,42 @@ we provide MinGW-built binaries.  The build process produces libstarpu.dll,
 
				 libstarpu.def, and libstarpu.lib, which should be enough to use it from e.g.
			
 
				 Microsoft Visual Studio.
			
 
				 
			
 
				-A few details need to be fixed when building StarPU on windows:
			
 
				-
			
 
				-- To get a .def file built, make sure that MSVC's lib.exe tool is in PATH.
			
 
				-
			
 
				-- Update the video drivers to the latest stable release available for your
			
 
				-  hardware. Older ATI drivers (< 2.3) contain bugs that cause OpenCL support in
			
 
				-  StarPU to hang or exhibit incorrect behaviour.
			
 
				+Update the video drivers to the latest stable release available for your
			
 
				+hardware. Old ATI drivers (< 2.3) contain bugs that cause OpenCL support in
			
 
				+StarPU to hang or exhibit incorrect behaviour.
			
 
				 
			
 
				-- c:\cuda\include\host_defines.h has a bogus CUDARTAPI definition which makes
			
 
				-  linking fail completely. Replace the first occurence of
			
 
				+For details on the Windows build process, see the README.dev file in the
			
 
				+subversion tree.
			
 
				 
			
 
				-    #define CUDARTAPI
			
 
				-    
			
 
				-  with
			
 
				-    
			
 
				-    #ifdef _WIN32
			
 
				-    #define CUDARTAPI __stdcall
			
 
				-    #else
			
 
				-    #define CUDARTAPI
			
 
				-    #endif
			
 
				-
			
 
				-  While at it, you can also comment the __cdecl definition to avoid spurious
			
 
				-  warnings.
			
 
				-
			
 
				-- If you have a non-english version of windows, use
			
 
				+++==================++
			
 
				+|| V. Documentation ||
			
 
				+++==================++
			
 
				 
			
 
				-    export LANG=C
			
 
				+Texinfo documentation is available in doc/ . If LaTeX is available on the
			
 
				+machine, a pdf can be generated by running
			
 
				 
			
 
				-  else libtool has troubles parsing the translated output of the toolchain.
			
 
				+  $ make -C doc pdf
			
 
				 
			
 
				-- libtool is not able to find the libraries automatically, you need to make some
			
 
				-  copies:
			
 
				+If makeinfo is available on the machine, html pages can be generated by running
			
 
				 
			
 
				-    copy c:\cuda\lib\cuda.lib c:\cuda\lib\libcuda.lib
			
 
				-    copy c:\cuda\lib\cudart.lib c:\cuda\lib\libcudart.lib
			
 
				-    copy c:\cuda\lib\cublas.lib c:\cuda\lib\libcublas.lib
			
 
				-    copy c:\cuda\lib\cufft.lib c:\cuda\lib\libcufft.lib
			
 
				-    copy c:\cuda\lib\OpenCL.lib c:\cuda\lib\libOpenCL.lib
			
 
				+  $ make -C doc html
			
 
				 
			
 
				-++===========++
			
 
				-|| V. Trying ||
			
 
				-++===========++
			
 
				+++============++
			
 
				+|| VI. Trying ||
			
 
				+++============++
			
 
				 
			
 
				 Some examples ready to run are installed into $prefix/lib/starpu/{examples,mpi}
			
 
				 
			
 
				-++=============++
			
 
				-|| VI. Upgrade ||
			
 
				-++=============++
			
 
				+++==============++
			
 
				+|| VII. Upgrade ||
			
 
				+++==============++
			
 
				 
			
 
				 To upgrade your source code from older version (there were quite a few
			
 
				 renamings), use the tools/rename.sh script
			
 
				 
			
 
				-++==============++
			
 
				-|| VII. Contact ||
			
 
				-++==============++
			
 
				+++===============++
			
 
				+|| VIII. Contact ||
			
 
				+++===============++
			
 
				 
			
 
				 For any questions regarding StarPU, please contact the starpu-devel
			
 
				 mailing-list at starpu-devel@lists.gforge.inria.fr .
			
--- a/README.dev
+++ b/README.dev
@@ -0,0 +1,169 @@
 
				+Installing StarPU on windows
			
 
				+----------------------------
			
 
				+
			
 
				+If you are building from a tarball downloaded from the website, you can skip the
			
 
				+cygwin part.
			
 
				+
			
 
				+1. Install cygwin
			
 
				+
			
 
				+http://cygwin.com/install.html
			
 
				+
			
 
				+Make sure the following packages are available:
			
 
				+- (Devel)/subversion
			
 
				+- (Devel)/libtool
			
 
				+- (Devel)/gcc
			
 
				+- (Devel)/make
			
 
				+- your favorite editor (vi, emacs, ...)
			
 
				+- (Devel)/gdb
			
 
				+- (Archive)/zip
			
 
				+- (Devel)/pkg-config
			
 
				+
			
 
				+2. Install mingw
			
 
				+
			
 
				+http://sourceforge.net/projects/mingw/
			
 
				+
			
 
				+3. Install hwloc (not mandatory)
			
 
				+
			
 
				+http://www.open-mpi.org/projects/hwloc
			
 
				+
			
 
				+4. Install Microsoft Visual C++ Studio Express
			
 
				+
			
 
				+   http://www.microsoft.com/express/Downloads
			
 
				+
			
 
				+   Add in your path the following directories.
			
 
				+   (adjusting where necessary for the Installation location according to VC
			
 
				+    version and on 64 and 32bit Windows versions)
			
 
				+
			
 
				+   For example, on Cygwin with Visual C++ 2010:
			
 
				+
			
 
				+   export PATH="/cygdrive/c/Program Files (x86)/Microsoft Visual Studio 10.0/Common7/IDE":$PATH
			
 
				+   export PATH="/cygdrive/c/Program Files (x86)/Microsoft Visual Studio 10.0/VC/bin":$PATH
			
 
				+
			
 
				+   For example, on MinGW with Visual C++ 2010:
			
 
				+
			
 
				+   export PATH="/c/Program Files (x86)/Microsoft Visual Studio 10.0/Common7/IDE":$PATH
			
 
				+   export PATH="/c/Program Files (x86)/Microsoft Visual Studio 10.0/VC/bin":$PATH
			
 
				+
			
 
				+   Try to call <lib.exe> and <link.exe> without any option to make sure these
			
 
				+   dump their help output, else no .def or .lib file will be produced.
			
 
				+
			
 
				+5. Install GPU Drivers (not mandatory)
			
 
				+
			
 
				+  5.1 Install Cuda
			
 
				+
			
 
				+      http://developer.nvidia.com/object/cuda_3_2_downloads.html
			
 
				+
			
 
				+      You need to install at least the CUDA toolkit.
			
 
				+
			
 
				+      libtool is not able to find the libraries automatically, you
			
 
				+      need to make some copies:
			
 
				+
			
 
				+      copy c:\cuda\lib\cuda.lib c:\cuda\lib\libcuda.lib
			
 
				+      copy c:\cuda\lib\cudart.lib c:\cuda\lib\libcudart.lib
			
 
				+      copy c:\cuda\lib\cublas.lib c:\cuda\lib\libcublas.lib
			
 
				+      copy c:\cuda\lib\cufft.lib c:\cuda\lib\libcufft.lib
			
 
				+      copy c:\cuda\lib\OpenCL.lib c:\cuda\lib\libOpenCL.lib
			
 
				+
			
 
				+      (and if the version of your CUDA driver is >= 3.2)
			
 
				+
			
 
				+      copy c:\cuda\lib\curand.lib c:\cuda\lib\libcurand.lib
			
 
				+
			
 
				+      Add the CUDA bin directory in your path
			
 
				+
			
 
				+      export PATH=/cygdrive/c/CUDA/bin:$PATH
			
 
				+
			
 
				+      Since we tell nvcc to build CUDA code with gcc instead of Visual studio,
			
 
				+      a fix is needed: c:\cuda\include\host_defines.h has a bogus CUDARTAPI
			
 
				+      definition which makes linking fail completely. Replace the first
			
 
				+      occurrence of
			
 
				+
			
 
				+      #define CUDARTAPI
			
 
				+
			
 
				+      with
			
 
				+
			
 
				+      #ifdef _WIN32
			
 
				+      #define CUDARTAPI __stdcall
			
 
				+      #else
			
 
				+      #define CUDARTAPI
			
 
				+      #endif
			
 
				+
			
 
				+      While at it, you can also comment the __cdecl definition to avoid spurious
			
 
				+      warnings.
			
 
				+
			
 
				+
			
 
				+  5.2 Install OpenCL
			
 
				+
			
 
				+      http://developer.nvidia.com/object/opencl-download.html
			
 
				+
			
 
				+      You need to download the NVIDIA Drivers for your version of
			
 
				+      Windows. Executing the file will extract all files in a given
			
 
				+      directory. Then the driver installation will start; it will fail
			
 
				+      if no compatible drivers can be found on your system.
			
 
				+
			
 
				+      Anyway, you should copy the *.dl_ files from the directory
			
 
				+      (extraction path) in the bin directory of the CUDA installation
			
 
				+      directory (the directory should be v3.2/bin/)
			
 
				+
			
 
				+  5.3 Install MsCompress
			
 
				+
			
 
				+      http://gnuwin32.sourceforge.net/packages/mscompress.htm
			
 
				+
			
 
				+      Go in the CUDA bin directory, uncompress .dl_ files and rename
			
 
				+      them to .dll files
			
 
				+
			
 
				+      cp /cygdrive/c/NVIDIA/DisplayDriver/190.89/International/*.dl_ .
			
 
				+      for i in *.dl_ ; do /cygdrive/c/Program\ Files/GnuWin32/bin/msexpand.exe  $i ; mv ${i%_} ${i%_}l ; done
			
 
				+
			
 
				+If you are building from a tarball downloaded from the website, you can skip the
			
 
				+autogen.sh part.
			
 
				+
			
 
				+6. Start autogen.sh from cygwin
			
 
				+
			
 
				+   cd starpu-trunk
			
 
				+   ./autogen.sh
			
 
				+
			
 
				+7. Start a MinGW shell
			
 
				+
			
 
				+   /cygdrive/c/MinGW/msys/1.0/bin/sh.exe --login -i
			
 
				+
			
 
				+8. Configure, make, install from MinGW
			
 
				+
			
 
				+   If you have a non-english version of windows, use
			
 
				+
			
 
				+     export LANG=C
			
 
				+
			
 
				+   otherwise libtool has trouble parsing the translated output of the toolchain.
			
 
				+
			
 
				+   cd starpu-trunk
			
 
				+   mkdir build
			
 
				+   cd build
			
 
				+   ../configure --prefix=$PWD/target --disable-default-drand48 \
			
 
				+        --with-hwloc=<HWLOC installation directory> \
			
 
				+        --with-cuda-dir=<CUDA installation directory> \
			
 
				+        --with-cuda-lib-dir=<CUDA installation directory>/lib/Win32 \
			
 
				+	--with-opencl-dir=<CUDA installation directory>
			
 
				+   make
			
 
				+   make install
			
 
				+
			
 
				+   Also convert a couple of files to CRLF:
			
 
				+
			
 
				+   sed -e 's/$/'$'\015'/ < README > $PWD/target/README.txt
			
 
				+   sed -e 's/$/'$'\015'/ < AUTHORS > $PWD/target/AUTHORS.txt
			
 
				+   sed -e 's/$/'$'\015'/ < COPYING.LGPL > $PWD/target/COPYING.LGPL.txt
			
 
				+
			
 
				+9. If you want your StarPU installation to be standalone, you need to
			
 
				+   copy the DLL files from hwloc, Cuda, and OpenCL into the StarPU
			
 
				+   installation bin directory, as well as MinGW/bin/libpthread*dll
			
 
				+
			
 
				+   cp <CUDA directory>/bin/*dll target/bin
			
 
				+   cp <HWLOC directory>/bin/*dll target/bin
			
 
				+   cp /cygdrive/c/MinGW/bin/libpthread*dll target/bin
			
 
				+
			
 
				+   and set the StarPU bin directory in your path.
			
 
				+
			
 
				+   export PATH=<StarPU installation directory>/bin:$PATH
			
 
				+
			
 
				+
			
 
				+Developers warning
			
 
				+------------------
			
 
				+Developer warnings are only enabled if the STARPU_DEVEL environment variable is set to a non-empty value.
			
--- a/acinclude.m4
+++ b/acinclude.m4
@@ -1,3 +1,19 @@
 
				+dnl Copyright (C) Free Software Foundation, Inc.
			
 
				+dnl
			
 
				+dnl This program is free software; you can redistribute it and/or modify
			
 
				+dnl it under the terms of the GNU General Public License as published by
			
 
				+dnl the Free Software Foundation; either version 2 of the License, or
			
 
				+dnl (at your option) any later version.
			
 
				+dnl 
			
 
				+dnl This program is distributed in the hope that it will be useful,
			
 
				+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
			
 
				+dnl GNU General Public License for more details.
			
 
				+dnl 
			
 
				+dnl You should have received a copy of the GNU General Public License
			
 
				+dnl along with this program; if not, write to the Free Software
			
 
				+dnl Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
			
 
				+dnl
			
 
				 dnl This test is taken from libgfortran
			
 
				 
			
 
				 dnl Check whether the target supports __sync_val_compare_and_swap.
			
--- a/configure.ac
+++ b/configure.ac
--- a/doc/starpu.texi
+++ b/doc/starpu.texi
--- a/doc/tutorial/Makefile
+++ b/doc/tutorial/Makefile
@@ -0,0 +1,45 @@
 
				+# StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+#
			
 
				+# Copyright (C) 2009-2011  Université de Bordeaux 1
			
 
				+# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+#
			
 
				+# StarPU is free software; you can redistribute it and/or modify
			
 
				+# it under the terms of the GNU Lesser General Public License as published by
			
 
				+# the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+# your option) any later version.
			
 
				+#
			
 
				+# StarPU is distributed in the hope that it will be useful, but
			
 
				+# WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+#
			
 
				+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+
			
 
				+CFLAGS          +=      $$(pkg-config --cflags libstarpu)
			
 
				+LDFLAGS         +=      $$(pkg-config --libs libstarpu)
			
 
				+
			
 
				+HAS_CUDA	=	$(shell pkg-config --libs libstarpu|grep -i cuda)
			
 
				+NVCC		?=	nvcc
			
 
				+HAS_OPENCL	=	$(shell pkg-config --libs libstarpu|grep -i opencl)
			
 
				+
			
 
				+%.o: %.cu # compile CUDA sources with the overridable $(NVCC), like the link step does
			
 
				+	$(NVCC) $(CFLAGS) $< -c
			
 
				+
			
 
				+all: hello_world vector_scal
			
 
				+
			
 
				+VECTOR_SCAL_PREREQUISITES	=	vector_scal.o vector_scal_cpu.o 
			
 
				+ifneq ($(strip $(HAS_CUDA)),)
			
 
				+VECTOR_SCAL_PREREQUISITES	+=	vector_scal_cuda.o
			
 
				+VECTOR_SCAL_COMPILER		=	$(NVCC)
			
 
				+else
			
 
				+VECTOR_SCAL_COMPILER		=	$(CC)
			
 
				+endif
			
 
				+ifneq ($(strip $(HAS_OPENCL)),)
			
 
				+VECTOR_SCAL_PREREQUISITES += vector_scal_opencl.o
			
 
				+endif
			
 
				+
			
 
				+vector_scal: $(VECTOR_SCAL_PREREQUISITES) # libraries (in LDFLAGS) must follow the objects that use them
			
 
				+	$(VECTOR_SCAL_COMPILER) $^ $(LDFLAGS) -o $@
			
 
				+
			
 
				+clean:
			
 
				+	rm -f hello_world vector_scal *.o
			
 
				+
			
--- a/doc/tutorial/README
+++ b/doc/tutorial/README
@@ -0,0 +1,33 @@
 
				+# StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+#
			
 
				+# Copyright (C) 2009-2011  Université de Bordeaux 1
			
 
				+# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+#
			
 
				+# StarPU is free software; you can redistribute it and/or modify
			
 
				+# it under the terms of the GNU Lesser General Public License as published by
			
 
				+# the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+# your option) any later version.
			
 
				+#
			
 
				+# StarPU is distributed in the hope that it will be useful, but
			
 
				+# WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+#
			
 
				+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+
			
 
				+
			
 
				+Instructions on how to compile and run StarPU examples
			
 
				+------------------------------------------------------
			
 
				+
			
 
				+% export STARPU_DIR=<directory where StarPU is installed>
			
 
				+% export PKG_CONFIG_PATH=$STARPU_DIR/lib/pkgconfig:$PKG_CONFIG_PATH
			
 
				+% export LD_LIBRARY_PATH=$STARPU_DIR/lib:$LD_LIBRARY_PATH
			
 
				+
			
 
				+% make hello_world
			
 
				+% ./hello_world
			
 
				+
			
 
				+% make vector_scal
			
 
				+% ./vector_scal
			
 
				+
			
 
				+% STARPU_NCPUS=0 ./vector_scal
			
 
				+% STARPU_NCPUS=0 STARPU_NCUDA=0 ./vector_scal
			
 
				+
			
--- a/doc/tutorial/hello_world.c
+++ b/doc/tutorial/hello_world.c
@@ -0,0 +1,70 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Université de Bordeaux 1
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+
			
 
				+struct params { /* Argument bundle handed to the codelet through task->cl_arg. */
			
 
				+    int i; /* integer value, printed by cpu_func with %i */
			
 
				+    float f; /* floating-point value, printed by cpu_func with %f */
			
 
				+};
			
 
				+
			
 
				+void cpu_func(void *buffers[], void *cl_arg) /* CPU kernel: prints the struct params passed as cl_arg; buffers is unused. */
			
 
				+{
			
 
				+    const struct params *args = cl_arg; /* cl_arg is the task's cl_arg pointer, set in main */
			
 
				+
			
 
				+    printf("Hello world (params = {%i, %f} )\n", args->i, args->f);
			
 
				+}
			
 
				+
			
 
				+starpu_codelet cl = /* Codelet with a single CPU implementation and no data buffers. */
			
 
				+{
			
 
				+    .nbuffers = 0, /* the kernel accesses no registered data */
			
 
				+    .cpu_func = cpu_func, /* CPU implementation defined above */
			
 
				+    .where = STARPU_CPU /* only a CPU implementation is provided */
			
 
				+};
			
 
				+
			
 
				+void callback_func(void *callback_arg) /* Task callback: prints the opaque argument stored in task->callback_arg. */
			
 
				+{
			
 
				+    printf("Callback function (arg %p)\n", callback_arg); /* %p matches void *; %x expects unsigned int */
			
 
				+}
			
 
				+
			
 
				+int main(int argc, char **argv) /* Submits one synchronous task running the codelet above, then shuts StarPU down. */
			
 
				+{
			
 
				+    /* initialize StarPU */
			
 
				+    starpu_init(NULL);
			
 
				+
			
 
				+    struct starpu_task *task = starpu_task_create();
			
 
				+
			
 
				+    task->cl = &cl; /* Pointer to the codelet defined above */
			
 
				+
			
 
				+    struct params params = { 1, 2.0f }; /* local variable; the task is submitted synchronously below */
			
 
				+    task->cl_arg = &params;
			
 
				+    task->cl_arg_size = sizeof(params);
			
 
				+
			
 
				+    task->callback_func = callback_func;
			
 
				+    task->callback_arg = (void *) 0x42; /* explicit cast: an integer is not implicitly convertible to a pointer */
			
 
				+
			
 
				+    /* starpu_task_submit will be a blocking call */
			
 
				+    task->synchronous = 1;
			
 
				+
			
 
				+    /* submit the task to StarPU */
			
 
				+    starpu_task_submit(task);
			
 
				+
			
 
				+    /* terminate StarPU */
			
 
				+    starpu_shutdown();
			
 
				+
			
 
				+    return 0;
			
 
				+}
			
--- a/doc/tutorial/vector_scal.c
+++ b/doc/tutorial/vector_scal.c
@@ -0,0 +1,124 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Université de Bordeaux 1
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+/*
			
 
				+ * This example demonstrates how to use StarPU to scale an array by a factor.
			
 
				+ * It shows how to manipulate data with StarPU's data management library.
			
 
				+ *  1- how to declare a piece of data to StarPU (starpu_vector_data_register)
			
 
				+ *  2- how to describe which data are accessed by a task (task->buffers[0])
			
 
				+ *  3- how a kernel can manipulate the data (buffers[0].vector.ptr)
			
 
				+ */
			
 
				+#include <starpu.h>
			
 
				+#include <starpu_opencl.h>
			
 
				+
			
 
				+#define    NX    2048
			
 
				+
			
 
				+extern void scal_cpu_func(void *buffers[], void *_args);
			
 
				+extern void scal_cuda_func(void *buffers[], void *_args);
			
 
				+extern void scal_opencl_func(void *buffers[], void *_args);
			
 
				+
			
 
				+static starpu_codelet cl = {
			
 
				+    .where = STARPU_CPU | STARPU_CUDA | STARPU_OPENCL,
			
 
				+    /* CPU implementation of the codelet */
			
 
				+    .cpu_func = scal_cpu_func,
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+    /* CUDA implementation of the codelet */
			
 
				+    .cuda_func = scal_cuda_func,
			
 
				+#endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+    /* OpenCL implementation of the codelet */
			
 
				+    .opencl_func = scal_opencl_func,
			
 
				+#endif
			
 
				+    .nbuffers = 1
			
 
				+};
			
 
				+
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+struct starpu_opencl_program programs;
			
 
				+#endif
			
 
				+
			
 
				+int main(int argc, char **argv)
			
 
				+{
			
 
				+    /* We consider a vector of floats that is initialized just like any other C
			
 
				+      * data */
			
 
				+    float vector[NX];
			
 
				+    unsigned i;
			
 
				+    for (i = 0; i < NX; i++)
			
 
				+        vector[i] = 1.0f;
			
 
				+
			
 
				+    fprintf(stderr, "BEFORE : First element was %f\n", vector[0]);
			
 
				+
			
 
				+    /* Initialize StarPU with default configuration */
			
 
				+    starpu_init(NULL);
			
 
				+
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+        starpu_opencl_load_opencl_from_file("vector_scal_opencl_kernel.cl", &programs, NULL);
			
 
				+#endif
			
 
				+
			
 
				+    /* Tell StarPU to associate the "vector" vector with the "vector_handle"
			
 
				+     * identifier. When a task needs to access a piece of data, it should
			
 
				+     * refer to the handle that is associated to it.
			
 
				+     * In the case of the "vector" data interface:
			
 
				+     *  - the first argument of the registration method is a pointer to the
			
 
				+     *    handle that should describe the data
			
 
				+     *  - the second argument is the memory node where the data (ie. "vector")
			
 
				+     *    resides initially: 0 stands for an address in main memory, as
			
 
				+     *    opposed to an address on a GPU for instance.
			
 
				+     *  - the third argument is the address of the vector in RAM
			
 
				+     *  - the fourth argument is the number of elements in the vector
			
 
				+     *  - the fifth argument is the size of each element.
			
 
				+     */
			
 
				+    starpu_data_handle vector_handle;
			
 
				+    starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector,
			
 
				+                                NX, sizeof(vector[0]));
			
 
				+
			
 
				+    float factor = 3.14;
			
 
				+
			
 
				+    /* create a synchronous task: any call to starpu_task_submit will block
			
 
				+      * until it is terminated */
			
 
				+    struct starpu_task *task = starpu_task_create();
			
 
				+    task->synchronous = 1;
			
 
				+
			
 
				+    task->cl = &cl;
			
 
				+
			
 
				+    /* the codelet manipulates one buffer in RW mode */
			
 
				+    task->buffers[0].handle = vector_handle;
			
 
				+    task->buffers[0].mode = STARPU_RW;
			
 
				+
			
 
				+    /* an argument is passed to the codelet, beware that this is a
			
 
				+     * READ-ONLY buffer and that the codelet may be given a pointer to a
			
 
				+     * COPY of the argument */
			
 
				+    task->cl_arg = &factor;
			
 
				+    task->cl_arg_size = sizeof(factor);
			
 
				+
			
 
				+    /* execute the task on any eligible computational resource */
			
 
				+    starpu_task_submit(task);
			
 
				+
			
 
				+    /* StarPU does not need to manipulate the array anymore so we can stop
			
 
				+      * monitoring it */
			
 
				+    starpu_data_unregister(vector_handle);
			
 
				+
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+    starpu_opencl_unload_opencl(&programs);
			
 
				+#endif
			
 
				+
			
 
				+    /* terminate StarPU, no task can be submitted after */
			
 
				+    starpu_shutdown();
			
 
				+
			
 
				+    fprintf(stderr, "AFTER First element is %f\n", vector[0]);
			
 
				+
			
 
				+    return 0;
			
 
				+}
			
--- a/doc/tutorial/vector_scal_cpu.c
+++ b/doc/tutorial/vector_scal_cpu.c
@@ -0,0 +1,50 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Université de Bordeaux 1
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+
			
 
				+/* This kernel takes a buffer and scales it by a constant factor */
			
 
				+void scal_cpu_func(void *buffers[], void *cl_arg)
			
 
				+{
			
 
				+    unsigned i;
			
 
				+    float *factor = cl_arg;
			
 
				+
			
 
				+    /*
			
 
				+     * The "buffers" array matches the task->buffers array: for instance
			
 
				+     * task->buffers[0].handle is a handle that corresponds to a data with
			
 
				+     * vector "interface", so that the first entry of the array in the
			
 
				+     * codelet  is a pointer to a structure describing such a vector (ie.
			
 
				+     * struct starpu_vector_interface_s *). Here, we therefore manipulate
			
 
				+     * the buffers[0] element as a vector: nx gives the number of elements
			
 
				+     * in the array, ptr gives the location of the array (that was possibly
			
 
				+     * migrated/replicated), and elemsize gives the size of each element.
			
 
				+     */
			
 
				+    starpu_vector_interface_t *vector = buffers[0];
			
 
				+
			
 
				+    /* length of the vector */
			
 
				+    unsigned n = STARPU_VECTOR_GET_NX(vector);
			
 
				+
			
 
				+    /* get a pointer to the local copy of the vector : note that we have to
			
 
				+     * cast it in (float *) since a vector could contain any type of
			
 
				+     * elements so that the .ptr field is actually a uintptr_t */
			
 
				+    float *val = (float *)STARPU_VECTOR_GET_PTR(vector);
			
 
				+
			
 
				+    /* scale the vector */
			
 
				+    for (i = 0; i < n; i++)
			
 
				+        val[i] *= *factor;
			
 
				+}
			
 
				+
			
--- a/doc/tutorial/vector_scal_cuda.cu
+++ b/doc/tutorial/vector_scal_cuda.cu
@@ -0,0 +1,43 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Université de Bordeaux 1
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+#include <starpu_cuda.h>
			
 
				+
			
 
				+static __global__ void vector_mult_cuda(float *val, unsigned n, float factor)
			
 
				+{
			
 
				+        unsigned i =  blockIdx.x*blockDim.x + threadIdx.x;
			
 
				+        if (i < n)
			
 
				+               val[i] *= factor;
			
 
				+}
			
 
				+
			
 
				+extern "C" void scal_cuda_func(void *buffers[], void *_args)
			
 
				+{
			
 
				+        float *factor = (float *)_args;
			
 
				+
			
 
				+        /* length of the vector */
			
 
				+        unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
			
 
				+        /* local copy of the vector pointer */
			
 
				+        float *val = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);
			
 
				+        unsigned threads_per_block = 64;
			
 
				+        unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
			
 
				+
			
 
				+        vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>(val, n, *factor);
			
 
				+
			
 
				+        cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				+}
			
 
				+
			
--- a/doc/tutorial/vector_scal_opencl.c
+++ b/doc/tutorial/vector_scal_opencl.c
@@ -0,0 +1,60 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Université de Bordeaux 1
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+#include <starpu_opencl.h>
			
 
				+
			
 
				+extern struct starpu_opencl_program programs;
			
 
				+
			
 
				+/* OpenCL implementation of the scaling codelet.
			
 
				+ * buffers[0] is accessed through the STARPU_VECTOR_GET_* macros and
			
 
				+ * _args points to the float scaling factor. The function loads the
			
 
				+ * "vector_mult_opencl" kernel from the global "programs" object, binds
			
 
				+ * its three arguments, enqueues it and waits for the queue to drain. */
			
 
				+void scal_opencl_func(void *buffers[], void *_args)
			
 
				+{
			
 
				+    float *factor = _args;
			
 
				+    int id, devid, err;
			
 
				+    cl_kernel kernel;
			
 
				+    cl_command_queue queue;
			
 
				+    cl_event event;
			
 
				+
			
 
				+    /* length of the vector */
			
 
				+    unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
			
 
				+    /* OpenCL copy of the vector pointer */
			
 
				+    cl_mem val = (cl_mem) STARPU_VECTOR_GET_PTR(buffers[0]);
			
 
				+
			
 
				+    id = starpu_worker_get_id();
			
 
				+    devid = starpu_worker_get_devid(id);
			
 
				+
			
 
				+    err = starpu_opencl_load_kernel(&kernel, &queue, &programs,
			
 
				+                    "vector_mult_opencl", devid);   /* Name of the kernel in the program loaded into "programs" */
			
 
				+    if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+
			
 
				+    err = clSetKernelArg(kernel, 0, sizeof(val), &val);
			
 
				+    err |= clSetKernelArg(kernel, 1, sizeof(n), &n);
			
 
				+    err |= clSetKernelArg(kernel, 2, sizeof(*factor), factor);
			
 
				+    if (err) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+
			
 
				+    {
			
 
				+        /* One work-item per vector element: the kernel scales val[i] for
			
 
				+         * i = get_global_id(0), so a global size of 1 would only ever
			
 
				+         * touch the first element. */
			
 
				+        size_t global=n;
			
 
				+        size_t local=1;
			
 
				+        err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
			
 
				+        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+    }
			
 
				+
			
 
				+    /* wait for the kernel to complete before collecting its statistics */
			
 
				+    clFinish(queue);
			
 
				+    starpu_opencl_collect_stats(event);
			
 
				+    clReleaseEvent(event);
			
 
				+
			
 
				+    starpu_opencl_release_kernel(kernel);
			
 
				+}
			
--- a/doc/tutorial/vector_scal_opencl_kernel.cl
+++ b/doc/tutorial/vector_scal_opencl_kernel.cl
@@ -0,0 +1,25 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Université de Bordeaux 1
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+__kernel void vector_mult_opencl(__global float* val, int nx, float factor)
			
 
				+{
			
 
				+        const int i = get_global_id(0);
			
 
				+        if (i < nx) {
			
 
				+                val[i] *= factor;
			
 
				+        }
			
 
				+}
			
 
				+
			
--- a/doc/vector_scal_c.texi
+++ b/doc/vector_scal_c.texi
@@ -49,7 +49,7 @@ int main(int argc, char **argv)
 
				 
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				         starpu_opencl_load_opencl_from_file(
			
 
				-               "examples/basic_examples/vector_scal_opencl_kernel.cl", &programs);
			
 
				+               "examples/basic_examples/vector_scal_opencl_kernel.cl", &programs, NULL);
			
 
				 #endif
			
 
				 
			
 
				     /* Tell StaPU to associate the "vector" vector with the "vector_handle"
			
--- a/doc/vector_scal_cuda.texi
+++ b/doc/vector_scal_cuda.texi
@@ -1,9 +1,10 @@
 
				 #include <starpu.h>
			
 
				+#include <starpu_cuda.h>
			
 
				 
			
 
				 static __global__ void vector_mult_cuda(float *val, unsigned n,
			
 
				                                         float factor)
			
 
				 @{
			
 
				-        unsigned i;
			
 
				+        unsigned i =  blockIdx.x*blockDim.x + threadIdx.x;
			
 
				         if (i < n)
			
 
				                val[i] *= factor;
			
 
				 @}
			
--- a/doc/vector_scal_opencl.texi
+++ b/doc/vector_scal_opencl.texi
@@ -13,8 +13,8 @@ void scal_opencl_func(void *buffers[], void *_args)
 
				 
			
 
				     /* length of the vector */
			
 
				     unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
			
 
				-    /* local copy of the vector pointer */
			
 
				-    float *val = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);
			
 
				+    /* OpenCL copy of the vector pointer */
			
 
				+    cl_mem val = (cl_mem)STARPU_VECTOR_GET_PTR(buffers[0]);
			
 
				 
			
 
				     id = starpu_worker_get_id();
			
 
				     devid = starpu_worker_get_devid(id);
			
@@ -23,7 +23,7 @@ void scal_opencl_func(void *buffers[], void *_args)
 
				                                     devid);
			
 
				     if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				 
			
 
				-    err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &val);
			
 
				+    err = clSetKernelArg(kernel, 0, sizeof(val), &val);
			
 
				     err |= clSetKernelArg(kernel, 1, sizeof(n), &n);
			
 
				     err |= clSetKernelArg(kernel, 2, sizeof(*factor), factor);
			
 
				     if (err) STARPU_OPENCL_REPORT_ERROR(err);
			
--- a/examples/.gitignore
+++ b/examples/.gitignore
@@ -0,0 +1 @@
 
				+/.deps
			
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -1,7 +1,7 @@
 
				 # StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				 #
			
 
				-# Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				-# Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+# Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
			
 
				+# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				 #
			
 
				 # StarPU is free software; you can redistribute it and/or modify
			
 
				 # it under the terms of the GNU Lesser General Public License as published by
			
@@ -14,24 +14,23 @@
 
				 #
			
 
				 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				 
			
 
				-AUTOMAKE_OPTIONS = subdir-objects
			
 
				-
			
 
				-AM_CFLAGS = $(HWLOC_CFLAGS) -Wall
			
 
				-LIBS = $(top_builddir)/src/libstarpu.la $(HWLOC_LIBS) @LIBS@
			
 
				+AM_CFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
			
 
				+LIBS = $(top_builddir)/src/libstarpu.la $(MAGMA_LIBS) $(HWLOC_LIBS) @LIBS@
			
 
				 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include
			
 
				+AM_LDFLAGS = $(STARPU_CUDA_LDFLAGS) $(STARPU_OPENCL_LDFLAGS)
			
 
				 
			
 
				-TESTS	=	$(check_PROGRAMS)
			
 
				+SUBDIRS = stencil
			
 
				 
			
 
				-SUBDIRS = stencil stencil_ctx
			
 
				+if STARPU_USE_SOCL
			
 
				+SUBDIRS += socl
			
 
				+endif
			
 
				 
			
 
				 if STARPU_HAVE_FFTW
			
 
				-if STARPU_HAVE_FFTWL
			
 
				+if STARPU_HAVE_FFTWF
			
 
				 SUBDIRS += starpufft
			
 
				 endif
			
 
				 endif
			
 
				 
			
 
				-check_PROGRAMS =
			
 
				-
			
 
				 BUILT_SOURCES =
			
 
				 
			
 
				 if STARPU_USE_OPENCL
			
@@ -40,7 +39,9 @@ endif
 
				 
			
 
				 EXTRA_DIST = 					\
			
 
				 	basic_examples/vector_scal_opencl_kernel.cl \
			
 
				+	common/blas_model.c			\
			
 
				 	spmv/spmv_cuda.cu			\
			
 
				+	spmv/spmv_opencl.cl			\
			
 
				 	gordon/null_kernel_gordon.c		\
			
 
				 	mult/xgemm.c				\
			
 
				 	lu/xlu.c				\
			
@@ -54,6 +55,7 @@ EXTRA_DIST = 					\
 
				 	basic_examples/variable_kernels_opencl_kernel.cl	\
			
 
				 	matvecmult/matvecmult_kernel.cl				\
			
 
				 	basic_examples/block_opencl_kernel.cl			\
			
 
				+	openmp/vector_scal.c			\
			
 
				 	filters/fblock_opencl_kernel.cl
			
 
				 
			
 
				 CLEANFILES = 					\
			
@@ -64,7 +66,7 @@ CLEANFILES += *.gcno *.gcda *.linkinfo
 
				 
			
 
				 if STARPU_USE_CUDA
			
 
				 
			
 
				-NVCCFLAGS += --compiler-options -fno-strict-aliasing  $(HWLOC_CFLAGS) -I$(top_srcdir)/include/ -I$(top_builddir)/include/ -I$(top_srcdir)/examples/  -arch sm_13
			
 
				+NVCCFLAGS += --compiler-options -fno-strict-aliasing  -I$(top_srcdir)/include/ -I$(top_builddir)/include/ -I$(top_srcdir)/examples/  $(HWLOC_CFLAGS) -arch sm_13
			
 
				 
			
 
				 .cu.o:
			
 
				 	$(NVCC) $< -c -o $@ $(NVCCFLAGS)
			
@@ -106,11 +108,6 @@ noinst_HEADERS = 				\
 
				 	lu/xlu_kernels.h			\
			
 
				 	lu/float.h				\
			
 
				 	lu/double.h				\
			
 
				-	pi/pi.h					\
			
 
				-	pi/SobolQRNG/sobol.h			\
			
 
				-	pi/SobolQRNG/sobol_gold.h		\
			
 
				-	pi/SobolQRNG/sobol_gpu.h		\
			
 
				-	pi/SobolQRNG/sobol_primitives.h		\
			
 
				 	cholesky/cholesky.h			\
			
 
				 	common/blas_model.h			\
			
 
				 	common/blas.h				\
			
@@ -122,22 +119,134 @@ noinst_HEADERS = 				\
 
				 	ppm_downscaler/yuv_downscaler.h		\
			
 
				 	spmv/matrix_market/mmio.h		\
			
 
				 	spmv/matrix_market/mm_to_bcsr.h		\
			
 
				-	spmv/dw_spmv.h				\
			
 
				+	spmv/spmv.h				\
			
 
				 	spmv/dw_block_spmv.h
			
 
				 
			
 
				+#####################################
			
 
				+# What to install and what to check #
			
 
				+#####################################
			
 
				 
			
 
				-##################
			
 
				-# Basic examples #
			
 
				-##################
			
 
				+STARPU_EXAMPLES	=
			
 
				+TESTS		=	$(STARPU_EXAMPLES)
			
 
				+
			
 
				+if STARPU_HAVE_WINDOWS
			
 
				+check_PROGRAMS	=	$(STARPU_EXAMPLES)
			
 
				+else
			
 
				+check_PROGRAMS	=	$(LOADER) $(STARPU_EXAMPLES)
			
 
				+endif
			
 
				+
			
 
				+if !STARPU_HAVE_WINDOWS
			
 
				+## test loader program
			
 
				+LOADER			=	loader
			
 
				+LOADER_BIN		=	$(abs_top_builddir)/examples/$(LOADER)
			
 
				+loader_SOURCES		=	../tests/loader.c
			
 
				+TESTS_ENVIRONMENT	=	$(LOADER_BIN)
			
 
				+endif
			
 
				 
			
 
				 examplebin_PROGRAMS +=				\
			
 
				-	basic_examples/hello_world
			
 
				+	basic_examples/hello_world		\
			
 
				+	basic_examples/vector_scal		\
			
 
				+	basic_examples/mult			\
			
 
				+	basic_examples/block			\
			
 
				+	basic_examples/variable			\
			
 
				+	filters/fvector				\
			
 
				+	filters/fblock				\
			
 
				+	filters/fmatrix				\
			
 
				+	tag_example/tag_example			\
			
 
				+	tag_example/tag_example3		\
			
 
				+	tag_example/tag_example2		\
			
 
				+	tag_example/tag_restartable		\
			
 
				+	spmv/spmv				\
			
 
				+	callback/callback			\
			
 
				+	incrementer/incrementer			\
			
 
				+	matvecmult/matvecmult			\
			
 
				+	profiling/profiling			\
			
 
				+	scheduler/dummy_sched			\
			
 
				+	reductions/dot_product			\
			
 
				+	reductions/minmax_reduction		\
			
 
				+	mandelbrot/mandelbrot			\
			
 
				+	ppm_downscaler/ppm_downscaler		\
			
 
				+	ppm_downscaler/yuv_downscaler
			
 
				 
			
 
				-basic_examples_hello_world_SOURCES =		\
			
 
				-	basic_examples/hello_world.c
			
 
				+if STARPU_HAVE_F77_H
			
 
				+examplebin_PROGRAMS +=				\
			
 
				+	basic_examples/vector_scal_fortran
			
 
				+endif
			
 
				+
			
 
				+if !NO_BLAS_LIB
			
 
				+examplebin_PROGRAMS +=				\
			
 
				+	axpy/axpy				\
			
 
				+	mult/sgemm 				\
			
 
				+	mult/dgemm				\
			
 
				+	cholesky/cholesky_tag			\
			
 
				+	cholesky/cholesky_tile_tag		\
			
 
				+	cholesky/cholesky_grain_tag		\
			
 
				+	cholesky/cholesky_implicit		\
			
 
				+	lu/lu_example_float			\
			
 
				+	lu/lu_example_double			\
			
 
				+	lu/lu_implicit_example_float		\
			
 
				+	lu/lu_implicit_example_double		\
			
 
				+	heat/heat				\
			
 
				+	cg/cg
			
 
				+endif
			
 
				 
			
 
				+if ATLAS_BLAS_LIB
			
 
				 examplebin_PROGRAMS +=				\
			
 
				-	basic_examples/vector_scal
			
 
				+	spmv/dw_block_spmv
			
 
				+endif
			
 
				+
			
 
				+STARPU_EXAMPLES +=				\
			
 
				+	basic_examples/hello_world		\
			
 
				+	basic_examples/vector_scal		\
			
 
				+	basic_examples/mult			\
			
 
				+	basic_examples/block			\
			
 
				+	basic_examples/variable			\
			
 
				+	filters/fvector				\
			
 
				+	filters/fblock				\
			
 
				+	filters/fmatrix				\
			
 
				+	tag_example/tag_example			\
			
 
				+	tag_example/tag_example3		\
			
 
				+	tag_example/tag_example2		\
			
 
				+	tag_example/tag_restartable		\
			
 
				+	spmv/spmv				\
			
 
				+	callback/callback			\
			
 
				+	incrementer/incrementer			\
			
 
				+	matvecmult/matvecmult			\
			
 
				+	profiling/profiling			\
			
 
				+	scheduler/dummy_sched			\
			
 
				+	reductions/dot_product			\
			
 
				+	reductions/minmax_reduction
			
 
				+
			
 
				+if STARPU_HAVE_F77_H
			
 
				+STARPU_EXAMPLES +=				\
			
 
				+	basic_examples/vector_scal_fortran
			
 
				+endif
			
 
				+
			
 
				+if !NO_BLAS_LIB
			
 
				+STARPU_EXAMPLES +=				\
			
 
				+	axpy/axpy				\
			
 
				+	mult/sgemm 				\
			
 
				+	mult/dgemm				\
			
 
				+	cholesky/cholesky_tag			\
			
 
				+	cholesky/cholesky_tile_tag		\
			
 
				+	cholesky/cholesky_grain_tag		\
			
 
				+	cholesky/cholesky_implicit		\
			
 
				+	lu/lu_example_float			\
			
 
				+	lu/lu_example_double			\
			
 
				+	lu/lu_implicit_example_float		\
			
 
				+	lu/lu_implicit_example_double		\
			
 
				+	heat/heat				\
			
 
				+	cg/cg
			
 
				+endif
			
 
				+
			
 
				+if ATLAS_BLAS_LIB
			
 
				+STARPU_EXAMPLES +=				\
			
 
				+	spmv/dw_block_spmv
			
 
				+endif
			
 
				+
			
 
				+##################
			
 
				+# Basic examples #
			
 
				+##################
			
 
				 
			
 
				 basic_examples_vector_scal_SOURCES =		\
			
 
				 	basic_examples/vector_scal.c		\
			
@@ -156,9 +265,6 @@ nobase_STARPU_OPENCL_DATA_DATA += \
 
				 endif
			
 
				 
			
 
				 if STARPU_HAVE_F77_H
			
 
				-examplebin_PROGRAMS +=				\
			
 
				-	basic_examples/vector_scal_fortran
			
 
				-
			
 
				 basic_examples_vector_scal_fortran_SOURCES =	\
			
 
				 	basic_examples/vector_scal_fortran.F	\
			
 
				 	basic_examples/vector_scal_c.c		\
			
@@ -167,25 +273,15 @@ basic_examples_vector_scal_fortran_SOURCES =	\
 
				 if STARPU_USE_CUDA
			
 
				 basic_examples_vector_scal_fortran_SOURCES +=	\
			
 
				 	basic_examples/vector_scal_cuda.cu
			
 
				+basic_examples_vector_scal_fortran_LDADD =	\
			
 
				+	$(STARPU_CUDA_FORTRAN_LDFLAGS)
			
 
				 endif
			
 
				 endif
			
 
				 
			
 
				-examplebin_PROGRAMS +=				\
			
 
				-	basic_examples/mult
			
 
				-
			
 
				-basic_examples_mult_SOURCES =			\
			
 
				-	basic_examples/mult.c
			
 
				-
			
 
				 #################
			
 
				 # block example #
			
 
				 #################
			
 
				 
			
 
				-check_PROGRAMS +=				\
			
 
				-	basic_examples/block
			
 
				-
			
 
				-examplebin_PROGRAMS +=				\
			
 
				-	basic_examples/block
			
 
				-
			
 
				 basic_examples_block_SOURCES =			\
			
 
				 	basic_examples/block.c			\
			
 
				 	basic_examples/block_cpu.c
			
@@ -206,12 +302,6 @@ endif
 
				 # Variable example #
			
 
				 ####################
			
 
				 
			
 
				-check_PROGRAMS +=				\
			
 
				-	basic_examples/variable
			
 
				-
			
 
				-examplebin_PROGRAMS +=				\
			
 
				-	basic_examples/variable
			
 
				-
			
 
				 basic_examples_variable_SOURCES =		\
			
 
				 	basic_examples/variable.c		\
			
 
				 	basic_examples/variable_kernels_cpu.c
			
@@ -232,14 +322,6 @@ endif
 
				 # Filters #
			
 
				 ###########
			
 
				 
			
 
				-examplebin_PROGRAMS +=				\
			
 
				-	filters/fvector				\
			
 
				-	filters/fblock				\
			
 
				-	filters/fmatrix
			
 
				-
			
 
				-filters_fvector_SOURCES =			\
			
 
				-	filters/fvector.c
			
 
				-
			
 
				 filters_fblock_SOURCES =			\
			
 
				 	filters/fblock.c			\
			
 
				 	filters/fblock_cpu.c
			
@@ -255,69 +337,17 @@ nobase_STARPU_OPENCL_DATA_DATA += \
 
				 	filters/fblock_opencl_kernel.cl
			
 
				 endif
			
 
				 
			
 
				-filters_fmatrix_SOURCES =			\
			
 
				-	filters/fmatrix.c
			
 
				-
			
 
				-###################
			
 
				-# PPM downscaling #
			
 
				-###################
			
 
				-
			
 
				-examplebin_PROGRAMS +=				\
			
 
				-	ppm_downscaler/ppm_downscaler
			
 
				-
			
 
				-ppm_downscaler_ppm_downscaler_SOURCES =		\
			
 
				-	ppm_downscaler/ppm_downscaler.c
			
 
				-
			
 
				-examplebin_PROGRAMS +=				\
			
 
				-	ppm_downscaler/yuv_downscaler
			
 
				-
			
 
				-ppm_downscaler_yuv_downscaler_SOURCES =		\
			
 
				-	ppm_downscaler/yuv_downscaler.c
			
 
				-
			
 
				-######
			
 
				-# Pi #
			
 
				-######
			
 
				-
			
 
				-check_PROGRAMS +=				\
			
 
				-	pi/pi_redux
			
 
				-
			
 
				-examplebin_PROGRAMS +=				\
			
 
				-	pi/pi					\
			
 
				-	pi/pi_redux
			
 
				-
			
 
				-pi_pi_SOURCES =					\
			
 
				-	pi/pi.c					\
			
 
				-	pi/SobolQRNG/sobol_gold.c		\
			
 
				-	pi/SobolQRNG/sobol_primitives.c
			
 
				-
			
 
				-if STARPU_USE_CUDA
			
 
				-pi_pi_SOURCES +=				\
			
 
				-	pi/pi_kernel.cu				\
			
 
				-	pi/SobolQRNG/sobol_gpu.cu
			
 
				-endif
			
 
				-
			
 
				-pi_pi_redux_SOURCES =				\
			
 
				-	pi/pi_redux.c
			
 
				-
			
 
				-if STARPU_USE_CUDA
			
 
				-pi_pi_redux_SOURCES +=				\
			
 
				-	pi/pi_redux_kernel.cu
			
 
				-endif
			
 
				-
			
 
				-
			
 
				 ################
			
 
				 # AXPY example #
			
 
				 ################
			
 
				 
			
 
				 if !NO_BLAS_LIB
			
 
				-
			
 
				-examplebin_PROGRAMS +=				\
			
 
				-	axpy/axpy
			
 
				-
			
 
				 axpy_axpy_SOURCES =				\
			
 
				 	axpy/axpy.c				\
			
 
				 	common/blas.c
			
 
				 
			
 
				+axpy_axpy_LDADD =				\
			
 
				+	$(STARPU_BLAS_LDFLAGS)
			
 
				 endif
			
 
				 
			
 
				 ################
			
@@ -326,18 +356,20 @@ endif
 
				 
			
 
				 if !NO_BLAS_LIB
			
 
				 
			
 
				-examplebin_PROGRAMS += 				\
			
 
				-	mult/sgemm 				\
			
 
				-	mult/dgemm
			
 
				-
			
 
				 mult_sgemm_SOURCES = 				\
			
 
				 	mult/sgemm.c				\
			
 
				 	common/blas.c
			
 
				 
			
 
				+mult_sgemm_LDADD =				\
			
 
				+	$(STARPU_BLAS_LDFLAGS)
			
 
				+
			
 
				 mult_dgemm_SOURCES = 				\
			
 
				 	mult/dgemm.c				\
			
 
				 	common/blas.c
			
 
				 
			
 
				+mult_dgemm_LDADD =				\
			
 
				+	$(STARPU_BLAS_LDFLAGS)
			
 
				+
			
 
				 endif
			
 
				 
			
 
				 ####################
			
@@ -346,36 +378,42 @@ endif
 
				 
			
 
				 if !NO_BLAS_LIB
			
 
				 
			
 
				-examplebin_PROGRAMS += 				\
			
 
				-	cholesky/cholesky_tag			\
			
 
				-	cholesky/cholesky_tile_tag		\
			
 
				-	cholesky/cholesky_grain_tag		\
			
 
				-	cholesky/cholesky_implicit
			
 
				-
			
 
				 cholesky_cholesky_tag_SOURCES =			\
			
 
				 	cholesky/cholesky_tag.c			\
			
 
				 	cholesky/cholesky_models.c		\
			
 
				 	cholesky/cholesky_kernels.c		\
			
 
				 	common/blas.c
			
 
				 
			
 
				+cholesky_cholesky_tag_LDADD =			\
			
 
				+	$(STARPU_BLAS_LDFLAGS)
			
 
				+
			
 
				 cholesky_cholesky_tile_tag_SOURCES =		\
			
 
				 	cholesky/cholesky_tile_tag.c		\
			
 
				 	cholesky/cholesky_models.c		\
			
 
				 	cholesky/cholesky_kernels.c		\
			
 
				 	common/blas.c
			
 
				 
			
 
				+cholesky_cholesky_tile_tag_LDADD =		\
			
 
				+	$(STARPU_BLAS_LDFLAGS)
			
 
				+
			
 
				 cholesky_cholesky_grain_tag_SOURCES =		\
			
 
				 	cholesky/cholesky_grain_tag.c		\
			
 
				 	cholesky/cholesky_models.c		\
			
 
				 	cholesky/cholesky_kernels.c		\
			
 
				 	common/blas.c
			
 
				 
			
 
				+cholesky_cholesky_grain_tag_LDADD =		\
			
 
				+	$(STARPU_BLAS_LDFLAGS)
			
 
				+
			
 
				 cholesky_cholesky_implicit_SOURCES =		\
			
 
				 	cholesky/cholesky_implicit.c		\
			
 
				 	cholesky/cholesky_models.c		\
			
 
				 	cholesky/cholesky_kernels.c		\
			
 
				 	common/blas.c
			
 
				 
			
 
				+cholesky_cholesky_implicit_LDADD =		\
			
 
				+	$(STARPU_BLAS_LDFLAGS)
			
 
				+
			
 
				 endif
			
 
				 
			
 
				 ##############
			
@@ -384,14 +422,6 @@ endif
 
				 
			
 
				 if !NO_BLAS_LIB
			
 
				 
			
 
				-check_PROGRAMS +=				\
			
 
				-	lu/lu_example_float			\
			
 
				-	lu/lu_implicit_example_float
			
 
				-
			
 
				-examplebin_PROGRAMS += 				\
			
 
				-	lu/lu_example_float			\
			
 
				-	lu/lu_example_double
			
 
				-
			
 
				 lu_lu_example_float_SOURCES =			\
			
 
				 	lu/lu_example_float.c			\
			
 
				 	lu/slu.c				\
			
@@ -399,6 +429,9 @@ lu_lu_example_float_SOURCES =			\
 
				 	lu/slu_kernels.c			\
			
 
				 	common/blas.c
			
 
				 
			
 
				+lu_lu_example_float_LDADD =			\
			
 
				+	$(STARPU_BLAS_LDFLAGS)
			
 
				+
			
 
				 lu_lu_example_double_SOURCES =			\
			
 
				 	lu/lu_example_double.c			\
			
 
				 	lu/dlu.c				\
			
@@ -406,9 +439,8 @@ lu_lu_example_double_SOURCES =			\
 
				 	lu/dlu_kernels.c			\
			
 
				 	common/blas.c
			
 
				 
			
 
				-examplebin_PROGRAMS += 				\
			
 
				-	lu/lu_implicit_example_float		\
			
 
				-	lu/lu_implicit_example_double
			
 
				+lu_lu_example_double_LDADD =			\
			
 
				+	$(STARPU_BLAS_LDFLAGS)
			
 
				 
			
 
				 lu_lu_implicit_example_float_SOURCES =		\
			
 
				 	lu/lu_example_float.c			\
			
@@ -417,6 +449,9 @@ lu_lu_implicit_example_float_SOURCES =		\
 
				 	lu/slu_kernels.c			\
			
 
				 	common/blas.c
			
 
				 
			
 
				+lu_lu_implicit_example_float_LDADD =		\
			
 
				+	$(STARPU_BLAS_LDFLAGS)
			
 
				+
			
 
				 lu_lu_implicit_example_double_SOURCES =		\
			
 
				 	lu/lu_example_double.c			\
			
 
				 	lu/dlu_implicit.c			\
			
@@ -424,6 +459,8 @@ lu_lu_implicit_example_double_SOURCES =		\
 
				 	lu/dlu_kernels.c			\
			
 
				 	common/blas.c
			
 
				 
			
 
				+lu_lu_implicit_example_double_LDADD =		\
			
 
				+	$(STARPU_BLAS_LDFLAGS)
			
 
				 endif
			
 
				 
			
 
				 ###########################
			
@@ -448,8 +485,6 @@ endif
 
				 
			
 
				 if !NO_BLAS_LIB
			
 
				 
			
 
				-examplebin_PROGRAMS += heat/heat
			
 
				-
			
 
				 heat_heat_SOURCES =				\
			
 
				 	heat/heat.c				\
			
 
				 	heat/dw_factolu.c			\
			
@@ -462,6 +497,10 @@ heat_heat_SOURCES =				\
 
				 	heat/dw_factolu_kernels.c		\
			
 
				 	common/blas.c
			
 
				 
			
 
				+heat_heat_LDADD =				\
			
 
				+	$(STARPU_OPENGL_RENDER_LDFLAGS)		\
			
 
				+	$(STARPU_BLAS_LDFLAGS)
			
 
				+
			
 
				 endif
			
 
				 
			
 
				 ##############
			
@@ -470,8 +509,6 @@ endif
 
				 
			
 
				 if !NO_BLAS_LIB
			
 
				 
			
 
				-examplebin_PROGRAMS += cg/cg
			
 
				-
			
 
				 cg_cg_SOURCES =					\
			
 
				 	cg/cg.c					\
			
 
				 	cg/cg_kernels.c				\
			
@@ -482,62 +519,33 @@ cg_cg_SOURCES +=				\
 
				 	cg/cg_dot_kernel.cu
			
 
				 endif
			
 
				 
			
 
				-endif
			
 
				-
			
 
				+cg_cg_LDADD =					\
			
 
				+	$(STARPU_BLAS_LDFLAGS)
			
 
				 
			
 
				-
			
 
				-################
			
 
				-# Tag examples #
			
 
				-################
			
 
				-
			
 
				-check_PROGRAMS +=			\
			
 
				-	tag_example/tag_example			\
			
 
				-	tag_example/tag_example3			\
			
 
				-	tag_example/tag_example2	\
			
 
				-	tag_example/tag_restartable
			
 
				-
			
 
				-examplebin_PROGRAMS +=			\
			
 
				-	tag_example/tag_example			\
			
 
				-	tag_example/tag_example3		\
			
 
				-	tag_example/tag_example2	\
			
 
				-	tag_example/tag_restartable
			
 
				-
			
 
				-tag_example_tag_example_SOURCES =		\
			
 
				-	tag_example/tag_example.c
			
 
				-
			
 
				-tag_example_tag_example2_SOURCES =		\
			
 
				-	tag_example/tag_example2.c
			
 
				-
			
 
				-tag_example_tag_example3_SOURCES =		\
			
 
				-	tag_example/tag_example3.c
			
 
				-
			
 
				-tag_example_tag_restartable_SOURCES =		\
			
 
				-	tag_example/tag_restartable.c
			
 
				+endif
			
 
				 
			
 
				 ################
			
 
				 # SpMV example #
			
 
				 ################
			
 
				 
			
 
				-examplebin_PROGRAMS += 				\
			
 
				-	spmv/dw_spmv
			
 
				-
			
 
				-spmv_dw_spmv_SOURCES = 				\
			
 
				-	spmv/dw_spmv.c
			
 
				+spmv_spmv_SOURCES = 				\
			
 
				+	spmv/spmv.c				\
			
 
				+	spmv/spmv_kernels.c
			
 
				 
			
 
				 if STARPU_USE_CUDA
			
 
				-spmv_dw_spmv_SOURCES +=				\
			
 
				+spmv_spmv_SOURCES +=				\
			
 
				 	spmv/spmv_cuda.cu
			
 
				 endif
			
 
				 
			
 
				 if ATLAS_BLAS_LIB
			
 
				-examplebin_PROGRAMS += 				\
			
 
				-	spmv/dw_block_spmv
			
 
				-
			
 
				 spmv_dw_block_spmv_SOURCES =			\
			
 
				 	spmv/dw_block_spmv.c			\
			
 
				 	spmv/dw_block_spmv_kernels.c		\
			
 
				 	spmv/matrix_market/mm_to_bcsr.c		\
			
 
				 	spmv/matrix_market/mmio.c
			
 
				+
			
 
				+spmv_dw_block_spmv_LDADD =			\
			
 
				+	$(STARPU_BLAS_LDFLAGS)
			
 
				 endif
			
 
				 
			
 
				 #######################
			
@@ -545,12 +553,6 @@ endif
 
				 #######################
			
 
				 
			
 
				 
			
 
				-check_PROGRAMS +=				\
			
 
				-	incrementer/incrementer
			
 
				-
			
 
				-examplebin_PROGRAMS +=				\
			
 
				-	incrementer/incrementer
			
 
				-
			
 
				 incrementer_incrementer_SOURCES =	\
			
 
				 	incrementer/incrementer.c
			
 
				 if STARPU_USE_CUDA
			
@@ -568,78 +570,38 @@ endif
 
				 # matVecMult example #
			
 
				 ######################
			
 
				 
			
 
				-check_PROGRAMS +=				\
			
 
				-	matvecmult/matvecmult
			
 
				-
			
 
				-examplebin_PROGRAMS +=				\
			
 
				-	matvecmult/matvecmult
			
 
				-
			
 
				-matvecmult_matvecmult_SOURCES =	\
			
 
				-	matvecmult/matvecmult.c
			
 
				-
			
 
				 if STARPU_USE_OPENCL
			
 
				 nobase_STARPU_OPENCL_DATA_DATA += \
			
 
				 	matvecmult/matvecmult_kernel.cl
			
 
				 endif
			
 
				 
			
 
				-#####################
			
 
				-# profiling example #
			
 
				-#####################
			
 
				-
			
 
				-check_PROGRAMS +=				\
			
 
				-	profiling/profiling
			
 
				-
			
 
				-examplebin_PROGRAMS +=				\
			
 
				-	profiling/profiling
			
 
				-
			
 
				-profiling_profiling_SOURCES =			\
			
 
				-	profiling/profiling.c
			
 
				-
			
 
				-#####################
			
 
				-# scheduler example #
			
 
				-#####################
			
 
				-
			
 
				-check_PROGRAMS +=				\
			
 
				-	scheduler/dummy_sched
			
 
				-
			
 
				-examplebin_PROGRAMS +=				\
			
 
				-	scheduler/dummy_sched
			
 
				-
			
 
				-scheduler_dummy_sched_SOURCES =			\
			
 
				-	scheduler/dummy_sched.c
			
 
				-
			
 
				 #######################
			
 
				 # dot_product example #
			
 
				 #######################
			
 
				 
			
 
				-check_PROGRAMS +=				\
			
 
				-	reductions/dot_product
			
 
				-
			
 
				-examplebin_PROGRAMS +=				\
			
 
				-	reductions/dot_product
			
 
				-
			
 
				 reductions_dot_product_SOURCES =		\
			
 
				 	reductions/dot_product.c
			
 
				-
			
 
				-#####################
			
 
				-# Min/Max reduction #
			
 
				-#####################
			
 
				-
			
 
				-check_PROGRAMS +=				\
			
 
				-	reductions/minmax_reduction
			
 
				-
			
 
				-examplebin_PROGRAMS +=				\
			
 
				-	reductions/minmax_reduction
			
 
				-
			
 
				-reductions_minmax_reduction_SOURCES =		\
			
 
				-	reductions/minmax_reduction.c
			
 
				+if STARPU_USE_CUDA
			
 
				+reductions_dot_product_SOURCES +=		\
			
 
				+	reductions/dot_product_kernels.cu
			
 
				+endif
			
 
				 
			
 
				 ##################
			
 
				 # Mandelbrot Set #
			
 
				 ##################
			
 
				 
			
 
				+mandelbrot_mandelbrot_CPPFLAGS = $(AM_CFLAGS) $(AM_CPPFLAGS)
			
 
				+if HAVE_X11
			
 
				+mandelbrot_mandelbrot_CPPFLAGS += $(X_CFLAGS)
			
 
				+mandelbrot_mandelbrot_LDADD = $(X_PRE_LIBS) -lX11 $(X_LIBS) $(X_EXTRA_LIBS)
			
 
				+endif
			
 
				+
			
 
				+################
			
 
				+# Top Examples #
			
 
				+################
			
 
				+
			
 
				 examplebin_PROGRAMS +=				\
			
 
				-	mandelbrot/mandelbrot
			
 
				+	top/hello_world_top
			
 
				 
			
 
				-mandelbrot_mandelbrot_SOURCES =			\
			
 
				-	mandelbrot/mandelbrot.c
			
 
				+top_hello_world_top_SOURCES =			\
			
 
				+	top/hello_world_top.c
			
--- a/examples/audio/starpu_audio_processing.c
+++ b/examples/audio/starpu_audio_processing.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010-2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				  * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				  *
			
@@ -30,7 +30,7 @@
 
				 #include <cufft.h>
			
 
				 #endif
			
 
				 
			
 
				-//#define SAVE_RAW	1
			
 
				+/* #define SAVE_RAW	1 */
			
 
				 
			
 
				 #define DEFAULTINPUTFILE	"input.wav"
			
 
				 #define DEFAULTOUTPUTFILE	"output.wav"
			
@@ -328,14 +328,14 @@ static void init_problem(void)
 
				 	/* allocate a buffer to store the content of input file */
			
 
				 	if (use_pin)
			
 
				 	{
			
 
				-		starpu_data_malloc_pinned_if_possible((void **)&A, length_data*sizeof(float));
			
 
				+		starpu_malloc((void **)&A, length_data*sizeof(float));
			
 
				 	}
			
 
				 	else {
			
 
				 		A = malloc(length_data*sizeof(float));
			
 
				 	}
			
 
				 
			
 
				 	/* allocate working buffer (this could be done online, but we'll keep it simple) */
			
 
				-	//starpu_data_malloc_pinned_if_possible((void **)&outdata, length_data*sizeof(fftwf_complex));
			
 
				+	/* starpu_data_malloc_pinned_if_possible((void **)&outdata, length_data*sizeof(fftwf_complex)); */
			
 
				 
			
 
				 	/* read input data into buffer "A" */
			
 
				 	read_16bit_wav(infile, length_data, A, infile_raw);
			
@@ -396,9 +396,7 @@ int main(int argc, char **argv)
 
				 	struct starpu_data_filter f = 
			
 
				 	{
			
 
				 		.filter_func = starpu_block_filter_func_vector,
			
 
				-		.nchildren = niter,
			
 
				-		.get_nchildren = NULL,
			
 
				-		.get_child_ops = NULL
			
 
				+		.nchildren = niter
			
 
				 	};
			
 
				 
			
 
				 	starpu_data_partition(A_handle, &f);
			
--- a/examples/axpy/axpy.c
+++ b/examples/axpy/axpy.c
@@ -1,8 +1,8 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -36,6 +36,8 @@
 
				 
			
 
				 #define NBLOCKS	8
			
 
				 
			
 
				+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
			
 
				+
			
 
				 TYPE *vec_x, *vec_y;
			
 
				 
			
 
				 /* descriptors for StarPU */
			
@@ -93,21 +95,21 @@ int main(int argc, char **argv)
 
				 		vec_a = malloc(N*sizeof(TYPE));
			
 
				 		vec_b = malloc(N*sizeof(TYPE));
			
 
				 	*/
			
 
				-	starpu_data_malloc_pinned_if_possible((void **)&vec_x, N*sizeof(TYPE));
			
 
				+	starpu_malloc((void **)&vec_x, N*sizeof(TYPE));
			
 
				 	assert(vec_x);
			
 
				 
			
 
				-	starpu_data_malloc_pinned_if_possible((void **)&vec_y, N*sizeof(TYPE));
			
 
				+	starpu_malloc((void **)&vec_y, N*sizeof(TYPE));
			
 
				 	assert(vec_y);
			
 
				 
			
 
				 	unsigned i;
			
 
				 	for (i = 0; i < N; i++)
			
 
				 	{
			
 
				-		vec_x[i] = 1.0f;//(TYPE)starpu_drand48();
			
 
				-		vec_y[i] = 4.0f;//(TYPE)starpu_drand48();
			
 
				+		vec_x[i] = 1.0f; /*(TYPE)starpu_drand48(); */
			
 
				+		vec_y[i] = 4.0f; /*(TYPE)starpu_drand48(); */
			
 
				 	}
			
 
				 
			
 
				-	fprintf(stderr, "BEFORE x[0] = %2.2f\n", vec_x[0]);
			
 
				-	fprintf(stderr, "BEFORE y[0] = %2.2f\n", vec_y[0]);
			
 
				+	FPRINTF(stderr, "BEFORE x[0] = %2.2f\n", vec_x[0]);
			
 
				+	FPRINTF(stderr, "BEFORE y[0] = %2.2f\n", vec_y[0]);
			
 
				 
			
 
				 	/* Declare the data to StarPU */
			
 
				 	starpu_vector_data_register(&handle_x, 0, (uintptr_t)vec_x, N, sizeof(TYPE));
			
@@ -116,9 +118,7 @@ int main(int argc, char **argv)
 
				 	/* Divide the vector into blocks */
			
 
				 	struct starpu_data_filter block_filter = {
			
 
				 		.filter_func = starpu_block_filter_func_vector,
			
 
				-		.nchildren = NBLOCKS,
			
 
				-		.get_nchildren = NULL,
			
 
				-		.get_child_ops = NULL
			
 
				+		.nchildren = NBLOCKS
			
 
				 	};
			
 
				 
			
 
				 	starpu_data_partition(handle_x, &block_filter);
			
@@ -151,16 +151,21 @@ int main(int argc, char **argv)
 
				 
			
 
				 	starpu_task_wait_for_all();
			
 
				 
			
 
				+	starpu_data_unpartition(handle_x, 0);
			
 
				 	starpu_data_unpartition(handle_y, 0);
			
 
				+	starpu_data_unregister(handle_x);
			
 
				 	starpu_data_unregister(handle_y);
			
 
				 
			
 
				 	gettimeofday(&end, NULL);
			
 
				         double timing = (double)((end.tv_sec - start.tv_sec)*1000000 +
			
 
				                                         (end.tv_usec - start.tv_usec));
			
 
				 
			
 
				-	fprintf(stderr, "timing -> %2.2f us %2.2f MB/s\n", timing, 3*N*sizeof(TYPE)/timing);
			
 
				+	FPRINTF(stderr, "timing -> %2.2f us %2.2f MB/s\n", timing, 3*N*sizeof(TYPE)/timing);
			
 
				+
			
 
				+	FPRINTF(stderr, "AFTER y[0] = %2.2f (ALPHA = %2.2f)\n", vec_y[0], alpha);
			
 
				 
			
 
				-	fprintf(stderr, "AFTER y[0] = %2.2f (ALPHA = %2.2f)\n", vec_y[0], alpha);
			
 
				+	starpu_free((void *)vec_x);
			
 
				+	starpu_free((void *)vec_y);
			
 
				 
			
 
				 	/* Stop StarPU */
			
 
				 	starpu_shutdown();
			
--- a/examples/basic_examples/block.c
+++ b/examples/basic_examples/block.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010, 2011  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -20,6 +20,8 @@
 
				 #include <pthread.h>
			
 
				 #include <math.h>
			
 
				 
			
 
				+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
			
 
				+
			
 
				 extern void cpu_codelet(void *descr[], void *_args);
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 extern void cuda_codelet(void *descr[], void *_args);
			
@@ -52,24 +54,23 @@ int execute_on(uint32_t where, device_func func, float *block, int pnx, int pny,
 
				         task->buffers[0].handle = block_handle;
			
 
				         task->buffers[0].mode = STARPU_RW;
			
 
				 	task->cl_arg = &multiplier;
			
 
				+	task->cl_arg_size = sizeof(multiplier);
			
 
				 
			
 
				         int ret = starpu_task_submit(task);
			
 
				         if (STARPU_UNLIKELY(ret == -ENODEV)) {
			
 
				-                fprintf(stderr, "No worker may execute this task\n");
			
 
				+                FPRINTF(stderr, "No worker may execute this task\n");
			
 
				                 return 1;
			
 
				 	}
			
 
				 
			
 
				 	starpu_task_wait_for_all();
			
 
				 
			
 
				 	/* update the array in RAM */
			
 
				-        starpu_data_acquire(block_handle, STARPU_R);
			
 
				+	starpu_data_unregister(block_handle);
			
 
				 
			
 
				         for(i=0 ; i<pnx*pny*pnz; i++) {
			
 
				-          fprintf(stderr, "%f ", block[i]);
			
 
				+          FPRINTF(stderr, "%f ", block[i]);
			
 
				         }
			
 
				-        fprintf(stderr, "\n");
			
 
				-
			
 
				-        starpu_data_release(block_handle);
			
 
				+        FPRINTF(stderr, "\n");
			
 
				 
			
 
				         return 0;
			
 
				 }
			
@@ -98,7 +99,7 @@ int main(int argc, char **argv)
 
				         ret = execute_on(STARPU_CPU, cpu_codelet, block, nx, ny, nz, 1.0);
			
 
				         if (!ret) multiplier *= 1.0;
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				-        starpu_opencl_load_opencl_from_file("examples/basic_examples/block_opencl_kernel.cl", &opencl_code);
			
 
				+        starpu_opencl_load_opencl_from_file("examples/basic_examples/block_opencl_kernel.cl", &opencl_code, NULL);
			
 
				         ret = execute_on(STARPU_OPENCL, opencl_codelet, block, nx, ny, nz, 2.0);
			
 
				         if (!ret) multiplier *= 2.0;
			
 
				 #endif
			
@@ -107,7 +108,7 @@ int main(int argc, char **argv)
 
				         if (!ret) multiplier *= 3.0;
			
 
				 #endif
			
 
				 
			
 
				-        // Check result is correct
			
 
				+        /* Check result is correct */
			
 
				         ret=1;
			
 
				         for(i=0 ; i<nx*ny*nz ; i++) {
			
 
				           if (block[i] != (i+1) * multiplier) {
			
@@ -116,7 +117,9 @@ int main(int argc, char **argv)
 
				           }
			
 
				         }
			
 
				 
			
 
				-        fprintf(stderr,"TEST %s\n", ret==1?"PASSED":"FAILED");
			
 
				+        FPRINTF(stderr,"TEST %s\n", ret==1?"PASSED":"FAILED");
			
 
				+	free(block);
			
 
				+
			
 
				         starpu_shutdown();
			
 
				 
			
 
				 	return 0;
			
--- a/examples/basic_examples/block_opencl.c
+++ b/examples/basic_examples/block_opencl.c
@@ -25,8 +25,8 @@ void opencl_codelet(void *descr[], void *_args)
 
				 	cl_kernel kernel;
			
 
				 	cl_command_queue queue;
			
 
				 	cl_event event;
			
 
				-	int id, devid, err, n;
			
 
				-	float *block = (float *)STARPU_BLOCK_GET_PTR(descr[0]);
			
 
				+	int id, devid, err;
			
 
				+	cl_mem block = (cl_mem)STARPU_BLOCK_GET_PTR(descr[0]);
			
 
				 	int nx = (int)STARPU_BLOCK_GET_NX(descr[0]);
			
 
				 	int ny = (int)STARPU_BLOCK_GET_NY(descr[0]);
			
 
				 	int nz = (int)STARPU_BLOCK_GET_NZ(descr[0]);
			
@@ -41,14 +41,13 @@ void opencl_codelet(void *descr[], void *_args)
 
				         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				 
			
 
				 	err = 0;
			
 
				-        n=0;
			
 
				-	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &block);
			
 
				-	err = clSetKernelArg(kernel, 1, sizeof(int), &nx);
			
 
				-	err = clSetKernelArg(kernel, 2, sizeof(int), &ny);
			
 
				-	err = clSetKernelArg(kernel, 3, sizeof(int), &nz);
			
 
				+	err = clSetKernelArg(kernel, 0, sizeof(block), &block);
			
 
				+	err = clSetKernelArg(kernel, 1, sizeof(nx), &nx);
			
 
				+	err = clSetKernelArg(kernel, 2, sizeof(ny), &ny);
			
 
				+	err = clSetKernelArg(kernel, 3, sizeof(nz), &nz);
			
 
				 	err = clSetKernelArg(kernel, 4, sizeof(ldy), &ldy);
			
 
				 	err = clSetKernelArg(kernel, 5, sizeof(ldz), &ldz);
			
 
				-	err = clSetKernelArg(kernel, 6, sizeof(float), multiplier);
			
 
				+	err = clSetKernelArg(kernel, 6, sizeof(*multiplier), multiplier);
			
 
				         if (err) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				 
			
 
				 	{
			
--- a/examples/basic_examples/hello_world.c
+++ b/examples/basic_examples/hello_world.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -31,12 +31,14 @@
 
				 #include <stdint.h>
			
 
				 #include <starpu.h>
			
 
				 
			
 
				+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
			
 
				+
			
 
				 /* When the task is done, task->callback_func(task->callback_arg) is called. Any
			
 
				  * callback function must have the prototype void (*)(void *).
			
 
				  * NB: Callback are NOT allowed to perform potentially blocking operations */
			
 
				 void callback_func(void *callback_arg)
			
 
				 {
			
 
				-	printf("Callback function got argument %p\n", callback_arg);
			
 
				+        FPRINTF(stdout, "Callback function got argument %p\n", callback_arg);
			
 
				 }
			
 
				 
			
 
				 /* Every implementation of a codelet must have this prototype, the first
			
@@ -52,22 +54,16 @@ void cpu_func(void *buffers[], void *cl_arg)
 
				 {
			
 
				 	struct params *params = cl_arg;
			
 
				 
			
 
				-	printf("Hello world (params = {%i, %f} )\n", params->i, params->f);
			
 
				+	FPRINTF(stdout, "Hello world (params = {%i, %f} )\n", params->i, params->f);
			
 
				 }
			
 
				 
			
 
				-starpu_codelet cl =
			
 
				-{
			
 
				-	/* this codelet may only be executed on a CPU, and its cpu
			
 
				- 	 * implementation is function "cpu_func" */
			
 
				-	.where = STARPU_CPU,
			
 
				-	.cpu_func = cpu_func,
			
 
				-	/* the codelet does not manipulate any data that is managed
			
 
				-	 * by our DSM */
			
 
				-	.nbuffers = 0
			
 
				-};
			
 
				+starpu_codelet cl;
			
 
				 
			
 
				 int main(int argc, char **argv)
			
 
				 {
			
 
				+	struct starpu_task *task;
			
 
				+	struct params params = {1, 2.0f};
			
 
				+
			
 
				 	/* initialize StarPU : passing a NULL argument means that we use
			
 
				  	* default configuration for the scheduling policies and the number of
			
 
				 	* processors/accelerators */
			
@@ -76,7 +72,15 @@ int main(int argc, char **argv)
 
				 	/* create a new task that is non-blocking by default : the task is not
			
 
				 	 * submitted to the scheduler until the starpu_task_submit function is
			
 
				 	 * called */
			
 
				-	struct starpu_task *task = starpu_task_create();
			
 
				+	task = starpu_task_create();
			
 
				+
			
 
				+	/* this codelet may only be executed on a CPU, and its cpu
			
 
				+ 	 * implementation is function "cpu_func" */
			
 
				+	cl.where = STARPU_CPU;
			
 
				+	cl.cpu_func = cpu_func;
			
 
				+	/* the codelet does not manipulate any data that is managed
			
 
				+	 * by our DSM */
			
 
				+	cl.nbuffers = 0;
			
 
				 
			
 
				 	/* the task uses codelet "cl" */
			
 
				 	task->cl = &cl;
			
@@ -89,7 +93,6 @@ int main(int argc, char **argv)
 
				 	 * is read-only so that any modification is not passed to other copies
			
 
				 	 * of the buffer.  For this reason, a buffer passed as a codelet
			
 
				 	 * argument (cl_arg) is NOT a valid synchronization medium! */
			
 
				-	struct params params = { 1, 2.0f };
			
 
				 	task->cl_arg = &params;
			
 
				 	task->cl_arg_size = sizeof(params);
			
 
				 		
			
@@ -103,6 +106,9 @@ int main(int argc, char **argv)
 
				 	
			
 
				 	/* submit the task to StarPU */
			
 
				 	starpu_task_submit(task);
			
 
				+
			
 
				+	/* destroy the task */
			
 
				+	starpu_task_destroy(task);
			
 
				 	
			
 
				 	/* terminate StarPU: statistics and other debug outputs are not
			
 
				 	 * guaranteed to be generated unless this function is called. Once it
			
--- a/examples/basic_examples/mult.c
+++ b/examples/basic_examples/mult.c
@@ -1,8 +1,8 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010-2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -28,8 +28,7 @@
 
				  *    monitoring data (starpu_data_unregister)
			
 
				  *  - how to manipulate subsets of data (starpu_data_get_sub_data)
			
 
				  *  - how to construct an autocalibrated performance model (starpu_perfmodel_t)
			
 
				- *  - how to submit asynchronous tasks and how to use callback to handle task
			
 
				- *    termination
			
 
				+ *  - how to submit asynchronous tasks
			
 
				  */
			
 
				 
			
 
				 #include <string.h>
			
@@ -44,11 +43,6 @@
 
				 static float *A, *B, *C;
			
 
				 static starpu_data_handle A_handle, B_handle, C_handle;
			
 
				 
			
 
				-static pthread_mutex_t mutex;
			
 
				-static pthread_cond_t cond;
			
 
				-static unsigned taskcounter;
			
 
				-static unsigned terminated = 0;
			
 
				-
			
 
				 static unsigned nslicesx = 4;
			
 
				 static unsigned nslicesy = 4;
			
 
				 static unsigned xdim = 1024;
			
@@ -77,37 +71,11 @@ static unsigned zdim = 512;
 
				 
			
 
				  */
			
 
				 
			
 
				-static void callback_func(void *arg)
			
 
				-{
			
 
				-	/* the argument is a pointer to a counter of the remaining tasks */
			
 
				-	int *counterptr = arg;
			
 
				-
			
 
				-	/* counterptr points to a variable with the number of remaining tasks,
			
 
				- 	 * when it reaches 0, all tasks are done */
			
 
				-	int counter = STARPU_ATOMIC_ADD(counterptr, -1);
			
 
				-	if (counter == 0)
			
 
				-	{
			
 
				-		/* IMPORTANT : note that we CANNOT call blocking operations
			
 
				-		 * within callbacks as it may lead to a deadlock of StarPU.
			
 
				-		 * starpu_data_unpartition is for instance called by the main
			
 
				-		 * thread since it may cause /potentially/ blocking operations
			
 
				-		 * such as memory transfers from a GPU to a CPU. */
			
 
				-		
			
 
				-		/* wake the application to notify the termination of all the
			
 
				- 		 * tasks */
			
 
				-		pthread_mutex_lock(&mutex);
			
 
				-		terminated = 1;
			
 
				-		pthread_cond_signal(&cond);
			
 
				-		pthread_mutex_unlock(&mutex);
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				 /*
			
 
				  * The codelet is passed 3 matrices, the "descr" union-type field gives a
			
 
				  * description of the layout of those 3 matrices in the local memory (ie. RAM
			
 
				  * in the case of CPU, GPU frame buffer in the case of GPU etc.). Since we have
			
 
				- * registered data with the "blas" data interface, we manipulate the .blas
			
 
				- * field of the descr[x] elements which are union types.
			
 
				+ * registered data with the "matrix" data interface, we use the matrix macros.
			
 
				  */
			
 
				 
			
 
				 static void cpu_mult(void *descr[], __attribute__((unused))  void *arg)
			
@@ -218,18 +186,14 @@ static void partition_mult_data(void)
 
				 	/* StarPU supplies some basic filters such as the partition of a matrix
			
 
				 	 * into blocks, note that we are using a FORTRAN ordering so that the
			
 
				 	 * name of the filters are a bit misleading */
			
 
				-	struct starpu_data_filter f = {
			
 
				+	struct starpu_data_filter vert = {
			
 
				 		.filter_func = starpu_vertical_block_filter_func,
			
 
				-		.nchildren = nslicesx,
			
 
				-		.get_nchildren = NULL,
			
 
				-		.get_child_ops = NULL
			
 
				+		.nchildren = nslicesx
			
 
				 	};
			
 
				 		
			
 
				-	struct starpu_data_filter f2 = {
			
 
				+	struct starpu_data_filter horiz = {
			
 
				 		.filter_func = starpu_block_filter_func,
			
 
				-		.nchildren = nslicesy,
			
 
				-		.get_nchildren = NULL,
			
 
				-		.get_child_ops = NULL
			
 
				+		.nchildren = nslicesy
			
 
				 	};
			
 
				 		
			
 
				 /*
			
@@ -269,17 +233,17 @@ static void partition_mult_data(void)
 
				  *	enforce memory consistency.
			
 
				  */
			
 
				 
			
 
				-	starpu_data_partition(B_handle, &f);
			
 
				-	starpu_data_partition(A_handle, &f2);
			
 
				+	starpu_data_partition(B_handle, &vert);
			
 
				+	starpu_data_partition(A_handle, &horiz);
			
 
				 
			
 
				 	/* starpu_data_map_filters is a variable-arity function, the first argument
			
 
				 	 * is the handle of the data to partition, the second argument is the
			
 
				 	 * number of filters to apply recursively. Filters are applied in the
			
 
				 	 * same order as the arguments.
			
 
				-	 * This would be equivalent to starpu_data_partition(C_handle, &f) and
			
 
				-	 * then applying f2 on each sub-data (ie. each column of C)
			
 
				+	 * This would be equivalent to starpu_data_partition(C_handle, &vert) and
			
 
				+	 * then applying horiz on each sub-data (ie. each column of C)
			
 
				 	 */
			
 
				-	starpu_data_map_filters(C_handle, 2, &f, &f2);
			
 
				+	starpu_data_map_filters(C_handle, 2, &vert, &horiz);
			
 
				 }
			
 
				 
			
 
				 static struct starpu_perfmodel_t mult_perf_model = {
			
@@ -287,28 +251,23 @@ static struct starpu_perfmodel_t mult_perf_model = {
 
				 	.symbol = "mult_perf_model"
			
 
				 };
			
 
				 
			
 
				+static starpu_codelet cl = {
			
 
				+        /* we can only execute that kernel on a CPU yet */
			
 
				+        .where = STARPU_CPU,
			
 
				+        /* CPU implementation of the codelet */
			
 
				+        .cpu_func = cpu_mult,
			
 
				+        /* the codelet manipulates 3 buffers that are managed by the
			
 
				+         * DSM */
			
 
				+        .nbuffers = 3,
			
 
				+        /* in case the scheduling policy may use performance models */
			
 
				+        .model = &mult_perf_model
			
 
				+};
			
 
				+
			
 
				 static void launch_tasks(void)
			
 
				 {
			
 
				 	/* partition the work into slices */
			
 
				 	unsigned taskx, tasky;
			
 
				 
			
 
				-	/* the callback decrements this value every time a task is terminated
			
 
				-	 * and notify the termination of the computation to the application
			
 
				-	 * when the counter reaches 0 */
			
 
				-	taskcounter = nslicesx * nslicesy;
			
 
				-
			
 
				-	starpu_codelet cl = {
			
 
				-		/* we can only execute that kernel on a CPU yet */
			
 
				-		.where = STARPU_CPU,
			
 
				-		/* CPU implementation of the codelet */
			
 
				-		.cpu_func = cpu_mult,
			
 
				-		/* the codelet manipulates 3 buffers that are managed by the
			
 
				- 		 * DSM */
			
 
				-		.nbuffers = 3,
			
 
				-		/* in case the scheduling policy may use performance models */
			
 
				-		.model = &mult_perf_model
			
 
				-	};
			
 
				-
			
 
				 	for (taskx = 0; taskx < nslicesx; taskx++) 
			
 
				 	{
			
 
				 		for (tasky = 0; tasky < nslicesy; tasky++)
			
@@ -322,9 +281,6 @@ static void launch_tasks(void)
 
				 			/* this task implements codelet "cl" */
			
 
				 			task->cl = &cl;
			
 
				 
			
 
				-			task->callback_func = callback_func;
			
 
				-			task->callback_arg = &taskcounter;
			
 
				-
			
 
				 			/*
			
 
				 			 *              |---|---|---|---|
			
 
				 			 *              |   | * |   |   | B
			
@@ -371,9 +327,6 @@ static void launch_tasks(void)
 
				 int main(__attribute__ ((unused)) int argc, 
			
 
				 	 __attribute__ ((unused)) char **argv)
			
 
				 {
			
 
				-	pthread_mutex_init(&mutex, NULL);
			
 
				-	pthread_cond_init(&cond, NULL);
			
 
				-
			
 
				 	/* start the runtime */
			
 
				 	starpu_init(NULL);
			
 
				 
			
@@ -387,26 +340,30 @@ int main(__attribute__ ((unused)) int argc,
 
				 	/* submit all tasks in an asynchronous fashion */
			
 
				 	launch_tasks();
			
 
				 
			
 
				-	/* the different tasks are asynchronous so we use a callback to get
			
 
				-	 * notified of the termination of the computation */
			
 
				-	pthread_mutex_lock(&mutex);
			
 
				-	if (!terminated)
			
 
				-		pthread_cond_wait(&cond, &mutex);
			
 
				-	pthread_mutex_unlock(&mutex);
			
 
				+	/* wait for termination */
			
 
				+        starpu_task_wait_for_all();
			
 
				 
			
 
				 	/* remove the filters applied by the means of starpu_data_map_filters; now
			
 
				  	 * it's not possible to manipulate a subset of C using starpu_data_get_sub_data until
			
 
				 	 * starpu_data_map_filters is called again on C_handle.
			
 
				 	 * The second argument is the memory node where the different subsets
			
 
				 	 * should be reassembled, 0 = main memory (RAM) */
			
 
				+	starpu_data_unpartition(A_handle, 0);
			
 
				+	starpu_data_unpartition(B_handle, 0);
			
 
				 	starpu_data_unpartition(C_handle, 0);
			
 
				 
			
 
				 	/* stop monitoring matrix C : after this, it is not possible to pass C 
			
 
				 	 * (or any subset of C) as a codelet input/output. This also implements
			
 
				 	 * a barrier so that the piece of data is put back into main memory in
			
 
				 	 * case it was only available on a GPU for instance. */
			
 
				+	starpu_data_unregister(A_handle);
			
 
				+	starpu_data_unregister(B_handle);
			
 
				 	starpu_data_unregister(C_handle);
			
 
				-	
			
 
				+
			
 
				+	free(A);
			
 
				+	free(B);
			
 
				+	free(C);
			
 
				+
			
 
				 	starpu_shutdown();
			
 
				 
			
 
				 	return 0;
			
--- a/examples/basic_examples/variable.c
+++ b/examples/basic_examples/variable.c
@@ -18,6 +18,8 @@
 
				 #include <starpu.h>
			
 
				 #include <pthread.h>
			
 
				 
			
 
				+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
			
 
				+
			
 
				 static unsigned niter = 50000;
			
 
				 
			
 
				 extern void cpu_codelet(void *descr[], __attribute__ ((unused)) void *_args);
			
@@ -41,6 +43,9 @@ int main(int argc, char **argv)
 
				 
			
 
				 	starpu_init(NULL);
			
 
				 
			
 
				+#ifdef STARPU_SLOW_MACHINE
			
 
				+	niter /= 100;
			
 
				+#endif
			
 
				         if (argc == 2) niter = atoi(argv[1]);
			
 
				         foo = 0.0f;
			
 
				 
			
@@ -48,7 +53,7 @@ int main(int argc, char **argv)
 
				                                       (uintptr_t)&foo, sizeof(float));
			
 
				 
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				-        starpu_opencl_load_opencl_from_file("examples/basic_examples/variable_kernels_opencl_kernel.cl", &opencl_program);
			
 
				+        starpu_opencl_load_opencl_from_file("examples/basic_examples/variable_kernels_opencl_kernel.cl", &opencl_program, NULL);
			
 
				 #endif
			
 
				 
			
 
				 	cl.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL;
			
@@ -77,7 +82,7 @@ int main(int argc, char **argv)
 
				 		ret = starpu_task_submit(task);
			
 
				 		if (STARPU_UNLIKELY(ret == -ENODEV))
			
 
				 		{
			
 
				-			fprintf(stderr, "No worker may execute this task\n");
			
 
				+			FPRINTF(stderr, "No worker may execute this task\n");
			
 
				 			exit(0);
			
 
				 		}
			
 
				 	}
			
@@ -85,11 +90,9 @@ int main(int argc, char **argv)
 
				 	starpu_task_wait_for_all();
			
 
				 
			
 
				 	/* update the array in RAM */
			
 
				-	starpu_data_acquire(float_array_handle, STARPU_R);
			
 
				-
			
 
				-	fprintf(stderr, "variable -> %f\n", foo);
			
 
				+	starpu_data_unregister(float_array_handle);
			
 
				 
			
 
				-	starpu_data_release(float_array_handle);
			
 
				+	FPRINTF(stderr, "variable -> %f\n", foo);
			
 
				 
			
 
				 	starpu_shutdown();
			
 
				 
			
--- a/examples/basic_examples/variable_kernels_opencl.c
+++ b/examples/basic_examples/variable_kernels_opencl.c
@@ -21,7 +21,7 @@
 
				 extern struct starpu_opencl_program opencl_program;
			
 
				 void opencl_codelet(void *descr[], void *_args)
			
 
				 {
			
 
				-	float *val = (float *)STARPU_VARIABLE_GET_PTR(descr[0]);
			
 
				+	cl_mem val = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[0]);
			
 
				 	cl_kernel kernel;
			
 
				 	cl_command_queue queue;
			
 
				 	cl_event event;
			
@@ -34,7 +34,7 @@ void opencl_codelet(void *descr[], void *_args)
 
				 	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				 
			
 
				 	err = 0;
			
 
				-	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &val);
			
 
				+	err = clSetKernelArg(kernel, 0, sizeof(val), &val);
			
 
				 	if (err) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				 
			
 
				 	{
			
--- a/examples/basic_examples/vector_scal.c
+++ b/examples/basic_examples/vector_scal.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  * Copyright (C) 2010, 2011  Université de Bordeaux 1
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -28,6 +28,7 @@
 
				 #include <stdio.h>
			
 
				 
			
 
				 #define	NX	2048
			
 
				+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
			
 
				 
			
 
				 extern void scal_cpu_func(void *buffers[], void *_args);
			
 
				 extern void scal_cuda_func(void *buffers[], void *_args);
			
@@ -71,16 +72,17 @@ int main(int argc, char **argv)
 
				 	float vector[NX];
			
 
				 	unsigned i;
			
 
				 	for (i = 0; i < NX; i++)
			
 
				-		vector[i] = 1.0f;
			
 
				+                vector[i] = (i+1.0f);
			
 
				 
			
 
				-	fprintf(stderr, "BEFORE : First element was %f\n", vector[0]);
			
 
				+	FPRINTF(stderr, "BEFORE: First element was %f\n", vector[0]);
			
 
				+	FPRINTF(stderr, "BEFORE: Last element was %f\n", vector[NX-1]);
			
 
				 
			
 
				 	/* Initialize StarPU with default configuration */
			
 
				 	starpu_init(NULL);
			
 
				 
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				 	starpu_opencl_load_opencl_from_file("examples/basic_examples/vector_scal_opencl_kernel.cl",
			
 
				-					    &opencl_program);
			
 
				+					    &opencl_program, NULL);
			
 
				 #endif
			
 
				 
			
 
				 	/* Tell StaPU to associate the "vector" vector with the "vector_handle"
			
@@ -125,6 +127,8 @@ int main(int argc, char **argv)
 
				  	 * monitoring it */
			
 
				 	starpu_data_unregister(vector_handle);
			
 
				 
			
 
				+	starpu_task_destroy(task);
			
 
				+
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				         starpu_opencl_unload_opencl(&opencl_program);
			
 
				 #endif
			
@@ -132,7 +136,8 @@ int main(int argc, char **argv)
 
				 	/* terminate StarPU, no task can be submitted after */
			
 
				 	starpu_shutdown();
			
 
				 
			
 
				-	fprintf(stderr, "AFTER First element is %f\n", vector[0]);
			
 
				+	FPRINTF(stderr, "AFTER: First element is %f\n", vector[0]);
			
 
				+	FPRINTF(stderr, "AFTER: Last element is %f\n", vector[NX-1]);
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
--- a/examples/basic_examples/vector_scal_cuda.cu
+++ b/examples/basic_examples/vector_scal_cuda.cu
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  * Copyright (C) 2010  Université de Bordeaux 1
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -25,7 +25,7 @@
 
				 static __global__ void vector_mult_cuda(float *val, unsigned n,
			
 
				                                         float factor)
			
 
				 {
			
 
				-        unsigned i = threadIdx.x;
			
 
				+        unsigned i =  blockIdx.x*blockDim.x + threadIdx.x;
			
 
				 
			
 
				 	if (i < n)
			
 
				                val[i] *= factor;
			
--- a/examples/basic_examples/vector_scal_opencl.c
+++ b/examples/basic_examples/vector_scal_opencl.c
@@ -36,8 +36,8 @@ void scal_opencl_func(void *buffers[], void *_args)
 
				 
			
 
				 	/* length of the vector */
			
 
				 	unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
			
 
				-	/* local copy of the vector pointer */
			
 
				-	float *val = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);
			
 
				+	/* OpenCL copy of the vector pointer */
			
 
				+	cl_mem val = (cl_mem)STARPU_VECTOR_GET_PTR(buffers[0]);
			
 
				 
			
 
				 	id = starpu_worker_get_id();
			
 
				 	devid = starpu_worker_get_devid(id);
			
@@ -45,7 +45,7 @@ void scal_opencl_func(void *buffers[], void *_args)
 
				 	err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, "vector_mult_opencl", devid);
			
 
				 	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				 
			
 
				-	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &val);
			
 
				+	err = clSetKernelArg(kernel, 0, sizeof(val), &val);
			
 
				 	err |= clSetKernelArg(kernel, 1, sizeof(n), &n);
			
 
				 	err |= clSetKernelArg(kernel, 2, sizeof(*factor), factor);
			
 
				 	if (err) STARPU_OPENCL_REPORT_ERROR(err);
			
--- a/examples/callback/callback.c
+++ b/examples/callback/callback.c
@@ -0,0 +1,73 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+#include <pthread.h>
			
 
				+#include <sys/time.h>
			
 
				+
			
 
				+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
			
 
				+
			
 
				+starpu_data_handle handle;
			
 
				+
			
 
				+void cpu_codelet(void *descr[], __attribute__ ((unused)) void *_args)
			
 
				+{
			
 
				+	int *val = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
			
 
				+
			
 
				+	*val += 1;
			
 
				+}
			
 
				+
			
 
				+starpu_codelet cl =
			
 
				+{
			
 
				+	.where = STARPU_CPU,
			
 
				+	.cpu_func = cpu_codelet,
			
 
				+	.nbuffers = 1
			
 
				+};
			
 
				+
			
 
				+void callback_func(void *callback_arg)
			
 
				+{
			
 
				+	struct starpu_task *task = starpu_task_create();
			
 
				+	task->cl = &cl;
			
 
				+	task->buffers[0].handle = handle;
			
 
				+	task->buffers[0].mode = STARPU_RW;
			
 
				+	starpu_task_submit(task);
			
 
				+}
			
 
				+
			
 
				+int main(int argc, char **argv)
			
 
				+{
			
 
				+	int v=40;
			
 
				+
			
 
				+	starpu_init(NULL);
			
 
				+	starpu_variable_data_register(&handle, 0, (uintptr_t)&v, sizeof(int));
			
 
				+
			
 
				+	struct starpu_task *task = starpu_task_create();
			
 
				+	task->cl = &cl;
			
 
				+	task->callback_func = callback_func;
			
 
				+	task->callback_arg = NULL;
			
 
				+	task->buffers[0].handle = handle;
			
 
				+	task->buffers[0].mode = STARPU_RW;
			
 
				+
			
 
				+	starpu_task_submit(task);
			
 
				+
			
 
				+	starpu_task_wait_for_all();
			
 
				+	starpu_data_unregister(handle);
			
 
				+
			
 
				+	FPRINTF(stderr, "v -> %d\n", v);
			
 
				+
			
 
				+	starpu_shutdown();
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
--- a/examples/cg/cg.c
+++ b/examples/cg/cg.c
@@ -13,6 +13,7 @@
 
				  *
			
 
				  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				  */
			
 
				+
			
 
				 #include <math.h>
			
 
				 #include <assert.h>
			
 
				 #include <sys/time.h>
			
@@ -24,6 +25,8 @@
 
				 #include <cublas.h>
			
 
				 #endif
			
 
				 
			
 
				+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
			
 
				+
			
 
				 /*
			
 
				  *	Conjugate Gradient
			
 
				  *
			
@@ -92,23 +95,19 @@ extern starpu_codelet bzero_vector_cl;
 
				 
			
 
				 static void generate_random_problem(void)
			
 
				 {
			
 
				-	srand48(0xdeadbeef);
			
 
				-
			
 
				 	int i, j;
			
 
				 
			
 
				-	starpu_data_malloc_pinned_if_possible((void **)&A, n*n*sizeof(TYPE));
			
 
				-	starpu_data_malloc_pinned_if_possible((void **)&b, n*sizeof(TYPE));
			
 
				-	starpu_data_malloc_pinned_if_possible((void **)&x, n*sizeof(TYPE));
			
 
				+	starpu_malloc((void **)&A, n*n*sizeof(TYPE));
			
 
				+	starpu_malloc((void **)&b, n*sizeof(TYPE));
			
 
				+	starpu_malloc((void **)&x, n*sizeof(TYPE));
			
 
				 	assert(A && b && x);
			
 
				 
			
 
				-	/* Create a random matrix (A) and two random vectors (x and b) */
			
 
				 	for (j = 0; j < n; j++)
			
 
				 	{
			
 
				 		b[j] = (TYPE)1.0;
			
 
				 		x[j] = (TYPE)0.0;
			
 
				 
			
 
				 		/* We take Hilbert matrix that is not well conditionned but definite positive: H(i,j) = 1/(1+i+j) */
			
 
				-
			
 
				 		for (i = 0; i < n; i++)
			
 
				 		{
			
 
				 			A[n*j + i] = (TYPE)(1.0/(1.0+i+j));
			
@@ -116,9 +115,9 @@ static void generate_random_problem(void)
 
				 	}
			
 
				 
			
 
				 	/* Internal vectors */
			
 
				-	starpu_data_malloc_pinned_if_possible((void **)&r, n*sizeof(TYPE));
			
 
				-	starpu_data_malloc_pinned_if_possible((void **)&d, n*sizeof(TYPE));
			
 
				-	starpu_data_malloc_pinned_if_possible((void **)&q, n*sizeof(TYPE));
			
 
				+	starpu_malloc((void **)&r, n*sizeof(TYPE));
			
 
				+	starpu_malloc((void **)&d, n*sizeof(TYPE));
			
 
				+	starpu_malloc((void **)&q, n*sizeof(TYPE));
			
 
				 	assert(r && d && q);
			
 
				 
			
 
				 	memset(r, 0, n*sizeof(TYPE));
			
@@ -205,12 +204,12 @@ static void display_vector(starpu_data_handle handle, TYPE *ptr)
 
				 		starpu_data_acquire(starpu_data_get_sub_data(handle, 1, b), STARPU_R);
			
 
				 		for (ind = 0; ind < block_size; ind++)
			
 
				 		{
			
 
				-			fprintf(stderr, "%2.2e ", ptr[b*block_size + ind]);
			
 
				+			FPRINTF(stderr, "%2.2e ", ptr[b*block_size + ind]);
			
 
				 		}
			
 
				-		fprintf(stderr, "| ");
			
 
				+		FPRINTF(stderr, "| ");
			
 
				 		starpu_data_release(starpu_data_get_sub_data(handle, 1, b));
			
 
				 	}
			
 
				-	fprintf(stderr, "\n");
			
 
				+	FPRINTF(stderr, "\n");
			
 
				 }
			
 
				 
			
 
				 static void display_matrix(void)
			
@@ -220,9 +219,9 @@ static void display_matrix(void)
 
				 	{
			
 
				 		for (j = 0; j < n; j++)
			
 
				 		{
			
 
				-			fprintf(stderr, "%2.2e ", A[j*n + i]);
			
 
				+			FPRINTF(stderr, "%2.2e ", A[j*n + i]);
			
 
				 		}
			
 
				-		fprintf(stderr, "\n");
			
 
				+		FPRINTF(stderr, "\n");
			
 
				 	}
			
 
				 }
			
 
				 #endif
			
@@ -255,8 +254,8 @@ static void cg(void)
 
				 	delta_0 = delta_new;
			
 
				 	starpu_data_release(rtr_handle);
			
 
				 
			
 
				-	fprintf(stderr, "*************** INITIAL ************ \n");
			
 
				-	fprintf(stderr, "Delta 0: %e\n", delta_new);
			
 
				+	FPRINTF(stderr, "*************** INITIAL ************ \n");
			
 
				+	FPRINTF(stderr, "Delta 0: %e\n", delta_new);
			
 
				 
			
 
				 	struct timeval start;
			
 
				 	struct timeval end;
			
@@ -307,8 +306,8 @@ static void cg(void)
 
				 		{
			
 
				 			/* We here take the error as ||r||_2 / (n||b||_2) */
			
 
				 			double error = sqrt(delta_new/delta_0)/(1.0*n);
			
 
				-			fprintf(stderr, "*****************************************\n");
			
 
				-			fprintf(stderr, "iter %d DELTA %e - %e\n", i, delta_new, error);
			
 
				+			FPRINTF(stderr, "*****************************************\n");
			
 
				+			FPRINTF(stderr, "iter %d DELTA %e - %e\n", i, delta_new, error);
			
 
				 		}
			
 
				 
			
 
				 		i++;
			
@@ -317,8 +316,8 @@ static void cg(void)
 
				 	gettimeofday(&end, NULL);
			
 
				 
			
 
				 	double timing = (double)(((double)end.tv_sec - (double)start.tv_sec)*10e6 + ((double)end.tv_usec - (double)start.tv_usec));
			
 
				-	fprintf(stderr, "Total timing : %2.2f seconds\n", timing/10e6);
			
 
				-	fprintf(stderr, "Seconds per iteration : %2.2e\n", timing/10e6/i);
			
 
				+	FPRINTF(stderr, "Total timing : %2.2f seconds\n", timing/10e6);
			
 
				+	FPRINTF(stderr, "Seconds per iteration : %2.2e\n", timing/10e6/i);
			
 
				 }
			
 
				 
			
 
				 static int check(void)
			
@@ -351,7 +350,7 @@ static void parse_args(int argc, char **argv)
 
				 		}
			
 
				 
			
 
				 	        if (strcmp(argv[i], "-h") == 0) {
			
 
				-			fprintf(stderr, "usage: %s [-h] [-nblocks #blocks] [-n problem_size] [-no-reduction] [-maxiter i]\n", argv[0]);
			
 
				+			FPRINTF(stderr, "usage: %s [-h] [-nblocks #blocks] [-n problem_size] [-no-reduction] [-maxiter i]\n", argv[0]);
			
 
				 			exit(-1);
			
 
				 			continue;
			
 
				 		}
			
--- a/examples/cg/cg.h
+++ b/examples/cg/cg.h
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010-2011  Université de Bordeaux 1
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -82,4 +82,4 @@ void copy_handle(starpu_data_handle dst,
 
				 		starpu_data_handle src,
			
 
				 		unsigned nblocks);
			
 
				 
			
 
				-#endif // __STARPU_EXAMPLE_CG_H__
			
 
				+#endif /* __STARPU_EXAMPLE_CG_H__ */
			
--- a/examples/cg/cg_dot_kernel.cu
+++ b/examples/cg/cg_dot_kernel.cu
@@ -126,3 +126,22 @@ extern "C" void dot_host(TYPE *x, TYPE *y, unsigned nelems, TYPE *dot)
 
				 
			
 
				 	cudaFree(per_block_sum);
			
 
				 }
			
 
				+
			
 
				+static __global__ void zero_vector_device(TYPE *x, unsigned nelems, unsigned nelems_per_thread)
			
 
				+{
			
 
				+	unsigned i;
			
 
				+	unsigned first_i = blockDim.x * blockIdx.x + threadIdx.x;
			
 
				+
			
 
				+	for (i = first_i; i < nelems; i += nelems_per_thread)
			
 
				+		x[i] = 0.0;
			
 
				+}
			
 
				+
			
 
				+extern "C" void zero_vector(TYPE *x, unsigned nelems)
			
 
				+{
			
 
				+	unsigned nblocks = STARPU_MIN(128, nelems);
			
 
				+	unsigned nthread_per_block = STARPU_MIN(MAXTHREADSPERBLOCK, (nelems / nblocks));
			
 
				+
			
 
				+	unsigned nelems_per_thread = nelems / (nblocks * nthread_per_block);
			
 
				+
			
 
				+	zero_vector_device<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(x, nelems, nelems_per_thread);
			
 
				+}
			
--- a/examples/cg/cg_kernels.c
+++ b/examples/cg/cg_kernels.c
@@ -16,6 +16,7 @@
 
				 
			
 
				 #include "cg.h"
			
 
				 #include <math.h>
			
 
				+#include <limits.h>
			
 
				 
			
 
				 #if 0
			
 
				 static void print_vector_from_descr(unsigned nx, TYPE *v)
			
@@ -123,11 +124,14 @@ starpu_codelet accumulate_vector_cl = {
 
				  */
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				+extern void zero_vector(TYPE *x, unsigned nelems);
			
 
				+
			
 
				 static void bzero_variable_cuda(void *descr[], void *cl_arg)
			
 
				 {
			
 
				 	TYPE *v = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
			
 
				+
			
 
				+	zero_vector(v, 1);
			
 
				  
			
 
				-	cublasscal (1, (TYPE)0.0, v, 1);
			
 
				 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
 
				 #endif
			
@@ -159,7 +163,8 @@ static void bzero_vector_cuda(void *descr[], void *cl_arg)
 
				 	TYPE *v = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
			
 
				 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
			
 
				  
			
 
				-	cublasscal (n, (TYPE)0.0, v, 1);
			
 
				+	zero_vector(v, n);
			
 
				+
			
 
				 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
 
				 #endif
			
@@ -578,8 +583,8 @@ static void copy_handle_cuda(void *descr[], void *cl_arg)
 
				 	unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);
			
 
				 	size_t elemsize = STARPU_VECTOR_GET_ELEMSIZE(descr[0]);
			
 
				 
			
 
				-	cudaMemcpy(dst, src, nx*elemsize, cudaMemcpyDeviceToDevice);
			
 
				-	cudaThreadSynchronize();
			
 
				+	cudaMemcpyAsync(dst, src, nx*elemsize, cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
			
 
				+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
 
				 #endif
			
 
				 
			
--- a/examples/cholesky/cholesky.h
+++ b/examples/cholesky/cholesky.h
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -31,6 +31,7 @@
 
				 #include <common/blas.h>
			
 
				 #include <starpu.h>
			
 
				 
			
 
				+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
			
 
				 #define NMAXBLOCKS	32
			
 
				 
			
 
				 #define TAG11(k)	((starpu_tag_t)( (1ULL<<60) | (unsigned long long)(k)))
			
@@ -112,4 +113,4 @@ static void __attribute__((unused)) parse_args(int argc, char **argv)
 
				 	}
			
 
				 }
			
 
				 
			
 
				-#endif // __DW_CHOLESKY_H__
			
 
				+#endif /* __DW_CHOLESKY_H__ */
			
--- a/examples/cholesky/cholesky_grain_tag.c
+++ b/examples/cholesky/cholesky_grain_tag.c
@@ -2,7 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -49,7 +49,7 @@ static starpu_codelet cl11 =
 
				 
			
 
				 static struct starpu_task * create_task_11(starpu_data_handle dataA, unsigned k, unsigned reclevel)
			
 
				 {
			
 
				-//	printf("task 11 k = %d TAG = %llx\n", k, (TAG11(k)));
			
 
				+/*	FPRINTF(stdout, "task 11 k = %d TAG = %llx\n", k, (TAG11(k))); */
			
 
				 
			
 
				 	struct starpu_task *task = create_task(TAG11_AUX(k, reclevel));
			
 
				 	
			
@@ -121,7 +121,7 @@ static starpu_codelet cl22 =
 
				 
			
 
				 static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, unsigned j, unsigned reclevel)
			
 
				 {
			
 
				-//	printf("task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22_AUX(k,i,j));
			
 
				+/*	FPRINTF(stdout, "task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22_AUX(k,i,j)); */
			
 
				 
			
 
				 	struct starpu_task *task = create_task(TAG22_AUX(k, i, j, reclevel));
			
 
				 
			
@@ -173,17 +173,15 @@ static void cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 
				 
			
 
				 	starpu_data_set_sequential_consistency_flag(dataA, 0);
			
 
				 
			
 
				-	struct starpu_data_filter f;
			
 
				-		f.filter_func = starpu_vertical_block_filter_func;
			
 
				-		f.nchildren = nblocks;
			
 
				-		f.get_nchildren = NULL;
			
 
				-		f.get_child_ops = NULL;
			
 
				+	struct starpu_data_filter f = {
			
 
				+		.filter_func = starpu_vertical_block_filter_func,
			
 
				+		.nchildren = nblocks
			
 
				+	};
			
 
				 
			
 
				-	struct starpu_data_filter f2;
			
 
				-		f2.filter_func = starpu_block_filter_func;
			
 
				-		f2.nchildren = nblocks;
			
 
				-		f2.get_nchildren = NULL;
			
 
				-		f2.get_child_ops = NULL;
			
 
				+	struct starpu_data_filter f2 = {
			
 
				+		.filter_func = starpu_block_filter_func,
			
 
				+		.nchildren = nblocks
			
 
				+	};
			
 
				 
			
 
				 	starpu_data_map_filters(dataA, 2, &f, &f2);
			
 
				 
			
@@ -214,7 +212,7 @@ static void cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 
				 	int ret = starpu_task_submit(entry_task);
			
 
				 	if (STARPU_UNLIKELY(ret == -ENODEV))
			
 
				 	{
			
 
				-		fprintf(stderr, "No worker may execute this task\n");
			
 
				+		FPRINTF(stderr, "No worker may execute this task\n");
			
 
				 		exit(-1);
			
 
				 	}
			
 
				 
			
@@ -261,7 +259,7 @@ static void initialize_system(float **A, unsigned dim, unsigned pinned)
 
				 
			
 
				 	if (pinned)
			
 
				 	{
			
 
				-		starpu_data_malloc_pinned_if_possible((void **)A, dim*dim*sizeof(float));
			
 
				+		starpu_malloc((void **)A, dim*dim*sizeof(float));
			
 
				 	} 
			
 
				 	else {
			
 
				 		*A = malloc(dim*dim*sizeof(float));
			
@@ -280,11 +278,11 @@ void cholesky_grain(float *matA, unsigned size, unsigned ld, unsigned nblocks, u
 
				 	gettimeofday(&end, NULL);
			
 
				 
			
 
				 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
			
 
				-	fprintf(stderr, "Computation took (in ms)\n");
			
 
				-	printf("%2.2f\n", timing/1000);
			
 
				+	FPRINTF(stderr, "Computation took (in ms)\n");
			
 
				+	FPRINTF(stdout, "%2.2f\n", timing/1000);
			
 
				 
			
 
				 	double flop = (1.0f*size*size*size)/3.0f;
			
 
				-	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
			
 
				+	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
			
 
				 
			
 
				 	starpu_helper_cublas_shutdown();
			
 
				 
			
@@ -311,26 +309,26 @@ int main(int argc, char **argv)
 
				 		for (j = 0; j < size; j++)
			
 
				 		{
			
 
				 			mat[j +i*size] = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
			
 
				-			//mat[j +i*size] = ((i == j)?1.0f*size:0.0f);
			
 
				+			/* mat[j +i*size] = ((i == j)?1.0f*size:0.0f); */
			
 
				 		}
			
 
				 	}
			
 
				 
			
 
				 
			
 
				 #ifdef CHECK_OUTPUT
			
 
				-	printf("Input :\n");
			
 
				+	FPRINTF(stdout, "Input :\n");
			
 
				 
			
 
				 	for (j = 0; j < size; j++)
			
 
				 	{
			
 
				 		for (i = 0; i < size; i++)
			
 
				 		{
			
 
				 			if (i <= j) {
			
 
				-				printf("%2.2f\t", mat[j +i*size]);
			
 
				+				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
			
 
				 			}
			
 
				 			else {
			
 
				-				printf(".\t");
			
 
				+				FPRINTF(stdout, ".\t");
			
 
				 			}
			
 
				 		}
			
 
				-		printf("\n");
			
 
				+		FPRINTF(stdout, "\n");
			
 
				 	}
			
 
				 #endif
			
 
				 
			
@@ -338,43 +336,43 @@ int main(int argc, char **argv)
 
				 	cholesky_grain(mat, size, size, nblocks, nbigblocks);
			
 
				 
			
 
				 #ifdef CHECK_OUTPUT
			
 
				-	printf("Results :\n");
			
 
				+	FPRINTF(stdout, "Results :\n");
			
 
				 
			
 
				 	for (j = 0; j < size; j++)
			
 
				 	{
			
 
				 		for (i = 0; i < size; i++)
			
 
				 		{
			
 
				 			if (i <= j) {
			
 
				-				printf("%2.2f\t", mat[j +i*size]);
			
 
				+				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
			
 
				 			}
			
 
				 			else {
			
 
				-				printf(".\t");
			
 
				-				mat[j+i*size] = 0.0f; // debug
			
 
				+				FPRINTF(stdout, ".\t");
			
 
				+				mat[j+i*size] = 0.0f; /* debug */
			
 
				 			}
			
 
				 		}
			
 
				-		printf("\n");
			
 
				+		FPRINTF(stdout, "\n");
			
 
				 	}
			
 
				 
			
 
				-	fprintf(stderr, "compute explicit LLt ...\n");
			
 
				+	FPRINTF(stderr, "compute explicit LLt ...\n");
			
 
				 	float *test_mat = malloc(size*size*sizeof(float));
			
 
				 	STARPU_ASSERT(test_mat);
			
 
				 
			
 
				 	SSYRK("L", "N", size, size, 1.0f, 
			
 
				 				mat, size, 0.0f, test_mat, size);
			
 
				 
			
 
				-	fprintf(stderr, "comparing results ...\n");
			
 
				+	FPRINTF(stderr, "comparing results ...\n");
			
 
				 	for (j = 0; j < size; j++)
			
 
				 	{
			
 
				 		for (i = 0; i < size; i++)
			
 
				 		{
			
 
				 			if (i <= j) {
			
 
				-				printf("%2.2f\t", test_mat[j +i*size]);
			
 
				+                                FPRINTF(stdout, "%2.2f\t", test_mat[j +i*size]);
			
 
				 			}
			
 
				 			else {
			
 
				-				printf(".\t");
			
 
				+				FPRINTF(stdout, ".\t");
			
 
				 			}
			
 
				 		}
			
 
				-		printf("\n");
			
 
				+		FPRINTF(stdout, "\n");
			
 
				 	}
			
 
				 #endif
			
 
				 
			
--- a/examples/cholesky/cholesky_implicit.c
+++ b/examples/cholesky/cholesky_implicit.c
@@ -126,13 +126,13 @@ static void _cholesky(starpu_data_handle dataA, unsigned nblocks)
 
				 	gettimeofday(&end, NULL);
			
 
				 
			
 
				 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
			
 
				-	fprintf(stderr, "Computation took (in ms)\n");
			
 
				-	printf("%2.2f\n", timing/1000);
			
 
				+	FPRINTF(stderr, "Computation took (in ms)\n");
			
 
				+	FPRINTF(stdout, "%2.2f\n", timing/1000);
			
 
				 
			
 
				 	unsigned long n = starpu_matrix_get_nx(dataA);
			
 
				 
			
 
				 	double flop = (1.0f*n*n*n)/3.0f;
			
 
				-	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
			
 
				+	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
			
 
				 }
			
 
				 
			
 
				 static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
			
@@ -143,17 +143,15 @@ static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 
				 	 * one block is now determined by 2 unsigned (i,j) */
			
 
				 	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(float));
			
 
				 
			
 
				-	struct starpu_data_filter f;
			
 
				-		f.filter_func = starpu_vertical_block_filter_func;
			
 
				-		f.nchildren = nblocks;
			
 
				-		f.get_nchildren = NULL;
			
 
				-		f.get_child_ops = NULL;
			
 
				+	struct starpu_data_filter f = {
			
 
				+		.filter_func = starpu_vertical_block_filter_func,
			
 
				+		.nchildren = nblocks
			
 
				+	};
			
 
				 
			
 
				-	struct starpu_data_filter f2;
			
 
				-		f2.filter_func = starpu_block_filter_func;
			
 
				-		f2.nchildren = nblocks;
			
 
				-		f2.get_nchildren = NULL;
			
 
				-		f2.get_child_ops = NULL;
			
 
				+	struct starpu_data_filter f2 = {
			
 
				+		.filter_func = starpu_block_filter_func,
			
 
				+		.nchildren = nblocks
			
 
				+	};
			
 
				 
			
 
				 	starpu_data_map_filters(dataA, 2, &f, &f2);
			
 
				 
			
@@ -174,7 +172,7 @@ int main(int argc, char **argv)
 
				 	starpu_helper_cublas_init();
			
 
				 
			
 
				 	float *mat;
			
 
				-	starpu_data_malloc_pinned_if_possible((void **)&mat, (size_t)size*size*sizeof(float));
			
 
				+	starpu_malloc((void **)&mat, (size_t)size*size*sizeof(float));
			
 
				 
			
 
				 	unsigned i,j;
			
 
				 	for (i = 0; i < size; i++)
			
@@ -182,58 +180,58 @@ int main(int argc, char **argv)
 
				 		for (j = 0; j < size; j++)
			
 
				 		{
			
 
				 			mat[j +i*size] = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
			
 
				-			//mat[j +i*size] = ((i == j)?1.0f*size:0.0f);
			
 
				+			/* mat[j +i*size] = ((i == j)?1.0f*size:0.0f); */
			
 
				 		}
			
 
				 	}
			
 
				 
			
 
				-//#define PRINT_OUTPUT
			
 
				+/* #define PRINT_OUTPUT */
			
 
				 #ifdef PRINT_OUTPUT
			
 
				-	printf("Input :\n");
			
 
				+	FPRINTF(stdout, "Input :\n");
			
 
				 
			
 
				 	for (j = 0; j < size; j++)
			
 
				 	{
			
 
				 		for (i = 0; i < size; i++)
			
 
				 		{
			
 
				 			if (i <= j) {
			
 
				-				printf("%2.2f\t", mat[j +i*size]);
			
 
				+				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
			
 
				 			}
			
 
				 			else {
			
 
				-				printf(".\t");
			
 
				+				FPRINTF(stdout, ".\t");
			
 
				 			}
			
 
				 		}
			
 
				-		printf("\n");
			
 
				+		FPRINTF(stdout, "\n");
			
 
				 	}
			
 
				 #endif
			
 
				 
			
 
				 	cholesky(mat, size, size, nblocks);
			
 
				 
			
 
				 #ifdef PRINT_OUTPUT
			
 
				-	printf("Results :\n");
			
 
				+	FPRINTF(stdout, "Results :\n");
			
 
				 	for (j = 0; j < size; j++)
			
 
				 	{
			
 
				 		for (i = 0; i < size; i++)
			
 
				 		{
			
 
				 			if (i <= j) {
			
 
				-				printf("%2.2f\t", mat[j +i*size]);
			
 
				+				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
			
 
				 			}
			
 
				 			else {
			
 
				-				printf(".\t");
			
 
				-				mat[j+i*size] = 0.0f; // debug
			
 
				+				FPRINTF(stdout, ".\t");
			
 
				+				mat[j+i*size] = 0.0f; /* debug */
			
 
				 			}
			
 
				 		}
			
 
				-		printf("\n");
			
 
				+		FPRINTF(stdout, "\n");
			
 
				 	}
			
 
				 #endif
			
 
				 
			
 
				 	if (check)
			
 
				 	{
			
 
				-		fprintf(stderr, "compute explicit LLt ...\n");
			
 
				+		FPRINTF(stderr, "compute explicit LLt ...\n");
			
 
				 		for (j = 0; j < size; j++)
			
 
				 		{
			
 
				 			for (i = 0; i < size; i++)
			
 
				 			{
			
 
				 				if (i > j) {
			
 
				-					mat[j+i*size] = 0.0f; // debug
			
 
				+					mat[j+i*size] = 0.0f; /* debug */
			
 
				 				}
			
 
				 			}
			
 
				 		}
			
@@ -243,20 +241,20 @@ int main(int argc, char **argv)
 
				 		SSYRK("L", "N", size, size, 1.0f,
			
 
				 					mat, size, 0.0f, test_mat, size);
			
 
				 	
			
 
				-		fprintf(stderr, "comparing results ...\n");
			
 
				+		FPRINTF(stderr, "comparing results ...\n");
			
 
				 #ifdef PRINT_OUTPUT
			
 
				 		for (j = 0; j < size; j++)
			
 
				 		{
			
 
				 			for (i = 0; i < size; i++)
			
 
				 			{
			
 
				 				if (i <= j) {
			
 
				-					printf("%2.2f\t", test_mat[j +i*size]);
			
 
				+					FPRINTF(stdout, "%2.2f\t", test_mat[j +i*size]);
			
 
				 				}
			
 
				 				else {
			
 
				-					printf(".\t");
			
 
				+					FPRINTF(stdout, ".\t");
			
 
				 				}
			
 
				 			}
			
 
				-			printf("\n");
			
 
				+			FPRINTF(stdout, "\n");
			
 
				 		}
			
 
				 #endif
			
 
				 	
			
@@ -268,7 +266,7 @@ int main(int argc, char **argv)
 
				 	                                float orig = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
			
 
				 	                                float err = abs(test_mat[j +i*size] - orig);
			
 
				 	                                if (err > 0.00001) {
			
 
				-	                                        fprintf(stderr, "Error[%d, %d] --> %2.2f != %2.2f (err %2.2f)\n", i, j, test_mat[j +i*size], orig, err);
			
 
				+	                                        FPRINTF(stderr, "Error[%u, %u] --> %2.2f != %2.2f (err %2.2f)\n", i, j, test_mat[j +i*size], orig, err);
			
 
				 	                                        assert(0);
			
 
				 	                                }
			
 
				 	                        }
			
--- a/examples/cholesky/cholesky_kernels.c
+++ b/examples/cholesky/cholesky_kernels.c
@@ -20,6 +20,10 @@
 
				 #include "../common/blas.h"
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 #include <starpu_cuda.h>
			
 
				+#ifdef STARPU_HAVE_MAGMA
			
 
				+#include "magma.h"
			
 
				+#include "magma_lapack.h"
			
 
				+#endif
			
 
				 #endif
			
 
				 
			
 
				 /*
			
@@ -28,7 +32,7 @@
 
				 
			
 
				 static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, __attribute__((unused)) void *_args)
			
 
				 {
			
 
				-	//printf("22\n");
			
 
				+	/* printf("22\n"); */
			
 
				 	float *left 	= (float *)STARPU_MATRIX_GET_PTR(descr[0]);
			
 
				 	float *right 	= (float *)STARPU_MATRIX_GET_PTR(descr[1]);
			
 
				 	float *center 	= (float *)STARPU_MATRIX_GET_PTR(descr[2]);
			
@@ -88,7 +92,7 @@ void chol_cublas_codelet_update_u22(void *descr[], void *_args)
 
				 {
			
 
				 	chol_common_cpu_codelet_update_u22(descr, 1, _args);
			
 
				 }
			
 
				-#endif// STARPU_USE_CUDA
			
 
				+#endif /* STARPU_USE_CUDA */
			
 
				 
			
 
				 /* 
			
 
				  * U21
			
@@ -96,7 +100,7 @@ void chol_cublas_codelet_update_u22(void *descr[], void *_args)
 
				 
			
 
				 static inline void chol_common_codelet_update_u21(void *descr[], int s, __attribute__((unused)) void *_args)
			
 
				 {
			
 
				-//	printf("21\n");
			
 
				+/*	printf("21\n"); */
			
 
				 	float *sub11;
			
 
				 	float *sub21;
			
 
				 
			
@@ -143,7 +147,7 @@ void chol_cublas_codelet_update_u21(void *descr[], void *_args)
 
				 
			
 
				 static inline void chol_common_codelet_update_u11(void *descr[], int s, __attribute__((unused)) void *_args) 
			
 
				 {
			
 
				-//	printf("11\n");
			
 
				+/*	printf("11\n"); */
			
 
				 	float *sub11;
			
 
				 
			
 
				 	sub11 = (float *)STARPU_MATRIX_GET_PTR(descr[0]); 
			
@@ -179,13 +183,27 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, __attrib
 
				 			break;
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 		case 1:
			
 
				+#ifdef STARPU_HAVE_MAGMA
			
 
				 			{
			
 
				+			int ret;
			
 
				+			int info;
			
 
				+			ret = magma_spotrf_gpu('L', nx, sub11, ld, &info);
			
 
				+			if (ret != MAGMA_SUCCESS) {
			
 
				+				fprintf(stderr, "Error in Magma: %d\n", ret);
			
 
				+				STARPU_ABORT();
			
 
				+			}
			
 
				+			cudaError_t cures = cudaThreadSynchronize();
			
 
				+			STARPU_ASSERT(!cures);
			
 
				+			}
			
 
				+#else
			
 
				+			{
			
 
				+
			
 
				 			float *lambda11;
			
 
				 			cudaHostAlloc((void **)&lambda11, sizeof(float), 0);
			
 
				 
			
 
				 			for (z = 0; z < nx; z++)
			
 
				 			{
			
 
				-
			
 
				+				
			
 
				 				cudaMemcpyAsync(lambda11, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
			
 
				 				cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 
			
@@ -193,7 +211,7 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, __attrib
 
				 				
			
 
				 				*lambda11 = sqrt(*lambda11);
			
 
				 
			
 
				-//				cublasSetVector(1, sizeof(float), lambda11, sizeof(float), &sub11[z+z*ld], sizeof(float));
			
 
				+/*				cublasSetVector(1, sizeof(float), lambda11, sizeof(float), &sub11[z+z*ld], sizeof(float)); */
			
 
				 				cudaMemcpyAsync(&sub11[z+z*ld], lambda11, sizeof(float), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
			
 
				 
			
 
				 				cublasSscal(nx - z - 1, 1.0f/(*lambda11), &sub11[(z+1)+z*ld], 1);
			
@@ -206,8 +224,7 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, __attrib
 
				 			cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 			cudaFreeHost(lambda11);
			
 
				 			}
			
 
				-		
			
 
				-
			
 
				+#endif
			
 
				 			break;
			
 
				 #endif
			
 
				 		default:
			
@@ -227,4 +244,4 @@ void chol_cublas_codelet_update_u11(void *descr[], void *_args)
 
				 {
			
 
				 	chol_common_codelet_update_u11(descr, 1, _args);
			
 
				 }
			
 
				-#endif// STARPU_USE_CUDA
			
 
				+#endif/* STARPU_USE_CUDA */
			
--- a/examples/cholesky/cholesky_models.c
+++ b/examples/cholesky/cholesky_models.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -26,7 +26,7 @@
 
				 
			
 
				 #include <starpu.h>
			
 
				 
			
 
				-//#define USE_PERTURBATION	1
			
 
				+/* #define USE_PERTURBATION	1 */
			
 
				 
			
 
				 #ifdef USE_PERTURBATION
			
 
				 #define PERTURBATE(a)	((starpu_drand48()*2.0f*(AMPL) + 1.0f - (AMPL))*(a))
			
@@ -43,7 +43,7 @@ static double cpu_chol_task_11_cost(starpu_buffer_descr *descr)
 
				 	double cost = (((double)(n)*n*n)/1000.0f*0.894/0.79176);
			
 
				 
			
 
				 #ifdef STARPU_MODEL_DEBUG
			
 
				-	printf("cpu_chol_task_11_cost n %d cost %e\n", n, cost);
			
 
				+	FPRINTF(stdout, "cpu_chol_task_11_cost n %d cost %e\n", n, cost);
			
 
				 #endif
			
 
				 
			
 
				 	return PERTURBATE(cost);
			
@@ -58,7 +58,7 @@ static double cuda_chol_task_11_cost(starpu_buffer_descr *descr)
 
				 	double cost = (((double)(n)*n*n)/50.0f/10.75/5.088633/0.9883);
			
 
				 
			
 
				 #ifdef STARPU_MODEL_DEBUG
			
 
				-	printf("cuda_chol_task_11_cost n %d cost %e\n", n, cost);
			
 
				+	FPRINTF(stdout, "cuda_chol_task_11_cost n %d cost %e\n", n, cost);
			
 
				 #endif
			
 
				 
			
 
				 	return PERTURBATE(cost);
			
@@ -73,7 +73,7 @@ static double cpu_chol_task_21_cost(starpu_buffer_descr *descr)
 
				 	double cost = (((double)(n)*n*n)/7706.674/0.95/0.9965);
			
 
				 
			
 
				 #ifdef STARPU_MODEL_DEBUG
			
 
				-	printf("cpu_chol_task_21_cost n %d cost %e\n", n, cost);
			
 
				+	FPRINTF(stdout, "cpu_chol_task_21_cost n %d cost %e\n", n, cost);
			
 
				 #endif
			
 
				 
			
 
				 	return PERTURBATE(cost);
			
@@ -88,7 +88,7 @@ static double cuda_chol_task_21_cost(starpu_buffer_descr *descr)
 
				 	double cost = (((double)(n)*n*n)/50.0f/10.75/87.29520);
			
 
				 
			
 
				 #ifdef STARPU_MODEL_DEBUG
			
 
				-	printf("cuda_chol_task_21_cost n %d cost %e\n", n, cost);
			
 
				+	FPRINTF(stdout, "cuda_chol_task_21_cost n %d cost %e\n", n, cost);
			
 
				 #endif
			
 
				 
			
 
				 	return PERTURBATE(cost);
			
@@ -103,7 +103,7 @@ static double cpu_chol_task_22_cost(starpu_buffer_descr *descr)
 
				 	double cost = (((double)(n)*n*n)/50.0f/10.75/8.0760);
			
 
				 
			
 
				 #ifdef STARPU_MODEL_DEBUG
			
 
				-	printf("cpu_chol_task_22_cost n %d cost %e\n", n, cost);
			
 
				+	FPRINTF(stdout, "cpu_chol_task_22_cost n %d cost %e\n", n, cost);
			
 
				 #endif
			
 
				 
			
 
				 	return PERTURBATE(cost);
			
@@ -118,7 +118,7 @@ static double cuda_chol_task_22_cost(starpu_buffer_descr *descr)
 
				 	double cost = (((double)(n)*n*n)/50.0f/10.75/76.30666);
			
 
				 
			
 
				 #ifdef STARPU_MODEL_DEBUG
			
 
				-	printf("cuda_chol_task_22_cost n %d cost %e\n", n, cost);
			
 
				+	FPRINTF(stdout, "cuda_chol_task_22_cost n %d cost %e\n", n, cost);
			
 
				 #endif
			
 
				 
			
 
				 	return PERTURBATE(cost);
			
--- a/examples/cholesky/cholesky_tag.c
+++ b/examples/cholesky/cholesky_tag.c
@@ -2,7 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -49,7 +49,7 @@ static starpu_codelet cl11 =
 
				 
			
 
				 static struct starpu_task * create_task_11(starpu_data_handle dataA, unsigned k)
			
 
				 {
			
 
				-//	printf("task 11 k = %d TAG = %llx\n", k, (TAG11(k)));
			
 
				+/*	FPRINTF(stdout, "task 11 k = %d TAG = %llx\n", k, (TAG11(k))); */
			
 
				 
			
 
				 	struct starpu_task *task = create_task(TAG11(k));
			
 
				 	
			
@@ -108,7 +108,7 @@ static void create_task_21(starpu_data_handle dataA, unsigned k, unsigned j)
 
				 
			
 
				 	int ret = starpu_task_submit(task);
			
 
				         if (STARPU_UNLIKELY(ret == -ENODEV)) {
			
 
				-                fprintf(stderr, "No worker may execute this task\n");
			
 
				+                FPRINTF(stderr, "No worker may execute this task\n");
			
 
				                 exit(0);
			
 
				         }
			
 
				 
			
@@ -127,7 +127,7 @@ static starpu_codelet cl22 =
 
				 
			
 
				 static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, unsigned j)
			
 
				 {
			
 
				-//	printf("task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j));
			
 
				+/*	FPRINTF(stdout, "task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j)); */
			
 
				 
			
 
				 	struct starpu_task *task = create_task(TAG22(k, i, j));
			
 
				 
			
@@ -155,7 +155,7 @@ static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, uns
 
				 
			
 
				 	int ret = starpu_task_submit(task);
			
 
				         if (STARPU_UNLIKELY(ret == -ENODEV)) {
			
 
				-                fprintf(stderr, "No worker may execute this task\n");
			
 
				+                FPRINTF(stderr, "No worker may execute this task\n");
			
 
				                 exit(0);
			
 
				         }
			
 
				 }
			
@@ -189,7 +189,7 @@ static void _cholesky(starpu_data_handle dataA, unsigned nblocks)
 
				 		else {
			
 
				 			int ret = starpu_task_submit(task);
			
 
				                         if (STARPU_UNLIKELY(ret == -ENODEV)) {
			
 
				-                                fprintf(stderr, "No worker may execute this task\n");
			
 
				+                                FPRINTF(stderr, "No worker may execute this task\n");
			
 
				                                 exit(0);
			
 
				                         }
			
 
				 
			
@@ -210,7 +210,7 @@ static void _cholesky(starpu_data_handle dataA, unsigned nblocks)
 
				 	/* schedule the codelet */
			
 
				 	int ret = starpu_task_submit(entry_task);
			
 
				         if (STARPU_UNLIKELY(ret == -ENODEV)) {
			
 
				-                fprintf(stderr, "No worker may execute this task\n");
			
 
				+                FPRINTF(stderr, "No worker may execute this task\n");
			
 
				                 exit(0);
			
 
				         }
			
 
				 
			
@@ -224,13 +224,13 @@ static void _cholesky(starpu_data_handle dataA, unsigned nblocks)
 
				 
			
 
				 
			
 
				 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
			
 
				-	fprintf(stderr, "Computation took (in ms)\n");
			
 
				-	printf("%2.2f\n", timing/1000);
			
 
				+	FPRINTF(stderr, "Computation took (in ms)\n");
			
 
				+	FPRINTF(stdout, "%2.2f\n", timing/1000);
			
 
				 
			
 
				 	unsigned n = starpu_matrix_get_nx(dataA);
			
 
				 
			
 
				 	double flop = (1.0f*n*n*n)/3.0f;
			
 
				-	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
			
 
				+	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
			
 
				 }
			
 
				 
			
 
				 static void initialize_system(float **A, unsigned dim, unsigned pinned)
			
@@ -241,7 +241,7 @@ static void initialize_system(float **A, unsigned dim, unsigned pinned)
 
				 
			
 
				 	if (pinned)
			
 
				 	{
			
 
				-		starpu_data_malloc_pinned_if_possible((void **)A, (size_t)dim*dim*sizeof(float));
			
 
				+		starpu_malloc((void **)A, (size_t)dim*dim*sizeof(float));
			
 
				 	} 
			
 
				 	else {
			
 
				 		*A = malloc(dim*dim*sizeof(float));
			
@@ -258,17 +258,15 @@ static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 
				 
			
 
				 	starpu_data_set_sequential_consistency_flag(dataA, 0);
			
 
				 
			
 
				-	struct starpu_data_filter f;
			
 
				-		f.filter_func = starpu_vertical_block_filter_func;
			
 
				-		f.nchildren = nblocks;
			
 
				-		f.get_nchildren = NULL;
			
 
				-		f.get_child_ops = NULL;
			
 
				+	struct starpu_data_filter f = {
			
 
				+		.filter_func = starpu_vertical_block_filter_func,
			
 
				+		.nchildren = nblocks
			
 
				+	};
			
 
				 
			
 
				-	struct starpu_data_filter f2;
			
 
				-		f2.filter_func = starpu_block_filter_func;
			
 
				-		f2.nchildren = nblocks;
			
 
				-		f2.get_nchildren = NULL;
			
 
				-		f2.get_child_ops = NULL;
			
 
				+	struct starpu_data_filter f2 = {
			
 
				+		.filter_func = starpu_block_filter_func,
			
 
				+		.nchildren = nblocks
			
 
				+	};
			
 
				 
			
 
				 	starpu_data_map_filters(dataA, 2, &f, &f2);
			
 
				 
			
@@ -299,26 +297,26 @@ int main(int argc, char **argv)
 
				 		for (j = 0; j < size; j++)
			
 
				 		{
			
 
				 			mat[j +i*size] = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
			
 
				-			//mat[j +i*size] = ((i == j)?1.0f*size:0.0f);
			
 
				+			/* mat[j +i*size] = ((i == j)?1.0f*size:0.0f); */
			
 
				 		}
			
 
				 	}
			
 
				 
			
 
				 
			
 
				 #ifdef CHECK_OUTPUT
			
 
				-	printf("Input :\n");
			
 
				+	FPRINTF(stdout, "Input :\n");
			
 
				 
			
 
				 	for (j = 0; j < size; j++)
			
 
				 	{
			
 
				 		for (i = 0; i < size; i++)
			
 
				 		{
			
 
				 			if (i <= j) {
			
 
				-				printf("%2.2f\t", mat[j +i*size]);
			
 
				+				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
			
 
				 			}
			
 
				 			else {
			
 
				-				printf(".\t");
			
 
				+				FPRINTF(stdout, ".\t");
			
 
				 			}
			
 
				 		}
			
 
				-		printf("\n");
			
 
				+		FPRINTF(stdout, "\n");
			
 
				 	}
			
 
				 #endif
			
 
				 
			
@@ -326,43 +324,43 @@ int main(int argc, char **argv)
 
				 	cholesky(mat, size, size, nblocks);
			
 
				 
			
 
				 #ifdef CHECK_OUTPUT
			
 
				-	printf("Results :\n");
			
 
				+	FPRINTF(stdout, "Results :\n");
			
 
				 
			
 
				 	for (j = 0; j < size; j++)
			
 
				 	{
			
 
				 		for (i = 0; i < size; i++)
			
 
				 		{
			
 
				 			if (i <= j) {
			
 
				-				printf("%2.2f\t", mat[j +i*size]);
			
 
				+				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
			
 
				 			}
			
 
				 			else {
			
 
				-				printf(".\t");
			
 
				-				mat[j+i*size] = 0.0f; // debug
			
 
				+				FPRINTF(stdout, ".\t");
			
 
				+				mat[j+i*size] = 0.0f; /* debug */
			
 
				 			}
			
 
				 		}
			
 
				-		printf("\n");
			
 
				+		FPRINTF(stdout, "\n");
			
 
				 	}
			
 
				 
			
 
				-	fprintf(stderr, "compute explicit LLt ...\n");
			
 
				+	FPRINTF(stderr, "compute explicit LLt ...\n");
			
 
				 	float *test_mat = malloc(size*size*sizeof(float));
			
 
				 	STARPU_ASSERT(test_mat);
			
 
				 
			
 
				 	SSYRK("L", "N", size, size, 1.0f, 
			
 
				 				mat, size, 0.0f, test_mat, size);
			
 
				 
			
 
				-	fprintf(stderr, "comparing results ...\n");
			
 
				+	FPRINTF(stderr, "comparing results ...\n");
			
 
				 	for (j = 0; j < size; j++)
			
 
				 	{
			
 
				 		for (i = 0; i < size; i++)
			
 
				 		{
			
 
				 			if (i <= j) {
			
 
				-				printf("%2.2f\t", test_mat[j +i*size]);
			
 
				+				FPRINTF(stdout, "%2.2f\t", test_mat[j +i*size]);
			
 
				 			}
			
 
				 			else {
			
 
				-				printf(".\t");
			
 
				+				FPRINTF(stdout, ".\t");
			
 
				 			}
			
 
				 		}
			
 
				-		printf("\n");
			
 
				+		FPRINTF(stdout, "\n");
			
 
				 	}
			
 
				 #endif
			
 
				 
			
--- a/examples/cholesky/cholesky_tile_tag.c
+++ b/examples/cholesky/cholesky_tile_tag.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -59,7 +59,7 @@ static starpu_codelet cl11 =
 
				 
			
 
				 static struct starpu_task * create_task_11(unsigned k, unsigned nblocks)
			
 
				 {
			
 
				-//	printf("task 11 k = %d TAG = %llx\n", k, (TAG11(k)));
			
 
				+/*	FPRINTF(stdout, "task 11 k = %d TAG = %llx\n", k, (TAG11(k))); */
			
 
				 
			
 
				 	struct starpu_task *task = create_task(TAG11(k));
			
 
				 	
			
@@ -145,7 +145,7 @@ static starpu_codelet cl22 =
 
				 
			
 
				 static void create_task_22(unsigned k, unsigned i, unsigned j)
			
 
				 {
			
 
				-//	printf("task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j));
			
 
				+/*	FPRINTF(stdout, "task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j)); */
			
 
				 
			
 
				 	struct starpu_task *task = create_task(TAG22(k, i, j));
			
 
				 
			
@@ -224,11 +224,11 @@ static void cholesky_no_stride(void)
 
				 	gettimeofday(&end, NULL);
			
 
				 
			
 
				 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
			
 
				-	fprintf(stderr, "Computation took (in ms)\n");
			
 
				-	printf("%2.2f\n", timing/1000);
			
 
				+	FPRINTF(stderr, "Computation took (in ms)\n");
			
 
				+	FPRINTF(stdout, "%2.2f\n", timing/1000);
			
 
				 
			
 
				 	double flop = (1.0f*size*size*size)/3.0f;
			
 
				-	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
			
 
				+	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
			
 
				 }
			
 
				 
			
 
				 int main(int argc, char **argv)
			
@@ -239,7 +239,7 @@ int main(int argc, char **argv)
 
				 	parse_args(argc, argv);
			
 
				 	assert(nblocks <= NMAXBLOCKS);
			
 
				 
			
 
				-	fprintf(stderr, "BLOCK SIZE = %d\n", size / nblocks);
			
 
				+	FPRINTF(stderr, "BLOCK SIZE = %d\n", size / nblocks);
			
 
				 
			
 
				 	starpu_init(NULL);
			
 
				 
			
--- a/examples/common/blas.h
+++ b/examples/common/blas.h
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -154,4 +154,4 @@ extern void dswap_(const int *n, double *x, const int *incx, double *y, const in
 
				 
			
 
				 #endif
			
 
				 
			
 
				-#endif // __BLAS_H__
			
 
				+#endif /* __BLAS_H__ */
			
--- a/examples/common/blas_model.c
+++ b/examples/common/blas_model.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -37,11 +37,11 @@ double gemm_cost(starpu_buffer_descr *descr)
 
				 	nyC = starpu_matrix_get_ny(descr[2].handle);
			
 
				 	nxA = starpu_matrix_get_nx(descr[0].handle);
			
 
				 
			
 
				-//	printf("nxC %d nxC %d nxA %d\n", nxC, nyC, nxA);
			
 
				+/*	printf("nxC %d nxC %d nxA %d\n", nxC, nyC, nxA); */
			
 
				 
			
 
				 	double cost = ((double)nxC)*((double)nyC)*((double)nxA/1000.0f/4.11f);
			
 
				 
			
 
				-//	printf("cost %e \n", cost);
			
 
				+/*	printf("cost %e \n", cost); */
			
 
				 
			
 
				 	return cost;
			
 
				 }
			
--- a/examples/common/blas_model.h
+++ b/examples/common/blas_model.h
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -54,4 +54,4 @@ static struct starpu_perfmodel_t starpu_dgemm_model_common = {
 
				 	.type = STARPU_COMMON,
			
 
				 };
			
 
				 
			
 
				-#endif // __BLAS_MODEL_H__
			
 
				+#endif /* __BLAS_MODEL_H__ */
			
--- a/examples/filters/fblock.c
+++ b/examples/filters/fblock.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010-2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -23,6 +23,8 @@
 
				 #define NZ    3
			
 
				 #define PARTS 2
			
 
				 
			
 
				+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
			
 
				+
			
 
				 extern void cpu_func(void *buffers[], void *cl_arg);
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
@@ -36,17 +38,17 @@ extern void opencl_func(void *buffers[], void *cl_arg);
 
				 void print_block(int *block, int nx, int ny, int nz, unsigned ldy, unsigned ldz)
			
 
				 {
			
 
				         int i, j, k;
			
 
				-        fprintf(stderr, "block=%p nx=%d ny=%d nz=%d ldy=%d ldz=%d\n", block, nx, ny, nz, ldy, ldz);
			
 
				+        FPRINTF(stderr, "block=%p nx=%d ny=%d nz=%d ldy=%u ldz=%u\n", block, nx, ny, nz, ldy, ldz);
			
 
				         for(k=0 ; k<nz ; k++) {
			
 
				                 for(j=0 ; j<ny ; j++) {
			
 
				                         for(i=0 ; i<nx ; i++) {
			
 
				-                                fprintf(stderr, "%2d ", block[(k*ldz)+(j*ldy)+i]);
			
 
				+                                FPRINTF(stderr, "%2d ", block[(k*ldz)+(j*ldy)+i]);
			
 
				                         }
			
 
				-                        fprintf(stderr,"\n");
			
 
				+                        FPRINTF(stderr,"\n");
			
 
				                 }
			
 
				-                fprintf(stderr,"\n");
			
 
				+                FPRINTF(stderr,"\n");
			
 
				         }
			
 
				-        fprintf(stderr,"\n");
			
 
				+        FPRINTF(stderr,"\n");
			
 
				 }
			
 
				 
			
 
				 void print_data(starpu_data_handle block_handle)
			
@@ -96,30 +98,28 @@ int main(int argc, char **argv)
 
				         starpu_init(NULL);
			
 
				 
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				-        starpu_opencl_load_opencl_from_file("examples/filters/fblock_opencl_kernel.cl", &opencl_program);
			
 
				+        starpu_opencl_load_opencl_from_file("examples/filters/fblock_opencl_kernel.cl", &opencl_program, NULL);
			
 
				 #endif
			
 
				 
			
 
				         /* Declare data to StarPU */
			
 
				         starpu_block_data_register(&handle, 0, (uintptr_t)block, NX, NX*NY, NX, NY, NZ, sizeof(int));
			
 
				-        fprintf(stderr, "IN  Block\n");
			
 
				+        FPRINTF(stderr, "IN  Block\n");
			
 
				         print_data(handle);
			
 
				 
			
 
				         /* Partition the block in PARTS sub-blocks */
			
 
				 	struct starpu_data_filter f =
			
 
				 	{
			
 
				 		.filter_func = starpu_block_filter_func_block,
			
 
				-		.nchildren = PARTS,
			
 
				-		.get_nchildren = NULL,
			
 
				-		.get_child_ops = NULL
			
 
				+		.nchildren = PARTS
			
 
				 	};
			
 
				         starpu_data_partition(handle, &f);
			
 
				 
			
 
				-        fprintf(stderr,"Nb of partitions : %d\n",starpu_data_get_nb_children(handle));
			
 
				+        FPRINTF(stderr,"Nb of partitions : %d\n",starpu_data_get_nb_children(handle));
			
 
				 
			
 
				         for(i=0 ; i<starpu_data_get_nb_children(handle) ; i++)
			
 
				         {
			
 
				                 starpu_data_handle sblock = starpu_data_get_sub_data(handle, 1, i);
			
 
				-                fprintf(stderr, "Sub block %d\n", i);
			
 
				+                FPRINTF(stderr, "Sub block %d\n", i);
			
 
				                 print_data(sblock);
			
 
				         }
			
 
				 
			
@@ -129,7 +129,7 @@ int main(int argc, char **argv)
 
				                 int ret,multiplier=i;
			
 
				                 struct starpu_task *task = starpu_task_create();
			
 
				 
			
 
				-                fprintf(stderr,"Dealing with sub-block %d\n", i);
			
 
				+                FPRINTF(stderr,"Dealing with sub-block %d\n", i);
			
 
				                 task->cl = &cl;
			
 
				                 task->synchronous = 1;
			
 
				                 task->callback_func = NULL;
			
@@ -139,9 +139,10 @@ int main(int argc, char **argv)
 
				 
			
 
				                 ret = starpu_task_submit(task);
			
 
				                 if (ret) {
			
 
				-                        fprintf(stderr, "Error when submitting task\n");
			
 
				+                        FPRINTF(stderr, "Error when submitting task\n");
			
 
				                         exit(ret);
			
 
				                 }
			
 
				+		starpu_task_destroy(task);
			
 
				         }
			
 
				 
			
 
				         /* Unpartition the data, unregister it from StarPU and shutdown */
			
@@ -150,10 +151,11 @@ int main(int argc, char **argv)
 
				         starpu_data_unregister(handle);
			
 
				 
			
 
				         /* Print result block */
			
 
				-        fprintf(stderr, "OUT Block\n");
			
 
				+        FPRINTF(stderr, "OUT Block\n");
			
 
				         print_block(block, NX, NY, NZ, NX, NX*NY);
			
 
				 
			
 
				-	starpu_shutdown();
			
 
				+	free(block);
			
 
				 
			
 
				+	starpu_shutdown();
			
 
				 	return 0;
			
 
				 }
			
--- a/examples/filters/fblock_opencl.c
+++ b/examples/filters/fblock_opencl.c
@@ -28,7 +28,7 @@ void opencl_func(void *buffers[], void *cl_arg)
 
				 	cl_event event;
			
 
				 
			
 
				         int *factor = cl_arg;
			
 
				-	int *block = (int *)STARPU_BLOCK_GET_PTR(buffers[0]);
			
 
				+	cl_mem block = (cl_mem)STARPU_BLOCK_GET_PTR(buffers[0]);
			
 
				 	int nx = (int)STARPU_BLOCK_GET_NX(buffers[0]);
			
 
				 	int ny = (int)STARPU_BLOCK_GET_NY(buffers[0]);
			
 
				 	int nz = (int)STARPU_BLOCK_GET_NZ(buffers[0]);
			
@@ -42,7 +42,7 @@ void opencl_func(void *buffers[], void *cl_arg)
 
				 	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				 
			
 
				 	err = 0;
			
 
				-	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &block);
			
 
				+	err = clSetKernelArg(kernel, 0, sizeof(block), &block);
			
 
				 	err = clSetKernelArg(kernel, 1, sizeof(nx), &nx);
			
 
				 	err = clSetKernelArg(kernel, 2, sizeof(ny), &ny);
			
 
				 	err = clSetKernelArg(kernel, 3, sizeof(nz), &nz);
			
--- a/examples/filters/fmatrix.c
+++ b/examples/filters/fmatrix.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -20,6 +20,8 @@
 
				 #define NY    4
			
 
				 #define PARTS 2
			
 
				 
			
 
				+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
			
 
				+
			
 
				 void cpu_func(void *buffers[], void *cl_arg)
			
 
				 {
			
 
				         unsigned i, j;
			
@@ -43,15 +45,15 @@ int main(int argc, char **argv)
 
				 	unsigned i, j, n=1;
			
 
				         int matrix[NX*NY];
			
 
				 
			
 
				-        fprintf(stderr,"IN  Matrix: \n");
			
 
				+        FPRINTF(stderr,"IN  Matrix: \n");
			
 
				         for(j=0 ; j<NY ; j++) {
			
 
				                 for(i=0 ; i<NX ; i++) {
			
 
				                         matrix[(j*NX)+i] = n++;
			
 
				-                        fprintf(stderr, "%2d ", matrix[(j*NX)+i]);
			
 
				+                        FPRINTF(stderr, "%2d ", matrix[(j*NX)+i]);
			
 
				                 }
			
 
				-                fprintf(stderr,"\n");
			
 
				+                FPRINTF(stderr,"\n");
			
 
				         }
			
 
				-        fprintf(stderr,"\n");
			
 
				+        FPRINTF(stderr,"\n");
			
 
				 
			
 
				         starpu_data_handle handle;
			
 
				         starpu_codelet cl = {
			
@@ -68,9 +70,7 @@ int main(int argc, char **argv)
 
				 	struct starpu_data_filter f =
			
 
				 	{
			
 
				 		.filter_func = starpu_block_filter_func,
			
 
				-		.nchildren = PARTS,
			
 
				-		.get_nchildren = NULL,
			
 
				-		.get_child_ops = NULL
			
 
				+		.nchildren = PARTS
			
 
				 	};
			
 
				 	starpu_data_partition(handle, &f);
			
 
				 
			
@@ -86,6 +86,7 @@ int main(int argc, char **argv)
 
				                 task->cl_arg = &factor;
			
 
				                 task->cl_arg_size = sizeof(factor);
			
 
				 		starpu_task_submit(task);
			
 
				+		starpu_task_destroy(task);
			
 
				 	}
			
 
				 
			
 
				         /* Unpartition the data, unregister it from StarPU and shutdown */
			
@@ -94,14 +95,14 @@ int main(int argc, char **argv)
 
				 	starpu_shutdown();
			
 
				 
			
 
				         /* Print result matrix */
			
 
				-        fprintf(stderr,"OUT Matrix: \n");
			
 
				+        FPRINTF(stderr,"OUT Matrix: \n");
			
 
				         for(j=0 ; j<NY ; j++) {
			
 
				                 for(i=0 ; i<NX ; i++) {
			
 
				-                        fprintf(stderr, "%2d ", matrix[(j*NX)+i]);
			
 
				+                        FPRINTF(stderr, "%2d ", matrix[(j*NX)+i]);
			
 
				                 }
			
 
				-                fprintf(stderr,"\n");
			
 
				+                FPRINTF(stderr,"\n");
			
 
				         }
			
 
				-        fprintf(stderr,"\n");
			
 
				+        FPRINTF(stderr,"\n");
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
--- a/examples/filters/fvector.c
+++ b/examples/filters/fvector.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -19,6 +19,8 @@
 
				 #define NX    21
			
 
				 #define PARTS 3
			
 
				 
			
 
				+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
			
 
				+
			
 
				 void cpu_func(void *buffers[], void *cl_arg)
			
 
				 {
			
 
				         unsigned i;
			
@@ -47,9 +49,9 @@ int main(int argc, char **argv)
 
				         };
			
 
				 
			
 
				         for(i=0 ; i<NX ; i++) vector[i] = i;
			
 
				-        fprintf(stderr,"IN  Vector: ");
			
 
				-        for(i=0 ; i<NX ; i++) fprintf(stderr, "%5d ", vector[i]);
			
 
				-        fprintf(stderr,"\n");
			
 
				+        FPRINTF(stderr,"IN  Vector: ");
			
 
				+        for(i=0 ; i<NX ; i++) FPRINTF(stderr, "%5d ", vector[i]);
			
 
				+        FPRINTF(stderr,"\n");
			
 
				 
			
 
				 	starpu_init(NULL);
			
 
				 
			
@@ -60,9 +62,7 @@ int main(int argc, char **argv)
 
				 	struct starpu_data_filter f =
			
 
				 	{
			
 
				 		.filter_func = starpu_block_filter_func_vector,
			
 
				-		.nchildren = PARTS,
			
 
				-		.get_nchildren = NULL,
			
 
				-		.get_child_ops = NULL
			
 
				+		.nchildren = PARTS
			
 
				 	};
			
 
				 	starpu_data_partition(handle, &f);
			
 
				 
			
@@ -81,15 +81,16 @@ int main(int argc, char **argv)
 
				                 task->cl_arg_size = sizeof(factor);
			
 
				 
			
 
				 		starpu_task_submit(task);
			
 
				+		starpu_task_destroy(task);
			
 
				 	}
			
 
				 
			
 
				 	starpu_data_unpartition(handle, 0);
			
 
				         starpu_data_unregister(handle);
			
 
				 	starpu_shutdown();
			
 
				 
			
 
				-        fprintf(stderr,"OUT Vector: ");
			
 
				-        for(i=0 ; i<NX ; i++) fprintf(stderr, "%5d ", vector[i]);
			
 
				-        fprintf(stderr,"\n");
			
 
				+        FPRINTF(stderr,"OUT Vector: ");
			
 
				+        for(i=0 ; i<NX ; i++) FPRINTF(stderr, "%5d ", vector[i]);
			
 
				+        FPRINTF(stderr,"\n");
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
--- a/examples/heat/dw_factolu.c
+++ b/examples/heat/dw_factolu.c
@@ -1,8 +1,8 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2009-2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -624,12 +624,12 @@ void dw_codelet_facto(starpu_data_handle dataA, unsigned nblocks)
 
				 	gettimeofday(&end, NULL);
			
 
				 
			
 
				 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
			
 
				-	fprintf(stderr, "Computation took (in ms)\n");
			
 
				-	printf("%2.2f\n", timing/1000);
			
 
				+	FPRINTF(stderr, "Computation took (in ms)\n");
			
 
				+	FPRINTF(stdout, "%2.2f\n", timing/1000);
			
 
				 
			
 
				 	unsigned n = starpu_matrix_get_nx(dataA);
			
 
				 	double flop = (2.0f*n*n*n)/3.0f;
			
 
				-	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
			
 
				+	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
			
 
				 }
			
 
				 
			
 
				 void dw_codelet_facto_v2(starpu_data_handle dataA, unsigned nblocks)
			
@@ -666,7 +666,7 @@ void dw_codelet_facto_v2(starpu_data_handle dataA, unsigned nblocks)
 
				 	int ret = starpu_task_submit(task);
			
 
				 	if (STARPU_UNLIKELY(ret == -ENODEV))
			
 
				 	{
			
 
				-		fprintf(stderr, "No worker may execute this task\n");
			
 
				+		FPRINTF(stderr, "No worker may execute this task\n");
			
 
				 		exit(0);
			
 
				 	}
			
 
				 
			
@@ -681,12 +681,12 @@ void dw_codelet_facto_v2(starpu_data_handle dataA, unsigned nblocks)
 
				 	gettimeofday(&end, NULL);
			
 
				 
			
 
				 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
			
 
				-	fprintf(stderr, "Computation took (in ms)\n");
			
 
				-	printf("%2.2f\n", timing/1000);
			
 
				+	FPRINTF(stderr, "Computation took (in ms)\n");
			
 
				+	FPRINTF(stdout, "%2.2f\n", timing/1000);
			
 
				 
			
 
				 	unsigned n = starpu_matrix_get_nx(dataA);
			
 
				 	double flop = (2.0f*n*n*n)/3.0f;
			
 
				-	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
			
 
				+	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
			
 
				 }
			
 
				 
			
 
				 void initialize_system(float **A, float **B, unsigned dim, unsigned pinned)
			
@@ -697,8 +697,8 @@ void initialize_system(float **A, float **B, unsigned dim, unsigned pinned)
 
				 
			
 
				 	if (pinned)
			
 
				 	{
			
 
				-		starpu_data_malloc_pinned_if_possible((void **)A, (size_t)dim*dim*sizeof(float));
			
 
				-		starpu_data_malloc_pinned_if_possible((void **)B, (size_t)dim*sizeof(float));
			
 
				+		starpu_malloc((void **)A, (size_t)dim*dim*sizeof(float));
			
 
				+		starpu_malloc((void **)B, (size_t)dim*sizeof(float));
			
 
				 	} 
			
 
				 	else {
			
 
				 		*A = malloc((size_t)dim*dim*sizeof(float));
			
@@ -714,7 +714,7 @@ void dw_factoLU(float *matA, unsigned size,
 
				 {
			
 
				 
			
 
				 #ifdef CHECK_RESULTS
			
 
				-	fprintf(stderr, "Checking results ...\n");
			
 
				+	FPRINTF(stderr, "Checking results ...\n");
			
 
				 	float *Asaved;
			
 
				 	Asaved = malloc((size_t)ld*ld*sizeof(float));
			
 
				 
			
@@ -730,17 +730,15 @@ void dw_factoLU(float *matA, unsigned size,
 
				 	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, 
			
 
				 			size, size, sizeof(float));
			
 
				 
			
 
				-	struct starpu_data_filter f;
			
 
				-		f.filter_func = starpu_vertical_block_filter_func;
			
 
				-		f.nchildren = nblocks;
			
 
				-		f.get_nchildren = NULL;
			
 
				-		f.get_child_ops = NULL;
			
 
				-
			
 
				-	struct starpu_data_filter f2;
			
 
				-		f2.filter_func = starpu_block_filter_func;
			
 
				-		f2.nchildren = nblocks;
			
 
				-		f2.get_nchildren = NULL;
			
 
				-		f2.get_child_ops = NULL;
			
 
				+	struct starpu_data_filter f = {
			
 
				+		.filter_func = starpu_vertical_block_filter_func,
			
 
				+		.nchildren = nblocks
			
 
				+	};
			
 
				+
			
 
				+	struct starpu_data_filter f2 = {
			
 
				+		.filter_func = starpu_block_filter_func,
			
 
				+		.nchildren = nblocks
			
 
				+	};
			
 
				 
			
 
				 	starpu_data_map_filters(dataA, 2, &f, &f2);
			
 
				 
			
--- a/examples/heat/dw_factolu.h
+++ b/examples/heat/dw_factolu.h
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -36,6 +36,8 @@
 
				 
			
 
				 #include "lu_kernels_model.h"
			
 
				 
			
 
				+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
			
 
				+
			
 
				 #define BLAS3_FLOP(n1,n2,n3)    \
			
 
				         (2*((uint64_t)n1)*((uint64_t)n2)*((uint64_t)n3))
			
 
				 
			
@@ -82,53 +84,53 @@ static void __attribute__ ((unused)) compare_A_LU(float *A, float *LU,
 
				 
			
 
				 #if 0
			
 
				 	/* display L */
			
 
				-	printf("(LU): \n");
			
 
				+	FPRINTF(stdout, "(LU): \n");
			
 
				 	for (j = 0; j < size; j++)
			
 
				 	{
			
 
				 		for (i = 0; i < size; i++)
			
 
				 		{
			
 
				-//			if (i <= j) {
			
 
				-				printf("%2.2f\t", LU[j +i*size]);
			
 
				-//			}
			
 
				-//			else {
			
 
				-//				printf(".\t");
			
 
				-//			}
			
 
				+/*			if (i <= j) { */
			
 
				+				FPRINTF(stdout, "%2.2f\t", LU[j +i*size]);
			
 
				+/*			}
			
 
				+			else {
			
 
				+				FPRINTF(stdout, ".\t");
			
 
				+			} */
			
 
				 		}
			
 
				-		printf("\n");
			
 
				+		FPRINTF(stdout, "\n");
			
 
				 	}
			
 
				 
			
 
				 
			
 
				 
			
 
				 	/* display L */
			
 
				-	printf("L: \n");
			
 
				+	FPRINTF(stdout, "L: \n");
			
 
				 	for (j = 0; j < size; j++)
			
 
				 	{
			
 
				 		for (i = 0; i < size; i++)
			
 
				 		{
			
 
				-//			if (i <= j) {
			
 
				-				printf("%2.2f\t", L[j +i*size]);
			
 
				-//			}
			
 
				-//			else {
			
 
				-//				printf(".\t");
			
 
				-//			}
			
 
				+/*			if (i <= j) { */
			
 
				+				FPRINTF(stdout, "%2.2f\t", L[j +i*size]);
			
 
				+/*			}
			
 
				+			else {
			
 
				+				FPRINTF(stdout, ".\t");
			
 
				+			} */
			
 
				 		}
			
 
				-		printf("\n");
			
 
				+		FPRINTF(stdout, "\n");
			
 
				 	}
			
 
				 
			
 
				 	/* display U */
			
 
				-	printf("U: \n");
			
 
				+	FPRINTF(stdout, "U: \n");
			
 
				 	for (j = 0; j < size; j++)
			
 
				 	{
			
 
				 		for (i = 0; i < size; i++)
			
 
				 		{
			
 
				-//			if (i <= j) {
			
 
				-				printf("%2.2f\t", U[j +i*size]);
			
 
				-//			}
			
 
				-//			else {
			
 
				-//				printf(".\t");
			
 
				-//			}
			
 
				+/*			if (i <= j) { */
			
 
				+				FPRINTF(stdout, "%2.2f\t", U[j +i*size]);
			
 
				+/*			}
			
 
				+			else {
			
 
				+				FPRINTF(stdout, ".\t");
			
 
				+			} */
			
 
				 		}
			
 
				-		printf("\n");
			
 
				+		FPRINTF(stdout, "\n");
			
 
				 	}
			
 
				 
			
 
				 #endif
			
@@ -148,42 +150,42 @@ static void __attribute__ ((unused)) compare_A_LU(float *A, float *LU,
 
				 
			
 
				 #if 0
			
 
				 	/* display A */
			
 
				-	printf("A: \n");
			
 
				+	FPRINTF(stdout, "A: \n");
			
 
				 	for (j = 0; j < size; j++)
			
 
				 	{
			
 
				 		for (i = 0; i < size; i++)
			
 
				 		{
			
 
				-	//		if (i <= j) {
			
 
				-	      			printf("%2.2f\t", A[j +i*size]);
			
 
				-	//		}
			
 
				-	//		else {
			
 
				-	//			printf(".\t");
			
 
				-	//		}
			
 
				+	/*		if (i <= j) { */
			
 
				+	      			FPRINTF(stdout, "%2.2f\t", A[j +i*size]);
			
 
				+	/*		}
			
 
				+			else {
			
 
				+				FPRINTF(stdout, ".\t");
			
 
				+			} */
			
 
				 		}
			
 
				-		printf("\n");
			
 
				+		FPRINTF(stdout, "\n");
			
 
				 	}
			
 
				 
			
 
				 
			
 
				 	/* display LU */
			
 
				-	printf("LU: \n");
			
 
				+	FPRINTF(stdout, "LU: \n");
			
 
				 	for (j = 0; j < size; j++)
			
 
				 	{
			
 
				 		for (i = 0; i < size; i++)
			
 
				 		{
			
 
				-	//		if (i <= j) {
			
 
				-	      			printf("%2.2f\t", L[j +i*size]);
			
 
				-	//		}
			
 
				-	//		else {
			
 
				-	//			printf(".\t");
			
 
				-	//		}
			
 
				+	/*		if (i <= j) { */
			
 
				+	      			FPRINTF(stdout, "%2.2f\t", L[j +i*size]);
			
 
				+	/*		}
			
 
				+			else {
			
 
				+				FPRINTF(stdout, ".\t");
			
 
				+			} */
			
 
				 		}
			
 
				-		printf("\n");
			
 
				+		FPRINTF(stdout, "\n");
			
 
				 	}
			
 
				 #endif
			
 
				 
			
 
				-	printf("max error between A and L*U = %f \n", max_err);
			
 
				+	FPRINTF(stdout, "max error between A and L*U = %f \n", max_err);
			
 
				 }
			
 
				-#endif // CHECK_RESULTS
			
 
				+#endif /* CHECK_RESULTS */
			
 
				 
			
 
				 void dw_cpu_codelet_update_u11(void **, void *);
			
 
				 void dw_cpu_codelet_update_u12(void **, void *);
			
@@ -211,4 +213,4 @@ extern struct starpu_perfmodel_t model_12;
 
				 extern struct starpu_perfmodel_t model_21;
			
 
				 extern struct starpu_perfmodel_t model_22;
			
 
				 
			
 
				-#endif // __DW_FACTO_LU_H__
			
 
				+#endif /* __DW_FACTO_LU_H__ */
			
--- a/examples/heat/dw_factolu_grain.c
+++ b/examples/heat/dw_factolu_grain.c
@@ -1,8 +1,8 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -54,7 +54,7 @@ static starpu_codelet cl11 = {
 
				 
			
 
				 static struct starpu_task *create_task_11(starpu_data_handle dataA, unsigned k, unsigned tag_prefix)
			
 
				 {
			
 
				-//	printf("task 11 k = %d TAG = %llx\n", k, (TAG11(k)));
			
 
				+/*	FPRINTF(stdout, "task 11 k = %d TAG = %llx\n", k, (TAG11(k))); */
			
 
				 
			
 
				 	struct starpu_task *task = create_task(TAG11(k, tag_prefix));
			
 
				 
			
@@ -87,7 +87,7 @@ static starpu_codelet cl12 = {
 
				 
			
 
				 static void create_task_12(starpu_data_handle dataA, unsigned k, unsigned i, unsigned tag_prefix)
			
 
				 {
			
 
				-//	printf("task 12 k,i = %d,%d TAG = %llx\n", k,i, TAG12(k,i));
			
 
				+/*	FPRINTF(stdout, "task 12 k,i = %d,%d TAG = %llx\n", k,i, TAG12(k,i)); */
			
 
				 
			
 
				 	struct starpu_task *task = create_task(TAG12(k, i, tag_prefix));
			
 
				 	
			
@@ -163,7 +163,7 @@ static starpu_codelet cl22 = {
 
				 
			
 
				 static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, unsigned j, unsigned tag_prefix)
			
 
				 {
			
 
				-//	printf("task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j));
			
 
				+/*	FPRINTF(stdout, "task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j)); */
			
 
				 
			
 
				 	struct starpu_task *task = create_task(TAG22(k, i, j, tag_prefix));
			
 
				 
			
@@ -207,17 +207,15 @@ static void dw_factoLU_grain_inner(float *matA, unsigned size, unsigned inner_si
 
				 	unsigned nblocks = size / blocksize;
			
 
				 	unsigned maxk = inner_size / blocksize;
			
 
				 
			
 
				-	struct starpu_data_filter f;
			
 
				-		f.filter_func = starpu_vertical_block_filter_func;
			
 
				-		f.nchildren = nblocks;
			
 
				-		f.get_nchildren = NULL;
			
 
				-		f.get_child_ops = NULL;
			
 
				+	struct starpu_data_filter f = {
			
 
				+		.filter_func = starpu_vertical_block_filter_func,
			
 
				+		.nchildren = nblocks
			
 
				+	};
			
 
				 
			
 
				-	struct starpu_data_filter f2;
			
 
				-		f2.filter_func = starpu_block_filter_func;
			
 
				-		f2.nchildren = nblocks;
			
 
				-		f2.get_nchildren = NULL;
			
 
				-		f2.get_child_ops = NULL;
			
 
				+	struct starpu_data_filter f2 = {
			
 
				+		.filter_func = starpu_block_filter_func,
			
 
				+		.nchildren = nblocks
			
 
				+	};
			
 
				 
			
 
				 	starpu_data_map_filters(dataA, 2, &f, &f2);
			
 
				 
			
@@ -262,7 +260,7 @@ static void dw_factoLU_grain_inner(float *matA, unsigned size, unsigned inner_si
 
				 	int ret = starpu_task_submit(entry_task);
			
 
				 	if (STARPU_UNLIKELY(ret == -ENODEV))
			
 
				 	{
			
 
				-		fprintf(stderr, "No worker may execute this task\n");
			
 
				+		FPRINTF(stderr, "No worker may execute this task\n");
			
 
				 		exit(-1);
			
 
				 	}
			
 
				 
			
@@ -299,13 +297,13 @@ static void dw_factoLU_grain_inner(float *matA, unsigned size, unsigned inner_si
 
				 
			
 
				 		float *newmatA = &matA[inner_size*(ld+1)];
			
 
				 
			
 
				-//		if (tag_prefix < 2)
			
 
				-//		{
			
 
				-//			dw_factoLU_grain_inner(newmatA, size-inner_size, (size-inner_size)/2, ld, blocksize/2, tag_prefix+1);
			
 
				-//		}
			
 
				-//		else {
			
 
				+/*		if (tag_prefix < 2)
			
 
				+		{
			
 
				+			dw_factoLU_grain_inner(newmatA, size-inner_size, (size-inner_size)/2, ld, blocksize/2, tag_prefix+1);
			
 
				+		}
			
 
				+		else { */
			
 
				 			dw_factoLU_grain_inner(newmatA, size-inner_size, size-inner_size, ld, blocksize/2, tag_prefix+1);
			
 
				-//		}
			
 
				+/*		} */
			
 
				 	}
			
 
				 
			
 
				 }
			
@@ -314,7 +312,7 @@ void dw_factoLU_grain(float *matA, unsigned size, unsigned ld, unsigned nblocks,
 
				 {
			
 
				 
			
 
				 #ifdef CHECK_RESULTS
			
 
				-	fprintf(stderr, "Checking results ...\n");
			
 
				+	FPRINTF(stderr, "Checking results ...\n");
			
 
				 	float *Asaved;
			
 
				 	Asaved = malloc(ld*ld*sizeof(float));
			
 
				 
			
@@ -333,12 +331,12 @@ void dw_factoLU_grain(float *matA, unsigned size, unsigned ld, unsigned nblocks,
 
				 	gettimeofday(&end, NULL);
			
 
				 
			
 
				 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
			
 
				-	fprintf(stderr, "Computation took (in ms)\n");
			
 
				-	printf("%2.2f\n", timing/1000);
			
 
				+	FPRINTF(stderr, "Computation took (in ms)\n");
			
 
				+	FPRINTF(stdout, "%2.2f\n", timing/1000);
			
 
				 
			
 
				 	unsigned n = size;
			
 
				 	double flop = (2.0f*n*n*n)/3.0f;
			
 
				-	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
			
 
				+	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
			
 
				 
			
 
				 #ifdef CHECK_RESULTS
			
 
				 	compare_A_LU(Asaved, matA, size, ld);
			
--- a/examples/heat/dw_factolu_kernels.c
+++ b/examples/heat/dw_factolu_kernels.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -33,7 +33,7 @@ void display_stat_heat(void)
 
				 {
			
 
				 	unsigned nworkers = starpu_worker_get_count();
			
 
				 
			
 
				-	fprintf(stderr, "STATS : \n");
			
 
				+	FPRINTF(stderr, "STATS : \n");
			
 
				 
			
 
				 	unsigned worker;
			
 
				 	for (worker = 0; worker < nworkers; worker++)
			
@@ -49,7 +49,7 @@ void display_stat_heat(void)
 
				 		count_22_total += count_22_per_worker[worker];
			
 
				 	}
			
 
				 
			
 
				-	fprintf(stderr, "\t11 (diagonal block LU)\n");
			
 
				+	FPRINTF(stderr, "\t11 (diagonal block LU)\n");
			
 
				 	for (worker = 0; worker < nworkers; worker++)
			
 
				 	{
			
 
				 		if (count_total_per_worker[worker])
			
@@ -57,11 +57,11 @@ void display_stat_heat(void)
 
				 			char name[32];
			
 
				 			starpu_worker_get_name(worker, name, 32);
			
 
				 			
			
 
				-			fprintf(stderr, "\t\t%s -> %d / %d (%2.2f %%)\n", name, count_11_per_worker[worker], count_11_total, (100.0*count_11_per_worker[worker])/count_11_total);
			
 
				+			FPRINTF(stderr, "\t\t%s -> %u / %u (%2.2f %%)\n", name, count_11_per_worker[worker], count_11_total, (100.0*count_11_per_worker[worker])/count_11_total);
			
 
				 		}
			
 
				 	}
			
 
				 
			
 
				-	fprintf(stderr, "\t12 (TRSM)\n");
			
 
				+	FPRINTF(stderr, "\t12 (TRSM)\n");
			
 
				 	for (worker = 0; worker < nworkers; worker++)
			
 
				 	{
			
 
				 		if (count_total_per_worker[worker])
			
@@ -69,12 +69,12 @@ void display_stat_heat(void)
 
				 			char name[32];
			
 
				 			starpu_worker_get_name(worker, name, 32);
			
 
				 			
			
 
				-			fprintf(stderr, "\t\t%s -> %d / %d (%2.2f %%)\n", name, count_12_per_worker[worker], count_12_total, (100.0*count_12_per_worker[worker])/count_12_total);
			
 
				+			FPRINTF(stderr, "\t\t%s -> %u / %u (%2.2f %%)\n", name, count_12_per_worker[worker], count_12_total, (100.0*count_12_per_worker[worker])/count_12_total);
			
 
				 		}
			
 
				 	}
			
 
				 	
			
 
				 	
			
 
				-	fprintf(stderr, "\t21 (TRSM)\n");
			
 
				+	FPRINTF(stderr, "\t21 (TRSM)\n");
			
 
				 	for (worker = 0; worker < nworkers; worker++)
			
 
				 	{
			
 
				 		if (count_total_per_worker[worker])
			
@@ -82,11 +82,11 @@ void display_stat_heat(void)
 
				 			char name[32];
			
 
				 			starpu_worker_get_name(worker, name, 32);
			
 
				 			
			
 
				-			fprintf(stderr, "\t\t%s -> %d / %d (%2.2f %%)\n", name, count_21_per_worker[worker], count_21_total, (100.0*count_21_per_worker[worker])/count_21_total);
			
 
				+			FPRINTF(stderr, "\t\t%s -> %u / %u (%2.2f %%)\n", name, count_21_per_worker[worker], count_21_total, (100.0*count_21_per_worker[worker])/count_21_total);
			
 
				 		}
			
 
				 	}
			
 
				 	
			
 
				-	fprintf(stderr, "\t22 (SGEMM)\n");
			
 
				+	FPRINTF(stderr, "\t22 (SGEMM)\n");
			
 
				 	for (worker = 0; worker < nworkers; worker++)
			
 
				 	{
			
 
				 		if (count_total_per_worker[worker])
			
@@ -94,7 +94,7 @@ void display_stat_heat(void)
 
				 			char name[32];
			
 
				 			starpu_worker_get_name(worker, name, 32);
			
 
				 			
			
 
				-			fprintf(stderr, "\t\t%s -> %d / %d (%2.2f %%)\n", name, count_22_per_worker[worker], count_22_total, (100.0*count_22_per_worker[worker])/count_22_total);
			
 
				+			FPRINTF(stderr, "\t\t%s -> %u / %u (%2.2f %%)\n", name, count_22_per_worker[worker], count_22_total, (100.0*count_22_per_worker[worker])/count_22_total);
			
 
				 		}
			
 
				 	}
			
 
				 }
			
@@ -162,7 +162,7 @@ void dw_cublas_codelet_update_u22(void *descr[], void *_args)
 
				 	int id = starpu_worker_get_id();
			
 
				 	count_22_per_worker[id]++;
			
 
				 }
			
 
				-#endif// STARPU_USE_CUDA
			
 
				+#endif /* STARPU_USE_CUDA */
			
 
				 
			
 
				 /*
			
 
				  * U12
			
@@ -225,7 +225,7 @@ void dw_cublas_codelet_update_u12(void *descr[], void *_args)
 
				 	int id = starpu_worker_get_id();
			
 
				 	count_12_per_worker[id]++;
			
 
				 }
			
 
				-#endif // STARPU_USE_CUDA
			
 
				+#endif /* STARPU_USE_CUDA */
			
 
				 
			
 
				 /* 
			
 
				  * U21
			
@@ -298,12 +298,12 @@ static inline void debug_print(float *tab, unsigned ld, unsigned n)
 
				 	{
			
 
				 		for (i = 0; i < n; i++)
			
 
				 		{
			
 
				-			fprintf(stderr, "%2.2f\t", tab[(size_t)j+(size_t)i*ld]);
			
 
				+			FPRINTF(stderr, "%2.2f\t", tab[(size_t)j+(size_t)i*ld]);
			
 
				 		}
			
 
				-		fprintf(stderr, "\n");
			
 
				+		FPRINTF(stderr, "\n");
			
 
				 	}
			
 
				 	
			
 
				-	fprintf(stderr, "\n");
			
 
				+	FPRINTF(stderr, "\n");
			
 
				 }
			
 
				 
			
 
				 static inline void dw_common_codelet_update_u11(void *descr[], int s, __attribute__((unused)) void *_args) 
			
@@ -378,4 +378,4 @@ void dw_cublas_codelet_update_u11(void *descr[], void *_args)
 
				 	int id = starpu_worker_get_id();
			
 
				 	count_11_per_worker[id]++;
			
 
				 }
			
 
				-#endif// STARPU_USE_CUDA
			
 
				+#endif /* STARPU_USE_CUDA */
			
--- a/examples/heat/dw_factolu_tag.c
+++ b/examples/heat/dw_factolu_tag.c
@@ -1,8 +1,8 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -56,7 +56,7 @@ static starpu_codelet cl11 = {
 
				 
			
 
				 static struct starpu_task *create_task_11(starpu_data_handle dataA, unsigned k)
			
 
				 {
			
 
				-//	printf("task 11 k = %d TAG = %llx\n", k, (TAG11(k)));
			
 
				+/*	printf("task 11 k = %d TAG = %llx\n", k, (TAG11(k))); */
			
 
				 
			
 
				 	struct starpu_task *task = create_task(TAG11(k));
			
 
				 
			
@@ -90,7 +90,7 @@ static starpu_codelet cl12 = {
 
				 
			
 
				 static void create_task_12(starpu_data_handle dataA, unsigned k, unsigned i)
			
 
				 {
			
 
				-//	printf("task 12 k,i = %d,%d TAG = %llx\n", k,i, TAG12(k,i));
			
 
				+/*	printf("task 12 k,i = %d,%d TAG = %llx\n", k,i, TAG12(k,i)); */
			
 
				 
			
 
				 	struct starpu_task *task = create_task(TAG12(k, i));
			
 
				 	
			
@@ -166,7 +166,7 @@ static starpu_codelet cl22 = {
 
				 
			
 
				 static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, unsigned j)
			
 
				 {
			
 
				-//	printf("task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j));
			
 
				+/*	printf("task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j)); */
			
 
				 
			
 
				 	struct starpu_task *task = create_task(TAG22(k, i, j));
			
 
				 
			
@@ -241,7 +241,7 @@ static void dw_codelet_facto_v3(starpu_data_handle dataA, unsigned nblocks)
 
				 	int ret = starpu_task_submit(entry_task);
			
 
				 	if (STARPU_UNLIKELY(ret == -ENODEV))
			
 
				 	{
			
 
				-		fprintf(stderr, "No worker may execute this task\n");
			
 
				+		FPRINTF(stderr, "No worker may execute this task\n");
			
 
				 		exit(-1);
			
 
				 	}
			
 
				 
			
@@ -253,19 +253,19 @@ static void dw_codelet_facto_v3(starpu_data_handle dataA, unsigned nblocks)
 
				 	gettimeofday(&end, NULL);
			
 
				 
			
 
				 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
			
 
				-	fprintf(stderr, "Computation took (in ms)\n");
			
 
				+	FPRINTF(stderr, "Computation took (in ms)\n");
			
 
				 	printf("%2.2f\n", timing/1000);
			
 
				 
			
 
				 	unsigned n = starpu_matrix_get_nx(dataA);
			
 
				 	double flop = (2.0f*n*n*n)/3.0f;
			
 
				-	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
			
 
				+	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
			
 
				 }
			
 
				 
			
 
				 void dw_factoLU_tag(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned _no_prio)
			
 
				 {
			
 
				 
			
 
				 #ifdef CHECK_RESULTS
			
 
				-	fprintf(stderr, "Checking results ...\n");
			
 
				+	FPRINTF(stderr, "Checking results ...\n");
			
 
				 	float *Asaved;
			
 
				 	Asaved = malloc((size_t)ld*ld*sizeof(float));
			
 
				 
			
@@ -280,17 +280,15 @@ void dw_factoLU_tag(float *matA, unsigned size, unsigned ld, unsigned nblocks, u
 
				 	 * one block is now determined by 2 unsigned (i,j) */
			
 
				 	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(float));
			
 
				 
			
 
				-	struct starpu_data_filter f;
			
 
				-		f.filter_func = starpu_vertical_block_filter_func;
			
 
				-		f.nchildren = nblocks;
			
 
				-		f.get_nchildren = NULL;
			
 
				-		f.get_child_ops = NULL;
			
 
				-
			
 
				-	struct starpu_data_filter f2;
			
 
				-		f2.filter_func = starpu_block_filter_func;
			
 
				-		f2.nchildren = nblocks;
			
 
				-		f2.get_nchildren = NULL;
			
 
				-		f2.get_child_ops = NULL;
			
 
				+	struct starpu_data_filter f = {
			
 
				+		.filter_func = starpu_vertical_block_filter_func,
			
 
				+		.nchildren = nblocks
			
 
				+	};
			
 
				+
			
 
				+	struct starpu_data_filter f2 = {
			
 
				+		.filter_func = starpu_block_filter_func,
			
 
				+		.nchildren = nblocks
			
 
				+	};
			
 
				 
			
 
				 	starpu_data_map_filters(dataA, 2, &f, &f2);
			
 
				 
			
--- a/examples/heat/dw_sparse_cg.c
+++ b/examples/heat/dw_sparse_cg.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -20,6 +20,7 @@
 
				  */
			
 
				 
			
 
				 #include "dw_sparse_cg.h"
			
 
				+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
			
 
				 
			
 
				 static struct starpu_task *create_task(starpu_tag_t id)
			
 
				 {
			
@@ -298,13 +299,13 @@ void iteration_cg(void *problem)
 
				 {
			
 
				 	struct cg_problem *pb = problem;
			
 
				 
			
 
				-	printf("i : %d (MAX %d)\n\tdelta_new %f (%f)\n", pb->i, MAXITER, pb->delta_new, sqrt(pb->delta_new / pb->size));
			
 
				+	FPRINTF(stdout, "i : %d (MAX %d)\n\tdelta_new %f (%f)\n", pb->i, MAXITER, pb->delta_new, sqrt(pb->delta_new / pb->size));
			
 
				 
			
 
				 	if ((pb->i < MAXITER) && 
			
 
				 		(pb->delta_new > pb->epsilon) )
			
 
				 	{
			
 
				 		if (pb->i % 1000 == 0)
			
 
				-			printf("i : %d\n\tdelta_new %f (%f)\n", pb->i, pb->delta_new, sqrt(pb->delta_new / pb->size));
			
 
				+			FPRINTF(stdout, "i : %d\n\tdelta_new %f (%f)\n", pb->i, pb->delta_new, sqrt(pb->delta_new / pb->size));
			
 
				 
			
 
				 		pb->i++;
			
 
				 
			
@@ -313,8 +314,8 @@ void iteration_cg(void *problem)
 
				 	}
			
 
				 	else {
			
 
				 		/* we may stop */
			
 
				-		printf("We are done ... after %d iterations \n", pb->i - 1);
			
 
				-		printf("i : %d\n\tdelta_new %2.5f\n", pb->i, pb->delta_new);
			
 
				+		FPRINTF(stdout, "We are done ... after %d iterations \n", pb->i - 1);
			
 
				+		FPRINTF(stdout, "i : %d\n\tdelta_new %2.5f\n", pb->i, pb->delta_new);
			
 
				 		sem_post(pb->sem);
			
 
				 	}
			
 
				 }
			
@@ -353,7 +354,7 @@ void conjugate_gradient(float *nzvalA, float *vecb, float *vecx, uint32_t nnz,
 
				 		ptr_vecq[i] = 0.0f;
			
 
				 	}
			
 
				 
			
 
				-	printf("nrow = %d \n", nrow);
			
 
				+	FPRINTF(stdout, "nrow = %u \n", nrow);
			
 
				 
			
 
				 	/* and register them as well */
			
 
				 	starpu_vector_data_register(&ds_vecr, 0, (uintptr_t)ptr_vecr, nrow, sizeof(float));
			
--- a/examples/heat/dw_sparse_cg.h
+++ b/examples/heat/dw_sparse_cg.h
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -101,7 +101,7 @@ static void __attribute__ ((unused)) print_results(float *result, unsigned size)
 
				 
			
 
				 	for (i = 0; i < STARPU_MIN(size, 16); i++)
			
 
				 	{
			
 
				-		printf("%d -> %f\n", i, result[i]);
			
 
				+		printf("%u -> %f\n", i, result[i]);
			
 
				 	}
			
 
				 }
			
 
				 
			
@@ -134,4 +134,4 @@ void iteration_cg(void *problem);
 
				 void conjugate_gradient(float *nzvalA, float *vecb, float *vecx, uint32_t nnz,
			
 
				 			unsigned nrow, uint32_t *colind, uint32_t *rowptr);
			
 
				 
			
 
				-#endif // __DW_SPARSE_CG_H__
			
 
				+#endif /* __DW_SPARSE_CG_H__ */
			
--- a/examples/heat/dw_sparse_cg_kernels.c
+++ b/examples/heat/dw_sparse_cg_kernels.c
@@ -64,10 +64,8 @@ void cpu_codelet_func_1(void *descr[], __attribute__((unused)) void *arg)
 
				 	float *vecb = (float *)STARPU_VECTOR_GET_PTR(descr[3]);
			
 
				 
			
 
				 
			
 
				-	uint32_t nnz;
			
 
				 	uint32_t nrow;
			
 
				 
			
 
				-	nnz = STARPU_CSR_GET_NNZ(descr[0]);
			
 
				 	nrow = STARPU_CSR_GET_NROW(descr[0]);
			
 
				 
			
 
				 	unsigned row;
			
@@ -173,10 +171,8 @@ void cpu_codelet_func_4(void *descr[], __attribute__((unused)) void *arg)
 
				 	float *vecd = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
			
 
				 	float *vecq = (float *)STARPU_VECTOR_GET_PTR(descr[2]);
			
 
				 
			
 
				-	uint32_t nnz;
			
 
				 	uint32_t nrow;
			
 
				 
			
 
				-	nnz = STARPU_CSR_GET_NNZ(descr[0]);
			
 
				 	nrow = STARPU_CSR_GET_NROW(descr[0]);
			
 
				 
			
 
				 	unsigned row;
			
--- a/examples/heat/heat.c
+++ b/examples/heat/heat.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -321,7 +321,7 @@ static void solve_system(unsigned size, unsigned subsize, float *result, int *Re
 
				 	/* solve the actual problem LU X = B */
			
 
				         /* solve LX' = Y with X' = UX */
			
 
				         /* solve UX = X' */
			
 
				-	fprintf(stderr, "Solving the problem ...\n");
			
 
				+	FPRINTF(stderr, "Solving the problem ...\n");
			
 
				 
			
 
				 	float *savedB;
			
 
				 	float *LUB;
			
@@ -360,10 +360,10 @@ static void solve_system(unsigned size, unsigned subsize, float *result, int *Re
 
				 	
			
 
				 		/* check if LUB is close to the 0 vector */
			
 
				 		int maxind = ISAMAX(subsize, LUB, 1);
			
 
				-		fprintf(stderr, "max error (LUX - B) = %e\n",LUB[maxind - 1]);
			
 
				+		FPRINTF(stderr, "max error (LUX - B) = %e\n",LUB[maxind - 1]);
			
 
				 
			
 
				 		float sum = SASUM(subsize, LUB, 1);
			
 
				-		fprintf(stderr,"avg. error %e\n", sum/subsize);
			
 
				+		FPRINTF(stderr,"avg. error %e\n", sum/subsize);
			
 
				 	
			
 
				 		free(LUB);
			
 
				 		free(savedB);
			
@@ -494,10 +494,10 @@ static unsigned long build_neighbour_vector(unsigned long*neighbours, unsigned n
 
				 				if ((former_theta + dtheta) >= 0 && (former_theta + dtheta) <= (int)ntheta )
			
 
				 				{
			
 
				 					/* we got a possible neighbour */
			
 
				-					unsigned node = 
			
 
				+					unsigned pnode = 
			
 
				 						NODE_NUMBER((former_theta + dtheta), (former_thick + dthick));
			
 
				 
			
 
				-					neighbours[nneighbours++] = TRANSLATEBACK(node);
			
 
				+					neighbours[nneighbours++] = TRANSLATEBACK(pnode);
			
 
				 				}
			
 
				 			}
			
 
				 		}
			
@@ -569,10 +569,10 @@ static void build_sparse_stiffness_matrix_B(point *pmesh, float *B, float *Bform
 
				 
			
 
				 		for (neighbour = 0; neighbour < nneighbours; neighbour++)
			
 
				 		{
			
 
				-			unsigned i = neighbours[neighbour]; 
			
 
				-			if (i >= newsize)
			
 
				+			unsigned n = neighbours[neighbour]; 
			
 
				+			if (n >= newsize)
			
 
				 			{
			
 
				-				B[j] -= compute_A_value(TRANSLATE(i), TRANSLATE(j), pmesh)*Bformer[TRANSLATE(i)];
			
 
				+				B[j] -= compute_A_value(TRANSLATE(n), TRANSLATE(j), pmesh)*Bformer[TRANSLATE(n)];
			
 
				 			}
			
 
				 		}
			
 
				 	}
			
@@ -729,7 +729,7 @@ int main(int argc, char **argv)
 
				 
			
 
				 		build_dense_stiffness_matrix_A(pmesh, A, newsize, RefArray, RefArrayBack);
			
 
				 
			
 
				-		fprintf(stderr, "Problem size : %dx%d (%dx%d) (%ld MB)\n", newsize, newsize, DIM, DIM, ((unsigned long)newsize*newsize*4UL)/(1024*1024));
			
 
				+		FPRINTF(stderr, "Problem size : %ux%u (%ux%u) (%lu MB)\n", newsize, newsize, DIM, DIM, ((unsigned long)newsize*newsize*4UL)/(1024*1024));
			
 
				 
			
 
				 		STARPU_ASSERT(newsize % nblocks == 0);
			
 
				 
			
--- a/examples/heat/heat.h
+++ b/examples/heat/heat.h
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -24,7 +24,7 @@
 
				 #include <assert.h>
			
 
				 #include <math.h>
			
 
				 
			
 
				-// needed for STARPU_OPENGL_RENDER
			
 
				+/* needed for STARPU_OPENGL_RENDER */
			
 
				 #include <starpu_config.h>
			
 
				 #include <starpu.h>
			
 
				 
			
@@ -36,6 +36,8 @@
 
				 #include <GL/glut.h>
			
 
				 #endif
			
 
				 
			
 
				+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
			
 
				+
			
 
				 #define X	0
			
 
				 #define Y	1
			
 
				 
			
@@ -66,4 +68,4 @@ void display_stat_heat(void);
 
				 extern void opengl_render(unsigned _ntheta, unsigned _nthick, float *_result, point *_pmesh, int argc_, char **argv_);
			
 
				 #endif
			
 
				 
			
 
				-#endif // __HEAT_H__
			
 
				+#endif /* __HEAT_H__ */
			
--- a/examples/heat/heat_display.c
+++ b/examples/heat/heat_display.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -133,8 +133,8 @@ static void display(void)
 
				 	float factor = 1.0/amplitude;
			
 
				 	glScalef (factor, factor, factor);      /* modeling transformation */
			
 
				 	gluLookAt (xcenter, ycenter, 30.0f, xcenter, ycenter, 0.0f, 0.0f, 1.0f, 0.0f);
			
 
				-//	printf("factor %f\n", factor);
			
 
				-	//   glRotatef(-0,0.0,0.0,0.0);
			
 
				+/*	printf("factor %f\n", factor);
			
 
				+	   glRotatef(-0,0.0,0.0,0.0); */
			
 
				 	generate_graph();
			
 
				 	glFlush ();
			
 
				 }
			
@@ -211,7 +211,7 @@ void find_limits(void)
 
				 
			
 
				 void opengl_render(unsigned _ntheta, unsigned _nthick, float *_result, point *_pmesh, int argc_, char **argv_)
			
 
				 {
			
 
				-	fprintf(stderr, "OpenGL rendering ... \n");
			
 
				+	FPRINTF(stderr, "OpenGL rendering ... \n");
			
 
				 
			
 
				 	ntheta = _ntheta;
			
 
				 	nthick = _nthick;
			
@@ -236,4 +236,4 @@ void opengl_render(unsigned _ntheta, unsigned _nthick, float *_result, point *_p
 
				 	glutReshapeFunc(reshape);
			
 
				 	glutMainLoop();
			
 
				 }
			
 
				-#endif // STARPU_OPENGL_RENDER
			
 
				+#endif /* STARPU_OPENGL_RENDER */
			
--- a/examples/heat/lu_kernels_model.c
+++ b/examples/heat/lu_kernels_model.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -26,7 +26,7 @@
 
				  *	Number of flops of Gemm 
			
 
				  */
			
 
				 
			
 
				-//#define USE_PERTURBATION	1
			
 
				+/* #define USE_PERTURBATION	1 */
			
 
				 
			
 
				 
			
 
				 #ifdef USE_PERTURBATION
			
@@ -58,10 +58,10 @@ double task_12_cost(starpu_buffer_descr *descr)
 
				 
			
 
				 	n = starpu_matrix_get_nx(descr[0].handle);
			
 
				 
			
 
				-//	double cost = ((n*n*n)/1744.695);
			
 
				+/*	double cost = ((n*n*n)/1744.695); */
			
 
				 	double cost = ((n*n*n)/3210.80);
			
 
				 
			
 
				-	//fprintf(stderr, "task 12 predicts %e\n", cost);
			
 
				+	/* fprintf(stderr, "task 12 predicts %e\n", cost); */
			
 
				 	return PERTURBATE(cost);
			
 
				 }
			
 
				 
			
@@ -72,10 +72,10 @@ double task_21_cost(starpu_buffer_descr *descr)
 
				 
			
 
				 	n = starpu_matrix_get_nx(descr[0].handle);
			
 
				 
			
 
				-//	double cost = ((n*n*n)/1744.695);
			
 
				+/*	double cost = ((n*n*n)/1744.695); */
			
 
				 	double cost = ((n*n*n)/3691.53);
			
 
				 
			
 
				-	//fprintf(stderr, "task 12 predicts %e\n", cost);
			
 
				+	/* fprintf(stderr, "task 12 predicts %e\n", cost); */
			
 
				 	return PERTURBATE(cost);
			
 
				 }
			
 
				 
			
@@ -109,7 +109,7 @@ double task_11_cost_cuda(starpu_buffer_descr *descr)
 
				 
			
 
				 	double cost = ((n*n*n)/1853.7806);
			
 
				 
			
 
				-//	printf("CUDA task 11 ; predict %e\n", cost);
			
 
				+/*	printf("CUDA task 11 ; predict %e\n", cost); */
			
 
				 	return PERTURBATE(cost);
			
 
				 }
			
 
				 
			
@@ -121,7 +121,7 @@ double task_12_cost_cuda(starpu_buffer_descr *descr)
 
				 
			
 
				 	double cost = ((n*n*n)/42838.5718);
			
 
				 
			
 
				-//	printf("CUDA task 12 ; predict %e\n", cost);
			
 
				+/*	printf("CUDA task 12 ; predict %e\n", cost); */
			
 
				 	return PERTURBATE(cost);
			
 
				 }
			
 
				 
			
@@ -134,7 +134,7 @@ double task_21_cost_cuda(starpu_buffer_descr *descr)
 
				 
			
 
				 	double cost = ((n*n*n)/49208.667);
			
 
				 
			
 
				-//	printf("CUDA task 21 ; predict %e\n", cost);
			
 
				+/*	printf("CUDA task 21 ; predict %e\n", cost); */
			
 
				 	return PERTURBATE(cost);
			
 
				 }
			
 
				 
			
@@ -150,7 +150,7 @@ double task_22_cost_cuda(starpu_buffer_descr *descr)
 
				 
			
 
				 	double cost = ((nx*ny*nz)/57523.560);
			
 
				 
			
 
				-//	printf("CUDA task 22 ; predict %e\n", cost);
			
 
				+/*	printf("CUDA task 22 ; predict %e\n", cost); */
			
 
				 	return PERTURBATE(cost);
			
 
				 }
			
 
				 
			
@@ -168,7 +168,7 @@ double task_11_cost_cpu(starpu_buffer_descr *descr)
 
				 
			
 
				 	double cost = ((n*n*n)/537.5);
			
 
				 
			
 
				-//	printf("CPU task 11 ; predict %e\n", cost);
			
 
				+/*	printf("CPU task 11 ; predict %e\n", cost); */
			
 
				 	return PERTURBATE(cost);
			
 
				 }
			
 
				 
			
@@ -180,7 +180,7 @@ double task_12_cost_cpu(starpu_buffer_descr *descr)
 
				 
			
 
				 	double cost = ((n*n*n)/6668.224);
			
 
				 
			
 
				-//	printf("CPU task 12 ; predict %e\n", cost);
			
 
				+/*	printf("CPU task 12 ; predict %e\n", cost); */
			
 
				 	return PERTURBATE(cost);
			
 
				 }
			
 
				 
			
@@ -193,7 +193,7 @@ double task_21_cost_cpu(starpu_buffer_descr *descr)
 
				 
			
 
				 	double cost = ((n*n*n)/6793.8423);
			
 
				 
			
 
				-//	printf("CPU task 21 ; predict %e\n", cost);
			
 
				+/*	printf("CPU task 21 ; predict %e\n", cost); */
			
 
				 	return PERTURBATE(cost);
			
 
				 }
			
 
				 
			
@@ -209,7 +209,7 @@ double task_22_cost_cpu(starpu_buffer_descr *descr)
 
				 
			
 
				 	double cost = ((nx*ny*nz)/4203.0175);
			
 
				 
			
 
				-//	printf("CPU task 22 ; predict %e\n", cost);
			
 
				+/*	printf("CPU task 22 ; predict %e\n", cost); */
			
 
				 	return PERTURBATE(cost);
			
 
				 }
			
 
				 
			
--- a/examples/heat/lu_kernels_model.h
+++ b/examples/heat/lu_kernels_model.h
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -20,4 +20,4 @@
 
				 
			
 
				 #include <starpu.h>
			
 
				 
			
 
				-#endif // __LU_KERNELS_MODEL_H__
			
 
				+#endif /* __LU_KERNELS_MODEL_H__ */
			
--- a/examples/incrementer/incrementer.c
+++ b/examples/incrementer/incrementer.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -20,6 +20,7 @@
 
				 #include <sys/time.h>
			
 
				 
			
 
				 static unsigned niter = 50000;
			
 
				+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 extern void cuda_codelet(void *descr[], __attribute__ ((unused)) void *_args);
			
@@ -42,6 +43,9 @@ int main(int argc, char **argv)
 
				 {
			
 
				 	starpu_init(NULL);
			
 
				 
			
 
				+#ifdef STARPU_SLOW_MACHINE
			
 
				+	niter /= 100;
			
 
				+#endif
			
 
				 	if (argc == 2)
			
 
				 		niter = atoi(argv[1]);
			
 
				 
			
@@ -52,7 +56,7 @@ int main(int argc, char **argv)
 
				 			(uintptr_t)&float_array, 4, sizeof(float));
			
 
				 
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				-        starpu_opencl_load_opencl_from_file("examples/incrementer/incrementer_kernels_opencl_kernel.cl", &opencl_program);
			
 
				+        starpu_opencl_load_opencl_from_file("examples/incrementer/incrementer_kernels_opencl_kernel.cl", &opencl_program, NULL);
			
 
				 #endif
			
 
				 
			
 
				 	starpu_codelet cl =
			
@@ -88,7 +92,7 @@ int main(int argc, char **argv)
 
				 		int ret = starpu_task_submit(task);
			
 
				 		if (STARPU_UNLIKELY(ret == -ENODEV))
			
 
				 		{
			
 
				-			fprintf(stderr, "No worker may execute this task\n");
			
 
				+			FPRINTF(stderr, "No worker may execute this task\n");
			
 
				 			exit(0);
			
 
				 		}
			
 
				 	}
			
@@ -96,24 +100,24 @@ int main(int argc, char **argv)
 
				 	starpu_task_wait_for_all();
			
 
				 
			
 
				 	/* update the array in RAM */
			
 
				-	starpu_data_acquire(float_array_handle, STARPU_R);
			
 
				+	starpu_data_unregister(float_array_handle);
			
 
				 
			
 
				 	gettimeofday(&end, NULL);
			
 
				 
			
 
				-	fprintf(stderr, "array -> %f, %f, %f, %f\n", float_array[0],
			
 
				+	FPRINTF(stderr, "array -> %f, %f, %f, %f\n", float_array[0],
			
 
				                 float_array[1], float_array[2], float_array[3]);
			
 
				 
			
 
				+	STARPU_ASSERT(float_array[0] == niter);
			
 
				+
			
 
				 	if (float_array[0] != float_array[1] + float_array[2] + float_array[3]) {
			
 
				-		fprintf(stderr, "Incorrect result\n");
			
 
				+		FPRINTF(stderr, "Incorrect result\n");
			
 
				 		return 1;
			
 
				 	}
			
 
				 
			
 
				-	starpu_data_release(float_array_handle);
			
 
				-
			
 
				 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 +
			
 
				 					(end.tv_usec - start.tv_usec));
			
 
				 
			
 
				-	fprintf(stderr, "%d elems took %lf ms\n", niter, timing/1000);
			
 
				+	FPRINTF(stderr, "%u elems took %f ms\n", niter, timing/1000);
			
 
				 
			
 
				 	starpu_shutdown();
			
 
				 
			
--- a/examples/incrementer/incrementer_kernels_opencl.c
+++ b/examples/incrementer/incrementer_kernels_opencl.c
@@ -21,7 +21,7 @@
 
				 extern struct starpu_opencl_program opencl_program;
			
 
				 void opencl_codelet(void *descr[], void *_args)
			
 
				 {
			
 
				-	float *val = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
			
 
				+	cl_mem val = (cl_mem)STARPU_VECTOR_GET_PTR(descr[0]);
			
 
				 	cl_kernel kernel;
			
 
				 	cl_command_queue queue;
			
 
				 	cl_event event;
			
@@ -34,7 +34,7 @@ void opencl_codelet(void *descr[], void *_args)
 
				 	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				 
			
 
				 	err = 0;
			
 
				-	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &val);
			
 
				+	err = clSetKernelArg(kernel, 0, sizeof(val), &val);
			
 
				 	if (err) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				 
			
 
				 	{
			
--- a/examples/lu/lu_example.c
+++ b/examples/lu/lu_example.c
@@ -137,7 +137,7 @@ void copy_matrix_into_blocks(void)
 
				 	for (bj = 0; bj < nblocks; bj++)
			
 
				 	for (bi = 0; bi < nblocks; bi++)
			
 
				 	{
			
 
				-		starpu_data_malloc_pinned_if_possible((void **)&A_blocks[bi+nblocks*bj], (size_t)blocksize*blocksize*sizeof(TYPE));
			
 
				+		starpu_malloc((void **)&A_blocks[bi+nblocks*bj], (size_t)blocksize*blocksize*sizeof(TYPE));
			
 
				 
			
 
				 		for (j = 0; j < blocksize; j++)
			
 
				 		for (i = 0; i < blocksize; i++)
			
@@ -151,7 +151,7 @@ void copy_matrix_into_blocks(void)
 
				 static void init_matrix(void)
			
 
				 {
			
 
				 	/* allocate matrix */
			
 
				-	starpu_data_malloc_pinned_if_possible((void **)&A, (size_t)size*size*sizeof(TYPE));
			
 
				+	starpu_malloc((void **)&A, (size_t)size*size*sizeof(TYPE));
			
 
				 	STARPU_ASSERT(A);
			
 
				 
			
 
				 	starpu_srand48((long int)time(NULL));
			
@@ -341,7 +341,7 @@ int main(int argc, char **argv)
 
				 		} else {
			
 
				 			starpu_bound_compute(&min, NULL, 0);
			
 
				 			if (min != 0.)
			
 
				-				FPRINTF(stderr, "theoretical min: %lf ms\n", min);
			
 
				+				FPRINTF(stderr, "theoretical min: %f ms\n", min);
			
 
				 		}
			
 
				 	}
			
 
				 
			
--- a/examples/lu/xlu.c
+++ b/examples/lu/xlu.c
@@ -236,17 +236,15 @@ void STARPU_LU(lu_decomposition)(TYPE *matA, unsigned size, unsigned ld, unsigne
 
				 	/* We already enforce deps by hand */
			
 
				 	starpu_data_set_sequential_consistency_flag(dataA, 0);
			
 
				 
			
 
				-	struct starpu_data_filter f;
			
 
				-		f.filter_func = starpu_vertical_block_filter_func;
			
 
				-		f.nchildren = nblocks;
			
 
				-		f.get_nchildren = NULL;
			
 
				-		f.get_child_ops = NULL;
			
 
				-
			
 
				-	struct starpu_data_filter f2;
			
 
				-		f2.filter_func = starpu_block_filter_func;
			
 
				-		f2.nchildren = nblocks;
			
 
				-		f2.get_nchildren = NULL;
			
 
				-		f2.get_child_ops = NULL;
			
 
				+	struct starpu_data_filter f = {
			
 
				+		.filter_func = starpu_vertical_block_filter_func,
			
 
				+		.nchildren = nblocks
			
 
				+	};
			
 
				+
			
 
				+	struct starpu_data_filter f2 = {
			
 
				+		.filter_func = starpu_block_filter_func,
			
 
				+		.nchildren = nblocks
			
 
				+	};
			
 
				 
			
 
				 	starpu_data_map_filters(dataA, 2, &f, &f2);
			
 
				 
			
--- a/examples/lu/xlu_implicit.c
+++ b/examples/lu/xlu_implicit.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010-2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
@@ -143,17 +143,15 @@ void STARPU_LU(lu_decomposition)(TYPE *matA, unsigned size, unsigned ld, unsigne
 
				 	 * one block is now determined by 2 unsigned (i,j) */
			
 
				 	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(TYPE));
			
 
				 	
			
 
				-	struct starpu_data_filter f;
			
 
				-		f.filter_func = starpu_vertical_block_filter_func;
			
 
				-		f.nchildren = nblocks;
			
 
				-		f.get_nchildren = NULL;
			
 
				-		f.get_child_ops = NULL;
			
 
				-
			
 
				-	struct starpu_data_filter f2;
			
 
				-		f2.filter_func = starpu_block_filter_func;
			
 
				-		f2.nchildren = nblocks;
			
 
				-		f2.get_nchildren = NULL;
			
 
				-		f2.get_child_ops = NULL;
			
 
				+	struct starpu_data_filter f = {
			
 
				+		.filter_func = starpu_vertical_block_filter_func,
			
 
				+		.nchildren = nblocks
			
 
				+	};
			
 
				+
			
 
				+	struct starpu_data_filter f2 = {
			
 
				+		.filter_func = starpu_block_filter_func,
			
 
				+		.nchildren = nblocks
			
 
				+	};
			
 
				 
			
 
				 	starpu_data_map_filters(dataA, 2, &f, &f2);
			
 
				 
			
--- a/examples/lu/xlu_implicit_pivot.c
+++ b/examples/lu/xlu_implicit_pivot.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010-2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
@@ -189,17 +189,15 @@ void STARPU_LU(lu_decomposition_pivot)(TYPE *matA, unsigned *ipiv, unsigned size
 
				 	 * one block is now determined by 2 unsigned (i,j) */
			
 
				 	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(TYPE));
			
 
				 
			
 
				-	struct starpu_data_filter f;
			
 
				-		f.filter_func = starpu_vertical_block_filter_func;
			
 
				-		f.nchildren = nblocks;
			
 
				-		f.get_nchildren = NULL;
			
 
				-		f.get_child_ops = NULL;
			
 
				-
			
 
				-	struct starpu_data_filter f2;
			
 
				-		f2.filter_func = starpu_block_filter_func;
			
 
				-		f2.nchildren = nblocks;
			
 
				-		f2.get_nchildren = NULL;
			
 
				-		f2.get_child_ops = NULL;
			
 
				+	struct starpu_data_filter f = {
			
 
				+		.filter_func = starpu_vertical_block_filter_func,
			
 
				+		.nchildren = nblocks
			
 
				+	};
			
 
				+
			
 
				+	struct starpu_data_filter f2 = {
			
 
				+		.filter_func = starpu_block_filter_func,
			
 
				+		.nchildren = nblocks
			
 
				+	};
			
 
				 
			
 
				 	starpu_data_map_filters(dataA, 2, &f, &f2);
			
 
				 
			
--- a/examples/mandelbrot/mandelbrot.c
+++ b/examples/mandelbrot/mandelbrot.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010, 2011  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -29,14 +29,15 @@ int use_x11 = 1;
 
				 #endif
			
 
				 
			
 
				 int demo = 0;
			
 
				+static double demozoom = 0.05;
			
 
				 
			
 
				 /* NB: The X11 code is inspired from the http://locklessinc.com/articles/mandelbrot/ article */
			
 
				 
			
 
				 static int nblocks = 20;
			
 
				 static int height = 400;
			
 
				 static int width = 640;
			
 
				-static int maxIt = 20000; // max number of iteration in the Mandelbrot function
			
 
				-static int niter = -1; // number of loops in case we don't use X11, -1 means infinite
			
 
				+static int maxIt = 20000; /* max number of iteration in the Mandelbrot function */
			
 
				+static int niter = -1; /* number of loops in case we don't use X11, -1 means infinite */
			
 
				 static int use_spmd = 0;
			
 
				 
			
 
				 static double leftX = -0.745;
			
@@ -233,7 +234,7 @@ static void compute_block_opencl(void *descr[], void *cl_arg)
 
				 {
			
 
				 	int iby, block_size;
			
 
				 	double stepX, stepY;
			
 
				-	int *pcnt; // unused for CUDA tasks
			
 
				+	int *pcnt; /* unused for CUDA tasks */
			
 
				 	starpu_unpack_cl_args(cl_arg, &iby, &block_size, &stepX, &stepY, &pcnt);
			
 
				 
			
 
				 	cl_mem data = (cl_mem)STARPU_VECTOR_GET_PTR(descr[0]);
			
@@ -247,15 +248,15 @@ static void compute_block_opencl(void *descr[], void *cl_arg)
 
				 
			
 
				 	starpu_opencl_load_kernel(&kernel, &queue, &opencl_programs, "mandelbrot_kernel", devid);
			
 
				 
			
 
				-	clSetKernelArg(kernel, 0, sizeof(cl_mem), &data);
			
 
				-	clSetKernelArg(kernel, 1, sizeof(double), &leftX);
			
 
				-	clSetKernelArg(kernel, 2, sizeof(double), &topY);
			
 
				-	clSetKernelArg(kernel, 3, sizeof(double), &stepX);
			
 
				-	clSetKernelArg(kernel, 4, sizeof(double), &stepY);
			
 
				-	clSetKernelArg(kernel, 5, sizeof(int), &maxIt);
			
 
				-	clSetKernelArg(kernel, 6, sizeof(int), &iby);
			
 
				-	clSetKernelArg(kernel, 7, sizeof(int), &block_size);
			
 
				-	clSetKernelArg(kernel, 8, sizeof(int), &width);
			
 
				+	clSetKernelArg(kernel, 0, sizeof(data), &data);
			
 
				+	clSetKernelArg(kernel, 1, sizeof(leftX), &leftX);
			
 
				+	clSetKernelArg(kernel, 2, sizeof(topY), &topY);
			
 
				+	clSetKernelArg(kernel, 3, sizeof(stepX), &stepX);
			
 
				+	clSetKernelArg(kernel, 4, sizeof(stepY), &stepY);
			
 
				+	clSetKernelArg(kernel, 5, sizeof(maxIt), &maxIt);
			
 
				+	clSetKernelArg(kernel, 6, sizeof(iby), &iby);
			
 
				+	clSetKernelArg(kernel, 7, sizeof(block_size), &block_size);
			
 
				+	clSetKernelArg(kernel, 8, sizeof(width), &width);
			
 
				 
			
 
				 	unsigned dim = 16;
			
 
				 	size_t local[2] = {dim, 1};
			
@@ -278,7 +279,7 @@ static void compute_block(void *descr[], void *cl_arg)
 
				 
			
 
				 	int iby, block_size;
			
 
				 	double stepX, stepY;
			
 
				-	int *pcnt; // unused for sequential tasks
			
 
				+	int *pcnt; /* unused for sequential tasks */
			
 
				 	starpu_unpack_cl_args(cl_arg, &iby, &block_size, &stepX, &stepY, &pcnt);
			
 
				 
			
 
				 	unsigned *data = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
			
@@ -291,7 +292,7 @@ static void compute_block(void *descr[], void *cl_arg)
 
				 		{
			
 
				 			double cx = leftX + ix * stepX;
			
 
				 			double cy = topY - iy * stepY;
			
 
				-			// Z = X+I*Y
			
 
				+			/* Z = X+I*Y */
			
 
				 			double x = 0;
			
 
				 			double y = 0;
			
 
				 			int it;
			
@@ -300,13 +301,13 @@ static void compute_block(void *descr[], void *cl_arg)
 
				 				double x2 = x*x;
			
 
				 				double y2 = y*y;
			
 
				 
			
 
				-				// Stop iterations when |Z| > 2
			
 
				+				/* Stop iterations when |Z| > 2 */
			
 
				 				if (x2 + y2 > 4.0)
			
 
				 					break;
			
 
				 
			
 
				 				double twoxy = 2.0*x*y;
			
 
				 
			
 
				-				// Z = Z^2 + C
			
 
				+				/* Z = Z^2 + C */
			
 
				 				x = x2 - y2 + cx;
			
 
				 				y = twoxy + cy;
			
 
				 			}
			
@@ -327,8 +328,8 @@ static void compute_block_spmd(void *descr[], void *cl_arg)
 
				 
			
 
				 	unsigned *data = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
			
 
				 
			
 
				-	int ix, iy; // global coordinates
			
 
				-	int local_iy; // current line
			
 
				+	int ix, iy; /* global coordinates */
			
 
				+	int local_iy; /* current line */
			
 
				 
			
 
				 	while (1)
			
 
				 	{
			
@@ -342,7 +343,7 @@ static void compute_block_spmd(void *descr[], void *cl_arg)
 
				 		{
			
 
				 			double cx = leftX + ix * stepX;
			
 
				 			double cy = topY - iy * stepY;
			
 
				-			// Z = X+I*Y
			
 
				+			/* Z = X+I*Y */
			
 
				 			double x = 0;
			
 
				 			double y = 0;
			
 
				 			int it;
			
@@ -351,13 +352,13 @@ static void compute_block_spmd(void *descr[], void *cl_arg)
 
				 				double x2 = x*x;
			
 
				 				double y2 = y*y;
			
 
				 
			
 
				-				// Stop iterations when |Z| > 2
			
 
				+				/* Stop iterations when |Z| > 2 */
			
 
				 				if (x2 + y2 > 4.0)
			
 
				 					break;
			
 
				 
			
 
				 				double twoxy = 2.0*x*y;
			
 
				 
			
 
				-				// Z = Z^2 + C
			
 
				+				/* Z = Z^2 + C */
			
 
				 				x = x2 - y2 + cx;
			
 
				 				y = twoxy + cy;
			
 
				 			}
			
@@ -396,7 +397,7 @@ static void parse_args(int argc, char **argv)
 
				 	int i;
			
 
				 	for (i = 1; i < argc; i++) {
			
 
				 		if (strcmp(argv[i], "-h") == 0) {
			
 
				-			fprintf(stderr, "Usage: %s [-h] [ -width 800] [-height 600] [-nblocks 16] [-no-x11] [-pos leftx:rightx:bottomy:topy] [-niter 1000] [-spmd]\n", argv[0]);
			
 
				+			fprintf(stderr, "Usage: %s [-h] [ -width 800] [-height 600] [-nblocks 16] [-no-x11] [-pos leftx:rightx:bottomy:topy] [-niter 1000] [-spmd] [-demo] [-demozoom 0.2]\n", argv[0]);
			
 
				 			exit(-1);
			
 
				 		}
			
 
				 
			
@@ -434,6 +435,11 @@ static void parse_args(int argc, char **argv)
 
				 
			
 
				 		}
			
 
				 
			
 
				+		if (strcmp(argv[i], "-demozoom") == 0) {
			
 
				+			char *argptr;
			
 
				+			demozoom = strtof(argv[++i], &argptr);
			
 
				+		}
			
 
				+
			
 
				 		if (strcmp(argv[i], "-no-x11") == 0) {
			
 
				 #ifdef STARPU_HAVE_X11
			
 
				 			use_x11 = 0;
			
@@ -461,7 +467,7 @@ int main(int argc, char **argv)
 
				 	starpu_init(&conf);
			
 
				 
			
 
				 	unsigned *buffer;
			
 
				-	starpu_data_malloc_pinned_if_possible((void **)&buffer, height*width*sizeof(unsigned));
			
 
				+	starpu_malloc((void **)&buffer, height*width*sizeof(unsigned));
			
 
				 
			
 
				 #ifdef STARPU_HAVE_X11
			
 
				 	if (use_x11)
			
@@ -472,7 +478,7 @@ int main(int argc, char **argv)
 
				 	STARPU_ASSERT((height % nblocks) == 0);
			
 
				 
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				-	starpu_opencl_load_opencl_from_string(mandelbrot_opencl_src, &opencl_programs);
			
 
				+	starpu_opencl_load_opencl_from_string(mandelbrot_opencl_src, &opencl_programs, NULL);
			
 
				 #endif
			
 
				 
			
 
				 	starpu_data_handle block_handles[nblocks];
			
@@ -520,24 +526,24 @@ int main(int argc, char **argv)
 
				 
			
 
				 		for (iby = 0; iby < nblocks; iby++)
			
 
				 		{
			
 
				-			starpu_data_acquire(block_handles[iby], STARPU_R);
			
 
				 #ifdef STARPU_HAVE_X11
			
 
				 			if (use_x11)
			
 
				 			{
			
 
				+				starpu_data_acquire(block_handles[iby], STARPU_R);
			
 
				 				XPutImage(dpy, win, gc, bitmap,
			
 
				 					0, iby*block_size,
			
 
				 					0, iby*block_size,
			
 
				 					width, block_size);
			
 
				+				starpu_data_release(block_handles[iby]);
			
 
				 			}
			
 
				 #endif
			
 
				-			starpu_data_release(block_handles[iby]);
			
 
				 		}
			
 
				 
			
 
				 
			
 
				 		if (demo)
			
 
				 		{
			
 
				 			/* Zoom in */
			
 
				-			double zoom_factor = 0.05;
			
 
				+			double zoom_factor = demozoom;
			
 
				 			double widthX = rightX - leftX;
			
 
				 			double heightY = topY - bottomY;
			
 
				 
			
@@ -554,7 +560,7 @@ int main(int argc, char **argv)
 
				 				gettimeofday(&end, NULL);
			
 
				 				double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
			
 
				 
			
 
				-				fprintf(stderr, "Time to generate %d frames : %f s\n", iter, timing/1000000.0);
			
 
				+				fprintf(stderr, "Time to generate %u frames : %f s\n", iter, timing/1000000.0);
			
 
				 				fprintf(stderr, "Average FPS: %f\n", ((double)iter*1e+6)/timing);
			
 
				 
			
 
				 				/* Reset counters */
			
@@ -583,7 +589,7 @@ int main(int argc, char **argv)
 
				 	for (iby = 0; iby < nblocks; iby++)
			
 
				 		starpu_data_unregister(block_handles[iby]);
			
 
				 
			
 
				-//	starpu_data_free_pinned_if_possible(buffer);
			
 
				+/*	starpu_data_free_pinned_if_possible(buffer); */
			
 
				 
			
 
				 	starpu_shutdown();
			
 
				 
			
--- a/examples/matvecmult/matvecmult.c
+++ b/examples/matvecmult/matvecmult.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010, 2011  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -20,6 +20,8 @@
 
				 #include <pthread.h>
			
 
				 #include <math.h>
			
 
				 
			
 
				+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
			
 
				+
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				 struct starpu_opencl_program opencl_code;
			
 
				 void opencl_codelet(void *descr[], __attribute__ ((unused)) void *_args)
			
@@ -27,9 +29,9 @@ void opencl_codelet(void *descr[], __attribute__ ((unused)) void *_args)
 
				 	cl_kernel kernel;
			
 
				 	cl_command_queue queue;
			
 
				 	int id, devid, err, n;
			
 
				-	float *matrix = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
			
 
				-	float *vector = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
			
 
				-	float *mult = (float *)STARPU_VECTOR_GET_PTR(descr[2]);
			
 
				+	cl_mem matrix = (cl_mem)STARPU_MATRIX_GET_PTR(descr[0]);
			
 
				+	cl_mem vector = (cl_mem)STARPU_VECTOR_GET_PTR(descr[1]);
			
 
				+	cl_mem mult = (cl_mem)STARPU_VECTOR_GET_PTR(descr[2]);
			
 
				 	int nx = STARPU_MATRIX_GET_NX(descr[0]);
			
 
				 	int ny = STARPU_MATRIX_GET_NY(descr[0]);
			
 
				 	cl_event event;
			
@@ -41,11 +43,11 @@ void opencl_codelet(void *descr[], __attribute__ ((unused)) void *_args)
 
				         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				 
			
 
				         n=0;
			
 
				-        err = clSetKernelArg(kernel, n++, sizeof(cl_mem), &matrix);
			
 
				-        err |= clSetKernelArg(kernel, n++, sizeof(cl_mem), &vector);
			
 
				-        err |= clSetKernelArg(kernel, n++, sizeof(int), (void*)&nx);
			
 
				-        err |= clSetKernelArg(kernel, n++, sizeof(int), (void*)&ny);
			
 
				-        err |= clSetKernelArg(kernel, n++, sizeof(cl_mem), &mult);
			
 
				+        err = clSetKernelArg(kernel, n++, sizeof(matrix), &matrix);
			
 
				+        err |= clSetKernelArg(kernel, n++, sizeof(vector), &vector);
			
 
				+        err |= clSetKernelArg(kernel, n++, sizeof(nx), (void*)&nx);
			
 
				+        err |= clSetKernelArg(kernel, n++, sizeof(ny), (void*)&ny);
			
 
				+	err |= clSetKernelArg(kernel, n++, sizeof(mult), &mult);
			
 
				         if (err) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				 
			
 
				 	{
			
@@ -73,9 +75,9 @@ void fillArray(float* pfData, int iSize) {
 
				 void printArray(float* pfData, int iSize) {
			
 
				     int i;
			
 
				     for (i = 0; i < iSize; ++i) {
			
 
				-            fprintf(stderr, "%f ", pfData[i]);
			
 
				+            FPRINTF(stderr, "%f ", pfData[i]);
			
 
				     }
			
 
				-    fprintf(stderr, "\n");
			
 
				+    FPRINTF(stderr, "\n");
			
 
				 }
			
 
				 
			
 
				 void matVecMult(const float *matrix, const float *vector, int width, int height, float *mult) {
			
@@ -121,8 +123,8 @@ int main(int argc, char **argv)
 
				                 .nopencl = 1,
			
 
				 	};
			
 
				 
			
 
				-        //int width=1100;
			
 
				-        //int height=244021;
			
 
				+        /* int width=1100; */
			
 
				+        /* int height=244021; */
			
 
				         int width=20;
			
 
				         int height=4;
			
 
				 
			
@@ -131,8 +133,14 @@ int main(int argc, char **argv)
 
				         unsigned int mem_size_matrix, mem_size_vector, mem_size_mult;
			
 
				 
			
 
				 	starpu_data_handle matrix_handle, vector_handle, mult_handle;
			
 
				+	int ret, submit;
			
 
				 
			
 
				-        starpu_init(&conf);
			
 
				+        ret = starpu_init(&conf);
			
 
				+	if (STARPU_UNLIKELY(ret == -ENODEV)) {
			
 
				+                FPRINTF(stderr, "This application requires an OpenCL worker.\n");
			
 
				+		starpu_shutdown();
			
 
				+		exit(0);
			
 
				+	}
			
 
				 
			
 
				         mem_size_matrix = width * height * sizeof(float);
			
 
				         matrix = (float*)malloc(mem_size_matrix);
			
@@ -157,7 +165,7 @@ int main(int argc, char **argv)
 
				 	starpu_vector_data_register(&mult_handle, 0, (uintptr_t)mult, height, sizeof(float));
			
 
				 
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				-        starpu_opencl_load_opencl_from_file("examples/matvecmult/matvecmult_kernel.cl", &opencl_code);
			
 
				+        starpu_opencl_load_opencl_from_file("examples/matvecmult/matvecmult_kernel.cl", &opencl_code, NULL);
			
 
				 #endif
			
 
				 
			
 
				 	cl.where = STARPU_OPENCL;
			
@@ -177,30 +185,28 @@ int main(int argc, char **argv)
 
				         task->buffers[2].handle = mult_handle;
			
 
				         task->buffers[2].mode = STARPU_RW;
			
 
				 
			
 
				-        int ret = starpu_task_submit(task);
			
 
				-        if (STARPU_UNLIKELY(ret == -ENODEV)) {
			
 
				-                fprintf(stderr, "No worker may execute this task. This application requires an OpenCL worker.\n");
			
 
				-                exit(0);
			
 
				+        submit = starpu_task_submit(task);
			
 
				+        if (STARPU_UNLIKELY(submit == -ENODEV)) {
			
 
				+                FPRINTF(stderr, "No worker may execute this task. This application requires an OpenCL worker.\n");
			
 
				+	}
			
 
				+	else {
			
 
				+		starpu_task_wait_for_all();
			
 
				 	}
			
 
				 
			
 
				-	starpu_task_wait_for_all();
			
 
				+	starpu_data_unregister(matrix_handle);
			
 
				+	starpu_data_unregister(vector_handle);
			
 
				+	starpu_data_unregister(mult_handle);
			
 
				 
			
 
				-	/* update the array in RAM */
			
 
				-        starpu_data_acquire(matrix_handle, STARPU_R);
			
 
				-        starpu_data_acquire(vector_handle, STARPU_R);
			
 
				-        starpu_data_acquire(mult_handle, STARPU_R);
			
 
				+        if (STARPU_LIKELY(submit != -ENODEV)) {
			
 
				+		int res = compareL2fe(correctResult, mult, height, 1e-6f);
			
 
				+		FPRINTF(stdout, "TEST %s\n\n", (res == 0) ? "PASSED" : "FAILED !!!");
			
 
				+	}
			
 
				 
			
 
				-        int res = compareL2fe(correctResult, mult, height, 1e-6f);
			
 
				-        printf("TEST %s\n\n", (res == 0) ? "PASSED" : "FAILED !!!");
			
 
				 #if 0
			
 
				         printArray(matrix, width*height);
			
 
				         printArray(vector, width);
			
 
				         printArray(mult, height);
			
 
				 #endif
			
 
				-        starpu_data_release(matrix_handle);
			
 
				-        starpu_data_release(vector_handle);
			
 
				-        starpu_data_release(mult_handle);
			
 
				-
			
 
				         starpu_shutdown();
			
 
				 
			
 
				 	return 0;
			
--- a/examples/mult/xgemm.c
+++ b/examples/mult/xgemm.c
@@ -2,7 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -42,6 +42,8 @@ static unsigned check = 0;
 
				 static TYPE *A, *B, *C;
			
 
				 static starpu_data_handle A_handle, B_handle, C_handle;
			
 
				 
			
 
				+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
			
 
				+
			
 
				 static void check_output(void)
			
 
				 {
			
 
				 	/* compute C = C - AB */
			
@@ -52,14 +54,14 @@ static void check_output(void)
 
				 	err = CPU_ASUM(xdim*ydim, C, 1);
			
 
				 
			
 
				 	if (err < xdim*ydim*0.001) {
			
 
				-		fprintf(stderr, "Results are OK\n");
			
 
				+		FPRINTF(stderr, "Results are OK\n");
			
 
				 	}
			
 
				 	else {
			
 
				 		int max;
			
 
				 		max = CPU_IAMAX(xdim*ydim, C, 1);
			
 
				 
			
 
				-		fprintf(stderr, "There were errors ... err = %f\n", err);
			
 
				-		fprintf(stderr, "Max error : %e\n", C[max]);
			
 
				+		FPRINTF(stderr, "There were errors ... err = %f\n", err);
			
 
				+		FPRINTF(stderr, "Max error : %e\n", C[max]);
			
 
				 	}
			
 
				 }
			
 
				 
			
@@ -67,9 +69,9 @@ static void init_problem_data(void)
 
				 {
			
 
				 	unsigned i,j;
			
 
				 
			
 
				-	starpu_data_malloc_pinned_if_possible((void **)&A, zdim*ydim*sizeof(TYPE));
			
 
				-	starpu_data_malloc_pinned_if_possible((void **)&B, xdim*zdim*sizeof(TYPE));
			
 
				-	starpu_data_malloc_pinned_if_possible((void **)&C, xdim*ydim*sizeof(TYPE));
			
 
				+	starpu_malloc((void **)&A, zdim*ydim*sizeof(TYPE));
			
 
				+	starpu_malloc((void **)&B, xdim*zdim*sizeof(TYPE));
			
 
				+	starpu_malloc((void **)&C, xdim*ydim*sizeof(TYPE));
			
 
				 
			
 
				 	/* fill the A and B matrices */
			
 
				 	for (j=0; j < ydim; j++) {
			
@@ -100,20 +102,20 @@ static void partition_mult_data(void)
 
				 	starpu_matrix_data_register(&C_handle, 0, (uintptr_t)C, 
			
 
				 		ydim, ydim, xdim, sizeof(TYPE));
			
 
				 
			
 
				-	struct starpu_data_filter f;
			
 
				-	memset(&f, 0, sizeof(f));
			
 
				-	f.filter_func = starpu_vertical_block_filter_func;
			
 
				-	f.nchildren = nslicesx;
			
 
				+	struct starpu_data_filter vert;
			
 
				+	memset(&vert, 0, sizeof(vert));
			
 
				+	vert.filter_func = starpu_vertical_block_filter_func;
			
 
				+	vert.nchildren = nslicesx;
			
 
				 		
			
 
				-	struct starpu_data_filter f2;
			
 
				-	memset(&f2, 0, sizeof(f2));
			
 
				-	f2.filter_func = starpu_block_filter_func;
			
 
				-	f2.nchildren = nslicesy;
			
 
				+	struct starpu_data_filter horiz;
			
 
				+	memset(&horiz, 0, sizeof(horiz));
			
 
				+	horiz.filter_func = starpu_block_filter_func;
			
 
				+	horiz.nchildren = nslicesy;
			
 
				 		
			
 
				-	starpu_data_partition(B_handle, &f);
			
 
				-	starpu_data_partition(A_handle, &f2);
			
 
				+	starpu_data_partition(B_handle, &vert);
			
 
				+	starpu_data_partition(A_handle, &horiz);
			
 
				 
			
 
				-	starpu_data_map_filters(C_handle, 2, &f, &f2);
			
 
				+	starpu_data_map_filters(C_handle, 2, &vert, &horiz);
			
 
				 }
			
 
				 
			
 
				 static void mult_kernel_common(void *descr[], int type)
			
@@ -145,10 +147,12 @@ static void mult_kernel_common(void *descr[], int type)
 
				 			int block_size = (nyC + worker_size - 1)/worker_size;
			
 
				 			int new_nyC = STARPU_MIN(nyC, block_size*(rank+1)) - block_size*rank;
			
 
				 
			
 
				-			TYPE *new_subA = &subA[block_size*rank];
			
 
				+			STARPU_ASSERT(nyC == STARPU_MATRIX_GET_NY(descr[1])); /* check only: '==' so the assertion cannot overwrite nyC */
			
 
				+
			
 
				+			TYPE *new_subB = &subB[block_size*rank];
			
 
				 			TYPE *new_subC = &subC[block_size*rank];
			
 
				 
			
 
				-			CPU_GEMM("N", "N", nxC, new_nyC, nyA, (TYPE)1.0, new_subA, ldA, subB, ldB, (TYPE)0.0, new_subC, ldC);
			
 
				+			CPU_GEMM("N", "N", nxC, new_nyC, nyA, (TYPE)1.0, subA, ldA, new_subB, ldB, (TYPE)0.0, new_subC, ldC);
			
 
				 		}
			
 
				 	}
			
 
				 #ifdef STARPU_USE_CUDA
			
@@ -282,11 +286,11 @@ int main(int argc, char **argv)
 
				 	gettimeofday(&end, NULL);
			
 
				 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
			
 
				 
			
 
				-	fprintf(stderr, "Time: %2.2f ms\n", timing/1000.0);
			
 
				+	FPRINTF(stderr, "Time: %2.2f ms\n", timing/1000.0);
			
 
				 
			
 
				 	double flops = 2.0*((unsigned long)niter)*((unsigned long)xdim)
			
 
				 				*((unsigned long)ydim)*((unsigned long)zdim);
			
 
				-	fprintf(stderr, "GFlop/s: %.2f\n", flops/timing/1000.0);
			
 
				+	FPRINTF(stderr, "GFlop/s: %.2f\n", flops/timing/1000.0);
			
 
				 
			
 
				 	starpu_data_unpartition(C_handle, 0);
			
 
				 	starpu_data_unregister(C_handle);
			
--- a/examples/openmp/vector_scal.c
+++ b/examples/openmp/vector_scal.c
@@ -0,0 +1,105 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Université de Bordeaux 1
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+/* gcc build:
			
 
				+
			
 
				+   gcc -fopenmp vector_scal.c -o vector_scal $(pkg-config --cflags libstarpu) $(pkg-config --libs libstarpu)
			
 
				+
			
 
				+ */
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+#include <stdio.h>
			
 
				+#include <limits.h>
			
 
				+
			
 
				+#define	NX	2048
			
 
				+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
			
 
				+
			
 
				+/* CPU implementation of the codelet.
			
 
				+ *
			
 
				+ * Multiplies, in place, every element of the vector described by
			
 
				+ * buffers[0] by the float that _args points to.  The loop is an OpenMP
			
 
				+ * parallel for whose thread count is starpu_combined_worker_get_size(). */
			
 
				+void scal_cpu_func(void *buffers[], void *_args) {
			
 
				+	starpu_vector_interface_t *vec = buffers[0];
			
 
				+	float *data = (float *)STARPU_VECTOR_GET_PTR(vec);
			
 
				+	unsigned count = STARPU_VECTOR_GET_NX(vec);
			
 
				+	float *scale = _args;
			
 
				+	unsigned idx;
			
 
				+
			
 
				+	FPRINTF(stderr, "running task with %d CPUs.\n", starpu_combined_worker_get_size());
			
 
				+
			
 
				+#pragma omp parallel for num_threads(starpu_combined_worker_get_size())
			
 
				+	for (idx = 0; idx < count; idx++) {
			
 
				+		data[idx] = data[idx] * *scale;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static struct starpu_perfmodel_t vector_scal_model = {
			
 
				+	.type = STARPU_HISTORY_BASED,
			
 
				+	.symbol = "vector_scale_parallel"
			
 
				+};
			
 
				+
			
 
				+static starpu_codelet cl = {
			
 
				+	.where = STARPU_CPU,
			
 
				+	.type = STARPU_FORKJOIN,
			
 
				+	.max_parallelism = INT_MAX,
			
 
				+	.cpu_func = scal_cpu_func,
			
 
				+	.nbuffers = 1,
			
 
				+	.model = &vector_scal_model,
			
 
				+};
			
 
				+
			
 
				+/* Example driver: scales an NX-element vector by a constant with a single
			
 
				+ * synchronous StarPU task that runs the fork-join codelet `cl`.
			
 
				+ *
			
 
				+ * Returns 0 on success, 1 when StarPU could not be initialised or the task
			
 
				+ * could not be submitted. */
			
 
				+int main(int argc, char **argv)
			
 
				+{
			
 
				+	struct starpu_conf conf;
			
 
				+	float vector[NX];
			
 
				+	unsigned i;
			
 
				+	int ret;
			
 
				+	int status = 0;
			
 
				+
			
 
				+	for (i = 0; i < NX; i++)
			
 
				+                vector[i] = (i+1.0f);
			
 
				+
			
 
				+	FPRINTF(stderr, "BEFORE: First element was %f\n", vector[0]);
			
 
				+	FPRINTF(stderr, "BEFORE: Last element was %f\n", vector[NX-1]);
			
 
				+
			
 
				+	starpu_conf_init(&conf);
			
 
				+
			
 
				+	/* Most OpenMP implementations do not support concurrent parallel
			
 
				+	 * sections, so only create one big worker */
			
 
				+	conf.single_combined_worker = 1;
			
 
				+
			
 
				+	/* Do not touch the runtime any further if it failed to start */
			
 
				+	ret = starpu_init(&conf);
			
 
				+	if (ret != 0) {
			
 
				+		FPRINTF(stderr, "starpu_init failed (error %d)\n", ret);
			
 
				+		return 1;
			
 
				+	}
			
 
				+
			
 
				+	starpu_data_handle vector_handle;
			
 
				+	starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector, NX, sizeof(vector[0]));
			
 
				+
			
 
				+	float factor = 3.14;
			
 
				+
			
 
				+	struct starpu_task *task = starpu_task_create();
			
 
				+	task->synchronous = 1;
			
 
				+
			
 
				+	task->cl = &cl;
			
 
				+
			
 
				+	task->buffers[0].handle = vector_handle;
			
 
				+	task->buffers[0].mode = STARPU_RW;
			
 
				+	task->cl_arg = &factor;
			
 
				+	task->cl_arg_size = sizeof(factor);
			
 
				+
			
 
				+	/* Report a failed submission instead of silently printing an
			
 
				+	 * unscaled vector and exiting with success */
			
 
				+	ret = starpu_task_submit(task);
			
 
				+	if (ret != 0) {
			
 
				+		FPRINTF(stderr, "starpu_task_submit failed (error %d)\n", ret);
			
 
				+		status = 1;
			
 
				+	}
			
 
				+	starpu_data_unregister(vector_handle);
			
 
				+
			
 
				+	/* NOTE(review): tasks obtained from starpu_task_create() may already be
			
 
				+	 * freed by StarPU once they have run; confirm this explicit destroy is
			
 
				+	 * not a double free with the StarPU version in use. */
			
 
				+	starpu_task_destroy(task);
			
 
				+
			
 
				+	/* terminate StarPU, no task can be submitted after */
			
 
				+	starpu_shutdown();
			
 
				+
			
 
				+	FPRINTF(stderr, "AFTER: First element is %f\n", vector[0]);
			
 
				+	FPRINTF(stderr, "AFTER: Last element is %f\n", vector[NX-1]);
			
 
				+
			
 
				+	return status;
			
 
				+}
			
--- a/examples/opt/Makefile.am
+++ b/examples/opt/Makefile.am
@@ -0,0 +1,78 @@
 
				+# StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+#
			
 
				+# Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
			
 
				+# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+#
			
 
				+# StarPU is free software; you can redistribute it and/or modify
			
 
				+# it under the terms of the GNU Lesser General Public License as published by
			
 
				+# the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+# your option) any later version.
			
 
				+#
			
 
				+# StarPU is distributed in the hope that it will be useful, but
			
 
				+# WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+#
			
 
				+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+
			
 
				+AM_CFLAGS = $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
			
 
				+LIBS = $(top_builddir)/src/libstarpu.la $(HWLOC_LIBS) @LIBS@
			
 
				+AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include
			
 
				+AM_LDFLAGS = $(STARPU_CUDA_LDFLAGS) $(STARPU_OPENCL_LDFLAGS)
			
 
				+
			
 
				+if STARPU_USE_CUDA
			
 
				+
			
 
				+NVCCFLAGS += --compiler-options -fno-strict-aliasing  -I$(top_srcdir)/include/ -I$(top_builddir)/include/ -I$(top_srcdir)/examples/ $(HWLOC_CFLAGS) -arch sm_13
			
 
				+
			
 
				+.cu.o:
			
 
				+	$(NVCC) $< -c -o $@ $(NVCCFLAGS)
			
 
				+
			
 
				+endif
			
 
				+
			
 
				+TESTS	=	$(check_PROGRAMS)
			
 
				+
			
 
				+check_PROGRAMS =
			
 
				+
			
 
				+examplebindir = $(libdir)/starpu/examples/
			
 
				+
			
 
				+examplebin_PROGRAMS =
			
 
				+
			
 
				+noinst_HEADERS = 				\
			
 
				+	pi/SobolQRNG/sobol.h			\
			
 
				+	pi/SobolQRNG/sobol_gold.h		\
			
 
				+	pi/SobolQRNG/sobol_gpu.h		\
			
 
				+	pi/SobolQRNG/sobol_primitives.h
			
 
				+
			
 
				+######
			
 
				+# Pi #
			
 
				+######
			
 
				+
			
 
				+check_PROGRAMS +=				\
			
 
				+	pi/pi					\
			
 
				+	pi/pi_redux
			
 
				+
			
 
				+examplebin_PROGRAMS +=				\
			
 
				+	pi/pi					\
			
 
				+	pi/pi_redux
			
 
				+
			
 
				+pi_pi_SOURCES =					\
			
 
				+	pi/pi.c					\
			
 
				+	pi/SobolQRNG/sobol_gold.c		\
			
 
				+	pi/SobolQRNG/sobol_primitives.c
			
 
				+
			
 
				+if STARPU_USE_CUDA
			
 
				+pi_pi_SOURCES +=				\
			
 
				+	pi/pi_kernel.cu				\
			
 
				+	pi/SobolQRNG/sobol_gpu.cu
			
 
				+endif
			
 
				+
			
 
				+pi_pi_redux_SOURCES =				\
			
 
				+	pi/pi_redux.c
			
 
				+
			
 
				+if STARPU_USE_CUDA
			
 
				+pi_pi_redux_SOURCES +=				\
			
 
				+	pi/pi_redux_kernel.cu
			
 
				+pi_pi_redux_LDADD =				\
			
 
				+	$(STARPU_CURAND_LDFLAGS)
			
 
				+endif
			
 
				+
			
 
				+
			
--- a/examples/opt/pi/SobolQRNG/CforCUDA_SDK_license.txt
+++ b/examples/opt/pi/SobolQRNG/CforCUDA_SDK_license.txt
--- a/examples/opt/pi/SobolQRNG/sobol.h
+++ b/examples/opt/pi/SobolQRNG/sobol.h
@@ -0,0 +1,60 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010  Université de Bordeaux 1
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+/*
			
 
				+ * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
			
 
				+ *
			
 
				+ * NVIDIA Corporation and its licensors retain all intellectual property and 
			
 
				+ * proprietary rights in and to this software and related documentation and 
			
 
				+ * any modifications thereto.  Any use, reproduction, disclosure, or distribution 
			
 
				+ * of this software and related documentation without an express license 
			
 
				+ * agreement from NVIDIA Corporation is strictly prohibited.
			
 
				+ * 
			
 
				+ */
			
 
				+ 
			
 
				+ /*
			
 
				+ * Portions Copyright (c) 1993-2009 NVIDIA Corporation.  All rights reserved.
			
 
				+ * Portions Copyright (c) 2009 Mike Giles, Oxford University.  All rights reserved.
			
 
				+ * Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe.  All rights reserved.
			
 
				+ *
			
 
				+ * Sobol Quasi-random Number Generator example
			
 
				+ *
			
 
				+ * Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
			
 
				+ * http://people.maths.ox.ac.uk/~gilesm/
			
 
				+ *
			
 
				+ * and C code developed by Stephen Joe, University of Waikato, New Zealand
			
 
				+ * and Frances Kuo, University of New South Wales, Australia
			
 
				+ * http://web.maths.unsw.edu.au/~fkuo/sobol/
			
 
				+ *
			
 
				+ * For theoretical background see:
			
 
				+ *
			
 
				+ * P. Bratley and B.L. Fox.
			
 
				+ * Implementing Sobol's quasirandom sequence generator
			
 
				+ * http://portal.acm.org/citation.cfm?id=42288
			
 
				+ * ACM Trans. on Math. Software, 14(1):88-100, 1988
			
 
				+ *
			
 
				+ * S. Joe and F. Kuo.
			
 
				+ * Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
			
 
				+ * http://portal.acm.org/citation.cfm?id=641879
			
 
				+ * ACM Trans. on Math. Software, 29(1):49-57, 2003
			
 
				+ */
			
 
				+
			
 
				+#ifndef SOBOL_H
			
 
				+#define SOBOL_H
			
 
				+
			
 
				+/* Number of direction vectors is fixed to 32 */
			
 
				+#define n_directions 32
			
 
				+
			
 
				+#endif
			
--- a/examples/opt/pi/SobolQRNG/sobol_gold.c
+++ b/examples/opt/pi/SobolQRNG/sobol_gold.c
@@ -0,0 +1,141 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010-2011  Université de Bordeaux 1
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+/*
			
 
				+ * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
			
 
				+ *
			
 
				+ * NVIDIA Corporation and its licensors retain all intellectual property and 
			
 
				+ * proprietary rights in and to this software and related documentation and 
			
 
				+ * any modifications thereto.  Any use, reproduction, disclosure, or distribution 
			
 
				+ * of this software and related documentation without an express license 
			
 
				+ * agreement from NVIDIA Corporation is strictly prohibited.
			
 
				+ * 
			
 
				+ */
			
 
				+ 
			
 
				+ /*
			
 
				+ * Portions Copyright (c) 1993-2009 NVIDIA Corporation.  All rights reserved.
			
 
				+ * Portions Copyright (c) 2009 Mike Giles, Oxford University.  All rights reserved.
			
 
				+ * Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe.  All rights reserved.
			
 
				+ *
			
 
				+ * Sobol Quasi-random Number Generator example
			
 
				+ *
			
 
				+ * Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
			
 
				+ * http://people.maths.ox.ac.uk/~gilesm/
			
 
				+ *
			
 
				+ * and C code developed by Stephen Joe, University of Waikato, New Zealand
			
 
				+ * and Frances Kuo, University of New South Wales, Australia
			
 
				+ * http://web.maths.unsw.edu.au/~fkuo/sobol/
			
 
				+ *
			
 
				+ * For theoretical background see:
			
 
				+ *
			
 
				+ * P. Bratley and B.L. Fox.
			
 
				+ * Implementing Sobol's quasirandom sequence generator
			
 
				+ * http://portal.acm.org/citation.cfm?id=42288
			
 
				+ * ACM Trans. on Math. Software, 14(1):88-100, 1988
			
 
				+ *
			
 
				+ * S. Joe and F. Kuo.
			
 
				+ * Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
			
 
				+ * http://portal.acm.org/citation.cfm?id=641879
			
 
				+ * ACM Trans. on Math. Software, 29(1):49-57, 2003
			
 
				+ */
			
 
				+
			
 
				+#include <stdio.h>
			
 
				+#include <stdlib.h>
			
 
				+#include <math.h>
			
 
				+#include <string.h>
			
 
				+
			
 
				+#include "sobol.h"
			
 
				+#include "sobol_gold.h"
			
 
				+#include "sobol_primitives.h"
			
 
				+
			
 
				+#define k_2powneg32 2.3283064E-10F
			
 
				+
			
 
				+/* Create the direction numbers, based on the primitive polynomials.
			
 
				+ *
			
 
				+ * n_dimensions: number of Sobol dimensions to initialise.
			
 
				+ * directions:   output array; n_directions consecutive entries are written
			
 
				+ *               per dimension (n_dimensions * n_directions in total), each
			
 
				+ *               stored in Q0.32 format.
			
 
				+ *
			
 
				+ * Dimension 0 uses m[i] = 1 throughout; every other dimension takes its
			
 
				+ * degree, coefficient bits and initial m values from sobol_primitives[]. */
			
 
				+void initSobolDirectionVectors(int n_dimensions, unsigned int *directions)
			
 
				+{
			
 
				+    unsigned int *v = directions;
			
 
				+
			
 
				+    int dim;
			
 
				+    for (dim = 0 ; dim < n_dimensions ; dim++)
			
 
				+    {
			
 
				+        /* First dimension is a special case */
			
 
				+        if (dim == 0)
			
 
				+        {
			
 
				+            int i;
			
 
				+            for (i = 0 ; i < n_directions ; i++)
			
 
				+            {
			
 
				+                /* All m's are 1.  The constant must be unsigned: for i == 0
			
 
				+                   the shift sets bit 31, which overflows a 32-bit signed int. */
			
 
				+                v[i] = 1U << (31 - i);
			
 
				+            }
			
 
				+        }
			
 
				+        else
			
 
				+        {
			
 
				+            int d = sobol_primitives[dim].degree;
			
 
				+            /* The first direction numbers (up to the degree of the polynomial)
			
 
				+               are simply v[i] = m[i] / 2^i (stored in Q0.32 format) */
			
 
				+            int i;
			
 
				+            for (i = 0 ; i < d ; i++)
			
 
				+            {
			
 
				+                v[i] = sobol_primitives[dim].m[i] << (31 - i);
			
 
				+            }
			
 
				+            /* The remaining direction numbers are computed as described in
			
 
				+               the Bratley and Fox paper. */
			
 
				+            /* v[i] = a[1]v[i-1] ^ a[2]v[i-2] ^ ... ^ a[d-1]v[i-d+1] ^ v[i-d] ^ v[i-d]/2^d */
			
 
				+            for (i = d ; i < n_directions ; i++)
			
 
				+            {
			
 
				+                /* First do the v[i-d] ^ v[i-d]/2^d part */
			
 
				+                v[i] = v[i - d] ^ (v[i - d] >> d);
			
 
				+                /* Now do the a[1]v[i-1] ^ a[2]v[i-2] ^ ... part
			
 
				+                   Note that the coefficients a[] are zero or one and for compactness in
			
 
				+                   the input tables they are stored as bits of a single integer. To extract
			
 
				+                   the relevant bit we use right shift and mask with 1.
			
 
				+                   For example, for a 10 degree polynomial there are ten useful bits in a,
			
 
				+                   so to get a[2] we need to right shift 7 times (to get the 8th bit into
			
 
				+                   the LSB) and then mask with 1. */
			
 
				+                int j;
			
 
				+                for (j = 1 ; j < d ; j++)
			
 
				+                {
			
 
				+                    v[i] ^= (((sobol_primitives[dim].a >> (d - 1 - j)) & 1) * v[i - j]);
			
 
				+                }
			
 
				+            }
			
 
				+        }
			
 
				+        /* Move on to the next dimension's block of n_directions entries */
			
 
				+        v += n_directions;
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+/* Reference model for generating Sobol numbers on the host.
			
 
				+ *
			
 
				+ * For each of the n_dimensions dimensions, writes n_vectors floats into
			
 
				+ * output[], dimension d starting at output[n_vectors * d].  directions[]
			
 
				+ * supplies n_directions direction numbers per dimension. */
			
 
				+void sobolCPU(int n_vectors, int n_dimensions, unsigned int *directions, float *output)
			
 
				+{
			
 
				+    int dim;
			
 
				+
			
 
				+    for (dim = 0 ; dim < n_dimensions ; dim++)
			
 
				+    {
			
 
				+        /* Direction numbers and output row belonging to this dimension */
			
 
				+        unsigned int *dir = directions + n_directions * dim;
			
 
				+        float *row = output + n_vectors * dim;
			
 
				+        unsigned int state = 0;
			
 
				+        int k;
			
 
				+
			
 
				+        /* x[0] is zero (in all dimensions) */
			
 
				+        row[0] = 0.0;
			
 
				+
			
 
				+        for (k = 1 ; k < n_vectors ; k++)
			
 
				+        {
			
 
				+            /* x[k] = x[k-1] ^ v[c]
			
 
				+                where c is the zero-based index of the rightmost zero bit
			
 
				+                of (k - 1); ffs() counts from one, hence the "- 1".
			
 
				+               In the Bratley and Fox paper this is equation (**) */
			
 
				+            int c = ffs(~(k - 1)) - 1;
			
 
				+
			
 
				+            state ^= dir[c];
			
 
				+            /* Scale the 32-bit integer state by k_2powneg32 */
			
 
				+            row[k] = (float)state * k_2powneg32;
			
 
				+        }
			
 
				+    }
			
 
				+}
			
--- a/examples/opt/pi/SobolQRNG/sobol_gold.h
+++ b/examples/opt/pi/SobolQRNG/sobol_gold.h
@@ -0,0 +1,61 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010  Université de Bordeaux 1
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+/*
			
 
				+ * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
			
 
				+ *
			
 
				+ * NVIDIA Corporation and its licensors retain all intellectual property and 
			
 
				+ * proprietary rights in and to this software and related documentation and 
			
 
				+ * any modifications thereto.  Any use, reproduction, disclosure, or distribution 
			
 
				+ * of this software and related documentation without an express license 
			
 
				+ * agreement from NVIDIA Corporation is strictly prohibited.
			
 
				+ * 
			
 
				+ */
			
 
				+ 
			
 
				+ /*
			
 
				+ * Portions Copyright (c) 1993-2009 NVIDIA Corporation.  All rights reserved.
			
 
				+ * Portions Copyright (c) 2009 Mike Giles, Oxford University.  All rights reserved.
			
 
				+ * Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe.  All rights reserved.
			
 
				+ *
			
 
				+ * Sobol Quasi-random Number Generator example
			
 
				+ *
			
 
				+ * Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
			
 
				+ * http://people.maths.ox.ac.uk/~gilesm/
			
 
				+ *
			
 
				+ * and C code developed by Stephen Joe, University of Waikato, New Zealand
			
 
				+ * and Frances Kuo, University of New South Wales, Australia
			
 
				+ * http://web.maths.unsw.edu.au/~fkuo/sobol/
			
 
				+ *
			
 
				+ * For theoretical background see:
			
 
				+ *
			
 
				+ * P. Bratley and B.L. Fox.
			
 
				+ * Implementing Sobol's quasirandom sequence generator
			
 
				+ * http://portal.acm.org/citation.cfm?id=42288
			
 
				+ * ACM Trans. on Math. Software, 14(1):88-100, 1988
			
 
				+ *
			
 
				+ * S. Joe and F. Kuo.
			
 
				+ * Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
			
 
				+ * http://portal.acm.org/citation.cfm?id=641879
			
 
				+ * ACM Trans. on Math. Software, 29(1):49-57, 2003
			
 
				+ *
			
 
				+ */
			
 
				+
			
 
				+#ifndef SOBOL_GOLD_H
			
 
				+#define SOBOL_GOLD_H
			
 
				+
			
 
				+void initSobolDirectionVectors(int n_dimensions, unsigned int *directions);
			
 
				+void sobolCPU(int n_vectors, int n_dimensions, unsigned int *directions, float *output);
			
 
				+
			
 
				+#endif
			
--- a/examples/opt/pi/SobolQRNG/sobol_gpu.cu
+++ b/examples/opt/pi/SobolQRNG/sobol_gpu.cu
@@ -0,0 +1,170 @@
 
				+/*
			
 
				+ * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
			
 
				+ *
			
 
				+ * NVIDIA Corporation and its licensors retain all intellectual property and 
			
 
				+ * proprietary rights in and to this software and related documentation and 
			
 
				+ * any modifications thereto.  Any use, reproduction, disclosure, or distribution 
			
 
				+ * of this software and related documentation without an express license 
			
 
				+ * agreement from NVIDIA Corporation is strictly prohibited.
			
 
				+ * 
			
 
				+ */
			
 
				+ 
			
 
				+ /*
			
 
				+ * Portions Copyright (c) 1993-2009 NVIDIA Corporation.  All rights reserved.
			
 
				+ * Portions Copyright (c) 2009 Mike Giles, Oxford University.  All rights reserved.
			
 
				+ * Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe.  All rights reserved.
			
 
				+ *
			
 
				+ * Sobol Quasi-random Number Generator example
			
 
				+ *
			
 
				+ * Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
			
 
				+ * http://people.maths.ox.ac.uk/~gilesm/
			
 
				+ *
			
 
				+ * and C code developed by Stephen Joe, University of Waikato, New Zealand
			
 
				+ * and Frances Kuo, University of New South Wales, Australia
			
 
				+ * http://web.maths.unsw.edu.au/~fkuo/sobol/
			
 
				+ *
			
 
				+ * For theoretical background see:
			
 
				+ *
			
 
				+ * P. Bratley and B.L. Fox.
			
 
				+ * Implementing Sobol's quasirandom sequence generator
			
 
				+ * http://portal.acm.org/citation.cfm?id=42288
			
 
				+ * ACM Trans. on Math. Software, 14(1):88-100, 1988
			
 
				+ *
			
 
				+ * S. Joe and F. Kuo.
			
 
				+ * Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
			
 
				+ * http://portal.acm.org/citation.cfm?id=641879
			
 
				+ * ACM Trans. on Math. Software, 29(1):49-57, 2003
			
 
				+ *
			
 
				+ */
			
 
				+
			
 
				+#include "sobol.h"
			
 
				+#include "sobol_gpu.h"
			
 
				+#include <starpu.h>
			
 
				+#include <starpu_cuda.h>
			
 
				+
			
 
				+#define k_2powneg32 2.3283064E-10F
			
 
				+/*
			
 
				+ * Sobol generator kernel.  Each block works on the dimension selected by
			
 
				+ * blockIdx.y: d_directions is advanced by n_directions * blockIdx.y and
			
 
				+ * d_output by n_vectors * blockIdx.y.  The block first caches that
			
 
				+ * dimension's direction numbers in shared memory; each thread then writes
			
 
				+ * the points i0, i0 + stride, i0 + 2*stride, ... (while below n_vectors),
			
 
				+ * each scaled by k_2powneg32.
			
 
				+ *
			
 
				+ * NOTE(review): __ffs(stride) - 1 is used as log2(stride) and stride - 1 as
			
 
				+ * a bit mask, which only holds when stride (gridDim.x * blockDim.x) is a
			
 
				+ * power of two -- confirm that every launch configuration guarantees it.
			
 
				+ * The n_dimensions parameter is not read in the body.
			
 
				+ */
			
 
				+__global__ void sobolGPU_kernel(unsigned n_vectors, unsigned n_dimensions, unsigned *d_directions, float *d_output)
			
 
				+{
			
 
				+    __shared__ unsigned int v[n_directions];
			
 
				+
			
 
				+    // Offset into the correct dimension as specified by the
			
 
				+    // block y coordinate
			
 
				+    d_directions = d_directions + n_directions * blockIdx.y;
			
 
				+    d_output = d_output +  n_vectors * blockIdx.y;
			
 
				+
			
 
				+    // Copy the direction numbers for this dimension into shared
			
 
				+    // memory - there are only 32 direction numbers so only the
			
 
				+    // first 32 (n_directions) threads need participate.
			
 
				+    if (threadIdx.x < n_directions)
			
 
				+    {
			
 
				+	    v[threadIdx.x] = d_directions[threadIdx.x];
			
 
				+    }
			
 
				+    __syncthreads();
			
 
				+
			
 
				+    // Set initial index (i.e. which vector this thread is
			
 
				+    // computing first) and stride (i.e. step to the next vector
			
 
				+    // for this thread)
			
 
				+    int i0     = threadIdx.x + blockIdx.x * blockDim.x;
			
 
				+    int stride = gridDim.x * blockDim.x;
			
 
				+
			
 
				+    // Get the gray code of the index
			
 
				+    // c.f. Numerical Recipes in C, chapter 20
			
 
				+    // http://www.nrbook.com/a/bookcpdf/c20-2.pdf
			
 
				+    unsigned int g = i0 ^ (i0 >> 1);
			
 
				+
			
 
				+    // Initialisation for first point x[i0]
			
 
				+    // In the Bratley and Fox paper this is equation (*), where
			
 
				+    // we are computing the value for x[n] without knowing the
			
 
				+    // value of x[n-1].
			
 
				+    unsigned int X = 0;
			
 
				+    unsigned int mask;
			
 
				+    for (unsigned int k = 0 ; k < __ffs(stride) - 1 ; k++)
			
 
				+    {
			
 
				+        // We want X ^= g_k * v[k], where g_k is one or zero.
			
 
				+        // We do this by setting a mask with all bits equal to
			
 
				+        // g_k. In reality we keep shifting g so that g_k is the
			
 
				+        // LSB of g. This way we avoid multiplication.
			
 
				+        mask = - (g & 1);
			
 
				+        X ^= mask & v[k];
			
 
				+        g = g >> 1;
			
 
				+    }
			
 
				+    if (i0 < n_vectors)
			
 
				+    {
			
 
				+        d_output[i0] = (float)X * k_2powneg32;
			
 
				+    }
			
 
				+
			
 
				+    // Now do rest of points, using the stride
			
 
				+    // Here we want to generate x[i] from x[i-stride] where we
			
 
				+    // don't have any of the x in between, therefore we have to
			
 
				+    // revisit the equation (**), this is easiest with an example
			
 
				+    // so assume stride is 16.
			
 
				+    // From x[n] to x[n+16] there will be:
			
 
				+    //   8 changes in the first bit
			
 
				+    //   4 changes in the second bit
			
 
				+    //   2 changes in the third bit
			
 
				+    //   1 change in the fourth
			
 
				+    //   1 change in one of the remaining bits
			
 
				+    //
			
 
				+    // What this means is that in the equation:
			
 
				+    //   x[n+1] = x[n] ^ v[p]
			
 
				+    //   x[n+2] = x[n+1] ^ v[q] = x[n] ^ v[p] ^ v[q]
			
 
				+    //   ...
			
 
				+    // We will apply xor with v[1] eight times, v[2] four times,
			
 
				+    // v[3] twice, v[4] once and one other direction number once.
			
 
				+    // Since two xors cancel out, we can skip even applications
			
 
				+    // and just apply xor with v[4] (i.e. log2(16)) and with
			
 
				+    // the current applicable direction number.
			
 
				+    // Note that all these indices count from 1, so we need to
			
 
				+    // subtract 1 from them all to account for C arrays counting
			
 
				+    // from zero.
			
 
				+    unsigned int v_log2stridem1 = v[__ffs(stride) - 2];
			
 
				+    unsigned int v_stridemask = stride - 1;
			
 
				+    for (unsigned int i = i0 + stride ; i < n_vectors ; i += stride)
			
 
				+    {
			
 
				+        // x[i] = x[i-stride] ^ v[b] ^ v[c]
			
 
				+        //  where b is log2(stride) minus 1 for C array indexing
			
 
				+        //  where c is the index of the rightmost zero bit in i,
			
 
				+        //  not including the bottom log2(stride) bits, minus 1
			
 
				+        //  for C array indexing
			
 
				+        // In the Bratley and Fox paper this is equation (**)
			
 
				+        X ^= v_log2stridem1 ^ v[__ffs(~((i - stride) | v_stridemask)) - 1];
			
 
				+        d_output[i] = (float)X * k_2powneg32;
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Launch sobolGPU_kernel on the local StarPU CUDA stream.
			
 
				+ *
			
 
				+ *   n_vectors    number of points to generate per dimension
			
 
				+ *   n_dimensions number of dimensions (one grid row per dimension)
			
 
				+ *   d_directions device array of direction numbers, n_directions per
			
 
				+ *                dimension (as indexed by the kernel)
			
 
				+ *   d_output     device array; the kernel writes dimension d at
			
 
				+ *                d_output + d * n_vectors
			
 
				+ *
			
 
				+ * The launch is asynchronous; the caller synchronizes the stream.
			
 
				+ * Nothing is launched when n_vectors or n_dimensions is not positive.
			
 
				+ */
			
 
				+extern "C"
			
 
				+void sobolGPU(int n_vectors, int n_dimensions, unsigned int *d_directions, float *d_output)
			
 
				+{
			
 
				+    const int threadsperblock = 64;
			
 
				+
			
 
				+    // Nothing to generate. This also avoids dividing by zero below
			
 
				+    // and launching an empty grid.
			
 
				+    if (n_vectors <= 0 || n_dimensions <= 0)
			
 
				+    {
			
 
				+        return;
			
 
				+    }
			
 
				+
			
 
				+    // Set up the execution configuration
			
 
				+    dim3 dimGrid;
			
 
				+    dim3 dimBlock;
			
 
				+
			
 
				+    // This implementation of the generator outputs all the draws for
			
 
				+    // one dimension in a contiguous region of memory, followed by the
			
 
				+    // next dimension and so on.
			
 
				+    // Therefore all threads within a block will be processing different
			
 
				+    // vectors from the same dimension. As a result we want the total
			
 
				+    // number of blocks to be a multiple of the number of dimensions.
			
 
				+    dimGrid.y = n_dimensions;
			
 
				+
			
 
				+    // If the number of dimensions is large then we use a single block
			
 
				+    // per dimension, but if it is small (e.g. less than 32) then we
			
 
				+    // partition the vectors across blocks (as well as threads).
			
 
				+    // The width is capped when the number of vectors is too small to
			
 
				+    // be partitioned.
			
 
				+    unsigned int wanted_blocks_x = 1 + 31 / n_dimensions;
			
 
				+    if (wanted_blocks_x > (unsigned int)(n_vectors / threadsperblock))
			
 
				+    {
			
 
				+        wanted_blocks_x = (n_vectors + threadsperblock - 1) / threadsperblock;
			
 
				+    }
			
 
				+
			
 
				+    // The kernel derives log2(stride) from __ffs(stride) and uses
			
 
				+    // stride - 1 as a bit mask, so gridDim.x * blockDim.x must be a
			
 
				+    // power of two: round the grid width up to the next power of two.
			
 
				+    // Surplus threads simply find no vector below n_vectors.
			
 
				+    for (dimGrid.x = 1 ; dimGrid.x < wanted_blocks_x ; dimGrid.x *= 2)
			
 
				+    {
			
 
				+    }
			
 
				+
			
 
				+    // Fix the number of threads
			
 
				+    dimBlock.x = threadsperblock;
			
 
				+
			
 
				+    // Execute GPU kernel
			
 
				+    sobolGPU_kernel<<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>>(n_vectors, n_dimensions, d_directions, d_output);
			
 
				+}
			
--- a/examples/opt/pi/SobolQRNG/sobol_gpu.h
+++ b/examples/opt/pi/SobolQRNG/sobol_gpu.h
@@ -0,0 +1,61 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010  Université de Bordeaux 1
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+/*
			
 
				+ * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
			
 
				+ *
			
 
				+ * NVIDIA Corporation and its licensors retain all intellectual property and 
			
 
				+ * proprietary rights in and to this software and related documentation and 
			
 
				+ * any modifications thereto.  Any use, reproduction, disclosure, or distribution 
			
 
				+ * of this software and related documentation without an express license 
			
 
				+ * agreement from NVIDIA Corporation is strictly prohibited.
			
 
				+ * 
			
 
				+ */
			
 
				+ 
			
 
				+ /*
			
 
				+ * Portions Copyright (c) 1993-2009 NVIDIA Corporation.  All rights reserved.
			
 
				+ * Portions Copyright (c) 2009 Mike Giles, Oxford University.  All rights reserved.
			
 
				+ * Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe.  All rights reserved.
			
 
				+ *
			
 
				+ * Sobol Quasi-random Number Generator example
			
 
				+ *
			
 
				+ * Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
			
 
				+ * http://people.maths.ox.ac.uk/~gilesm/
			
 
				+ *
			
 
				+ * and C code developed by Stephen Joe, University of Waikato, New Zealand
			
 
				+ * and Frances Kuo, University of New South Wales, Australia
			
 
				+ * http://web.maths.unsw.edu.au/~fkuo/sobol/
			
 
				+ *
			
 
				+ * For theoretical background see:
			
 
				+ *
			
 
				+ * P. Bratley and B.L. Fox.
			
 
				+ * Implementing Sobol's quasirandom sequence generator
			
 
				+ * http://portal.acm.org/citation.cfm?id=42288
			
 
				+ * ACM Trans. on Math. Software, 14(1):88-100, 1988
			
 
				+ *
			
 
				+ * S. Joe and F. Kuo.
			
 
				+ * Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
			
 
				+ * http://portal.acm.org/citation.cfm?id=641879
			
 
				+ * ACM Trans. on Math. Software, 29(1):49-57, 2003
			
 
				+ *
			
 
				+ */
			
 
				+
			
 
				+#ifndef SOBOL_GPU_H
			
 
				+#define SOBOL_GPU_H
			
 
				+
			
 
				+extern "C"
			
 
				+void sobolGPU(int n_vectors, int n_dimensions, unsigned int *d_directions, float *d_output);
			
 
				+
			
 
				+#endif
			
--- a/examples/opt/pi/SobolQRNG/sobol_primitives.c
+++ b/examples/opt/pi/SobolQRNG/sobol_primitives.c
--- a/examples/opt/pi/SobolQRNG/sobol_primitives.h
+++ b/examples/opt/pi/SobolQRNG/sobol_primitives.h
@@ -0,0 +1,75 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010  Université de Bordeaux 1
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+/*
			
 
				+ * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
			
 
				+ *
			
 
				+ * NVIDIA Corporation and its licensors retain all intellectual property and 
			
 
				+ * proprietary rights in and to this software and related documentation and 
			
 
				+ * any modifications thereto.  Any use, reproduction, disclosure, or distribution 
			
 
				+ * of this software and related documentation without an express license 
			
 
				+ * agreement from NVIDIA Corporation is strictly prohibited.
			
 
				+ * 
			
 
				+ */
			
 
				+ 
			
 
				+ /*
			
 
				+ * Portions Copyright (c) 1993-2009 NVIDIA Corporation.  All rights reserved.
			
 
				+ * Portions Copyright (c) 2009 Mike Giles, Oxford University.  All rights reserved.
			
 
				+ * Portions Copyright (c) 2008 Frances Y. Kuo and Stephen Joe.  All rights reserved.
			
 
				+ *
			
 
				+ * Sobol Quasi-random Number Generator example
			
 
				+ *
			
 
				+ * Based on CUDA code submitted by Mike Giles, Oxford University, United Kingdom
			
 
				+ * http://people.maths.ox.ac.uk/~gilesm/
			
 
				+ *
			
 
				+ * and C code developed by Stephen Joe, University of Waikato, New Zealand
			
 
				+ * and Frances Kuo, University of New South Wales, Australia
			
 
				+ * http://web.maths.unsw.edu.au/~fkuo/sobol/
			
 
				+ *
			
 
				+ * For theoretical background see:
			
 
				+ *
			
 
				+ * P. Bratley and B.L. Fox.
			
 
				+ * Implementing Sobol's quasirandom sequence generator
			
 
				+ * http://portal.acm.org/citation.cfm?id=42288
			
 
				+ * ACM Trans. on Math. Software, 14(1):88-100, 1988
			
 
				+ *
			
 
				+ * S. Joe and F. Kuo.
			
 
				+ * Remark on algorithm 659: implementing Sobol's quasirandom sequence generator.
			
 
				+ * http://portal.acm.org/citation.cfm?id=641879
			
 
				+ * ACM Trans. on Math. Software, 29(1):49-57, 2003
			
 
				+ *
			
 
				+ */
			
 
				+
			
 
				+#ifndef SOBOL_PRIMITIVES_H
			
 
				+#define SOBOL_PRIMITIVES_H
			
 
				+
			
 
				+#define max_m 17
			
 
				+
			
 
				+/* One entry of the sobol_primitives[] table.  Each primitive is stored as
			
 
				+   a struct where
			
 
				+   dimension is the dimension number of the polynomial (unused)
			
 
				+   degree is the degree of the polynomial
			
 
				+   a is a binary word representing the coefficients
			
 
				+   m is the array of m values; it has room for max_m entries
			
 
				+   (NOTE(review): presumably only the first `degree` entries are
			
 
				+   meaningful -- confirm against the table definition) */
			
 
				+struct primitive
			
 
				+{
			
 
				+    unsigned int dimension;
			
 
				+    unsigned int degree;
			
 
				+    unsigned int a;
			
 
				+    unsigned int m[max_m];
			
 
				+};
			
 
				+
			
 
				+extern const struct primitive sobol_primitives[];
			
 
				+
			
 
				+#endif
			
--- a/examples/opt/pi/pi.c
+++ b/examples/opt/pi/pi.c
@@ -0,0 +1,175 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010-2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include "SobolQRNG/sobol.h"
			
 
				+#include "SobolQRNG/sobol_gold.h"
			
 
				+#include "pi.h"
			
 
				+#include <sys/time.h>
			
 
				+
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+void cuda_kernel(void **descr, void *cl_arg);
			
 
				+#endif
			
 
				+
			
 
				+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
			
 
				+
			
 
				+/* default value */
			
 
				+static unsigned ntasks = 1024;
			
 
				+
			
 
				+/*
			
 
				+ * CPU implementation of the codelet.
			
 
				+ *
			
 
				+ * descr[0] is the vector of Sobol direction numbers (read), descr[1] is
			
 
				+ * the one-element counter slot of this task (written).  The kernel
			
 
				+ * generates the quasi-random numbers, treats the first NSHOT_PER_TASK of
			
 
				+ * them as x coordinates and the next NSHOT_PER_TASK as y coordinates, and
			
 
				+ * stores the number of points with x*x + y*y <= 1 into the counter.
			
 
				+ * cl_arg is unused.
			
 
				+ */
			
 
				+static void cpu_kernel(void *descr[], void *cl_arg)
			
 
				+{
			
 
				+	(void)cl_arg;
			
 
				+
			
 
				+	unsigned *directions = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
			
 
				+	unsigned *cnt = (unsigned *)STARPU_VECTOR_GET_PTR(descr[1]);
			
 
				+	unsigned nx = NSHOT_PER_TASK;
			
 
				+
			
 
				+	/* The generator is asked for (2*nx/n_dimensions)*n_dimensions values,
			
 
				+	 * which is less than 2*nx whenever n_dimensions does not divide 2*nx.
			
 
				+	 * Zero-initialise the buffer so that the remaining entries are never
			
 
				+	 * read uninitialised (assuming sobolCPU writes exactly that many
			
 
				+	 * values -- its definition is not in this file). */
			
 
				+	TYPE *random_numbers = calloc(2*(size_t)nx, sizeof(TYPE));
			
 
				+	STARPU_ASSERT(random_numbers);
			
 
				+
			
 
				+	sobolCPU(2*nx/n_dimensions, n_dimensions, directions, random_numbers);
			
 
				+
			
 
				+	TYPE *random_numbers_x = &random_numbers[0];
			
 
				+	TYPE *random_numbers_y = &random_numbers[nx];
			
 
				+
			
 
				+	unsigned current_cnt = 0;
			
 
				+
			
 
				+	unsigned i;
			
 
				+	for (i = 0; i < nx; i++)
			
 
				+	{
			
 
				+		TYPE x = random_numbers_x[i];
			
 
				+		TYPE y = random_numbers_y[i];
			
 
				+
			
 
				+		TYPE dist = (x*x + y*y);
			
 
				+
			
 
				+		/* a shot is successful when it lands inside the unit disk */
			
 
				+		unsigned success = (dist <= 1.0);
			
 
				+		current_cnt += success;
			
 
				+	}
			
 
				+
			
 
				+	*cnt = current_cnt;
			
 
				+
			
 
				+	free(random_numbers);
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Parse the command line.  The only recognised option is
			
 
				+ * "-ntasks <n>", which overrides the default number of tasks; every
			
 
				+ * other argument is ignored.  A missing, malformed, non-positive or
			
 
				+ * out-of-range value is reported on stderr and the current value of
			
 
				+ * ntasks is kept.
			
 
				+ */
			
 
				+static void parse_args(int argc, char **argv)
			
 
				+{
			
 
				+	int i;
			
 
				+	for (i = 1; i < argc; i++) {
			
 
				+		if (strcmp(argv[i], "-ntasks") != 0)
			
 
				+			continue;
			
 
				+
			
 
				+		/* the option needs a value: never read past argv[argc-1] */
			
 
				+		if (i + 1 >= argc) {
			
 
				+			fprintf(stderr, "-ntasks requires a value, keeping %u\n", ntasks);
			
 
				+			break;
			
 
				+		}
			
 
				+
			
 
				+		char *end;
			
 
				+		long value = strtol(argv[++i], &end, 10);
			
 
				+
			
 
				+		/* reject empty/partial numbers and values that do not fit in
			
 
				+		 * the (strictly positive) unsigned task counter */
			
 
				+		if (end == argv[i] || *end != '\0' || value <= 0
			
 
				+		    || (unsigned long)value > ~0U) {
			
 
				+			fprintf(stderr, "invalid -ntasks value '%s', keeping %u\n", argv[i], ntasks);
			
 
				+			continue;
			
 
				+		}
			
 
				+
			
 
				+		ntasks = (unsigned)value;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Monte-Carlo estimation of Pi with StarPU.
			
 
				+ *
			
 
				+ * Registers the Sobol direction numbers and an array of ntasks counters,
			
 
				+ * partitions the counter array into one element per task, submits ntasks
			
 
				+ * tasks that each fill their own counter, then sums the counters and
			
 
				+ * prints the approximation, the elapsed time and the throughput.
			
 
				+ * Returns 0 on success and 1 if StarPU could not be initialised.
			
 
				+ */
			
 
				+int main(int argc, char **argv)
			
 
				+{
			
 
				+	unsigned i;
			
 
				+
			
 
				+	parse_args(argc, argv);
			
 
				+
			
 
				+	int init_ret = starpu_init(NULL);
			
 
				+	if (init_ret != 0)
			
 
				+	{
			
 
				+		fprintf(stderr, "starpu_init failed (error %d)\n", init_ret);
			
 
				+		return 1;
			
 
				+	}
			
 
				+
			
 
				+	/* Initialize the random number generator */
			
 
				+	unsigned *sobol_qrng_directions = malloc(n_dimensions*n_directions*sizeof(unsigned));
			
 
				+	STARPU_ASSERT(sobol_qrng_directions);
			
 
				+
			
 
				+	initSobolDirectionVectors(n_dimensions, sobol_qrng_directions);
			
 
				+
			
 
				+	/* Any worker may use that array now */
			
 
				+	starpu_data_handle sobol_qrng_direction_handle;
			
 
				+	starpu_vector_data_register(&sobol_qrng_direction_handle, 0,
			
 
				+		(uintptr_t)sobol_qrng_directions, n_dimensions*n_directions, sizeof(unsigned));
			
 
				+
			
 
				+	unsigned *cnt_array = malloc(ntasks*sizeof(unsigned));
			
 
				+	STARPU_ASSERT(cnt_array);
			
 
				+	starpu_data_handle cnt_array_handle;
			
 
				+	starpu_vector_data_register(&cnt_array_handle, 0, (uintptr_t)cnt_array, ntasks, sizeof(unsigned));
			
 
				+
			
 
				+	/* Use a write-through policy : when the data is modified on an
			
 
				+	 * accelerator, we know that it will only be modified once and be
			
 
				+	 * accessed by the CPU later on */
			
 
				+	starpu_data_set_wt_mask(cnt_array_handle, (1<<0));
			
 
				+
			
 
				+	/* one counter per task */
			
 
				+	struct starpu_data_filter f = {
			
 
				+		.filter_func = starpu_block_filter_func_vector,
			
 
				+		.nchildren = ntasks
			
 
				+	};
			
 
				+
			
 
				+	starpu_data_partition(cnt_array_handle, &f);
			
 
				+
			
 
				+	static struct starpu_perfmodel_t model = {
			
 
				+		.type = STARPU_HISTORY_BASED,
			
 
				+		.symbol = "monte_carlo_pi"
			
 
				+	};
			
 
				+
			
 
				+	struct starpu_codelet_t cl = {
			
 
				+		.where = STARPU_CPU|STARPU_CUDA,
			
 
				+		.cpu_func = cpu_kernel,
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+		.cuda_func = cuda_kernel,
			
 
				+#endif
			
 
				+		.nbuffers = 2,
			
 
				+		.model = &model
			
 
				+	};
			
 
				+
			
 
				+	struct timeval start;
			
 
				+	struct timeval end;
			
 
				+
			
 
				+	gettimeofday(&start, NULL);
			
 
				+
			
 
				+	for (i = 0; i < ntasks; i++)
			
 
				+	{
			
 
				+		struct starpu_task *task = starpu_task_create();
			
 
				+
			
 
				+		task->cl = &cl;
			
 
				+
			
 
				+		starpu_data_handle cnt_handle = starpu_data_get_sub_data(cnt_array_handle, 1, i);
			
 
				+		STARPU_ASSERT(cnt_handle);
			
 
				+
			
 
				+		task->buffers[0].handle = sobol_qrng_direction_handle;
			
 
				+		task->buffers[0].mode   = STARPU_R;
			
 
				+		task->buffers[1].handle = cnt_handle;
			
 
				+		task->buffers[1].mode   = STARPU_W;
			
 
				+
			
 
				+		int ret = starpu_task_submit(task);
			
 
				+		STARPU_ASSERT(!ret);
			
 
				+	}
			
 
				+
			
 
				+	starpu_task_wait_for_all();
			
 
				+
			
 
				+	/* Get the cnt_array back in main memory */
			
 
				+	starpu_data_unpartition(cnt_array_handle, 0);
			
 
				+	starpu_data_unregister(cnt_array_handle);
			
 
				+
			
 
				+	/* Count the total number of entries */
			
 
				+	unsigned long total_cnt = 0;
			
 
				+	for (i = 0; i < ntasks; i++)
			
 
				+		total_cnt += cnt_array[i];
			
 
				+
			
 
				+	gettimeofday(&end, NULL);
			
 
				+
			
 
				+	/* elapsed time in microseconds */
			
 
				+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
			
 
				+
			
 
				+	unsigned long total_shot_cnt = ntasks * NSHOT_PER_TASK;
			
 
				+
			
 
				+	/* Total surface : Pi * r^ 2 = Pi*1^2, total square surface : 2^2 = 4, probability to impact the disk: pi/4 */
			
 
				+	/* the ratio is computed in double: a float cannot represent the
			
 
				+	 * counters exactly once they exceed 2^24 */
			
 
				+	double pi_estimate = (4.0*(double)total_cnt)/(double)total_shot_cnt;
			
 
				+
			
 
				+	FPRINTF(stderr, "Pi approximation : %f (%lu / %lu)\n", pi_estimate, total_cnt, total_shot_cnt);
			
 
				+	FPRINTF(stderr, "Total time : %f ms\n", timing/1000.0);
			
 
				+	FPRINTF(stderr, "Speed : %f GShot/s\n", total_shot_cnt/(1e3*timing));
			
 
				+
			
 
				+	if (!getenv("STARPU_SSILENT")) starpu_display_codelet_stats(&cl);
			
 
				+
			
 
				+	/* release what was registered and allocated above */
			
 
				+	starpu_data_unregister(sobol_qrng_direction_handle);
			
 
				+	free(sobol_qrng_directions);
			
 
				+	free(cnt_array);
			
 
				+
			
 
				+	starpu_shutdown();
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
--- a/examples/opt/pi/pi.h
+++ b/examples/opt/pi/pi.h
@@ -0,0 +1,33 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#ifndef __PI_H__
			
 
				+#define __PI_H__
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+#include <starpu_cuda.h>
			
 
				+#include <stdio.h>
			
 
				+/* Number of (x, y) points tested by each task. */
			
 
				+#define NSHOT_PER_TASK	(16*1024*1024ULL)
			
 
				+/* Element type of the random-number buffers and of the distance test. */
			
 
				+#define TYPE	float
			
 
				+
			
 
				+/* extern "C" void cuda_kernel(void *descr[], void *cl_arg); */
			
 
				+/* Number of dimensions passed to the Sobol generator and used to size the
			
 
				+ * direction-number array.  Being a static definition in a header, every
			
 
				+ * source file that includes pi.h gets its own copy. */
			
 
				+static int n_dimensions = 100;
			
 
				+
			
 
				+#endif /* __PI_H__ */
			
--- a/examples/opt/pi/pi_kernel.cu
+++ b/examples/opt/pi/pi_kernel.cu
@@ -0,0 +1,150 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include "SobolQRNG/sobol_gpu.h"
			
 
				+#include "pi.h"
			
 
				+#include <starpu_cuda.h>
			
 
				+
			
 
				+#define MAXNBLOCKS	128
			
 
				+#define MAXTHREADSPERBLOCK	256
			
 
				+
			
 
				+/*
			
 
				+ * Count, block by block, how many of the n points
			
 
				+ * (random_numbers_x[i], random_numbers_y[i]) satisfy x*x + y*y <= 1.
			
 
				+ * Every thread accumulates its own tally in shared memory, the tallies
			
 
				+ * of a block are then summed by a tree reduction and the block total is
			
 
				+ * stored in output_cnt[blockIdx.x].
			
 
				+ */
			
 
				+static __global__ void monte_carlo(TYPE *random_numbers_x, TYPE *random_numbers_y,
			
 
				+						unsigned n, unsigned *output_cnt)
			
 
				+{
			
 
				+	__shared__ unsigned scnt[MAXTHREADSPERBLOCK];
			
 
				+
			
 
				+	/* first point handled by this thread, and distance to its next one */
			
 
				+	const int first = threadIdx.x + blockIdx.x*blockDim.x;
			
 
				+	const int step = gridDim.x * blockDim.x;
			
 
				+
			
 
				+	/* Start from an empty tally */
			
 
				+	if (threadIdx.x < MAXTHREADSPERBLOCK)
			
 
				+	{
			
 
				+		scnt[threadIdx.x] = 0;
			
 
				+	}
			
 
				+	__syncthreads();
			
 
				+
			
 
				+	for (int idx = first; idx < n; idx += step)
			
 
				+	{
			
 
				+		const TYPE px = random_numbers_x[idx];
			
 
				+		const TYPE py = random_numbers_y[idx];
			
 
				+		const TYPE sq_dist = (px*px + py*py);
			
 
				+
			
 
				+		/* one more hit when the point is inside the unit disk */
			
 
				+		scnt[threadIdx.x] += (sq_dist <= 1.0f)?1:0;
			
 
				+	}
			
 
				+	__syncthreads();
			
 
				+
			
 
				+	/* Tree reduction of the per-thread tallies of this block.
			
 
				+	 * NB: this assumes that the number of threads per block is a power of 2 ! */
			
 
				+	for (unsigned half = blockDim.x/2; half != 0; half >>= 1)
			
 
				+	{
			
 
				+		if (threadIdx.x < half)
			
 
				+		{
			
 
				+			scnt[threadIdx.x] += scnt[threadIdx.x + half];
			
 
				+		}
			
 
				+		__syncthreads();
			
 
				+	}
			
 
				+
			
 
				+	/* thread 0 publishes the total of the block */
			
 
				+	if (threadIdx.x == 0)
			
 
				+	{
			
 
				+		output_cnt[blockIdx.x] = scnt[0];
			
 
				+	}
			
 
				+	__syncthreads();
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Sum the first blockDim.x entries of output_cnt and store the total in
			
 
				+ * *cnt.  Meant to be launched as a single block with one thread per
			
 
				+ * entry; blockDim.x must not exceed MAXNBLOCKS and, for the reduction
			
 
				+ * below to cover every entry, must be a power of two.
			
 
				+ */
			
 
				+static __global__ void sum_per_block_cnt(unsigned *output_cnt, unsigned *cnt)
			
 
				+{
			
 
				+	__shared__ unsigned accumulator[MAXNBLOCKS];
			
 
				+
			
 
				+	/* Load the values from global mem: each thread fetches its own
			
 
				+	 * entry instead of every thread copying the whole array */
			
 
				+	accumulator[threadIdx.x] = output_cnt[threadIdx.x];
			
 
				+
			
 
				+	__syncthreads();
			
 
				+
			
 
				+	/* Perform a reduction in shared memory */
			
 
				+	unsigned s;
			
 
				+	for (s = blockDim.x/2; s!=0; s>>=1)
			
 
				+	{
			
 
				+		if (threadIdx.x < s)
			
 
				+			accumulator[threadIdx.x] += accumulator[threadIdx.x + s];
			
 
				+
			
 
				+		__syncthreads();
			
 
				+	}
			
 
				+
			
 
				+	/* Save the result in global memory */
			
 
				+	if (threadIdx.x == 0)
			
 
				+		*cnt = accumulator[0];
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * CUDA implementation of the codelet.
			
 
				+ *
			
 
				+ * descr[0] is the vector of Sobol direction numbers (read), descr[1] is
			
 
				+ * the one-element counter slot of this task (written).  The function
			
 
				+ * generates the quasi-random numbers on the device, counts the points
			
 
				+ * with x*x + y*y <= 1 per block (monte_carlo), sums the per-block counts
			
 
				+ * into the counter (sum_per_block_cnt) and waits for the local stream
			
 
				+ * before releasing its temporary device buffers.  CUDA errors are
			
 
				+ * reported through STARPU_CUDA_REPORT_ERROR.  cl_arg is unused.
			
 
				+ */
			
 
				+extern "C" void cuda_kernel(void *descr[], void *cl_arg)
			
 
				+{
			
 
				+	cudaError_t cures;
			
 
				+	cudaStream_t stream = starpu_cuda_get_local_stream();
			
 
				+
			
 
				+	(void)cl_arg;
			
 
				+
			
 
				+	unsigned *directions = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
			
 
				+	unsigned *cnt = (unsigned *)STARPU_VECTOR_GET_PTR(descr[1]);
			
 
				+	unsigned nx = NSHOT_PER_TASK;
			
 
				+
			
 
				+	/* Generate Random numbers */
			
 
				+	float *random_numbers = NULL;
			
 
				+	cures = cudaMalloc((void **)&random_numbers, 2*nx*sizeof(float));
			
 
				+	if (cures)
			
 
				+		STARPU_CUDA_REPORT_ERROR(cures);
			
 
				+	STARPU_ASSERT(random_numbers);
			
 
				+
			
 
				+	/* The generator writes n_vectors values for each of the n_dimensions
			
 
				+	 * dimensions, contiguously.  When n_dimensions does not divide 2*nx
			
 
				+	 * this leaves the end of the buffer untouched: clear that tail so
			
 
				+	 * that monte_carlo never reads uninitialised device memory. */
			
 
				+	unsigned n_vectors = 2*nx/n_dimensions;
			
 
				+	size_t n_generated = (size_t)n_vectors * n_dimensions;
			
 
				+	if (n_generated < 2*(size_t)nx)
			
 
				+	{
			
 
				+		cures = cudaMemsetAsync(random_numbers + n_generated, 0,
			
 
				+				(2*(size_t)nx - n_generated)*sizeof(float), stream);
			
 
				+		if (cures)
			
 
				+			STARPU_CUDA_REPORT_ERROR(cures);
			
 
				+	}
			
 
				+
			
 
				+	sobolGPU(n_vectors, n_dimensions, directions, random_numbers);
			
 
				+	cures = cudaStreamSynchronize(stream);
			
 
				+	if (cures)
			
 
				+		STARPU_CUDA_REPORT_ERROR(cures);
			
 
				+
			
 
				+	TYPE *random_numbers_x = &random_numbers[0];
			
 
				+	TYPE *random_numbers_y = &random_numbers[nx];
			
 
				+
			
 
				+	/* How many blocks do we use ? */
			
 
				+	unsigned nblocks = 128; // TODO
			
 
				+
			
 
				+	STARPU_ASSERT(nblocks <= MAXNBLOCKS);
			
 
				+
			
 
				+	unsigned *per_block_cnt = NULL;
			
 
				+	cures = cudaMalloc((void **)&per_block_cnt, nblocks*sizeof(unsigned));
			
 
				+	if (cures)
			
 
				+		STARPU_CUDA_REPORT_ERROR(cures);
			
 
				+
			
 
				+	STARPU_ASSERT((nx % nblocks) == 0);
			
 
				+
			
 
				+	/* How many threads per block ? At most 256, but no more threads than
			
 
				+	 * there are entries to process per block. */
			
 
				+	unsigned nthread_per_block = STARPU_MIN(MAXTHREADSPERBLOCK, (nx / nblocks));
			
 
				+
			
 
				+	/* each entry of per_block_cnt contains the number of successful shots
			
 
				+	 * in the corresponding block. */
			
 
				+	monte_carlo<<<nblocks, nthread_per_block, 0, stream>>>(random_numbers_x, random_numbers_y, nx, per_block_cnt);
			
 
				+
			
 
				+	/* Note that we do not synchronize between kernel calls because there is an implicit serialization */
			
 
				+
			
 
				+	/* compute the total number of successful shots by adding the elements
			
 
				+	 * of the per_block_cnt array */
			
 
				+	sum_per_block_cnt<<<1, nblocks, 0, stream>>>(per_block_cnt, cnt);
			
 
				+	cures = cudaStreamSynchronize(stream);
			
 
				+	if (cures)
			
 
				+		STARPU_CUDA_REPORT_ERROR(cures);
			
 
				+
			
 
				+	cudaFree(per_block_cnt);
			
 
				+	cudaFree(random_numbers);
			
 
				+}
			
--- a/examples/opt/pi/pi_redux.c
+++ b/examples/opt/pi/pi_redux.c
@@ -0,0 +1,362 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010-2011  Université de Bordeaux 1
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <math.h>
				+#include <stdlib.h>
				+#include <string.h>
				+#include <sys/time.h>
				+#include <starpu.h>
				+#include <starpu_config.h>
			
 
				+
			
 
				+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
			
 
				+#define PI	3.14159265358979323846
			
 
				+
			
 
				+#if defined(STARPU_USE_CUDA) && !defined(STARPU_HAVE_CURAND)
			
 
				+#warning CURAND is required to run that example on CUDA devices
			
 
				+#endif
			
 
				+
			
 
				+#ifdef STARPU_HAVE_CURAND
			
 
				+#include <cuda.h>
			
 
				+#include <curand.h>
			
 
				+#include <starpu_cuda.h>
			
 
				+#endif
			
 
				+
			
 
				+#define NSHOT_PER_TASK	(1024*1024)
			
 
				+
			
 
				+/* Default values of the run-time options; parse_args() overrides them from
				+ * the command line. */
				+static unsigned long ntasks = 1024; /* number of timed tasks (-ntasks n) */
				+static unsigned long ntasks_warmup = 0; /* extra tasks submitted before the clock starts (-warmup sets it to 8) */
				+
				+static unsigned use_redux = 1; /* 1: counter accessed in STARPU_REDUX mode, 0: STARPU_RW (-noredux) */
				+static unsigned do_warmup = 0; /* set by -warmup; not read anywhere else in this file */
			
 
				+
			
 
				+/*
			
 
				+ *	Initialization of the Random Number Generators (RNG)
			
 
				+ */
			
 
				+
			
 
				+#ifdef STARPU_HAVE_CURAND
			
 
				+/* RNG for the CURAND library */
			
 
				+static curandGenerator_t curandgens[STARPU_NMAXWORKERS];
			
 
				+#endif 
			
 
				+
			
 
				+/* Per-worker state for the erand48-style generator. Worker w uses the entries
				+ * starting at index PADDING*w in both arrays; note the huge padding between
				+ * two workers' entries to avoid false-sharing. */
				+#define PADDING	1024
				+static unsigned short xsubi[STARPU_NMAXWORKERS*PADDING]; /* init_rng() writes entries PADDING*w + 0..2 */
				+static struct drand48_data randbuffer[STARPU_NMAXWORKERS*PADDING]; /* only entry PADDING*w is used for worker w */
			
 
				+
			
 
				+/* Set up the random number generator of the calling worker, seeded with the
				+ * worker's id. CPU workers get their slot of xsubi/randbuffer initialized;
				+ * CUDA workers (only when CURAND is available) get a CURAND generator. Any
				+ * other worker type aborts. */
				+static void init_rng(void *arg __attribute__((unused)))
				+{
				+	const int id = starpu_worker_get_id();
				+	const int slot = PADDING*id;
				+
				+	switch (starpu_worker_get_type(id))
				+	{
				+	case STARPU_CPU_WORKER:
				+	{
				+		unsigned k;
				+
				+		/* create a seed */
				+		starpu_srand48_r((long int)id, &randbuffer[slot]);
				+
				+		/* the three 16-bit words handed to starpu_erand48_r */
				+		for (k = 0; k < 3; k++)
				+			xsubi[k + slot] = (unsigned short)id;
				+		break;
				+	}
				+#ifdef STARPU_HAVE_CURAND
				+	case STARPU_CUDA_WORKER:
				+	{
				+		curandStatus_t status;
				+
				+		/* Create a RNG */
				+		status = curandCreateGenerator(&curandgens[id],
				+					CURAND_RNG_PSEUDO_DEFAULT);
				+		STARPU_ASSERT(status == CURAND_STATUS_SUCCESS);
				+
				+		/* Seed it with worker's id */
				+		status = curandSetPseudoRandomGeneratorSeed(curandgens[id],
				+						(unsigned long long)id);
				+		STARPU_ASSERT(status == CURAND_STATUS_SUCCESS);
				+		break;
				+	}
				+#endif
				+	default:
				+		STARPU_ABORT();
				+		break;
				+	}
				+}
			
 
				+
			
 
				+/* Print the usage string for `progname` on stderr and exit with status -1. */
				+static void parse_args_usage(const char *progname)
				+{
				+	fprintf(stderr, "Usage: %s [-ntasks n] [-noredux] [-warmup] [-h]\n", progname);
				+	exit(-1);
				+}
				+
				+/* Parse the command line and update the option variables of this file.
				+ *
				+ *   -ntasks n   number of timed tasks (non-negative decimal integer)
				+ *   -noredux    clear use_redux
				+ *   -warmup     set do_warmup and request 8 warmup tasks
				+ *   -h          print the usage string and exit
				+ *
				+ * A missing or malformed value after -ntasks prints the usage string and
				+ * exits, like -h. Unknown arguments are silently ignored. */
				+static void parse_args(int argc, char **argv)
				+{
				+	int i;
				+	for (i = 1; i < argc; i++) {
				+		if (strcmp(argv[i], "-ntasks") == 0) {
				+			char *end;
				+			long value;
				+
				+			/* The value is the next argument: make sure it exists
				+			 * before consuming it, argv[argc] is NULL. */
				+			if (i + 1 >= argc)
				+				parse_args_usage(argv[0]);
				+
				+			value = strtol(argv[++i], &end, 10);
				+
				+			/* Reject empty input, trailing garbage and negative
				+			 * counts (ntasks is unsigned). */
				+			if (end == argv[i] || *end != '\0' || value < 0)
				+				parse_args_usage(argv[0]);
				+
				+			ntasks = (unsigned long)value;
				+		}
				+		else if (strcmp(argv[i], "-noredux") == 0) {
				+			use_redux = 0;
				+		}
				+		else if (strcmp(argv[i], "-warmup") == 0) {
				+			do_warmup = 1;
				+			ntasks_warmup = 8; /* arbitrary number of warmup tasks */
				+		}
				+		else if (strcmp(argv[i], "-h") == 0) {
				+			parse_args_usage(argv[0]);
				+		}
				+	}
				+}
			
 
				+
			
 
				+/*
			
 
				+ *	Monte-carlo kernel
			
 
				+ */
			
 
				+
			
 
				+/* CPU implementation of a Monte-Carlo task: draw NSHOT_PER_TASK points and add
				+ * the number of points falling strictly inside the unit circle to the counter
				+ * in descr[1]. descr[0] is not accessed by this implementation. */
				+static void pi_func_cpu(void *descr[], void *cl_arg __attribute__ ((unused)))
				+{
				+	const int id = starpu_worker_get_id();
				+
				+	/* Generator state of the calling worker (see init_rng) */
				+	unsigned short *seed = &xsubi[PADDING*id];
				+	struct drand48_data *state = &randbuffer[PADDING*id];
				+
				+	unsigned long *total = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[1]);
				+	unsigned long hits = 0;
				+	int shot;
				+
				+	for (shot = 0; shot < NSHOT_PER_TASK; shot++)
				+	{
				+		double u, v;
				+
				+		starpu_erand48_r(seed, state, &u);
				+		starpu_erand48_r(seed, state, &v);
				+
				+		/* Rescale both samples with 2*s - 1 (presumably from [0,1)
				+		 * onto [-1,1) -- depends on starpu_erand48_r) */
				+		const double x = 2.0*u - 1.0;
				+		const double y = 2.0*v - 1.0;
				+
				+		if (x*x + y*y < 1.0)
				+			hits++;
				+	}
				+
				+	/* Put the contribution of that task into the counter */
				+	*total += hits;
				+}
			
 
				+
			
 
				+extern void pi_redux_cuda_kernel(float *x, float *y, unsigned n, unsigned long *shot_cnt);
			
 
				+
			
 
				+#ifdef STARPU_HAVE_CURAND
			
 
				+/* CUDA implementation of a Monte-Carlo task: fill the scratchpad in descr[0]
				+ * with 2*NSHOT_PER_TASK uniform floats using the worker's CURAND generator,
				+ * then let pi_redux_cuda_kernel count the shots into descr[1]. */
				+static void pi_func_cuda(void *descr[], void *cl_arg __attribute__ ((unused)))
				+{
				+	curandStatus_t status;
				+	const int id = starpu_worker_get_id();
				+
				+	/* Both coordinate arrays are stored in the same vector: the first
				+	 * NSHOT_PER_TASK entries are the x values, the next ones the y values. */
				+	float *xy = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
				+	unsigned long *hits = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[1]);
				+
				+	/* CURAND is a bit silly: it assumes that any error is fatal. Calling
				+	 * cudaGetLastError resets the last error value; the value itself is
				+	 * deliberately dropped. */
				+	(void) cudaGetLastError();
				+
				+	status = curandGenerateUniform(curandgens[id], xy, 2*NSHOT_PER_TASK);
				+	STARPU_ASSERT(status == CURAND_STATUS_SUCCESS);
				+
				+	pi_redux_cuda_kernel(&xy[0], &xy[NSHOT_PER_TASK], NSHOT_PER_TASK, hits);
				+}
			
 
				+#endif
			
 
				+
			
 
				+/* Codelet of the Monte-Carlo tasks. It takes two buffers; main() passes the
				+ * scratchpad as buffer 0 and the shot counter as buffer 1. The CUDA
				+ * implementation is only offered when CURAND is available. */
				+static struct starpu_codelet_t pi_cl = {
				+	.nbuffers = 2,
				+	.model = NULL,
				+	.cpu_func = pi_func_cpu,
				+#ifdef STARPU_HAVE_CURAND
				+	.cuda_func = pi_func_cuda,
				+	.where = STARPU_CUDA|STARPU_CPU
				+#else
				+	.where = STARPU_CPU
				+#endif
				+};
			
 
				+
			
 
				+/*
			
 
				+ *	Codelets to implement reduction
			
 
				+ */
			
 
				+
			
 
				+/* CPU "init" method of the reduction: reset the unsigned long counter in
				+ * descr[0] to zero, the neutral element of the addition. */
				+static void init_cpu_func(void *descr[], void *cl_arg)
				+{
				+	unsigned long *counter = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[0]);
				+
				+	(void) cl_arg;
				+	counter[0] = 0UL;
				+}
			
 
				+
			
 
				+#ifdef STARPU_HAVE_CURAND
			
 
				+/* CUDA "init" method of the reduction: zero the unsigned long counter that
				+ * descr[0] points to (passed to cudaMemset as a device pointer) and wait for
				+ * completion. Any CUDA error is reported through STARPU_CUDA_REPORT_ERROR
				+ * instead of being silently dropped. */
				+static void init_cuda_func(void *descr[], void *cl_arg)
				+{
				+	unsigned long *val = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[0]);
				+	cudaError_t cures;
				+
				+	(void) cl_arg;
				+
				+	cures = cudaMemset(val, 0, sizeof(unsigned long));
				+	if (cures)
				+		STARPU_CUDA_REPORT_ERROR(cures);
				+
				+	cures = cudaThreadSynchronize();
				+	if (cures)
				+		STARPU_CUDA_REPORT_ERROR(cures);
				+}
			
 
				+#endif
			
 
				+
			
 
				+/* Codelet registered as the "init" reduction method of the shot counter: one
				+ * buffer, the counter to reset. CUDA is only offered when CURAND is available. */
				+static struct starpu_codelet_t init_codelet = {
				+	.nbuffers = 1,
				+	.cpu_func = init_cpu_func,
				+#ifdef STARPU_HAVE_CURAND
				+	.cuda_func = init_cuda_func,
				+	.where = STARPU_CUDA|STARPU_CPU
				+#else
				+	.where = STARPU_CPU
				+#endif
				+};
			
 
				+
			
 
				+#ifdef STARPU_HAVE_CURAND
			
 
				+/* Dummy implementation of the addition of two unsigned longs in CUDA: both
				+ * operands are copied to the host, added there, and the sum is copied back
				+ * into the first operand (descr[0] += descr[1]). Every copy is checked and a
				+ * failure is reported through STARPU_CUDA_REPORT_ERROR. */
				+static void redux_cuda_func(void *descr[], void *cl_arg)
				+{
				+	unsigned long *d_a = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[0]);
				+	unsigned long *d_b = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[1]);
				+
				+	unsigned long h_a, h_b;
				+	cudaError_t cures;
				+
				+	(void) cl_arg;
				+
				+	cures = cudaMemcpy(&h_a, d_a, sizeof(h_a), cudaMemcpyDeviceToHost);
				+	if (cures)
				+		STARPU_CUDA_REPORT_ERROR(cures);
				+
				+	cures = cudaMemcpy(&h_b, d_b, sizeof(h_b), cudaMemcpyDeviceToHost);
				+	if (cures)
				+		STARPU_CUDA_REPORT_ERROR(cures);
				+
				+	h_a += h_b;
				+
				+	cures = cudaMemcpy(d_a, &h_a, sizeof(h_a), cudaMemcpyHostToDevice);
				+	if (cures)
				+		STARPU_CUDA_REPORT_ERROR(cures);
				+}
			
 
				+#endif
			
 
				+
			
 
				+/* CPU "redux" method of the reduction: add the unsigned long in descr[1] to
				+ * the unsigned long in descr[0]; descr[1] is only read. */
				+static void redux_cpu_func(void *descr[], void *cl_arg)
				+{
				+	unsigned long *a = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[0]);
				+	unsigned long *b = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[1]);
				+
				+	(void) cl_arg;
				+
				+	*a += *b;
				+}
			
 
				+
			
 
				+/* Codelet registered as the "redux" reduction method of the shot counter: two
				+ * buffers, buffer 0 accumulates buffer 1. CUDA is only offered when CURAND is
				+ * available. */
				+static struct starpu_codelet_t redux_codelet = {
				+	.nbuffers = 2,
				+	.cpu_func = redux_cpu_func,
				+#ifdef STARPU_HAVE_CURAND
				+	.cuda_func = redux_cuda_func,
				+	.where = STARPU_CUDA|STARPU_CPU
				+#else
				+	.where = STARPU_CPU
				+#endif
				+};
			
 
				+
			
 
				+/*
			
 
				+ *	Main program
			
 
				+ */
			
 
				+
			
 
				+int main(int argc, char **argv)
			
 
				+{
			
 
				+	unsigned i;
			
 
				+
			
 
				+	parse_args(argc, argv);
			
 
				+
			
 
				+	starpu_init(NULL);
			
 
				+
			
 
				+	/* Launch a Random Number Generator (RNG) on each worker */
			
 
				+	starpu_execute_on_each_worker(init_rng, NULL, STARPU_CPU|STARPU_CUDA);
			
 
				+
			
 
				+	/* Create a scratchpad data */
			
 
				+	starpu_data_handle xy_scratchpad_handle;
			
 
				+	starpu_vector_data_register(&xy_scratchpad_handle, -1, (uintptr_t)NULL,
			
 
				+		2*NSHOT_PER_TASK, sizeof(float));
			
 
				+
			
 
				+	/* Create a variable that will be used to count the number of shots
			
 
				+	 * that actually hit the unit circle when shooting randomly in
			
 
				+	 * [-1,1]^2. */
			
 
				+	unsigned long shot_cnt = 0;
			
 
				+	starpu_data_handle shot_cnt_handle;
			
 
				+	starpu_variable_data_register(&shot_cnt_handle, 0,
			
 
				+			(uintptr_t)&shot_cnt, sizeof(shot_cnt));
			
 
				+
			
 
				+	starpu_data_set_reduction_methods(shot_cnt_handle,
			
 
				+					&redux_codelet, &init_codelet);
			
 
				+
			
 
				+	struct timeval start;
			
 
				+	struct timeval end;
			
 
				+
			
 
				+	for (i = 0; i < ntasks_warmup; i++)
			
 
				+	{
			
 
				+		struct starpu_task *task = starpu_task_create();
			
 
				+
			
 
				+		task->cl = &pi_cl;
			
 
				+
			
 
				+		task->buffers[0].handle = xy_scratchpad_handle;
			
 
				+		task->buffers[0].mode   = STARPU_SCRATCH;
			
 
				+		task->buffers[1].handle = shot_cnt_handle;
			
 
				+		task->buffers[1].mode   = use_redux?STARPU_REDUX:STARPU_RW;
			
 
				+
			
 
				+		int ret = starpu_task_submit(task);
			
 
				+		STARPU_ASSERT(!ret);
			
 
				+	}
			
 
				+
			
 
				+
			
 
				+	gettimeofday(&start, NULL);
			
 
				+
			
 
				+	for (i = 0; i < ntasks; i++)
			
 
				+	{
			
 
				+		struct starpu_task *task = starpu_task_create();
			
 
				+
			
 
				+		task->cl = &pi_cl;
			
 
				+
			
 
				+		task->buffers[0].handle = xy_scratchpad_handle;
			
 
				+		task->buffers[0].mode   = STARPU_SCRATCH;
			
 
				+		task->buffers[1].handle = shot_cnt_handle;
			
 
				+		task->buffers[1].mode   = use_redux?STARPU_REDUX:STARPU_RW;
			
 
				+
			
 
				+		int ret = starpu_task_submit(task);
			
 
				+		STARPU_ASSERT(!ret);
			
 
				+	}
			
 
				+
			
 
				+	starpu_data_unregister(shot_cnt_handle);
			
 
				+
			
 
				+	gettimeofday(&end, NULL);
			
 
				+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
			
 
				+	/* Total surface : Pi * r^ 2 = Pi*1^2, total square surface : 2^2 = 4,
			
 
				+	 * probability to impact the disk: pi/4 */
			
 
				+	unsigned long total = (ntasks + ntasks_warmup)*NSHOT_PER_TASK;
			
 
				+	double pi_approx = ((double)shot_cnt*4.0)/total;
			
 
				+
			
 
				+	FPRINTF(stderr, "Reductions? %s\n", use_redux?"yes":"no");
			
 
				+	FPRINTF(stderr, "Pi approximation : %f (%ld / %ld)\n", pi_approx, shot_cnt, total);
			
 
				+	FPRINTF(stderr, "Error %e \n", pi_approx - PI);
			
 
				+	FPRINTF(stderr, "Total time : %f ms\n", timing/1000.0);
			
 
				+	FPRINTF(stderr, "Speed : %f GShot/s\n", total/(1e3*timing));
			
 
				+
			
 
				+	starpu_shutdown();
			
 
				+
			
 
				+	if (abs(pi_approx - PI) > 1.0)
			
 
				+		return 1;
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
--- a/examples/opt/pi/pi_redux_kernel.cu
+++ b/examples/opt/pi/pi_redux_kernel.cu
@@ -0,0 +1,128 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010  Université de Bordeaux 1
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+#include <starpu_cuda.h>
			
 
				+
			
 
				+#define MAXNBLOCKS	128
			
 
				+#define MAXTHREADSPERBLOCK	256
			
 
				+
			
 
				+/* Count the shots (x[i], y[i]), 0 <= i < n, that fall inside the unit circle
				+ * once both coordinates are rescaled with 2*v - 1. Each block writes its own
				+ * count into output_cnt[blockIdx.x].
				+ *
				+ * NB: the in-block reduction assumes that the number of threads per block is
				+ * a power of 2, and the shared buffer holds MAXTHREADSPERBLOCK entries. */
				+static __global__ void monte_carlo(float *x, float *y, unsigned n, unsigned long *output_cnt)
				+{
				+	/* one partial count per thread of the block */
				+	__shared__ unsigned scnt[MAXTHREADSPERBLOCK];
				+
				+	const unsigned lane = threadIdx.x;
				+
				+	/* global index of this thread, and total number of threads */
				+	const int first = threadIdx.x + blockIdx.x*blockDim.x;
				+	const int stride = gridDim.x * blockDim.x;
				+
				+	int shot;
				+	unsigned long half;
				+
				+	/* Blank the shared mem buffer */
				+	if (lane < MAXTHREADSPERBLOCK)
				+		scnt[lane] = 0;
				+
				+	__syncthreads();
				+
				+	/* This thread handles shots first, first + stride, first + 2*stride, ... */
				+	shot = first;
				+	while (shot < n)
				+	{
				+		const float px = (2.0f * x[shot] - 1.0f);
				+		const float py = (2.0f * y[shot] - 1.0f);
				+		const float dist = (px*px + py*py);
				+
				+		/* Do we have a successful shot ? */
				+		if (dist <= 1.0f)
				+			scnt[lane] += 1;
				+
				+		shot += stride;
				+	}
				+
				+	__syncthreads();
				+
				+	/* Sum the per-thread counts of the block: at each step the lower half
				+	 * of the active entries absorbs the upper half. */
				+	half = blockDim.x/2;
				+	while (half != 0)
				+	{
				+		if (lane < half)
				+			scnt[lane] += scnt[lane + half];
				+
				+		__syncthreads();
				+
				+		half >>= 1;
				+	}
				+
				+	/* report the number of successful shots in the block */
				+	if (lane == 0)
				+		output_cnt[blockIdx.x] = scnt[0];
				+
				+	__syncthreads();
				+}
			
 
				+
			
 
				+/* Add the blockDim.x entries of output_cnt to *cnt. Meant to be launched as a
				+ * single block with one thread per entry (at most MAXNBLOCKS of them).
				+ *
				+ * NB: the reduction assumes that the number of threads is a power of 2. */
				+static __global__ void sum_per_block_cnt(unsigned long *output_cnt, unsigned long *cnt)
				+{
				+	__shared__ unsigned long accumulator[MAXNBLOCKS];
				+
				+	/* Load the values from global mem: each thread fetches its own entry
				+	 * instead of every thread copying the whole array. */
				+	accumulator[threadIdx.x] = output_cnt[threadIdx.x];
				+
				+	__syncthreads();
				+
				+	/* Perform a reduction in shared memory */
				+	unsigned s;
				+	for (s = blockDim.x/2; s!=0; s>>=1)
				+	{
				+		if (threadIdx.x < s)
				+			accumulator[threadIdx.x] += accumulator[threadIdx.x + s];
				+
				+		__syncthreads();
				+	}
				+
				+	/* Save the result in global memory: the total is added to the value
				+	 * already stored in *cnt. */
				+	if (threadIdx.x == 0)
				+		*cnt = *cnt + accumulator[0];
				+}
			
 
				+
			
 
				+/* Host-side entry point: count how many of the n shots (x[i], y[i]) hit the
				+ * unit circle and add that number to *shot_cnt. x, y and shot_cnt are handed
				+ * to the kernels as-is, so they must be device pointers. n must be a
				+ * multiple of the number of blocks.
				+ *
				+ * The kernels run on the local StarPU CUDA stream and the function only
				+ * returns once they are done. CUDA errors are reported through
				+ * STARPU_CUDA_REPORT_ERROR. */
				+extern "C" void pi_redux_cuda_kernel(float *x, float *y, unsigned n, unsigned long *shot_cnt)
				+{
				+	cudaError_t cures;
				+
				+	/* How many blocks do we use ? */ 
				+	unsigned nblocks = 128; // TODO
				+	STARPU_ASSERT(nblocks <= MAXNBLOCKS);
				+	STARPU_ASSERT((n % nblocks) == 0);
				+	
				+	/* One counter per block, filled by monte_carlo */
				+	unsigned long *per_block_cnt;
				+	cures = cudaMalloc((void **)&per_block_cnt, nblocks*sizeof(unsigned long));
				+	if (cures)
				+		STARPU_CUDA_REPORT_ERROR(cures);
				+
				+	/* How many threads per block ? At most 256, but no more threads than
				+	 * there are entries to process per block. */
				+	unsigned nthread_per_block = STARPU_MIN(MAXTHREADSPERBLOCK, (n / nblocks));
				+
				+	/* each entry of per_block_cnt contains the number of successful shots
				+	 * in the corresponding block. */
				+	monte_carlo<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(x, y, n, per_block_cnt);
				+
				+	/* An invalid launch configuration is only visible through
				+	 * cudaGetLastError. */
				+	cures = cudaGetLastError();
				+	if (cures)
				+		STARPU_CUDA_REPORT_ERROR(cures);
				+
				+	/* Note that we do not synchronize between kernel calls because there is an implicit serialization */
				+
				+	/* compute the total number of successful shots by adding the elements
				+	 * of the per_block_cnt array */
				+	sum_per_block_cnt<<<1, nblocks, 0, starpu_cuda_get_local_stream()>>>(per_block_cnt, shot_cnt);
				+
				+	cures = cudaGetLastError();
				+	if (cures)
				+		STARPU_CUDA_REPORT_ERROR(cures);
				+
				+	cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
				+	if (cures)
				+		STARPU_CUDA_REPORT_ERROR(cures);
				+
				+	cudaFree(per_block_cnt);
				+}
			
--- a/examples/ppm_downscaler/ppm_downscaler.c
+++ b/examples/ppm_downscaler/ppm_downscaler.c
@@ -76,7 +76,7 @@ struct ppm_image *file_to_ppm(char *filename)
 
				 	unsigned i;
			
 
				 	for (i = 0; i < ppm->ncols*ppm->nlines; i++)
			
 
				 	{
			
 
				-//		fprintf(stderr, "READ (index %d) -> r %d g %d b %d\n", i, ppm->data[i].r, ppm->data[i].g, ppm->data[i].b);
			
 
				+/*		fprintf(stderr, "READ (index %d) -> r %d g %d b %d\n", i, ppm->data[i].r, ppm->data[i].g, ppm->data[i].b); */
			
 
				 	}
			
 
				 
			
 
				 	fclose(file);
			
@@ -136,7 +136,7 @@ void dummy_downscale(struct ppm_image *input_ppm, struct ppm_image *output_ppm)
 
				 				{
			
 
				 					unsigned index = (big_col + i)+(big_line + j)*input_ppm->ncols;
			
 
				 
			
 
				-//					fprintf(stderr, "(col %d, line %d) i %d j %d index %d -> r %d g %d b %d\n", col, line, i, j, index, in[index].r, in[index].g, in[index].b);
			
 
				+/*					fprintf(stderr, "(col %d, line %d) i %d j %d index %d -> r %d g %d b %d\n", col, line, i, j, index, in[index].r, in[index].g, in[index].b); */
			
 
				 
			
 
				 					sum_r += (unsigned)in[index].r;
			
 
				 					sum_g += (unsigned)in[index].g;
			
@@ -148,7 +148,7 @@ void dummy_downscale(struct ppm_image *input_ppm, struct ppm_image *output_ppm)
 
				 			out[col + line*output_ppm->ncols].g = (unsigned char)(sum_g/(FACTOR*FACTOR));
			
 
				 			out[col + line*output_ppm->ncols].b = (unsigned char)(sum_b/(FACTOR*FACTOR));
			
 
				 
			
 
				-//			fprintf(stderr, "col %d line %d -> sum_r = %d out -> %d\n", col, line, sum_r, out[col + line*FACTOR].r);
			
 
				+/*			fprintf(stderr, "col %d line %d -> sum_r = %d out -> %d\n", col, line, sum_r, out[col + line*FACTOR].r); */
			
 
				 	
			
 
				 		}
			
 
				 	}
			
--- a/examples/ppm_downscaler/yuv_downscaler.c
+++ b/examples/ppm_downscaler/yuv_downscaler.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010-2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				  * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				  *
			
@@ -92,16 +92,12 @@ static struct starpu_codelet_t ds_codelet = {
 
				 /* each block contains BLOCK_HEIGHT consecutive lines */
			
 
				 static struct starpu_data_filter filter_y = {
			
 
				 	.filter_func = starpu_block_filter_func,
			
 
				-	.nchildren= HEIGHT/BLOCK_HEIGHT,
			
 
				-	.get_nchildren = NULL,
			
 
				-	.get_child_ops = NULL
			
 
				+	.nchildren= HEIGHT/BLOCK_HEIGHT
			
 
				 };
			
 
				 	
			
 
				 static struct starpu_data_filter filter_uv = {
			
 
				 	.filter_func = starpu_block_filter_func,
			
 
				-	.nchildren = (HEIGHT/2)/BLOCK_HEIGHT,
			
 
				-	.get_nchildren = NULL,
			
 
				-	.get_child_ops = NULL
			
 
				+	.nchildren = (HEIGHT/2)/BLOCK_HEIGHT
			
 
				 };
			
 
				 
			
 
				 int main(int argc, char **argv)
			
@@ -111,7 +107,7 @@ int main(int argc, char **argv)
 
				 	
			
 
				 	parse_args(argc, argv);
			
 
				 
			
 
				-//	fprintf(stderr, "Reading input file ...\n");
			
 
				+/*	fprintf(stderr, "Reading input file ...\n"); */
			
 
				 
			
 
				 	/* how many frames ? */
			
 
				 	struct stat stbuf;
			
@@ -120,7 +116,7 @@ int main(int argc, char **argv)
 
				 
			
 
				 	unsigned nframes = filesize/FRAMESIZE; 
			
 
				 
			
 
				-//	fprintf(stderr, "filesize %lx (FRAME SIZE %lx NEW SIZE %lx); nframes %d\n", filesize, FRAMESIZE, NEW_FRAMESIZE, nframes);
			
 
				+/*	fprintf(stderr, "filesize %lx (FRAME SIZE %lx NEW SIZE %lx); nframes %d\n", filesize, FRAMESIZE, NEW_FRAMESIZE, nframes); */
			
 
				 	assert((filesize % sizeof(struct yuv_frame)) == 0);
			
 
				 
			
 
				 	/* fetch input data */
			
@@ -134,7 +130,7 @@ int main(int argc, char **argv)
 
				 	FILE *f_out = fopen(filename_out, "w+");
			
 
				 	assert(f_out);
			
 
				 
			
 
				-//	fprintf(stderr, "Alloc output file ...\n");
			
 
				+/*	fprintf(stderr, "Alloc output file ...\n"); */
			
 
				 	struct yuv_new_frame *yuv_out_buffer = calloc(nframes, NEW_FRAMESIZE);
			
 
				 	assert(yuv_out_buffer);
			
 
				 
			
@@ -199,7 +195,7 @@ int main(int argc, char **argv)
 
				 
			
 
				 	unsigned ntasks = (nblocks_y + 2*nblocks_uv)*nframes;
			
 
				 
			
 
				-	fprintf(stderr, "Start computation: there will be %d tasks for %d frames\n", ntasks, nframes);
			
 
				+	fprintf(stderr, "Start computation: there will be %u tasks for %u frames\n", ntasks, nframes);
			
 
				 	gettimeofday(&start, NULL);
			
 
				 
			
 
				 	/* do the computation */
			
--- a/examples/profiling/profiling.c
+++ b/examples/profiling/profiling.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010, 2011  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -20,6 +20,8 @@
 
				 #include <assert.h>
			
 
				 #include <unistd.h>
			
 
				 
			
 
				+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
			
 
				+
			
 
				 static unsigned niter = 500;
			
 
				 
			
 
				 void sleep_codelet(__attribute__ ((unused)) void *descr[],
			
@@ -70,7 +72,7 @@ int main(int argc, char **argv)
 
				 		int ret = starpu_task_submit(task);
			
 
				 		if (STARPU_UNLIKELY(ret == -ENODEV))
			
 
				 		{
			
 
				-			fprintf(stderr, "No worker may execute this task\n");
			
 
				+			FPRINTF(stderr, "No worker may execute this task\n");
			
 
				 			exit(0);
			
 
				 		}
			
 
				 	}
			
@@ -97,8 +99,8 @@ int main(int argc, char **argv)
 
				 
			
 
				 	free(tasks);
			
 
				 
			
 
				-	fprintf(stderr, "Avg. delay : %2.2lf us\n", (delay_sum)/niter);
			
 
				-	fprintf(stderr, "Avg. length : %2.2lf us\n", (length_sum)/niter);
			
 
				+	FPRINTF(stderr, "Avg. delay : %2.2lf us\n", (delay_sum)/niter);
			
 
				+	FPRINTF(stderr, "Avg. length : %2.2lf us\n", (length_sum)/niter);
			
 
				 
			
 
				 	/* Display the occupancy of all workers during the test */
			
 
				 	int worker;
			
@@ -117,10 +119,10 @@ int main(int argc, char **argv)
 
				 
			
 
				 		char workername[128];
			
 
				 		starpu_worker_get_name(worker, workername, 128);
			
 
				-		fprintf(stderr, "Worker %s:\n", workername);
			
 
				-		fprintf(stderr, "\ttotal time : %.2lf ms\n", total_time*1e-3);
			
 
				-		fprintf(stderr, "\texec time  : %.2lf ms (%.2f %%)\n", executing_time*1e-3, executing_ratio);
			
 
				-		fprintf(stderr, "\tblocked time  : %.2lf ms (%.2f %%)\n", sleeping_time*1e-3, sleeping_ratio);
			
 
				+		FPRINTF(stderr, "Worker %s:\n", workername);
			
 
				+		FPRINTF(stderr, "\ttotal time : %.2lf ms\n", total_time*1e-3);
			
 
				+		FPRINTF(stderr, "\texec time  : %.2lf ms (%.2f %%)\n", executing_time*1e-3, executing_ratio);
			
 
				+		FPRINTF(stderr, "\tblocked time  : %.2lf ms (%.2f %%)\n", sleeping_time*1e-3, sleeping_ratio);
			
 
				 	}
			
 
				 
			
 
				 	starpu_shutdown();
			
--- a/examples/reductions/dot_product.c
+++ b/examples/reductions/dot_product.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010-2011  Université de Bordeaux 1
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -22,13 +22,15 @@
 
				 #include <cublas.h>
			
 
				 #endif
			
 
				 
			
 
				+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
			
 
				+
			
 
				 static float *x;
			
 
				 static float *y;
			
 
				 static starpu_data_handle *x_handles;
			
 
				 static starpu_data_handle *y_handles;
			
 
				 
			
 
				 static unsigned nblocks = 4096;
			
 
				-static unsigned entries_per_bock = 1024;
			
 
				+static unsigned entries_per_block = 1024;
			
 
				 
			
 
				 #define DOT_TYPE double
			
 
				 
			
@@ -75,9 +77,16 @@ void redux_cpu_func(void *descr[], void *cl_arg)
 
				 	*dota = *dota + *dotb;
			
 
				 }
			
 
				 
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+extern void redux_cuda_func(void *descr[], void *_args);
			
 
				+#endif
			
 
				+
			
 
				 static struct starpu_codelet_t redux_codelet = {
			
 
				-	.where = STARPU_CPU,
			
 
				+	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 	.cpu_func = redux_cpu_func,
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+	.cuda_func = redux_cuda_func,
			
 
				+#endif
			
 
				 	.nbuffers = 2
			
 
				 };
			
 
				 
			
@@ -118,11 +127,11 @@ void dot_cuda_func(void *descr[], void *cl_arg)
 
				 
			
 
				 	cudaMemcpy(&current_dot, dot, sizeof(DOT_TYPE), cudaMemcpyDeviceToHost);
			
 
				 
			
 
				-	int ret = cudaThreadSynchronize();
			
 
				+	cudaThreadSynchronize();
			
 
				 
			
 
				 	local_dot = (DOT_TYPE)cublasSdot(n, local_x, 1, local_y, 1);
			
 
				 
			
 
				-	//fprintf(stderr, "current_dot %f local dot %f -> %f\n", current_dot, local_dot, current_dot + local_dot);
			
 
				+	/* FPRINTF(stderr, "current_dot %f local dot %f -> %f\n", current_dot, local_dot, current_dot + local_dot); */
			
 
				 	current_dot += local_dot;
			
 
				 
			
 
				 	cudaThreadSynchronize();
			
@@ -146,15 +155,13 @@ static struct starpu_codelet_t dot_codelet = {
 
				  *	Tasks initialization
			
 
				  */
			
 
				 
			
 
				-extern void starpu_data_end_reduction_mode(starpu_data_handle handle);
			
 
				-
			
 
				 int main(int argc, char **argv)
			
 
				 {
			
 
				 	starpu_init(NULL);
			
 
				 
			
 
				 	starpu_helper_cublas_init();
			
 
				 
			
 
				-	unsigned long nelems = nblocks*entries_per_bock;
			
 
				+	unsigned long nelems = nblocks*entries_per_block;
			
 
				 	size_t size = nelems*sizeof(float);
			
 
				 
			
 
				 	x = malloc(size);
			
@@ -182,9 +189,9 @@ int main(int argc, char **argv)
 
				 	for (block = 0; block < nblocks; block++)
			
 
				 	{
			
 
				 		starpu_vector_data_register(&x_handles[block], 0,
			
 
				-			(uintptr_t)&x[entries_per_bock*block], entries_per_bock, sizeof(float));
			
 
				+			(uintptr_t)&x[entries_per_block*block], entries_per_block, sizeof(float));
			
 
				 		starpu_vector_data_register(&y_handles[block], 0,
			
 
				-			(uintptr_t)&y[entries_per_bock*block], entries_per_bock, sizeof(float));
			
 
				+			(uintptr_t)&y[entries_per_block*block], entries_per_block, sizeof(float));
			
 
				 	}
			
 
				 
			
 
				 	starpu_variable_data_register(&dot_handle, 0, (uintptr_t)&dot, sizeof(DOT_TYPE));
			
@@ -199,6 +206,7 @@ int main(int argc, char **argv)
 
				 		struct starpu_task *task = starpu_task_create();
			
 
				 
			
 
				 		task->cl = &dot_codelet;
			
 
				+		task->destroy = 1;
			
 
				 
			
 
				 		task->buffers[0].handle = x_handles[block];
			
 
				 		task->buffers[0].mode = STARPU_R;
			
@@ -208,16 +216,33 @@ int main(int argc, char **argv)
 
				 		task->buffers[2].mode = STARPU_REDUX;
			
 
				 
			
 
				 		int ret = starpu_task_submit(task);
			
 
				+		if (ret == -ENODEV) goto enodev;
			
 
				 		STARPU_ASSERT(!ret);
			
 
				 	}
			
 
				 
			
 
				+	for (block = 0; block < nblocks; block++)
			
 
				+	{
			
 
				+		starpu_data_unregister(x_handles[block]);
			
 
				+		starpu_data_unregister(y_handles[block]);
			
 
				+	}
			
 
				 	starpu_data_unregister(dot_handle);
			
 
				 
			
 
				-	fprintf(stderr, "Reference : %e vs. %e (Delta %e)\n", reference_dot, dot, reference_dot - dot);
			
 
				+	FPRINTF(stderr, "Reference : %e vs. %e (Delta %e)\n", reference_dot, dot, reference_dot - dot);
			
 
				 
			
 
				 	starpu_helper_cublas_shutdown();
			
 
				 
			
 
				 	starpu_shutdown();
			
 
				 
			
 
				+	free(x);
			
 
				+	free(y);
			
 
				+	free(x_handles);
			
 
				+	free(y_handles);
			
 
				+
			
 
				 	return 0;
			
 
				+
			
 
				+enodev:
			
 
				+	fprintf(stderr, "WARNING: No one can execute this task\n");
			
 
				+	/* yes, we do not perform the computation but we did detect that no one
			
 
				+ 	 * could perform the kernel, so this is not an error from StarPU */
			
 
				+	return 77;
			
 
				 }
			
--- a/examples/reductions/minmax_reduction.c
+++ b/examples/reductions/minmax_reduction.c
@@ -22,6 +22,8 @@
 
				 static unsigned nblocks = 8192;
			
 
				 static unsigned entries_per_bock = 1024;
			
 
				 
			
 
				+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
			
 
				+
			
 
				 #define TYPE		double
			
 
				 #define TYPE_MAX	DBL_MAX
			
 
				 #define TYPE_MIN	DBL_MIN
			
@@ -171,15 +173,19 @@ int main(int argc, char **argv)
 
				 		if (ret)
			
 
				 		{
			
 
				 			STARPU_ASSERT(ret == -ENODEV);
			
 
				-			fprintf(stderr, "This test can only run on CPUs, but there are no CPU workers (this is not a bug).\n");
			
 
				+			FPRINTF(stderr, "This test can only run on CPUs, but there are no CPU workers (this is not a bug).\n");
			
 
				 			return 0;
			
 
				 		}
			
 
				 	}
			
 
				 
			
 
				+	for (block = 0; block < nblocks; block++)
			
 
				+	{
			
 
				+		starpu_data_unregister(x_handles[block]);
			
 
				+	}
			
 
				 	starpu_data_unregister(minmax_handle);
			
 
				 
			
 
				-	fprintf(stderr, "Min : %e\n", minmax[0]);
			
 
				-	fprintf(stderr, "Max : %e\n", minmax[1]);
			
 
				+	FPRINTF(stderr, "Min : %e\n", minmax[0]);
			
 
				+	FPRINTF(stderr, "Max : %e\n", minmax[1]);
			
 
				 
			
 
				 	STARPU_ASSERT(minmax[0] <= minmax[1]);
			
 
				 
			
--- a/examples/scheduler/dummy_sched.c
+++ b/examples/scheduler/dummy_sched.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010-2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010-2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -19,6 +19,7 @@
 
				 #include <starpu.h>
			
 
				 
			
 
				 #define NTASKS	32000
			
 
				+/* fprintf() wrapper: prints only when the STARPU_SSILENT environment variable is not set. */
				+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
			
 
				 
			
 
				 struct starpu_task_list sched_list;
			
 
				 
			
@@ -38,7 +39,7 @@ static void init_dummy_sched(struct starpu_machine_topology_s *topology,
 
				 	for (workerid = 0; workerid < topology->nworkers; workerid++)
			
 
				 		starpu_worker_set_sched_condition(workerid, &sched_cond, &sched_mutex);
			
 
				 
			
 
				-	fprintf(stderr, "Initialising Dummy scheduler\n");
			
 
				+	FPRINTF(stderr, "Initialising Dummy scheduler\n");
			
 
				 }
			
 
				 
			
 
				 static void deinit_dummy_sched(struct starpu_machine_topology_s *topology,
			
@@ -49,7 +50,7 @@ static void deinit_dummy_sched(struct starpu_machine_topology_s *topology,
 
				 	pthread_cond_destroy(&sched_cond);
			
 
				 	pthread_mutex_destroy(&sched_mutex);
			
 
				 
			
 
				-	fprintf(stderr, "Destroying Dummy scheduler\n");
			
 
				+	FPRINTF(stderr, "Destroying Dummy scheduler\n");
			
 
				 }
			
 
				 
			
 
				 static int push_task_dummy(struct starpu_task *task)
			
@@ -80,7 +81,6 @@ static struct starpu_sched_policy_s dummy_sched_policy = {
 
				 	.init_sched = init_dummy_sched,
			
 
				 	.deinit_sched = deinit_dummy_sched,
			
 
				 	.push_task = push_task_dummy,
			
 
				-	.push_prio_task = NULL,
			
 
				 	.pop_task = pop_task_dummy,
			
 
				 	.post_exec_hook = NULL,
			
 
				 	.pop_every_task = NULL,
			
@@ -118,10 +118,16 @@ static starpu_codelet dummy_codelet =
 
				 
			
 
				 int main(int argc, char **argv)
			
 
				 {
			
 
				+	int ntasks = NTASKS;
			
 
				+
			
 
				 	starpu_init(&conf);
			
 
				 
			
 
				+#ifdef STARPU_SLOW_MACHINE
			
 
				+	ntasks /= 100;
			
 
				+#endif
			
 
				+
			
 
				 	unsigned i;
			
 
				-	for (i = 0; i < NTASKS; i++)
			
 
				+	for (i = 0; i < ntasks; i++)
			
 
				 	{
			
 
				 		struct starpu_task *task = starpu_task_create();
			
 
				 	
			
--- a/examples/socl/Makefile.am
+++ b/examples/socl/Makefile.am
@@ -0,0 +1,51 @@
 
				+# StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+#
			
 
				+# Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
			
 
				+# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+#
			
 
				+# StarPU is free software; you can redistribute it and/or modify
			
 
				+# it under the terms of the GNU Lesser General Public License as published by
			
 
				+# the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+# your option) any later version.
			
 
				+#
			
 
				+# StarPU is distributed in the hope that it will be useful, but
			
 
				+# WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+#
			
 
				+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+
			
 
				+AM_CFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
			
 
				+LIBS = $(top_builddir)/socl/src/libsocl.la
			
 
				+AM_CPPFLAGS = -I$(top_srcdir)/socl/include/ 
			
 
				+AM_LDFLAGS = $(STARPU_CUDA_LDFLAGS) $(STARPU_OPENCL_LDFLAGS)
			
 
				+
			
 
				+
			
 
				+SOCL_EXAMPLES	=
			
 
				+TESTS		=	$(SOCL_EXAMPLES)
			
 
				+
			
 
				+# Build the same programs that TESTS runs; SOCL_EXAMPLES is the list this file populates.
				+check_PROGRAMS	=	$(SOCL_EXAMPLES)
			
 
				+
			
 
				+examplebindir = $(libdir)/starpu/examples/socl/
			
 
				+examplebin_PROGRAMS =
			
 
				+
			
 
				+
			
 
				+examplebin_PROGRAMS +=				\
			
 
				+	basic/basic		\
			
 
				+	mandelbrot/mandelbrot		\
			
 
				+	clinfo/clinfo
			
 
				+
			
 
				+
			
 
				+SOCL_EXAMPLES +=				\
			
 
				+	basic/basic		\
			
 
				+	mandelbrot/mandelbrot		\
			
 
				+	clinfo/clinfo
			
 
				+
			
 
				+basic_basic_SOURCES = basic/basic.c
			
 
				+clinfo_clinfo_SOURCES = clinfo/clinfo.c
			
 
				+mandelbrot_mandelbrot_SOURCES = mandelbrot/mandelbrot.c
			
 
				+
			
 
				+mandelbrot_mandelbrot_CPPFLAGS = $(AM_CFLAGS) $(AM_CPPFLAGS)
			
 
				+if HAVE_X11
			
 
				+mandelbrot_mandelbrot_CPPFLAGS += $(X_CFLAGS)
			
 
				+mandelbrot_mandelbrot_LDADD = $(X_PRE_LIBS) -lX11 $(X_LIBS) $(X_EXTRA_LIBS)
			
 
				+endif
			
--- a/examples/socl/basic/basic.c
+++ b/examples/socl/basic/basic.c
@@ -0,0 +1,211 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010,2011 University of Bordeaux
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+
			
 
				+#include <stdio.h>
			
 
				+#include <stdlib.h>
			
 
				+#include <string.h>
			
 
				+#include <unistd.h>
			
 
				+
			
 
				+#include <CL/cl.h>
			
 
				+
			
 
				+/* Print "Error: " followed by the printf-style message on stderr, then exit with EXIT_FAILURE.
				+ * The first argument must be a string literal: it is concatenated with the "Error: " prefix. */
				+#define error(...) do { fprintf(stderr, "Error: " __VA_ARGS__); exit(EXIT_FAILURE); } while(0)
			
 
				+/* If err differs from CL_SUCCESS, print the numeric code and the label str on stderr and exit with EXIT_FAILURE. */
				+#define check(err, str) do { if(err != CL_SUCCESS) { fprintf(stderr, "OpenCL Error (%d): %s\n",err, str); exit(EXIT_FAILURE); }} while(0)
			
 
				+
			
 
				+#ifdef UNUSED
			
 
				+#elif defined(__GNUC__)
			
 
				+# define UNUSED(x) UNUSED_ ## x __attribute__((unused))
			
 
				+#else
			
 
				+# define UNUSED(x) x
			
 
				+#endif
			
 
				+
			
 
				+#define SIZE 1024
			
 
				+#define TYPE float
			
 
				+#define REALSIZE (SIZE * sizeof(TYPE))
			
 
				+
			
 
				+/* OpenCL C source of the "add" kernel. Each work-item builds a linear index
				+ * from its global ids in dimensions 0 and 1 and the global size of
				+ * dimension 0, then stores s1[idx] + s2[idx] into d[idx]. */
				+const char * kernel_src = "__kernel void add(__global float*s1, __global float*s2, __global float*d) { \
			
 
				+   size_t x = get_global_id(0);\
			
 
				+   size_t y = get_global_id(1);\
			
 
				+   size_t w = get_global_size(0); \
			
 
				+   int idx = y*w+x; \
			
 
				+   d[idx] = s1[idx] + s2[idx];\
			
 
				+}";
			
 
				+
			
 
				+
			
 
				+
			
 
				+/*
				+ * Add two SIZE-element float vectors with the "add" OpenCL kernel and print
				+ * the resulting vector on stdout.
				+ *
				+ * Steps: pick the first platform exposing at least one GPU device, create a
				+ * context/program/kernel, upload s1 and s2, run the kernel, read d back and
				+ * release every OpenCL object.
				+ *
				+ * Returns 0 on success and also when no OpenCL platform is available (the
				+ * example is then skipped). Any OpenCL failure terminates the process with
				+ * EXIT_FAILURE through the check()/error() macros.
				+ */
				+int main(int UNUSED(argc), char** UNUSED(argv)) {
				+   cl_platform_id platforms[15];
				+   cl_uint num_platforms = 0;
				+   cl_device_id devices[15];
				+   cl_uint num_devices = 0;
				+   const cl_uint max_platforms = sizeof(platforms)/sizeof(platforms[0]);
				+   const cl_uint max_devices = sizeof(devices)/sizeof(devices[0]);
				+   cl_context context;
				+   cl_program program;
				+   cl_kernel kernel;
				+   cl_mem s1m, s2m, dm;
				+   cl_command_queue cq;
				+   cl_int err;
				+
				+   TYPE s1[SIZE],s2[SIZE],d[SIZE];
				+
				+   {
				+      int i;
				+      for (i=0; i<SIZE; i++) {
				+         s1[i] = 2.0;
				+         s2[i] = 7.0;
				+         d[i] = 98.0;
				+      }
				+   }
				+
				+   printf("Querying platform...\n");
				+   err = clGetPlatformIDs(0, NULL, &num_platforms);
				+   /* num_platforms starts at 0, so a failed query also lands here */
				+   if (num_platforms == 0) {
				+      printf("No OpenCL platform found. If you use SOCL, this could mean StarPU wasn't configured for OpenCL. Try disabling CUDA support in StarPU (export STARPU_NCUDA=0).\n");
				+      exit(0);
				+   }
				+   check(err, "clGetPlatformIDs");
				+   /* Only max_platforms entries are fetched: never index past the array */
				+   if (num_platforms > max_platforms)
				+      num_platforms = max_platforms;
				+   err = clGetPlatformIDs(max_platforms, platforms, NULL);
				+   check(err, "clGetPlatformIDs");
				+
				+   printf("Querying devices...\n");
				+   unsigned int platform_idx;
				+   for (platform_idx=0; platform_idx<num_platforms; platform_idx++) {
				+      num_devices = 0;
				+      err = clGetDeviceIDs(platforms[platform_idx], CL_DEVICE_TYPE_GPU, max_devices, devices, &num_devices);
				+      /* A platform without GPU is not an error: try the next one */
				+      if (err == CL_DEVICE_NOT_FOUND) {
				+         num_devices = 0;
				+         continue;
				+      }
				+      check(err, "clGetDeviceIDs");
				+      if (num_devices != 0)
				+         break;
				+   }
				+   if (num_devices == 0)
				+      error("No OpenCL device found\n");
				+   /* num_devices reports every matching device, devices[] holds at most max_devices */
				+   if (num_devices > max_devices)
				+      num_devices = max_devices;
				+
				+   printf("Creating context...\n");
				+   cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[platform_idx], 0};
				+   context = clCreateContext(properties, num_devices, devices, NULL, NULL, &err);
				+   check(err, "clCreateContext");
				+
				+   printf("Creating program...\n");
				+   program = clCreateProgramWithSource(context, 1, &kernel_src, NULL, &err);
				+   check(err, "clCreateProgram");
				+
				+   printf("Building program...\n");
				+   err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
				+   check(err, "clBuildProgram");
				+
				+   printf("Creating kernel...\n");
				+   kernel = clCreateKernel(program, "add", &err);
				+   check(err, "clCreateKernel");
				+
				+   printf("Creating buffers...\n");
				+   s1m = clCreateBuffer(context, CL_MEM_READ_WRITE, REALSIZE, NULL, &err);
				+   check(err, "clCreateBuffer s1");
				+   s2m = clCreateBuffer(context, CL_MEM_READ_ONLY, REALSIZE, NULL, &err);
				+   check(err, "clCreateBuffer s2");
				+   dm = clCreateBuffer(context, CL_MEM_WRITE_ONLY, REALSIZE, NULL, &err);
				+   check(err, "clCreateBuffer d");
				+
				+   printf("Creating command queue...\n");
				+   cl_event eventW1, eventW2, eventK, eventR;
				+
				+#ifdef PROFILING
				+   cq = clCreateCommandQueue(context, devices[0], CL_QUEUE_PROFILING_ENABLE, &err);
				+#else
				+   cq = clCreateCommandQueue(context, devices[0], 0, &err);
				+#endif
				+   check(err, "clCreateCommandQueue");
				+
				+   printf("Enqueueing WriteBuffers...\n");
				+   err = clEnqueueWriteBuffer(cq, s1m, CL_FALSE, 0, REALSIZE, s1, 0, NULL, &eventW1);
				+   check(err, "clEnqueueWriteBuffer s1");
				+   err = clEnqueueWriteBuffer(cq, s2m, CL_FALSE, 0, REALSIZE, s2, 0, NULL, &eventW2);
				+   check(err, "clEnqueueWriteBuffer s2");
				+
				+   printf("Setting kernel arguments...\n");
				+   err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &s1m);
				+   check(err, "clSetKernelArg 0");
				+   err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &s2m);
				+   check(err, "clSetKernelArg 1");
				+   err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &dm);
				+   check(err, "clSetKernelArg 2");
				+
				+   printf("Enqueueing NDRangeKernel...\n");
				+   size_t local[3] = {16, 1, 1};
				+   /* One work-item per vector element */
				+   size_t global[3] = {SIZE, 1, 1};
				+   cl_event deps[] = {eventW1,eventW2};
				+   err = clEnqueueNDRangeKernel(cq, kernel, 3, NULL, global, local, 2, deps, &eventK);
				+   check(err, "clEnqueueNDRangeKernel");
				+
				+   printf("Enqueueing ReadBuffer...\n");
				+   err = clEnqueueReadBuffer(cq, dm, CL_FALSE, 0, REALSIZE, d, 0, NULL, &eventR);
				+   check(err, "clEnqueueReadBuffer");
				+
				+   /* The read above is non-blocking: d is only valid once the queue drained */
				+   err = clFinish(cq);
				+   check(err, "clFinish");
				+
				+   {
				+      int i;
				+      for (i=0; i<SIZE; i++) {
				+        printf("%f ", d[i]);
				+      }
				+      printf("\n");
				+   }
				+
				+#ifdef PROFILING
				+   /* Print the time elapsed between the start and the end of an event's command */
				+   #define DURATION(event,label) do { \
				+      cl_ulong t0,t1; \
				+      err = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &t0, NULL);\
				+      check(err, "clGetEventProfilingInfo");\
				+      err = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &t1, NULL);\
				+      check(err, "clGetEventProfilingInfo");\
				+      printf("Profiling %s: %llu nanoseconds\n", label, (unsigned long long)(t1-t0));\
				+   } while (0)
				+
				+   DURATION(eventW1, "first buffer writing");
				+   DURATION(eventW2, "second buffer writing");
				+   DURATION(eventK, "kernel execution");
				+   DURATION(eventR, "result buffer reading");
				+#endif
				+
				+
				+   printf("Releasing events...\n");
				+   err = clReleaseEvent(eventW1);
				+   check(err, "clReleaseEvent W1");
				+   err = clReleaseEvent(eventW2);
				+   check(err, "clReleaseEvent W2");
				+   err = clReleaseEvent(eventK);
				+   check(err, "clReleaseEvent K");
				+   err = clReleaseEvent(eventR);
				+   check(err, "clReleaseEvent R");
				+
				+   printf("Releasing command queue...\n");
				+   err = clReleaseCommandQueue(cq);
				+   check(err, "clReleaseCommandQueue");
				+
				+   printf("Releasing buffers...\n");
				+   err = clReleaseMemObject(s1m);
				+   check(err, "clReleaseMemObject s1");
				+   err = clReleaseMemObject(s2m);
				+   check(err, "clReleaseMemObject s2");
				+   err = clReleaseMemObject(dm);
				+   check(err, "clReleaseMemObject d");
				+
				+   printf("Releasing kernel...\n");
				+   err = clReleaseKernel(kernel);
				+   check(err, "clReleaseKernel");
				+
				+   printf("Releasing program...\n");
				+   err = clReleaseProgram(program);
				+   check(err, "clReleaseProgram");
				+
				+   printf("Releasing context...\n");
				+   err = clReleaseContext(context);
				+   check(err, "clReleaseContext");
				+
				+   return 0;
				+}
			
--- a/examples/socl/clinfo/clinfo.c
+++ b/examples/socl/clinfo/clinfo.c
@@ -0,0 +1,299 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010,2011 University of Bordeaux
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+
			
 
				+#include <stdio.h>
			
 
				+#include <stdlib.h>
			
 
				+
			
 
				+#include <CL/cl.h>
			
 
				+
			
 
				+/*
				+ * Abort on OpenCL failure: when err differs from CL_SUCCESS, print name and
				+ * the numeric error code on stderr, then exit with status 1.
				+ *
				+ * Declared static because a plain C99 `inline` definition provides no
				+ * external definition, so a call that is not inlined would fail to link.
				+ */
				+static inline void
				+checkErr(cl_int err, const char * name) {
				+    if (err != CL_SUCCESS) {
				+        fprintf(stderr, "ERROR: %s (%d)\n", name, err);
				+        exit(1);
				+    }
				+}
			
 
				+
			
 
				+int
			
 
				+main(void) {
			
 
				+   cl_int err;
			
 
				+   cl_uint num_platforms;
			
 
				+   cl_platform_id *platforms;
			
 
				+
			
 
				+   // Platform info
			
 
				+   err = clGetPlatformIDs(0, NULL, &num_platforms);
			
 
				+   checkErr(err, "Unable to get platform count");
			
 
				+
			
 
				+   platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id)*num_platforms);
			
 
				+   err = clGetPlatformIDs(num_platforms, platforms, NULL);
			
 
				+   checkErr(err, "Unable to get platform list");
			
 
				+   
			
 
				+   
			
 
				+   // Iterate over platforms
			
 
				+   printf("Number of platforms:\t\t\t\t %d\n", num_platforms);
			
 
				+
			
 
				+   {
			
 
				+      unsigned int i;
			
 
				+      for (i=0; i<num_platforms; i++) {
			
 
				+         char str[256];
			
 
				+         err = clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, sizeof(str), &str, NULL);
			
 
				+         checkErr(err, "clGetPlatformInfo(CL_PLATFORM_PROFILE)");
			
 
				+         printf("  Plaform Profile:\t\t\t\t %s\n", str);    
			
 
				+
			
 
				+         err= clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, sizeof(str), &str, NULL);
			
 
				+         checkErr(err, "clGetPlatformInfo(CL_PLATFORM_VERSION)");
			
 
				+         printf("  Plaform Version:\t\t\t\t %s\n", str);    
			
 
				+
			
 
				+         err = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, sizeof(str), &str, NULL);
			
 
				+         checkErr(err, "clGetPlatformInfo(CL_PLATFORM_NAME)");
			
 
				+         printf("  Plaform Name:\t\t\t\t\t %s\n", str);    
			
 
				+
			
 
				+         err = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(str), &str, NULL);
			
 
				+         checkErr(err, "clGetPlatformInfo(CL_PLATFORM_VENDOR)");
			
 
				+         printf("  Plaform Vendor:\t\t\t\t %s\n", str);    
			
 
				+
			
 
				+         err = clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, sizeof(str), &str, NULL);
			
 
				+         checkErr(err, "clGetPlatformInfo(CL_PLATFORM_EXTENSIONS)");
			
 
				+         printf("  Plaform Extensions:\t\t\t %s\n", str);    
			
 
				+      }
			
 
				+   }
			
 
				+
			
 
				+   printf("\n\n");
			
 
				+
			
 
				+   // Now iterate over each platform and its devices
			
 
				+   {
			
 
				+      unsigned int i;
			
 
				+      for (i=0; i<num_platforms; i++) {
			
 
				+         char str[256];
			
 
				+         cl_device_id * devices;
			
 
				+         cl_uint num_devices;
			
 
				+
			
 
				+         err = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, sizeof(str), &str, NULL);
			
 
				+         checkErr(err, "clGetPlatformInfo(CL_PLATFORM_NAME)");
			
 
				+         printf("  Plaform Name:\t\t\t\t\t %s\n", str);    
			
 
				+
			
 
				+         err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
			
 
				+         checkErr(err, "clGetDeviceIds(CL_DEVICE_TYPE_ALL)");
			
 
				+         devices = (cl_device_id*)malloc(sizeof(cl_device_id)*num_devices);
			
 
				+         
			
 
				+         err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
			
 
				+         checkErr(err, "clGetDeviceIds(CL_DEVICE_TYPE_ALL)");
			
 
				+
			
 
				+         printf("  Number of devices:\t\t\t\t %d\n", num_devices);
			
 
				+         {
			
 
				+            unsigned int j;
			
 
				+            for (j=0; j<num_devices; j++) {
			
 
				+               cl_device_type dev_type;
			
 
				+               printf("\n  DEVICE %d\n", j);
			
 
				+               
			
 
				+               err = clGetDeviceInfo(devices[j], CL_DEVICE_TYPE, sizeof(dev_type), &dev_type, NULL);
			
 
				+               checkErr(err, "clGetDeviceInfo(CL_DEVICE_TYPE)");
			
 
				+
			
 
				+               printf("  Device Type:\t\t\t\t\t ");
			
 
				+               if (dev_type & CL_DEVICE_TYPE_ACCELERATOR)
			
 
				+                  printf("CL_DEVICE_TYPE_ACCELERATOR ");
			
 
				+               else if (dev_type & CL_DEVICE_TYPE_CPU)
			
 
				+                  printf("CL_DEVICE_TYPE_CPU ");
			
 
				+               else if (dev_type & CL_DEVICE_TYPE_GPU)
			
 
				+                  printf("CL_DEVICE_TYPE_GPU ");
			
 
				+               else if (dev_type & CL_DEVICE_TYPE_DEFAULT)
			
 
				+                  printf("CL_DEVICE_TYPE_DEFAULT ");
			
 
				+
			
 
				+               printf("\n");
			
 
				+
			
 
				+               {
			
 
				+                  cl_uint vendor_id;
			
 
				+                  err = clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR_ID, sizeof(vendor_id), &vendor_id, NULL);
			
 
				+                  checkErr(err, "clGetDeviceInfo(CL_DEVICE_VENDOR_ID)");
			
 
				+                  printf("  Device ID:\t\t\t\t\t %d\n", vendor_id); 
			
 
				+               }
			
 
				+               {
			
 
				+                  cl_uint units;
			
 
				+                  err = clGetDeviceInfo(devices[j], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(units), &units, NULL);
			
 
				+                  checkErr(err, "clGetDeviceInfo(CL_DEVICE_MAX_COMPUTE_UNITS)");
			
 
				+                  printf("  Max compute units:\t\t\t\t %d\n", units); 
			
 
				+               }
			
 
				+
			
 
				+               {
			
 
				+                  cl_uint dims;
			
 
				+                  size_t *sizes;
			
 
				+                  err = clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(dims), &dims, NULL);
			
 
				+                  checkErr(err, "clGetDeviceInfo(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS)");
			
 
				+                  printf("  Max work item dimensions:\t\t\t %d\n", dims); 
			
 
				+
			
 
				+                  sizes = (size_t*)malloc(dims * sizeof(size_t));
			
 
				+                  err = clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*dims, sizes, NULL);
			
 
				+                  checkErr(err, "clGetDeviceInfo(CL_DEVICE_MAX_WORK_ITEM_SIZES)");
			
 
				+                  printf("  Max work item dimensions:\t\t\t %d\n", dims); 
			
 
				+
			
 
				+                  {
			
 
				+                     unsigned int k;
			
 
				+                     printf("    Max work items:\t\t\t\t (");
			
 
				+                     for (k=0; k<dims; k++) {
			
 
				+                        printf("%u", (unsigned int)sizes[k]);
			
 
				+                        if (k != dims-1)
			
 
				+                           printf(",");
			
 
				+                     }
			
 
				+                     printf(")\n");
			
 
				+                  }
			
 
				+               }
			
 
				+
			
 
				+#define GET_SIZET(CL_D,str) { \
			
 
				+   size_t val; \
			
 
				+   err = clGetDeviceInfo(devices[j], CL_D, sizeof(val), &val, NULL); \
			
 
				+   checkErr(err, "clGetDeviceInfo(" #CL_D ")"); \
			
 
				+   printf(str, (unsigned int)val); \
			
 
				+}
			
 
				+
			
 
				+#define GET_STRING(CL_D,str,size) { \
			
 
				+   char val[size]; \
			
 
				+   err = clGetDeviceInfo(devices[j], CL_D, sizeof(val), &val, NULL); \
			
 
				+   checkErr(err, "clGetDeviceInfo(" #CL_D ")"); \
			
 
				+   printf(str, val); \
			
 
				+}
			
 
				+
			
 
				+#define GET_UINT(CL_D,str) { \
			
 
				+   cl_uint val; \
			
 
				+   err = clGetDeviceInfo(devices[j], CL_D, sizeof(val), &val, NULL); \
			
 
				+   checkErr(err, "clGetDeviceInfo(" #CL_D ")"); \
			
 
				+   printf(str, val); \
			
 
				+}
			
 
				+
			
 
				+#define GET_ULONG(CL_D,str) { \
			
 
				+   cl_ulong val; \
			
 
				+   err = clGetDeviceInfo(devices[j], CL_D, sizeof(val), &val, NULL); \
			
 
				+   checkErr(err, "clGetDeviceInfo(" #CL_D ")"); \
			
 
				+   printf(str, val); \
			
 
				+}
			
 
				+
			
 
				+#define GET_BOOL(CL_D,str) { \
			
 
				+   cl_bool val; \
			
 
				+   err = clGetDeviceInfo(devices[j], CL_D, sizeof(val), &val, NULL); \
			
 
				+   checkErr(err, "clGetDeviceInfo(" #CL_D ")"); \
			
 
				+   printf(str, (val == CL_TRUE ? "Yes" : "No")); \
			
 
				+}
			
 
				+
			
 
				+#define GET_BOOL_CUSTOM(CL_D,str,t,f) { \
			
 
				+   cl_bool val; \
			
 
				+   err = clGetDeviceInfo(devices[j], CL_D, sizeof(val), &val, NULL); \
			
 
				+   checkErr(err, "clGetDeviceInfo(" #CL_D ")"); \
			
 
				+   printf(str, (val == CL_TRUE ? t : f)); \
			
 
				+}
			
 
				+
			
 
				+/* Query the bitfield CL_D (of type TYPE) for devices[j] and print, using the
				+ * format str, "Yes" when any bit of the mask test is set in it, "No" otherwise.
				+ * The masked value is compared against 0: comparing it with CL_TRUE would
				+ * only match a mask whose value is exactly 1. */
				+#define GET_BITSET_AND(TYPE,CL_D,test,str) { \
				+   TYPE val; \
				+   err = clGetDeviceInfo(devices[j], CL_D, sizeof(val), &val, NULL); \
				+   checkErr(err, "clGetDeviceInfo(" #CL_D ")"); \
				+   printf(str, ((val & (test)) != 0 ? "Yes" : "No")); \
				+}
			
 
				+      
			
 
				+               GET_SIZET(CL_DEVICE_MAX_WORK_GROUP_SIZE, "  Max work group size:\t\t\t\t %u\n")
			
 
				+               
			
 
				+               GET_UINT(CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, "  Preferred vector width char:\t\t\t %u\n")
			
 
				+               GET_UINT(CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, "  Preferred vector width short:\t\t\t %u\n")
			
 
				+               GET_UINT(CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, "  Preferred vector width int:\t\t\t %u\n")
			
 
				+               GET_UINT(CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, "  Preferred vector width long:\t\t\t %u\n")
			
 
				+               GET_UINT(CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, "  Preferred vector width float:\t\t\t %u\n")
			
 
				+               GET_UINT(CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, "  Preferred vector width double:\t\t %u\n")
			
 
				+               GET_UINT(CL_DEVICE_MAX_CLOCK_FREQUENCY, "  Max clock frequency:\t\t\t\t %uMHz\n")
			
 
				+               GET_UINT(CL_DEVICE_ADDRESS_BITS, "  Address bits:\t\t\t\t\t %ubits\n")
			
 
				+               GET_ULONG(CL_DEVICE_MAX_MEM_ALLOC_SIZE, "  Max memory allocation:\t\t\t %lu bytes\n")
			
 
				+
			
 
				+               GET_BOOL(CL_DEVICE_IMAGE_SUPPORT, "  Image support:\t\t\t\t %s\n")
			
 
				+
			
 
				+               GET_SIZET(CL_DEVICE_MAX_PARAMETER_SIZE, "  Max size of kernel argument:\t\t\t %u\n")
			
 
				+               GET_UINT(CL_DEVICE_MEM_BASE_ADDR_ALIGN, "  Alignment of base addres:\t\t\t %u bits\n")
			
 
				+               GET_UINT(CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, "  Minimum alignment for any datatype:\t\t %u bytes\n")
			
 
				+
			
 
				+               printf("  Single precision floating point capability\n");
			
 
				+               GET_BITSET_AND(cl_device_fp_config,CL_DEVICE_SINGLE_FP_CONFIG, CL_FP_DENORM, "    Denorms:\t\t\t\t\t %s\n")
			
 
				+               GET_BITSET_AND(cl_device_fp_config,CL_DEVICE_SINGLE_FP_CONFIG, CL_FP_INF_NAN, "    Quiet NaNs:\t\t\t\t\t %s\n")
			
 
				+               GET_BITSET_AND(cl_device_fp_config,CL_DEVICE_SINGLE_FP_CONFIG, CL_FP_ROUND_TO_NEAREST, "    Round to nearest even:\t\t\t %s\n")
			
 
				+               GET_BITSET_AND(cl_device_fp_config,CL_DEVICE_SINGLE_FP_CONFIG, CL_FP_ROUND_TO_ZERO, "    Round to zero:\t\t\t\t %s\n")
			
 
				+               GET_BITSET_AND(cl_device_fp_config,CL_DEVICE_SINGLE_FP_CONFIG, CL_FP_ROUND_TO_INF, "    Round to +ve and infinity:\t\t\t %s\n")
			
 
				+               GET_BITSET_AND(cl_device_fp_config,CL_DEVICE_SINGLE_FP_CONFIG, CL_FP_FMA, "    IEEE754-2008 fused multiply-add:\t\t %s\n")
			
 
				+
			
 
				+               {
			
 
				+                  cl_device_mem_cache_type cache;
			
 
				+                  err = clGetDeviceInfo(devices[j], CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, sizeof(cache), &cache, NULL);
			
 
				+                  checkErr(err, "clGetDeviceInfo(CL_DEVICE_GLOBAL_MEM_CACHE_TYPE)");
			
 
				+                  printf("  Cache type:\t\t\t\t\t ");
			
 
				+                  switch (cache) {
			
 
				+                     case CL_NONE:
			
 
				+                        printf("None\n");
			
 
				+                        break;
			
 
				+                     case CL_READ_ONLY_CACHE:
			
 
				+                        printf("Read only\n");
			
 
				+                        break;
			
 
				+                     case CL_READ_WRITE_CACHE:
			
 
				+                        printf("Read/Write\n");
			
 
				+                        break;
			
 
				+                  }
			
 
				+               }
			
 
				+
			
 
				+               GET_UINT(CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, "  Cache line size:\t\t\t\t %u bytes\n")
			
 
				+               GET_ULONG(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, "  Cache size:\t\t\t\t\t %lu bytes\n")
			
 
				+               GET_ULONG(CL_DEVICE_GLOBAL_MEM_SIZE, "  Global memory size:\t\t\t\t %lu bytes\n")
			
 
				+               GET_ULONG(CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, "  Constant buffer size:\t\t\t\t %lu bytes\n")
			
 
				+               GET_UINT(CL_DEVICE_MAX_CONSTANT_ARGS, "  Max number of constant args:\t\t\t %u\n")
			
 
				+
			
 
				+               {
			
 
				+                  cl_device_local_mem_type cache;
			
 
				+                  err = clGetDeviceInfo(devices[j], CL_DEVICE_LOCAL_MEM_TYPE, sizeof(cache), &cache, NULL);
			
 
				+                  checkErr(err, "clGetDeviceInfo(CL_DEVICE_LOCAL_MEM_TYPE)");
			
 
				+                  printf("  Local memory type:\t\t\t\t ");
			
 
				+                  switch (cache) {
			
 
				+                     case CL_LOCAL:
			
 
				+                        printf("Local\n");
			
 
				+                        break;
			
 
				+                     case CL_GLOBAL:
			
 
				+                        printf("Global\n");
			
 
				+                        break;
			
 
				+                  }
			
 
				+               }
			
 
				+
			
 
				+               GET_ULONG(CL_DEVICE_LOCAL_MEM_SIZE, "  Local memory size:\t\t\t\t %lu bytes\n")
			
 
				+               GET_SIZET(CL_DEVICE_PROFILING_TIMER_RESOLUTION, "  Profiling timer resolution:\t\t\t %u\n")
			
 
				+               GET_BOOL_CUSTOM(CL_DEVICE_ENDIAN_LITTLE, "  Device endianess:\t\t\t\t %s\n", "Little", "Big")
			
 
				+               GET_BOOL(CL_DEVICE_AVAILABLE, "  Available:\t\t\t\t\t %s\n")
			
 
				+               GET_BOOL(CL_DEVICE_COMPILER_AVAILABLE, "  Compiler available:\t\t\t\t %s\n")
			
 
				+
			
 
				+               printf("  Execution capabilities:\t\t\t\t \n");
			
 
				+               GET_BITSET_AND(cl_device_exec_capabilities, CL_DEVICE_EXECUTION_CAPABILITIES, CL_EXEC_KERNEL, "  Execute OpenCL kernels:\t\t\t %s\n")
			
 
				+               GET_BITSET_AND(cl_device_exec_capabilities, CL_DEVICE_EXECUTION_CAPABILITIES, CL_EXEC_NATIVE_KERNEL, "  Execute native kernels:\t\t\t %s\n")
			
 
				+
			
 
				+               printf("  Queue properties:\t\t\t\t\n ");
			
 
				+               GET_BITSET_AND(cl_command_queue_properties, CL_DEVICE_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, "   Out-of-Order:\t\t\t\t %s\n")
			
 
				+               GET_BITSET_AND(cl_command_queue_properties, CL_DEVICE_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, "    Profiling:\t\t\t\t\t %s\n")
			
 
				+
			
 
				+
			
 
				+               GET_STRING(CL_DEVICE_NAME, "  Name:\t\t\t\t\t\t %s\n", 256);
			
 
				+               GET_STRING(CL_DEVICE_VENDOR, "  Vendor:\t\t\t\t\t %s\n", 256);
			
 
				+               GET_STRING(CL_DRIVER_VERSION, "  Driver version:\t\t\t\t %s\n", 10);
			
 
				+               GET_STRING(CL_DEVICE_PROFILE, "  Profile:\t\t\t\t\t %s\n", 30);
			
 
				+               GET_STRING(CL_DEVICE_VERSION, "  Version:\t\t\t\t\t %s\n", 50);
			
 
				+               GET_STRING(CL_DEVICE_EXTENSIONS, "  Extensions:\t\t\t\t\t %s\n", 4096);
			
 
				+            
			
 
				+               printf("\n");
			
 
				+            }
			
 
				+         }
			
 
				+      }
			
 
				+   }
			
 
				+
			
 
				+   return 0;
			
 
				+}
			
--- a/examples/socl/mandelbrot/mandelbrot.c
+++ b/examples/socl/mandelbrot/mandelbrot.c