Browse Source

Create new MPI branch for Marc Sergent's internship

Nathalie Furmento 13 years ago
parent
commit
b12d5b53ba
100 changed files with 6649 additions and 2199 deletions
  1. 7 3
      .gitignore
  2. 12 12
      AUTHORS
  3. 158 2
      ChangeLog
  4. 45 4
      Makefile.am
  5. 10 8
      README
  6. 2 0
      STARPU-VERSION
  7. 0 342
      build-aux/compile
  8. 159 0
      build-aux/pmccabe.css
  9. 907 0
      build-aux/pmccabe2html
  10. 457 0
      build-aux/svn2cl.xsl
  11. 465 212
      configure.ac
  12. 19 11
      doc/Makefile.am
  13. 455 61
      doc/chapters/advanced-api.texi
  14. 166 126
      doc/chapters/advanced-examples.texi
  15. 494 159
      doc/chapters/basic-api.texi
  16. 112 141
      doc/chapters/basic-examples.texi
  17. 47 0
      doc/chapters/benchmarks.texi
  18. 13 7
      doc/chapters/c-extensions.texi
  19. 400 177
      doc/chapters/configuration.texi
  20. 0 1
      doc/chapters/fdl-1.3.texi
  21. 3 3
      doc/chapters/fft-support.texi
  22. 60 29
      doc/chapters/installing.texi
  23. 28 11
      doc/chapters/introduction.texi
  24. 106 144
      doc/chapters/mpi-support.texi
  25. 210 24
      doc/chapters/perf-feedback.texi
  26. 269 10
      doc/chapters/perf-optimization.texi
  27. 6 7
      doc/chapters/scaling-vector-example.texi
  28. 394 0
      doc/chapters/sched_ctx_hypervisor.texi
  29. 7 15
      doc/chapters/socl.texi
  30. 36 15
      doc/chapters/using.texi
  31. 0 1
      doc/chapters/vector_scal_c.texi
  32. 1 1
      doc/chapters/vector_scal_cpu.texi
  33. 4 4
      doc/chapters/vector_scal_cuda.texi
  34. 1 2
      doc/chapters/vector_scal_opencl.texi
  35. 20 3
      doc/starpu.css
  36. 47 20
      doc/starpu.texi
  37. 10 25
      doc/tutorial/Makefile
  38. 12 24
      doc/tutorial/hello_world.c
  39. 9 23
      doc/tutorial/vector_scal.c
  40. 9 22
      doc/tutorial/vector_scal_cpu.c
  41. 9 23
      doc/tutorial/vector_scal_cuda.cu
  42. 9 23
      doc/tutorial/vector_scal_opencl.c
  43. 9 22
      doc/tutorial/vector_scal_opencl_kernel.cl
  44. 45 39
      examples/Makefile.am
  45. 10 3
      examples/audio/starpu_audio_processing.c
  46. 1 4
      examples/axpy/axpy.c
  47. 18 0
      examples/axpy/axpy.h
  48. 0 1
      examples/axpy/axpy_opencl.c
  49. 0 1
      examples/basic_examples/block.c
  50. 0 1
      examples/basic_examples/block_cuda.cu
  51. 0 1
      examples/basic_examples/block_opencl.c
  52. 1 0
      examples/basic_examples/hello_world.c
  53. 1 4
      examples/basic_examples/multiformat.c
  54. 0 1
      examples/basic_examples/multiformat_conversion_codelets_cuda.cu
  55. 0 1
      examples/basic_examples/multiformat_conversion_codelets_opencl.c
  56. 0 1
      examples/basic_examples/multiformat_cuda.cu
  57. 0 1
      examples/basic_examples/multiformat_opencl.c
  58. 4 5
      examples/basic_examples/variable.c
  59. 1 2
      examples/basic_examples/variable_kernels.cu
  60. 1 2
      examples/basic_examples/variable_kernels_opencl.c
  61. 9 3
      examples/basic_examples/vector_scal.c
  62. 1 2
      examples/basic_examples/vector_scal_c.c
  63. 4 63
      examples/basic_examples/vector_scal_cpu.c
  64. 0 1
      examples/basic_examples/vector_scal_cpu_icc.icc
  65. 26 0
      examples/basic_examples/vector_scal_cpu_icc.icc
  66. 93 0
      examples/basic_examples/vector_scal_cpu_template.h
  67. 2 3
      examples/basic_examples/vector_scal_cuda.cu
  68. 2 3
      examples/basic_examples/vector_scal_opencl.c
  69. 0 1
      examples/binary/binary.c
  70. 1 1
      examples/cg/cg.c
  71. 0 3
      examples/cg/cg.h
  72. 1 2
      examples/cg/cg_dot_kernel.cu
  73. 34 4
      examples/cg/cg_kernels.c
  74. 47 8
      examples/cholesky/cholesky.h
  75. 10 6
      examples/cholesky/cholesky_grain_tag.c
  76. 75 35
      examples/cholesky/cholesky_implicit.c
  77. 5 8
      examples/cholesky/cholesky_kernels.c
  78. 10 6
      examples/cholesky/cholesky_tag.c
  79. 12 31
      examples/cholesky/cholesky_tile_tag.c
  80. 0 1
      examples/cpp/incrementer_cpp.cpp
  81. 0 1
      examples/filters/custom_mf/conversion.cu
  82. 0 1
      examples/filters/custom_mf/conversion_opencl.c
  83. 0 1
      examples/filters/custom_mf/cuda.cu
  84. 57 171
      examples/filters/custom_mf/custom_interface.c
  85. 1 1
      examples/filters/custom_mf/custom_interface.h
  86. 0 3
      examples/filters/custom_mf/custom_mf_filter.c
  87. 0 1
      examples/filters/custom_mf/custom_opencl.c
  88. 1 2
      examples/filters/fblock.c
  89. 1 2
      examples/filters/fblock_cuda.cu
  90. 1 2
      examples/filters/fblock_opencl.c
  91. 189 0
      examples/filters/shadow.c
  92. 291 0
      examples/filters/shadow2d.c
  93. 331 0
      examples/filters/shadow3d.c
  94. 3 2
      examples/gl_interop/gl_interop.c
  95. 154 0
      examples/gl_interop/gl_interop_idle.c
  96. 5 29
      examples/heat/dw_factolu.c
  97. 3 6
      examples/heat/dw_factolu.h
  98. 10 10
      examples/heat/dw_factolu_kernels.c
  99. 1 2
      examples/heat/dw_sparse_cg.h
  100. 0 0
      examples/heat/heat.c

+ 7 - 3
.gitignore

@@ -4,6 +4,7 @@
 /autom4te.cache
 /libtool
 /aclocal.m4
+/build
 /build-aux
 /GPATH
 /GRTAGS
@@ -23,12 +24,12 @@ Makefile.in
 .dirstamp
 stamp-h[0-9]*
 starpu.log
-/gcc-plugin/src/starpu-gcc-config.h
+/gcc-plugin/include/starpu-gcc/config.h
 /gcc-plugin/tests/*.c.[0-9]*.*
 /tests/datawizard/handle_to_pointer
 /tests/datawizard/data_lookup
 /doc/stamp-vti
-/doc/version.texi
+/doc/chapters/version.texi
 /examples/basic_examples/block
 /examples/basic_examples/hello_world
 /examples/basic_examples/mult
@@ -143,7 +144,7 @@ starpu.log
 /tests/parallel_tasks/explicit_combined_worker
 /tests/parallel_tasks/parallel_kernels
 /tests/parallel_tasks/parallel_kernels_spmd
-/tests/parallel_tasks/spmd_pgreedy
+/tests/parallel_tasks/spmd_peager
 /tests/perfmodels/non_linear_regression_based
 /tests/perfmodels/regression_based
 /tools/cbc2paje
@@ -287,3 +288,6 @@ starpu.log
 /tests/datawizard/interfaces/copy_interfaces
 /gcc-plugin/tests/release
 /gcc-plugin/tests/opencl
+/gcc-plugin/tests/registered
+/gcc-plugin/tests/warn-unregistered
+/cyclomatic-complexity.html

+ 12 - 12
AUTHORS

@@ -1,19 +1,19 @@
 Cédric Augonnet <cedric.augonnet@inria.fr>
-Nicolas Collin <nicolas.collin@inria.fr>
+William Braik <wbraik@gmail.com>
 Jérôme Clet-Ortega <jerome.clet-ortega@labri.fr>
 Nicolas Collin <nicolas.collin@inria.fr>
-Nathalie Furmento <nathalie.furmento@labri.fr>
-Sylvain Henry <sylvain.henry@inria.fr>
-Cyril Roélandt <cyril.roelandt@inria.fr>
-François Tessier <francois.tessier@inria.fr>
-Samuel Thibault <samuel.thibault@labri.fr>
-Pierre André Wacrenier <wacrenier@labri.fr>
-William Braik <wbraik@gmail.com>
+Ludovic Courtès <ludovic.courtes@inria.fr>
 Yann Courtois <yann.courtois33@gmail.com>
 Jean-Marie Couteyen <jm.couteyen@gmail.com>
-Mehdi Juhoor <mjuhoor@gmail.com>
-Anthony Roy <theanthony33@gmail.com>
+Nathalie Furmento <nathalie.furmento@labri.fr>
 David Gómez <david_gomez1380@yahoo.com.mx>
-Nguyen Quôc Dinh <nguyen.quocdinh@gmail.com>
+Sylvain Henry <sylvain.henry@inria.fr>
+Mehdi Juhoor <mjuhoor@gmail.com>
 Antoine Lucas <antoine.lucas.33@gmail.com>
-
+Nguyen Quôc-Dinh <nguyen.quocdinh@gmail.com>
+Cyril Roelandt <cyril.roelandt@inria.fr>
+Anthony Roy <theanthony33@gmail.com>
+Ludovic Stordeur <ludovic.stordeur@inria.fr>
+François Tessier <francois.tessier@inria.fr>
+Samuel Thibault <samuel.thibault@labri.fr>
+Pierre-André Wacrenier <wacrenier@labri.fr>

+ 158 - 2
ChangeLog

@@ -1,7 +1,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2009-2012  Université de Bordeaux 1
-# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+# Copyright (C) 2009-2013  Université de Bordeaux 1
+# Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -21,15 +21,165 @@ New features:
   * OpenGL interoperability support.
   * Capability to store compiled OpenCL kernels on the file system
   * Capability to load compiled OpenCL kernels
+  * Performance models measurements can now be provided explicitly by
+    applications.
+  * Capability to emit communication statistics when running MPI code
+  * Add starpu_unregister_submit, starpu_data_acquire_on_node and
+    starpu_data_invalidate_submit
+  * New functionality wrapping starpu_insert_task to pass an array of
+	data_handles via the parameter STARPU_DATA_ARRAY
+  * Enable GPU-GPU direct transfers.
+  * GCC plug-in
+	- Add `registered' attribute
+	- A new pass was added that warns about the use of possibly
+	  unregistered memory buffers.
+  * SOCL
+        - Manual mapping of commands on specific devices is now
+	  possible
+        - SOCL does not require StarPU CPU tasks anymore. CPU workers
+          are automatically disabled to enhance performance of OpenCL
+          CPU devices
+  * New interface: COO matrix.
+  * Data interfaces: The pack operation of user-defined data interface
+    defines a new parameter count which should be set to the size of
+    the buffer created by the packing of the data.
+  * MPI:
+        - Communication statistics for MPI can only be enabled at
+	  execution time by defining the environment variable
+	  STARPU_COMM_STATS
+        - Communication cache mechanism is enabled by default, and can
+	  only be disabled at execution time by setting the
+	  environment variable STARPU_MPI_CACHE to 0.
+        - Initialisation functions starpu_mpi_initialize_extended()
+  	  and starpu_mpi_initialize() have been made deprecated. One
+	  should now use starpu_mpi_init(int *, char ***, int). The
+	  last parameter indicates if MPI should be initialised.
+        - Collective detached operations have new parameters, a
+	  callback function and an argument. This is to be consistent
+	  with the detached point-to-point communications.
+        - When exchanging user-defined data interfaces, the size of
+	  the data is the size returned by the pack operation, i.e.
+	  data with dynamic size can now be exchanged with StarPU-MPI.
+        - New functionality starpu_mpi_irecv_probe_detached which
+  	  first tests if the message is available before calling MPI_Recv.
+  * Add experimental simgrid support, to simulate execution with various
+    number of CPUs, GPUs, amount of memory, etc.
+  * Add support for OpenCL simulators (which provide simulated execution time)
+  * Add support for Temanejo, a task graph debugger
+  * Theoretical bound lp output now includes data transfer time.
+  * Update OpenCL driver to only enable CPU devices (the environment
+        variable STARPU_OPENCL_ONLY_ON_CPUS must be set to a positive
+	value when executing an application)
+  * Add Scheduling contexts to separate computation resources
+    	- Scheduling policies take into account the set of resources corresponding
+	to the context it belongs to
+	- Add support to dynamically change scheduling contexts
+	(Create and Delete a context, Add Workers to a context, Remove workers from a context)
+	- Add support to indicate to which contexts the tasks are submitted
+  * Add the Hypervisor to manage the Scheduling Contexts automatically
+    	- The Contexts can be registered to the Hypervisor
+	- Only the registered contexts are managed by the Hypervisor
+	- The Hypervisor can detect the initial distribution of resources of 
+	a context and constructs it consequently (the cost of execution is required)
+    	- Several policies can adapt dynamically the distribution of resources
+	in contexts if the initial one was not appropriate
+	- Add a platform to implement new policies of redistribution
+	of resources
+  * Implement a memory manager which checks the global amount of
+    memory available on devices, and checks there is enough memory
+    before doing an allocation on the device.
+  * Discard environment variable STARPU_LIMIT_GPU_MEM and define
+    instead STARPU_LIMIT_CUDA_MEM and STARPU_LIMIT_OPENCL_MEM
+  * Introduce new variables STARPU_LIMIT_CUDA_devid_MEM and
+    STARPU_LIMIT_OPENCL_devid_MEM to limit memory per specific device
 
 Changes:
+  * Fix the block filter functions.
+  * Fix StarPU-MPI on Darwin.
   * The FxT code can now be used on systems other than Linux.
+  * Keep only one hashtable implementation common/uthash.h
   * The cache of starpu_mpi_insert_task is fixed and thus now enabled by
     default.
+  * Improve starpu_machine_display output.
+  * Standardize objects name in the performance model API
+  * SOCL
+    - Virtual SOCL device has been removed
+    - Automatic scheduling still available with command queues not
+      assigned to any device
+    - Remove modified OpenCL headers. ICD is now the only supported
+      way to use SOCL.
+    - SOCL test suite is only run when environment variable
+      SOCL_OCL_LIB_OPENCL is defined. It should contain the location
+      of the libOpenCL.so file of the OCL ICD implementation.
+  * Fix main memory leak on multiple unregister/re-register.
+  * Improve hwloc detection by configure
+  * Cell:
+    - It is no longer possible to enable the cell support via the
+      gordon driver
+    - Data interfaces no longer define functions to copy to and from
+      SPU devices
+    - Codelet no longer define pointer for Gordon implementations
+    - Gordon workers are no longer enabled
+    - Gordon performance models are no longer enabled
+  * Fix data transfer arrows in paje traces
+  * The "heft" scheduler no longer exists. Users should now pick "dmda"
+    instead.
+  * StarPU can now use poti to generate paje traces.
+  * Rename scheduling policy "parallel greedy" to "parallel eager"
+
+Small features:
+  * Add starpu_worker_get_by_type and starpu_worker_get_by_devid
+  * Add starpu_fxt_stop_profiling/starpu_fxt_start_profiling which permits to
+  pause trace recording.
+  * Add trace_buffer_size configuration field to permit to specify the tracing
+  buffer size.
+  * Add starpu_codelet_profile and starpu_codelet_histo_profile, tools which draw
+  the profile of a codelet.
 
 Small changes:
   * STARPU_NCPU should now be used instead of STARPU_NCPUS. STARPU_NCPUS is
 	still available for compatibility reasons.
+  * include/starpu.h includes all include/starpu_*.h files, applications
+	therefore only need to have #include <starpu.h>
+  * Active task wait is now included in blocked time.
+  * Fix GCC plugin linking issues starting with GCC 4.7.
+  * Fix forcing calibration of never-calibrated archs.
+  * CUDA applications are no longer compiled with the "-arch sm_13"
+    option. It is specifically added to applications which need it.
+
+StarPU 1.0.3 (svn revision 7379)
+==============================================
+
+Changes:
+  * Several bug fixes in the build system
+  * Bug fixes in source code for non-Linux systems
+  * Fix generating FXT traces bigger than 64MiB.
+  * Improve ENODEV error detections in StarPU FFT
+
+StarPU 1.0.2 (svn revision xxx)
+==============================================
+
+Changes:
+  * Add starpu_block_shadow_filter_func_vector and an example.
+  * Add tag dependency in trace-generated DAG.
+  * Fix CPU binding for optimized CPU-GPU transfers.
+  * Fix parallel tasks CPU binding and combined worker generation.
+  * Fix generating FXT traces bigger than 64MiB.
+
+StarPU 1.0.1 (svn revision 6659)
+==============================================
+
+Changes:
+  * hwloc support. Warn users when hwloc is not found on the system and
+	produce an error when not explicitly disabled.
+  * Several bug fixes
+  * GCC plug-in
+	- Add `#pragma starpu release'
+	- Fix bug when using `acquire' pragma with function parameters
+	- Slightly improve test suite coverage
+	- Relax the GCC version check
+  * Update SOCL to use new API
+  * Documentation improvement.
 
 StarPU 1.0.0 (svn revision 6306)
 ==============================================
@@ -230,3 +380,9 @@ Changes:
    - transparent data coherency management
    - High-level expressive interface
 
+
+# Local Variables:
+# mode: text
+# coding: utf-8
+# ispell-local-dictionary: "american"
+# End:

+ 45 - 4
Makefile.am

@@ -18,18 +18,24 @@ ACLOCAL_AMFLAGS=-I m4
 CLEANFILES = *.gcno *.gcda *.linkinfo
 
 SUBDIRS = src
-SUBDIRS += tools tests doc
+SUBDIRS += tools tests
+
+if BUILD_DOC
+SUBDIRS += doc
+endif
 
 if USE_MPI
 SUBDIRS += mpi
 endif
 
+if BUILD_EXAMPLES
+SUBDIRS += examples
+endif
+
 if BUILD_SOCL
 SUBDIRS += socl
 endif
 
-SUBDIRS += examples
-
 if BUILD_GCC_PLUGIN
 SUBDIRS += gcc-plugin
 endif
@@ -38,6 +44,10 @@ if BUILD_STARPUFFT
 SUBDIRS += starpufft
 endif
 
+if STARPU_BUILD_SCHED_CTX_HYPERVISOR
+SUBDIRS += sched_ctx_hypervisor
+endif
+
 pkgconfigdir = $(libdir)/pkgconfig
 pkgconfig_DATA = libstarpu.pc starpu-1.0.pc
 
@@ -46,9 +56,11 @@ versinclude_HEADERS = 				\
 	include/starpu.h			\
 	include/starpu_data_filters.h		\
 	include/starpu_data_interfaces.h	\
+	include/starpu_worker.h			\
 	include/starpu_task.h			\
 	include/starpu_task_bundle.h		\
 	include/starpu_task_list.h		\
+	include/starpu_task_util.h		\
 	include/starpu_data.h			\
 	include/starpu_perfmodel.h		\
 	include/starpu_util.h			\
@@ -59,10 +71,12 @@ versinclude_HEADERS = 				\
 	include/starpu_profiling.h		\
 	include/starpu_bound.h			\
 	include/starpu_scheduler.h		\
+	include/starpu_sched_ctx.h		\
 	include/starpu_top.h			\
 	include/starpu_deprecated_api.h         \
 	include/starpu_hash.h			\
-	include/starpu_rand.h
+	include/starpu_rand.h			\
+	include/starpu_cublas.h
 
 nodist_versinclude_HEADERS = 			\
 	include/starpu_config.h
@@ -109,3 +123,30 @@ showcheck:
 	for i in $(SUBDIRS) ; do \
 		make -C $$i showcheck ; \
 	done
+
+ctags-local:
+	$(CTAGS) -R -I LIST_TYPE
+
+
+# Cyclomatic complexity reports.
+
+# The pmccabe tool, see <http://www.parisc-linux.org/~bame/pmccabe/>.
+PMCCABE = pmccabe
+
+VC_URL = "https://gforge.inria.fr/scm/viewvc.php/trunk/%FILENAME%?view=markup&root=starpu"
+
+# Generate a cyclomatic complexity report.  Note that examples and tests are
+# excluded because they're not particularly relevant, and more importantly
+# they all have a function called `main', which clobbers the report.
+cyclomatic-complexity.html:
+	$(PMCCABE)								\
+	  `find \( -name examples -o -name tests -o -path ./tools/dev/experimental \) -prune -o -name \*.c` \
+	  | sort -nr								\
+	  | $(AWK) -f ${top_srcdir}/build-aux/pmccabe2html			\
+		   -v lang=html -v name="$(PACKAGE_NAME)"			\
+		   -v vcurl=$(VC_URL)						\
+		   -v url="$(PACKAGE_URL)"					\
+		   -v css=${top_srcdir}/build-aux/pmccabe.css			\
+		   -v cut_dir=${top_srcdir}/					\
+		   > $@-tmp
+	mv $@-tmp $@

+ 10 - 8
README

@@ -1,7 +1,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 # Copyright (C) 2009-2012  Université de Bordeaux 1
-# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+# Copyright (C) 2010, 2011, 2013  Centre National de la Recherche Scientifique
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -23,15 +23,15 @@
 
 StarPU is a runtime system that offers support for heterogeneous multicore
 machines. While many efforts are devoted to design efficient computation kernels
-for those architectures (e.g. to implement BLAS kernels on GPUs or on Cell's
-SPUs), StarPU not only takes care of offloading such kernels (and implementing
-data coherency across the machine), but it also makes sure the kernels are
-executed as efficiently as possible.
+for those architectures (e.g. to implement BLAS kernels on GPUs),
+StarPU not only takes care of offloading such kernels (and
+implementing data coherency across the machine), but it also makes
+sure the kernels are executed as efficiently as possible.
 
 +------------------------
 | I.b. What StarPU is not
 
-StarPU is not a new language, and it does not extends existing languages either.
+StarPU is not a new language, and it does not extend existing languages either.
 StarPU does not help to write computation kernels.
 
 +---------------------------------
@@ -76,11 +76,13 @@ advantage of their specificities in a portable fashion.
    units according to the machine topology. For more details on hwloc, see
    http://www.open-mpi.org/projects/hwloc/ .
 
- * To build the StarPU-Top tool the following are also required:
-   * libqt4 >= 4.7
+ * To build the StarPU-Top tool the following packages (along with
+   their development files) are also required:
+   * libqt4-dev >= 4.7
    * libqt4-network
    * libqt4-opengl
    * libqt4-sql
+   * qt4-qmake
 
 ++=====================++
 || III. Getting StarPU ||

+ 2 - 0
STARPU-VERSION

@@ -17,6 +17,8 @@
 # - If any interfaces have been removed or changed since the last
 #   public release, then set age to 0. change
 
+# Note for StarPU 1.1: we have changed ABI
+
 # Libtool interface versioning (info "(libtool) Versioning").
 LIBSTARPU_INTERFACE_CURRENT=1	# increment upon ABI change
 LIBSTARPU_INTERFACE_REVISION=0	# increment upon implementation change

+ 0 - 342
build-aux/compile

@@ -1,342 +0,0 @@
-#! /bin/sh
-# Wrapper for compilers which do not understand '-c -o'.
-
-scriptversion=2012-03-05.13; # UTC
-
-# Copyright (C) 1999-2012 Free Software Foundation, Inc.
-# Written by Tom Tromey <tromey@cygnus.com>.
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program.  If not, see <http://www.gnu.org/licenses/>.
-
-# As a special exception to the GNU General Public License, if you
-# distribute this file as part of a program that contains a
-# configuration script generated by Autoconf, you may include it under
-# the same distribution terms that you use for the rest of that program.
-
-# This file is maintained in Automake, please report
-# bugs to <bug-automake@gnu.org> or send patches to
-# <automake-patches@gnu.org>.
-
-nl='
-'
-
-# We need space, tab and new line, in precisely that order.  Quoting is
-# there to prevent tools from complaining about whitespace usage.
-IFS=" ""	$nl"
-
-file_conv=
-
-# func_file_conv build_file lazy
-# Convert a $build file to $host form and store it in $file
-# Currently only supports Windows hosts. If the determined conversion
-# type is listed in (the comma separated) LAZY, no conversion will
-# take place.
-func_file_conv ()
-{
-  file=$1
-  case $file in
-    / | /[!/]*) # absolute file, and not a UNC file
-      if test -z "$file_conv"; then
-	# lazily determine how to convert abs files
-	case `uname -s` in
-	  MINGW*)
-	    file_conv=mingw
-	    ;;
-	  CYGWIN*)
-	    file_conv=cygwin
-	    ;;
-	  *)
-	    file_conv=wine
-	    ;;
-	esac
-      fi
-      case $file_conv/,$2, in
-	*,$file_conv,*)
-	  ;;
-	mingw/*)
-	  file=`cmd //C echo "$file " | sed -e 's/"\(.*\) " *$/\1/'`
-	  ;;
-	cygwin/*)
-	  file=`cygpath -m "$file" || echo "$file"`
-	  ;;
-	wine/*)
-	  file=`winepath -w "$file" || echo "$file"`
-	  ;;
-      esac
-      ;;
-  esac
-}
-
-# func_cl_dashL linkdir
-# Make cl look for libraries in LINKDIR
-func_cl_dashL ()
-{
-  func_file_conv "$1"
-  if test -z "$lib_path"; then
-    lib_path=$file
-  else
-    lib_path="$lib_path;$file"
-  fi
-  linker_opts="$linker_opts -LIBPATH:$file"
-}
-
-# func_cl_dashl library
-# Do a library search-path lookup for cl
-func_cl_dashl ()
-{
-  lib=$1
-  found=no
-  save_IFS=$IFS
-  IFS=';'
-  for dir in $lib_path $LIB
-  do
-    IFS=$save_IFS
-    if $shared && test -f "$dir/$lib.dll.lib"; then
-      found=yes
-      lib=$dir/$lib.dll.lib
-      break
-    fi
-    if test -f "$dir/$lib.lib"; then
-      found=yes
-      lib=$dir/$lib.lib
-      break
-    fi
-  done
-  IFS=$save_IFS
-
-  if test "$found" != yes; then
-    lib=$lib.lib
-  fi
-}
-
-# func_cl_wrapper cl arg...
-# Adjust compile command to suit cl
-func_cl_wrapper ()
-{
-  # Assume a capable shell
-  lib_path=
-  shared=:
-  linker_opts=
-  for arg
-  do
-    if test -n "$eat"; then
-      eat=
-    else
-      case $1 in
-	-o)
-	  # configure might choose to run compile as 'compile cc -o foo foo.c'.
-	  eat=1
-	  case $2 in
-	    *.o | *.[oO][bB][jJ])
-	      func_file_conv "$2"
-	      set x "$@" -Fo"$file"
-	      shift
-	      ;;
-	    *)
-	      func_file_conv "$2"
-	      set x "$@" -Fe"$file"
-	      shift
-	      ;;
-	  esac
-	  ;;
-	-I)
-	  eat=1
-	  func_file_conv "$2" mingw
-	  set x "$@" -I"$file"
-	  shift
-	  ;;
-	-I*)
-	  func_file_conv "${1#-I}" mingw
-	  set x "$@" -I"$file"
-	  shift
-	  ;;
-	-l)
-	  eat=1
-	  func_cl_dashl "$2"
-	  set x "$@" "$lib"
-	  shift
-	  ;;
-	-l*)
-	  func_cl_dashl "${1#-l}"
-	  set x "$@" "$lib"
-	  shift
-	  ;;
-	-L)
-	  eat=1
-	  func_cl_dashL "$2"
-	  ;;
-	-L*)
-	  func_cl_dashL "${1#-L}"
-	  ;;
-	-static)
-	  shared=false
-	  ;;
-	-Wl,*)
-	  arg=${1#-Wl,}
-	  save_ifs="$IFS"; IFS=','
-	  for flag in $arg; do
-	    IFS="$save_ifs"
-	    linker_opts="$linker_opts $flag"
-	  done
-	  IFS="$save_ifs"
-	  ;;
-	-Xlinker)
-	  eat=1
-	  linker_opts="$linker_opts $2"
-	  ;;
-	-*)
-	  set x "$@" "$1"
-	  shift
-	  ;;
-	*.cc | *.CC | *.cxx | *.CXX | *.[cC]++)
-	  func_file_conv "$1"
-	  set x "$@" -Tp"$file"
-	  shift
-	  ;;
-	*.c | *.cpp | *.CPP | *.lib | *.LIB | *.Lib | *.OBJ | *.obj | *.[oO])
-	  func_file_conv "$1" mingw
-	  set x "$@" "$file"
-	  shift
-	  ;;
-	*)
-	  set x "$@" "$1"
-	  shift
-	  ;;
-      esac
-    fi
-    shift
-  done
-  if test -n "$linker_opts"; then
-    linker_opts="-link$linker_opts"
-  fi
-  exec "$@" $linker_opts
-  exit 1
-}
-
-eat=
-
-case $1 in
-  '')
-     echo "$0: No command.  Try '$0 --help' for more information." 1>&2
-     exit 1;
-     ;;
-  -h | --h*)
-    cat <<\EOF
-Usage: compile [--help] [--version] PROGRAM [ARGS]
-
-Wrapper for compilers which do not understand '-c -o'.
-Remove '-o dest.o' from ARGS, run PROGRAM with the remaining
-arguments, and rename the output as expected.
-
-If you are trying to build a whole package this is not the
-right script to run: please start by reading the file 'INSTALL'.
-
-Report bugs to <bug-automake@gnu.org>.
-EOF
-    exit $?
-    ;;
-  -v | --v*)
-    echo "compile $scriptversion"
-    exit $?
-    ;;
-  cl | *[/\\]cl | cl.exe | *[/\\]cl.exe )
-    func_cl_wrapper "$@"      # Doesn't return...
-    ;;
-esac
-
-ofile=
-cfile=
-
-for arg
-do
-  if test -n "$eat"; then
-    eat=
-  else
-    case $1 in
-      -o)
-	# configure might choose to run compile as 'compile cc -o foo foo.c'.
-	# So we strip '-o arg' only if arg is an object.
-	eat=1
-	case $2 in
-	  *.o | *.obj)
-	    ofile=$2
-	    ;;
-	  *)
-	    set x "$@" -o "$2"
-	    shift
-	    ;;
-	esac
-	;;
-      *.c)
-	cfile=$1
-	set x "$@" "$1"
-	shift
-	;;
-      *)
-	set x "$@" "$1"
-	shift
-	;;
-    esac
-  fi
-  shift
-done
-
-if test -z "$ofile" || test -z "$cfile"; then
-  # If no '-o' option was seen then we might have been invoked from a
-  # pattern rule where we don't need one.  That is ok -- this is a
-  # normal compilation that the losing compiler can handle.  If no
-  # '.c' file was seen then we are probably linking.  That is also
-  # ok.
-  exec "$@"
-fi
-
-# Name of file we expect compiler to create.
-cofile=`echo "$cfile" | sed 's|^.*[\\/]||; s|^[a-zA-Z]:||; s/\.c$/.o/'`
-
-# Create the lock directory.
-# Note: use '[/\\:.-]' here to ensure that we don't use the same name
-# that we are using for the .o file.  Also, base the name on the expected
-# object file name, since that is what matters with a parallel build.
-lockdir=`echo "$cofile" | sed -e 's|[/\\:.-]|_|g'`.d
-while true; do
-  if mkdir "$lockdir" >/dev/null 2>&1; then
-    break
-  fi
-  sleep 1
-done
-# FIXME: race condition here if user kills between mkdir and trap.
-trap "rmdir '$lockdir'; exit 1" 1 2 15
-
-# Run the compile.
-"$@"
-ret=$?
-
-if test -f "$cofile"; then
-  test "$cofile" = "$ofile" || mv "$cofile" "$ofile"
-elif test -f "${cofile}bj"; then
-  test "${cofile}bj" = "$ofile" || mv "${cofile}bj" "$ofile"
-fi
-
-rmdir "$lockdir"
-exit $ret
-
-# Local Variables:
-# mode: shell-script
-# sh-indentation: 2
-# eval: (add-hook 'write-file-hooks 'time-stamp)
-# time-stamp-start: "scriptversion="
-# time-stamp-format: "%:y-%02m-%02d.%02H"
-# time-stamp-time-zone: "UTC"
-# time-stamp-end: "; # UTC"
-# End:

+ 159 - 0
build-aux/pmccabe.css

@@ -0,0 +1,159 @@
+body {
+    font-family: Helvetica, sans-serif;
+}
+
+.page_title {
+    font: 18pt Georgia, serif;
+    color: darkred;
+}
+
+.section_title {
+    font: 14pt Georgia, serif;
+    color: darkred;
+}
+
+.report_timestamp {
+    color: darkred;
+    font-weight: bold;
+}
+
+.function_src {
+    text-align: left;
+    background: white;
+}
+
+.resume_table {
+}
+
+.resume_header_entry {
+    color: black;
+}
+
+.resume_number_entry {
+    color: darkred;
+    font-weight: bold;
+    text-align: right;
+}
+
+.ranges_table {
+    border-spacing: 0px;
+    border-bottom: solid 2px black;
+    border-top: solid 2px black;
+    border-left: solid 2px black;
+    border-right: solid 2px black;
+}
+
+.ranges_header_entry {
+    padding: 5px;
+    border-bottom: solid 1px black;
+    font-size: 1em;
+    font-weight: bold;
+    color: darkred;
+    text-align: left;
+}
+
+.ranges_entry {
+}
+
+.ranges_entry_simple {
+    background: #87ff75;
+}
+
+.ranges_entry_moderate {
+    background: #fffc60;
+}
+
+.ranges_entry_high {
+    background: #ff5a5d;
+}
+
+.ranges_entry_untestable {
+    background: #993300
+}
+
+
+.function_table {
+    border-spacing: 0px;
+    border-bottom: solid 2px black;
+    border-top: solid 2px black;
+    border-left: solid 2px black;
+    border-right: solid 2px black;
+}
+
+.function_table_caption {
+    font-size: 1.1em;
+    font-weight: bold;
+    color: black;
+    padding: 5px;
+}
+
+.function_table_header {
+}
+
+
+.function_table_header_entry {
+    padding: 5px;
+    border-bottom: solid 1px black;
+    font-size: 1em;
+    font-weight: bold;
+    color: darkred;
+    text-align: left;
+}
+
+.function_entry {
+}
+
+
+.function_entry_simple {
+    background: #87ff75;
+}
+
+.function_entry_moderate {
+    background: #fffc60;
+}
+
+.function_entry_high {
+    background: #ff5a5d;
+}
+
+.function_entry_untestable {
+    background: #993300
+}
+
+
+.function_entry_name {
+    font-size: 1em;
+    text-align: left;
+    font-weight: bold;
+    text-valign: top;
+
+    border-top: solid 1px black;
+    padding: 3px;
+}
+
+.function_entry_cyclo {
+    font-size: 1em;
+    text-align: right;
+    text-valign: top;
+
+    border-top: solid 1px black;
+    padding: 3px;
+}
+
+.function_entry_number {
+    font-size: 1em;
+    text-align: right;
+    text-valign: top;
+
+    border-top: solid 1px black;
+    padding: 3px;
+}
+
+.function_entry_filename {
+    font-size: 1em;
+    text-align: left;
+    text-valign: top;
+
+    border-top: solid 1px black;
+    padding: 3px;
+}

+ 907 - 0
build-aux/pmccabe2html

@@ -0,0 +1,907 @@
+# pmccabe2html - AWK script to convert pmccabe output to html       -*- awk -*-
+
+# Copyright (C) 2007-2012 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+# Written by Jose E. Marchesi <jemarch@gnu.org>.
+# Adapted for gnulib by Simon Josefsson <simon@josefsson.org>.
+# Added support for C++ by Giuseppe Scrivano <gscrivano@gnu.org>.
+
+# Typical Invocation is from a Makefile.am:
+#
+# CYCLO_SOURCES = ${top_srcdir}/src/*.[ch]
+#
+# cyclo-$(PACKAGE).html: $(CYCLO_SOURCES)
+# 	$(PMCCABE) $(CYCLO_SOURCES) \
+# 		| sort -nr \
+# 		| $(AWK) -f ${top_srcdir}/build-aux/pmccabe2html \
+# 			-v lang=html -v name="$(PACKAGE_NAME)" \
+# 			-v vcurl="http://git.savannah.gnu.org/gitweb/?p=$(PACKAGE).git;a=blob;f=%FILENAME%;hb=HEAD" \
+# 			-v url="http://www.gnu.org/software/$(PACKAGE)/" \
+# 			-v css=${top_srcdir}/build-aux/pmccabe.css \
+# 			-v cut_dir=${top_srcdir}/ \
+# 			> $@-tmp
+# 	mv $@-tmp $@
+#
+# The variables available are:
+#   lang     output language, either 'html' or 'wiki'
+#   name     project name
+#   url      link to project's home page
+#   vcurl    URL to version controlled source code browser,
+#            a %FILENAME% in the string is replaced with the relative
+#            source filename
+#   css      CSS stylesheet filename, included verbatim in HTML output
+#   css_url  link to CSS stylesheet, an URL
+
+# Prologue & configuration
+# Configuration: section toggles, report language, risk thresholds, and
+# the boilerplate text emitted around the HTML/wiki report.
+BEGIN {
+    section_global_stats_p = 1
+    section_function_cyclo_p = 1
+
+    # "html" or "wiki"
+    package_name = name
+    output_lang = lang
+
+    # General Options
+    # Risk-band boundaries for the *modified* cyclomatic complexity;
+    # all comparisons below use ">", so the bands are:
+    # <=10 simple, 11-20 moderate, 21-50 high, >50 untestable.
+    cyclo_simple_max = 10
+    cyclo_moderate_max = 20
+    cyclo_high_max = 50
+    source_file_link_tmpl = vcurl
+
+    # HTML options
+    if (url != "")
+    {
+	html_prolog = "<a href=\"" url "\">Back to " package_name " Homepage</a><br/><br/>"
+    }
+    html_epilog = "<hr color=\"black\" size=\"2\"/> \
+Copyright (c) 2007, 2008 Free Software Foundation, Inc."
+    html_doctype = "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \
+\"http://www.w3.org/TR/html401/loose.dtd\">"
+    html_comment = "<!-- Generated by gnulib's pmccabe2html at " systime() " -->"
+    html_title = "Cyclomatic Complexity report for " package_name
+
+    # Wiki options
+    wiki_prolog = "{{Note|This page has been automatically generated}}"
+    wiki_epilog = ""
+
+    # Internal variables
+    # Number of functions scanned so far; arrays below are 1-indexed by it.
+    nfuncs = 0;
+}
+
+# Functions
+
+# Walk the mcyclo[] array (modified cyclomatic complexity per function)
+# and accumulate the summary counters: total function count, maximum
+# complexity seen, and one counter per risk band.  Called once from END,
+# after all input records have been scanned.
+function build_stats()
+{
+    # Maximum modified cyclo
+    for (fcn in mcyclo)
+    {
+        num_of_functions++
+        if (mcyclo[fcn] > max_mcyclo)
+        {
+            max_mcyclo = mcyclo[fcn]
+        }
+
+        # Classify into exactly one band (highest threshold first).
+        if (mcyclo[fcn] > cyclo_high_max)
+        {
+            num_of_untestable_functions++
+        }
+        else if (mcyclo[fcn] > cyclo_moderate_max)
+        {
+            num_of_high_functions++
+        }
+        else if (mcyclo[fcn] > cyclo_simple_max)
+        {
+            num_of_moderate_functions++
+        }
+        else
+        {
+            num_of_simple_functions++
+        }
+    }
+}
+
+# Full HTML function table: name, modified cyclo, statement count,
+# line count and source-file columns (cyclo and first-line omitted).
+function html_fnc_table_complete (caption)
+{
+    html_fnc_table(caption, 1, 1, 0, 1, 1, 0, 1)
+}
+
+# Abbreviated HTML function table: only name, modified cyclo and line
+# count columns.  NOTE(review): not called anywhere in this script.
+function html_fnc_table_abbrev (caption)
+{
+    html_fnc_table(caption, 1, 1, 0, 0, 1, 0, 0)
+}
+
+
+# Emit the HTML per-function table: optional caption, one header row,
+# then one row per scanned function.  The *_p flags select which
+# columns appear and are forwarded unchanged to html_fnc_header() and
+# html_fnc().
+function html_fnc_table (caption,
+                         fname_p,
+                         mcyclo_p,
+                         cyclo_p,
+                         num_statements_p,
+                         num_lines_p,
+                         first_line_p,
+                         file_p)
+{
+    print "<table width=\"90%\" class=\"function_table\" cellpadding=\"0\" cellspacing=\"0\">"
+    if (caption != "")
+    {
+        print "<caption class=\"function_table_caption\">" caption "</caption>"
+    }
+    html_fnc_header(fname_p,
+                    mcyclo_p,
+                    cyclo_p,
+                    num_statements_p,
+                    num_lines_p,
+                    first_line_p,
+                    file_p)
+    for (nfnc = 1; nfnc <= nfuncs; nfnc++)
+    {
+        html_fnc(nfnc,
+                 fname_p,
+                 mcyclo_p,
+                 cyclo_p,
+                 num_statements_p,
+                 num_lines_p,
+                 first_line_p,
+                 file_p)
+    }
+    print "</table>"
+}
+
+# Emit the HTML document head: doctype, meta tags, the show_hide()
+# JavaScript helper used to toggle per-function source listings, and
+# the stylesheet (linked via css_url and/or embedded verbatim from the
+# file named by css), then open <body>.
+function html_header ()
+{
+    print html_doctype
+    print "<html>"
+    print html_comment
+    print "<head>"
+    print "<title>" html_title "</title>"
+    print ""
+    print "<meta name=\"description\" content=\"" html_title "\">"
+    print "<meta name=\"keywords\" content=\"" html_title "\">"
+    print "<meta name=\"resource-type\" content=\"document\">"
+    print "<meta name=\"distribution\" content=\"global\">"
+    print "<meta name=\"Generator\" content=\"pmccabe2html\">"
+    print "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">"
+    print "<script language=\"javascript\" type=\"text/javascript\">"
+    # Toggles display of the element idCapa and flips the arrow glyph in
+    # idButton; an optional third argument forces "visible"/hidden state.
+    print "function show_hide(idCapa, idButton, fuerzaVisibilidad)\
+{\
+        var button = document.getElementById(idButton);\
+	var capa = document.getElementById(idCapa);\
+	if (capa)\
+	{\
+		if (fuerzaVisibilidad && fuerzaVisibilidad!=\"\") {\
+			if (fuerzaVisibilidad==\"visible\") capa.style.display=\"\";\
+			else capa.style.display=\"none\";\
+		}\
+		else\
+		{\
+			if (capa.style.display == \"none\") {\
+				capa.style.display = \"\";\
+                                button.innerHTML = \"&uarr;\";\
+			} else {\
+				capa.style.display = \"none\";\
+                                button.innerHTML = \"&darr;\";     \
+			}\
+		}\
+	}\
+}"
+    print "</script>"
+
+
+    if (css_url != "")
+    {
+        print "<link rel=\"stylesheet\" href=\"" css_url "\" type =\"text/css\" media=\"screen\"/>"
+    }
+    if (css != "")
+    {
+        # Inline the stylesheet file verbatim.
+        print "<style type =\"text/css\" media=\"screen\">"
+	print "<!--"
+        while ((getline cssline < css) > 0)
+        {
+	    print cssline
+	}
+        print "-->"
+	# NOTE(review): "</style />" is not valid HTML; "</style>" was
+	# probably intended (browsers generally tolerate it).
+	print "</style />"
+	close(css)
+    }
+    print "</head>"
+    print "<body lang=\"en\" bgcolor=\"#FFFFFF\" text=\"#000000\" link=\"#0000FF\" \
+vlink=\"#800080\" alink=\"#FF0000\">"
+}
+
+# Close the HTML document opened by html_header().
+function html_footer ()
+{
+    print "</body>"
+    print "</html>"
+}
+
+# Emit the header row of the HTML function table.  Each *_p flag adds
+# the corresponding column header; fname_p adds two cells (an empty one
+# for the show/hide toggle plus "Function Name"), matching the two cells
+# html_fnc() emits for the name column.
+function html_fnc_header (fname_p,
+                          mcyclo_p,
+                          cyclo_p,
+                          num_statements_p,
+                          num_lines_p,
+                          first_line_p,
+                          file_p)
+{
+    print "<tr class=\"function_table_header\">"
+    if (fname_p)
+    {
+        # Function name
+        print "<td class=\"function_table_header_entry\">"
+        print ""
+        print "</td>"
+
+        print "<td class=\"function_table_header_entry\">"
+        print "Function Name"
+        print "</td>"
+    }
+    if (mcyclo_p)
+    {
+        # Modified cyclo
+        print "<td class=\"function_table_header_entry\">"
+        print "Modified Cyclo"
+        print "</td>"
+    }
+    if (cyclo_p)
+    {
+        # Cyclo
+        print "<td class=\"function_table_header_entry\">"
+        print "Cyclomatic"
+        print "<br/>"
+        print "Complexity"
+        print "</td>"
+    }
+    if (num_statements_p)
+    {
+        print "<td class=\"function_table_header_entry\">"
+        print "Number of"
+        print "<br/>"
+        print "Statements"
+        print "</td>"
+    }
+    if (num_lines_p)
+    {
+        print "<td class=\"function_table_header_entry\">"
+        print "Number of"
+        print "<br/>"
+        print "Lines"
+        print "</td>"
+    }
+    if (first_line_p)
+    {
+        print "<td class=\"function_table_header_entry\">"
+        print "First Line"
+        print "</td>"
+    }
+    if (file_p)
+    {
+        print "<td class=\"function_table_header_entry\">"
+        print "Source File"
+        print "</td>"
+
+    }
+    print "</tr>"
+}
+
+# Emit one HTML table row for function number nfun (1-based index into
+# the fnames/mcyclo/cyclo/... arrays filled by the main scan rule).
+# The row's CSS class reflects the risk band of the modified cyclo.
+# For functions above cyclo_simple_max, a hidden second row embeds the
+# function's source (read from the temporary "<name><n>_fn.txt" file
+# written during scanning, then deleted) behind a show/hide toggle.
+#
+# NOTE(review): the closing </tr> of the main row is printed only
+# inside the "if (file_p)" branch, so callers passing file_p=0 (e.g.
+# html_fnc_table_abbrev) produce unclosed <tr> elements.
+function html_fnc (nfun,
+                   fname_p,
+                   mcyclo_p,
+                   cyclo_p,
+                   num_statements_p,
+                   num_lines_p,
+                   first_line_p,
+                   file_p)
+{
+    fname = fnames[nfun]
+
+    # Function name
+    trclass = "function_entry_simple"
+    if (mcyclo[nfun] > cyclo_high_max)
+    {
+        trclass="function_entry_untestable"
+    }
+    else if (mcyclo[nfun] > cyclo_moderate_max)
+    {
+        trclass="function_entry_high"
+    }
+    else if (mcyclo[nfun] > cyclo_simple_max)
+    {
+        trclass="function_entry_moderate"
+    }
+
+    print "<tr class=\"" trclass "\">"
+    if (fname_p)
+    {
+        print "<td class=\"function_entry_filename\">"
+        if (file_p && mcyclo[nfun] > cyclo_simple_max)
+        {
+            # Toggle arrow; only non-simple functions have embedded source.
+            print "<a href=\"javascript:void(0);\" title=\"show/hide function source\" onClick=\"javascript:show_hide('" fname "_src', '" fname "_button')\">\
+<span id=\"" fname "_button\">&darr;</span></a>"
+        }
+        else
+        {
+            print "&nbsp;"
+        }
+        print "</td>"
+
+        print "<td class=\"function_entry_name\">"
+        print fname
+        print "</td>"
+    }
+    if (mcyclo_p)
+    {
+        # Modified cyclo
+        print "<td class=\"function_entry_cyclo\">"
+        print mcyclo[nfun]
+        print "</td>"
+    }
+    if (cyclo_p)
+    {
+        # Cyclo
+        print "<td class=\"function_entry_cyclo\">"
+        print cyclo[nfun]
+        print "</td>"
+    }
+    if (num_statements_p)
+    {
+        # Number of statements
+        print "<td class=\"function_entry_number\">"
+        print num_statements[nfun]
+        print "</td>"
+    }
+    if (num_lines_p)
+    {
+        # Number of lines
+        print "<td class=\"function_entry_number\">"
+        print num_lines[nfun]
+        print "</td>"
+    }
+    if (first_line_p)
+    {
+        # First line
+        print "<td class=\"function_entry_number\">"
+        print first_line[nfun]
+        print "</td>"
+    }
+    if (file_p)
+    {
+        href = ""
+        if (source_file_link_tmpl != "")
+        {
+            # Get href target
+            href = source_file_link_tmpl
+            sub(/%FILENAME%/, file[nfun], href)
+        }
+
+        # Source file
+        print "<td class=\"function_entry_filename\">"
+        if (href != "")
+        {
+            print "<a href=\"" href "\">" file[nfun] "</a>"
+        }
+        else
+        {
+            print file[nfun]
+        }
+
+        print "</td>"
+
+
+        print "</tr>"
+
+        if (mcyclo[nfun] > cyclo_simple_max)
+        {
+            print "<tr>"
+
+            # Hidden source row spans all visible columns.
+            num_columns = 1;
+            if (fname_p) { num_columns++ }
+            if (mcyclo_p) { num_columns++ }
+            if (cyclo_p) { num_columns++ }
+            if (num_statements_p) { num_columns++ }
+            if (num_lines_p) { num_columns++ }
+            if (first_line_p) { num_columns++ }
+            if (file_p) { num_columns++ }
+
+            print "<td colspan=\"" num_columns "\" height=\"0\">"
+            print "<div id=\"" fname "_src\" class=\"function_src\" style=\"position: relative; display: none;\">"
+            print "<pre class=\"function_src\">"
+
+            while ((getline codeline < (fname nfun "_fn.txt")) > 0)
+            {
+                # NOTE(review): this escaping looks incomplete -- sub()
+                # replaces only the first occurrence per line, the /\\</
+                # and /\\>/ patterns match backslash-< / backslash->
+                # rather than bare < and >, and "&" is escaped *after*
+                # "&lt;"/"&gt;" are inserted.  Kept as upstream gnulib
+                # has it; TODO confirm against current gnulib.
+                sub(/\\</, "&lt;", codeline)
+                sub(/\\>/, "&gt;", codeline)
+                sub(/&/, "&amp;", codeline)
+
+                print codeline
+            }
+            close(fname nfun "_fn.txt")
+            # Remove the temporary per-function source file.
+            system("rm " "'" fname "'" nfun "_fn.txt")
+            print "</pre>"
+            print "</div>"
+            print "</td>"
+            print "</tr>"
+        }
+
+    }
+}
+
+# Emit the HTML "Summary" section: a five-row table with the total
+# function count and the per-risk-band counters computed by
+# build_stats().
+function html_global_stats ()
+{
+    print "<div class=\"section_title\">Summary</div>"
+
+    print "<table class=\"summary_table\">"
+    # Total number of functions
+    print "<tr>"
+    print "<td class=\"summary_header_entry\">"
+    print "Total number of functions"
+    print "</td>"
+    print "<td class=\"summary_number_entry\">"
+    print num_of_functions
+    print "</td>"
+    print "</tr>"
+    # Number of simple functions
+    print "<tr>"
+    print "<td class=\"summary_header_entry\">"
+    print "Number of low risk functions"
+    print "</td>"
+    print "<td class=\"summary_number_entry\">"
+    print num_of_simple_functions
+    print "</td>"
+    print "</tr>"
+    # Number of moderate functions
+    print "<tr>"
+    print "<td class=\"summary_header_entry\">"
+    print "Number of moderate risk functions"
+    print "</td>"
+    print "<td class=\"summary_number_entry\">"
+    print num_of_moderate_functions
+    print "</td>"
+    print "</tr>"
+    # Number of high functions
+    print "<tr>"
+    print "<td class=\"summary_header_entry\">"
+    print "Number of high risk functions"
+    print "</td>"
+    print "<td class=\"summary_number_entry\">"
+    print num_of_high_functions
+    print "</td>"
+    print "</tr>"
+    # Number of untestable functions
+    print "<tr>"
+    print "<td class=\"summary_header_entry\">"
+    print "Number of untestable functions"
+    print "</td>"
+    print "<td class=\"summary_number_entry\">"
+    print num_of_untestable_functions
+    print "</td>"
+    print "</tr>"
+    print "</table>"
+    print "<br/>"
+}
+
+# Emit the HTML "Details for all functions" section: first a legend
+# table mapping complexity ranges to risk evaluations (derived from the
+# cyclo_*_max thresholds), then the full per-function table.
+function html_function_cyclo ()
+{
+    print "<div class=\"section_title\">Details for all functions</div>"
+
+    print "<table class=\"ranges_table\">"
+    print "<tr>"
+    print "<td class=\"ranges_header_entry\">"
+    print "&nbsp;"
+    print "</td>"
+    print "<td class=\"ranges_header_entry\">"
+    print "Cyclomatic Complexity"
+    print "</td>"
+    print "<td class=\"ranges_header_entry\">"
+    print "Risk Evaluation"
+    print "</td>"
+    print "</tr>"
+    # Simple
+    print "<tr>"
+    print "<td class=\"ranges_entry_simple\">"
+    print "&nbsp;"
+    print "</td>"
+    print "<td class=\"ranges_entry\">"
+    print "0 - " cyclo_simple_max
+    print "</td>"
+    print "<td class=\"ranges_entry\">"
+    print "Simple module, without much risk"
+    print "</td>"
+    print "</tr>"
+    # Moderate
+    print "<tr>"
+    print "<td class=\"ranges_entry_moderate\">"
+    print "&nbsp;"
+    print "</td>"
+    print "<td class=\"ranges_entry\">"
+    print cyclo_simple_max + 1 " - " cyclo_moderate_max
+    print "</td>"
+    print "<td class=\"ranges_entry\">"
+    print "More complex module, moderate risk"
+    print "</td>"
+    print "</tr>"
+    # High
+    print "<tr>"
+    print "<td class=\"ranges_entry_high\">"
+    print "&nbsp;"
+    print "</td>"
+    print "<td class=\"ranges_entry\">"
+    print cyclo_moderate_max + 1 " - " cyclo_high_max
+    print "</td>"
+    print "<td class=\"ranges_entry\">"
+    print "Complex module, high risk"
+    print "</td>"
+    print "</tr>"
+    # Untestable
+    print "<tr>"
+    print "<td class=\"ranges_entry_untestable\">"
+    print "&nbsp;"
+    print "</td>"
+    print "<td class=\"ranges_entry\">"
+    print "greater than " cyclo_high_max
+    print "</td>"
+    print "<td class=\"ranges_entry\">"
+    print "Untestable module, very high risk"
+    print "</td>"
+    print "</tr>"
+    print "</table>"
+    print "<br/>"
+    html_fnc_table_complete("")
+}
+
+# Wiki-markup counterpart of html_global_stats(): emit the summary
+# counters as a MediaWiki table.
+function wiki_global_stats ()
+{
+    print "{| class=\"cyclo_summary_table\""
+    # Total number of functions
+    print "|-"
+    print "| class=\"cyclo_summary_header_entry\" | Total number of functions"
+    print "| class=\"cyclo_summary_number_entry\" |" num_of_functions
+    # Number of simple functions
+    print "|-"
+    print "| class=\"cyclo_summary_header_entry\" | Number of low risk functions"
+    print "| class=\"cyclo_summary_number_entry\" |" num_of_simple_functions
+    # Number of moderate functions
+    print "|-"
+    print "| class=\"cyclo_summary_header_entry\" | Number of moderate risk functions"
+    print "| class=\"cyclo_summary_number_entry\" |" num_of_moderate_functions
+    # Number of high functions
+    print "|-"
+    print "| class=\"cyclo_summary_header_entry\" | Number of high risk functions"
+    print "| class=\"cyclo_summary_number_entry\" |" num_of_high_functions
+    # Number of untestable functions
+    print "|-"
+    print "| class=\"cyclo_summary_header_entry\" | Number of untestable functions"
+    print "| class=\"cyclo_summary_number_entry\" |" num_of_untestable_functions
+    print "|}"
+}
+
+# Wiki-markup counterpart of html_function_cyclo(): emit the risk-band
+# legend as a MediaWiki table, then the full per-function table.
+function wiki_function_cyclo ()
+{
+    print "==Details for all functions=="
+
+    print "Used ranges:"
+
+    print "{| class =\"cyclo_ranges_table\""
+    print "|-"
+    print "| class=\"cyclo_ranges_header_entry\" | "
+    print "| class=\"cyclo_ranges_header_entry\" | Cyclomatic Complexity"
+    print "| class=\"cyclo_ranges_header_entry\" | Risk Evaluation"
+    # Simple
+    print "|-"
+    print "| class=\"cyclo_ranges_entry_simple\" | "
+    print "| class=\"cyclo_ranges_entry\" | 0 - " cyclo_simple_max
+    print "| class=\"cyclo_ranges_entry\" | Simple module, without much risk"
+    # Moderate
+    print "|-"
+    print "| class=\"cyclo_ranges_entry_moderate\" | "
+    print "| class=\"cyclo_ranges_entry\" |" cyclo_simple_max + 1 " - " cyclo_moderate_max
+    print "| class=\"cyclo_ranges_entry\" | More complex module, moderate risk"
+    # High
+    print "|-"
+    print "| class=\"cyclo_ranges_entry_high\" | "
+    print "| class=\"cyclo_ranges_entry\" |" cyclo_moderate_max + 1 " - " cyclo_high_max
+    print "| class=\"cyclo_ranges_entry\" | Complex module, high risk"
+    # Untestable
+    print "|-"
+    print "| class=\"cyclo_ranges_entry_untestable\" | "
+    print "| class=\"cyclo_ranges_entry\" | greater than " cyclo_high_max
+    print "| class=\"cyclo_ranges_entry\" | Untestable module, very high risk"
+    print "|}"
+
+    print ""
+    print ""
+    wiki_fnc_table_complete("")
+}
+
+# Full wiki function table: name, modified cyclo, statement count,
+# line count and source-file columns (cyclo and first-line omitted).
+function wiki_fnc_table_complete (caption)
+{
+    wiki_fnc_table(caption, 1, 1, 0, 1, 1, 0, 1)
+}
+
+# Abbreviated wiki function table: function-name column only.
+# NOTE(review): not called anywhere in this script.
+function wiki_fnc_table_abbrev (caption)
+{
+    wiki_fnc_table(caption, 1, 0, 0, 0, 0, 0, 0)
+}
+
+# Emit the wiki per-function table: table open, optional caption, one
+# header row, then one row per scanned function.  The *_p flags select
+# columns and are forwarded to wiki_fnc_header() and wiki_fnc().
+#
+# NOTE(review): the table-open line below ends with a stray ">" --
+# MediaWiki table syntax does not use one; it looks like a leftover
+# from the HTML variant of this function.
+function wiki_fnc_table (caption,
+                         fname_p,
+                         mcyclo_p,
+                         cyclo_p,
+                         num_statements_p,
+                         num_lines_p,
+                         first_line_p,
+                         file_p)
+{
+    print "{| width=\"90%\" class=\"cyclo_function_table\" cellpadding=\"0\" cellspacing=\"0\">"
+    if (caption != "")
+    {
+        print "|+" caption
+    }
+    wiki_fnc_header(fname_p,
+                    mcyclo_p,
+                    cyclo_p,
+                    num_statements_p,
+                    num_lines_p,
+                    first_line_p,
+                    file_p)
+    for (nfnc = 1; nfnc <= nfuncs; nfnc++)
+    {
+        wiki_fnc(nfnc,
+                 fname_p,
+                 mcyclo_p,
+                 cyclo_p,
+                 num_statements_p,
+                 num_lines_p,
+                 first_line_p,
+                 file_p)
+    }
+    print "|}"
+}
+
+# Emit the header cells of the wiki function table; each *_p flag adds
+# the corresponding column header ("!" rows in MediaWiki syntax).
+function wiki_fnc_header (fname_p,
+                          mcyclo_p,
+                          cyclo_p,
+                          num_statements_p,
+                          num_lines_p,
+                          first_line_p,
+                          file_p)
+{
+    if (fname_p)
+    {
+        # Function name
+        print "! class=\"cyclo_function_table_header_entry\" | Function Name"
+    }
+    if (mcyclo_p)
+    {
+        # Modified cyclo
+        print "! class=\"cyclo_function_table_header_entry\" | Modified Cyclo"
+    }
+    if (cyclo_p)
+    {
+        # Cyclo
+        print "! class=\"cyclo_function_table_header_entry\" | Cyclomatic Complexity"
+    }
+    if (num_statements_p)
+    {
+        print "! class=\"cyclo_function_table_header_entry\" | Number of Statements"
+    }
+    if (num_lines_p)
+    {
+        print "! class=\"cyclo_function_table_header_entry\" | Number of Lines"
+    }
+    if (first_line_p)
+    {
+        print "! class=\"cyclo_function_table_header_entry\" | First Line"
+    }
+    if (file_p)
+    {
+        print "! class=\"cyclo_function_table_header_entry\" | Source File"
+    }
+}
+
+# Emit one wiki table row for function number nfnc (1-based index into
+# the arrays filled by the main scan rule).  The row class reflects the
+# risk band of the modified cyclo, mirroring html_fnc(); unlike the
+# HTML variant, no per-function source listing is embedded.
+function wiki_fnc (nfnc,
+                   fname_p,
+                   mcyclo_p,
+                   cyclo_p,
+                   num_statements_p,
+                   num_lines_p,
+                   first_line_p,
+                   file_p)
+{
+   fname = fnames[nfnc]
+
+    # Function name
+    trclass = "cyclo_function_entry_simple"
+    if (mcyclo[nfnc] > cyclo_high_max)
+    {
+        trclass="cyclo_function_entry_untestable"
+    }
+    else if (mcyclo[nfnc] > cyclo_moderate_max)
+    {
+        trclass="cyclo_function_entry_high"
+    }
+    else if (mcyclo[nfnc] > cyclo_simple_max)
+    {
+        trclass="cyclo_function_entry_moderate"
+    }
+
+    print "|- class=\"" trclass "\""
+    if (fname_p)
+    {
+        print "| class=\"cyclo_function_entry_name\" |" fname
+    }
+    if (mcyclo_p)
+    {
+        # Modified cyclo
+        print "| class=\"cyclo_function_entry_cyclo\" |" mcyclo[nfnc]
+    }
+    if (cyclo_p)
+    {
+        # Cyclo
+        print "| class=\"cyclo_function_entry_cyclo\" |" cyclo[nfnc]
+    }
+    if (num_statements_p)
+    {
+        # Number of statements
+        print "| class=\"cyclo_function_entry_number\" |" num_statements[nfnc]
+    }
+    if (num_lines_p)
+    {
+        # Number of lines
+        print "| class=\"cyclo_function_entry_number\" |" num_lines[nfnc]
+    }
+    if (first_line_p)
+    {
+        # First line
+        print "| class=\"cyclo_function_entry_number\" |" first_line[nfnc]
+    }
+    if (file_p)
+    {
+        href = ""
+        if (source_file_link_tmpl != "")
+        {
+            # Get href target
+            href = source_file_link_tmpl
+            sub(/%FILENAME%/, file[nfnc], href)
+        }
+
+        # Source file: external link when a URL template was given,
+        # otherwise the bare filename in brackets.
+        print "| class=\"cyclo_function_entry_filename\" |" \
+            ((href != "") ? "[" href " " file[nfnc] "]" : "[" file[nfnc] "]")
+    }
+}
+
+# Scan data from a line
+{
+    function_name = $7
+
+    nfuncs++;
+    fnames[nfuncs] = function_name
+    mcyclo[nfuncs] = $1
+    cyclo[nfuncs] = $2
+    num_statements[nfuncs] = $3
+    first_line[nfuncs] = $4
+    num_lines[nfuncs] = $5
+
+    # Build the filename from the file_spec ($6)
+    begin_util_path = index($6, cut_dir)
+    tmpfilename = substr($6, begin_util_path + length(cut_dir))
+    sub(/\([0-9]+\):/, "", tmpfilename)
+    file[nfuncs] = tmpfilename
+
+    if (mcyclo[nfuncs] > cyclo_simple_max)
+    {
+        # Extract function contents to a fn_txt file
+        filepath = $6
+
+        sub(/\([0-9]+\):/, "", filepath)
+        num_line = 0
+
+        while ((getline codeline < filepath) > 0)
+        {
+            num_line++;
+            if ((num_line >= first_line[nfuncs]) &&
+                (num_line < first_line[nfuncs] + num_lines[nfuncs]))
+            {
+                print codeline > (function_name nfuncs "_fn.txt")
+            }
+        }
+        close (function_name nfuncs "_fn.txt")
+        close(filepath)
+    }
+
+    # Initial values for statistics variables
+    num_of_functions = 0
+    max_mcyclo = 0
+    max_function_length = 0
+    num_of_simple_functions = 0
+    num_of_moderate_functions = 0
+    num_of_high_functions = 0
+    num_of_untestable_functions = 0
+}
+
+# Epilogue
+# Report assembly: header (HTML only), prolog, title/timestamp,
+# summary section, per-function section, epilog, footer (HTML only),
+# each gated on output_lang and the section_*_p toggles set in BEGIN.
+END {
+    # Print header (only for html)
+    if (output_lang == "html")
+    {
+        html_header()
+    }
+
+    # Print prolog
+    if ((output_lang == "html") &&
+        (html_prolog != ""))
+    {
+        print html_prolog
+    }
+    if ((output_lang == "wiki") &&
+        (wiki_prolog != ""))
+    {
+        print wiki_prolog
+    }
+
+    if (output_lang == "html")
+    {
+        print "<div class=\"page_title\">" package_name " Cyclomatic Complexity Report</div>"
+        # NOTE(review): the <span> opened below is closed with "</div>"
+        # instead of "</span>" -- mismatched tags in the generated HTML.
+        print "<p>Report generated at: <span class=\"report_timestamp\">" strftime() "</div></p>"
+    }
+    if (output_lang == "wiki")
+    {
+        print "==" package_name " Cyclomatic Complexity Report=="
+        print "Report generated at: '''" strftime() "'''"
+    }
+
+    if (section_global_stats_p)
+    {
+        # Compute the summary counters before printing them.
+        build_stats()
+
+        if (output_lang == "html")
+        {
+            html_global_stats()
+        }
+        if (output_lang == "wiki")
+        {
+            wiki_global_stats()
+        }
+    }
+    if (section_function_cyclo_p)
+    {
+        if (output_lang == "html")
+        {
+            html_function_cyclo()
+        }
+        if (output_lang == "wiki")
+        {
+            wiki_function_cyclo()
+        }
+    }
+
+    # Print epilog
+    if ((output_lang == "html") &&
+        (html_epilog != ""))
+    {
+        print html_epilog
+    }
+    if ((output_lang == "wiki") &&
+        (wiki_epilog != ""))
+    {
+        print wiki_epilog
+    }
+
+    # Print footer (html only)
+    if (output_lang == "html")
+    {
+        html_footer()
+    }
+}
+
+# End of pmccabe2html

+ 457 - 0
build-aux/svn2cl.xsl

@@ -0,0 +1,457 @@
+<?xml version="1.0" encoding="utf-8"?>
+
+<!--
+
+   svn2cl.xsl - xslt stylesheet for converting svn log to a normal
+                changelog
+
+   version 0.13
+
+   Usage (replace ++ with two minus signs which aren't allowed
+   inside xml comments):
+     svn ++verbose ++xml log | \
+       xsltproc ++stringparam strip-prefix `basename $(pwd)` \
+                ++stringparam linelen 75 \
+                ++stringparam groupbyday yes \
+                ++stringparam separate-daylogs yes \
+                ++stringparam include-rev yes \
+                ++stringparam include-actions yes \
+                ++stringparam breakbeforemsg yes/2 \
+                ++stringparam reparagraph yes \
+                ++stringparam authorsfile FILE \
+                ++stringparam ignore-message-starting \
+                svn2cl.xsl - > ChangeLog
+
+   This file is based on several implementations of this conversion
+   that I was not completely happy with and some other common
+   xslt constructs found on the web.
+
+   Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009 Arthur de Jong.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. The name of the author may not be used to endorse or promote
+      products derived from this software without specific prior
+      written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+   IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+   WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+   DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+   DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+   GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+   IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+   OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+   IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-->
+
+<xsl:stylesheet
+  version="1.0"
+  xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+
+ <xsl:output
+   method="text"
+   encoding="utf-8"
+   media-type="text/plain"
+   omit-xml-declaration="yes"
+   standalone="yes"
+   indent="no" />
+
+ <xsl:strip-space elements="*" />
+
+ <!-- the prefix of pathnames to strip -->
+ <xsl:param name="strip-prefix" select="'/'" />
+
+ <!-- the length of a line to wrap messages at -->
+ <xsl:param name="linelen" select="75" />
+
+ <!-- whether entries should be grouped by day -->
+ <xsl:param name="groupbyday" select="'no'" />
+
+ <!-- whether to separate log messages by empty lines -->
+ <xsl:param name="separate-daylogs" select="'no'" />
+
+ <!-- whether a revision number should be included -->
+ <xsl:param name="include-rev" select="'no'" />
+
+ <!-- whether action labels should be added to files -->
+ <xsl:param name="include-actions" select="'no'" />
+
+ <!-- whether the log message should start on a new line -->
+ <xsl:param name="breakbeforemsg" select="'no'" />
+
+ <!-- whether the message should be rewrapped within one paragraph -->
+ <xsl:param name="reparagraph" select="'no'" />
+
+ <!-- whether certain messages should be ignored -->
+ <xsl:param name="ignore-message-starting" select="''" />
+
+ <!-- location of authors file if any -->
+ <xsl:param name="authorsfile" select="''" />
+ <xsl:key name="author-lookup" match="author" use="@uid" />
+ <xsl:variable name="authors-top" select="document($authorsfile)/authors" />
+
+ <!-- determine the path part to strip -->
+ <xsl:variable name="strip-path">
+  <!-- if strip-prefix does not start with a slash, prepend it -->
+  <xsl:if test="not(starts-with($strip-prefix,'/'))">
+   <xsl:text>/</xsl:text>
+  </xsl:if>
+  <!-- the prefix itself -->
+  <xsl:value-of select="$strip-prefix" />
+  <!-- if strip-prefix does not end with a slash, append it -->
+  <xsl:if test="substring($strip-prefix,string-length($strip-prefix),1)!='/'">
+   <xsl:text>/</xsl:text>
+  </xsl:if>
+ </xsl:variable>
+
+ <!-- match the topmost log entry -->
+ <xsl:template match="log">
+  <xsl:choose>
+   <xsl:when test="$ignore-message-starting != ''">
+    <!-- only handle logentries which don't contain the string -->
+    <xsl:apply-templates select="logentry[not(starts-with(msg,$ignore-message-starting))]" />
+   </xsl:when>
+   <xsl:otherwise>
+    <xsl:apply-templates select="logentry" />
+   </xsl:otherwise>
+  </xsl:choose>
+  <!-- add newlines at the end of the changelog -->
+  <xsl:text>&#10;</xsl:text>
+ </xsl:template>
+
+ <!-- format one entry from the log -->
+ <xsl:template match="logentry">
+  <xsl:choose>
+   <!-- if we're grouping we should omit some headers -->
+   <xsl:when test="$groupbyday='yes'">
+    <!-- fetch previous entry's date -->
+    <xsl:variable name="prevdate">
+     <xsl:apply-templates select="preceding-sibling::logentry[position()=1]/date" />
+    </xsl:variable>
+    <!-- fetch previous entry's author -->
+    <xsl:variable name="prevauthor">
+     <xsl:value-of select="normalize-space(preceding-sibling::logentry[position()=1]/author)" />
+    </xsl:variable>
+    <!-- fetch this entry's date -->
+    <xsl:variable name="date">
+     <xsl:apply-templates select="date" />
+    </xsl:variable>
+    <!-- fetch this entry's author -->
+    <xsl:variable name="author">
+     <xsl:value-of select="normalize-space(author)" />
+    </xsl:variable>
+    <!-- check if header is changed -->
+    <xsl:if test="($prevdate!=$date) or ($prevauthor!=$author)">
+     <!-- add newline -->
+     <xsl:if test="not(position()=1)">
+      <xsl:text>&#10;</xsl:text>
+     </xsl:if>
+     <!-- date -->
+     <xsl:value-of select="$date" />
+     <!-- two spaces -->
+     <xsl:text>&#32;&#32;</xsl:text>
+     <!-- author's name -->
+     <xsl:apply-templates select="author" />
+     <!-- two newlines -->
+     <xsl:text>&#10;</xsl:text>
+     <xsl:if test="$separate-daylogs!='yes'"><xsl:text>&#10;</xsl:text></xsl:if>
+    </xsl:if>
+   </xsl:when>
+   <!-- write the log header -->
+   <xsl:otherwise>
+    <!-- add newline -->
+    <xsl:if test="not(position()=1)">
+     <xsl:text>&#10;</xsl:text>
+    </xsl:if>
+    <!-- date -->
+    <xsl:apply-templates select="date" />
+    <!-- two spaces -->
+    <xsl:text>&#32;&#32;</xsl:text>
+    <!-- author's name -->
+    <xsl:apply-templates select="author" />
+    <!-- two newlines -->
+    <xsl:text>&#10;&#10;</xsl:text>
+   </xsl:otherwise>
+  </xsl:choose>
+  <!-- get paths string -->
+  <xsl:variable name="paths">
+   <xsl:apply-templates select="paths" />
+  </xsl:variable>
+  <!-- get revision number -->
+  <xsl:variable name="rev">
+   <xsl:if test="$include-rev='yes'">
+    <xsl:text>[r</xsl:text>
+    <xsl:value-of select="@revision" />
+    <xsl:text>]&#32;</xsl:text>
+   </xsl:if>
+  </xsl:variable>
+  <!-- trim trailing newlines -->
+  <xsl:variable name="msg">
+   <!-- add a line break before the log message -->
+   <xsl:choose>
+    <xsl:when test="$breakbeforemsg='yes'">
+     <xsl:text>&#10;</xsl:text>
+    </xsl:when>
+    <xsl:when test="number($breakbeforemsg)&gt;0">
+     <xsl:call-template name="newlines">
+      <xsl:with-param name="count" select="number($breakbeforemsg)" />
+     </xsl:call-template>
+    </xsl:when>
+   </xsl:choose>
+   <xsl:call-template name="trim-newln">
+    <xsl:with-param name="txt" select="msg" />
+   </xsl:call-template>
+  </xsl:variable>
+  <!-- add newline here if separate-daylogs is in effect -->
+  <xsl:if test="$groupbyday='yes' and $separate-daylogs='yes'"><xsl:text>&#10;</xsl:text></xsl:if>
+  <!-- first line is indented (other indents are done in wrap template) -->
+  <xsl:text>&#9;*&#32;</xsl:text>
+  <!-- set up the text to wrap -->
+  <xsl:variable name="txt">
+   <xsl:value-of select="$rev" />
+   <xsl:if test="$paths!=''">
+    <xsl:value-of select="concat($paths,':&#32;')" />
+   </xsl:if>
+   <xsl:value-of select="$msg" />
+  </xsl:variable>
+  <!-- print the paths and message nicely wrapped -->
+  <xsl:call-template name="wrap">
+   <xsl:with-param name="txt" select="$txt" />
+  </xsl:call-template>
+ </xsl:template>
+
+ <!-- format date -->
+ <xsl:template match="date">
+  <xsl:variable name="date" select="normalize-space(.)" />
+  <!-- output date part -->
+  <xsl:value-of select="substring($date,1,10)" />
+  <!-- output time part -->
+  <xsl:if test="$groupbyday!='yes'">
+   <xsl:text>&#32;</xsl:text>
+   <xsl:value-of select="substring($date,12,5)" />
+  </xsl:if>
+ </xsl:template>
+
+ <!-- format author -->
+ <xsl:template match="author">
+  <xsl:variable name="uid" select="normalize-space(.)" />
+  <!-- try to lookup author in authorsfile -->
+  <xsl:choose>
+   <xsl:when test="$authorsfile!=''">
+    <xsl:for-each select="$authors-top">
+     <xsl:variable name="author" select="key('author-lookup',$uid)" />
+     <!-- present result -->
+     <xsl:choose>
+      <xsl:when test="string($author/.)">
+       <xsl:apply-templates select="$author/node()" mode="copy" />
+      </xsl:when>
+      <xsl:otherwise>
+       <xsl:value-of select="$uid" />
+      </xsl:otherwise>
+     </xsl:choose>
+    </xsl:for-each>
+   </xsl:when>
+   <xsl:otherwise>
+    <xsl:value-of select="$uid" />
+   </xsl:otherwise>
+  </xsl:choose>
+ </xsl:template>
+
+ <!-- copy but normalize text -->
+ <xsl:template match="text()" mode="copy">
+  <xsl:value-of select="normalize-space(.)" />
+ </xsl:template>
+
+ <!-- simple copy template -->
+ <xsl:template match="@*|node()" mode="copy">
+  <xsl:copy>
+   <xsl:apply-templates select="@*|node()" mode="copy" />
+  </xsl:copy>
+ </xsl:template>
+
+ <!-- present a list of paths names -->
+ <xsl:template match="paths">
+  <xsl:choose>
+   <!-- only handle paths that begin with the path and strip the path -->
+   <xsl:when test="$strip-prefix != ''">
+    <!-- filter on all entries within directory -->
+    <xsl:for-each select="path[starts-with(concat(normalize-space(.),'/'),$strip-path)]">
+     <xsl:sort select="normalize-space(.)" data-type="text" />
+     <!-- unless we are the first entry, add a comma -->
+     <xsl:if test="not(position()=1)">
+      <xsl:text>,&#32;</xsl:text>
+     </xsl:if>
+     <!-- get path part -->
+     <xsl:variable name="path" select="substring(normalize-space(.),string-length($strip-path)+1)" />
+     <!-- translate empty string to dot and print result -->
+     <xsl:if test="$path = ''">
+      <xsl:text>.</xsl:text>
+     </xsl:if>
+     <xsl:value-of select="$path" />
+     <!-- add the action flag -->
+     <xsl:if test="$include-actions='yes'">
+      <xsl:apply-templates select="." mode="action"/>
+     </xsl:if>
+    </xsl:for-each>
+   </xsl:when>
+   <!-- print a simple list of all paths -->
+   <xsl:otherwise>
+    <xsl:for-each select="path">
+     <xsl:sort select="normalize-space(.)" data-type="text" />
+     <!-- unless we are the first entry, add a comma -->
+     <xsl:if test="not(position()=1)">
+      <xsl:text>,&#32;</xsl:text>
+     </xsl:if>
+     <!-- print the path name -->
+     <xsl:value-of select="normalize-space(.)" />
+     <!-- add the action flag -->
+     <xsl:if test="$include-actions='yes'">
+      <xsl:apply-templates select="." mode="action"/>
+     </xsl:if>
+    </xsl:for-each>
+   </xsl:otherwise>
+  </xsl:choose>
+ </xsl:template>
+
+ <xsl:template match="path" mode="action">
+  <xsl:choose>
+   <xsl:when test="@action='D'">
+    <xsl:text>[DEL]</xsl:text>
+   </xsl:when>
+   <xsl:when test="@copyfrom-path">
+    <xsl:text>[CPY]</xsl:text>
+   </xsl:when>
+   <xsl:when test="@action='A'">
+    <xsl:text>[ADD]</xsl:text>
+   </xsl:when>
+  </xsl:choose>
+ </xsl:template>
+
+ <!-- string-wrapping template -->
+ <xsl:template name="wrap">
+  <xsl:param name="txt" />
+  <xsl:variable name="normtxt" select="normalize-space($txt)" />
+  <xsl:choose>
+   <xsl:when test="contains($txt,'&#10;')">
+     <!-- text contains newlines, do the first line -->
+     <xsl:call-template name="wrap">
+      <xsl:with-param name="txt" select="substring-before($txt,'&#10;')" />
+     </xsl:call-template>
+     <!-- print tab -->
+     <xsl:text>&#9;&#32;&#32;</xsl:text>
+     <!-- wrap the rest of the text -->
+     <xsl:call-template name="wrap">
+      <xsl:with-param name="txt" select="substring-after($txt,'&#10;')" />
+     </xsl:call-template>
+   </xsl:when>
+   <xsl:when test="(string-length($normtxt) &lt; (($linelen)-9)) or not(contains($normtxt,' '))">
+    <!-- this is easy, nothing to do -->
+    <xsl:value-of select="$normtxt" />
+    <!-- add newline -->
+    <xsl:text>&#10;</xsl:text>
+   </xsl:when>
+   <xsl:otherwise>
+    <!-- find the first line -->
+    <xsl:variable name="tmp" select="substring($normtxt,1,(($linelen)-9))" />
+    <xsl:variable name="line">
+     <xsl:choose>
+      <!-- if our attempt contains spaces wrap on that -->
+      <xsl:when test="contains($tmp,' ')">
+       <xsl:call-template name="find-line">
+        <xsl:with-param name="txt" select="$tmp" />
+       </xsl:call-template>
+      </xsl:when>
+      <!-- otherwise use the first non-space characters from the text -->
+      <xsl:otherwise>
+       <xsl:value-of select="substring-before($normtxt,' ')" />
+      </xsl:otherwise>
+     </xsl:choose>
+    </xsl:variable>
+    <!-- print line -->
+    <xsl:value-of select="$line" />
+    <!-- print newline and tab -->
+    <xsl:text>&#10;&#9;&#32;&#32;</xsl:text>
+    <!-- wrap the rest of the text -->
+    <xsl:call-template name="wrap">
+     <xsl:with-param name="txt" select="normalize-space(substring($normtxt,string-length($line)+1))" />
+    </xsl:call-template>
+   </xsl:otherwise>
+  </xsl:choose>
+ </xsl:template>
+
+ <!-- template to trim line to contain space as last char -->
+ <xsl:template name="find-line">
+  <xsl:param name="txt" />
+  <xsl:choose>
+   <xsl:when test="substring($txt,string-length($txt),1)=' '">
+    <xsl:value-of select="substring($txt,1,string-length($txt)-1)" />
+   </xsl:when>
+   <xsl:otherwise>
+    <xsl:call-template name="find-line">
+     <xsl:with-param name="txt" select="substring($txt,1,string-length($txt)-1)" />
+    </xsl:call-template>
+   </xsl:otherwise>
+  </xsl:choose>
+ </xsl:template>
+
+ <!-- template to trim trailing and starting newlines -->
+ <xsl:template name="trim-newln">
+  <xsl:param name="txt" />
+  <xsl:choose>
+   <!-- find starting newlines -->
+   <xsl:when test="substring($txt,1,1) = '&#10;'">
+    <xsl:call-template name="trim-newln">
+     <xsl:with-param name="txt" select="substring($txt,2)" />
+    </xsl:call-template>
+   </xsl:when>
+   <!-- find trailing newlines -->
+   <xsl:when test="substring($txt,string-length($txt),1) = '&#10;'">
+    <xsl:call-template name="trim-newln">
+     <xsl:with-param name="txt" select="substring($txt,1,string-length($txt)-1)" />
+    </xsl:call-template>
+   </xsl:when>
+   <!-- if the message has paragraphs, find the first one -->
+   <xsl:when test="$reparagraph='yes' and contains($txt,'&#10;&#10;')">
+     <!-- remove newlines from first paragraph -->
+     <xsl:value-of select="normalize-space(substring-before($txt,'&#10;&#10;'))" />
+     <!-- paragraph separator -->
+     <xsl:text>&#10;&#10;</xsl:text>
+     <!-- do the rest of the text -->
+     <xsl:call-template name="trim-newln">
+      <xsl:with-param name="txt" select="substring-after($txt,'&#10;&#10;')" />
+     </xsl:call-template>
+   </xsl:when>
+   <!-- remove more single newlines -->
+   <xsl:when test="$reparagraph='yes'">
+    <xsl:value-of select="normalize-space($txt)" />
+   </xsl:when>
+   <!-- no newlines found, we're done -->
+   <xsl:otherwise>
+    <xsl:value-of select="$txt" />
+   </xsl:otherwise>
+  </xsl:choose>
+ </xsl:template>
+
+ <!-- insert a number of newlines -->
+ <xsl:template name="newlines">
+  <xsl:param name="count" />
+  <xsl:text>&#10;</xsl:text>
+  <xsl:if test="$count&gt;1">
+   <xsl:call-template name="newlines">
+    <xsl:with-param name="count" select="($count)-1" />
+   </xsl:call-template>
+  </xsl:if>
+ </xsl:template>
+
+</xsl:stylesheet>

File diff suppressed because it is too large
+ 465 - 212
configure.ac


+ 19 - 11
doc/Makefile.am

@@ -13,6 +13,7 @@
 info_TEXINFOS = starpu.texi
 
 starpu_TEXINFOS = chapters/advanced-api.texi \
+	chapters/benchmarks.texi \
 	chapters/configuration.texi \
 	chapters/perf-feedback.texi \
 	chapters/vector_scal_cpu.texi \
@@ -34,27 +35,34 @@ starpu_TEXINFOS = chapters/advanced-api.texi \
 	chapters/using.texi \
 	chapters/vector_scal_opencl.texi \
 	chapters/socl.texi \
-	chapters/version.texi
+	chapters/version.texi \
+	chapters/sched_ctx_hypervisor.texi
 
-MAINTAINERCLEANFILES = starpu.pdf
+MAINTAINERCLEANFILES = starpu.pdf starpu.html
 
-EXTRA_DIST = starpu.pdf \
-	starpu.css
+EXTRA_DIST = starpu.css
+
+dist_pdf_DATA = starpu.pdf
+dist_html_DATA = starpu.html
 
 AM_MAKEINFOHTMLFLAGS = --css-include=$(top_srcdir)/doc/starpu.css --no-headers --no-split
 
 uninstall-local:
 	$(RM) $(DESTDIR)$(infodir)/dir
 
-#TODO: when stat is not available on the machine, insert "unknown date"
 chapters/version.texi:
-	@for f in $(starpu_TEXINFOS) ; do \
-                if test -f $(top_srcdir)/doc/$$f ; then stat --format=%Y $(top_srcdir)/doc/$$f ; fi \
+	@-for f in $(starpu_TEXINFOS) ; do \
+                if test -f $(top_srcdir)/doc/$$f ; then stat --format=%Y $(top_srcdir)/doc/$$f 2>/dev/null ; fi \
         done | sort -r | head -1 > timestamp
-	@LC_ALL=C date --date=@`cat timestamp` +"%d %B %Y" > timestamp_updated
-	@LC_ALL=C date --date=@`cat timestamp` +"%B %Y" > timestamp_updated_month
-	@echo "@set UPDATED " `cat timestamp_updated` > $(top_srcdir)/doc/chapters/version.texi
-	@echo "@set UPDATED-MONTH" `cat timestamp_updated_month` >> $(top_srcdir)/doc/chapters/version.texi
+	@-LC_ALL=C date --date=@`cat timestamp` +"%d %B %Y" > timestamp_updated 2>/dev/null
+	@-LC_ALL=C date --date=@`cat timestamp` +"%B %Y" > timestamp_updated_month 2>/dev/null
+	@if test -s timestamp_updated ; then \
+		echo "@set UPDATED " `cat timestamp_updated` > $(top_srcdir)/doc/chapters/version.texi;\
+		echo "@set UPDATED-MONTH" `cat timestamp_updated_month` >> $(top_srcdir)/doc/chapters/version.texi;\
+	else \
+		echo "@set UPDATED unknown_date" > $(top_srcdir)/doc/chapters/version.texi ;\
+		echo "@set UPDATED-MONTH unknown_date" >> $(top_srcdir)/doc/chapters/version.texi; \
+	fi
 	@echo "@set EDITION $(VERSION)" >> $(top_srcdir)/doc/chapters/version.texi
 	@echo "@set VERSION $(VERSION)" >> $(top_srcdir)/doc/chapters/version.texi
 	@$(RM) timestamp timestamp_updated timestamp_updated_month

+ 455 - 61
doc/chapters/advanced-api.texi

@@ -2,21 +2,293 @@
 
 @c This file is part of the StarPU Handbook.
 @c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
-@c Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+@c Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
 @c Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
 @c See the file starpu.texi for copying conditions.
 
 @menu
-* Defining a new data interface::  
-* Multiformat Data Interface::  
-* Task Bundles::                
-* Task Lists::                  
-* Using Parallel Tasks::        
-* Defining a new scheduling policy::  
+* Insert Task::
+* Tracing support::
+* MPI Interface::
+* Defining a new data interface::
+* Multiformat Data Interface::
+* Task Bundles::
+* Task Lists::
+* Using Parallel Tasks::
+* Scheduling Contexts::
+* Defining a new scheduling policy::
 * Running drivers::
-* Expert mode::                 
+* Expert mode::
 @end menu
 
+@node Insert Task
+@section Insert Task
+
+@deftypefun int starpu_insert_task (struct starpu_codelet *@var{cl}, ...)
+Create and submit a task corresponding to @var{cl} with the following
+arguments.  The argument list must be zero-terminated.
+
+The arguments following the codelets can be of the following types:
+
+@itemize
+@item
+@code{STARPU_R}, @code{STARPU_W}, @code{STARPU_RW}, @code{STARPU_SCRATCH}, @code{STARPU_REDUX} an access mode followed by a data handle;
+@item
+@code{STARPU_DATA_ARRAY} followed by an array of data handles and its number of elements;
+@item
+the specific values @code{STARPU_VALUE}, @code{STARPU_CALLBACK},
+@code{STARPU_CALLBACK_ARG}, @code{STARPU_CALLBACK_WITH_ARG},
+@code{STARPU_PRIORITY}, @code{STARPU_TAG}, followed by the appropriate objects
+as defined below.
+@end itemize
+
+When using @code{STARPU_DATA_ARRAY}, the access mode of the data
+handles is not defined.
+
+Parameters to be passed to the codelet implementation are defined
+through the type @code{STARPU_VALUE}. The function
+@code{starpu_codelet_unpack_args} must be called within the codelet
+implementation to retrieve them.
+@end deftypefun
+
+@defmac STARPU_VALUE
+this macro is used when calling @code{starpu_insert_task}, and must be
+followed by a pointer to a constant value and the size of the constant
+@end defmac
+
+@defmac STARPU_CALLBACK
+this macro is used when calling @code{starpu_insert_task}, and must be
+followed by a pointer to a callback function
+@end defmac
+
+@defmac STARPU_CALLBACK_ARG
+this macro is used when calling @code{starpu_insert_task}, and must be
+followed by a pointer to be given as an argument to the callback
+function
+@end defmac
+
+@defmac  STARPU_CALLBACK_WITH_ARG
+this macro is used when calling @code{starpu_insert_task}, and must be
+followed by two pointers: one to a callback function, and the other to
+be given as an argument to the callback function; this is equivalent
+to using both @code{STARPU_CALLBACK} and
+@code{STARPU_CALLBACK_ARG}
+@end defmac
+
+@defmac STARPU_PRIORITY
+this macro is used when calling @code{starpu_insert_task}, and must be
+followed by an integer defining a priority level
+@end defmac
+
+@defmac STARPU_TAG
+this macro is used when calling @code{starpu_insert_task}, and must be
+followed by a tag.
+@end defmac
+
+@deftypefun void starpu_codelet_pack_args ({char **}@var{arg_buffer}, {size_t *}@var{arg_buffer_size}, ...)
+Pack arguments of type @code{STARPU_VALUE} into a buffer which can be
+given to a codelet and later unpacked with the function
+@code{starpu_codelet_unpack_args} defined below.
+@end deftypefun
+
+@deftypefun void starpu_codelet_unpack_args ({void *}@var{cl_arg}, ...)
+Retrieve the arguments of type @code{STARPU_VALUE} associated to a
+task automatically created using the function
+@code{starpu_insert_task} defined above.
+@end deftypefun
+
+@node Tracing support
+@section Tracing support
+
+@deftypefun void starpu_fxt_start_profiling (void)
+Start recording the trace. The trace is by default started from
+@code{starpu_init()} call, but can be paused by using
+@code{starpu_fxt_stop_profiling}, in which case
+@code{starpu_fxt_start_profiling} should be called to specify when to resume
+recording events.
+@end deftypefun
+
+@deftypefun void starpu_fxt_stop_profiling (void)
+Stop recording the trace. The trace is by default stopped at
+@code{starpu_shutdown()} call. @code{starpu_fxt_stop_profiling} can however be
+used to stop it earlier. @code{starpu_fxt_start_profiling} can then be called to
+start recording it again, etc.
+@end deftypefun
+
+
+@node MPI Interface
+@section MPI Interface
+
+@menu
+* Initialisation::
+* Communication::
+* Communication cache::
+@end menu
+
+@node Initialisation
+@subsection Initialisation
+
+@deftypefun int starpu_mpi_init (int *@var{argc}, char ***@var{argv}, int initialize_mpi)
+Initializes the starpumpi library. @code{initialize_mpi} indicates if
+MPI should be initialized or not by StarPU. If the value is not @code{0},
+MPI will be initialized by calling @code{MPI_Init_Thread(argc, argv,
+MPI_THREAD_SERIALIZED, ...)}.
+@end deftypefun
+
+@deftypefun int starpu_mpi_initialize (void)
+This function has been made deprecated. One should use instead the
+function @code{starpu_mpi_init()} defined above.
+This function does not call @code{MPI_Init}, it should be called beforehand.
+@end deftypefun
+
+@deftypefun int starpu_mpi_initialize_extended (int *@var{rank}, int *@var{world_size})
+This function has been made deprecated. One should use instead the
+function @code{starpu_mpi_init()} defined above.
+MPI will be initialized by starpumpi by calling @code{MPI_Init_Thread(argc, argv,
+MPI_THREAD_SERIALIZED, ...)}.
+@end deftypefun
+
+@deftypefun int starpu_mpi_shutdown (void)
+Cleans the starpumpi library. This must be called between calling
+@code{starpu_mpi} functions and @code{starpu_shutdown()}.
+@code{MPI_Finalize()} will be called if StarPU-MPI has been initialized
+by @code{starpu_mpi_init()}.
+@end deftypefun
+
+@deftypefun void starpu_mpi_comm_amounts_retrieve (size_t *@var{comm_amounts})
+Retrieve the current amount of communications from the current node in
+the array @code{comm_amounts} which must have a size greater or equal
+to the world size. Communications statistics must be enabled
+(@pxref{STARPU_COMM_STATS}).
+@end deftypefun
+
+@node Communication
+@subsection Communication
+
+The standard point to point communications of MPI have been
+implemented. The semantic is similar to the MPI one, but adapted to
+the DSM provided by StarPU. A MPI request will only be submitted when
+the data is available in the main memory of the node submitting the
+request.
+
+There are two types of asynchronous communications: the classic
+asynchronous communications and the detached communications. The
+classic asynchronous communications (@code{starpu_mpi_isend} and
+@code{starpu_mpi_irecv}) need to be followed by a call to
+@code{starpu_mpi_wait} or to @code{starpu_mpi_test} to wait for or to
+test the completion of the communication. Waiting for or testing the
+completion of detached communications is not possible, this is done
+internally by StarPU-MPI, on completion, the resources are
+automatically released. This mechanism is similar to the pthread
+detach state attribute which determines whether a thread will be
+created in a joinable or a detached state.
+
+@deftypefun int starpu_mpi_send (starpu_data_handle_t @var{data_handle}, int @var{dest}, int @var{mpi_tag}, MPI_Comm @var{comm})
+Performs a standard-mode, blocking send of @var{data_handle} to the
+node @var{dest} using the message tag @code{mpi_tag} within the
+communicator @var{comm}.
+@end deftypefun
+
+@deftypefun int starpu_mpi_recv (starpu_data_handle_t @var{data_handle}, int @var{source}, int @var{mpi_tag}, MPI_Comm @var{comm}, MPI_Status *@var{status})
+Performs a standard-mode, blocking receive in @var{data_handle} from the
+node @var{source} using the message tag @code{mpi_tag} within the
+communicator @var{comm}.
+@end deftypefun
+
+@deftypefun int starpu_mpi_isend (starpu_data_handle_t @var{data_handle}, starpu_mpi_req *@var{req}, int @var{dest}, int @var{mpi_tag}, MPI_Comm @var{comm})
+Posts a standard-mode, non blocking send of @var{data_handle} to the
+node @var{dest} using the message tag @code{mpi_tag} within the
+communicator @var{comm}. After the call, the pointer to the request
+@var{req} can be used to test or to wait for the completion of the communication.
+@end deftypefun
+
+@deftypefun int starpu_mpi_irecv (starpu_data_handle_t @var{data_handle}, starpu_mpi_req *@var{req}, int @var{source}, int @var{mpi_tag}, MPI_Comm @var{comm})
+Posts a nonblocking receive in @var{data_handle} from the
+node @var{source} using the message tag @code{mpi_tag} within the
+communicator @var{comm}. After the call, the pointer to the request
+@var{req} can be used to test or to wait for the completion of the communication.
+@end deftypefun
+
+@deftypefun int starpu_mpi_isend_detached (starpu_data_handle_t @var{data_handle}, int @var{dest}, int @var{mpi_tag}, MPI_Comm @var{comm}, void (*@var{callback})(void *), void *@var{arg})
+Posts a standard-mode, non blocking send of @var{data_handle} to the
+node @var{dest} using the message tag @code{mpi_tag} within the
+communicator @var{comm}. On completion, the @var{callback} function is
+called with the argument @var{arg}. Similarly to the pthread detached
+functionality, when a detached communication completes, its resources
+are automatically released back to the system, there is no need to
+test or to wait for the completion of the request.
+@end deftypefun
+
+@deftypefun int starpu_mpi_irecv_detached (starpu_data_handle_t @var{data_handle}, int @var{source}, int @var{mpi_tag}, MPI_Comm @var{comm}, void (*@var{callback})(void *), void *@var{arg})
+Posts a nonblocking receive in @var{data_handle} from the
+node @var{source} using the message tag @code{mpi_tag} within the
+communicator @var{comm}. On completion, the @var{callback} function is
+called with the argument @var{arg}. Similarly to the pthread detached
+functionality, when a detached communication completes, its resources
+are automatically released back to the system, there is no need to
+test or to wait for the completion of the request.
+@end deftypefun
+
+@deftypefun int starpu_mpi_wait (starpu_mpi_req *@var{req}, MPI_Status *@var{status})
+Returns when the operation identified by request @var{req} is complete.
+@end deftypefun
+
+@deftypefun int starpu_mpi_test (starpu_mpi_req *@var{req}, int *@var{flag}, MPI_Status *@var{status})
+If the operation identified by @var{req} is complete, set @var{flag}
+to 1. The @var{status} object is set to contain information on the
+completed operation.
+@end deftypefun
+
+@deftypefun int starpu_mpi_barrier (MPI_Comm @var{comm})
+Blocks the caller until all group members of the communicator
+@var{comm} have called it.
+@end deftypefun
+
+@deftypefun int starpu_mpi_isend_detached_unlock_tag (starpu_data_handle_t @var{data_handle}, int @var{dest}, int @var{mpi_tag}, MPI_Comm @var{comm}, starpu_tag_t @var{tag})
+Posts a standard-mode, non blocking send of @var{data_handle} to the
+node @var{dest} using the message tag @code{mpi_tag} within the
+communicator @var{comm}. On completion, @var{tag} is unlocked.
+@end deftypefun
+
+@deftypefun int starpu_mpi_irecv_detached_unlock_tag (starpu_data_handle_t @var{data_handle}, int @var{source}, int @var{mpi_tag}, MPI_Comm @var{comm}, starpu_tag_t @var{tag})
+Posts a nonblocking receive in @var{data_handle} from the
+node @var{source} using the message tag @code{mpi_tag} within the
+communicator @var{comm}. On completion, @var{tag} is unlocked.
+@end deftypefun
+
+@deftypefun int starpu_mpi_isend_array_detached_unlock_tag (unsigned @var{array_size}, starpu_data_handle_t *@var{data_handle}, int *@var{dest}, int *@var{mpi_tag}, MPI_Comm *@var{comm}, starpu_tag_t @var{tag})
+Posts @var{array_size} standard-mode, non blocking send. Each post
+sends the n-th data of the array @var{data_handle} to the n-th node of
+the array @var{dest}
+using the n-th message tag of the array @code{mpi_tag} within the n-th
+communicator of the array
+@var{comm}. On completion of all the requests, @var{tag} is unlocked.
+@end deftypefun
+
+@deftypefun int starpu_mpi_irecv_array_detached_unlock_tag (unsigned @var{array_size}, starpu_data_handle_t *@var{data_handle}, int *@var{source}, int *@var{mpi_tag}, MPI_Comm *@var{comm}, starpu_tag_t @var{tag})
+Posts @var{array_size} nonblocking receive. Each post receives in the
+n-th data of the array @var{data_handle} from the n-th
+node of the array @var{source} using the n-th message tag of the array
+@code{mpi_tag} within the n-th communicator of the array @var{comm}.
+On completion of the all the requests, @var{tag} is unlocked.
+@end deftypefun
+
+@node Communication cache
+@subsection Communication cache
+
+@deftypefun void starpu_mpi_cache_flush (MPI_Comm @var{comm}, starpu_data_handle_t @var{data_handle})
+Clear the send and receive communication cache for the data
+@var{data_handle}. The function has to be called synchronously by all
+the MPI nodes.
+The function does nothing if the cache mechanism is disabled (@pxref{STARPU_MPI_CACHE}).
+@end deftypefun
+
+@deftypefun void starpu_mpi_cache_flush_all_data (MPI_Comm @var{comm})
+Clear the send and receive communication cache for all data. The
+function has to be called synchronously by all the MPI nodes.
+The function does nothing if the cache mechanism is disabled (@pxref{STARPU_MPI_CACHE}).
+@end deftypefun
+
 @node Defining a new data interface
 @section Defining a new data interface
 
@@ -33,19 +305,19 @@
 Per-interface data transfer methods.
 
 @table @asis
-@item @code{void (*register_data_handle)(starpu_data_handle_t handle, uint32_t home_node, void *data_interface)}
+@item @code{void (*register_data_handle)(starpu_data_handle_t handle, unsigned home_node, void *data_interface)}
 Register an existing interface into a data handle.
 
-@item @code{starpu_ssize_t (*allocate_data_on_node)(void *data_interface, uint32_t node)}
+@item @code{starpu_ssize_t (*allocate_data_on_node)(void *data_interface, unsigned node)}
 Allocate data for the interface on a given node.
 
-@item @code{ void (*free_data_on_node)(void *data_interface, uint32_t node)}
+@item @code{ void (*free_data_on_node)(void *data_interface, unsigned node)}
 Free data of the interface on a given node.
 
 @item @code{ const struct starpu_data_copy_methods *copy_methods}
-ram/cuda/spu/opencl synchronous and asynchronous transfer methods.
+ram/cuda/opencl synchronous and asynchronous transfer methods.
 
-@item @code{ void * (*handle_to_pointer)(starpu_data_handle_t handle, uint32_t node)}
+@item @code{ void * (*handle_to_pointer)(starpu_data_handle_t handle, unsigned node)}
 Return the current pointer (if any) for the handle on the given node.
 
 @item @code{ size_t (*get_size)(starpu_data_handle_t handle)}
@@ -60,23 +332,32 @@ Compare the data size of two interfaces.
 @item @code{ void (*display)(starpu_data_handle_t handle, FILE *f)}
 Dump the sizes of a handle to a file.
 
-@item @code{ int (*convert_to_gordon)(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss)}
-Convert the data size to the spu size format. If no SPUs are used, this field can be seto NULL.
-
 @item @code{enum starpu_data_interface_id interfaceid}
 An identifier that is unique to each interface.
 
 @item @code{size_t interface_size}
 The size of the interface data descriptor.
 
+@item @code{int is_multiformat}
+todo
+
+@item @code{struct starpu_multiformat_data_interface_ops* (*get_mf_ops)(void *data_interface)}
+todo
+
+@item @code{int (*pack_data)(starpu_data_handle_t handle, unsigned node, void **ptr, size_t *count)}
+Pack the data handle into a contiguous buffer at the address @code{ptr} and set the size of the newly created buffer in @code{count}
+
+@item @code{int (*unpack_data)(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count)}
+Unpack the data handle from the contiguous buffer at the address @code{ptr} of size @var{count}
+
 @end table
 @end deftp
 
 @deftp {Data Type} {struct starpu_data_copy_methods}
 Defines the per-interface methods.
 @table @asis
-@item @code{int @{ram,cuda,opencl,spu@}_to_@{ram,cuda,opencl,spu@}(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)}
-These 16 functions define how to copy data from the @var{src_interface}
+@item @code{int @{ram,cuda,opencl@}_to_@{ram,cuda,opencl@}(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)}
+These 9 functions define how to copy data from the @var{src_interface}
 interface on the @var{src_node} node to the @var{dst_interface} interface
 on the @var{dst_node} node. They return 0 on success.
 
@@ -141,7 +422,7 @@ future CRC computation. This is used for computing data size footprint.
 @node An example of data interface
 @subsection An example of data interface
 
-@deftypefun int starpu_data_interface_get_next_id ()
+@deftypefun int starpu_data_interface_get_next_id (void)
 Returns the next available id for a newly created data interface.
 @end deftypefun
 
@@ -167,7 +448,7 @@ described below.
 @cartouche
 @smallexample
 void starpu_complex_data_register(starpu_data_handle_t *handle,
-     uint32_t home_node, double *real, double *imaginary, int nx)
+     unsigned home_node, double *real, double *imaginary, int nx)
 @{
         struct starpu_complex_interface complex =
         @{
@@ -302,7 +583,7 @@ pointer to a codelet which converts from CUDA to CPU
 @end table
 @end deftp
 
-@deftypefun void starpu_multiformat_data_register (starpu_data_handle_t *@var{handle}, uint32_t @var{home_node}, void *@var{ptr}, uint32_t @var{nobjects}, struct starpu_multiformat_data_interface_ops *@var{format_ops})
+@deftypefun void starpu_multiformat_data_register (starpu_data_handle_t *@var{handle}, unsigned @var{home_node}, void *@var{ptr}, uint32_t @var{nobjects}, struct starpu_multiformat_data_interface_ops *@var{format_ops})
 Register a piece of data that can be represented in different ways, depending upon
 the processing unit that manipulates it. It allows the programmer, for instance, to
 use an array of structures when working on a CPU, and a structure of arrays when
@@ -464,28 +745,6 @@ Get the description of a combined worker
 Variant of starpu_worker_can_execute_task compatible with combined workers
 @end deftypefun
 
-
-@node Defining a new scheduling policy
-@section Defining a new scheduling policy
-
-TODO
-
-A full example showing how to define a new scheduling policy is available in
-the StarPU sources in the directory @code{examples/scheduler/}.
-
-@menu
-* Scheduling Policy API:: Scheduling Policy API
-* Source code::
-@end menu
-
-@node Scheduling Policy API
-@subsection Scheduling Policy API
-
-While StarPU comes with a variety of scheduling policies (@pxref{Task
-scheduling policy}), it may sometimes be desirable to implement custom
-policies to address specific problems.  The API described below allows
-users to write their own scheduling policy.
-
 @deftp {Data Type} {struct starpu_machine_topology}
 @table @asis
 @item @code{unsigned nworkers}
@@ -521,9 +780,6 @@ Actual number of CUDA workers used by StarPU.
 @item @code{unsigned nopenclgpus}
 Actual number of OpenCL workers used by StarPU.
 
-@item @code{unsigned ngordon_spus}
-Actual number of Gordon workers used by StarPU.
-
 @item @code{unsigned workers_bindid[STARPU_NMAXWORKERS]}
 Indicates the successive cpu identifier that should be used to bind the
 workers. It is either filled according to the user's explicit
@@ -543,10 +799,119 @@ driver.  It is either filled according to the user's explicit parameters (from
 starpu_conf) or according to the STARPU_WORKERS_OPENCLID env. variable. Otherwise,
 they are taken in ID order.
 
+@end table
+@end deftp
+
+@node Scheduling Contexts
+@section Scheduling Contexts
+StarPU permits on one hand grouping workers in combined workers in order to execute a parallel task and on the other hand grouping tasks in bundles that will be executed by a single specified worker.
+In contrast, when we group workers in scheduling contexts, we submit StarPU tasks to them and we schedule them with the policy assigned to the context.
+Scheduling contexts can be created, deleted and modified dynamically.
+
+@deftypefun unsigned starpu_sched_ctx_create (const char *@var{policy_name}, int *@var{workerids_ctx}, int @var{nworkers_ctx}, const char *@var{sched_ctx_name})
+This function creates a scheduling context which uses the scheduling policy indicated in the first argument and assigns the workers indicated in the second argument to execute the tasks submitted to it.
+The return value represents the identifier of the context that has just been created. It will be further used to indicate the context the tasks will be submitted to. The return value should be at most @code{STARPU_NMAX_SCHED_CTXS}.
+@end deftypefun
+
+@deftypefun void starpu_sched_ctx_delete (unsigned @var{sched_ctx_id})
+Delete scheduling context @var{sched_ctx_id} and transfer remaining workers to the inheritor scheduling context.
+@end deftypefun
+
+@deftypefun void starpu_sched_ctx_add_workers ({int *}@var{workerids_ctx}, int @var{nworkers_ctx}, unsigned @var{sched_ctx_id})
+This function adds dynamically the workers indicated in the first argument to the context indicated in the last argument. The last argument cannot be greater than  @code{STARPU_NMAX_SCHED_CTXS}.
+@end deftypefun
+
+@deftypefun void starpu_sched_ctx_remove_workers ({int *}@var{workerids_ctx}, int @var{nworkers_ctx}, unsigned @var{sched_ctx_id})
+This function removes the workers indicated in the first argument from the context indicated in the last argument. The last argument cannot be greater than  @code{STARPU_NMAX_SCHED_CTXS}.
+@end deftypefun
+
+A scheduling context manages a collection of workers that can be stored in different data structures. Thus, a generic structure is available in order to simplify the choice of its type.
+Only the list data structure is currently available, but further data structure implementations (such as trees) are foreseen.
+
+@deftp {Data Type} {struct starpu_sched_ctx_worker_collection}
+@table @asis
+@item @code{void *workerids}
+The workerids managed by the collection
+@item @code{unsigned nworkers}
+The number of workerids
+@item @code{pthread_key_t cursor_key} (optional)
+The cursor needed to iterate the collection (depending on the data structure)
+@item @code{int type}
+The type of structure (currently STARPU_WORKER_LIST is the only one available)
+@item @code{unsigned (*has_next)(struct starpu_sched_ctx_worker_collection *workers)}
+Checks if there is a next worker
+@item @code{int (*get_next)(struct starpu_sched_ctx_worker_collection *workers)}
+Gets the next worker
+@item @code{int (*add)(struct starpu_sched_ctx_worker_collection *workers, int worker)}
+Adds a worker to the collection
+@item @code{int (*remove)(struct starpu_sched_ctx_worker_collection *workers, int worker)}
+Removes a worker from the collection
+@item @code{void* (*init)(struct starpu_sched_ctx_worker_collection *workers)}
+Initialize the collection
+@item @code{void (*deinit)(struct starpu_sched_ctx_worker_collection *workers)}
+Deinitialize the collection
+@item @code{void (*init_cursor)(struct starpu_sched_ctx_worker_collection *workers)} (optional)
+Initialize the cursor if there is one
+@item @code{void (*deinit_cursor)(struct starpu_sched_ctx_worker_collection *workers)} (optional)
+Deinitialize the cursor if there is one
 
 @end table
 @end deftp
 
+@deftypefun struct starpu_sched_ctx_worker_collection* starpu_sched_ctx_create_worker_collection (unsigned @var{sched_ctx_id}, int @var{type})
+Create a worker collection of the type indicated by the last parameter for the context specified through the first parameter.
+@end deftypefun
+
+@deftypefun void starpu_sched_ctx_delete_worker_collection (unsigned @var{sched_ctx_id})
+Delete the worker collection of the specified scheduling context
+@end deftypefun
+
+@deftypefun struct starpu_sched_ctx_worker_collection* starpu_sched_ctx_get_worker_collection (unsigned @var{sched_ctx_id})
+Return the worker collection managed by the indicated context
+@end deftypefun
+
+@deftypefun pthread_mutex_t* starpu_get_changing_ctx_mutex (unsigned @var{sched_ctx_id})
+TODO
+@end deftypefun
+
+@deftypefun void starpu_task_set_context (unsigned *@var{sched_ctx_id})
+Set the scheduling context the subsequent tasks will be submitted to
+@end deftypefun
+
+@deftypefun unsigned starpu_task_get_context (void)
+Return the scheduling context the tasks are currently submitted to
+@end deftypefun
+
+@deftypefun unsigned starpu_sched_ctx_get_nworkers (unsigned @var{sched_ctx_id})
+Return the number of workers managed by the specified context
+(Usually needed to verify if it manages any workers or if it should be blocked)
+@end deftypefun
+
+@deftypefun unsigned starpu_sched_ctx_get_nshared_workers (unsigned @var{sched_ctx_id}, unsigned @var{sched_ctx_id2})
+Return the number of workers shared by two contexts
+@end deftypefun
+
+@node Defining a new scheduling policy
+@section Defining a new scheduling policy
+
+TODO
+
+A full example showing how to define a new scheduling policy is available in
+the StarPU sources in the directory @code{examples/scheduler/}.
+
+@menu
+* Scheduling Policy API:: Scheduling Policy API
+* Source code::
+@end menu
+
+@node Scheduling Policy API
+@subsection Scheduling Policy API
+
+While StarPU comes with a variety of scheduling policies (@pxref{Task
+scheduling policy}), it may sometimes be desirable to implement custom
+policies to address specific problems.  The API described below allows
+users to write their own scheduling policy.
+
 @deftp {Data Type} {struct starpu_sched_policy}
 This structure contains all the methods that implement a scheduling policy.  An
 application may specify which scheduling strategy in the @code{sched_policy}
@@ -554,10 +919,10 @@ field of the @code{starpu_conf} structure passed to the @code{starpu_init}
 function. The different fields are:
 
 @table @asis
-@item @code{void (*init_sched)(struct starpu_machine_topology *, struct starpu_sched_policy *)}
+@item @code{void (*init_sched)(unsigned sched_ctx_id)}
 Initialize the scheduling policy.
 
-@item @code{void (*deinit_sched)(struct starpu_machine_topology *, struct starpu_sched_policy *)}
+@item @code{void (*deinit_sched)(unsigned sched_ctx_id)}
 Cleanup the scheduling policy.
 
 @item @code{int (*push_task)(struct starpu_task *)}
@@ -570,18 +935,18 @@ is about to be executed by the worker. This method therefore permits to keep
 the state of of the scheduler coherent even when StarPU bypasses the scheduling
 strategy.
 
-@item @code{struct starpu_task *(*pop_task)(void)} (optional)
+@item @code{struct starpu_task *(*pop_task)(unsigned sched_ctx_id)} (optional)
 Get a task from the scheduler. The mutex associated to the worker is already
 taken when this method is called. If this method is defined as @code{NULL}, the
 worker will only execute tasks from its local queue. In this case, the
 @code{push_task} method should use the @code{starpu_push_local_task} method to
 assign tasks to the different workers.
 
-@item @code{struct starpu_task *(*pop_every_task)(void)}
+@item @code{struct starpu_task *(*pop_every_task)(unsigned sched_ctx_id)}
 Remove all available tasks from the scheduler (tasks are chained by the means
 of the prev and next fields of the starpu_task structure). The mutex associated
 to the worker is already taken when this method is called. This is currently
-only used by the Gordon driver.
+not used.
 
 @item @code{void (*pre_exec_hook)(struct starpu_task *)} (optional)
 This method is called every time a task is starting.
@@ -589,6 +954,12 @@ This method is called every time a task is starting.
 @item @code{void (*post_exec_hook)(struct starpu_task *)} (optional)
 This method is called every time a task has been executed.
 
+@item @code{void (*add_workers)(unsigned sched_ctx_id, int *workerids, unsigned nworkers)}
+Initialize scheduling structures corresponding to each worker used by the policy.
+
+@item @code{void (*remove_workers)(unsigned sched_ctx_id, int *workerids, unsigned nworkers)}
+Deinitialize scheduling structures corresponding to each worker used by the policy.
+
 @item @code{const char *policy_name} (optional)
 Name of the policy.
 
@@ -597,8 +968,12 @@ Description of the policy.
 @end table
 @end deftp
 
-@deftypefun void starpu_worker_set_sched_condition (int @var{workerid}, pthread_cond_t *@var{sched_cond}, pthread_mutex_t *@var{sched_mutex})
-This function specifies the condition variable associated to a worker
+@deftypefun {struct starpu_sched_policy **} starpu_sched_get_predefined_policies ()
+Return a NULL-terminated array of all the predefined scheduling policies.
+@end deftypefun
+
+@deftypefun void starpu_sched_ctx_set_worker_mutex_and_cond (unsigned @var{sched_ctx_id}, int @var{workerid}, pthread_mutex_t *@var{sched_mutex}, {pthread_cond_t *}@var{sched_cond})
+This function specifies the condition variable associated to a worker per context.
 When there is no available task for a worker, StarPU blocks this worker on a
 condition variable. This function specifies which condition variable (and the
 associated mutex) should be used to block (and to wake up) a worker. Note that
@@ -609,6 +984,20 @@ The initialization method of a scheduling strategy (@code{init_sched}) must
 call this function once per worker.
 @end deftypefun
 
+@deftypefun void starpu_sched_ctx_get_worker_mutex_and_cond (unsigned @var{sched_ctx_id}, int @var{workerid}, {pthread_mutex_t **}@var{sched_mutex}, {pthread_cond_t **}@var{sched_cond})
+This function returns the condition variables associated to a worker in a context.
+It is used in the policy to access the local queue of the worker.
+@end deftypefun
+
+@deftypefun void starpu_sched_ctx_set_policy_data (unsigned @var{sched_ctx_id}, {void *} @var{policy_data})
+Each scheduling policy uses some specific data (queues, variables, additional condition variables).
+This data is stored in a local structure. This function assigns it to a scheduling context.
+@end deftypefun
+
+@deftypefun void* starpu_sched_ctx_get_policy_data (unsigned @var{sched_ctx_id})
+Returns the policy data previously assigned to a context
+@end deftypefun
+
 @deftypefun void starpu_sched_set_min_priority (int @var{min_prio})
 Defines the minimum priority level supported by the scheduling policy. The
 default minimum priority level is the same as the default priority level which
@@ -660,7 +1049,7 @@ Returns expected task duration in µs
 Returns an estimated speedup factor relative to CPU speed
 @end deftypefun
 
-@deftypefun double starpu_task_expected_data_transfer_time (uint32_t @var{memory_node}, {struct starpu_task *}@var{task})
+@deftypefun double starpu_task_expected_data_transfer_time (unsigned @var{memory_node}, {struct starpu_task *}@var{task})
 Returns expected data transfer time in µs
 @end deftypefun
 
@@ -684,6 +1073,8 @@ Returns expected conversion time in ms (multiformat interface only)
 static struct starpu_sched_policy dummy_sched_policy = @{
     .init_sched = init_dummy_sched,
     .deinit_sched = deinit_dummy_sched,
+    .add_workers = dummy_sched_add_workers,
+    .remove_workers = dummy_sched_remove_workers,
     .push_task = push_task_dummy,
     .push_prio_task = NULL,
     .pop_task = pop_task_dummy,
@@ -706,14 +1097,18 @@ static struct starpu_sched_policy dummy_sched_policy = @{
 @node Driver API
 @subsection Driver API
 
-@deftypefun int starpu_driver_init (struct starpu_driver *@var{d})
-Initialize the given driver. Returns 0 on success, -EINVAL if
-@code{d->type} is not a valid StarPU device type (STARPU_CPU_WORKER,
-STARPU_CUDA_WORKER or STARPU_OPENCL_WORKER).
+@deftypefun int starpu_driver_run ({struct starpu_driver *}@var{d})
+Initialize the given driver, run it until it receives a request to terminate,
+deinitialize it and return 0 on success. It returns -EINVAL if @code{d->type}
+is not a valid StarPU device type (STARPU_CPU_WORKER, STARPU_CUDA_WORKER or
+STARPU_OPENCL_WORKER). This is the same as using the following
+functions: calling @code{starpu_driver_init()}, then calling
+@code{starpu_driver_run_once()} in a loop, and eventually
+@code{starpu_driver_deinit()}.
 @end deftypefun
 
-@deftypefun int starpu_driver_run ({struct starpu_driver *}@var{d})
-Run the driver until it receives a request to terminate, then returns 0 on success, -EINVAL if
+@deftypefun int starpu_driver_init (struct starpu_driver *@var{d})
+Initialize the given driver. Returns 0 on success, -EINVAL if
 @code{d->type} is not a valid StarPU device type (STARPU_CPU_WORKER,
 STARPU_CUDA_WORKER or STARPU_OPENCL_WORKER).
 @end deftypefun
@@ -773,4 +1168,3 @@ Register a progression hook, to be called when workers are idle.
 @deftypefun void starpu_progression_hook_deregister (int @var{hook_id})
 Unregister a given progression hook.
 @end deftypefun
-

+ 166 - 126
doc/chapters/advanced-examples.texi

@@ -2,20 +2,20 @@
 
 @c This file is part of the StarPU Handbook.
 @c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
-@c Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+@c Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
 @c Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
 @c See the file starpu.texi for copying conditions.
 
 @menu
 * Using multiple implementations of a codelet::
 * Enabling implementation according to capabilities::
-* Task and Worker Profiling::   
+* Task and Worker Profiling::
 * Partitioning Data::
-* Performance model example::   
-* Theoretical lower bound on execution time::  
-* Insert Task Utility::          
-* Data reduction::  
-* Temporary buffers::  
+* Performance model example::
+* Theoretical lower bound on execution time::
+* Insert Task Utility::
+* Data reduction::
+* Temporary buffers::
 * Parallel Tasks::
 * Debugging::
 * The multiformat interface::
@@ -45,7 +45,7 @@ void scal_sse_func(void *buffers[], void *cl_arg)
     __m128 factor __attribute__((aligned(16)));
     factor = _mm_set1_ps(*(float *) cl_arg);
 
-    unsigned int i;    
+    unsigned int i;
     for (i = 0; i < n_iterations; i++)
         VECTOR[i] = _mm_mul_ps(factor, VECTOR[i]);
 @}
@@ -63,8 +63,8 @@ struct starpu_codelet cl = @{
 @end smallexample
 @end cartouche
 
-Schedulers which are multi-implementation aware (only @code{dmda}, @code{heft}
-and @code{pheft} for now) will use the performance models of all the
+Schedulers which are multi-implementation aware (only @code{dmda} and
+@code{pheft} for now) will use the performance models of all the
 implementations it was given, and pick the one that seems to be the fastest.
 
 @node Enabling implementation according to capabilities
@@ -197,9 +197,11 @@ for (worker = 0; worker < starpu_worker_get_count(); worker++)
         double total_time = starpu_timing_timespec_to_us(&worker_info.total_time);
         double executing_time = starpu_timing_timespec_to_us(&worker_info.executing_time);
         double sleeping_time = starpu_timing_timespec_to_us(&worker_info.sleeping_time);
+        double overhead_time = total_time - executing_time - sleeping_time;
 
         float executing_ratio = 100.0*executing_time/total_time;
         float sleeping_ratio = 100.0*sleeping_time/total_time;
+        float overhead_ratio = 100.0 - executing_ratio - sleeping_ratio;
 
         char workername[128];
         starpu_worker_get_name(worker, workername, 128);
@@ -209,6 +211,8 @@ for (worker = 0; worker < starpu_worker_get_count(); worker++)
                 executing_ratio);
         fprintf(stderr, "\tblocked time: %.2lf ms (%.2f %%)\n", sleeping_time*1e-3,
                 sleeping_ratio);
+        fprintf(stderr, "\toverhead time: %.2lf ms (%.2f %%)\n", overhead_time*1e-3,
+                overhead_ratio);
 @}
 @end smallexample
 @end cartouche
@@ -224,10 +228,11 @@ int vector[NX];
 starpu_data_handle_t handle;
 
 /* Declare data to StarPU */
-starpu_vector_data_register(&handle, 0, (uintptr_t)vector, NX, sizeof(vector[0]));
+starpu_vector_data_register(&handle, 0, (uintptr_t)vector,
+                            NX, sizeof(vector[0]));
 
 /* Partition the vector in PARTS sub-vectors */
-starpu_filter f =
+starpu_data_filter f =
 @{
     .filter_func = starpu_block_filter_func_vector,
     .nchildren = PARTS
@@ -294,6 +299,10 @@ __kernel void opencl_kernel(__global int *vector, unsigned offset)
 @end smallexample
 @end cartouche
 
+StarPU provides various interfaces and filters for matrices, vectors, etc.,
+but applications can also write their own data interfaces and filters, see
+@code{examples/interface} and @code{examples/filters/custom_mf} for an example.
+
 @node Performance model example
 @section Performance model example
 
@@ -317,13 +326,16 @@ few different sets of data input/output sizes. StarPU will then keep record of
 the average time of previous executions on the various processing units, and use
 it as an estimation. History is done per task size, by using a hash of the input
 and ouput sizes as an index.
-It will also save it in @code{~/.starpu/sampling/codelets}
+It will also save it in @code{$STARPU_HOME/.starpu/sampling/codelets}
 for further executions, and can be observed by using the
 @code{starpu_perfmodel_display} command, or drawn by using
 the @code{starpu_perfmodel_plot} (@pxref{Performance model calibration}).  The
 models are indexed by machine name. To
 share the models between machines (e.g. for a homogeneous cluster), use
-@code{export STARPU_HOSTNAME=some_global_name}. Measurements are only done when using a task scheduler which makes use of it, such as @code{heft} or @code{dmda}.
+@code{export STARPU_HOSTNAME=some_global_name}. Measurements are only done
+when using a task scheduler which makes use of it, such as 
+@code{dmda}. Measurements can also be provided explicitly by the application, by
+using the @code{starpu_perfmodel_update_history} function.
 
 The following is a small code example.
 
@@ -368,11 +380,15 @@ tasks with varying size so that the regression can be computed. StarPU will not
 trust the regression unless there is at least 10% difference between the minimum
 and maximum observed input size. It can be useful to set the
 @code{STARPU_CALIBRATE} environment variable to @code{1} and run the application
-on varying input sizes, so as to feed the performance model for a variety of
-inputs. The @code{starpu_perfmodel_display} and @code{starpu_perfmodel_plot}
+on varying input sizes with @code{STARPU_SCHED} set to @code{eager} scheduler,
+so as to feed the performance model for a variety of
+inputs. The application can also provide the measurements explicitly by using
+@code{starpu_perfmodel_update_history}. The @code{starpu_perfmodel_display} and
+@code{starpu_perfmodel_plot}
 tools can be used to observe how much the performance model is calibrated (@pxref{Performance model calibration}); when
 their output look good, @code{STARPU_CALIBRATE} can be reset to @code{0} to let
-StarPU use the resulting performance model without recording new measures. If
+StarPU use the resulting performance model without recording new measures, and
+@code{STARPU_SCHED} can be set to @code{dmda} to benefit from the performance models. If
 the data input sizes vary a lot, it is really important to set
 @code{STARPU_CALIBRATE} to @code{0}, otherwise StarPU will continue adding the
 measures, and result with a very big performance model, which will take time a
@@ -425,7 +441,7 @@ needs to be called to destroy the dummy task afterwards. See
 @node Theoretical lower bound on execution time
 @section Theoretical lower bound on execution time
 
-For kernels with history-based performance models, StarPU can very easily provide a theoretical lower
+For kernels with history-based performance models (and provided that they are completely calibrated), StarPU can very easily provide a theoretical lower
 bound for the execution time of a whole set of tasks. See for
 instance @code{examples/lu/lu_example.c}: before submitting tasks,
 call @code{starpu_bound_start}, and after complete execution, call
@@ -439,8 +455,10 @@ solve it immediately and get the optimized minimum, in ms. Its @code{integer}
 parameter allows to decide whether integer resolution should be computed
 and returned too.
 
-The @code{deps} parameter tells StarPU whether to take tasks and implicit data
-dependencies into account. It must be understood that the linear programming
+The @code{deps} parameter tells StarPU whether to take tasks, implicit data, and tag
+dependencies into account. Tags released in a callback or similar
+are not taken into account, only tags associated with a task are.
+It must be understood that the linear programming
 problem size is quadratic with the number of tasks and thus the time to solve it
 will be very long, it could be minutes for just a few dozen tasks. You should
 probably use @code{lp_solve -timeout 1 test.pl -wmps test.mps} to convert the
@@ -452,6 +470,10 @@ of @code{lp_solve}. For instance, we often just use
 @code{lp_solve -cc -B1 -Bb -Bg -Bp -Bf -Br -BG -Bd -Bs -BB -Bo -Bc -Bi} , and
 the @code{-gr} option can also be quite useful.
 
+Data transfer time can only be taken into account when @code{deps} is set. Only
+data transfers inferred from implicit data dependencies between tasks are taken
+into account.
+
 Setting @code{deps} to 0 will only take into account the actual computations
 on processing units. It however still properly takes into account the varying
 performances of kernels and processing units, which is quite more accurate than
@@ -469,73 +491,12 @@ transfers, which are assumed to be completely overlapped.
 @section Insert Task Utility
 
 StarPU provides the wrapper function @code{starpu_insert_task} to ease
-the creation and submission of tasks.
-
-@deftypefun int starpu_insert_task (struct starpu_codelet *@var{cl}, ...)
-Create and submit a task corresponding to @var{cl} with the following
-arguments.  The argument list must be zero-terminated.
-
-The arguments following the codelets can be of the following types:
-
-@itemize
-@item
-@code{STARPU_R}, @code{STARPU_W}, @code{STARPU_RW}, @code{STARPU_SCRATCH}, @code{STARPU_REDUX} an access mode followed by a data handle;
-@item
-the specific values @code{STARPU_VALUE}, @code{STARPU_CALLBACK},
-@code{STARPU_CALLBACK_ARG}, @code{STARPU_CALLBACK_WITH_ARG},
-@code{STARPU_PRIORITY}, followed by the appropriated objects as
-defined below.
-@end itemize
-
-Parameters to be passed to the codelet implementation are defined
-through the type @code{STARPU_VALUE}. The function
-@code{starpu_codelet_unpack_args} must be called within the codelet
-implementation to retrieve them.
-@end deftypefun
-
-@defmac STARPU_VALUE
-this macro is used when calling @code{starpu_insert_task}, and must be
-followed by a pointer to a constant value and the size of the constant
-@end defmac
-
-@defmac STARPU_CALLBACK
-this macro is used when calling @code{starpu_insert_task}, and must be
-followed by a pointer to a callback function
-@end defmac
-
-@defmac STARPU_CALLBACK_ARG
-this macro is used when calling @code{starpu_insert_task}, and must be
-followed by a pointer to be given as an argument to the callback
-function
-@end defmac
-
-@defmac  STARPU_CALLBACK_WITH_ARG
-this macro is used when calling @code{starpu_insert_task}, and must be
-followed by two pointers: one to a callback function, and the other to
-be given as an argument to the callback function; this is equivalent
-to using both @code{STARPU_CALLBACK} and
-@code{STARPU_CALLBACK_WITH_ARG}
-@end defmac
-
-@defmac STARPU_PRIORITY
-this macro is used when calling @code{starpu_insert_task}, and must be
-followed by a integer defining a priority level
-@end defmac
-
-@deftypefun void starpu_codelet_pack_args ({char **}@var{arg_buffer}, {size_t *}@var{arg_buffer_size}, ...)
-Pack arguments of type @code{STARPU_VALUE} into a buffer which can be
-given to a codelet and later unpacked with the function
-@code{starpu_codelet_unpack_args} defined below.
-@end deftypefun
-
-@deftypefun void starpu_codelet_unpack_args ({void *}@var{cl_arg}, ...)
-Retrieve the arguments of type @code{STARPU_VALUE} associated to a
-task automatically created using the function
-@code{starpu_insert_task} defined above.
-@end deftypefun
+the creation and submission of tasks. See the definition of the
+functions in @ref{Insert Task}.
 
 Here the implementation of the codelet:
 
+@cartouche
 @smallexample
 void func_cpu(void *descr[], void *_args)
 @{
@@ -556,9 +517,11 @@ struct starpu_codelet mycodelet = @{
         .modes = @{ STARPU_RW, STARPU_RW @}
 @};
 @end smallexample
+@end cartouche
 
 And the call to the @code{starpu_insert_task} wrapper:
 
+@cartouche
 @smallexample
 starpu_insert_task(&mycodelet,
                    STARPU_VALUE, &ifactor, sizeof(ifactor),
@@ -566,10 +529,12 @@ starpu_insert_task(&mycodelet,
                    STARPU_RW, data_handles[0], STARPU_RW, data_handles[1],
                    0);
 @end smallexample
+@end cartouche
 
 The call to @code{starpu_insert_task} is equivalent to the following
 code:
 
+@cartouche
 @smallexample
 struct starpu_task *task = starpu_task_create();
 task->cl = &mycodelet;
@@ -585,19 +550,35 @@ task->cl_arg = arg_buffer;
 task->cl_arg_size = arg_buffer_size;
 int ret = starpu_task_submit(task);
 @end smallexample
+@end cartouche
+
+Here is a similar call using @code{STARPU_DATA_ARRAY}.
+
+@cartouche
+@smallexample
+starpu_insert_task(&mycodelet,
+                   STARPU_DATA_ARRAY, data_handles, 2,
+                   STARPU_VALUE, &ifactor, sizeof(ifactor),
+                   STARPU_VALUE, &ffactor, sizeof(ffactor),
+                   0);
+@end smallexample
+@end cartouche
 
 If some part of the task insertion depends on the value of some computation,
 the @code{STARPU_DATA_ACQUIRE_CB} macro can be very convenient. For
 instance, assuming that the index variable @code{i} was registered as handle
 @code{i_handle}:
 
+@cartouche
 @smallexample
 /* Compute which portion we will work on, e.g. pivot */
 starpu_insert_task(&which_index, STARPU_W, i_handle, 0);
 
 /* And submit the corresponding task */
-STARPU_DATA_ACQUIRE_CB(i_handle, STARPU_R, starpu_insert_task(&work, STARPU_RW, A_handle[i], 0));
+STARPU_DATA_ACQUIRE_CB(i_handle, STARPU_R,
+                       starpu_insert_task(&work, STARPU_RW, A_handle[i], 0));
 @end smallexample
+@end cartouche
 
 The @code{STARPU_DATA_ACQUIRE_CB} macro submits an asynchronous request for
 acquiring data @code{i} for the main application, and will execute the code
@@ -630,6 +611,7 @@ buffers, and how to assemble partial results.
 For instance, @code{cg} uses that to optimize its dot product: it first defines
 the codelets for initialization and reduction:
 
+@cartouche
 @smallexample
 struct starpu_codelet bzero_variable_cl =
 @{
@@ -660,34 +642,67 @@ struct starpu_codelet accumulate_variable_cl =
         .nbuffers = 1,
 @}
 @end smallexample
+@end cartouche
 
 and attaches them as reduction methods for its dtq handle:
 
+@cartouche
 @smallexample
+starpu_variable_data_register(&dtq_handle, -1, NULL, sizeof(type));
 starpu_data_set_reduction_methods(dtq_handle,
         &accumulate_variable_cl, &bzero_variable_cl);
 @end smallexample
+@end cartouche
 
-and dtq_handle can now be used in @code{STARPU_REDUX} mode for the dot products
+and @code{dtq_handle} can now be used in @code{STARPU_REDUX} mode for the dot products
 with partitioned vectors:
 
+@cartouche
 @smallexample
-int dots(starpu_data_handle_t v1, starpu_data_handle_t v2,
-         starpu_data_handle_t s, unsigned nblocks)
-@{
-    starpu_insert_task(&bzero_variable_cl, STARPU_W, s, 0);
-    for (b = 0; b < nblocks; b++)
-        starpu_insert_task(&dot_kernel_cl,
-            STARPU_RW, s,
-            STARPU_R, starpu_data_get_sub_data(v1, 1, b),
-            STARPU_R, starpu_data_get_sub_data(v2, 1, b),
-            0);
-@}
+for (b = 0; b < nblocks; b++)
+    starpu_insert_task(&dot_kernel_cl,
+        STARPU_REDUX, dtq_handle,
+        STARPU_R, starpu_data_get_sub_data(v1, 1, b),
+        STARPU_R, starpu_data_get_sub_data(v2, 1, b),
+        0);
 @end smallexample
+@end cartouche
+
+During registration, we have here provided NULL, i.e. there is no initial value
+to be taken into account during reduction. StarPU will thus only take into
+account the contributions from the @code{dot_kernel_cl} tasks. Also, it will not
+allocate any memory for @code{dtq_handle} before @code{dot_kernel_cl} tasks are
+ready to run.
+
+If another dot product has to be performed, one could unregister
+@code{dtq_handle}, and re-register it. But one can also use
+@code{starpu_data_invalidate_submit(dtq_handle)}, which will clear all data from the handle,
+thus resetting it back to the initial @code{register(NULL)} state.
 
 The @code{cg} example also uses reduction for the blocked gemv kernel, leading
 to yet more relaxed dependencies and more parallelism.
 
+STARPU_REDUX can also be passed to @code{starpu_mpi_insert_task} in the MPI
+case. That will however not produce any MPI communication, but just pass
+STARPU_REDUX to the underlying @code{starpu_insert_task}. It is up to the
+application to call @code{starpu_mpi_redux_data}, which posts tasks that will
+reduce the partial results among MPI nodes into the MPI node which owns the
+data. For instance, some hypothetical application which collects partial results
+into data @code{res}, then uses it for other computation, before looping again
+with a new reduction:
+
+@cartouche
+@smallexample
+for (i = 0; i < 100; i++) @{
+    starpu_mpi_insert_task(MPI_COMM_WORLD, &init_res, STARPU_W, res, 0);
+    starpu_mpi_insert_task(MPI_COMM_WORLD, &work, STARPU_RW, A,
+               STARPU_R, B, STARPU_REDUX, res, 0);
+    starpu_mpi_redux_data(MPI_COMM_WORLD, res);
+    starpu_mpi_insert_task(MPI_COMM_WORLD, &work2, STARPU_RW, B, STARPU_R, res, 0);
+@}
+@end smallexample
+@end cartouche
+
 @node Temporary buffers
 @section Temporary buffers
 
@@ -706,7 +721,7 @@ and destroy it on unregistration.
 
 In addition to that, it can be tedious for the application to have to unregister
 the data, since it will not use its content anyway. The unregistration can be
-done lazily by using the @code{starpu_data_unregister_lazy(handle)} function,
+done lazily by using the @code{starpu_data_unregister_submit(handle)} function,
 which will record that no more tasks accessing the handle will be submitted, so
 that it can be freed as soon as the last task accessing it is over.
 
@@ -714,13 +729,15 @@ The following code examplifies both points: it registers the temporary
 data, submits three tasks accessing it, and records the data for automatic
 unregistration.
 
+@cartouche
 @smallexample
 starpu_vector_data_register(&handle, -1, 0, n, sizeof(float));
 starpu_insert_task(&produce_data, STARPU_W, handle, 0);
 starpu_insert_task(&compute_data, STARPU_RW, handle, 0);
 starpu_insert_task(&summarize_data, STARPU_R, handle, STARPU_W, result_handle, 0);
-starpu_data_unregister_lazy(handle);
+starpu_data_unregister_submit(handle);
 @end smallexample
+@end cartouche
 
 @subsection Scratch data
 
@@ -728,15 +745,18 @@ Some kernels sometimes need temporary data to achieve the computations, i.e. a
 workspace. The application could allocate it at the start of the codelet
 function, and free it at the end, but that would be costly. It could also
 allocate one buffer per worker (similarly to @ref{Per-worker library
-initialization }), but that would make them systematic and permanent. A more
+initialization}), but that would make them systematic and permanent. A more
 optimized way is to use the SCRATCH data access mode, as examplified below,
 which provides per-worker buffers without content consistency.
 
+@cartouche
 @smallexample
 starpu_vector_data_register(&workspace, -1, 0, sizeof(float));
 for (i = 0; i < N; i++)
-    starpu_insert_task(&compute, STARPU_R, input[i], STARPU_SCRATCH, workspace, STARPU_W, output[i], 0);
+    starpu_insert_task(&compute, STARPU_R, input[i],
+                       STARPU_SCRATCH, workspace, STARPU_W, output[i], 0);
 @end smallexample
+@end cartouche
 
 StarPU will make sure that the buffer is allocated before executing the task,
 and make this allocation per-worker: for CPU workers, notably, each worker has
@@ -776,7 +796,8 @@ the CPU binding mask that StarPU chose.
 For instance, using OpenMP (full source is available in
 @code{examples/openmp/vector_scal.c}):
 
-@example
+@cartouche
+@smallexample
 void scal_cpu_func(void *buffers[], void *_args)
 @{
     unsigned i;
@@ -799,7 +820,8 @@ static struct starpu_codelet cl =
     .cpu_funcs = @{scal_cpu_func, NULL@},
     .nbuffers = 1,
 @};
-@end example
+@end smallexample
+@end cartouche
 
 Other examples include for instance calling a BLAS parallel CPU implementation
 (see @code{examples/mult/xgemm.c}).
@@ -813,7 +835,8 @@ involved in the combined worker, and thus the number of calls that are made in
 parallel to the function, and @code{starpu_combined_worker_get_rank()} to get
 the rank of the current CPU within the combined worker. For instance:
 
-@example
+@cartouche
+@smallexample
 static void func(void *buffers[], void *args)
 @{
     unsigned i;
@@ -840,7 +863,8 @@ static struct starpu_codelet cl =
     .cpu_funcs = @{ func, NULL @},
     .nbuffers = 1,
 @}
-@end example
+@end smallexample
+@end cartouche
 
 Of course, this trivial example will not really benefit from parallel task
 execution, and was only meant to be simple to understand.  The benefit comes
@@ -852,28 +876,25 @@ buffer.
 
 To benefit from parallel tasks, a parallel-task-aware StarPU scheduler has to
 be used. When exposed to codelets with a Fork or SPMD flag, the @code{pheft}
-(parallel-heft) and @code{pgreedy} (parallel greedy) schedulers will indeed also
+(parallel-heft) and @code{peager} (parallel eager) schedulers will indeed also
 try to execute tasks with several CPUs. It will automatically try the various
 available combined worker sizes and thus be able to avoid choosing a large
 combined worker if the codelet does not actually scale so much.
 
-@subsection Combined worker sizes
+@subsection Combined workers
 
 By default, StarPU creates combined workers according to the architecture
 structure as detected by hwloc. It means that for each object of the hwloc
 topology (NUMA node, socket, cache, ...) a combined worker will be created. If
 some nodes of the hierarchy have a big arity (e.g. many cores in a socket
 without a hierarchy of shared caches), StarPU will create combined workers of
-intermediate sizes.
-The user can give some hints to StarPU about combined workers sizes to favor.
-This can be done by using the environment variables @code{STARPU_MIN_WORKERSIZE}
-and @code{STARPU_MAX_WORKERSIZE}. When set, they will force StarPU to create the
-biggest combined workers possible without overstepping the defined boundaries.
-However, StarPU will create the remaining combined workers without abiding by
-the rules if not possible.
-For example : if the user specifies a minimum and maximum combined workers size
-of 3 on a machine containing 8 CPUs, StarPU will create a combined worker of
-size 2 beside the combined workers of size 3.
+intermediate sizes. The @code{STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER} variable
+permits to tune the maximum arity between levels of combined workers.
+
+The combined workers actually produced can be seen in the output of the
+@code{starpu_machine_display} tool (the @code{STARPU_SCHED} environment variable
+has to be set to a combined worker-aware scheduler such as @code{pheft} or
+@code{peager}).
 
 @subsection Concurrent parallel tasks
 
@@ -892,7 +913,10 @@ sections for instance.
 The solution is then to use only one combined worker at a time.  This can be
 done by setting @code{single_combined_worker} to 1 in the @code{starpu_conf}
 structure, or setting the @code{STARPU_SINGLE_COMBINED_WORKER} environment
-variable to 1. StarPU will then run only one parallel task at a time.
+variable to 1. StarPU will then run only one parallel task at a time (but other
+CPU and GPU tasks are not affected and can be run concurrently). The parallel
+task scheduler will however still try varying combined worker
+sizes to look for the most efficient ones.
 
 @node Debugging
 @section Debugging
@@ -906,13 +930,15 @@ gdb helpers are also provided to show the whole StarPU state:
 (gdb) help starpu
 @end smallexample
 
+The Temanejo task debugger can also be used, see @ref{Task debugger}.
+
 @node The multiformat interface
 @section The multiformat interface
 It may be interesting to represent the same piece of data using two different
 data structures: one that would only be used on CPUs, and one that would only
 be used on GPUs. This can be done by using the multiformat interface. StarPU
 will be able to convert data from one data structure to the other when needed.
-Note that the heft scheduler is the only one optimized for this interface. The
+Note that the dmda scheduler is the only one optimized for this interface. The
 user must provide StarPU with conversion codelets:
 
 @cartouche
@@ -1005,28 +1031,42 @@ constraints: peer memcpy has to be disabled, and the thread that runs OpenGL has
 to be the one that runs CUDA computations for that GPU.
 
 To achieve this with StarPU, pass the @code{--disable-cuda-memcpy-peer} option
-to @code{./configure} (TODO: make it dynamic), the interoperability mode has to
+to @code{./configure} (TODO: make it dynamic), OpenGL/GLUT has to be initialized
+first, and the interoperability mode has to
 be enabled by using the @code{cuda_opengl_interoperability} field of the
 @code{starpu_conf} structure, and the driver loop has to be run by
 the application, by using the @code{not_launched_drivers} field of
 @code{starpu_conf} to prevent StarPU from running it in a separate thread, and
-by using @code{starpu_driver_run} to run the loop. The @code{gl_interop} example
-shows how it articulates in a simple case, where rendering is done in task
-callbacks. TODO: provide glutIdleFunc alternative.
+by using @code{starpu_driver_run} to run the loop. The @code{gl_interop} and
+@code{gl_interop_idle} examples show how it articulates in a simple case, where
+rendering is done in task callbacks. The former uses @code{glutMainLoopEvent}
+to make GLUT progress from the StarPU driver loop, while the latter uses
+@code{glutIdleFunc} to make StarPU progress from the GLUT main loop.
 
 Then, to use an OpenGL buffer as a CUDA data, StarPU simply needs to be given
 the CUDA pointer at registration, for instance:
 
 @cartouche
 @smallexample
+/* Get the CUDA worker id */
 for (workerid = 0; workerid < starpu_worker_get_count(); workerid++)
         if (starpu_worker_get_type(workerid) == STARPU_CUDA_WORKER)
                 break;
 
+/* Build a CUDA pointer pointing at the OpenGL buffer */
 cudaGraphicsResourceGetMappedPointer((void**)&output, &num_bytes, resource);
-starpu_vector_data_register(&handle, starpu_worker_get_memory_node(workerid), output, num_bytes / sizeof(float4), sizeof(float4));
 
+/* And register it to StarPU */
+starpu_vector_data_register(&handle, starpu_worker_get_memory_node(workerid),
+                            output, num_bytes / sizeof(float4), sizeof(float4));
+
+/* The handle can now be used as usual */
 starpu_insert_task(&cl, STARPU_RW, handle, 0);
+
+/* ... */
+
+/* This gets back data into the OpenGL buffer */
+starpu_data_unregister(handle);
 @end smallexample
 @end cartouche
 
@@ -1043,8 +1083,8 @@ directory. Simple examples include:
 @item @code{incrementer/}:
     Trivial incrementation test.
 @item @code{basic_examples/}:
-        Simple documented Hello world (as shown in @ref{Hello World}), vector/scalar product (as shown
-        in @ref{Vector Scaling on an Hybrid CPU/GPU Machine}), matrix
+        Simple documented Hello world and vector/scalar product (as
+        shown in @ref{Basic Examples}), matrix
         product examples (as shown in @ref{Performance model example}), an example using the blocked matrix data
         interface, an example using the variable data interface, and an example
         using different formats on CPUs and GPUs.

File diff suppressed because it is too large
+ 494 - 159
doc/chapters/basic-api.texi


+ 112 - 141
doc/chapters/basic-examples.texi

@@ -2,62 +2,27 @@
 
 @c This file is part of the StarPU Handbook.
 @c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
-@c Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+@c Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
 @c Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
 @c See the file starpu.texi for copying conditions.
 
 @menu
-* Compiling and linking options::  
-* Hello World::                 Submitting Tasks
-* Vector Scaling Using the C Extension::  
-* Vector Scaling Using StarPu's API::  
+* Hello World using the C Extension::
+* Hello World using StarPU's API::
+* Vector Scaling Using the C Extension::
+* Vector Scaling Using StarPU's API::
 * Vector Scaling on an Hybrid CPU/GPU Machine::  Handling Heterogeneous Architectures
 @end menu
 
-@node Compiling and linking options
-@section Compiling and linking options
-
-Let's suppose StarPU has been installed in the directory
-@code{$STARPU_DIR}. As explained in @ref{Setting flags for compiling and linking applications},
-the variable @code{PKG_CONFIG_PATH} needs to be set. It is also
-necessary to set the variable @code{LD_LIBRARY_PATH} to locate dynamic
-libraries at runtime.
-
-@example
-% PKG_CONFIG_PATH=$STARPU_DIR/lib/pkgconfig:$PKG_CONFIG_PATH
-% LD_LIBRARY_PATH=$STARPU_DIR/lib:$LD_LIBRARY_PATH
-@end example
-
-The Makefile could for instance contain the following lines to define which
-options must be given to the compiler and to the linker:
-
-@cartouche
-@example
-CFLAGS          +=      $$(pkg-config --cflags starpu-1.0)
-LDFLAGS         +=      $$(pkg-config --libs starpu-1.0)
-@end example
-@end cartouche
-
-Make sure that @code{pkg-config --libs starpu-1.0} actually produces some output
-before going further: @code{PKG_CONFIG_PATH} has to point to the place where
-@code{starpu-1.0.pc} was installed during @code{make install}.
-
-Also pass the @code{--static} option if the application is to be linked statically.
-
-@node Hello World
-@section Hello World
+@node Hello World using the C Extension
+@section Hello World using the C Extension
 
 This section shows how to implement a simple program that submits a task
-to StarPU. You can either use the StarPU C extension (@pxref{C
-Extensions}) or directly use the StarPU's API.
-
-@menu
-* Hello World using the C Extension::  
-* Hello World using StarPU's API::  
-@end menu
-
-@node Hello World using the C Extension
-@subsection Hello World using the C Extension
+to StarPU using the StarPU C extension (@pxref{C
+Extensions})@footnote{The complete example, and additional examples,
+is available in the @file{gcc-plugin/examples} directory of the StarPU
+distribution.}. A similar example showing how to directly use the StarPU's API is shown
+in @ref{Hello World using StarPU's API}.
 
 GCC from version 4.5 permit to use the StarPU GCC plug-in (@pxref{C
 Extensions}). This makes writing a task both simpler and less error-prone.
@@ -68,10 +33,12 @@ has a single implementation for CPU:
 
 @cartouche
 @smallexample
-/* Task declaration.  */
+#include <stdio.h>
+
+/* @b{Task declaration.}  */
 static void my_task (int x) __attribute__ ((task));
 
-/* Definition of the CPU implementation of `my_task'.  */
+/* @b{Definition of the CPU implementation of `my_task'.}  */
 static void my_task (int x)
 @{
   printf ("Hello, world!  With x = %d\n", x);
@@ -79,16 +46,16 @@ static void my_task (int x)
 
 int main ()
 @{
-  /* Initialize StarPU.  */
+  /* @b{Initialize StarPU.}  */
 #pragma starpu initialize
 
-  /* Do an asynchronous call to `my_task'.  */
+  /* @b{Do an asynchronous call to `my_task'.}  */
   my_task (42);
 
-  /* Wait for the call to complete.  */
+  /* @b{Wait for the call to complete.}  */
 #pragma starpu wait
 
-  /* Terminate.  */
+  /* @b{Terminate.}  */
 #pragma starpu shutdown
 
   return 0;
@@ -101,29 +68,39 @@ The code can then be compiled and linked with GCC and the
 @code{-fplugin} flag:
 
 @example
-$ gcc hello-starpu.c \
+$ gcc `pkg-config starpu-1.0 --cflags` hello-starpu.c \
     -fplugin=`pkg-config starpu-1.0 --variable=gccplugin` \
     `pkg-config starpu-1.0 --libs`
 @end example
 
-As can be seen above, basic use the C extensions allows programmers to
-use StarPU tasks while essentially annotating ``regular'' C code.
+The code can also be compiled without the StarPU C extension and will
+behave as a normal sequential code.
+
+@example
+$ gcc hello-starpu.c
+hello-starpu.c:33:1: warning: ‘task’ attribute directive ignored [-Wattributes]
+$ ./a.out
+Hello, world! With x = 42
+@end example
+
+As can be seen above, the C extensions allow programmers to
+use StarPU tasks by essentially annotating ``regular'' C code.
 
 @node Hello World using StarPU's API
-@subsection Hello World using StarPU's API
+@section Hello World using StarPU's API
 
-The remainder of this section shows how to achieve the same result using
-StarPU's standard C API.
+This section shows how to achieve the same result as in the previous
+section using StarPU's standard C API.
 
 @menu
-* Required Headers::            
-* Defining a Codelet::          
-* Submitting a Task::           
-* Execution of Hello World::    
+* Required Headers::
+* Defining a Codelet::
+* Submitting a Task::
+* Execution of Hello World::
 @end menu
 
 @node Required Headers
-@subsubsection Required Headers
+@subsection Required Headers
 
 The @code{starpu.h} header should be included in any code using StarPU.
 
@@ -135,11 +112,12 @@ The @code{starpu.h} header should be included in any code using StarPU.
 
 
 @node Defining a Codelet
-@subsubsection Defining a Codelet
+@subsection Defining a Codelet
 
 @cartouche
 @smallexample
-struct params @{
+struct params
+@{
     int i;
     float f;
 @};
@@ -161,7 +139,7 @@ struct starpu_codelet cl =
 
 A codelet is a structure that represents a computational kernel. Such a codelet
 may contain an implementation of the same kernel on different architectures
-(e.g. CUDA, Cell's SPU, x86, ...). For compatibility, make sure that the whole
+(e.g. CUDA, x86, ...). For compatibility, make sure that the whole
 structure is initialized to zero, either by using memset, or by letting the
 compiler implicitly do it as examplified above.
 
@@ -198,10 +176,10 @@ Be aware that this may be a pointer to a
 if the codelet modifies this buffer, there is no guarantee that the initial
 buffer will be modified as well: this for instance implies that the buffer
 cannot be used as a synchronization medium. If synchronization is needed, data
-has to be registered to StarPU, see @ref{Vector Scaling Using StarPu's API}.
+has to be registered to StarPU, see @ref{Vector Scaling Using StarPU's API}.
 
 @node Submitting a Task
-@subsubsection Submitting a Task
+@subsection Submitting a Task
 
 @cartouche
 @smallexample
@@ -274,7 +252,11 @@ callback function is always executed on a CPU. The @code{callback_arg}
 pointer is passed as an argument of the callback. The prototype of a callback
 function must be:
 
-@code{void (*callback_function)(void *);}
+@cartouche
+@example
+void (*callback_function)(void *);
+@end example
+@end cartouche
 
 If the @code{synchronous} field is non-zero, task submission will be
 synchronous: the @code{starpu_task_submit} function will not return until the
@@ -287,12 +269,12 @@ disabled thanks to @code{starpu_data_set_default_sequential_consistency_flag} or
 @code{starpu_data_set_sequential_consistency_flag}.
 
 @node Execution of Hello World
-@subsubsection Execution of Hello World
+@subsection Execution of Hello World
 
 @smallexample
-% make hello_world
+$ make hello_world
 cc $(pkg-config --cflags starpu-1.0)  $(pkg-config --libs starpu-1.0) hello_world.c -o hello_world
-% ./hello_world
+$ ./hello_world
 Hello world (params = @{1, 2.000000@} )
 Callback function (arg 42)
 @end smallexample
@@ -300,27 +282,22 @@ Callback function (arg 42)
 @node Vector Scaling Using the C Extension
 @section Vector Scaling Using the C Extension
 
-The previous example has shown how to submit tasks. In this section,
-we show how StarPU tasks can manipulate data. The version of this
-example using StarPU's API is given in the next sections.
-
-
 @menu
-* Adding an OpenCL Task Implementation::  
-* Adding a CUDA Task Implementation::  
+* Adding an OpenCL Task Implementation::
+* Adding a CUDA Task Implementation::
 @end menu
 
-The simplest way to get started writing StarPU programs is using the C
-language extensions provided by the GCC plug-in (@pxref{C Extensions}).
-These extensions map directly to StarPU's main concepts: tasks, task
-implementations for CPU, OpenCL, or CUDA, and registered data buffers.
+The previous example has shown how to submit tasks. In this section,
+we show how StarPU tasks can manipulate data.
 
-The example below is a vector-scaling program, that multiplies elements
-of a vector by a given factor@footnote{The complete example, and
+We will first show how to use the C language extensions provided by
+the GCC plug-in (@pxref{C Extensions})@footnote{The complete example, and
 additional examples, is available in the @file{gcc-plugin/examples}
-directory of the StarPU distribution.}.  For comparison, the standard C
-version that uses StarPU's standard C programming interface is given in
-the next section (@pxref{Vector Scaling Using StarPu's API, standard C
+directory of the StarPU distribution.}. These extensions map directly
+to StarPU's main concepts: tasks, task implementations for CPU,
+OpenCL, or CUDA, and registered data buffers. The standard C version
+that uses StarPU's standard C programming interface is given in the
+next section (@pxref{Vector Scaling Using StarPU's API, standard C
 version of the example}).
 
 First of all, the vector-scaling task and its simple CPU implementation
@@ -328,12 +305,12 @@ has to be defined:
 
 @cartouche
 @smallexample
-/* Declare the `vector_scal' task.  */
+/* @b{Declare the `vector_scal' task.}  */
 static void vector_scal (unsigned size, float vector[size],
                          float factor)
   __attribute__ ((task));
 
-/* Define the standard CPU implementation.  */
+/* @b{Define the standard CPU implementation.}  */
 static void
 vector_scal (unsigned size, float vector[size], float factor)
 @{
@@ -358,9 +335,8 @@ main (void)
 #define FACTOR 3.14
 
   @{
-    float vector[NX] __attribute__ ((heap_allocated));
-
-#pragma starpu register vector
+    float vector[NX]
+       __attribute__ ((heap_allocated, registered));
 
     size_t i;
     for (i = 0; i < NX; i++)
@@ -369,7 +345,7 @@ main (void)
     vector_scal (NX, vector, FACTOR);
 
 #pragma starpu wait
-  @} /* VECTOR is automatically freed here.  */
+  @} /* @b{VECTOR is automatically freed here.}  */
 
 #pragma starpu shutdown
 
@@ -416,7 +392,7 @@ The program can be compiled and linked with GCC and the @code{-fplugin}
 flag:
 
 @example
-$ gcc hello-starpu.c \
+$ gcc `pkg-config starpu-1.0 --cflags` vector_scal.c \
     -fplugin=`pkg-config starpu-1.0 --variable=gccplugin` \
     `pkg-config starpu-1.0 --libs`
 @end example
@@ -438,10 +414,7 @@ in our C file like this:
 
 @cartouche
 @smallexample
-/* Include StarPU's OpenCL integration.  */
-#include <starpu_opencl.h>
-
-/* The OpenCL programs, loaded from `main' (see below).  */
+/* @b{The OpenCL programs, loaded from 'main' (see below).}  */
 static struct starpu_opencl_program cl_programs;
 
 static void vector_scal_opencl (unsigned size, float vector[size],
@@ -456,14 +429,14 @@ vector_scal_opencl (unsigned size, float vector[size], float factor)
   cl_command_queue queue;
   cl_event event;
 
-  /* VECTOR is GPU memory pointer, not a main memory pointer.  */
+  /* @b{VECTOR is GPU memory pointer, not a main memory pointer.}  */
   cl_mem val = (cl_mem) vector;
 
   id = starpu_worker_get_id ();
   devid = starpu_worker_get_devid (id);
 
-  /* Prepare to invoke the kernel.  In the future, this will be largely
-     automated.  */
+  /* @b{Prepare to invoke the kernel.  In the future, this will be largely
+     automated.}  */
   err = starpu_opencl_load_kernel (&kernel, &queue, &cl_programs,
                                    "vector_mult_opencl", devid);
   if (err != CL_SUCCESS)
@@ -485,7 +458,7 @@ vector_scal_opencl (unsigned size, float vector[size], float factor)
   starpu_opencl_collect_stats (event);
   clReleaseEvent (event);
 
-  /* Done with KERNEL.  */
+  /* @b{Done with KERNEL.}  */
   starpu_opencl_release_kernel (kernel);
 @}
 @end smallexample
@@ -532,11 +505,10 @@ the CUDA Kernel}).
 
 @cartouche
 @smallexample
-/* CUDA implementation of the `vector_scal' task, to be compiled
-   with `nvcc'.  */
+/* @b{CUDA implementation of the `vector_scal' task, to be compiled
+   with `nvcc'.}  */
 
 #include <starpu.h>
-#include <starpu_cuda.h>
 #include <stdlib.h>
 
 static __global__ void
@@ -548,7 +520,7 @@ vector_mult_cuda (float *val, unsigned n, float factor)
     val[i] *= factor;
 @}
 
-/* Definition of the task implementation declared in the C file.   */
+/* @b{Definition of the task implementation declared in the C file.}   */
 extern "C" void
 vector_scal_cuda (size_t size, float vector[], float factor)
 @{
@@ -570,8 +542,8 @@ CPU task implementation can be added.
 For more details on the C extensions provided by StarPU's GCC plug-in,
 @xref{C Extensions}.
 
-@node Vector Scaling Using StarPu's API
-@section Vector Scaling Using StarPu's API
+@node Vector Scaling Using StarPU's API
+@section Vector Scaling Using StarPU's API
 
 This section shows how to achieve the same result as explained in the
 previous section using StarPU's standard C API.
@@ -581,7 +553,7 @@ this example is given in @ref{Full source code for the 'Scaling a
 Vector' example}.
 
 @menu
-* Source Code of Vector Scaling::  
+* Source Code of Vector Scaling::
 * Execution of Vector Scaling::  Running the program
 @end menu
 
@@ -657,17 +629,17 @@ void scal_cpu_func(void *buffers[], void *cl_arg)
     unsigned i;
     float *factor = cl_arg;
 
-    /* length of the vector */
+    /* @b{length of the vector} */
     unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
-    /* CPU copy of the vector pointer */
+    /* @b{CPU copy of the vector pointer} */
     float *val = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);
 
     for (i = 0; i < n; i++)
         val[i] *= *factor;
 @}
 
-struct starpu_codelet cl = @{
-    .where = STARPU_CPU,
+struct starpu_codelet cl =
+@{
     .cpu_funcs = @{ scal_cpu_func, NULL @},
     .nbuffers = 1,
     .modes = @{ STARPU_RW @}
@@ -693,9 +665,9 @@ constant factor from this pointer.
 @subsection Execution of Vector Scaling
 
 @smallexample
-% make vector_scal
+$ make vector_scal
 cc $(pkg-config --cflags starpu-1.0)  $(pkg-config --libs starpu-1.0)  vector_scal.c   -o vector_scal
-% ./vector_scal
+$ ./vector_scal
 0.000000 3.000000 6.000000 9.000000 12.000000
 @end smallexample
 
@@ -706,10 +678,10 @@ Contrary to the previous examples, the task submitted in this example may not
 only be executed by the CPUs, but also by a CUDA device.
 
 @menu
-* Definition of the CUDA Kernel::  
-* Definition of the OpenCL Kernel::  
-* Definition of the Main Code::  
-* Execution of Hybrid Vector Scaling::  
+* Definition of the CUDA Kernel::
+* Definition of the OpenCL Kernel::
+* Definition of the Main Code::
+* Execution of Hybrid Vector Scaling::
 @end menu
 
 @node Definition of the CUDA Kernel
@@ -724,7 +696,6 @@ call.
 @cartouche
 @smallexample
 #include <starpu.h>
-#include <starpu_cuda.h>
 
 static __global__ void vector_mult_cuda(float *val, unsigned n,
                                         float factor)
@@ -738,14 +709,15 @@ extern "C" void scal_cuda_func(void *buffers[], void *_args)
 @{
     float *factor = (float *)_args;
 
-    /* length of the vector */
+    /* @b{length of the vector} */
     unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
-    /* CUDA copy of the vector pointer */
+    /* @b{CUDA copy of the vector pointer} */
     float *val = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);
     unsigned threads_per_block = 64;
     unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
-@i{    vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>(val, n, *factor);}
+@i{    vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>}
+@i{                    (val, n, *factor);}
 
 @i{    cudaStreamSynchronize(starpu_cuda_get_local_stream());}
 @}
@@ -778,7 +750,6 @@ important when using partitioning, see @ref{Partitioning Data}.
 @cartouche
 @smallexample
 #include <starpu.h>
-@i{#include <starpu_opencl.h>}
 
 @i{extern struct starpu_opencl_program programs;}
 
@@ -790,9 +761,9 @@ void scal_opencl_func(void *buffers[], void *_args)
 @i{    cl_command_queue queue;}
 @i{    cl_event event;}
 
-    /* length of the vector */
+    /* @b{length of the vector} */
     unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
-    /* OpenCL copy of the vector pointer */
+    /* @b{OpenCL copy of the vector pointer} */
     cl_mem val = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
 
 @i{    id = starpu_worker_get_id();}
@@ -810,7 +781,8 @@ void scal_opencl_func(void *buffers[], void *_args)
 @i{    @{}
 @i{        size_t global=n;}
 @i{        size_t local=1;}
-@i{        err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);}
+@i{        err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL,}
+@i{                                     &global, &local, 0, NULL, &event);}
 @i{        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);}
 @i{    @}}
 
@@ -829,10 +801,10 @@ void scal_opencl_func(void *buffers[], void *_args)
 
 The CPU implementation is the same as in the previous section.
 
-Here is the source of the main application. You can notice the value of the
-field @code{where} for the codelet. We specify
-@code{STARPU_CPU|STARPU_CUDA|STARPU_OPENCL} to indicate to StarPU that the codelet
-can be executed either on a CPU or on a CUDA or an OpenCL device.
+Here is the source of the main application. You can notice that the fields
+@code{cuda_funcs} and @code{opencl_funcs} of the codelet are set to
+define the pointers to the CUDA and OpenCL implementations of the
+task.
 
 @cartouche
 @smallexample
@@ -845,11 +817,10 @@ extern void scal_cpu_func(void *buffers[], void *_args);
 extern void scal_opencl_func(void *buffers[], void *_args);
 
 /* @b{Definition of the codelet} */
-static struct starpu_codelet cl = @{
-    .where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL; /* @b{It can be executed on a CPU,} */
-                                     /* @b{on a CUDA device, or on an OpenCL device} */
+static struct starpu_codelet cl =
+@{
     .cuda_funcs = @{ scal_cuda_func, NULL @},
-    .cpu_funcs = @{ scal_cpu_func, NULL @},
+    .cpu_funcs = @{ scal_cpu_func, NULL @},
     .opencl_funcs = @{ scal_opencl_func, NULL @},
     .nbuffers = 1,
     .modes = @{ STARPU_RW @}
@@ -959,20 +930,20 @@ clean:
 @end cartouche
 
 @smallexample
-% make
+$ make
 @end smallexample
 
 and to execute it, with the default configuration:
 
 @smallexample
-% ./vector_scal
+$ ./vector_scal
 0.000000 3.000000 6.000000 9.000000 12.000000
 @end smallexample
 
 or for example, by disabling CPU devices:
 
 @smallexample
-% STARPU_NCPU=0 ./vector_scal
+$ STARPU_NCPU=0 ./vector_scal
 0.000000 3.000000 6.000000 9.000000 12.000000
 @end smallexample
 
@@ -980,6 +951,6 @@ or by disabling CUDA devices (which may permit to enable the use of OpenCL,
 see @ref{Enabling OpenCL}):
 
 @smallexample
-% STARPU_NCUDA=0 ./vector_scal
+$ STARPU_NCUDA=0 ./vector_scal
 0.000000 3.000000 6.000000 9.000000 12.000000
 @end smallexample

+ 47 - 0
doc/chapters/benchmarks.texi

@@ -0,0 +1,47 @@
+@c -*-texinfo-*-
+
+@c This file is part of the StarPU Handbook.
+@c Copyright (C) 2012  University of Bordeaux
+@c See the file starpu.texi for copying conditions.
+
+@menu
+* Task size overhead::           Overhead of tasks depending on their size
+* Data transfer latency::        Latency of data transfers
+* Gemm::                         Matrix-matrix multiplication
+* Cholesky::                     Cholesky factorization
+* LU::                           LU factorization
+@end menu
+
+Some interesting benchmarks are installed among examples in
+/usr/lib/starpu/examples. Make sure to try various schedulers, for instance
+STARPU_SCHED=dmda.
+
+@node Task size overhead
+@section Task size overhead
+
+This benchmark gives a glimpse into how big a size should be for StarPU overhead
+to be low enough.  Run @code{tasks_size_overhead.sh}, it will generate a plot
+of the speedup of tasks of various sizes, depending on the number of CPUs being
+used.
+
+@node Data transfer latency
+@section Data transfer latency
+
+@code{local_pingpong} performs a ping-pong between the first two CUDA nodes, and
+prints the measured latency.
+
+@node Gemm
+@section Matrix-matrix multiplication
+
+@code{sgemm} and @code{dgemm} perform a blocked matrix-matrix
+multiplication using BLAS and cuBLAS. They output the obtained GFlops.
+
+@node Cholesky
+@section Cholesky factorization
+
+@code{cholesky*} perform a Cholesky factorization (single precision). They use different dependency primitives.
+
+@node LU
+@section LU factorization
+
+@code{lu*} perform an LU factorization. They use different dependency primitives.

+ 13 - 7
doc/chapters/c-extensions.texi

@@ -295,6 +295,7 @@ The following pragmas are provided:
 @item #pragma starpu register @var{ptr} [@var{size}]
 Register @var{ptr} as a @var{size}-element buffer.  When @var{ptr} has
 an array type whose size is known, @var{size} may be omitted.
+Alternatively, the @code{registered} attribute can be used (see below).
 
 @item #pragma starpu unregister @var{ptr}
 Unregister the previously-registered memory area pointed to by
@@ -311,11 +312,19 @@ making it available to the tasks.
 
 @end table
 
-Additionally, the @code{heap_allocated} variable attribute offers a
-simple way to allocate storage for arrays on the heap:
+Additionally, the following attributes offer a simple way to allocate
+and register storage for arrays:
 
 @table @code
 
+@item registered
+@cindex @code{registered} attribute
+This attribute applies to local variables with an array type.  Its
+effect is to automatically register the array's storage, as per
+@code{#pragma starpu register}.  The array is automatically unregistered
+when the variable's scope is left.  This attribute is typically used in
+conjunction with the @code{heap_allocated} attribute, described below.
+
 @item heap_allocated
 @cindex @code{heap_allocated} attribute
 This attribute applies to local variables with an array type.  Its
@@ -351,16 +360,13 @@ main (int argc, char *argv[])
 
   @{
     float matrix[nblocks][nblocks][size]
-      __attribute__ ((heap_allocated));
-
-#pragma starpu register matrix
+      __attribute__ ((heap_allocated, registered));
 
     cholesky (nblocks, size, matrix);
 
 #pragma starpu wait
-#pragma starpu unregister matrix
 
-  @}   /* MATRIX is automatically freed here.  */
+  @}   /* MATRIX is automatically unregistered & freed here.  */
 
 #pragma starpu shutdown
 

+ 400 - 177
doc/chapters/configuration.texi

@@ -2,7 +2,7 @@
 
 @c This file is part of the StarPU Handbook.
 @c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
-@c Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+@c Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
 @c Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
 @c See the file starpu.texi for copying conditions.
 
@@ -17,251 +17,361 @@
 The following arguments can be given to the @code{configure} script.
 
 @menu
-* Common configuration::        
-* Configuring workers::         
-* Advanced configuration::      
+* Common configuration::
+* Configuring workers::
+* Extension configuration::
+* Advanced configuration::
 @end menu
 
 @node Common configuration
 @subsection Common configuration
 
-@table @code
+@defvr {Configure option} --enable-debug
+Enable debugging messages.
+@end defvr
 
-@item --enable-debug
+@defvr {Configure option} --enable-debug
 Enable debugging messages.
+@end defvr
 
-@item --enable-fast
+@defvr {Configure option} --enable-fast
 Disable assertion checks, which saves computation time.
+@end defvr
 
-@item --enable-verbose
+@defvr {Configure option} --enable-verbose
 Increase the verbosity of the debugging messages.  This can be disabled
 at runtime by setting the environment variable @code{STARPU_SILENT} to
 any value.
 
 @smallexample
-% STARPU_SILENT=1 ./vector_scal
+$ STARPU_SILENT=1 ./vector_scal
 @end smallexample
+@end defvr
 
-@item --enable-coverage
+@defvr {Configure option} --enable-coverage
 Enable flags for the @code{gcov} coverage tool.
+@end defvr
+
+@defvr {Configure option} --enable-quick-check
+Specify that tests and examples should be run on a smaller data set,
+i.e. allowing a faster execution time.
+@end defvr
+
+@defvr {Configure option} --enable-long-check
+Enable some exhaustive checks which take a really long time.
+@end defvr
+
+@defvr {Configure option} --with-hwloc
+Specify hwloc should be used by StarPU. hwloc should be found by
+means of the tool @code{pkg-config}.
+@end defvr
+
+@defvr {Configure option} --with-hwloc=@var{prefix}
+Specify hwloc should be used by StarPU. hwloc should be found in the
+directory specified by @var{prefix}.
+@end defvr
 
-@end table
+@defvr {Configure option} --without-hwloc
+Specify hwloc should not be used by StarPU.
+@end defvr
+
+@defvr {Configure option} --disable-build-doc
+Disable the creation of the documentation. This should be done on a
+machine which does not have the tools @code{makeinfo} and @code{tex}.
+@end defvr
+
+Additionally, the @command{configure} script recognizes many variables, which
+can be listed by typing @code{./configure --help}. For example,
+@code{./configure NVCCFLAGS="-arch sm_13"} adds a flag for the compilation of
+CUDA kernels.
 
 @node Configuring workers
 @subsection Configuring workers
 
-@table @code
-
-@item --enable-maxcpus=@var{count}
+@defvr {Configure option} --enable-maxcpus=@var{count}
 Use at most @var{count} CPU cores.  This information is then
 available as the @code{STARPU_MAXCPUS} macro.
+@end defvr
 
-@item --disable-cpu
+@defvr {Configure option} --disable-cpu
 Disable the use of CPUs of the machine. Only GPUs etc. will be used.
+@end defvr
 
-@item --enable-maxcudadev=@var{count}
+@defvr {Configure option} --enable-maxcudadev=@var{count}
 Use at most @var{count} CUDA devices.  This information is then
 available as the @code{STARPU_MAXCUDADEVS} macro.
+@end defvr
 
-@item --disable-cuda
+@defvr {Configure option} --disable-cuda
 Disable the use of CUDA, even if a valid CUDA installation was detected.
+@end defvr
 
-@item --with-cuda-dir=@var{prefix}
+@defvr {Configure option} --with-cuda-dir=@var{prefix}
 Search for CUDA under @var{prefix}, which should notably contain
 @file{include/cuda.h}.
+@end defvr
 
-@item --with-cuda-include-dir=@var{dir}
+@defvr {Configure option} --with-cuda-include-dir=@var{dir}
 Search for CUDA headers under @var{dir}, which should
 notably contain @code{cuda.h}. This defaults to @code{/include} appended to the
 value given to @code{--with-cuda-dir}.
+@end defvr
 
-@item --with-cuda-lib-dir=@var{dir}
+@defvr {Configure option} --with-cuda-lib-dir=@var{dir}
 Search for CUDA libraries under @var{dir}, which should notably contain
 the CUDA shared libraries---e.g., @file{libcuda.so}.  This defaults to
 @code{/lib} appended to the value given to @code{--with-cuda-dir}.
+@end defvr
 
-@item --disable-cuda-memcpy-peer
+@defvr {Configure option} --disable-cuda-memcpy-peer
 Explicitly disable peer transfers when using CUDA 4.0.
+@end defvr
 
-@item --enable-maxopencldev=@var{count}
+@defvr {Configure option} --enable-maxopencldev=@var{count}
 Use at most @var{count} OpenCL devices.  This information is then
 available as the @code{STARPU_MAXOPENCLDEVS} macro.
+@end defvr
 
-@item --disable-opencl
+@defvr {Configure option} --disable-opencl
 Disable the use of OpenCL, even if the SDK is detected.
+@end defvr
 
-@item --with-opencl-dir=@var{prefix}
+@defvr {Configure option} --with-opencl-dir=@var{prefix}
 Search for an OpenCL implementation under @var{prefix}, which should
 notably contain @file{include/CL/cl.h} (or @file{include/OpenCL/cl.h} on
 Mac OS).
+@end defvr
 
-@item --with-opencl-include-dir=@var{dir}
+@defvr {Configure option} --with-opencl-include-dir=@var{dir}
 Search for OpenCL headers under @var{dir}, which should notably contain
 @file{CL/cl.h} (or @file{OpenCL/cl.h} on Mac OS).  This defaults to
 @code{/include} appended to the value given to @code{--with-opencl-dir}.
+@end defvr
 
-@item --with-opencl-lib-dir=@var{dir}
+@defvr {Configure option} --with-opencl-lib-dir=@var{dir}
 Search for an OpenCL library under @var{dir}, which should notably
 contain the OpenCL shared libraries---e.g. @file{libOpenCL.so}. This defaults to
 @code{/lib} appended to the value given to @code{--with-opencl-dir}.
+@end defvr
 
-@item --enable-gordon
-Enable the use of the Gordon runtime for Cell SPUs.
-@c TODO: rather default to enabled when detected
+@defvr {Configure option} --enable-opencl-simulator
+Enable considering the provided OpenCL implementation as a simulator, i.e. use
+the kernel duration returned by OpenCL profiling information as wallclock time
+instead of the actual measured real time. This requires simgrid support.
+@end defvr
 
-@item --with-gordon-dir=@var{prefix}
-Search for the Gordon SDK under @var{prefix}.
-
-@item --enable-maximplementations=@var{count}
+@defvr {Configure option} --enable-maximplementations=@var{count}
 Allow for at most @var{count} codelet implementations for the same
 target device.  This information is then available as the
 @code{STARPU_MAXIMPLEMENTATIONS} macro.
+@end defvr
+
+@defvr {Configure option} --enable-max-sched-ctxs=@var{count}
+Allow for at most @var{count} scheduling contexts
+This information is then available as the
+@code{STARPU_NMAX_SCHED_CTXS} macro.
+@end defvr
+
+@defvr {Configure option} --disable-asynchronous-copy
+Disable asynchronous copies between CPU and GPU devices.
+The AMD implementation of OpenCL is known to
+fail when copying data asynchronously. When using this implementation,
+it is therefore necessary to disable asynchronous data transfers.
+@end defvr
+
+@defvr {Configure option} --disable-asynchronous-cuda-copy
+Disable asynchronous copies between CPU and CUDA devices.
+@end defvr
+
+@defvr {Configure option} --disable-asynchronous-opencl-copy
+Disable asynchronous copies between CPU and OpenCL devices.
+The AMD implementation of OpenCL is known to
+fail when copying data asynchronously. When using this implementation,
+it is therefore necessary to disable asynchronous data transfers.
+@end defvr
+
+@node Extension configuration
+@subsection Extension configuration
+
+@defvr {Configure option} --disable-socl
+Disable the SOCL extension (@pxref{SOCL OpenCL Extensions}).  By
+default, it is enabled when an OpenCL implementation is found.
+@end defvr
+
+@defvr {Configure option} --disable-starpu-top
+Disable the StarPU-Top interface (@pxref{StarPU-Top}).  By default, it
+is enabled when the required dependencies are found.
+@end defvr
+
+@defvr {Configure option} --disable-gcc-extensions
+Disable the GCC plug-in (@pxref{C Extensions}).  By default, it is
+enabled when the GCC compiler provides a plug-in support.
+@end defvr
 
-@end table
+@defvr {Configure option} --with-mpicc=@var{path}
+Use the @command{mpicc} compiler at @var{path}, for starpumpi
+(@pxref{StarPU MPI support}).
+@end defvr
 
 @node Advanced configuration
 @subsection Advanced configuration
 
-@table @code
-
-@item --enable-perf-debug
+@defvr {Configure option} --enable-perf-debug
 Enable performance debugging through gprof.
+@end defvr
 
-@item --enable-model-debug
+@defvr {Configure option} --enable-model-debug
 Enable performance model debugging.
+@end defvr
 
-@item --enable-stats
+@defvr {Configure option} --enable-stats
 @c see ../../src/datawizard/datastats.c
-Enable gathering of memory transfer statistics.
+Enable gathering of various data statistics (@pxref{Data statistics}).
+@end defvr
 
-@item --enable-maxbuffers
+@defvr {Configure option} --enable-maxbuffers
 Define the maximum number of buffers that tasks will be able to take
 as parameters, then available as the @code{STARPU_NMAXBUFS} macro.
+@end defvr
 
-@item --enable-allocation-cache
+@defvr {Configure option} --enable-allocation-cache
 Enable the use of a data allocation cache to avoid the cost of it with
 CUDA. Still experimental.
+@end defvr
 
-@item --enable-opengl-render
+@defvr {Configure option} --enable-opengl-render
 Enable the use of OpenGL for the rendering of some examples.
 @c TODO: rather default to enabled when detected
+@end defvr
 
-@item --enable-blas-lib
+@defvr {Configure option} --enable-blas-lib
 Specify the blas library to be used by some of the examples. The
 library has to be 'atlas' or 'goto'.
+@end defvr
 
-@item --disable-starpufft
+@defvr {Configure option} --disable-starpufft
 Disable the build of libstarpufft, even if fftw or cuFFT is available.
+@end defvr
 
-@item --with-magma=@var{prefix}
+@defvr {Configure option} --with-magma=@var{prefix}
 Search for MAGMA under @var{prefix}.  @var{prefix} should notably
 contain @file{include/magmablas.h}.
+@end defvr
 
-@item --with-fxt=@var{prefix}
+@defvr {Configure option} --with-fxt=@var{prefix}
 Search for FxT under @var{prefix}.
 @url{http://savannah.nongnu.org/projects/fkt, FxT} is used to generate
 traces of scheduling events, which can then be rendered them using ViTE
 (@pxref{Off-line, off-line performance feedback}).  @var{prefix} should
 notably contain @code{include/fxt/fxt.h}.
+@end defvr
 
-@item --with-perf-model-dir=@var{dir}
+@defvr {Configure option} --with-perf-model-dir=@var{dir}
 Store performance models under @var{dir}, instead of the current user's
 home.
+@end defvr
 
-@item --with-mpicc=@var{path}
-Use the @command{mpicc} compiler at @var{path}, for starpumpi
-(@pxref{StarPU MPI support}).
-
-@item --with-goto-dir=@var{prefix}
-Search for GotoBLAS under @var{prefix}.
+@defvr {Configure option} --with-goto-dir=@var{prefix}
+Search for GotoBLAS under @var{prefix}, which should notably contain @file{libgoto.so} or @file{libgoto2.so}.
+@end defvr
 
-@item --with-atlas-dir=@var{prefix}
+@defvr {Configure option} --with-atlas-dir=@var{prefix}
 Search for ATLAS under @var{prefix}, which should notably contain
 @file{include/cblas.h}.
+@end defvr
 
-@item --with-mkl-cflags=@var{cflags}
+@defvr {Configure option} --with-mkl-cflags=@var{cflags}
 Use @var{cflags} to compile code that uses the MKL library.
+@end defvr
 
-@item --with-mkl-ldflags=@var{ldflags}
+@defvr {Configure option} --with-mkl-ldflags=@var{ldflags}
 Use @var{ldflags} when linking code that uses the MKL library.  Note
 that the
 @url{http://software.intel.com/en-us/articles/intel-mkl-link-line-advisor/,
 MKL website} provides a script to determine the linking flags.
+@end defvr
 
-@item --disable-gcc-extensions
-Disable the GCC plug-in (@pxref{C Extensions}).  By default, it is
-enabled when the GCC compiler provides a plug-in support.
+@defvr {Configure option} --disable-build-examples
+Disable the build of examples.
+@end defvr
 
-@item --disable-socl
-Disable the SOCL extension (@pxref{SOCL OpenCL Extensions}).  By
-default, it is enabled when an OpenCL implementation is found.
 
-@item --disable-starpu-top
-Disable the StarPU-Top interface (@pxref{StarPU-Top}).  By default, it
-is enabled when the required dependencies are found.
+@defvr {Configure option} --enable-sched-ctx-hypervisor
+Enables the Scheduling Context Hypervisor plugin (@pxref{Scheduling Context Hypervisor}).
+By default, it is disabled.
+@end defvr
+
+@defvr {Configure option} --enable-memory-stats
+Enable memory statistics (@pxref{Memory feedback}).
+@end defvr
+
+@defvr {Configure option} --enable-simgrid
+Enable simulation of execution in simgrid, to allow easy experimentation with
+various numbers of cores and GPUs, or amount of memory, etc. Experimental.
+
+The path to simgrid can be specified through the @code{SIMGRID_CFLAGS} and
+@code{SIMGRID_LIBS} environment variables, for instance:
+@example
+export SIMGRID_CFLAGS="-I/usr/local/simgrid/include"
+export SIMGRID_LIBS="-L/usr/local/simgrid/lib -lsimgrid"
+@end example
+@end defvr
 
-@end table
 @node Execution configuration through environment variables
 @section Execution configuration through environment variables
 
 @menu
 * Workers::                     Configuring workers
 * Scheduling::                  Configuring the Scheduling engine
+* Extensions::
 * Misc::                        Miscellaneous and debug
 @end menu
 
 @node Workers
 @subsection Configuring workers
 
-@menu
-* STARPU_NCPU::                	Number of CPU workers
-* STARPU_NCUDA::                	Number of CUDA workers
-* STARPU_NOPENCL::              	Number of OpenCL workers
-* STARPU_NGORDON::              	Number of SPU workers (Cell)
-* STARPU_WORKERS_NOBIND::       	Do not bind workers
-* STARPU_WORKERS_CPUID::        	Bind workers to specific CPUs
-* STARPU_WORKERS_CUDAID::       	Select specific CUDA devices
-* STARPU_WORKERS_OPENCLID::     	Select specific OpenCL devices
-* STARPU_SINGLE_COMBINED_WORKER:: 	Do not use concurrent workers
-* STARPU_MIN_WORKERSIZE::	 	Minimum size of the combined workers
-* STARPU_MAX_WORKERSIZE:: 		Maximum size of the combined workers
-@end menu
-
-@node STARPU_NCPU
-@subsubsection @code{STARPU_NCPU} -- Number of CPU workers
-
-Specify the number of CPU workers (thus not including workers dedicated to control acceleratores). Note that by default, StarPU will not allocate
+@defvr {Environment variable} STARPU_NCPU
+Specify the number of CPU workers (thus not including workers dedicated to control accelerators). Note that by default, StarPU will not allocate
 more CPU workers than there are physical CPUs, and that some CPUs are used to control
 the accelerators.
+@end defvr
 
-@node STARPU_NCUDA
-@subsubsection @code{STARPU_NCUDA} -- Number of CUDA workers
+@defvr {Environment variable} STARPU_NCPUS
+This variable is deprecated. You should use @code{STARPU_NCPU}.
+@end defvr
 
+@defvr {Environment variable} STARPU_NCUDA
 Specify the number of CUDA devices that StarPU can use. If
 @code{STARPU_NCUDA} is lower than the number of physical devices, it is
 possible to select which CUDA devices should be used by the means of the
 @code{STARPU_WORKERS_CUDAID} environment variable. By default, StarPU will
 create as many CUDA workers as there are CUDA devices.
+@end defvr
 
-@node STARPU_NOPENCL
-@subsubsection @code{STARPU_NOPENCL} -- Number of OpenCL workers
-
+@defvr {Environment variable} STARPU_NOPENCL
 OpenCL equivalent of the @code{STARPU_NCUDA} environment variable.
-
-@node STARPU_NGORDON
-@subsubsection @code{STARPU_NGORDON} -- Number of SPU workers (Cell)
-
-Specify the number of SPUs that StarPU can use.
-
-@node STARPU_WORKERS_NOBIND
-@subsubsection @code{STARPU_WORKERS_NOBIND} -- Do not bind workers to specific CPUs
-
+@end defvr
+
+@defvr {Environment variable} STARPU_OPENCL_ON_CPUS
+By default, the OpenCL driver only enables GPU and accelerator
+devices. By setting the environment variable
+@code{STARPU_OPENCL_ON_CPUS} to 1, the OpenCL driver will also enable
+CPU devices.
+@end defvr
+
+@defvr {Environment variable} STARPU_OPENCL_ONLY_ON_CPUS
+By default, the OpenCL driver enables GPU and accelerator
+devices. By setting the environment variable
+@code{STARPU_OPENCL_ONLY_ON_CPUS} to 1, the OpenCL driver will ONLY enable
+CPU devices.
+@end defvr
+
+@defvr {Environment variable} STARPU_WORKERS_NOBIND
 Setting it to non-zero will prevent StarPU from binding its threads to
 CPUs. This is for instance useful when running the testsuite in parallel.
+@end defvr
 
-@node STARPU_WORKERS_CPUID
-@subsubsection @code{STARPU_WORKERS_CPUID} -- Bind workers to specific CPUs
-
+@defvr {Environment variable} STARPU_WORKERS_CPUID
 Passing an array of integers (starting from 0) in @code{STARPU_WORKERS_CPUID}
 specifies on which logical CPU the different workers should be
 bound. For instance, if @code{STARPU_WORKERS_CPUID = "0 1 4 5"}, the first
@@ -271,7 +381,7 @@ determined by the OS, or provided by the @code{hwloc} library in case it is
 available.
 
 Note that the first workers correspond to the CUDA workers, then come the
-OpenCL and the SPU, and finally the CPU workers. For example if
+OpenCL workers, and finally the CPU workers. For example if
 we have @code{STARPU_NCUDA=1}, @code{STARPU_NOPENCL=1}, @code{STARPU_NCPU=2}
 and @code{STARPU_WORKERS_CPUID = "0 2 1 3"}, the CUDA device will be controlled
 by logical CPU #0, the OpenCL device will be controlled by logical CPU #2, and
@@ -284,10 +394,9 @@ third (resp. second and fourth) workers will be put on CPU #0 (resp. CPU #1).
 
 This variable is ignored if the @code{use_explicit_workers_bindid} flag of the
 @code{starpu_conf} structure passed to @code{starpu_init} is set.
+@end defvr
 
-@node STARPU_WORKERS_CUDAID
-@subsubsection @code{STARPU_WORKERS_CUDAID} -- Select specific CUDA devices
-
+@defvr {Environment variable} STARPU_WORKERS_CUDAID
 Similarly to the @code{STARPU_WORKERS_CPUID} environment variable, it is
 possible to select which CUDA devices should be used by StarPU. On a machine
 equipped with 4 GPUs, setting @code{STARPU_WORKERS_CUDAID = "1 3"} and
@@ -297,125 +406,239 @@ the one reported by CUDA).
 
 This variable is ignored if the @code{use_explicit_workers_cuda_gpuid} flag of
 the @code{starpu_conf} structure passed to @code{starpu_init} is set.
+@end defvr
 
-@node STARPU_WORKERS_OPENCLID
-@subsubsection @code{STARPU_WORKERS_OPENCLID} -- Select specific OpenCL devices
-
+@defvr {Environment variable} STARPU_WORKERS_OPENCLID
 OpenCL equivalent of the @code{STARPU_WORKERS_CUDAID} environment variable.
 
 This variable is ignored if the @code{use_explicit_workers_opencl_gpuid} flag of
 the @code{starpu_conf} structure passed to @code{starpu_init} is set.
+@end defvr
 
-@node STARPU_SINGLE_COMBINED_WORKER
-@subsubsection @code{STARPU_SINGLE_COMBINED_WORKER} -- Do not use concurrent workers
-
+@defvr {Environment variable} STARPU_SINGLE_COMBINED_WORKER
 If set, StarPU will create several workers which won't be able to work
 concurrently. It will create combined workers which size goes from 1 to the
 total number of CPU workers in the system.
-
-@node STARPU_MIN_WORKERSIZE
-@subsubsection @code{STARPU_MIN_WORKERSIZE} -- Minimum size of the combined workers
-
-Let the user give a hint to StarPU about which how many workers
-(minimum boundary) the combined workers should contain.
-
-@node STARPU_MAX_WORKERSIZE
-@subsubsection @code{STARPU_MAX_WORKERSIZE} -- Maximum size of the combined workers
-
-Let the user give a hint to StarPU about which how many workers
-(maximum boundary) the combined workers should contain.
+@end defvr
+
+@defvr {Environment variable} STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER
+Let the user decide how many elements are allowed between combined workers
+created from hwloc information. For instance, in the case of sockets with 6
+cores without shared L2 caches, if @code{SYNTHESIZE_ARITY_COMBINED_WORKER} is
+set to 6, no combined worker will be synthesized beyond one for the socket
+and one per core. If it is set to 3, 3 intermediate combined workers will be
+synthesized, to divide the socket cores into 3 chunks of 2 cores. If it set to
+2, 2 intermediate combined workers will be synthesized, to divide the socket
+cores into 2 chunks of 3 cores, and then 3 additional combined workers will be
+synthesized, to divide the former synthesized workers into a bunch of 2 cores,
+and the remaining core (for which no combined worker is synthesized since there
+is already a normal worker for it).
+
+The default, 2, thus makes StarPU tend to build binary trees of combined
+workers.
+@end defvr
+
+@defvr {Environment variable} STARPU_DISABLE_ASYNCHRONOUS_COPY
+Disable asynchronous copies between CPU and GPU devices.
+The AMD implementation of OpenCL is known to
+fail when copying data asynchronously. When using this implementation,
+it is therefore necessary to disable asynchronous data transfers.
+@end defvr
+
+@defvr {Environment variable} STARPU_DISABLE_ASYNCHRONOUS_CUDA_COPY
+Disable asynchronous copies between CPU and CUDA devices.
+@end defvr
+
+@defvr {Environment variable} STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY
+Disable asynchronous copies between CPU and OpenCL devices.
+The AMD implementation of OpenCL is known to
+fail when copying data asynchronously. When using this implementation,
+it is therefore necessary to disable asynchronous data transfers.
+@end defvr
+
+@defvr {Environment variable} STARPU_DISABLE_CUDA_GPU_GPU_DIRECT
+Disable direct CUDA transfers from GPU to GPU, and let CUDA copy through RAM
+instead. This permits to test the performance effect of GPU-Direct.
+@end defvr
 
 @node Scheduling
 @subsection Configuring the Scheduling engine
 
-@menu
-* STARPU_SCHED::                Scheduling policy
-* STARPU_CALIBRATE::            Calibrate performance models
-* STARPU_PREFETCH::             Use data prefetch
-* STARPU_SCHED_ALPHA::          Computation factor
-* STARPU_SCHED_BETA::           Communication factor
-@end menu
-
-@node STARPU_SCHED
-@subsubsection @code{STARPU_SCHED} -- Scheduling policy
-
+@defvr {Environment variable} STARPU_SCHED
 Choose between the different scheduling policies proposed by StarPU: work
 random, stealing, greedy, with performance models, etc.
 
 Use @code{STARPU_SCHED=help} to get the list of available schedulers.
+@end defvr
 
-@node STARPU_CALIBRATE
-@subsubsection @code{STARPU_CALIBRATE} -- Calibrate performance models
-
+@defvr {Environment variable} STARPU_CALIBRATE
 If this variable is set to 1, the performance models are calibrated during
 the execution. If it is set to 2, the previous values are dropped to restart
 calibration from scratch. Setting this variable to 0 disable calibration, this
 is the default behaviour.
 
-Note: this currently only applies to @code{dm}, @code{dmda} and @code{heft} scheduling policies.
+Note: this currently only applies to @code{dm} and @code{dmda} scheduling policies.
+@end defvr
 
-@node STARPU_PREFETCH
-@subsubsection @code{STARPU_PREFETCH} -- Use data prefetch
+@defvr {Environment variable} STARPU_BUS_CALIBRATE
+If this variable is set to 1, the bus is recalibrated during initialization.
+@end defvr
 
+@defvr {Environment variable} STARPU_PREFETCH
+@anchor{STARPU_PREFETCH}
 This variable indicates whether data prefetching should be enabled (0 means
 that it is disabled). If prefetching is enabled, when a task is scheduled to be
 executed e.g. on a GPU, StarPU will request an asynchronous transfer in
 advance, so that data is already present on the GPU when the task starts. As a
 result, computation and data transfers are overlapped.
 Note that prefetching is enabled by default in StarPU.
+@end defvr
 
-@node STARPU_SCHED_ALPHA
-@subsubsection @code{STARPU_SCHED_ALPHA} -- Computation factor
-
+@defvr {Environment variable} STARPU_SCHED_ALPHA
 To estimate the cost of a task StarPU takes into account the estimated
 computation time (obtained thanks to performance models). The alpha factor is
 the coefficient to be applied to it before adding it to the communication part.
+@end defvr
 
-@node STARPU_SCHED_BETA
-@subsubsection @code{STARPU_SCHED_BETA} -- Communication factor
-
+@defvr {Environment variable} STARPU_SCHED_BETA
 To estimate the cost of a task StarPU takes into account the estimated
 data transfer time (obtained thanks to performance models). The beta factor is
 the coefficient to be applied to it before adding it to the computation part.
+@end defvr
+
+@defvr {Environment variable} STARPU_SCHED_GAMMA
+Define the execution time penalty of a joule (@pxref{Power-based scheduling}).
+@end defvr
+
+@defvr {Environment variable} STARPU_IDLE_POWER
+Define the idle power of the machine (@pxref{Power-based scheduling}).
+@end defvr
+
+@defvr {Environment variable} STARPU_PROFILING
+Enable on-line performance monitoring (@pxref{Enabling on-line performance monitoring}).
+@end defvr
+
+@node Extensions
+@subsection Extensions
+
+@defvr {Environment variable} SOCL_OCL_LIB_OPENCL
+The SOCL test suite is only run when the environment variable
+@code{SOCL_OCL_LIB_OPENCL} is defined. It should contain the location
+of the libOpenCL.so file of the OCL ICD implementation.
+@end defvr
+
+@defvr {Environment variable} STARPU_COMM_STATS
+@anchor{STARPU_COMM_STATS}
+Communication statistics for starpumpi (@pxref{StarPU MPI support})
+will be enabled when the environment variable @code{STARPU_COMM_STATS}
+is defined to a value other than 0.
+@end defvr
+
+@defvr {Environment variable} STARPU_MPI_CACHE
+@anchor{STARPU_MPI_CACHE}
+Communication cache for starpumpi (@pxref{StarPU MPI support}) will be
+disabled when the environment variable @code{STARPU_MPI_CACHE} is set
+to 0. It is enabled by default or for any other values of the variable
+@code{STARPU_MPI_CACHE}.
+@end defvr
 
 @node Misc
 @subsection Miscellaneous and debug
 
-@menu
-* STARPU_SILENT::               Disable verbose mode
-* STARPU_LOGFILENAME::          Select debug file name
-* STARPU_FXT_PREFIX::           FxT trace location
-* STARPU_LIMIT_GPU_MEM::        Restrict memory size on the GPUs
-* STARPU_GENERATE_TRACE::       Generate a Paje trace when StarPU is shut down
-@end menu
-
-@node STARPU_SILENT
-@subsubsection @code{STARPU_SILENT} -- Disable verbose mode
-
+@defvr {Environment variable} STARPU_HOME
+@anchor{STARPU_HOME}
+This specifies the main directory in which StarPU stores its
+configuration files. The default is @code{$HOME} on Unix environments,
+and @code{$USERPROFILE} on Windows environments.
+@end defvr
+
+@defvr {Environment variable} STARPU_HOSTNAME
+When set, force the hostname to be used when dealing with performance model
+files. Models are indexed by machine name. When running for example on
+a homogeneous cluster, it is possible to share the models between
+machines by setting @code{export STARPU_HOSTNAME=some_global_name}.
+@end defvr
+
+@defvr {Environment variable} STARPU_OPENCL_PROGRAM_DIR
+@anchor{STARPU_OPENCL_PROGRAM_DIR}
+This specifies the directory where the OpenCL codelet source files are
+located. The function @ref{starpu_opencl_load_program_source} looks
+for the codelet in the current directory, in the directory specified
+by the environment variable @code{STARPU_OPENCL_PROGRAM_DIR}, in the
+directory @code{share/starpu/opencl} of the installation directory of
+StarPU, and finally in the source directory of StarPU.
+@end defvr
+
+@defvr {Environment variable} STARPU_SILENT
 This variable allows to disable verbose mode at runtime when StarPU
-has been configured with the option @code{--enable-verbose}.
-
-@node STARPU_LOGFILENAME
-@subsubsection @code{STARPU_LOGFILENAME} -- Select debug file name
+has been configured with the option @code{--enable-verbose}. It also
+disables the display of StarPU information and warning messages.
+@end defvr
 
+@defvr {Environment variable} STARPU_LOGFILENAME
 This variable specifies in which file the debugging output should be saved to.
+@end defvr
 
-@node STARPU_FXT_PREFIX
-@subsubsection @code{STARPU_FXT_PREFIX} -- FxT trace location
-
+@defvr {Environment variable} STARPU_FXT_PREFIX
 This variable specifies in which directory to save the trace generated if FxT is enabled. It needs to have a trailing '/' character.
+@end defvr
 
-@node STARPU_LIMIT_GPU_MEM
-@subsubsection @code{STARPU_LIMIT_GPU_MEM} -- Restrict memory size on the GPUs
-
+@defvr {Environment variable} STARPU_LIMIT_CUDA_devid_MEM
 This variable specifies the maximum number of megabytes that should be
-available to the application on each GPUs. In case this value is smaller than
-the size of the memory of a GPU, StarPU pre-allocates a buffer to waste memory
-on the device. This variable is intended to be used for experimental purposes
-as it emulates devices that have a limited amount of memory.
-
-@node STARPU_GENERATE_TRACE
-@subsubsection @code{STARPU_GENERATE_TRACE} -- Generate a Paje trace when StarPU is shut down
+available to the application on the CUDA device with the identifier
+@code{devid}. This variable is intended to be used for experimental
+purposes as it emulates devices that have a limited amount of memory.
+When defined, the variable overwrites the value of the variable
+@code{STARPU_LIMIT_CUDA_MEM}.
+@end defvr
+
+@defvr {Environment variable} STARPU_LIMIT_CUDA_MEM
+This variable specifies the maximum number of megabytes that should be
+available to the application on each CUDA device. This variable is
+intended to be used for experimental purposes as it emulates devices
+that have a limited amount of memory.
+@end defvr
 
-When set to 1, this variable indicates that StarPU should automatically
-generate a Paje trace when starpu_shutdown is called.
+@defvr {Environment variable} STARPU_LIMIT_OPENCL_devid_MEM
+This variable specifies the maximum number of megabytes that should be
+available to the application on the OpenCL device with the identifier
+@code{devid}. This variable is intended to be used for experimental
+purposes as it emulates devices that have a limited amount of memory.
+When defined, the variable overwrites the value of the variable
+@code{STARPU_LIMIT_OPENCL_MEM}.
+@end defvr
+
+@defvr {Environment variable} STARPU_LIMIT_OPENCL_MEM
+This variable specifies the maximum number of megabytes that should be
+available to the application on each OpenCL device. This variable is
+intended to be used for experimental purposes as it emulates devices
+that have a limited amount of memory.
+@end defvr
+
+@defvr {Environment variable} STARPU_GENERATE_TRACE
+When set to @code{1}, this variable indicates that StarPU should automatically
+generate a Paje trace when @code{starpu_shutdown()} is called.
+@end defvr
+
+@defvr {Environment variable} STARPU_MEMORY_STATS
+When set to 0, disable the display of memory statistics on data which
+have not been unregistered at the end of the execution (@pxref{Memory
+feedback}).
+@end defvr
+
+@defvr {Environment variable} STARPU_BUS_STATS
+When defined, statistics about data transfers will be displayed when calling
+@code{starpu_shutdown()} (@pxref{Profiling}).
+@end defvr
+
+@defvr {Environment variable} STARPU_WORKER_STATS
+When defined, statistics about the workers will be displayed when calling
+@code{starpu_shutdown()} (@pxref{Profiling}). When combined with the
+environment variable @code{STARPU_PROFILING}, it displays the power
+consumption (@pxref{Power-based scheduling}).
+@end defvr
+
+@defvr {Environment variable} STARPU_STATS
+When set to 0, data statistics will not be displayed at the
+end of the execution of an application (@pxref{Data statistics}).
+@end defvr

+ 0 - 1
doc/chapters/fdl-1.3.texi

@@ -505,4 +505,3 @@ to permit their use in free software.
 @c Local Variables:
 @c ispell-local-pdict: "ispell-dict"
 @c End:
-

+ 3 - 3
doc/chapters/fft-support.texi

@@ -2,7 +2,7 @@
 
 @c This file is part of the StarPU Handbook.
 @c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
-@c Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+@c Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
 @c Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
 @c See the file starpu.texi for copying conditions.
 
@@ -48,8 +48,8 @@ The flags required to compile or link against the FFT library are accessible
 with the following commands:
 
 @example
-% pkg-config --cflags starpufft-1.0  # options for the compiler
-% pkg-config --libs starpufft-1.0    # options for the linker
+$ pkg-config --cflags starpufft-1.0  # options for the compiler
+$ pkg-config --libs starpufft-1.0    # options for the linker
 @end example
 
 Also pass the @code{--static} option if the application is to be linked statically.

+ 60 - 29
doc/chapters/installing.texi

@@ -2,14 +2,14 @@
 
 @c This file is part of the StarPU Handbook.
 @c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
-@c Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+@c Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
 @c Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
 @c See the file starpu.texi for copying conditions.
 
 @menu
-* Downloading StarPU::          
-* Configuration of StarPU::     
-* Building and Installing StarPU::  
+* Downloading StarPU::
+* Configuration of StarPU::
+* Building and Installing StarPU::
 @end menu
 
 StarPU can be built and installed by the standard means of the GNU
@@ -20,33 +20,40 @@ can be used to install StarPU.
 @section Downloading StarPU
 
 @menu
-* Getting Sources::             
-* Optional dependencies::       
+* Getting Sources::
+* Optional dependencies::
 @end menu
 
 @node Getting Sources
 @subsection Getting Sources
 
-The latest official release tarballs of StarPU sources are available
-for download from
-@indicateurl{https://gforge.inria.fr/frs/?group_id=1570}.
+StarPU's sources can be obtained from the
+@url{http://runtime.bordeaux.inria.fr/StarPU/files/,download page} of
+the StarPU website.
 
-The latest nightly development snapshot is available from
-@indicateurl{http://starpu.gforge.inria.fr/testing/}.
+All releases and the development tree of StarPU are freely available
+on INRIA's gforge under the LGPL license. Some releases are available
+under the BSD license.
+
+The latest release can be downloaded from the
+@url{http://gforge.inria.fr/frs/?group_id=1570,INRIA's gforge} or
+directly from the @url{http://runtime.bordeaux.inria.fr/StarPU/files/,StarPU download page}.
+
+The latest nightly snapshot can be downloaded from the @url{http://starpu.gforge.inria.fr/testing/,StarPU gforge website}.
 
 @example
-% wget http://starpu.gforge.inria.fr/testing/starpu-nightly-latest.tar.gz
+$ wget http://starpu.gforge.inria.fr/testing/starpu-nightly-latest.tar.gz
 @end example
 
-Additionally, the code can be directly checked out of Subversion, it
-should be done only if you need the very latest changes (i.e. less
-than a day!).@footnote{The client side of the software Subversion can
-be obtained from @indicateurl{http://subversion.tigris.org}. If you
+And finally, current development version is also accessible via svn.
+It should be used only if you need the very latest changes (i.e. less
+than a day!)@footnote{The client side of the software Subversion can
+be obtained from @url{http://subversion.tigris.org}. If you
 are running on Windows, you will probably prefer to use
 @url{http://tortoisesvn.tigris.org/, TortoiseSVN}.}.
 
 @example
-% svn checkout svn://scm.gforge.inria.fr/svn/starpu/trunk
+svn checkout svn://scm.gforge.inria.fr/svn/starpu/trunk StarPU
 @end example
 
 @node Optional dependencies
@@ -58,12 +65,19 @@ recommended.  It allows for topology aware scheduling, which improves
 performance.  @code{hwloc} is available in major free operating system
 distributions, and for most operating systems.
 
+If @code{hwloc} is not available on your system, the option
+@code{--without-hwloc} should be explicitly given when calling the
+@code{configure} script. If @code{hwloc} is installed with a @code{pkg-config} file,
+no option is required, it will be detected automatically, otherwise
+@code{--with-hwloc=prefix} should be used to specify the location
+of @code{hwloc}.
+
 @node Configuration of StarPU
 @section Configuration of StarPU
 
 @menu
-* Generating Makefiles and configuration scripts::  
-* Running the configuration::   
+* Generating Makefiles and configuration scripts::
+* Running the configuration::
 @end menu
 
 @node Generating Makefiles and configuration scripts
@@ -75,43 +89,60 @@ the configure scripts and the Makefiles. This requires the
 availability of @code{autoconf}, @code{automake} >= 2.60, and @code{makeinfo}.
 
 @example
-% ./autogen.sh
+$ ./autogen.sh
 @end example
 
 @node Running the configuration
 @subsection Running the configuration
 
 @example
-% ./configure
+$ ./configure
 @end example
 
 Details about options that are useful to give to @code{./configure} are given in
 @ref{Compilation configuration}.
 
+By default, the files produced during the compilation are placed in
+the source directory. As the compilation generates a lot of files, it
+is advised to put them all in a separate directory. It is then
+easier to cleanup, and this allows to compile several configurations
+out of the same source tree. For that, simply enter the directory
+where you want the compilation to produce its files, and invoke the
+@code{configure} script located in the StarPU source directory.
+
+@example
+$ mkdir build
+$ cd build
+$ ../configure
+@end example
+
 @node Building and Installing StarPU
 @section Building and Installing StarPU
 
 @menu
-* Building::                    
-* Sanity Checks::               
-* Installing::                  
+* Building::
+* Sanity Checks::
+* Installing::
 @end menu
 
 @node Building
 @subsection Building
 
 @example
-% make
+$ make
 @end example
 
 @node Sanity Checks
 @subsection Sanity Checks
 
-In order to make sure that StarPU is working properly on the system, it is also
-possible to run a test suite.
+Once everything is built, you may want to test the result. An
+extensive set of regression tests is provided with StarPU. Running the
+tests is done by calling @code{make check}. These tests are run every night
+and the result from the main profile is publicly
+@url{http://starpu.gforge.inria.fr/testing/,available}.
 
 @example
-% make check
+$ make check
 @end example
 
 @node Installing
@@ -121,7 +152,7 @@ In order to install StarPU at the location that was specified during
 configuration:
 
 @example
-% make install
+$ make install
 @end example
 
 Libtool interface versioning information are included in

+ 28 - 11
doc/chapters/introduction.texi

@@ -2,13 +2,16 @@
 
 @c This file is part of the StarPU Handbook.
 @c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
-@c Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+@c Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
 @c Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
 @c See the file starpu.texi for copying conditions.
 
 @menu
 * Motivation::                  Why StarPU ?
 * StarPU in a Nutshell::        The Fundamentals of StarPU
+* Application taskification::   How to taskify an application
+* Glossary::
+* Research Papers::
 @end menu
 
 @node Motivation
@@ -70,22 +73,22 @@ policies in a portable fashion (@pxref{Scheduling Policy API}).
 The remainder of this section describes the main concepts used in StarPU.
 
 @menu
-* Codelet and Tasks::           
-* StarPU Data Management Library::  
-* Glossary::
-* Research Papers::
+* Codelet and Tasks::
+* StarPU Data Management Library::
 @end menu
 
 @c explain the notion of codelet and task (i.e. g(A, B)
 @node Codelet and Tasks
 @subsection Codelet and Tasks
 
+@cindex codelet
 One of the StarPU primary data structures is the @b{codelet}. A codelet describes a
 computational kernel that can possibly be implemented on multiple architectures
-such as a CPU, a CUDA device or a Cell's SPU.
+such as a CPU, a CUDA device or an OpenCL device.
 
 @c TODO insert illustration f: f_spu, f_cpu, ...
 
+@cindex task
 Another important data structure is the @b{task}. Executing a StarPU task
 consists in applying a codelet on a data set, on one of the architectures on
 which the codelet is implemented. A task thus describes the codelet that it
@@ -97,6 +100,7 @@ called once StarPU has properly executed the task. It also contains optional
 fields that the application may use to give hints to the scheduler (such as
 priority levels).
 
+@cindex tag
 By default, task dependencies are inferred from data dependency (sequential
 coherence) by StarPU. The application can however disable sequential coherency
 for some data, and dependencies be expressed by hand.
@@ -121,8 +125,21 @@ where it was last needed, even if was modified there, and it
 allows multiple copies of the same data to reside at the same time on
 several processing units as long as it is not modified.
 
+@node Application taskification
+@section Application taskification
+
+TODO
+
+@c TODO: section describing what taskifying an application means: before
+@c porting to StarPU, turn the program into:
+@c "pure" functions, which only access data from their passed parameters
+@c a main function which just calls these pure functions
+@c
+@c and then it's trivial to use StarPU or any other kind of task-based library:
+@c simply replace calling the function with submitting a task.
+
 @node Glossary
-@subsection Glossary
+@section Glossary
 
 A @b{codelet} records pointers to various implementations of the same
 theoretical function.
@@ -147,7 +164,7 @@ A @b{worker} execute tasks. There is typically one per CPU computation core and
 one per accelerator (for which a whole CPU core is dedicated).
 
 A @b{driver} drives a given kind of workers. There are currently CPU, CUDA,
-OpenCL and Gordon drivers. They usually start several workers to actually drive
+and OpenCL drivers. They usually start several workers to actually drive
 them.
 
 A @b{performance model} is a (dynamic or static) model of the performance of a
@@ -175,10 +192,10 @@ unregister it.
 
 
 @node Research Papers
-@subsection Research Papers
+@section Research Papers
 
 Research papers about StarPU can be found at
-@indicateurl{http://runtime.bordeaux.inria.fr/Publis/Keyword/STARPU.html}.
+@url{http://runtime.bordeaux.inria.fr/Publis/Keyword/STARPU.html}.
 
 A good overview is available in the research report at
-@indicateurl{http://hal.archives-ouvertes.fr/inria-00467677}.
+@url{http://hal.archives-ouvertes.fr/inria-00467677}.

+ 106 - 144
doc/chapters/mpi-support.texi

@@ -2,7 +2,7 @@
 
 @c This file is part of the StarPU Handbook.
 @c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
-@c Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+@c Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
 @c Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
 @c See the file starpu.texi for copying conditions.
 
@@ -20,146 +20,25 @@ distributed application, by automatically issuing all required data transfers
 according to the task graph and an application-provided distribution.
 
 @menu
-* The API::
 * Simple Example::
+* Exchanging User Defined Data Interface::
 * MPI Insert Task Utility::
 * MPI Collective Operations::
 @end menu
 
-@node The API
-@section The API
-
-@subsection Compilation
+@node Simple Example
+@section Simple Example
 
 The flags required to compile or link against the MPI layer are then
 accessible with the following commands:
 
 @example
-% pkg-config --cflags starpumpi-1.0  # options for the compiler
-% pkg-config --libs starpumpi-1.0    # options for the linker
+$ pkg-config --cflags starpumpi-1.0  # options for the compiler
+$ pkg-config --libs starpumpi-1.0    # options for the linker
 @end example
 
 Also pass the @code{--static} option if the application is to be linked statically.
 
-@subsection Initialisation
-
-@deftypefun int starpu_mpi_initialize (void)
-Initializes the starpumpi library. This must be called between calling
-@code{starpu_init} and other @code{starpu_mpi} functions. This
-function does not call @code{MPI_Init}, it should be called beforehand.
-@end deftypefun
-
-@deftypefun int starpu_mpi_initialize_extended (int *@var{rank}, int *@var{world_size})
-Initializes the starpumpi library. This must be called between calling
-@code{starpu_init} and other @code{starpu_mpi} functions.
-This function calls @code{MPI_Init}, and therefore should be prefered
-to the previous one for MPI implementations which are not thread-safe.
-Returns the current MPI node rank and world size.
-@end deftypefun
-
-@deftypefun int starpu_mpi_shutdown (void)
-Cleans the starpumpi library. This must be called between calling
-@code{starpu_mpi} functions and @code{starpu_shutdown}.
-@code{MPI_Finalize} will be called if StarPU-MPI has been initialized
-by calling @code{starpu_mpi_initialize_extended}.
-@end deftypefun
-
-@subsection Communication
-
-The standard point to point communications of MPI have been
-implemented. The semantic is similar to the MPI one, but adapted to
-the DSM provided by StarPU. A MPI request will only be submitted when
-the data is available in the main memory of the node submitting the
-request.
-
-@deftypefun int starpu_mpi_send (starpu_data_handle_t @var{data_handle}, int @var{dest}, int @var{mpi_tag}, MPI_Comm @var{comm})
-Performs a standard-mode, blocking send of @var{data_handle} to the
-node @var{dest} using the message tag @code{mpi_tag} within the
-communicator @var{comm}.
-@end deftypefun
-
-@deftypefun int starpu_mpi_recv (starpu_data_handle_t @var{data_handle}, int @var{source}, int @var{mpi_tag}, MPI_Comm @var{comm}, MPI_Status *@var{status})
-Performs a standard-mode, blocking receive in @var{data_handle} from the
-node @var{source} using the message tag @code{mpi_tag} within the
-communicator @var{comm}.
-@end deftypefun
-
-@deftypefun int starpu_mpi_isend (starpu_data_handle_t @var{data_handle}, starpu_mpi_req *@var{req}, int @var{dest}, int @var{mpi_tag}, MPI_Comm @var{comm})
-Posts a standard-mode, non blocking send of @var{data_handle} to the
-node @var{dest} using the message tag @code{mpi_tag} within the
-communicator @var{comm}. After the call, the pointer to the request
-@var{req} can be used to test the completion of the communication.
-@end deftypefun
-
-@deftypefun int starpu_mpi_irecv (starpu_data_handle_t @var{data_handle}, starpu_mpi_req *@var{req}, int @var{source}, int @var{mpi_tag}, MPI_Comm @var{comm})
-Posts a nonblocking receive in @var{data_handle} from the
-node @var{source} using the message tag @code{mpi_tag} within the
-communicator @var{comm}. After the call, the pointer to the request
-@var{req} can be used to test the completion of the communication.
-@end deftypefun
-
-@deftypefun int starpu_mpi_isend_detached (starpu_data_handle_t @var{data_handle}, int @var{dest}, int @var{mpi_tag}, MPI_Comm @var{comm}, void (*@var{callback})(void *), void *@var{arg})
-Posts a standard-mode, non blocking send of @var{data_handle} to the
-node @var{dest} using the message tag @code{mpi_tag} within the
-communicator @var{comm}. On completion, the @var{callback} function is
-called with the argument @var{arg}.
-@end deftypefun
-
-@deftypefun int starpu_mpi_irecv_detached (starpu_data_handle_t @var{data_handle}, int @var{source}, int @var{mpi_tag}, MPI_Comm @var{comm}, void (*@var{callback})(void *), void *@var{arg})
-Posts a nonblocking receive in @var{data_handle} from the
-node @var{source} using the message tag @code{mpi_tag} within the
-communicator @var{comm}. On completion, the @var{callback} function is
-called with the argument @var{arg}.
-@end deftypefun
-
-@deftypefun int starpu_mpi_wait (starpu_mpi_req *@var{req}, MPI_Status *@var{status})
-Returns when the operation identified by request @var{req} is complete.
-@end deftypefun
-
-@deftypefun int starpu_mpi_test (starpu_mpi_req *@var{req}, int *@var{flag}, MPI_Status *@var{status})
-If the operation identified by @var{req} is complete, set @var{flag}
-to 1. The @var{status} object is set to contain information on the
-completed operation.
-@end deftypefun
-
-@deftypefun int starpu_mpi_barrier (MPI_Comm @var{comm})
-Blocks the caller until all group members of the communicator
-@var{comm} have called it.
-@end deftypefun
-
-@deftypefun int starpu_mpi_isend_detached_unlock_tag (starpu_data_handle_t @var{data_handle}, int @var{dest}, int @var{mpi_tag}, MPI_Comm @var{comm}, starpu_tag_t @var{tag})
-Posts a standard-mode, non blocking send of @var{data_handle} to the
-node @var{dest} using the message tag @code{mpi_tag} within the
-communicator @var{comm}. On completion, @var{tag} is unlocked.
-@end deftypefun
-
-@deftypefun int starpu_mpi_irecv_detached_unlock_tag (starpu_data_handle_t @var{data_handle}, int @var{source}, int @var{mpi_tag}, MPI_Comm @var{comm}, starpu_tag_t @var{tag})
-Posts a nonblocking receive in @var{data_handle} from the
-node @var{source} using the message tag @code{mpi_tag} within the
-communicator @var{comm}. On completion, @var{tag} is unlocked.
-@end deftypefun
-
-@deftypefun int starpu_mpi_isend_array_detached_unlock_tag (unsigned @var{array_size}, starpu_data_handle_t *@var{data_handle}, int *@var{dest}, int *@var{mpi_tag}, MPI_Comm *@var{comm}, starpu_tag_t @var{tag})
-Posts @var{array_size} standard-mode, non blocking send. Each post
-sends the n-th data of the array @var{data_handle} to the n-th node of
-the array @var{dest}
-using the n-th message tag of the array @code{mpi_tag} within the n-th
-communicator of the array
-@var{comm}. On completion of the all the requests, @var{tag} is unlocked.
-@end deftypefun
-
-@deftypefun int starpu_mpi_irecv_array_detached_unlock_tag (unsigned @var{array_size}, starpu_data_handle_t *@var{data_handle}, int *@var{source}, int *@var{mpi_tag}, MPI_Comm *@var{comm}, starpu_tag_t @var{tag})
-Posts @var{array_size} nonblocking receive. Each post receives in the
-n-th data of the array @var{data_handle} from the n-th
-node of the array @var{source} using the n-th message tag of the array
-@code{mpi_tag} within the n-th communicator of the array @var{comm}.
-On completion of the all the requests, @var{tag} is unlocked.
-@end deftypefun
-
-@page
-@node Simple Example
-@section Simple Example
-
 @cartouche
 @smallexample
 void increment_token(void)
@@ -242,16 +121,80 @@ int main(int argc, char **argv)
 @end cartouche
 
 @page
+@node Exchanging User Defined Data Interface
+@section Exchanging User Defined Data Interface
+
+New data interfaces defined as explained in @ref{An example
+of data interface} can also be used within StarPU-MPI and exchanged
+between nodes. Two functions needs to be defined through
+the type @code{struct starpu_data_interface_ops} (@pxref{Data
+Interface API}). The pack function takes a handle and returns a
+contiguous memory buffer along with its size where data to be conveyed to another node
+should be copied. The reverse operation is implemented in the unpack
+function which takes a contiguous memory buffer and recreates the data
+handle.
+
+@cartouche
+@smallexample
+static int complex_pack_data(starpu_data_handle_t handle, unsigned node, void **ptr, size_t *count)
+@{
+  STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
+
+  struct starpu_complex_interface *complex_interface =
+    (struct starpu_complex_interface *) starpu_data_get_interface_on_node(handle, node);
+
+  *count = complex_get_size(handle);
+  *ptr = malloc(*count);
+  memcpy(*ptr, complex_interface->real, complex_interface->nx*sizeof(double));
+  memcpy(*ptr+complex_interface->nx*sizeof(double), complex_interface->imaginary,
+         complex_interface->nx*sizeof(double));
+
+  return 0;
+@}
+@end smallexample
+@end cartouche
+
+@cartouche
+@smallexample
+static int complex_unpack_data(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count)
+@{
+  STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
+
+  struct starpu_complex_interface *complex_interface =
+    (struct starpu_complex_interface *)	starpu_data_get_interface_on_node(handle, node);
+
+  memcpy(complex_interface->real, ptr, complex_interface->nx*sizeof(double));
+  memcpy(complex_interface->imaginary, ptr+complex_interface->nx*sizeof(double),
+         complex_interface->nx*sizeof(double));
+
+  return 0;
+@}
+@end smallexample
+@end cartouche
+
+@cartouche
+@smallexample
+static struct starpu_data_interface_ops interface_complex_ops =
+@{
+  ...
+  .pack_data = complex_pack_data,
+  .unpack_data = complex_unpack_data
+@};
+@end smallexample
+@end cartouche
+
+@page
 @node MPI Insert Task Utility
 @section MPI Insert Task Utility
 
 To save the programmer from having to explicit all communications, StarPU
 provides an "MPI Insert Task Utility". The principle is that the application
 decides a distribution of the data over the MPI nodes by allocating it and
-notifying StarPU of that decision, i.e. tell StarPU which MPI node "owns" which
-data. All MPI nodes then process the whole task graph, and StarPU automatically
-determines which node actually execute which task, as well as the required MPI
-transfers.
+notifying StarPU of that decision, i.e. tell StarPU which MPI node "owns"
+which data. It also decides, for each handle, an MPI tag which will be used to
+exchange the content of the handle. All MPI nodes then process the whole task
+graph, and StarPU automatically determines which node actually executes which
+task, and triggers the required MPI transfers.
 
 @deftypefun int starpu_data_set_tag (starpu_data_handle_t @var{handle}, int @var{tag})
 Tell StarPU-MPI which MPI tag to use when exchanging the data.
@@ -297,22 +240,32 @@ handle.
 
 The internal algorithm is as follows:
 @enumerate
-@item Find out whether we (as an MPI node) are to execute the codelet
-because we own the data to be written to. If different nodes own data
-to be written to, the argument @code{STARPU_EXECUTE_ON_NODE} or
-@code{STARPU_EXECUTE_ON_DATA} has to be used to specify which MPI node will
-execute the task.
+@item Find out which MPI node is going to execute the codelet.
+      @enumerate
+      @item If there is only one node owning data in W mode, it will
+      be selected;
+      @item If there are several nodes owning data in W mode, the one
+      selected will be the one having the least data in R mode so as
+      to minimize the amount of data to be transferred;
+      @item The argument @code{STARPU_EXECUTE_ON_NODE} followed by an
+      integer can be used to specify the node;
+      @item The argument @code{STARPU_EXECUTE_ON_DATA} followed by a
+      data handle can be used to specify that the node owning the given
+      data will execute the codelet.
+      @end enumerate
 @item Send and receive data as requested. Nodes owning data which need to be
 read by the task are sending them to the MPI node which will execute it. The
 latter receives them.
 @item Execute the codelet. This is done by the MPI node selected in the
 1st step of the algorithm.
-@item In the case when different MPI nodes own data to be written to, send
-written data back to their owners.
+@item If several MPI nodes own data to be written to, send written
+data back to their owners.
 @end enumerate
 
-The algorithm also includes a cache mechanism that allows not to send
-data twice to the same MPI node, unless the data has been modified.
+The algorithm also includes a communication cache mechanism that
+avoids sending data twice to the same MPI node, unless the data
+has been modified. The cache can be disabled
+(@pxref{STARPU_MPI_CACHE}).
 
 @end deftypefun
 
@@ -405,21 +358,32 @@ each task, only the MPI node which owns the data being written to (here,
 @code{data_handles[x][y]}) will actually run the task. The other MPI nodes will
 automatically send the required data.
 
+This can be a concern with a growing number of nodes. To avoid this, the
+application can prune the task for loops according to the data distribution,
+so as to only submit tasks on nodes which have to care about them (either to
+execute them, or to send the required data).
+
 @node MPI Collective Operations
 @section MPI Collective Operations
 
-@deftypefun int starpu_mpi_scatter_detached (starpu_data_handle_t *@var{data_handles}, int @var{count}, int @var{root}, MPI_Comm @var{comm})
+@deftypefun int starpu_mpi_scatter_detached (starpu_data_handle_t *@var{data_handles}, int @var{count}, int @var{root}, MPI_Comm @var{comm}, {void (*}@var{scallback})(void *), {void *}@var{sarg}, {void (*}@var{rcallback})(void *), {void *}@var{rarg})
 Scatter data among processes of the communicator based on the ownership of
 the data. For each data of the array @var{data_handles}, the
 process @var{root} sends the data to the process owning this data.
 Processes receiving data must have valid data handles to receive them.
+On completion of the collective communication, the @var{scallback} function is
+called with the argument @var{sarg} on the process @var{root}, the @var{rcallback} function is
+called with the argument @var{rarg} on any other process.
 @end deftypefun
 
-@deftypefun int starpu_mpi_gather_detached (starpu_data_handle_t *@var{data_handles}, int @var{count}, int @var{root}, MPI_Comm @var{comm})
+@deftypefun int starpu_mpi_gather_detached (starpu_data_handle_t *@var{data_handles}, int @var{count}, int @var{root}, MPI_Comm @var{comm}, {void (*}@var{scallback})(void *), {void *}@var{sarg}, {void (*}@var{rcallback})(void *), {void *}@var{rarg})
 Gather data from the different processes of the communicator onto the
 process @var{root}. Each process owning data handle in the array
 @var{data_handles} will send them to the process @var{root}. The
 process @var{root} must have valid data handles to receive the data.
+On completion of the collective communication, the @var{rcallback} function is
+called with the argument @var{rarg} on the process @var{root}, the @var{scallback} function is
+called with the argument @var{sarg} on any other process.
 @end deftypefun
 
 @page
@@ -475,5 +439,3 @@ for(x = 0; x < nblocks ;  x++) @{
 starpu_mpi_gather_detached(data_handles, nblocks, 0, MPI_COMM_WORLD);
 @end smallexample
 @end cartouche
-
-

+ 210 - 24
doc/chapters/perf-feedback.texi

@@ -2,22 +2,43 @@
 
 @c This file is part of the StarPU Handbook.
 @c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
-@c Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+@c Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
 @c Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
 @c See the file starpu.texi for copying conditions.
 
 @menu
+* Task debugger::               Using the Temanejo task debugger
 * On-line::                     On-line performance feedback
 * Off-line::                    Off-line performance feedback
 * Codelet performance::         Performance of codelets
-* Theoretical lower bound on execution time API::  
+* Theoretical lower bound on execution time API::
+* Memory feedback::
+* Data statistics::
 @end menu
 
+@node Task debugger
+@section Using the Temanejo task debugger
+
+StarPU can connect to Temanejo (see
+@url{http://www.hlrs.de/temanejo}), to permit
+nice visual task debugging. To do so, build Temanejo's @code{libayudame.so},
+install @code{Ayudame.h} to e.g. @code{/usr/local/include}, apply the
+@code{tools/patch-ayudame} to it to fix C build, re-@code{./configure}, make
+sure that it found it, rebuild StarPU.  Run the Temanejo GUI, give it the path
+to your application, any options you want to pass it, the path to libayudame.so.
+
+Make sure to specify at least the same number of CPUs in the dialog box as your
+machine has, otherwise an error will happen during execution. Future versions
+of Temanejo should be able to tell StarPU the number of CPUs to use.
+
+Tag numbers have to be below @code{4000000000000000000ULL} to be usable for
+Temanejo (so as to distinguish them from tasks).
+
 @node On-line
 @section On-line performance feedback
 
 @menu
-* Enabling monitoring::         Enabling on-line performance monitoring
+* Enabling on-line performance monitoring::
 * Task feedback::               Per-task feedback
 * Codelet feedback::            Per-codelet feedback
 * Worker feedback::             Per-worker feedback
@@ -25,7 +46,7 @@
 * StarPU-Top::                  StarPU-Top interface
 @end menu
 
-@node Enabling monitoring
+@node Enabling on-line performance monitoring
 @subsection Enabling on-line performance monitoring
 
 In order to enable online performance monitoring, the application can call
@@ -87,7 +108,7 @@ because there is no task to execute at all (@code{sleeping_time}), and the
 number of tasks that were executed while profiling was enabled.
 These values give an estimation of the proportion of time spent do real work,
 and the time spent either sleeping because there are not enough executable
-tasks or simply wasted in pure StarPU overhead. 
+tasks or simply wasted in pure StarPU overhead.
 
 Calling @code{starpu_worker_get_profiling_info} resets the profiling
 information associated to a worker.
@@ -98,7 +119,7 @@ generate a graphic showing the evolution of these values during the time, for
 the different workers.
 
 @node Bus feedback
-@subsection Bus-related feedback 
+@subsection Bus-related feedback
 
 TODO: ajouter STARPU_BUS_STATS
 
@@ -200,13 +221,13 @@ start the application itself (possibly on a remote machine). The SSH checkbox
 should be checked, and a command line provided, e.g.:
 
 @example
-ssh myserver STARPU_SCHED=heft ./application
+$ ssh myserver STARPU_SCHED=dmda ./application
 @end example
 
 If port 2011 of the remote machine can not be accessed directly, an ssh port bridge should be added:
 
 @example
-ssh -L 2011:localhost:2011 myserver STARPU_SCHED=heft ./application
+$ ssh -L 2011:localhost:2011 myserver STARPU_SCHED=dmda ./application
 @end example
 
 and "localhost" should be used as IP Address to connect to.
@@ -226,26 +247,26 @@ and "localhost" should be used as IP Address to connect to.
 @subsection Generating traces with FxT
 
 StarPU can use the FxT library (see
-@indicateurl{https://savannah.nongnu.org/projects/fkt/}) to generate traces
+@url{https://savannah.nongnu.org/projects/fkt/}) to generate traces
 with a limited runtime overhead.
 
 You can either get a tarball:
 @example
-% wget http://download.savannah.gnu.org/releases/fkt/fxt-0.2.2.tar.gz
+$ wget http://download.savannah.gnu.org/releases/fkt/fxt-0.2.11.tar.gz
 @end example
 
 or use the FxT library from CVS (autotools are required):
 @example
-% cvs -d :pserver:anonymous@@cvs.sv.gnu.org:/sources/fkt co FxT
-% ./bootstrap
+$ cvs -d :pserver:anonymous@@cvs.sv.gnu.org:/sources/fkt co FxT
+$ ./bootstrap
 @end example
 
 Compiling and installing the FxT library in the @code{$FXTDIR} path is
 done following the standard procedure:
 @example
-% ./configure --prefix=$FXTDIR
-% make
-% make install
+$ ./configure --prefix=$FXTDIR
+$ make
+$ make install
 @end example
 
 In order to have StarPU to generate traces, StarPU should be configured with
@@ -270,11 +291,11 @@ the @code{STARPU_FXT_PREFIX} environment variable.
 When the FxT trace file @code{filename} has been generated, it is possible to
 generate a trace in the Paje format by calling:
 @example
-% starpu_fxt_tool -i filename
+$ starpu_fxt_tool -i filename
 @end example
 
 Or alternatively, setting the @code{STARPU_GENERATE_TRACE} environment variable
-to 1 before application execution will make StarPU do it automatically at
+to @code{1} before application execution will make StarPU do it automatically at
 application shutdown.
 
 This will create a @code{paje.trace} file in the current directory that
@@ -282,12 +303,19 @@ can be inspected with the @url{http://vite.gforge.inria.fr/, ViTE trace
 visualizing open-source tool}.  It is possible to open the
 @code{paje.trace} file with ViTE by using the following command:
 @example
-% vite paje.trace
+$ vite paje.trace
 @end example
 
 To get names of tasks instead of "unknown", fill the optional @code{name} field
 of the codelets, or use a performance model for them.
 
+In the MPI execution case, collect the trace files from the MPI nodes, and
+specify them all on the @code{starpu_fxt_tool} command, for instance:
+
+@smallexample
+$ starpu_fxt_tool -i filename1 -i filename2
+@end smallexample
+
 By default, all tasks are displayed using a green color. To display tasks with
 varying colors, pass option @code{-c} to @code{starpu_fxt_tool}.
 
@@ -355,7 +383,7 @@ file: <starpu_slu_lu_model_12.hannibal>
 @end example
 
 Here, the codelets of the lu example are available. We can examine the
-performance of the 22 kernel (in micro-seconds):
+performance of the 22 kernel (in micro-seconds), which is history-based:
 
 @example
 $ starpu_perfmodel_display -s starpu_slu_lu_model_22
@@ -378,15 +406,68 @@ execution, the GPUs are about 20 times faster than the CPUs (numbers are in
 us). The standard deviation is extremely low for the GPUs, and less than 10% for
 CPUs.
 
-The @code{starpu_regression_display} tool does the same for regression-based
-performance models. It also writes a @code{.gp} file in the current directory,
-to be run in the @code{gnuplot} tool, which shows the corresponding curve.
+This tool can also be used for regression-based performance models. It will then
+display the regression formula, and in the case of non-linear regression, the
+same performance log as for history-based performance models:
+
+@example
+$ starpu_perfmodel_display -s non_linear_memset_regression_based.type
+performance model for cpu_impl_0
+	Regression : #sample = 1400
+	Linear: y = alpha size ^ beta
+		alpha = 1.335973e-03
+		beta = 8.024020e-01
+	Non-Linear: y = a size ^b + c
+		a = 5.429195e-04
+		b = 8.654899e-01
+		c = 9.009313e-01
+# hash		size		mean		stddev		n
+a3d3725e	4096           	4.763200e+00   	7.650928e-01   	100
+870a30aa	8192           	1.827970e+00   	2.037181e-01   	100
+48e988e9	16384          	2.652800e+00   	1.876459e-01   	100
+961e65d2	32768          	4.255530e+00   	3.518025e-01   	100
+...
+@end example
+
+The @code{starpu_perfmodel_plot} tool can be used to draw performance models.
+It writes a @code{.gp} file in the current directory, to be run in the
+@code{gnuplot} tool, which shows the corresponding curve.
 
 The same can also be achieved by using StarPU's library API, see
-@ref{Performance Model API} and notably the @code{starpu_load_history_debug}
+@ref{Performance Model API} and notably the @code{starpu_perfmodel_load_symbol}
 function. The source code of the @code{starpu_perfmodel_display} tool can be a
 useful example.
 
+When the FxT trace file @code{filename} has been generated, it is possible to
+get a profiling of each codelet by calling:
+@example
+$ starpu_fxt_tool -i filename
+$ starpu_codelet_profile distrib.data codelet_name
+@end example
+
+This will create profiling data files, and a @code{.gp} file in the current
+directory, which draws the distribution of codelet time over the application
+execution, according to data input size.
+
+This is also available in the @code{starpu_perfmodel_plot} tool, by passing it
+the fxt trace:
+
+@example
+$ starpu_perfmodel_plot -s non_linear_memset_regression_based.type -i /tmp/prof_file_foo_0
+@end example
+
+It will produce a @code{.gp} file which contains both the performance model
+curves, and the profiling measurements.
+
+If you have the R statistical tool installed, you can additionally use
+
+@example
+$ starpu_codelet_histo_profile distrib.data
+@end example
+
+which will create one PDF file per codelet and per input size, showing a
+histogram of the codelet execution time distribution.
+
 @node Theoretical lower bound on execution time API
 @section Theoretical lower bound on execution time
 
@@ -415,7 +496,7 @@ Print the DAG that was recorded
 @end deftypefun
 
 @deftypefun void starpu_bound_compute ({double *}@var{res}, {double *}@var{integer_res}, int @var{integer})
-Get theoretical upper bound (in ms) (needs glpk support detected by @code{configure} script)
+Get theoretical upper bound (in ms) (needs glpk support detected by @code{configure} script). It returns 0 if some performance models are not calibrated.
 @end deftypefun
 
 @deftypefun void starpu_bound_print_lp ({FILE *}@var{output})
@@ -433,3 +514,108 @@ Emit statistics of actual execution vs theoretical upper bound. @var{integer}
 permits to choose between integer solving (which takes a long time but is
 correct), and relaxed solving (which provides an approximate solution).
 @end deftypefun
+
+@node Memory feedback
+@section Memory feedback
+
+It is possible to enable memory statistics. To do so, you need to pass the option
+@code{--enable-memory-stats} when running configure. It is then
+possible to call the function @code{starpu_display_memory_stats()} to
+display statistics about the current data handles registered within StarPU.
+
+Moreover, statistics will be displayed at the end of the execution on
+data handles which have not been cleared out. This can be disabled by
+setting the environment variable @code{STARPU_MEMORY_STATS} to 0.
+
+For example, if you do not unregister data at the end of the complex
+example, you will get something similar to:
+
+@example
+$ STARPU_MEMORY_STATS=0 ./examples/interface/complex
+Complex[0] = 45.00 + 12.00 i
+Complex[0] = 78.00 + 78.00 i
+Complex[0] = 45.00 + 12.00 i
+Complex[0] = 45.00 + 12.00 i
+@end example
+
+@example
+$ STARPU_MEMORY_STATS=1 ./examples/interface/complex
+Complex[0] = 45.00 + 12.00 i
+Complex[0] = 78.00 + 78.00 i
+Complex[0] = 45.00 + 12.00 i
+Complex[0] = 45.00 + 12.00 i
+
+#---------------------
+Memory stats:
+#-------
+Data on Node #3
+#-----
+Data : 0x553ff40
+Size : 16
+
+#--
+Data access stats
+/!\ Work Underway
+Node #0
+	Direct access : 4
+	Loaded (Owner) : 0
+	Loaded (Shared) : 0
+	Invalidated (was Owner) : 0
+
+Node #3
+	Direct access : 0
+	Loaded (Owner) : 0
+	Loaded (Shared) : 1
+	Invalidated (was Owner) : 0
+
+#-----
+Data : 0x5544710
+Size : 16
+
+#--
+Data access stats
+/!\ Work Underway
+Node #0
+	Direct access : 2
+	Loaded (Owner) : 0
+	Loaded (Shared) : 1
+	Invalidated (was Owner) : 1
+
+Node #3
+	Direct access : 0
+	Loaded (Owner) : 1
+	Loaded (Shared) : 0
+	Invalidated (was Owner) : 0
+@end example
+
+@node Data statistics
+@section Data statistics
+
+Different data statistics can be displayed at the end of the execution
+of the application. To enable them, you need to pass the option
+@code{--enable-stats} when calling @code{configure}. When calling
+@code{starpu_shutdown()} various statistics will be displayed,
+execution, MSI cache statistics, allocation cache statistics, and data
+transfer statistics. The display can be disabled by setting the
+environment variable @code{STARPU_STATS} to 0.
+
+@example
+$ ./examples/cholesky/cholesky_tag
+Computation took (in ms)
+518.16
+Synthetic GFlops : 44.21
+#---------------------
+MSI cache stats :
+TOTAL MSI stats	hit 1622 (66.23 %)	miss 827 (33.77 %)
+...
+@end example
+
+@example
+$ STARPU_STATS=0 ./examples/cholesky/cholesky_tag
+Computation took (in ms)
+518.16
+Synthetic GFlops : 44.21
+@end example
+
+@c TODO: data transfer stats are similar to the ones displayed when
+@c setting STARPU_BUS_STATS

+ 269 - 10
doc/chapters/perf-optimization.texi

@@ -2,7 +2,7 @@
 
 @c This file is part of the StarPU Handbook.
 @c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
-@c Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+@c Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
 @c Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
 @c See the file starpu.texi for copying conditions.
 
@@ -14,13 +14,16 @@ TODO: improve!
 * Task submission::
 * Task priorities::
 * Task scheduling policy::
+* Task scheduling contexts::
 * Performance model calibration::
 * Task distribution vs Data transfer::
 * Data prefetch::
 * Power-based scheduling::
+* Static scheduling::
 * Profiling::
 * CUDA-specific optimizations::
 * Performance debugging::
+* Simulated performance::
 @end menu
 
 Simply encapsulating application kernels into tasks already permits to
@@ -79,16 +82,61 @@ In the same vein, accumulation of results in the same data can become a
 bottleneck. The use of the @code{STARPU_REDUX} mode permits to optimize such
 accumulation (@pxref{Data reduction}).
 
+Applications often need a data just for temporary results.  In such a case,
+registration can be made without an initial value, for instance this produces a vector data:
+
+@cartouche
+@smallexample
+starpu_vector_data_register(&handle, -1, 0, n, sizeof(float));
+@end smallexample
+@end cartouche
+
+StarPU will then allocate the actual buffer only when it is actually needed,
+e.g. directly on the GPU without allocating in main memory.
+
+In the same vein, once the temporary results are not useful any more, the
+data should be thrown away. If the handle is not to be reused, it can be
+unregistered:
+
+@cartouche
+@smallexample
+starpu_unregister_submit(handle);
+@end smallexample
+@end cartouche
+
+actual unregistration will be done after all tasks working on the handle
+terminate.
+
+If the handle is to be reused, instead of unregistering it, it can simply be invalidated:
+
+@cartouche
+@smallexample
+starpu_invalidate_submit(handle);
+@end smallexample
+@end cartouche
+
+the buffers containing the current value will then be freed, and reallocated
+only when another task writes some value to the handle.
+
 @node Task granularity
 @section Task granularity
 
 Like any other runtime, StarPU has some overhead to manage tasks. Since
 it does smart scheduling and data management, that overhead is not always
 neglectable. The order of magnitude of the overhead is typically a couple of
-microseconds. The amount of work that a task should do should thus be somewhat
+microseconds, which is actually quite smaller than the CUDA overhead itself. The
+amount of work that a task should do should thus be somewhat
 bigger, to make sure that the overhead becomes neglectible. The offline
 performance feedback can provide a measure of task length, which should thus be
-checked if bad performance are observed.
+checked if bad performance are observed. To get a grasp at the scalability
+possibility according to task size, one can run
+@code{tests/microbenchs/tasks_size_overhead.sh} which draws curves of the
+speedup of independent tasks of very small sizes.
+
+The choice of scheduler also has impact over the overhead: for instance, the
+@code{dmda} scheduler takes time to make a decision, while @code{eager} does
+not. @code{tasks_size_overhead.sh} can again be used to get a grasp at how much
+impact that has on the target machine.
 
 @node Task submission
 @section Task submission
@@ -149,14 +197,48 @@ buffers.
 The @b{dmdas} (deque model data aware sorted) scheduler is similar to dmda, it
 also supports arbitrary priority values.
 
-The @b{heft} (heterogeneous earliest finish time) scheduler is similar to dmda, it also supports task bundles.
+The @b{heft} (heterogeneous earliest finish time) scheduler is deprecated. It
+is now just an alias for @b{dmda}.
 
 The @b{pheft} (parallel HEFT) scheduler is similar to heft, it also supports
 parallel tasks (still experimental).
 
-The @b{pgreedy} (parallel greedy) scheduler is similar to greedy, it also
+The @b{peager} (parallel eager) scheduler is similar to eager, it also
 supports parallel tasks (still experimental).
 
+@node Task scheduling contexts
+@section Task scheduling contexts
+Task scheduling contexts represent abstract sets of workers that allow the programmers to control the distribution of computational resources (i.e. CPUs and
+GPUs) to concurrent parallel kernels. The main goal is to minimize interferences between the execution of multiple parallel kernels, by partitioning the underlying pool of workers using contexts.
+
+By default, the application submits tasks to an initial context, which has at its disposal all the computation resources available to StarPU (all the workers).
+If the application programmer plans to launch several parallel kernels simultaneously, by default these kernels will be executed within this initial context, using a single scheduler policy (@pxref{Task scheduling policy}).
+Meanwhile, if the application programmer is aware of the demands of these kernels and of the specificity of the machine used to execute them, the workers can be divided between several contexts. 
+These scheduling contexts will isolate the execution of each kernel and they will permit the use of a scheduling policy proper to each one of them.
+In order to create the contexts, you have to know the identifiers of the workers running within StarPU.
+By passing a set of workers together with the scheduling policy to the function @code{starpu_sched_ctx_create}, you will get an identifier of the context created which you will use to indicate the context you want to submit the tasks to.
+
+@cartouche
+@smallexample
+/* @b{the list of resources the context will manage} */
+int workerids[3] = @{1, 3, 10@};
+
+/* @b{indicate the scheduling policy to be used within the context, the list of 
+   workers assigned to it, the number of workers, the name of the context} */
+int id_ctx = starpu_sched_ctx_create("heft", workerids, 3, "my_ctx");
+
+/* @b{let StarPU know that the following tasks will be submitted to this context} */
+starpu_task_set_context(id_ctx);
+
+/* @b{submit the task to StarPU} */
+starpu_task_submit(task);
+
+@end smallexample
+@end cartouche
+
+Note: Parallel greedy and parallel heft scheduling policies do not support the existence of several disjoint contexts on the machine. 
+Combined workers are constructed depending on the entire topology of the machine, not only the one belonging to a context.
+
 @node Performance model calibration
 @section Performance model calibration
 
@@ -166,7 +248,7 @@ to configure a performance model for the codelets of the application (see
 @ref{Performance model example} for instance). History-based performance models
 use on-line calibration.  StarPU will automatically calibrate codelets
 which have never been calibrated yet, and save the result in
-@code{~/.starpu/sampling/codelets}.
+@code{$STARPU_HOME/.starpu/sampling/codelets}.
 The models are indexed by machine name. To share the models between machines (e.g. for a homogeneous cluster), use @code{export STARPU_HOSTNAME=some_global_name}. To force continuing calibration, use
 @code{export STARPU_CALIBRATE=1} . This may be necessary if your application
 has not-so-stable performance. StarPU will force calibration (and thus ignore
@@ -195,7 +277,7 @@ A graph can be drawn by using the @code{starpu_perfmodel_plot}:
 
 @example
 $ starpu_perfmodel_plot -s starpu_dlu_lu_model_22
-98304 393216 1572864 
+98304 393216 1572864
 $ gnuplot starpu_starpu_dlu_lu_model_22.gp
 $ gv starpu_starpu_dlu_lu_model_22.eps
 @end example
@@ -210,6 +292,50 @@ disables data transfer / computation overlapping, and should thus not be used
 for eventual benchmarks. Note 2: history-based performance models get calibrated
 only if a performance-model-based scheduler is chosen.
 
+The history-based performance models can also be explicitly filled by the
+application without execution, if e.g. the application already has a series of
+measurements. This can be done by using @code{starpu_perfmodel_update_history},
+for instance:
+
+@cartouche
+@smallexample
+static struct starpu_perfmodel perf_model = @{
+    .type = STARPU_HISTORY_BASED,
+    .symbol = "my_perfmodel",
+@};
+
+struct starpu_codelet cl = @{
+    .where = STARPU_CUDA,
+    .cuda_funcs = @{ cuda_func1, cuda_func2, NULL @},
+    .nbuffers = 1,
+    .modes = @{STARPU_W@},
+    .model = &perf_model
+@};
+
+void feed(void) @{
+    struct my_measure *measure;
+    struct starpu_task task;
+    starpu_task_init(&task);
+
+    task.cl = &cl;
+
+    for (measure = &measures[0]; measure < measures[last]; measure++) @{
+        starpu_data_handle_t handle;
+	starpu_vector_data_register(&handle, -1, 0, measure->size, sizeof(float));
+	task.handles[0] = handle;
+	starpu_perfmodel_update_history(&perf_model, &task,
+	                                STARPU_CUDA_DEFAULT + measure->cudadev, 0,
+	                                measure->implementation, measure->time);
+	starpu_task_clean(&task);
+	starpu_data_unregister(handle);
+    @}
+@}
+@end smallexample
+@end cartouche
+
+Measurement has to be provided in milliseconds for the completion time models,
+and in Joules for the energy consumption models.
+
 @node Task distribution vs Data transfer
 @section Task distribution vs Data transfer
 
@@ -268,15 +394,42 @@ be obtained from the machine power supplier.
 The power actually consumed by the total execution can be displayed by setting
 @code{export STARPU_PROFILING=1 STARPU_WORKER_STATS=1} .
 
+On-line task consumption measurement is currently only supported through the
+@code{CL_PROFILING_POWER_CONSUMED} OpenCL extension, implemented in the MoviSim
+simulator. Applications can however provide explicit measurements by using the
+@code{starpu_perfmodel_update_history} function (exemplified in @ref{Performance
+model example} with the @code{power_model} performance model). Fine-grain
+measurement is often not feasible with the feedback provided by the hardware, so
+the user can for instance run a given task a thousand times, measure the global
+consumption for that series of tasks, divide it by a thousand, repeat for
+varying kinds of tasks and task sizes, and eventually feed StarPU
+with these manual measurements through @code{starpu_perfmodel_update_history}.
+
+@node Static scheduling
+@section Static scheduling
+
+In some cases, one may want to force some scheduling, for instance force a given
+set of tasks to GPU0, another set to GPU1, etc. while letting some other tasks
+be scheduled on any other device. This can indeed be useful to guide StarPU into
+some work distribution, while still letting some degree of dynamism. For
+instance, to force execution of a task on CUDA0:
+
+@cartouche
+@smallexample
+task->execute_on_a_specific_worker = 1;
+task->worker = starpu_worker_get_by_type(STARPU_CUDA_WORKER, 0);
+@end smallexample
+@end cartouche
+
 @node Profiling
 @section Profiling
 
-A quick view of how many tasks each worker has executed can be obtained by setting 
+A quick view of how many tasks each worker has executed can be obtained by setting
 @code{export STARPU_WORKER_STATS=1} This is a convenient way to check that
 execution did happen on accelerators without penalizing performance with
 the profiling overhead.
 
-A quick view of how much data transfers have been issued can be obtained by setting 
+A quick view of how much data transfers have been issued can be obtained by setting
 @code{export STARPU_BUS_STATS=1} .
 
 More detailed profiling information can be enabled by using @code{export STARPU_PROFILING=1} or by
@@ -315,7 +468,8 @@ detailed in the next chapter. The various informations should be checked for.
 @itemize
 @item What does the Gantt diagram look like? (see @ref{Gantt diagram})
 @itemize
-  @item If it's mostly green (running tasks), then the machine is properly
+  @item If it's mostly green (tasks running in the initial context) or a
+  context-specific color prevailing, then the machine is properly
   utilized, and perhaps the codelets are just slow. Check their performance, see
   @ref{Codelet performance}.
   @item If it's mostly purple (FetchingInput), tasks keep waiting for data
@@ -333,3 +487,108 @@ detailed in the next chapter. The various informations should be checked for.
   greedy algorithm which thus performs badly.
 @end itemize
 @end itemize
+
+You can also use the Temanejo task debugger (see @ref{Task debugger}) to
+visualize the task graph more easily.
+
+@node Simulated performance
+@section Simulated performance
+
+StarPU can use Simgrid in order to simulate execution on an arbitrary
+platform.
+
+@subsection Calibration
+
+The idea is to first compile StarPU normally, and run the application,
+so as to automatically benchmark the bus and the codelets.
+
+@smallexample
+$ ./configure && make
+$ STARPU_SCHED=dmda ./examples/matvecmult/matvecmult
+[starpu][_starpu_load_history_based_model] Warning: model matvecmult
+   is not calibrated, forcing calibration for this run. Use the
+   STARPU_CALIBRATE environment variable to control this.
+$ ...
+$ STARPU_SCHED=dmda ./examples/matvecmult/matvecmult
+TEST PASSED
+@end smallexample
+
+Note that we force to use the dmda scheduler to generate performance
+models for the application. The application may need to be run several
+times before the model is calibrated.
+
+@subsection Simulation
+
+Then, recompile StarPU, passing @code{--enable-simgrid} to @code{./configure}, and re-run the
+application:
+
+@smallexample
+$ ./configure --enable-simgrid && make
+$ STARPU_SCHED=dmda ./examples/matvecmult/matvecmult
+TEST FAILED !!!
+@end smallexample
+
+It is normal that the test fails: since the computations are not actually done
+(that is the whole point of simgrid), the result is wrong, of course.
+
+If the performance model is not calibrated enough, the following error
+message will be displayed
+
+@smallexample
+$ STARPU_SCHED=dmda ./examples/matvecmult/matvecmult
+[starpu][_starpu_load_history_based_model] Warning: model matvecmult
+    is not calibrated, forcing calibration for this run. Use the
+    STARPU_CALIBRATE environment variable to control this.
+[starpu][_starpu_simgrid_execute_job][assert failure] Codelet
+    matvecmult does not have a perfmodel, or is not calibrated enough
+@end smallexample
+
+The number of devices can be chosen as usual with @code{STARPU_NCPU},
+@code{STARPU_NCUDA}, and @code{STARPU_NOPENCL}.  For now, only the number of
+cpus can be arbitrarily chosen. The number of CUDA and OpenCL devices have to be
+lower than the real number on the current machine.
+
+The Simgrid default stack size is small; to increase it use the
+parameter @code{--cfg=contexts/stack_size}, for example:
+
+@smallexample
+$ ./example --cfg=contexts/stack_size:8192
+TEST FAILED !!!
+@end smallexample
+
+Note: of course, if the application uses @code{gettimeofday} to make its
+performance measurements, the real time will be used, which will be bogus. To
+get the simulated time, it has to use @code{starpu_timing_now} which returns the
+virtual timestamp in ms.
+
+@subsection Simulation on another machine
+
+The simgrid support even permits to perform simulations on another machine, your
+desktop, typically. To achieve this, one still needs to perform the Calibration
+step on the actual machine to be simulated, then copy them to your desktop
+machine (the @code{$STARPU_HOME/.starpu} directory). One can then perform the
+Simulation step on the desktop machine, by setting the @code{STARPU_HOSTNAME}
+environment variable to the name of the actual machine, to make StarPU use the
+performance models of the simulated machine even on the desktop machine.
+
+If the desktop machine does not have CUDA or OpenCL, StarPU is still able to
+use simgrid to simulate execution with CUDA/OpenCL devices, but the application
+source code will probably disable the CUDA and OpenCL codelets in that
+case. Since during simgrid execution, the functions of the codelet are actually
+not called, one can use dummy functions such as the following to still permit
+CUDA or OpenCL execution:
+
+@smallexample
+static struct starpu_codelet cl11 =
+@{
+	.cpu_funcs = @{chol_cpu_codelet_update_u11, NULL@},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = @{chol_cublas_codelet_update_u11, NULL@},
+#elif defined(STARPU_SIMGRID)
+	.cuda_funcs = @{(void*)1, NULL@},
+#endif
+	.nbuffers = 1,
+	.modes = @{STARPU_RW@},
+	.model = &chol_model_11
+@};
+@end smallexample

+ 6 - 7
doc/chapters/scaling-vector-example.texi

@@ -7,10 +7,10 @@
 @c See the file starpu.texi for copying conditions.
 
 @menu
-* Main application::            
-* CPU Kernel::                 
-* CUDA Kernel::                
-* OpenCL Kernel::              
+* Main application::
+* CPU Kernel::
+* CUDA Kernel::
+* OpenCL Kernel::
 @end menu
 
 @node Main application
@@ -32,8 +32,8 @@
 @section OpenCL Kernel
 
 @menu
-* Invoking the kernel::         
-* Source of the kernel::        
+* Invoking the kernel::
+* Source of the kernel::
 @end menu
 
 @node Invoking the kernel
@@ -45,4 +45,3 @@
 @subsection Source of the kernel
 
 @include chapters/vector_scal_opencl_codelet.texi
-

+ 394 - 0
doc/chapters/sched_ctx_hypervisor.texi

@@ -0,0 +1,394 @@
+@c -*-texinfo-*-
+
+@c This file is part of the StarPU Handbook.
+@c Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
+@c See the file starpu.texi for copying conditions.
+
+@cindex Scheduling Context Hypervisor
+
+StarPU proposes a platform for constructing Scheduling Contexts, for deleting and modifying them dynamically.
+A parallel kernel, can thus be isolated into a scheduling context and interferences between several parallel kernels are avoided.
+If the user knows exactly how many workers each scheduling context needs, he can assign them to the contexts at their creation time or modify them during the execution of the program.
+
+The Scheduling Context Hypervisor Plugin is available for users whose applications do not exhibit regular parallelism, who cannot know in advance the exact size of the context, and who need to resize the contexts according to the behavior of the parallel kernel.
+The Hypervisor receives information from StarPU concerning the execution of the tasks, the efficiency of the resources, etc. and it decides accordingly when and how the contexts can be resized.
+Basic strategies of resizing scheduling contexts already exist but a platform for implementing additional custom ones is available.
+
+@menu
+* Managing the hypervisor::				Initialize the hypervisor
+* Registering Scheduling Contexts to the hypervisor:: 	Contexts have to register to the hypervisor
+* The user's input in the resizing process:: 		The user can help the hypervisor decide how to resize
+* Resizing strategies::					Several resizing strategies are proposed
+* Performance Counters::              			StarPU provides information to the Hypervisor through performance counters
+* Defining a new hypervisor policy::      		New Policies can be implemented
+@end menu
+
+@node Managing the hypervisor
+@section Managing the hypervisor
+There is a single hypervisor that is in charge of resizing contexts and the resizing strategy is chosen at the initialization of the hypervisor. A single resize can be done at a time.
+
+@deftypefun {struct starpu_performance_counters *} sched_ctx_hypervisor_init ({struct sched_ctx_hypervisor_policy *} @var{policy})
+Initializes the hypervisor to use the strategy provided as parameter and creates the performance counters (see @pxref{Performance Counters}).
+These performance counters represent actually some callbacks that will be used by the contexts to notify the information needed by the hypervisor.
+@end deftypefun
+
+Note: The Hypervisor is actually a worker that takes this role once certain conditions trigger the resizing process (there is no additional thread assigned to the hypervisor).
+
+@deftypefun void sched_ctx_hypervisor_shutdown (void)
+The hypervisor and all its allocated information are freed. There is no synchronization between this function and starpu_shutdown(); thus, it should be called after starpu_shutdown(),
+because the performance counters still need their allocated callback functions until then.
+@end deftypefun
+
+@node Registering Scheduling Contexts to the hypervisor
+@section Registering Scheduling Contexts to the hypervisor
+Scheduling Contexts that have to be resized by the hypervisor must be first registered to the hypervisor. Whenever we want to exclude contexts from the resizing process we have to unregister them from the hypervisor.
+
+@deftypefun void sched_ctx_hypervisor_register_ctx (unsigned @var{sched_ctx}, double @var{total_flops})
+Register the context to the hypervisor, and indicate the number of flops the context will execute (needed for Gflops rate based strategy @pxref{Resizing strategies} or any other custom strategy needing it, for the others we can pass 0.0)
+@end deftypefun
+
+@deftypefun void sched_ctx_hypervisor_unregister_ctx (unsigned @var{sched_ctx})
+Unregister the context from the hypervisor
+@end deftypefun
+
+@node The user's input in the resizing process
+@section The user's input in the resizing process
+The user can totally forbid the resizing of a certain context or can then change his mind and allow it (in this case the resizing is managed by the hypervisor, that can forbid it or allow it)
+
+@deftypefun void sched_ctx_hypervisor_stop_resize (unsigned @var{sched_ctx})
+Forbid resizing of a context
+@end deftypefun
+
+@deftypefun void sched_ctx_hypervisor_start_resize (unsigned @var{sched_ctx})
+Allow resizing of a context
+@end deftypefun
+
+The user can then provide information to the hypervisor concerning the conditions of resizing.
+
+@deftypefun void sched_ctx_hypervisor_ioctl (unsigned @var{sched_ctx}, ...)
+Inputs conditions to the context @code{sched_ctx} with the following arguments.  The argument list must be zero-terminated.
+
+@defmac HYPERVISOR_MAX_IDLE
+This macro is used when calling sched_ctx_hypervisor_ioctl and must be followed by 3 arguments:
+an array of int for the workerids to apply the condition, an int to indicate the size of the array, and a double value indicating
+the maximum idle time allowed for a worker before the resizing process should be triggered
+@end defmac
+
+@defmac HYPERVISOR_PRIORITY
+This macro is used when calling sched_ctx_hypervisor_ioctl and must be followed by 3 arguments:
+an array of int for the workerids to apply the condition, an int to indicate the size of the array, and an int value indicating
+the priority of the workers previously mentioned.
+The workers with the smallest priority are moved the first.
+@end defmac
+
+@defmac HYPERVISOR_MIN_WORKERS
+This macro is used when calling sched_ctx_hypervisor_ioctl and must be followed by 1 argument(int) indicating
+the minimum number of workers a context should have, underneath this limit the context cannot execute.
+@end defmac
+
+@defmac HYPERVISOR_MAX_WORKERS
+This macro is used when calling sched_ctx_hypervisor_ioctl and must be followed by 1 argument(int) indicating
+the maximum number of workers a context should have, above this limit the context would not be able to scale
+@end defmac
+
+@defmac HYPERVISOR_GRANULARITY
+This macro is used when calling sched_ctx_hypervisor_ioctl and must be followed by 1 argument(int) indicating
+the granularity of the resizing process (the number of workers should be moved from the context once it is resized)
+This parameter is ignored for the Gflops rate based strategy @pxref{Resizing strategies}, where the number of workers that have to be moved is calculated by the strategy.
+@end defmac
+
+@defmac HYPERVISOR_FIXED_WORKERS
+This macro is used when calling sched_ctx_hypervisor_ioctl and must be followed by 2 arguments:
+an array of int for the workerids to apply the condition and an int to indicate the size of the array.
+These workers are not allowed to be moved from the context.
+@end defmac
+
+@defmac HYPERVISOR_MIN_TASKS
+This macro is used when calling sched_ctx_hypervisor_ioctl and must be followed by 1 argument (int)
+that indicates the minimum number of tasks that have to be executed before the context can be resized.
+This parameter is ignored for the Application Driven strategy @pxref{Resizing strategies}, where the user indicates exactly when the resize should be done.
+@end defmac
+
+@defmac HYPERVISOR_NEW_WORKERS_MAX_IDLE
+This macro is used when calling sched_ctx_hypervisor_ioctl and must be followed by 1 argument, a double value indicating
+the maximum idle time allowed for workers that have just been moved from other contexts in the current context.
+@end defmac
+
+@defmac HYPERVISOR_TIME_TO_APPLY
+This macro is used when calling sched_ctx_hypervisor_ioctl and must be followed by 1 argument (int) indicating the tag
+an executed task should have such that this configuration should be taken into account.
+@end defmac
+@end deftypefun
+@node Resizing strategies
+@section Resizing strategies
+
+The plugin proposes several strategies for resizing the scheduling context.
+
+The @b{Application driven} strategy uses the user's input concerning the moment when he wants to resize the contexts.
+Thus, the user tags the task that should trigger the resizing process. This can be done either by directly setting the
+@code{hypervisor_tag} field of the @code{starpu_task} data structure, or by using the macro @code{STARPU_HYPERVISOR_TAG} in the @code{starpu_insert_task} function.
+
+@cartouche
+@smallexample
+task.hypervisor_tag = 2;
+@end smallexample
+@end cartouche
+
+or
+
+@cartouche
+@smallexample
+starpu_insert_task(&codelet,
+		    ...,
+		    STARPU_HYPERVISOR_TAG, 2,
+                    0);
+@end smallexample
+@end cartouche
+
+Then the user has to indicate that when a task with the specified tag is executed the contexts should resize.
+
+@cartouche
+@smallexample
+sched_ctx_hypervisor_resize(sched_ctx, 2);
+@end smallexample
+@end cartouche
+
+The user can use the same tag to change the resizing configuration of the contexts if he considers it necessary.
+@cartouche
+@smallexample
+sched_ctx_hypervisor_ioctl(sched_ctx,
+                    HYPERVISOR_MIN_WORKERS, 6,
+                    HYPERVISOR_MAX_WORKERS, 12,
+                    HYPERVISOR_TIME_TO_APPLY, 2,
+                    NULL);
+@end smallexample
+@end cartouche
+
+
+The @b{Idleness} based strategy resizes the scheduling contexts every time one of their workers stays idle
+for a period longer than the one imposed by the user (see @pxref{The user's input in the resizing process})
+
+@cartouche
+@smallexample
+int workerids[3] = @{1, 3, 10@};
+int workerids2[9] = @{0, 2, 4, 5, 6, 7, 8, 9, 11@};
+sched_ctx_hypervisor_ioctl(sched_ctx_id,
+            HYPERVISOR_MAX_IDLE, workerids, 3, 10000.0,
+            HYPERVISOR_MAX_IDLE, workerids2, 9, 50000.0,
+            NULL);
+@end smallexample
+@end cartouche
+
+The @b{Gflops rate} based strategy resizes the scheduling contexts such that they all finish at the same time.
+The velocity of each of them is considered and once one of them is significantly slower the resizing process is triggered.
+In order to do these computations the user has to input the total number of instructions needed to be executed by the
+parallel kernels and the number of instructions to be executed by each task.
+The number of flops to be executed by a context are passed as parameter when they are registered to the hypervisor,
+ (@code{sched_ctx_hypervisor_register_ctx(sched_ctx_id, flops)}) and the one to be executed by each task are passed when the task is submitted.
+The corresponding field in the @code{starpu_task} data structure is @code{flops} and
+the corresponding macro in @code{starpu_insert_task} function is @code{STARPU_FLOPS}. When the task is executed
+the resizing process is triggered.
+@cartouche
+@smallexample
+task.flops = 100;
+@end smallexample
+@end cartouche
+
+or
+
+@cartouche
+@smallexample
+starpu_insert_task(&codelet,
+                    ...,
+                    STARPU_FLOPS, 100,
+                    0);
+@end smallexample
+@end cartouche
+
+@node Performance Counters
+@section Performance Counters
+
+The Scheduling Context Hypervisor Plugin provides a series of performance counters to StarPU. By incrementing them, StarPU can help the hypervisor in the resizing decision making process.
+
+@deftp {Data Type} {struct starpu_performance_counters}
+@anchor{struct starpu_performance_counters}
+
+@table @asis
+@item @code{void (*notify_idle_cycle)(unsigned sched_ctx_id, int worker, double idle_time)}
+Informs the hypervisor for how long a worker has been idle in the specified context
+@item @code{void (*notify_idle_end)(unsigned sched_ctx_id, int worker)}
+Informs the hypervisor that after a period of idle, the worker has just executed a task in the specified context.
+The idle counter is thus reset.
+@item @code{void (*notify_pushed_task)(unsigned sched_ctx_id, int worker)}
+Notifies the hypervisor a task has been scheduled on the queue of the worker corresponding to the specified context
+@item @code{void (*notify_poped_task)(unsigned sched_ctx_id, int worker, double flops)}
+Informs the hypervisor a task executing a specified number of instructions has been poped from the worker
+@item @code{void (*notify_post_exec_hook)(unsigned sched_ctx_id, int taskid)}
+Notifies the hypervisor a task has just been executed
+
+@end table
+@end deftp
+
+TODO maybe they should be hidden to the user
+
+@node Defining a new hypervisor policy
+@section Defining a new hypervisor policy
+
+@menu
+* Hypervisor Policy API:: Hypervisor Policy API
+* Hypervisor example::
+@end menu
+
+@node Hypervisor Policy API
+@subsection Hypervisor Policy API
+
+While Scheduling Context Hypervisor Plugin comes with a variety of resizing policies (@pxref{Resizing strategies}),
+it may sometimes be desirable to implement custom
+policies to address specific problems.  The API described below allows
+users to write their own resizing policy.
+
+@deftp {Data Type} {struct sched_ctx_hypervisor_policy}
+This structure contains all the methods that implement a hypervisor resizing policy.
+
+@table @asis
+@item @code{const char* name}
+Indicates the name of the policy, if there is not a custom policy, the policy corresponding to this name will be used by the hypervisor
+@item @code{unsigned custom}
+Indicates whether the policy is custom or not
+@item @code{void (*handle_idle_cycle)(unsigned sched_ctx_id, int worker)}
+It is called whenever the indicated worker executes another idle cycle in @code{sched_ctx}
+@item @code{void (*handle_pushed_task)(unsigned sched_ctx_id, int worker)}
+It is called whenever a task is pushed on the worker's queue corresponding to the context @code{sched_ctx}
+@item @code{void (*handle_poped_task)(unsigned sched_ctx_id, int worker)}
+It is called whenever a task is poped from the worker's queue corresponding to the context @code{sched_ctx}
+@item @code{void (*handle_idle_end)(unsigned sched_ctx_id, int worker)}
+It is called whenever a task is executed on the indicated worker and context after a long period of idle time
+@item @code{void (*handle_post_exec_hook)(unsigned sched_ctx_id, struct starpu_htbl32_node* resize_requests, int task_tag)}
+It is called whenever a tag task has just been executed. The table of resize requests is provided as well as the tag
+@end table
+@end deftp
+
+The Hypervisor provides also a structure with configuration information of each context, which can be used to construct new resize strategies.
+
+@deftp {Data Type} {struct sched_ctx_hypervisor_policy_config }
+This structure contains all configuration information of a context
+
+@table @asis
+@item @code{int min_nworkers}
+Indicates the minimum number of workers needed by the context
+@item @code{int max_nworkers}
+Indicates the maximum number of workers needed by the context
+@item @code{int granularity}
+Indicates the workers granularity of the context
+@item @code{int priority[STARPU_NMAXWORKERS]}
+Indicates the priority of each worker in the context
+@item @code{double max_idle[STARPU_NMAXWORKERS]}
+Indicates the maximum idle time accepted before a resize is triggered
+@item @code{int fixed_workers[STARPU_NMAXWORKERS]}
+Indicates which workers can be moved and which ones are fixed
+@item @code{double new_workers_max_idle}
+Indicates the maximum idle time accepted before a resize is triggered for the workers that just arrived in the new context
+@end table
+@end deftp
+
+Additionally, the hypervisor provides a structure with information obtained from StarPU by means of the performance counters
+
+
+@deftp {Data Type} {struct sched_ctx_hypervisor_wrapper}
+This structure is a wrapper of the contexts available in StarPU
+and contains all information about a context obtained by incrementing the performance counters
+
+@table @asis
+@item @code{unsigned sched_ctx}
+The context wrapped
+@item @code{struct sched_ctx_hypervisor_policy_config *config}
+The corresponding resize configuration
+@item @code{double current_idle_time[STARPU_NMAXWORKERS]}
+The idle time counter of each worker of the context
+@item @code{int pushed_tasks[STARPU_NMAXWORKERS]}
+The number of pushed tasks of each worker of the context
+@item @code{int poped_tasks[STARPU_NMAXWORKERS]}
+The number of poped tasks of each worker of the context
+@item @code{double total_flops}
+The total number of flops to execute by the context
+@item @code{double total_elapsed_flops[STARPU_NMAXWORKERS]}
+The number of flops executed by each workers of the context
+@item @code{double elapsed_flops[STARPU_NMAXWORKERS]}
+The number of flops executed by each worker of the context from last resize
+@item @code{double remaining_flops}
+The number of flops that still have to be executed by the workers in the context
+@item @code{double start_time}
+The time when the context started executing
+@item @code{struct sched_ctx_hypervisor_resize_ack resize_ack}
+The structure confirming the last resize finished and a new one can be done
+@end table
+@end deftp
+
+@deftp {Data Type} {struct sched_ctx_hypervisor_resize_ack}
+This structure checks whether the workers moved to another context are actually taken into account in that context
+@table @asis
+@item @code{int receiver_sched_ctx}
+The context receiving the new workers
+@item @code{int *moved_workers}
+The workers moved to the receiver context
+@item @code{int nmoved_workers}
+The number of workers moved
+@item @code{int *acked_workers}
+If the value corresponding to a worker is 1, that worker is already taken into account in the new context; if 0, not yet
+@end table
+@end deftp
+
+The following functions can be used in the resizing strategies.
+
+@deftypefun void sched_ctx_hypervisor_move_workers (unsigned @var{sender_sched_ctx}, unsigned @var{receiver_sched_ctx}, {int *}@var{workers_to_move}, unsigned @var{nworkers_to_move}, unsigned @var{now});
+Moves workers from one context to another
+@end deftypefun
+
+@deftypefun {struct sched_ctx_hypervisor_policy_config *} sched_ctx_hypervisor_get_config (unsigned @var{sched_ctx});
+Returns the configuration structure of a context
+@end deftypefun
+
+@deftypefun {int *} sched_ctx_hypervisor_get_sched_ctxs ();
+Gets the contexts managed by the hypervisor
+@end deftypefun
+
+@deftypefun int sched_ctx_hypervisor_get_nsched_ctxs ();
+Gets the number of contexts managed by the hypervisor
+@end deftypefun
+
+@deftypefun {struct sched_ctx_hypervisor_wrapper *} sched_ctx_hypervisor_get_wrapper (unsigned @var{sched_ctx});
+Returns the wrapper corresponding to the context @code{sched_ctx}
+@end deftypefun
+
+@deftypefun double sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx ({struct sched_ctx_hypervisor_wrapper *} @var{sc_w});
+Returns the flops of a context elapsed from the last resize
+@end deftypefun
+
+@deftypefun {char *} sched_ctx_hypervisor_get_policy ();
+Returns the name of the resizing policy the hypervisor uses
+@end deftypefun
+
+@node Hypervisor example
+@subsection Hypervisor example
+
+@cartouche
+@smallexample
+
+struct sched_ctx_hypervisor_policy dummy_policy =
+@{
+       .handle_poped_task = dummy_handle_poped_task,
+       .handle_pushed_task = dummy_handle_pushed_task,
+       .handle_idle_cycle = dummy_handle_idle_cycle,
+       .handle_idle_end = dummy_handle_idle_end,
+       .handle_post_exec_hook = dummy_handle_post_exec_hook,
+       .custom = 1,
+       .name = "dummy"
+@};
+
+@end smallexample
+@end cartouche
+
+@c Local Variables:
+@c TeX-master: "../starpu.texi"
+@c ispell-local-dictionary: "american"
+@c End:

+ 7 - 15
doc/chapters/socl.texi

@@ -2,24 +2,16 @@
 
 @c This file is part of the StarPU Handbook.
 @c Copyright (C) 2012  Centre National de la Recherche Scientifique
+@c Copyright (C) 2012  Univ. of Bordeaux
 @c See the file starpu.texi for copying conditions.
 
-SOCL is an extension that aims at implementing the OpenCL standard on
-top of StarPU. It allows to gives a (relatively) clean and
-standardized API to StarPU.
-By allowing OpenCL applications to use StarPU transparently, it
-provides users with the latest StarPU enhancements without any further
-development, and allows these OpenCL applications to easily fall back
-to another OpenCL implementation.
+SOCL is an OpenCL implementation based on StarPU. It gives a unified access to
+every available OpenCL device: applications can now share entities such as
+Events, Contexts or Command Queues between several OpenCL implementations.
 
-This section does not require detailed knowledge of the StarPU
-library.
+In addition, command queues that are created without specifying a device provide
+automatic scheduling of the submitted commands on OpenCL devices contained in
+the context to which the command queue is attached.
 
 Note: as of StarPU @value{VERSION}, this is still an area under
 development and subject to change.
-
-TODO
-
-
-
-

+ 36 - 15
doc/chapters/using.texi

@@ -2,20 +2,22 @@
 
 @c This file is part of the StarPU Handbook.
 @c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
-@c Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+@c Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
 @c Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
 @c See the file starpu.texi for copying conditions.
 
 @menu
-* Setting flags for compiling and linking applications::  
-* Running a basic StarPU application::  
+* Setting flags for compiling::
+* Running a basic StarPU application::
 * Kernel threads started by StarPU::
 * Enabling OpenCL::
 @end menu
 
-@node Setting flags for compiling and linking applications
-@section Setting flags for compiling and linking applications
+@node Setting flags for compiling
+@section Setting flags for compiling, linking and running applications
 
+StarPU provides a pkg-config executable to obtain relevant compiler
+and linker flags.
 Compiling and linking an application against StarPU may require to use
 specific flags or libraries (for instance @code{CUDA} or @code{libspe2}).
 To this end, it is possible to use the @code{pkg-config} tool.
@@ -26,7 +28,7 @@ that @code{pkg-config} can find it. For example if StarPU was installed in
 @code{$prefix_dir}:
 
 @example
-% PKG_CONFIG_PATH=$PKG_CONFIG_PATH:$prefix_dir/lib/pkgconfig
+$ PKG_CONFIG_PATH=$PKG_CONFIG_PATH:$prefix_dir/lib/pkgconfig
 @end example
 
 The flags required to compile or link against StarPU are then
@@ -36,13 +38,34 @@ with the @code{libstarpu} package. Similar packages are provided for
 @code{libstarpumpi} and @code{libstarpufft}.}:
 
 @example
-% pkg-config --cflags starpu-1.0  # options for the compiler
-% pkg-config --libs starpu-1.0    # options for the linker
+$ pkg-config --cflags starpu-1.0  # options for the compiler
+$ pkg-config --libs starpu-1.0    # options for the linker
 @end example
 
+Make sure that @code{pkg-config --libs starpu-1.0} actually produces some output
+before going further: @code{PKG_CONFIG_PATH} has to point to the place where
+@code{starpu-1.0.pc} was installed during @code{make install}.
+
 Also pass the @code{--static} option if the application is to be
 linked statically.
 
+It is also necessary to set the variable @code{LD_LIBRARY_PATH} to
+locate dynamic libraries at runtime.
+
+@example
+$ LD_LIBRARY_PATH=$prefix_dir/lib:$LD_LIBRARY_PATH
+@end example
+
+When using a Makefile, the following lines can be added to set the
+options for the compiler and the linker:
+
+@cartouche
+@example
+CFLAGS          +=      $$(pkg-config --cflags starpu-1.0)
+LDFLAGS         +=      $$(pkg-config --libs starpu-1.0)
+@end example
+@end cartouche
+
 @node Running a basic StarPU application
 @section Running a basic StarPU application
 
@@ -52,15 +75,14 @@ Basic examples using StarPU are built in the directory
 @code{vector_scal}.
 
 @example
-% ./examples/basic_examples/vector_scal
+$ ./examples/basic_examples/vector_scal
 BEFORE: First element was 1.000000
 AFTER: First element is 3.140000
-%
 @end example
 
 When StarPU is used for the first time, the directory
 @code{$STARPU_HOME/.starpu/} is created, performance models will be stored in
-that directory (@code{STARPU_HOME} defaults to @code{$HOME})
+that directory (@pxref{STARPU_HOME}).
 
 Please note that buses are benchmarked when StarPU is launched for the
 first time. This may take a few minutes, or less if @code{hwloc} is
@@ -93,13 +115,13 @@ between them.
 To enable OpenCL, you need either to disable CUDA when configuring StarPU:
 
 @example
-% ./configure --disable-cuda
+$ ./configure --disable-cuda
 @end example
 
 or when running applications:
 
 @example
-% STARPU_NCUDA=0 ./application
+$ STARPU_NCUDA=0 ./application
 @end example
 
 OpenCL will automatically be started on any device not yet used by
@@ -108,6 +130,5 @@ enable CUDA on 2 devices, and OpenCL on the 2 other devices by doing
 so:
 
 @example
-% STARPU_NCUDA=2 ./application
+$ STARPU_NCUDA=2 ./application
 @end example
-

+ 0 - 1
doc/chapters/vector_scal_c.texi

@@ -14,7 +14,6 @@
  *  3- how a kernel can manipulate the data (buffers[0].vector.ptr)
  */
 #include <starpu.h>
-#include <starpu_opencl.h>
 
 #define    NX    2048
 

+ 1 - 1
doc/chapters/vector_scal_cpu.texi

@@ -51,7 +51,7 @@ void scal_sse_func(void *buffers[], void *cl_arg)
     float factor = *(float *) cl_arg;
     FACTOR = _mm_set1_ps(factor);
 
-    unsigned int i;	
+    unsigned int i;
     for (i = 0; i < n_iterations; i++)
         VECTOR[i] = _mm_mul_ps(FACTOR, VECTOR[i]);
 

+ 4 - 4
doc/chapters/vector_scal_cuda.texi

@@ -1,13 +1,12 @@
 @c -*-texinfo-*-
 
 @c This file is part of the StarPU Handbook.
-@c Copyright (C) 2009-2011  Université de Bordeaux 1
-@c Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+@c Copyright (C) 2009-2012  Université de Bordeaux 1
+@c Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
 @c See the file starpu.texi for copying conditions.
 
 @smallexample
 #include <starpu.h>
-#include <starpu_cuda.h>
 
 static __global__ void vector_mult_cuda(float *val, unsigned n,
                                         float factor)
@@ -28,7 +27,8 @@ extern "C" void scal_cuda_func(void *buffers[], void *_args)
         unsigned threads_per_block = 64;
         unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
-        vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>(val, n, *factor);
+        vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>
+	                (val, n, *factor);
 
         cudaStreamSynchronize(starpu_cuda_get_local_stream());
 @}

+ 1 - 2
doc/chapters/vector_scal_opencl.texi

@@ -2,12 +2,11 @@
 
 @c This file is part of the StarPU Handbook.
 @c Copyright (C) 2009-2011  Université de Bordeaux 1
-@c Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+@c Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
 @c See the file starpu.texi for copying conditions.
 
 @smallexample
 #include <starpu.h>
-#include <starpu_opencl.h>
 
 extern struct starpu_opencl_program programs;
 

+ 20 - 3
doc/starpu.css

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009-2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * Permission is granted to copy, distribute and/or modify this document
  * under the terms of the GNU Free Documentation License, Version 1.3
@@ -11,7 +11,7 @@
  */
 
 body {
-	font-size: 13px;
+	font-family: sans-serif;
 /*	margin-top: 0px; */
 }
 
@@ -39,7 +39,7 @@ div.node hr.node {
 h1 {
 	font: bold normal 2.5em sans-serif ;
 	margin: 0px;
-	color: #0020a0;
+	color: rgb(226,0,38);
 }
 h1.sub {
 	font: bold normal 2em sans-serif ;
@@ -141,3 +141,20 @@ p.updated {
 	font-size: 10px;
 	font-style: italic;
 }
+
+div.contents {
+	margin-top: 12px;
+	margin-bottom: 3px;
+	font-variant: small-caps;
+	padding-left: 1em;
+	padding-right: 1em;
+	padding-top: 1px;
+	padding-bottom: 1px;
+	margin-top: 12px;
+	margin-bottom: 12px;
+	margin-top:0px;
+	margin-left: auto;
+	margin-right: auto;
+	border-top: 4px solid rgb(204,209,222);
+	border-bottom: 4px solid rgb(204,209,222);
+}

+ 47 - 20
doc/starpu.texi

@@ -8,10 +8,10 @@
 @include chapters/version.texi
 
 @copying
-Copyright @copyright{} 2009--2011  Universit@'e de Bordeaux 1
+Copyright @copyright{} 2009--2013  Universit@'e de Bordeaux 1
 
 @noindent
-Copyright @copyright{} 2010, 2011, 2012  Centre National de la Recherche Scientifique
+Copyright @copyright{} 2010--2013  Centre National de la Recherche Scientifique
 
 @noindent
 Copyright @copyright{} 2011, 2012 Institut National de Recherche en Informatique et Automatique
@@ -47,8 +47,10 @@ Free Documentation License''.
 @contents
 @page
 
+@ifnottex
 @node Top
-@top Preface
+@top StarPU Handbook
+@end ifnottex
 
 This manual documents the usage of StarPU version @value{VERSION}.  It
 was last updated on @value{UPDATED}.
@@ -63,27 +65,30 @@ was last updated on @value{UPDATED}.
 @comment  better formatting.
 @comment
 @menu
-* Introduction::                Getting started
-* Installing StarPU::           How to configure, build and install StarPU
-* Using StarPU::                How to run StarPU application
-* Basic Examples::              Basic examples of the use of StarPU
-* Advanced Examples::           Advanced examples of the use of StarPU
-* Performance optimization::    How to optimize performance with StarPU
-* Performance feedback::        Performance debugging tools
-* Tips and Tricks::             Tips and tricks to know about
-* StarPU MPI support::          How to combine StarPU with MPI
-* StarPU FFT support::          How to perform FFT computations with StarPU
-* C Extensions::                Easier StarPU programming with GCC
-* SOCL OpenCL Extensions::      How to use OpenCL on top of StarPU
-* StarPU Basic API::            The Basic API to use StarPU
-* StarPU Advanced API::         Advanced use of StarPU
-* Configuring StarPU::          How to configure StarPU
-* Full source code for the 'Scaling a Vector' example::  
+* Introduction::                	Getting started
+* Installing StarPU::           	How to configure, build and install StarPU
+* Using StarPU::                	How to run StarPU application
+* Basic Examples::              	Basic examples of the use of StarPU
+* Advanced Examples::           	Advanced examples of the use of StarPU
+* Benchmarks::                  	Benchmarks worth running
+* Performance optimization::    	How to optimize performance with StarPU
+* Performance feedback::        	Performance debugging tools
+* Tips and Tricks::             	Tips and tricks to know about
+* StarPU MPI support::          	How to combine StarPU with MPI
+* StarPU FFT support::          	How to perform FFT computations with StarPU
+* C Extensions::                	Easier StarPU programming with GCC
+* SOCL OpenCL Extensions::      	How to use OpenCL on top of StarPU
+* Scheduling Context Hypervisor:: 	How to use Scheduling Context Hypervisor with StarPU
+* StarPU Basic API::            	The Basic API to use StarPU
+* StarPU Advanced API::         	Advanced use of StarPU
+* Configuring StarPU::          	How to configure StarPU
+* Full source code for the 'Scaling a Vector' example::
 * GNU Free Documentation License::  How you can copy and share this manual.
 
 * Concept Index::               Index of programming concepts.
 * Function Index::              Index of C functions.
-* Datatype Index::              Index of C datatypes
+* Datatype Index::              Index of C datatypes.
+* Configuration Index::         Index of configuration options.
 @end menu
 
 @c ---------------------------------------------------------------------
@@ -127,6 +132,14 @@ was last updated on @value{UPDATED}.
 @include chapters/advanced-examples.texi
 
 @c ---------------------------------------------------------------------
+@c Benchmarks
+@c ---------------------------------------------------------------------
+
+@node Benchmarks
+@chapter Benchmarks
+@include chapters/benchmarks.texi
+
+@c ---------------------------------------------------------------------
 @c Performance options
 @c ---------------------------------------------------------------------
 
@@ -183,6 +196,14 @@ was last updated on @value{UPDATED}.
 @include chapters/socl.texi
 
 @c ---------------------------------------------------------------------
+@c Scheduling Context Hypervisor
+@c ---------------------------------------------------------------------
+
+@node Scheduling Context Hypervisor
+@chapter Scheduling Context Hypervisor
+@include chapters/sched_ctx_hypervisor.texi
+
+@c ---------------------------------------------------------------------
 @c StarPU API
 @c ---------------------------------------------------------------------
 
@@ -230,6 +251,8 @@ was last updated on @value{UPDATED}.
 @c Indices
 @c ---------------------------------------------------------------------
 
+@c comment it out for now, it is too small to be kept for now. See how
+@c it can be merged with the glossary section in the introduction
 @node Concept Index
 @unnumbered Concept Index
 @printindex cp
@@ -242,4 +265,8 @@ was last updated on @value{UPDATED}.
 @unnumbered Datatype Index
 @printindex tp
 
+@node Configuration Index
+@unnumbered Configuration Index
+@printindex vr
+
 @bye

+ 10 - 25
doc/tutorial/Makefile

@@ -1,32 +1,18 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 # Copyright (C) 2009-2011  Université de Bordeaux 1
-# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+# Copyright (C) 2010-2013  Centre National de la Recherche Scientifique
 #
-# Redistribution  and  use  in  source and binary forms, with or without
-# modification,  are  permitted  provided  that the following conditions
-# are met:
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
 #
-# * Redistributions  of  source  code  must  retain  the above copyright
-#   notice,  this  list  of  conditions  and  the  following  disclaimer.
-# * Redistributions  in  binary  form must reproduce the above copyright
-#   notice,  this list of conditions and the following disclaimer in the
-#   documentation  and/or other materials provided with the distribution.
-# * The name of the author may not be used to endorse or promote products
-#   derived from this software without specific prior written permission.
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 #
-# THIS  SOFTWARE  IS  PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-# ``AS IS''  AND  ANY  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-# LIMITED  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-# A  PARTICULAR  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-# HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL
-# SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT NOT
-# LIMITED  TO,  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE
-# DATA,  OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-# THEORY  OF  LIABILITY,  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF  THIS  SOFTWARE,  EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
 CFLAGS          +=      $$(pkg-config --cflags libstarpu-1.0)
 LDFLAGS         +=      $$(pkg-config --libs libstarpu-1.0)
@@ -40,7 +26,7 @@ HAS_OPENCL	=	$(shell pkg-config --libs libstarpu-1.0 |grep -i opencl)
 
 all: hello_world vector_scal
 
-VECTOR_SCAL_PREREQUISITES	=	vector_scal.o vector_scal_cpu.o 
+VECTOR_SCAL_PREREQUISITES	=	vector_scal.o vector_scal_cpu.o
 ifneq ($(strip $(HAS_CUDA)),)
 VECTOR_SCAL_PREREQUISITES	+=	vector_scal_cuda.o
 VECTOR_SCAL_COMPILER		=	$(NVCC)
@@ -56,4 +42,3 @@ vector_scal: $(VECTOR_SCAL_PREREQUISITES)
 
 clean:
 	rm -f hello_world vector_scal *.o
-

+ 12 - 24
doc/tutorial/hello_world.c

@@ -1,36 +1,24 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
- * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2011  Centre National de la Recherche Scientifique
  *
- * Redistribution  and  use  in  source and binary forms, with or without
- * modification,  are  permitted  provided  that the following conditions
- * are met:
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
  *
- * * Redistributions  of  source  code  must  retain  the above copyright
- *   notice,  this  list  of  conditions  and  the  following  disclaimer.
- * * Redistributions  in  binary  form must reproduce the above copyright
- *   notice,  this list of conditions and the following disclaimer in the
- *   documentation  and/or other materials provided with the distribution.
- * * The name of the author may not be used to endorse or promote products
- *   derived from this software without specific prior written permission.
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  *
- * THIS  SOFTWARE  IS  PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * ``AS IS''  AND  ANY  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A  PARTICULAR  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL
- * SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT NOT
- * LIMITED  TO,  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE
- * DATA,  OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY  OF  LIABILITY,  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF  THIS  SOFTWARE,  EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
 #include <starpu.h>
 
-struct params {
+struct params
+{
     int i;
     float f;
 };

+ 9 - 23
doc/tutorial/vector_scal.c

@@ -1,31 +1,18 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
- * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
  *
- * Redistribution  and  use  in  source and binary forms, with or without
- * modification,  are  permitted  provided  that the following conditions
- * are met:
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
  *
- * * Redistributions  of  source  code  must  retain  the above copyright
- *   notice,  this  list  of  conditions  and  the  following  disclaimer.
- * * Redistributions  in  binary  form must reproduce the above copyright
- *   notice,  this list of conditions and the following disclaimer in the
- *   documentation  and/or other materials provided with the distribution.
- * * The name of the author may not be used to endorse or promote products
- *   derived from this software without specific prior written permission.
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  *
- * THIS  SOFTWARE  IS  PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * ``AS IS''  AND  ANY  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A  PARTICULAR  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL
- * SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT NOT
- * LIMITED  TO,  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE
- * DATA,  OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY  OF  LIABILITY,  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF  THIS  SOFTWARE,  EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
 /*
@@ -36,7 +23,6 @@
  *  3- how a kernel can manipulate the data (buffers[0].vector.ptr)
  */
 #include <starpu.h>
-#include <starpu_opencl.h>
 
 #define    NX    2048
 

+ 9 - 22
doc/tutorial/vector_scal_cpu.c

@@ -1,31 +1,18 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
- * Redistribution  and  use  in  source and binary forms, with or without
- * modification,  are  permitted  provided  that the following conditions
- * are met:
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
  *
- * * Redistributions  of  source  code  must  retain  the above copyright
- *   notice,  this  list  of  conditions  and  the  following  disclaimer.
- * * Redistributions  in  binary  form must reproduce the above copyright
- *   notice,  this list of conditions and the following disclaimer in the
- *   documentation  and/or other materials provided with the distribution.
- * * The name of the author may not be used to endorse or promote products
- *   derived from this software without specific prior written permission.
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  *
- * THIS  SOFTWARE  IS  PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * ``AS IS''  AND  ANY  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A  PARTICULAR  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL
- * SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT NOT
- * LIMITED  TO,  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE
- * DATA,  OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY  OF  LIABILITY,  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF  THIS  SOFTWARE,  EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
 #include <starpu.h>

+ 9 - 23
doc/tutorial/vector_scal_cuda.cu

@@ -1,35 +1,21 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
- * Redistribution  and  use  in  source and binary forms, with or without
- * modification,  are  permitted  provided  that the following conditions
- * are met:
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
  *
- * * Redistributions  of  source  code  must  retain  the above copyright
- *   notice,  this  list  of  conditions  and  the  following  disclaimer.
- * * Redistributions  in  binary  form must reproduce the above copyright
- *   notice,  this list of conditions and the following disclaimer in the
- *   documentation  and/or other materials provided with the distribution.
- * * The name of the author may not be used to endorse or promote products
- *   derived from this software without specific prior written permission.
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  *
- * THIS  SOFTWARE  IS  PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * ``AS IS''  AND  ANY  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A  PARTICULAR  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL
- * SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT NOT
- * LIMITED  TO,  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE
- * DATA,  OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY  OF  LIABILITY,  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF  THIS  SOFTWARE,  EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
 #include <starpu.h>
-#include <starpu_cuda.h>
 
 static __global__ void vector_mult_cuda(float *val, unsigned n, float factor)
 {

+ 9 - 23
doc/tutorial/vector_scal_opencl.c

@@ -1,35 +1,21 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
- * Redistribution  and  use  in  source and binary forms, with or without
- * modification,  are  permitted  provided  that the following conditions
- * are met:
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
  *
- * * Redistributions  of  source  code  must  retain  the above copyright
- *   notice,  this  list  of  conditions  and  the  following  disclaimer.
- * * Redistributions  in  binary  form must reproduce the above copyright
- *   notice,  this list of conditions and the following disclaimer in the
- *   documentation  and/or other materials provided with the distribution.
- * * The name of the author may not be used to endorse or promote products
- *   derived from this software without specific prior written permission.
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  *
- * THIS  SOFTWARE  IS  PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * ``AS IS''  AND  ANY  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A  PARTICULAR  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL
- * SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT NOT
- * LIMITED  TO,  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE
- * DATA,  OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY  OF  LIABILITY,  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF  THIS  SOFTWARE,  EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
 #include <starpu.h>
-#include <starpu_opencl.h>
 
 extern struct starpu_opencl_program programs;
 

+ 9 - 22
doc/tutorial/vector_scal_opencl_kernel.cl

@@ -1,31 +1,18 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
- * Redistribution  and  use  in  source and binary forms, with or without
- * modification,  are  permitted  provided  that the following conditions
- * are met:
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
  *
- * * Redistributions  of  source  code  must  retain  the above copyright
- *   notice,  this  list  of  conditions  and  the  following  disclaimer.
- * * Redistributions  in  binary  form must reproduce the above copyright
- *   notice,  this list of conditions and the following disclaimer in the
- *   documentation  and/or other materials provided with the distribution.
- * * The name of the author may not be used to endorse or promote products
- *   derived from this software without specific prior written permission.
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  *
- * THIS  SOFTWARE  IS  PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * ``AS IS''  AND  ANY  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A  PARTICULAR  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL
- * SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT NOT
- * LIMITED  TO,  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE
- * DATA,  OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY  OF  LIABILITY,  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF  THIS  SOFTWARE,  EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
 __kernel void vector_mult_opencl(__global float* val, int nx, float factor)

+ 45 - 39
examples/Makefile.am

@@ -1,9 +1,9 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2009, 2010-2012  Université de Bordeaux 1
-# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+# Copyright (C) 2009-2013  Université de Bordeaux 1
+# Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
 # Copyright (C) 2011  Télécom-SudParis
-# Copyright (C) 2012 INRIA
+# Copyright (C) 2011-2012  INRIA
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -16,11 +16,11 @@
 #
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
-AM_CFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
+AM_CFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) -Werror=implicit
 AM_CXXFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
-LIBS = $(top_builddir)/src/libstarpu-@STARPU_EFFECTIVE_VERSION@.la $(MAGMA_LIBS) $(HWLOC_LIBS) @LIBS@
+LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ $(MAGMA_LIBS) $(HWLOC_LIBS) @LIBS@
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include
-AM_LDFLAGS = $(STARPU_CUDA_LDFLAGS) $(STARPU_OPENCL_LDFLAGS)
+AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) 
 
 SUBDIRS = stencil
 
@@ -39,7 +39,6 @@ EXTRA_DIST = 					\
 	spmd/vector_scal_spmd.c			\
 	spmv/spmv_cuda.cu			\
 	spmv/spmv_opencl.cl			\
-	gordon/null_kernel_gordon.c		\
 	mult/xgemm.c				\
 	lu/xlu.c				\
 	lu/xlu_pivot.c				\
@@ -47,6 +46,8 @@ EXTRA_DIST = 					\
 	lu/xlu_implicit_pivot.c			\
 	lu/xlu_kernels.c			\
 	lu/lu_example.c				\
+	sched_ctx_utils/sched_ctx_utils.c		\
+	sched_ctx/sched_ctx.c		\
 	incrementer/incrementer_kernels_opencl_kernel.cl 	\
 	basic_examples/variable_kernels_opencl_kernel.cl	\
 	matvecmult/matvecmult_kernel.cl				\
@@ -59,42 +60,17 @@ EXTRA_DIST = 					\
 	reductions/dot_product_opencl_kernels.cl	\
 	scheduler/schedulers.sh
 
-CLEANFILES = 					\
-	gordon/null_kernel_gordon.spuelf
-
-
-CLEANFILES += *.gcno *.gcda *.linkinfo
+CLEANFILES = *.gcno *.gcda *.linkinfo
 
 if STARPU_USE_CUDA
 
-NVCCFLAGS += --compiler-options -fno-strict-aliasing  -I$(top_srcdir)/include/ -I$(top_builddir)/include/ -I$(top_srcdir)/examples/  $(HWLOC_CFLAGS) -arch sm_13
+NVCCFLAGS += --compiler-options -fno-strict-aliasing  -I$(top_srcdir)/include/ -I$(top_builddir)/include/ -I$(top_srcdir)/examples/  $(HWLOC_CFLAGS)
 
 .cu.o:
 	$(NVCC) $< -c -o $@ $(NVCCFLAGS)
 
 endif
 
-if STARPU_USE_GORDON
-
-SPU_CC ?= spu-gcc
-SPU_LD ?= spu-ld
-
-SPULDFLAGS =
-SPULIBS = -lblas #-lc -lgloss -lc
-
-.c.spuo:
-	$(MKDIR_P) `dirname $@`
-	$(SPU_CC) -c -fpic $< -o $@
-
-.spuo.spuelf:
-	$(MKDIR_P) `dirname $@`
-	$(SPU_LD) $(SPULDFLAGS) $< -o $@ $(SPULIBS)
-
-BUILT_SOURCES +=				\
-	gordon/null_kernel_gordon.spuelf
-
-endif
-
 if STARPU_HAVE_ICC
 .icc.o:
 	$(ICC) -x c $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
@@ -120,11 +96,11 @@ noinst_HEADERS = 				\
 	lu/complex_double.h			\
 	lu/blas_complex.h			\
 	cholesky/cholesky.h			\
+	sched_ctx_utils/sched_ctx_utils.h	\
 	common/blas_model.h			\
 	common/blas.h				\
 	mult/simple.h				\
 	mult/double.h				\
-	gordon/null.h				\
 	fortran/bindings/StarPU_fortran.h	\
 	ppm_downscaler/ppm_downscaler.h		\
 	ppm_downscaler/yuv_downscaler.h		\
@@ -136,12 +112,14 @@ noinst_HEADERS = 				\
 	filters/custom_mf/custom_interface.h    \
 	filters/custom_mf/custom_types.h	\
 	interface/complex_interface.h		\
+	interface/complex_codelet.h		\
 	pi/pi.h					\
 	pi/SobolQRNG/sobol.h			\
 	pi/SobolQRNG/sobol_gold.h		\
 	pi/SobolQRNG/sobol_gpu.h		\
 	pi/SobolQRNG/sobol_primitives.h         \
-	reductions/dot_product.h
+	reductions/dot_product.h                \
+	basic_examples/vector_scal_cpu_template.h
 
 #####################################
 # What to install and what to check #
@@ -164,9 +142,16 @@ LOADER			=	loader
 loader_CPPFLAGS =  $(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
 LOADER_BIN		=	$(abs_top_builddir)/examples/$(LOADER)
 loader_SOURCES		=	../tests/loader.c
+
+if STARPU_HAVE_AM111
+TESTS_ENVIRONMENT	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
+LOG_COMPILER		=	$(LOADER_BIN)
+else
 TESTS_ENVIRONMENT	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(LOADER_BIN)
 endif
 
+endif
+
 examplebin_PROGRAMS +=				\
 	basic_examples/hello_world		\
 	basic_examples/vector_scal		\
@@ -180,6 +165,8 @@ examplebin_PROGRAMS +=				\
 	filters/fblock				\
 	filters/fmatrix				\
 	filters/shadow				\
+	filters/shadow2d			\
+	filters/shadow3d			\
 	tag_example/tag_example			\
 	tag_example/tag_example2		\
 	tag_example/tag_example3		\
@@ -193,16 +180,21 @@ examplebin_PROGRAMS +=				\
 	interface/complex			\
 	matvecmult/matvecmult			\
 	profiling/profiling			\
-	scheduler/dummy_sched			\
+	sched_ctx/sched_ctx			\
 	reductions/dot_product			\
 	reductions/minmax_reduction		\
 	ppm_downscaler/ppm_downscaler		\
 	ppm_downscaler/yuv_downscaler
 
+if !STARPU_SIMGRID
+examplebin_PROGRAMS +=				\
+	scheduler/dummy_sched
+
 if STARPU_HAVE_F77_H
 examplebin_PROGRAMS +=				\
 	basic_examples/vector_scal_fortran
 endif
+endif
 
 if !NO_BLAS_LIB
 examplebin_PROGRAMS +=				\
@@ -260,6 +252,7 @@ STARPU_EXAMPLES +=				\
 	matvecmult/matvecmult			\
 	profiling/profiling			\
 	scheduler/dummy_sched			\
+	sched_ctx/sched_ctx				\
 	reductions/dot_product			\
 	reductions/minmax_reduction
 
@@ -310,7 +303,6 @@ basic_examples_vector_scal_SOURCES =		\
 if STARPU_HAVE_ICC
 basic_examples_vector_scal_SOURCES +=		\
 	basic_examples/vector_scal_cpu_icc.icc
-basic_examples/vector_scal_cpu_icc.o: CFLAGS += -Dscal_cpu_func=scal_cpu_func_icc -Dscal_sse_func=scal_sse_func_icc
 endif
 
 if STARPU_USE_CUDA
@@ -523,6 +515,7 @@ cholesky_cholesky_implicit_SOURCES =		\
 	cholesky/cholesky_implicit.c		\
 	cholesky/cholesky_models.c		\
 	cholesky/cholesky_kernels.c		\
+	sched_ctx_utils/sched_ctx_utils.c	\
 	common/blas.c
 
 cholesky_cholesky_implicit_LDADD =		\
@@ -663,6 +656,8 @@ cg_cg_SOURCES =					\
 if STARPU_USE_CUDA
 cg_cg_SOURCES +=				\
 	cg/cg_dot_kernel.cu
+cg/cg_dot_kernel.o: cg/cg_dot_kernel.cu
+	$(NVCC) $< -c -o $@ $(NVCCFLAGS) -arch sm_13
 endif
 
 cg_cg_LDADD =					\
@@ -755,6 +750,8 @@ interface_complex_SOURCES	=	\
 if STARPU_USE_CUDA
 interface_complex_SOURCES	+=	\
 	interface/complex_kernels.cu
+interface/complex_kernels.o: interface/complex_kernels.cu
+	$(NVCC) $< -c -o $@ $(NVCCFLAGS) -arch sm_13
 endif
 
 if STARPU_USE_OPENCL
@@ -782,6 +779,8 @@ reductions_dot_product_SOURCES =		\
 if STARPU_USE_CUDA
 reductions_dot_product_SOURCES +=		\
 	reductions/dot_product_kernels.cu
+reductions/dot_product_kernels.o: reductions/dot_product_kernels.cu
+	$(NVCC) $< -c -o $@ $(NVCCFLAGS) -arch sm_13
 endif
 if STARPU_USE_OPENCL
 nobase_STARPU_OPENCL_DATA_DATA += \
@@ -841,13 +840,20 @@ endif
 
 if HAVE_OPENGL
 examplebin_PROGRAMS +=				\
-	gl_interop/gl_interop
+	gl_interop/gl_interop			\
+	gl_interop/gl_interop_idle
 
 gl_interop_gl_interop_SOURCES =			\
 	gl_interop/gl_interop.c
 
 gl_interop_gl_interop_LDADD =			\
 	$(STARPU_OPENGL_RENDER_LDFLAGS)
+
+gl_interop_gl_interop_idle_SOURCES =		\
+	gl_interop/gl_interop_idle.c
+
+gl_interop_gl_interop_idle_LDADD =		\
+	$(STARPU_OPENGL_RENDER_LDFLAGS)
 endif
 
 ####################

+ 10 - 3
examples/audio/starpu_audio_processing.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
@@ -179,9 +179,11 @@ static void band_filter_kernel_gpu(void *descr[], __attribute__((unused)) void *
 	{
 		cures = cufftPlan1d(&plans[workerid].plan, nsamples, CUFFT_R2C, 1);
 		STARPU_ASSERT(cures == CUFFT_SUCCESS);
+		cufftSetStream(plans[workerid].plan, starpu_cuda_get_local_stream());
 
 		cures = cufftPlan1d(&plans[workerid].inv_plan, nsamples, CUFFT_C2R, 1);
 		STARPU_ASSERT(cures == CUFFT_SUCCESS);
+		cufftSetStream(plans[workerid].inv_plan, starpu_cuda_get_local_stream());
 
 		cudaMalloc((void **)&plans[workerid].localout,
 					nsamples*sizeof(cufftComplex));
@@ -198,11 +200,11 @@ static void band_filter_kernel_gpu(void *descr[], __attribute__((unused)) void *
 	
 	/* filter low freqs */
 	unsigned lowfreq_index = (LOWFREQ*nsamples)/SAMPLERATE;
-	cudaMemset(&localout[0], 0, lowfreq_index*sizeof(fftwf_complex));
+	cudaMemsetAsync(&localout[0], 0, lowfreq_index*sizeof(fftwf_complex), starpu_cuda_get_local_stream());
 
 	/* filter high freqs */
 	unsigned hifreq_index = (HIFREQ*nsamples)/SAMPLERATE;
-	cudaMemset(&localout[hifreq_index], nsamples/2, (nsamples/2 - hifreq_index)*sizeof(fftwf_complex));
+	cudaMemsetAsync(&localout[hifreq_index], nsamples/2, (nsamples/2 - hifreq_index)*sizeof(fftwf_complex), starpu_cuda_get_local_stream());
 
 	/* inverse FFT */
 	cures = cufftExecC2R(plans[workerid].inv_plan, localout, localA);
@@ -210,6 +212,7 @@ static void band_filter_kernel_gpu(void *descr[], __attribute__((unused)) void *
 
 	/* FFTW does not normalize its output ! */
 	cublasSscal (nsamples, 1.0f/nsamples, localA, 1);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -410,6 +413,8 @@ int main(int argc, char **argv)
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
+	starpu_helper_cublas_init();
+
 	starpu_vector_data_register(&A_handle, 0, (uintptr_t)A, niter*nsamples, sizeof(float));
 
 	struct starpu_data_filter f =
@@ -458,6 +463,8 @@ int main(int argc, char **argv)
 	starpu_data_unpartition(A_handle, 0);
 	starpu_data_unregister(A_handle);
 
+	starpu_helper_cublas_shutdown();
+
 	/* we are done ! */
 	starpu_shutdown();
 

+ 1 - 4
examples/axpy/axpy.c

@@ -28,9 +28,6 @@
 #ifdef STARPU_USE_CUDA
 #include <cublas.h>
 #endif
-#ifdef STARPU_USE_OPENCL
-#include <starpu_opencl.h>
-#endif
 
 #include "axpy.h"
 
@@ -74,7 +71,7 @@ void axpy_gpu(void *descr[], __attribute__((unused)) void *arg)
 	TYPE *block_y = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
 
 	CUBLASAXPY((int)n, alpha, block_x, 1, block_y, 1);
-	cudaThreadSynchronize();
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 

+ 18 - 0
examples/axpy/axpy.h

@@ -1,3 +1,21 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
 #ifndef AXPY_H__
 #define AXPY_H__
 

+ 0 - 1
examples/axpy/axpy_opencl.c

@@ -15,7 +15,6 @@
  */
 
 #include <starpu.h>
-#include <starpu_opencl.h>
 #include "axpy.h"
 
 extern struct starpu_opencl_program opencl_program;

+ 0 - 1
examples/basic_examples/block.c

@@ -16,7 +16,6 @@
  */
 
 #include <starpu.h>
-#include <starpu_opencl.h>
 #include <pthread.h>
 #include <math.h>
 

+ 0 - 1
examples/basic_examples/block_cuda.cu

@@ -15,7 +15,6 @@
  */
 
 #include <starpu.h>
-#include <starpu_cuda.h>
 
 static __global__ void cuda_block(float *block, int nx, int ny, int nz, unsigned ldy, unsigned ldz, float multiplier)
 {

+ 0 - 1
examples/basic_examples/block_opencl.c

@@ -16,7 +16,6 @@
  */
 
 #include <starpu.h>
-#include <starpu_opencl.h>
 
 #define CHECK_CL_SET_KERNEL_ARG(kernel, n, size, ptr)       \
 do						    	    \

+ 1 - 0
examples/basic_examples/hello_world.c

@@ -87,6 +87,7 @@ int main(int argc, char **argv)
 	/* the codelet does not manipulate any data that is managed
 	 * by our DSM */
 	cl.nbuffers = 0;
+	cl.name="hello";
 
 	/* the task uses codelet "cl" */
 	task->cl = &cl;

+ 1 - 4
examples/basic_examples/multiformat.c

@@ -15,9 +15,6 @@
  */
 
 #include <starpu.h>
-#ifdef STARPU_USE_OPENCL
-#include <starpu_opencl.h>
-#endif
 #include "multiformat_types.h"
 
 static int ncpu = 0;
@@ -256,7 +253,7 @@ struct starpu_opencl_program opencl_conversion_program;
 #endif
 
 static int
-gpus_available()
+gpus_available(void)
 {
 #ifdef STARPU_USE_CUDA
 	if (ncuda > 0)

+ 0 - 1
examples/basic_examples/multiformat_conversion_codelets_cuda.cu

@@ -15,7 +15,6 @@
  */
 
 #include <starpu.h>
-#include <starpu_cuda.h>
 #include "multiformat_types.h"
 
 static __global__ void cpu_to_cuda_cuda(struct point *src,

+ 0 - 1
examples/basic_examples/multiformat_conversion_codelets_opencl.c

@@ -15,7 +15,6 @@
  */
 
 #include <starpu.h>
-#include <starpu_opencl.h>
 
 extern struct starpu_opencl_program opencl_conversion_program;
 

+ 0 - 1
examples/basic_examples/multiformat_cuda.cu

@@ -15,7 +15,6 @@
  */
 
 #include <starpu.h>
-#include <starpu_cuda.h>
 #include "multiformat_types.h"
 
 static __global__ void multiformat_cuda(struct struct_of_arrays *soa, unsigned n)

+ 0 - 1
examples/basic_examples/multiformat_opencl.c

@@ -15,7 +15,6 @@
  */
 
 #include <starpu.h>
-#include <starpu_opencl.h>
 
 extern struct starpu_opencl_program opencl_program;
 

+ 4 - 5
examples/basic_examples/variable.c

@@ -29,7 +29,6 @@ extern void cuda_codelet(void *descr[], __attribute__ ((unused)) void *_args);
 #endif
 
 #ifdef STARPU_USE_OPENCL
-#include <starpu_opencl.h>
 extern void opencl_codelet(void *descr[], __attribute__ ((unused)) void *_args);
 struct starpu_opencl_program opencl_program;
 #endif
@@ -46,7 +45,7 @@ int main(int argc, char **argv)
 	if (ret == -ENODEV) goto enodev;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-#ifdef STARPU_SLOW_MACHINE
+#ifdef STARPU_QUICK_CHECK
 	niter /= 100;
 #endif
         if (argc == 2) niter = atoi(argv[1]);
@@ -75,7 +74,6 @@ int main(int argc, char **argv)
 	for (i = 0; i < niter; i++)
 	{
 		struct starpu_task *task = starpu_task_create();
-                int ret;
 
 		task->cl = &cl;
 
@@ -87,7 +85,8 @@ int main(int argc, char **argv)
 		if (STARPU_UNLIKELY(ret == -ENODEV))
 		{
 			FPRINTF(stderr, "No worker may execute this task\n");
-			exit(0);
+			starpu_data_unregister(float_array_handle);
+			goto enodev;
 		}
 	}
 
@@ -100,7 +99,7 @@ int main(int argc, char **argv)
 
 	starpu_shutdown();
 
-	return 0;
+	return (foo == niter) ? EXIT_SUCCESS:EXIT_FAILURE;
 
 enodev:
 	starpu_shutdown();

+ 1 - 2
examples/basic_examples/variable_kernels.cu

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -16,7 +16,6 @@
  */
 
 #include <starpu.h>
-#include <starpu_cuda.h>
 
 static __global__ void cuda_variable(float * tab)
 {

+ 1 - 2
examples/basic_examples/variable_kernels_opencl.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -16,7 +16,6 @@
  */
 
 #include <starpu.h>
-#include <starpu_opencl.h>
 
 extern struct starpu_opencl_program opencl_program;
 void opencl_codelet(void *descr[], void *_args)

+ 9 - 3
examples/basic_examples/vector_scal.c

@@ -24,7 +24,6 @@
  */
 
 #include <starpu.h>
-#include <starpu_opencl.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
@@ -42,13 +41,13 @@ extern void scal_opencl_func(void *buffers[], void *_args);
 static struct starpu_perfmodel vector_scal_model =
 {
 	.type = STARPU_HISTORY_BASED,
-	.symbol = "vector_scale"
+	.symbol = "vector_scal"
 };
 
 static struct starpu_perfmodel vector_scal_power_model =
 {
 	.type = STARPU_HISTORY_BASED,
-	.symbol = "vector_scale_power"
+	.symbol = "vector_scal_power"
 };
 
 static struct starpu_codelet cl =
@@ -88,8 +87,15 @@ struct starpu_opencl_program opencl_program;
 
 static int approximately_equal(float a, float b)
 {
+#ifdef STARPU_HAVE_NEARBYINTF
 	int ai = (int) nearbyintf(a * 1000.0);
 	int bi = (int) nearbyintf(b * 1000.0);
+#elif defined(STARPU_HAVE_RINTF)
+	int ai = (int) rintf(a * 1000.0);
+	int bi = (int) rintf(b * 1000.0);
+#else
+#error "Please define either nearbyintf or rintf."
+#endif
 	return ai == bi;
 }
 

+ 1 - 2
examples/basic_examples/vector_scal_c.c

@@ -26,7 +26,6 @@
  */
 
 #include <starpu.h>
-#include <starpu_opencl.h>
 #include <stdio.h>
 
 
@@ -36,7 +35,7 @@ extern void scal_cuda_func(void *buffers[], void *_args);
 static struct starpu_perfmodel vector_scal_model =
 {
 	.type = STARPU_HISTORY_BASED,
-	.symbol = "vector_scale_model"
+	.symbol = "vector_scal_model"
 };
 
 static struct starpu_codelet cl =

+ 4 - 63
examples/basic_examples/vector_scal_cpu.c

@@ -15,70 +15,11 @@
  */
 
 /*
- * This example complements vector_scale.c: here we implement a CPU version.
+ * This example complements vector_scal.c: here we implement a CPU version.
  */
 
-#include <starpu.h>
-#ifdef __SSE__
-#include <xmmintrin.h>
-#endif
+#include "vector_scal_cpu_template.h"
 
-/* This kernel takes a buffer and scales it by a constant factor */
-void scal_cpu_func(void *buffers[], void *cl_arg)
-{
-	unsigned i;
-	float *factor = (float *) cl_arg;
+VECTOR_SCAL_CPU_FUNC(scal_cpu_func)
+VECTOR_SCAL_SSE_FUNC(scal_sse_func)
 
-	/*
-	 * The "buffers" array matches the task->handles array: for instance
-	 * task->handles[0] is a handle that corresponds to a data with
-	 * vector "interface", so that the first entry of the array in the
-	 * codelet  is a pointer to a structure describing such a vector (ie.
-	 * struct starpu_vector_interface *). Here, we therefore manipulate
-	 * the buffers[0] element as a vector: nx gives the number of elements
-	 * in the array, ptr gives the location of the array (that was possibly
-	 * migrated/replicated), and elemsize gives the size of each elements.
-	 */
-
-	struct starpu_vector_interface *vector = (struct starpu_vector_interface *) buffers[0];
-
-	/* length of the vector */
-	unsigned n = STARPU_VECTOR_GET_NX(vector);
-
-	/* get a pointer to the local copy of the vector : note that we have to
-	 * cast it in (float *) since a vector could contain any type of
-	 * elements so that the .ptr field is actually a uintptr_t */
-	float *val = (float *)STARPU_VECTOR_GET_PTR(vector);
-
-	/* scale the vector */
-	for (i = 0; i < n; i++)
-		val[i] *= *factor;
-}
-
-#ifdef __SSE__
-void scal_sse_func(void *buffers[], void *cl_arg)
-{
-	float *vector = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);
-	unsigned int n = STARPU_VECTOR_GET_NX(buffers[0]);
-	unsigned int n_iterations = n/4;
-
-	__m128 *VECTOR = (__m128*) vector;
-	__m128 FACTOR __attribute__((aligned(16)));
-	float factor = *(float *) cl_arg;
-	FACTOR = _mm_set1_ps(factor);
-
-	unsigned int i;	
-	for (i = 0; i < n_iterations; i++)
-		VECTOR[i] = _mm_mul_ps(FACTOR, VECTOR[i]);
-
-	unsigned int remainder = n%4;
-	if (remainder != 0)
-	{
-		unsigned int start = 4 * n_iterations;
-		for (i = start; i < start+remainder; ++i)
-		{
-			vector[i] = factor * vector[i];
-		}
-	}
-}
-#endif

+ 0 - 1
examples/basic_examples/vector_scal_cpu_icc.icc

@@ -1 +0,0 @@
-vector_scal_cpu.c

+ 26 - 0
examples/basic_examples/vector_scal_cpu_icc.icc

@@ -0,0 +1,26 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This example complements vector_scal.c: here we implement a CPU version,
+ * meant to be compiled by icc.
+ */
+
+#include "vector_scal_cpu_template.h"
+
+VECTOR_SCAL_CPU_FUNC(scal_cpu_func_icc)
+VECTOR_SCAL_SSE_FUNC(scal_sse_func_icc)
+

+ 93 - 0
examples/basic_examples/vector_scal_cpu_template.h

@@ -0,0 +1,93 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This example complements vector_scal.c: here we implement a CPU version.
+ */
+
+#ifndef __VECTOR_SCAL_CPU_TEMPLATE_H__
+#define __VECTOR_SCAL_CPU_TEMPLATE_H__
+
+#include <starpu.h>
+#ifdef __SSE__
+#include <xmmintrin.h>
+#endif
+
+/* This kernel takes a buffer and scales it by a constant factor */
+#define VECTOR_SCAL_CPU_FUNC(func_name)                                        \
+void func_name(void *buffers[], void *cl_arg)                                  \
+{                                                                              \
+	unsigned i;                                                            \
+	float *factor = (float *) cl_arg;                                      \
+                                                                               \
+	/*                                                                     \
+	 * The "buffers" array matches the task->handles array: for instance   \
+	 * task->handles[0] is a handle that corresponds to a data with        \
+	 * vector "interface", so that the first entry of the array in the     \
+	 * codelet  is a pointer to a structure describing such a vector (ie.  \
+	 * struct starpu_vector_interface *). Here, we therefore manipulate    \
+	 * the buffers[0] element as a vector: nx gives the number of elements \
+	 * in the array, ptr gives the location of the array (that was possibly \
+	 * migrated/replicated), and elemsize gives the size of each elements.  \
+	 */                                                                    \
+                                                                               \
+	struct starpu_vector_interface *vector = (struct starpu_vector_interface *) buffers[0]; \
+                                                                               \
+	/* length of the vector */                                             \
+	unsigned n = STARPU_VECTOR_GET_NX(vector);                             \
+                                                                               \
+	/* get a pointer to the local copy of the vector : note that we have to \
+	 * cast it in (float *) since a vector could contain any type of       \
+	 * elements so that the .ptr field is actually a uintptr_t */          \
+	float *val = (float *)STARPU_VECTOR_GET_PTR(vector);                   \
+                                                                               \
+	/* scale the vector */                                                 \
+	for (i = 0; i < n; i++)                                                \
+		val[i] *= *factor;                                             \
+}
+
+#ifdef __SSE__
+#define VECTOR_SCAL_SSE_FUNC(func_name)                                        \
+void func_name(void *buffers[], void *cl_arg)                                  \
+{                                                                              \
+	float *vector = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);           \
+	unsigned int n = STARPU_VECTOR_GET_NX(buffers[0]);                     \
+	unsigned int n_iterations = n/4;                                       \
+                                                                               \
+	__m128 *VECTOR = (__m128*) vector;                                     \
+	__m128 FACTOR __attribute__((aligned(16)));                            \
+	float factor = *(float *) cl_arg;                                      \
+	FACTOR = _mm_set1_ps(factor);                                          \
+                                                                               \
+	unsigned int i;	                                                       \
+	for (i = 0; i < n_iterations; i++)                                     \
+		VECTOR[i] = _mm_mul_ps(FACTOR, VECTOR[i]);                     \
+                                                                               \
+	unsigned int remainder = n%4;                                          \
+	if (remainder != 0)                                                    \
+	{                                                                      \
+		unsigned int start = 4 * n_iterations;                         \
+		for (i = start; i < start+remainder; ++i)                      \
+		{                                                              \
+			vector[i] = factor * vector[i];                        \
+		}                                                              \
+	}                                                                      \
+}
+#else /* !__SSE__ */
+#define VECTOR_SCAL_SSE_FUNC(func_name)
+#endif /* !__SSE__ */
+
+#endif /* !__VECTOR_SCAL_CPU_TEMPLATE_H__ */

+ 2 - 3
examples/basic_examples/vector_scal_cuda.cu

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2010  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -16,11 +16,10 @@
  */
 
 /*
- * This example complements vector_scale.c: here we implement a CUDA version.
+ * This example complements vector_scal.c: here we implement a CUDA version.
  */
 
 #include <starpu.h>
-#include <starpu_cuda.h>
 
 static __global__ void vector_mult_cuda(float *val, unsigned n,
                                         float factor)

+ 2 - 3
examples/basic_examples/vector_scal_opencl.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2010  Institut National de Recherche en Informatique et Automatique
  * Copyright (C) 2011  Université de Bordeaux 1
  *
@@ -17,11 +17,10 @@
  */
 
 /*
- * This example complements vector_scale.c: here we implement a OpenCL version.
+ * This example complements vector_scal.c: here we implement a OpenCL version.
  */
 
 #include <starpu.h>
-#include <starpu_opencl.h>
 
 extern struct starpu_opencl_program opencl_program;
 

+ 0 - 1
examples/binary/binary.c

@@ -22,7 +22,6 @@
 #define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
 
 #ifdef STARPU_USE_OPENCL
-#include <starpu_opencl.h>
 extern void opencl_codelet(void *descr[], __attribute__ ((unused)) void *_args);
 struct starpu_opencl_program opencl_program;
 #endif

+ 1 - 1
examples/cg/cg.c

@@ -406,7 +406,7 @@ int main(int argc, char **argv)
 {
 	int ret;
 
-#ifdef STARPU_SLOW_MACHINE
+#ifdef STARPU_QUICK_CHECK
 	i_max = 16;
 #endif
 

+ 0 - 3
examples/cg/cg.h

@@ -24,11 +24,8 @@
 #ifdef STARPU_USE_CUDA
 #include <cuda.h>
 #include <cublas.h>
-#include <starpu_cuda.h>
 #endif
 
-#include <starpu.h>
-
 #define DOUBLE
 
 #ifdef DOUBLE

+ 1 - 2
examples/cg/cg_dot_kernel.cu

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -16,7 +16,6 @@
  */
 
 #include <starpu.h>
-#include <starpu_cuda.h>
 
 #include "cg.h"
 

+ 34 - 4
examples/cg/cg_kernels.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012-2013  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -44,6 +44,23 @@ static void print_matrix_from_descr(unsigned nx, unsigned ny, unsigned ld, TYPE
 }
 #endif
 
+static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
+{
+	enum starpu_archtype type = starpu_worker_get_type(workerid);
+	if (type == STARPU_CPU_WORKER || type == STARPU_OPENCL_WORKER)
+		return 1;
+
+#ifdef STARPU_USE_CUDA
+	/* Cuda device */
+	const struct cudaDeviceProp *props;
+	props = starpu_cuda_get_device_properties(workerid);
+	if (props->major >= 2 || props->minor >= 3)
+		/* At least compute capability 1.3, supports doubles */
+		return 1;
+#endif
+	/* Old card, does not support doubles */
+	return 0;
+}
 
 /*
  *	Reduction accumulation methods
@@ -76,6 +93,7 @@ static struct starpu_perfmodel accumulate_variable_model =
 
 struct starpu_codelet accumulate_variable_cl =
 {
+	.can_execute = can_execute,
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {accumulate_variable_cpu, NULL},
 #ifdef STARPU_USE_CUDA
@@ -114,6 +132,7 @@ static struct starpu_perfmodel accumulate_vector_model =
 
 struct starpu_codelet accumulate_vector_cl =
 {
+	.can_execute = can_execute,
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {accumulate_vector_cpu, NULL},
 #ifdef STARPU_USE_CUDA
@@ -154,6 +173,7 @@ static struct starpu_perfmodel bzero_variable_model =
 
 struct starpu_codelet bzero_variable_cl =
 {
+	.can_execute = can_execute,
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {bzero_variable_cpu, NULL},
 #ifdef STARPU_USE_CUDA
@@ -191,6 +211,7 @@ static struct starpu_perfmodel bzero_vector_model =
 
 struct starpu_codelet bzero_vector_cl =
 {
+	.can_execute = can_execute,
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {bzero_vector_cpu, NULL},
 #ifdef STARPU_USE_CUDA
@@ -246,6 +267,7 @@ static struct starpu_perfmodel dot_kernel_model =
 
 static struct starpu_codelet dot_kernel_cl =
 {
+	.can_execute = can_execute,
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {dot_kernel_cpu, NULL},
 #ifdef STARPU_USE_CUDA
@@ -264,9 +286,13 @@ int dot_kernel(starpu_data_handle_t v1,
 	int ret;
 
 	/* Blank the accumulation variable */
-	ret = starpu_insert_task(&bzero_variable_cl, STARPU_W, s, 0);
-	if (ret == -ENODEV) return ret;
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+	if (use_reduction)
+		starpu_data_invalidate_submit(s);
+	else {
+		ret = starpu_insert_task(&bzero_variable_cl, STARPU_W, s, 0);
+		if (ret == -ENODEV) return ret;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+	}
 
 	unsigned b;
 	for (b = 0; b < nblocks; b++)
@@ -321,6 +347,7 @@ static struct starpu_perfmodel scal_kernel_model =
 
 static struct starpu_codelet scal_kernel_cl =
 {
+	.can_execute = can_execute,
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {scal_kernel_cpu, NULL},
 #ifdef STARPU_USE_CUDA
@@ -394,6 +421,7 @@ static struct starpu_perfmodel gemv_kernel_model =
 
 static struct starpu_codelet gemv_kernel_cl =
 {
+	.can_execute = can_execute,
 	.where = STARPU_CPU|STARPU_CUDA,
 	.type = STARPU_SPMD,
 	.max_parallelism = INT_MAX,
@@ -493,6 +521,7 @@ static struct starpu_perfmodel scal_axpy_kernel_model =
 
 static struct starpu_codelet scal_axpy_kernel_cl =
 {
+	.can_execute = can_execute,
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {scal_axpy_kernel_cpu, NULL},
 #ifdef STARPU_USE_CUDA
@@ -567,6 +596,7 @@ static struct starpu_perfmodel axpy_kernel_model =
 
 static struct starpu_codelet axpy_kernel_cl =
 {
+	.can_execute = can_execute,
 	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {axpy_kernel_cpu, NULL},
 #ifdef STARPU_USE_CUDA

+ 47 - 8
examples/cholesky/cholesky.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009-2013  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -30,7 +30,6 @@
 
 #include <common/blas.h>
 #include <starpu.h>
-#include <starpu_bound.h>
 
 #define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
 #define NMAXBLOCKS	32
@@ -59,10 +58,16 @@
 static unsigned size = 4*1024;
 static unsigned nblocks = 16;
 static unsigned nbigblocks = 8;
-static unsigned pinned = 0;
+static unsigned pinned = 1;
 static unsigned noprio = 0;
 static unsigned check = 0;
 static unsigned bound = 0;
+static unsigned bound_deps = 0;
+static unsigned bound_lp = 0;
+static unsigned with_ctxs = 0;
+static unsigned with_noctxs = 0;
+static unsigned chole1 = 0;
+static unsigned chole2 = 0;
 
 void chol_cpu_codelet_update_u11(void **, void *);
 void chol_cpu_codelet_update_u21(void **, void *);
@@ -83,6 +88,29 @@ static void __attribute__((unused)) parse_args(int argc, char **argv)
 	int i;
 	for (i = 1; i < argc; i++)
 	{
+		if (strcmp(argv[i], "-with_ctxs") == 0) 
+		{
+			with_ctxs = 1;
+			break;
+		}
+		if (strcmp(argv[i], "-with_noctxs") == 0) 
+		{
+			with_noctxs = 1;
+			break;
+		}
+		
+		if (strcmp(argv[i], "-chole1") == 0) 
+		{
+			chole1 = 1;
+			break;
+		}
+
+		if (strcmp(argv[i], "-chole2") == 0) 
+		{
+			chole2 = 1;
+			break;
+		}
+
 		if (strcmp(argv[i], "-size") == 0)
 		{
 		        char *argptr;
@@ -101,9 +129,9 @@ static void __attribute__((unused)) parse_args(int argc, char **argv)
 			nbigblocks = strtol(argv[++i], &argptr, 10);
 		}
 
-		if (strcmp(argv[i], "-pin") == 0)
+		if (strcmp(argv[i], "-no-pin") == 0)
 		{
-			pinned = 1;
+			pinned = 0;
 		}
 
 		if (strcmp(argv[i], "-no-prio") == 0)
@@ -116,14 +144,25 @@ static void __attribute__((unused)) parse_args(int argc, char **argv)
 			bound = 1;
 		}
 
+		if (strcmp(argv[i], "-bound-lp") == 0)
+		{
+			bound_lp = 1;
+		}
+
+		if (strcmp(argv[i], "-bound-deps") == 0)
+		{
+			bound_deps = 1;
+		}
+
 		if (strcmp(argv[i], "-check") == 0)
 		{
 			check = 1;
 		}
 
-		if (strcmp(argv[i], "-h") == 0)
+		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i],"--help") == 0)
 		{
-			printf("usage : %s [-pin] [-size size] [-nblocks nblocks] [-check]\n", argv[0]);
+			fprintf(stderr,"usage : %s [-size size] [-nblocks nblocks] [-no-pin] [-no-prio] [-bound] [-bound-deps] [-bound-lp] [-check]\n", argv[0]);
+			fprintf(stderr,"Currently selected: %ux%u and %ux%u blocks\n", size, size, nblocks, nblocks);
 		}
 	}
 }

+ 10 - 6
examples/cholesky/cholesky_grain_tag.c

@@ -288,6 +288,7 @@ static void initialize_system(float **A, unsigned dim, unsigned pinned)
 
 	starpu_helper_cublas_init();
 
+#ifndef STARPU_SIMGRID
 	if (pinned)
 	{
 		starpu_malloc((void **)A, dim*dim*sizeof(float));
@@ -296,21 +297,22 @@ static void initialize_system(float **A, unsigned dim, unsigned pinned)
 	{
 		*A = malloc(dim*dim*sizeof(float));
 	}
+#endif
 }
 
 int cholesky_grain(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned nbigblocks, unsigned pinned)
 {
-	struct timeval start;
-	struct timeval end;
+	double start;
+	double end;
 	int ret;
 
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 
 	ret = cholesky_grain_rec(matA, size, ld, nblocks, nbigblocks, 0);
 
-	gettimeofday(&end, NULL);
+	end = starpu_timing_now();
 
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	double timing = end - start;
 	FPRINTF(stderr, "Computation took (in ms)\n");
 	FPRINTF(stdout, "%2.2f\n", timing/1000);
 
@@ -345,9 +347,10 @@ int main(int argc, char **argv)
 
 	parse_args(argc, argv);
 
-	float *mat;
+	float *mat = NULL;
 	initialize_system(&mat, size, pinned);
 
+#ifndef STARPU_SIMGRID
 	unsigned i,j;
 	for (i = 0; i < size; i++)
 	{
@@ -357,6 +360,7 @@ int main(int argc, char **argv)
 			/* mat[j +i*size] = ((i == j)?1.0f*size:0.0f); */
 		}
 	}
+#endif
 
 
 #ifdef CHECK_OUTPUT

+ 75 - 35
examples/cholesky/cholesky_implicit.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2009-2013  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
@@ -17,7 +17,7 @@
  */
 
 #include "cholesky.h"
-
+#include "../sched_ctx_utils/sched_ctx_utils.h"
 /*
  *	Create the codelets
  */
@@ -29,6 +29,8 @@ static struct starpu_codelet cl11 =
 	.cpu_funcs = {chol_cpu_codelet_update_u11, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {chol_cublas_codelet_update_u11, NULL},
+#elif defined(STARPU_SIMGRID)
+	.cuda_funcs = {(void*)1, NULL},
 #endif
 	.nbuffers = 1,
 	.modes = {STARPU_RW},
@@ -42,6 +44,8 @@ static struct starpu_codelet cl21 =
 	.cpu_funcs = {chol_cpu_codelet_update_u21, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {chol_cublas_codelet_update_u21, NULL},
+#elif defined(STARPU_SIMGRID)
+	.cuda_funcs = {(void*)1, NULL},
 #endif
 	.nbuffers = 2,
 	.modes = {STARPU_R, STARPU_RW},
@@ -56,6 +60,8 @@ static struct starpu_codelet cl22 =
 	.cpu_funcs = {chol_cpu_codelet_update_u22, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {chol_cublas_codelet_update_u22, NULL},
+#elif defined(STARPU_SIMGRID)
+	.cuda_funcs = {(void*)1, NULL},
 #endif
 	.nbuffers = 3,
 	.modes = {STARPU_R, STARPU_R, STARPU_RW},
@@ -75,17 +81,17 @@ static void callback_turn_spmd_on(void *arg __attribute__ ((unused)))
 static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 {
 	int ret;
-	struct timeval start;
-	struct timeval end;
+	double start;
+	double end;
 
 	unsigned i,j,k;
 
 	int prio_level = noprio?STARPU_DEFAULT_PRIO:STARPU_MAX_PRIO;
 
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 
 	if (bound)
-		starpu_bound_start(0, 0);
+		starpu_bound_start(bound_deps, 0);
 	/* create all the DAG nodes */
 	for (k = 0; k < nblocks; k++)
 	{
@@ -135,21 +141,32 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 	if (bound)
 		starpu_bound_stop();
 
-	gettimeofday(&end, NULL);
-
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
-	FPRINTF(stderr, "Computation took (in ms)\n");
-	FPRINTF(stdout, "%2.2f\n", timing/1000);
+	end = starpu_timing_now();
 
+	double timing = end - start;
 	unsigned long n = starpu_matrix_get_nx(dataA);
 
 	double flop = (1.0f*n*n*n)/3.0f;
-	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
-	if (bound)
+
+	if(with_ctxs || with_noctxs || chole1 || chole2)
+		update_sched_ctx_timing_results((flop/timing/1000.0f), (timing/1000000.0f));
+	else
 	{
-		double res;
-		starpu_bound_compute(&res, NULL, 0);
-		FPRINTF(stderr, "Theoretical GFlops: %2.2f\n", (flop/res/1000000.0f));
+		FPRINTF(stderr, "Computation took (in ms)\n");
+		FPRINTF(stdout, "%2.2f\n", timing/1000);
+	
+		FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+		if (bound_lp)
+		{
+			FILE *f = fopen("cholesky.lp", "w");
+			starpu_bound_print_lp(f);
+		}
+		if (bound)
+		{
+			double res;
+			starpu_bound_compute(&res, NULL, 0);
+			FPRINTF(stderr, "Theoretical GFlops: %2.2f\n", (flop/res/1000000.0f));
+		}
 	}
 	return 0;
 }
@@ -184,28 +201,14 @@ static int cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 	return ret;
 }
 
-int main(int argc, char **argv)
+static void execute_cholesky(unsigned size, unsigned nblocks)
 {
 	int ret;
+	float *mat = NULL;
+	unsigned i,j;
 
-	/* create a simple definite positive symetric matrix example
-	 *
-	 *	Hilbert matrix : h(i,j) = 1/(i+j+1)
-	 * */
-
-	parse_args(argc, argv);
-
-	ret = starpu_init(NULL);
-	if (ret == -ENODEV)
-		return 77;
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-
-	starpu_helper_cublas_init();
-
-	float *mat;
+#ifndef STARPU_SIMGRID
 	starpu_malloc((void **)&mat, (size_t)size*size*sizeof(float));
-
-	unsigned i,j;
 	for (i = 0; i < size; i++)
 	{
 		for (j = 0; j < size; j++)
@@ -214,6 +217,7 @@ int main(int argc, char **argv)
 			/* mat[j +i*size] = ((i == j)?1.0f*size:0.0f); */
 		}
 	}
+#endif
 
 /* #define PRINT_OUTPUT */
 #ifdef PRINT_OUTPUT
@@ -314,9 +318,45 @@ int main(int argc, char **argv)
 	        }
 		free(test_mat);
 	}
+	starpu_free(mat);
+}
+
+int main(int argc, char **argv)
+{
+	/* create a simple definite positive symetric matrix example
+	 *
+	 *	Hilbert matrix : h(i,j) = 1/(i+j+1)
+	 * */
+
+	parse_args(argc, argv);
+
+	if(with_ctxs || with_noctxs || chole1 || chole2)
+		parse_args_ctx(argc, argv);
+
+	int ret;
+	ret = starpu_init(NULL);
+
+	if (ret == -ENODEV)
+                return 77;
+        STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	starpu_helper_cublas_init();
+
+	if(with_ctxs)
+	{
+		construct_contexts(execute_cholesky);
+		start_2benchs(execute_cholesky);
+	}
+	else if(with_noctxs)
+		start_2benchs(execute_cholesky);
+	else if(chole1)
+		start_1stbench(execute_cholesky);
+	else if(chole2)
+		start_2ndbench(execute_cholesky);
+	else
+		execute_cholesky(size, nblocks);
 
 	starpu_helper_cublas_shutdown();
-	starpu_free(mat);
 	starpu_shutdown();
 
 	return ret;

+ 5 - 8
examples/cholesky/cholesky_kernels.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009, 2010, 2011-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -15,16 +15,13 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include <starpu_config.h>
+#include <starpu.h>
 #include "cholesky.h"
 #include "../common/blas.h"
-#ifdef STARPU_USE_CUDA
-#include <starpu_cuda.h>
-#ifdef STARPU_HAVE_MAGMA
+#if defined(STARPU_USE_CUDA) && defined(STARPU_HAVE_MAGMA)
 #include "magma.h"
 #include "magma_lapack.h"
 #endif
-#endif
 
 /*
  *   U22 
@@ -196,7 +193,7 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, __attrib
 				fprintf(stderr, "Error in Magma: %d\n", ret);
 				STARPU_ABORT();
 			}
-			cudaError_t cures = cudaThreadSynchronize();
+			cudaError_t cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
 			STARPU_ASSERT(!cures);
 			}
 #else

+ 10 - 6
examples/cholesky/cholesky_tag.c

@@ -175,15 +175,15 @@ static void create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned i, u
 
 static void _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 {
-	struct timeval start;
-	struct timeval end;
+	double start;
+	double end;
 
 	struct starpu_task *entry_task = NULL;
 
 	/* create all the DAG nodes */
 	unsigned i,j,k;
 
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 
 	for (k = 0; k < nblocks; k++)
 	{
@@ -230,10 +230,10 @@ static void _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 
 	starpu_data_unpartition(dataA, 0);
 
-	gettimeofday(&end, NULL);
+	end = starpu_timing_now();
 
 
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	double timing = end - start;
 	FPRINTF(stderr, "Computation took (in ms)\n");
 	FPRINTF(stdout, "%2.2f\n", timing/1000);
 
@@ -254,6 +254,7 @@ static int initialize_system(float **A, unsigned dim, unsigned pinned)
 
 	starpu_helper_cublas_init();
 
+#ifndef STARPU_SIMGRID
 	if (pinned)
 	{
 		starpu_malloc((void **)A, (size_t)dim*dim*sizeof(float));
@@ -262,6 +263,7 @@ static int initialize_system(float **A, unsigned dim, unsigned pinned)
 	{
 		*A = malloc(dim*dim*sizeof(float));
 	}
+#endif
 	return 0;
 }
 
@@ -318,10 +320,11 @@ int main(int argc, char **argv)
 
 	parse_args(argc, argv);
 
-	float *mat;
+	float *mat = NULL;
 	int ret = initialize_system(&mat, size, pinned);
 	if (ret) return ret;
 
+#ifndef STARPU_SIMGRID
 	unsigned i,j;
 	for (i = 0; i < size; i++)
 	{
@@ -331,6 +334,7 @@ int main(int argc, char **argv)
 			/* mat[j +i*size] = ((i == j)?1.0f*size:0.0f); */
 		}
 	}
+#endif
 
 
 #ifdef CHECK_OUTPUT

+ 12 - 31
examples/cholesky/cholesky_tile_tag.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009-2012  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -42,18 +42,11 @@ static struct starpu_task *create_task(starpu_tag_t id)
 static struct starpu_codelet cl11 =
 {
 	.modes = { STARPU_RW },
-	.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
+	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {chol_cpu_codelet_update_u11, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {chol_cublas_codelet_update_u11, NULL},
 #endif
-#ifdef STARPU_USE_GORDON
-#ifdef SPU_FUNC_POTRF
-	.gordon_func = SPU_FUNC_POTRF,
-#else
-#warning SPU_FUNC_POTRF is not available
-#endif
-#endif
 	.nbuffers = 1,
 	.model = &chol_model_11
 };
@@ -84,18 +77,11 @@ static struct starpu_task * create_task_11(unsigned k, unsigned nblocks)
 static struct starpu_codelet cl21 =
 {
 	.modes = { STARPU_R, STARPU_RW },
-	.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
+	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {chol_cpu_codelet_update_u21, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {chol_cublas_codelet_update_u21, NULL},
 #endif
-#ifdef STARPU_USE_GORDON
-#ifdef SPU_FUNC_STRSM
-	.gordon_func = SPU_FUNC_STRSM,
-#else
-#warning SPU_FUNC_STRSM is not available
-#endif
-#endif
 	.nbuffers = 2,
 	.model = &chol_model_21
 };
@@ -135,18 +121,11 @@ static int create_task_21(unsigned k, unsigned j)
 static struct starpu_codelet cl22 =
 {
 	.modes = { STARPU_R, STARPU_R, STARPU_RW },
-	.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
+	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {chol_cpu_codelet_update_u22, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {chol_cublas_codelet_update_u22, NULL},
 #endif
-#ifdef STARPU_USE_GORDON
-#ifdef SPU_FUNC_SGEMM
-	.gordon_func = SPU_FUNC_SGEMM,
-#else
-#warning SPU_FUNC_SGEMM is not available
-#endif
-#endif
 	.nbuffers = 3,
 	.model = &chol_model_22
 };
@@ -195,8 +174,8 @@ static int cholesky_no_stride(void)
 {
 	int ret;
 
-	struct timeval start;
-	struct timeval end;
+	double start;
+	double end;
 
 	struct starpu_task *entry_task = NULL;
 
@@ -234,7 +213,7 @@ static int cholesky_no_stride(void)
 	}
 
 	/* schedule the codelet */
-	gettimeofday(&start, NULL);
+	start = starpu_timing_now();
 	ret = starpu_task_submit(entry_task);
 	if (ret == -ENODEV) return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
@@ -242,9 +221,9 @@ static int cholesky_no_stride(void)
 	/* stall the application until the end of computations */
 	starpu_tag_wait(TAG11(nblocks-1));
 
-	gettimeofday(&end, NULL);
+	end = starpu_timing_now();
 
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	double timing = end - start;
 	FPRINTF(stderr, "Computation took (in ms)\n");
 	FPRINTF(stdout, "%2.2f\n", timing/1000);
 
@@ -257,7 +236,6 @@ static int cholesky_no_stride(void)
 int main(int argc, char **argv)
 {
 	unsigned x, y;
-	unsigned i, j;
 	int ret;
 
 	parse_args(argc, argv);
@@ -275,6 +253,7 @@ int main(int argc, char **argv)
 
 	starpu_helper_cublas_init();
 
+#ifndef STARPU_SIMGRID
 	for (y = 0; y < nblocks; y++)
 	for (x = 0; x < nblocks; x++)
 	{
@@ -297,6 +276,7 @@ int main(int argc, char **argv)
 	for (x = 0; x < nblocks; x++)
 	if (x <= y)
 	{
+		unsigned i, j;
 		for (i = 0; i < BLOCKSIZE; i++)
 		for (j = 0; j < BLOCKSIZE; j++)
 		{
@@ -308,6 +288,7 @@ int main(int argc, char **argv)
 				A[y][x][i*BLOCKSIZE + j] += (float)(2*size);
 		}
 	}
+#endif
 
 	for (y = 0; y < nblocks; y++)
 	for (x = 0; x < nblocks; x++)

+ 0 - 1
examples/cpp/incrementer_cpp.cpp

@@ -25,7 +25,6 @@ extern "C" void cuda_codelet(void *descr[], __attribute__ ((unused)) void *_args
 #endif
 
 #ifdef STARPU_USE_OPENCL
-#include <starpu_opencl.h>
 extern "C" void opencl_codelet(void *descr[], __attribute__ ((unused)) void *_args);
 struct starpu_opencl_program opencl_program;
 #endif

+ 0 - 1
examples/filters/custom_mf/conversion.cu

@@ -15,7 +15,6 @@
  */
 
 #include <starpu.h>
-#include <starpu_cuda.h>
 #include "custom_types.h"
 #include "custom_interface.h"
 

+ 0 - 1
examples/filters/custom_mf/conversion_opencl.c

@@ -15,7 +15,6 @@
  */
 
 #include <starpu.h>
-#include <starpu_opencl.h>
 #include "custom_types.h"
 #include "custom_interface.h"
 

+ 0 - 1
examples/filters/custom_mf/cuda.cu

@@ -15,7 +15,6 @@
  */
 
 #include <starpu.h>
-#include <starpu_cuda.h>
 #include "custom_types.h"
 #include "custom_interface.h"
 

+ 57 - 171
examples/filters/custom_mf/custom_interface.c

@@ -14,20 +14,10 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 #include <starpu.h>
-#include <starpu_hash.h>
-#ifdef STARPU_USE_OPENCL
-#include <starpu_opencl.h>
-#endif
 #include "custom_interface.h"
 #include "custom_types.h"
 
-static int copy_ram_to_ram(void *src_interface, unsigned src_node,
-			   void *dst_interface, unsigned dst_node);
 #ifdef STARPU_USE_CUDA
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node,
-			    void *dst_interface, unsigned dst_node);
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node,
-			    void *dst_interface, unsigned dst_node);
 static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node,
 				  void *dst_interface, unsigned dst_node,
 				  cudaStream_t stream);
@@ -50,19 +40,18 @@ static int copy_opencl_to_opencl(void *src_interface, unsigned src_node,
 				 void *dst_interface, unsigned dst_node);
 static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node,
 				    void *dst_interface, unsigned dst_node,
-				    void *event);
+				    cl_event *event);
 static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node,
 				    void *dst_interface, unsigned dst_node,
-				    void *event);
+				    cl_event *event);
 #endif /* !STARPU_USE_OPENCL */
 
 static struct starpu_data_copy_methods custom_copy_data_methods_s =
 {
-	.ram_to_ram = copy_ram_to_ram,
-	.ram_to_spu = NULL,
+	.ram_to_ram = NULL,
 #ifdef STARPU_USE_CUDA
-	.ram_to_cuda        = copy_ram_to_cuda,
-	.cuda_to_ram        = copy_cuda_to_ram,
+	.ram_to_cuda        = NULL,
+	.cuda_to_ram        = NULL,
 	.ram_to_cuda_async  = copy_ram_to_cuda_async,
 	.cuda_to_ram_async  = copy_cuda_to_ram_async,
 	.cuda_to_cuda       = copy_cuda_to_cuda,
@@ -75,23 +64,18 @@ static struct starpu_data_copy_methods custom_copy_data_methods_s =
         .ram_to_opencl_async = copy_ram_to_opencl_async,
 	.opencl_to_ram_async = copy_opencl_to_ram_async,
 #endif
-	.cuda_to_spu = NULL,
-	.spu_to_ram  = NULL,
-	.spu_to_cuda = NULL,
-	.spu_to_spu  = NULL
 };
 
 static void     register_custom_handle(starpu_data_handle_t handle,
-				       uint32_t home_node,
+				       unsigned home_node,
 				       void *data_interface);
 static ssize_t  allocate_custom_buffer_on_node(void *data_interface_,
-					       uint32_t dst_node);
+					       unsigned dst_node);
 static void*    custom_handle_to_pointer(starpu_data_handle_t data_handle,
-					 uint32_t node);
-static void     free_custom_buffer_on_node(void *data_interface, uint32_t node);
+					 unsigned node);
+static void     free_custom_buffer_on_node(void *data_interface, unsigned node);
 static size_t   custom_interface_get_size(starpu_data_handle_t handle);
 static uint32_t footprint_custom_interface_crc32(starpu_data_handle_t handle);
-static int      custom_compare(void *data_interface_a, void *data_interface_b);
 static void     display_custom_interface(starpu_data_handle_t handle, FILE *f);
 static uint32_t custom_get_nx(starpu_data_handle_t handle);
 
@@ -113,10 +97,7 @@ static struct starpu_data_interface_ops interface_custom_ops =
 	.copy_methods          = &custom_copy_data_methods_s,
 	.get_size              = custom_interface_get_size,
 	.footprint             = footprint_custom_interface_crc32,
-	.compare               = custom_compare,
-#ifdef STARPU_USE_GORDON
-	.convert_to_gordon     = NULL,
-#endif
+	.compare               = NULL,
 	.interfaceid           = -1,
 	.interface_size        = sizeof(struct custom_data_interface),
 	.display               = display_custom_interface,
@@ -125,7 +106,7 @@ static struct starpu_data_interface_ops interface_custom_ops =
 };
 
 static void
-register_custom_handle(starpu_data_handle_t handle, uint32_t home_node, void *data_interface)
+register_custom_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface)
 {
 	struct custom_data_interface *custom_interface;
 	custom_interface = (struct custom_data_interface *) data_interface;
@@ -162,135 +143,65 @@ register_custom_handle(starpu_data_handle_t handle, uint32_t home_node, void *da
 	}
 }
 
-static ssize_t allocate_custom_buffer_on_node(void *data_interface, uint32_t node)
+static ssize_t allocate_custom_buffer_on_node(void *data_interface, unsigned node)
 {
 	ssize_t size = 0;
 	struct custom_data_interface *custom_interface;
 	custom_interface = (struct custom_data_interface *) data_interface;
 
-	switch(starpu_node_get_kind(node))
-	{
-	case STARPU_CPU_RAM:
-		size = custom_interface->nx * custom_interface->ops->cpu_elemsize;
-		custom_interface->cpu_ptr = (void*) malloc(size);
-		if (!custom_interface->cpu_ptr)
-			return -ENOMEM;
+	size = custom_interface->nx * custom_interface->ops->cpu_elemsize;
+	custom_interface->cpu_ptr = (void*) starpu_allocate_buffer_on_node(node, size);
+	if (!custom_interface->cpu_ptr)
+		goto fail_cpu;
 #ifdef STARPU_USE_CUDA
-		custom_interface->cuda_ptr = (void *) malloc(size);
-		if (!custom_interface->cuda_ptr)
-		{
-			free(custom_interface->cpu_ptr);
-			custom_interface->cpu_ptr = NULL;
-			return -ENOMEM;
-		}
-#endif /* !STARPU_USE_CUDA */
+	custom_interface->cuda_ptr = (void*) starpu_allocate_buffer_on_node(node, size);
+	if (!custom_interface->cuda_ptr)
+		goto fail_cuda;
+#endif
 #ifdef STARPU_USE_OPENCL
-		custom_interface->opencl_ptr = malloc(size);
-		if (custom_interface->cuda_ptr == NULL)
-		{
-			free(custom_interface->cpu_ptr);
-#ifdef STARPU_USE_CUDA
-			free(custom_interface->cuda_ptr);
-#endif /* !STARPU_USE_CUDA */
-			return -ENOMEM;
-		}
-#endif /* !STARPU_USE_OPENCL */
-			
-		break;
+	custom_interface->opencl_ptr = (void*) starpu_allocate_buffer_on_node(node, size);
+	if (!custom_interface->opencl_ptr)
+		goto fail_opencl;
+#endif
+
+	return size
 #ifdef STARPU_USE_CUDA
-	case STARPU_CUDA_RAM:
-	{
-		cudaError_t err;
-		size = custom_interface->nx * custom_interface->ops->cpu_elemsize;
-		err = cudaMalloc(&custom_interface->cuda_ptr, size);
-		if (err != cudaSuccess)
-			return -ENOMEM;
-
-		err = cudaMalloc(&custom_interface->cpu_ptr, size);
-		if (err != cudaSuccess)
-		{
-			cudaFree(custom_interface->cuda_ptr);
-			return -ENOMEM;
-		}
-		break;
-	}
+		+size
 #endif
 #ifdef STARPU_USE_OPENCL
-	case STARPU_OPENCL_RAM:
-	{
-		cl_int err;
-		cl_mem memory;
-		ssize_t size = custom_interface->nx * custom_interface->ops->cpu_elemsize;
-		err = starpu_opencl_allocate_memory(&memory, size, CL_MEM_READ_WRITE);
-		if (err != CL_SUCCESS)
-			STARPU_OPENCL_REPORT_ERROR(err);
-
-		custom_interface->opencl_ptr = memory;
-
-		break;
-	}
-#endif /* !STARPU_USE_OPENCL */
-	default:
-		assert(0);
-	}
-
-	/* XXX We may want to return cpu_size + cuda_size + ... */
-	return size;
+		+size
+#endif
+		;
+#ifdef STARPU_USE_OPENCL
+fail_opencl:
+#ifdef STARPU_USE_CUDA
+	starpu_free_buffer_on_node(node, (uintptr_t) custom_interface->cuda_ptr, size);
+#endif
+#endif
+#ifdef STARPU_USE_CUDA
+fail_cuda:
+#endif
+	starpu_free_buffer_on_node(node, (uintptr_t) custom_interface->cpu_ptr, size);
+fail_cpu:
+	return -ENOMEM;
 }
 
-static void free_custom_buffer_on_node(void *data_interface, uint32_t node)
+static void free_custom_buffer_on_node(void *data_interface, unsigned node)
 {
-	struct custom_data_interface *custom_interface;
-	custom_interface = (struct custom_data_interface *) data_interface;
+	struct custom_data_interface *custom_interface = (struct custom_data_interface *) data_interface;
+	size_t size = custom_interface->nx * custom_interface->ops->cpu_elemsize;
 
-	switch(starpu_node_get_kind(node))
-	{
-	case STARPU_CPU_RAM:
-		if (custom_interface->cpu_ptr != NULL)
-		{
-			free(custom_interface->cpu_ptr);
-			custom_interface->cpu_ptr = NULL;
-		}
+	starpu_free_buffer_on_node(node, (uintptr_t) custom_interface->cpu_ptr, size);
 #ifdef STARPU_USE_CUDA
-		if (custom_interface->cuda_ptr != NULL)
-		{
-			free(custom_interface->cuda_ptr);
-			custom_interface->cuda_ptr = NULL;
-		}
-#endif /* !STARPU_USE_CUDA */
+	starpu_free_buffer_on_node(node, (uintptr_t) custom_interface->cuda_ptr, size);
+#endif
 #ifdef STARPU_USE_OPENCL
-		if (custom_interface->opencl_ptr != NULL)
-		{
-			free(custom_interface->opencl_ptr);
-			custom_interface->opencl_ptr = NULL;
-		}
-#endif /* !STARPU_USE_OPENCL */
-		break;
-#ifdef STARPU_USE_CUDA
-	case STARPU_CUDA_RAM:
-		if (custom_interface->cpu_ptr != NULL)
-		{
-			cudaError_t err;
-			err = cudaFree(custom_interface->cpu_ptr);
-			if (err != cudaSuccess)
-				fprintf(stderr, "cudaFree failed...\n");
-		}
-		if (custom_interface->cuda_ptr != NULL)
-		{
-			cudaError_t err;
-			err = cudaFree(custom_interface->cuda_ptr);
-			if (err != cudaSuccess)
-				fprintf(stderr, "cudaFree failed...\n");
-		}
-		break;
-#endif /* !STARPU_USE_CUDA */
-	default:
-		assert(0);
-	}
+	starpu_free_buffer_on_node(node, (uintptr_t) custom_interface->opencl_ptr, size);
+#endif
 }
 
 static void*
-custom_handle_to_pointer(starpu_data_handle_t handle, uint32_t node)
+custom_handle_to_pointer(starpu_data_handle_t handle, unsigned node)
 {
 	struct custom_data_interface *data_interface =
 		(struct custom_data_interface *) starpu_data_get_interface_on_node(handle, node);
@@ -329,16 +240,11 @@ static uint32_t footprint_custom_interface_crc32(starpu_data_handle_t handle)
 	return starpu_crc32_be(custom_get_nx(handle), 0);
 }
 
-static int custom_compare(void *data_interface_a, void *data_interface_b)
-{
-	/* TODO */
-	assert(0);
-}
-
 static void display_custom_interface(starpu_data_handle_t handle, FILE *f)
 {
-	/* TODO */
-	assert(0);
+	struct custom_data_interface *ci = (struct custom_data_interface *)
+		starpu_data_get_interface_on_node(handle, 0);
+	fprintf(f, "Custom interface of size %d", ci->nx);
 }
 
 static uint32_t
@@ -352,12 +258,11 @@ custom_get_nx(starpu_data_handle_t handle)
 
 
 void custom_data_register(starpu_data_handle_t *handle,
-				 uint32_t home_node,
+				 unsigned home_node,
 				 void *ptr,
 				 uint32_t nx,
 				 struct starpu_multiformat_data_interface_ops *format_ops)
 {
-	/* XXX Deprecated fields ? */
 	struct custom_data_interface custom =
 	{
 		.cpu_ptr = ptr,
@@ -377,26 +282,7 @@ void custom_data_register(starpu_data_handle_t *handle,
 	starpu_data_register(handle, home_node, &custom, &interface_custom_ops);
 }
 
-static int copy_ram_to_ram(void *src_interface, unsigned src_node,
-			   void *dst_interface, unsigned dst_node)
-{
-	/* TODO */
-	assert(0);
-}
 #ifdef STARPU_USE_CUDA
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node,
-			    void *dst_interface, unsigned dst_node)
-{
-	/* TODO */
-	assert(0);
-}
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node,
-			    void *dst_interface, unsigned dst_node)
-{
-	/* TODO */
-	assert(0);
-}
-
 static int
 copy_cuda_common_async(void *src_interface, unsigned src_node,
 		       void *dst_interface, unsigned dst_node,
@@ -513,7 +399,7 @@ static int copy_opencl_to_opencl(void *src_interface, unsigned src_node,
 
 static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node,
 				    void *dst_interface, unsigned dst_node,
-				    void *event)
+				    cl_event *event)
 {
 	ssize_t size;
 	struct custom_data_interface *src_custom, *dst_custom;
@@ -556,7 +442,7 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node,
 
 static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node,
 				    void *dst_interface, unsigned dst_node,
-				    void *event)
+				    cl_event *event)
 {
 	ssize_t size;
 	struct custom_data_interface *src_custom, *dst_custom;

+ 1 - 1
examples/filters/custom_mf/custom_interface.h

@@ -26,7 +26,7 @@ struct custom_data_interface
 };
 
 void custom_data_register(starpu_data_handle_t *handle,
-				 uint32_t home_node,
+				 unsigned home_node,
 				 void *ptr,
 				 uint32_t nx,
 				 struct starpu_multiformat_data_interface_ops* ops);

+ 0 - 3
examples/filters/custom_mf/custom_mf_filter.c

@@ -16,9 +16,6 @@
 #include <starpu.h>
 #include "custom_interface.h"
 #include "custom_types.h"
-#ifdef STARPU_USE_OPENCL
-#include <starpu_opencl.h>
-#endif /* !STARPU_USE_OPENCL */
 
 #define N 12
 

+ 0 - 1
examples/filters/custom_mf/custom_opencl.c

@@ -15,7 +15,6 @@
  */
 
 #include <starpu.h>
-#include <starpu_opencl.h>
 #include "custom_types.h"
 #include "custom_interface.h"
 

+ 1 - 2
examples/filters/fblock.c

@@ -16,7 +16,6 @@
  */
 
 #include <starpu.h>
-#include <starpu_opencl.h>
 
 #define NX    5
 #define NY    4
@@ -139,7 +138,7 @@ int main(int argc, char **argv)
         /* Submit a task on each sub-block */
         for(i=0 ; i<starpu_data_get_nb_children(handle) ; i++)
         {
-                int ret,multiplier=i;
+                int multiplier=i;
                 struct starpu_task *task = starpu_task_create();
 
                 FPRINTF(stderr,"Dealing with sub-block %d\n", i);

+ 1 - 2
examples/filters/fblock_cuda.cu

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -15,7 +15,6 @@
  */
 
 #include <starpu.h>
-#include <starpu_cuda.h>
 
 static __global__ void fblock_cuda(int *block, int nx, int ny, int nz, unsigned ldy, unsigned ldz, float factor)
 {

+ 1 - 2
examples/filters/fblock_opencl.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -16,7 +16,6 @@
  */
 
 #include <starpu.h>
-#include <starpu_opencl.h>
 
 #define CHECK_CL_SET_KERNEL_ARG(kernel, n, size, ptr)       \
 do                                                          \

+ 189 - 0
examples/filters/shadow.c

@@ -0,0 +1,189 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This exemplifies the use of the shadow filter: a source "vector" of NX
+ * elements (plus 2*SHADOW wrap-around elements) is partitioned into vectors
+ * with some shadowing, and these are copied into a destination "vector2" of
+ * NPARTS*(NX/NPARTS+2*SHADOW) elements, partitioned in the traditional way,
+ * thus showing how shadowing shows up.
+ *
+ * For instance, with NX=8, SHADOW=1, and NPARTS=4:
+ *
+ * vector
+ * x0 x1 x2 x3 x4 x5 x6 x7 x8 x9
+ *
+ * is partitioned into 4 pieces:
+ *
+ * x0 x1 x2 x3
+ *       x2 x3 x4 x5
+ *             x4 x5 x6 x7
+ *                   x6 x7 x8 x9
+ *
+ * which are copied into the 4 destination subparts of vector2, thus getting in
+ * the end:
+ *
+ * x0 x1 x2 x3 x2 x3 x4 x5 x4 x5 x6 x7 x6 x7 x8 x9
+ */
+
+#include <starpu.h>
+
+/* Shadow width */
+#define SHADOW 2
+#define NX    30
+#define PARTS 3
+
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
+void cpu_func(void *buffers[], void *cl_arg)
+{
+        unsigned i;
+
+        /* length of the shadowed source vector */
+        unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
+        /* local copy of the shadowed source vector pointer */
+        int *val = (int *)STARPU_VECTOR_GET_PTR(buffers[0]);
+
+        /* length of the destination vector */
+        unsigned n2 = STARPU_VECTOR_GET_NX(buffers[1]);
+        /* local copy of the destination vector pointer */
+        int *val2 = (int *)STARPU_VECTOR_GET_PTR(buffers[1]);
+
+	/* If things go right, sizes should match */
+	STARPU_ASSERT(n == n2);
+	for (i = 0; i < n; i++)
+		val2[i] = val[i];
+}
+
+#ifdef STARPU_USE_CUDA
+void cuda_func(void *buffers[], void *cl_arg)
+{
+        /* length of the shadowed source vector */
+        unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
+        /* local copy of the shadowed source vector pointer */
+        int *val = (int *)STARPU_VECTOR_GET_PTR(buffers[0]);
+
+        /* length of the destination vector */
+        unsigned n2 = STARPU_VECTOR_GET_NX(buffers[1]);
+        /* local copy of the destination vector pointer */
+        int *val2 = (int *)STARPU_VECTOR_GET_PTR(buffers[1]);
+
+	/* If things go right, sizes should match */
+	STARPU_ASSERT(n == n2);
+	cudaMemcpyAsync(val2, val, n*sizeof(*val), cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}
+#endif
+
+int main(int argc, char **argv)
+{
+	unsigned i, j;
+        int vector[NX + 2*SHADOW];
+        int vector2[NX + PARTS*2*SHADOW];
+	starpu_data_handle_t handle, handle2;
+	int ret;
+
+        struct starpu_codelet cl =
+	{
+                .where = STARPU_CPU
+#ifdef STARPU_USE_CUDA
+			|STARPU_CUDA
+#endif
+			,
+                .cpu_funcs = {cpu_func, NULL},
+#ifdef STARPU_USE_CUDA
+                .cuda_funcs = {cuda_func, NULL},
+#endif
+                .nbuffers = 2,
+		.modes = {STARPU_R, STARPU_W}
+        };
+
+        for(i=0 ; i<NX ; i++) vector[SHADOW+i] = i;
+	for(i=0 ; i<SHADOW ; i++) vector[i] = vector[i+NX];
+	for(i=0 ; i<SHADOW ; i++) vector[SHADOW+NX+i] = vector[SHADOW+i];
+        FPRINTF(stderr,"IN  Vector: ");
+        for(i=0 ; i<NX + 2*SHADOW ; i++) FPRINTF(stderr, "%5d ", vector[i]);
+        FPRINTF(stderr,"\n");
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		exit(77);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	/* Declare source vector to StarPU */
+	starpu_vector_data_register(&handle, 0, (uintptr_t)vector, NX + 2*SHADOW, sizeof(vector[0]));
+
+	/* Declare destination vector to StarPU */
+	starpu_vector_data_register(&handle2, 0, (uintptr_t)vector2, NX + PARTS*2*SHADOW, sizeof(vector[0]));
+
+        /* Partition the source vector in PARTS sub-vectors with shadows */
+	/* NOTE: the resulting handles should only be used in read-only mode,
+	 * as StarPU will not know how the overlapping parts would have to be
+	 * combined. */
+	struct starpu_data_filter f =
+	{
+		.filter_func = starpu_block_shadow_filter_func_vector,
+		.nchildren = PARTS,
+		.filter_arg_ptr = (void*)(uintptr_t) SHADOW /* Shadow width */
+	};
+	starpu_data_partition(handle, &f);
+
+        /* Partition the destination vector in PARTS sub-vectors */
+	struct starpu_data_filter f2 =
+	{
+		.filter_func = starpu_block_filter_func_vector,
+		.nchildren = PARTS,
+	};
+	starpu_data_partition(handle2, &f2);
+
+        /* Submit a task on each sub-vector */
+	for (i=0; i<starpu_data_get_nb_children(handle); i++)
+	{
+                starpu_data_handle_t sub_handle = starpu_data_get_sub_data(handle, 1, i);
+                starpu_data_handle_t sub_handle2 = starpu_data_get_sub_data(handle2, 1, i);
+                struct starpu_task *task = starpu_task_create();
+
+		task->handles[0] = sub_handle;
+		task->handles[1] = sub_handle2;
+                task->cl = &cl;
+                task->synchronous = 1;
+
+		ret = starpu_task_submit(task);
+		if (ret == -ENODEV) goto enodev;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	starpu_data_unpartition(handle, 0);
+	starpu_data_unpartition(handle2, 0);
+        starpu_data_unregister(handle);
+        starpu_data_unregister(handle2);
+	starpu_shutdown();
+
+        FPRINTF(stderr,"OUT Vector: ");
+        for(i=0 ; i<NX + PARTS*2*SHADOW ; i++) FPRINTF(stderr, "%5d ", vector2[i]);
+        FPRINTF(stderr,"\n");
+	for(i=0 ; i<PARTS ; i++)
+		for (j=0 ; j<NX/PARTS ; j++)
+			STARPU_ASSERT(vector2[i*(NX/PARTS+2*SHADOW)+j] == vector[i*(NX/PARTS)+j]);
+
+	return 0;
+
+enodev:
+	FPRINTF(stderr, "WARNING: No one can execute this task\n");
+	starpu_shutdown();
+	return 77;
+}

+ 291 - 0
examples/filters/shadow2d.c

@@ -0,0 +1,291 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This exemplifies the use of the matrix shadow filters: a source "matrix" of
+ * NX*NY elements (plus 2*NX*SHADOWX+2*NY*SHADOWY+4*SHADOWX*SHADOWY wrap-around
+ * elements) is partitioned into matrices with some shadowing, and these are
+ * copied into a destination "matrix2" of
+ * NPARTSX*NPARTSY*((NX/NPARTSX+2*SHADOWX)*(NY/NPARTSY+2*SHADOWY)) elements,
+ * partitioned in the traditional way, thus showing how shadowing shows up.
+ *
+ * For instance, with NX=NY=8, SHADOWX=SHADOWY=1, and NPARTSX=NPARTSY=4:
+ *
+ * matrix
+ * 0123456789
+ * 1234567890
+ * 2345678901
+ * 3456789012
+ * 4567890123
+ * 5678901234
+ * 6789012345
+ * 7890123456
+ * 8901234567
+ * 9012345678
+ *
+ * is partitioned into 4*4 pieces:
+ *
+ * 0123 2345 4567 6789
+ * 1234 3456 5678 7890
+ * 2345 4567 6789 8901
+ * 3456 5678 7890 9012
+ *
+ * 2345 4567 6789 8901
+ * 3456 5678 7890 9012
+ * 4567 6789 8901 0123
+ * 5678 7890 9012 1234
+ *
+ * 4567 6789 8901 0123
+ * 5678 7890 9012 1234
+ * 6789 8901 0123 2345
+ * 7890 9012 1234 3456
+ *
+ * 6789 8901 0123 2345
+ * 7890 9012 1234 3456
+ * 8901 0123 2345 4567
+ * 9012 1234 3456 5678
+ *
+ * which are copied into the 4*4 destination subparts of matrix2, thus getting in
+ * the end:
+ *
+ * 0123234545676789
+ * 1234345656787890
+ * 2345456767898901
+ * 3456567878909012
+ * 2345456767898901
+ * 3456567878909012
+ * 4567678989010123
+ * 5678789090121234
+ * 4567678989010123
+ * 5678789090121234
+ * 6789890101232345
+ * 7890901212343456
+ * 6789890101232345
+ * 7890901212343456
+ * 8901012323454567
+ * 9012123434565678
+ */
+
+#include <starpu.h>
+
+/* Shadow width */
+#define SHADOWX 3
+#define SHADOWY 2
+#define NX    20
+#define NY    30
+#define PARTSX 2
+#define PARTSY 3
+
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
+void cpu_func(void *buffers[], void *cl_arg)
+{
+        /* length of the shadowed source matrix */
+        unsigned ld = STARPU_MATRIX_GET_LD(buffers[0]);
+        unsigned n = STARPU_MATRIX_GET_NX(buffers[0]);
+        unsigned m = STARPU_MATRIX_GET_NY(buffers[0]);
+        /* local copy of the shadowed source matrix pointer */
+        int *val = (int *)STARPU_MATRIX_GET_PTR(buffers[0]);
+
+        /* length of the destination matrix */
+        unsigned ld2 = STARPU_MATRIX_GET_LD(buffers[1]);
+        unsigned n2 = STARPU_MATRIX_GET_NX(buffers[1]);
+        unsigned m2 = STARPU_MATRIX_GET_NY(buffers[1]);
+        /* local copy of the destination matrix pointer */
+        int *val2 = (int *)STARPU_MATRIX_GET_PTR(buffers[1]);
+
+	unsigned i, j;
+
+	/* If things go right, sizes should match */
+	STARPU_ASSERT(n == n2);
+	STARPU_ASSERT(m == m2);
+	for (j = 0; j < m; j++)
+		for (i = 0; i < n; i++)
+			val2[j*ld2+i] = val[j*ld+i];
+}
+
+#ifdef STARPU_USE_CUDA
+void cuda_func(void *buffers[], void *cl_arg)
+{
+        /* length of the shadowed source matrix */
+        unsigned ld = STARPU_MATRIX_GET_LD(buffers[0]);
+        unsigned n = STARPU_MATRIX_GET_NX(buffers[0]);
+        unsigned m = STARPU_MATRIX_GET_NY(buffers[0]);
+        /* local copy of the shadowed source matrix pointer */
+        int *val = (int *)STARPU_MATRIX_GET_PTR(buffers[0]);
+
+        /* length of the destination matrix */
+        unsigned ld2 = STARPU_MATRIX_GET_LD(buffers[1]);
+        unsigned n2 = STARPU_MATRIX_GET_NX(buffers[1]);
+        unsigned m2 = STARPU_MATRIX_GET_NY(buffers[1]);
+        /* local copy of the destination matrix pointer */
+        int *val2 = (int *)STARPU_MATRIX_GET_PTR(buffers[1]);
+
+	/* If things go right, sizes should match */
+	STARPU_ASSERT(n == n2);
+	STARPU_ASSERT(m == m2);
+	cudaMemcpy2DAsync(val2, ld2*sizeof(*val2), val, ld*sizeof(*val), n*sizeof(*val), m, cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}
+#endif
+
+int main(int argc, char **argv)
+{
+	unsigned i, j, k, l;
+        int matrix[NY + 2*SHADOWY][NX + 2*SHADOWX];
+        int matrix2[NY + PARTSY*2*SHADOWY][NX + PARTSX*2*SHADOWX];
+	starpu_data_handle_t handle, handle2;
+	int ret;
+
+        struct starpu_codelet cl =
+	{
+                .where = STARPU_CPU
+#ifdef STARPU_USE_CUDA
+			|STARPU_CUDA
+#endif
+			,
+                .cpu_funcs = {cpu_func, NULL},
+#ifdef STARPU_USE_CUDA
+                .cuda_funcs = {cuda_func, NULL},
+#endif
+                .nbuffers = 2,
+		.modes = {STARPU_R, STARPU_W}
+        };
+
+	memset(matrix, -1, sizeof(matrix));
+	for(j=1 ; j<=NY ; j++)
+		for(i=1 ; i<=NX ; i++)
+			matrix[SHADOWY+j-1][SHADOWX+i-1] = i+j;
+
+	/* Copy borders */
+	for (j = SHADOWY ; j<SHADOWY+NY ; j++)
+		for(i=0 ; i<SHADOWX ; i++) {
+			matrix[j][i] = matrix[j][i+NX];
+			matrix[j][SHADOWX+NX+i] = matrix[j][SHADOWX+i];
+		}
+	for(j=0 ; j<SHADOWY ; j++)
+		for(i=SHADOWX ; i<SHADOWX+NX ; i++) {
+			matrix[j][i] = matrix[j+NY][i];
+			matrix[SHADOWY+NY+j][i] = matrix[SHADOWY+j][i];
+		}
+	/* Copy corners */
+	for(j=0 ; j<SHADOWY ; j++)
+		for(i=0 ; i<SHADOWX ; i++) {
+			matrix[j][i] = matrix[j+NY][i+NX];
+			matrix[j][SHADOWX+NX+i] = matrix[j+NY][SHADOWX+i];
+			matrix[SHADOWY+NY+j][i] = matrix[SHADOWY+j][i+NX];
+			matrix[SHADOWY+NY+j][SHADOWX+NX+i] = matrix[SHADOWY+j][SHADOWX+i];
+		}
+
+        FPRINTF(stderr,"IN  Matrix:\n");
+	for(j=0 ; j<NY + 2*SHADOWY ; j++)
+	{
+		for(i=0 ; i<NX + 2*SHADOWX ; i++)
+			FPRINTF(stderr, "%5d ", matrix[j][i]);
+		FPRINTF(stderr,"\n");
+	}
+        FPRINTF(stderr,"\n");
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		exit(77);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	/* Declare source matrix to StarPU */
+	starpu_matrix_data_register(&handle, 0, (uintptr_t)matrix, NX + 2*SHADOWX, NX + 2*SHADOWX, NY + 2*SHADOWY, sizeof(matrix[0][0]));
+
+	/* Declare destination matrix to StarPU */
+	starpu_matrix_data_register(&handle2, 0, (uintptr_t)matrix2, NX + PARTSX*2*SHADOWX, NX + PARTSX*2*SHADOWX, NY + PARTSY*2*SHADOWY, sizeof(matrix2[0][0]));
+
+        /* Partition the source matrix in PARTSY*PARTSX sub-matrices with shadows */
+	/* NOTE: the resulting handles should only be used in read-only mode,
+	 * as StarPU will not know how the overlapping parts would have to be
+	 * combined. */
+	struct starpu_data_filter fy =
+	{
+		.filter_func = starpu_vertical_block_shadow_filter_func,
+		.nchildren = PARTSY,
+		.filter_arg_ptr = (void*)(uintptr_t) SHADOWY /* Shadow width */
+	};
+	struct starpu_data_filter fx =
+	{
+		.filter_func = starpu_block_shadow_filter_func,
+		.nchildren = PARTSX,
+		.filter_arg_ptr = (void*)(uintptr_t) SHADOWX /* Shadow width */
+	};
+	starpu_data_map_filters(handle, 2, &fy, &fx);
+
+        /* Partition the destination matrix in PARTSY*PARTSX sub-matrices */
+	struct starpu_data_filter fy2 =
+	{
+		.filter_func = starpu_vertical_block_filter_func,
+		.nchildren = PARTSY,
+	};
+	struct starpu_data_filter fx2 =
+	{
+		.filter_func = starpu_block_filter_func,
+		.nchildren = PARTSX,
+	};
+	starpu_data_map_filters(handle2, 2, &fy2, &fx2);
+
+        /* Submit a task on each sub-matrix */
+	for (j=0; j<PARTSY; j++)
+	{
+		for (i=0; i<PARTSX; i++)
+		{
+			starpu_data_handle_t sub_handle = starpu_data_get_sub_data(handle, 2, j, i);
+			starpu_data_handle_t sub_handle2 = starpu_data_get_sub_data(handle2, 2, j, i);
+			struct starpu_task *task = starpu_task_create();
+
+			task->handles[0] = sub_handle;
+			task->handles[1] = sub_handle2;
+			task->cl = &cl;
+			task->synchronous = 1;
+
+			ret = starpu_task_submit(task);
+			if (ret == -ENODEV) goto enodev;
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+		}
+	}
+
+	starpu_data_unpartition(handle, 0);
+	starpu_data_unpartition(handle2, 0);
+        starpu_data_unregister(handle);
+        starpu_data_unregister(handle2);
+	starpu_shutdown();
+
+        FPRINTF(stderr,"OUT Matrix:\n");
+	for(j=0 ; j<NY + PARTSY*2*SHADOWY ; j++)
+	{
+		for(i=0 ; i<NX + PARTSX*2*SHADOWX ; i++)
+			FPRINTF(stderr, "%5d ", matrix2[j][i]);
+		FPRINTF(stderr,"\n");
+	}
+        FPRINTF(stderr,"\n");
+	for(j=0 ; j<PARTSY ; j++)
+		for(i=0 ; i<PARTSX ; i++)
+			for (l=0 ; l<NY/PARTSY + 2*SHADOWY ; l++)
+				for (k=0 ; k<NX/PARTSX + 2*SHADOWX ; k++)
+					STARPU_ASSERT(matrix2[j*(NY/PARTSY+2*SHADOWY)+l][i*(NX/PARTSX+2*SHADOWX)+k] == matrix[j*(NY/PARTSY)+l][i*(NX/PARTSX)+k]);
+
+	return 0;
+
+enodev:
+	FPRINTF(stderr, "WARNING: No one can execute this task\n");
+	starpu_shutdown();
+	return 77;
+}

+ 331 - 0
examples/filters/shadow3d.c

@@ -0,0 +1,331 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This exemplifies the use of the 3D matrix shadow filters: a source "matrix" of
+ * NX*NY*NZ elements (plus SHADOW wrap-around elements) is partitioned into
+ * matrices with some shadowing, and these are copied into a destination
+ * "matrix2" of
+ * PARTSX*PARTSY*PARTSZ*((NX/PARTSX+2*SHADOWX)*(NY/PARTSY+2*SHADOWY)*(NZ/PARTSZ+2*SHADOWZ))
+ * elements, partitioned in the traditional way, thus showing how shadowing
+ * shows up.
+ */
+
+#include <starpu.h>
+
+/* Shadow width */
+#define SHADOWX 2
+#define SHADOWY 3
+#define SHADOWZ 4
+#define NX    12
+#define NY    9
+#define NZ    6
+#define PARTSX 4
+#define PARTSY 3
+#define PARTSZ 2
+
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
+/* CPU implementation of the codelet: copy the shadowed source block
+ * (buffers[0], accessed STARPU_R) element by element into the destination
+ * block (buffers[1], accessed STARPU_W).  Both blocks must have identical
+ * NX/NY/NZ extents; only their leading dimensions (LDY/LDZ) may differ.
+ * cl_arg is unused. */
+void cpu_func(void *buffers[], void *cl_arg)
+{
+        /* leading dimensions (in elements) and extents of the shadowed source block */
+        unsigned ldy = STARPU_BLOCK_GET_LDY(buffers[0]);
+        unsigned ldz = STARPU_BLOCK_GET_LDZ(buffers[0]);
+        unsigned x = STARPU_BLOCK_GET_NX(buffers[0]);
+        unsigned y = STARPU_BLOCK_GET_NY(buffers[0]);
+        unsigned z = STARPU_BLOCK_GET_NZ(buffers[0]);
+        /* local copy of the shadowed source matrix pointer */
+        int *val = (int *)STARPU_BLOCK_GET_PTR(buffers[0]);
+
+        /* leading dimensions (in elements) and extents of the destination block */
+        unsigned ldy2 = STARPU_BLOCK_GET_LDY(buffers[1]);
+        unsigned ldz2 = STARPU_BLOCK_GET_LDZ(buffers[1]);
+        unsigned x2 = STARPU_BLOCK_GET_NX(buffers[1]);
+        unsigned y2 = STARPU_BLOCK_GET_NY(buffers[1]);
+        unsigned z2 = STARPU_BLOCK_GET_NZ(buffers[1]);
+        /* local copy of the destination matrix pointer */
+        int *val2 = (int *)STARPU_BLOCK_GET_PTR(buffers[1]);
+
+	unsigned i, j, k;
+
+	/* If things go right, sizes should match */
+	STARPU_ASSERT(x == x2);
+	STARPU_ASSERT(y == y2);
+	STARPU_ASSERT(z == z2);
+	/* plain triple-loop copy, indexing each block through its own
+	 * leading dimensions since source and destination layouts differ */
+	for (k = 0; k < z; k++)
+		for (j = 0; j < y; j++)
+			for (i = 0; i < x; i++)
+				val2[k*ldz2+j*ldy2+i] = val[k*ldz+j*ldy+i];
+}
+
+#ifdef STARPU_USE_CUDA
+/* CUDA implementation of the codelet: same copy as cpu_func, performed as
+ * one 2D device-to-device asynchronous copy per z plane on StarPU's
+ * per-worker stream, followed by a single stream synchronization before
+ * returning.  cl_arg is unused. */
+void cuda_func(void *buffers[], void *cl_arg)
+{
+        /* leading dimensions (in elements) and extents of the shadowed source block */
+        unsigned ldy = STARPU_BLOCK_GET_LDY(buffers[0]);
+        unsigned ldz = STARPU_BLOCK_GET_LDZ(buffers[0]);
+        unsigned x = STARPU_BLOCK_GET_NX(buffers[0]);
+        unsigned y = STARPU_BLOCK_GET_NY(buffers[0]);
+        unsigned z = STARPU_BLOCK_GET_NZ(buffers[0]);
+        /* local copy of the shadowed source matrix pointer */
+        int *val = (int *)STARPU_BLOCK_GET_PTR(buffers[0]);
+
+        /* leading dimensions (in elements) and extents of the destination block */
+        unsigned ldy2 = STARPU_BLOCK_GET_LDY(buffers[1]);
+        unsigned ldz2 = STARPU_BLOCK_GET_LDZ(buffers[1]);
+        unsigned x2 = STARPU_BLOCK_GET_NX(buffers[1]);
+        unsigned y2 = STARPU_BLOCK_GET_NY(buffers[1]);
+        unsigned z2 = STARPU_BLOCK_GET_NZ(buffers[1]);
+        /* local copy of the destination matrix pointer */
+        int *val2 = (int *)STARPU_BLOCK_GET_PTR(buffers[1]);
+
+	unsigned k;
+	cudaError_t cures;
+
+	/* If things go right, sizes should match */
+	STARPU_ASSERT(x == x2);
+	STARPU_ASSERT(y == y2);
+	STARPU_ASSERT(z == z2);
+	/* one pitched 2D copy per plane: the pitches are the leading
+	 * dimensions in bytes, the row width is x elements */
+	for (k = 0; k < z; k++) {
+		cures = cudaMemcpy2DAsync(val2+k*ldz2, ldy2*sizeof(*val2), val+k*ldz, ldy*sizeof(*val),
+				x*sizeof(*val), y, cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
+		STARPU_ASSERT(!cures);
+	}
+	/* wait for all the queued plane copies before the task is deemed done */
+	cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
+	STARPU_ASSERT(!cures);
+}
+#endif
+
+/* Test driver: build a source block whose shadow cells wrap around the
+ * NX*NY*NZ core (periodic boundary), partition it with the shadow filters,
+ * copy each shadowed sub-block into a plainly-partitioned destination
+ * block, then verify the destination element-wise against the source.
+ * Returns 0 on success, 77 (test skipped) when no worker can run the task. */
+int main(int argc, char **argv)
+{
+	unsigned i, j, k, l, m, n;
+        int matrix[NZ + 2*SHADOWZ][NY + 2*SHADOWY][NX + 2*SHADOWX];
+        int matrix2[NZ + PARTSZ*2*SHADOWZ][NY + PARTSY*2*SHADOWY][NX + PARTSX*2*SHADOWX];
+	starpu_data_handle_t handle, handle2;
+	int ret;
+
+        struct starpu_codelet cl =
+	{
+                .where = STARPU_CPU
+#ifdef STARPU_USE_CUDA
+			|STARPU_CUDA
+#endif
+			,
+                .cpu_funcs = {cpu_func, NULL},
+#ifdef STARPU_USE_CUDA
+                .cuda_funcs = {cuda_func, NULL},
+#endif
+                .nbuffers = 2,
+		.modes = {STARPU_R, STARPU_W}
+        };
+
+	/* Fill the core NX*NY*NZ region with i+j+k (1-based indices); all
+	 * shadow cells start at -1 and are overwritten by the wrap-around
+	 * copies below */
+	memset(matrix, -1, sizeof(matrix));
+	for(k=1 ; k<=NZ ; k++)
+		for(j=1 ; j<=NY ; j++)
+			for(i=1 ; i<=NX ; i++)
+				matrix[SHADOWZ+k-1][SHADOWY+j-1][SHADOWX+i-1] = i+j+k;
+
+	/* Copy planes: fill the 6 face shadow regions by wrapping the core
+	 * around along one axis at a time */
+	for (k = SHADOWZ ; k<SHADOWZ+NZ ; k++)
+		for (j = SHADOWY ; j<SHADOWY+NY ; j++)
+			for(i=0 ; i<SHADOWX ; i++) {
+				matrix[k][j][i] = matrix[k][j][i+NX];
+				matrix[k][j][SHADOWX+NX+i] = matrix[k][j][SHADOWX+i];
+			}
+	for(k=SHADOWZ ; k<SHADOWZ+NZ ; k++)
+		for(j=0 ; j<SHADOWY ; j++)
+			for(i=SHADOWX ; i<SHADOWX+NX ; i++) {
+				matrix[k][j][i] = matrix[k][j+NY][i];
+				matrix[k][SHADOWY+NY+j][i] = matrix[k][SHADOWY+j][i];
+			}
+	for(k=0 ; k<SHADOWZ ; k++)
+		for(j=SHADOWY ; j<SHADOWY+NY ; j++)
+			for(i=SHADOWX ; i<SHADOWX+NX ; i++) {
+				matrix[k][j][i] = matrix[k+NZ][j][i];
+				matrix[SHADOWZ+NZ+k][j][i] = matrix[SHADOWZ+k][j][i];
+			}
+
+	/* Copy borders: fill the 12 edge shadow regions (wrap along two axes) */
+	for (k = SHADOWZ ; k<SHADOWZ+NZ ; k++)
+		for(j=0 ; j<SHADOWY ; j++)
+			for(i=0 ; i<SHADOWX ; i++) {
+				matrix[k][j][i] = matrix[k][j+NY][i+NX];
+				matrix[k][SHADOWY+NY+j][i] = matrix[k][SHADOWY+j][i+NX];
+				matrix[k][SHADOWY+NY+j][SHADOWX+NX+i] = matrix[k][SHADOWY+j][SHADOWX+i];
+				matrix[k][j][SHADOWX+NX+i] = matrix[k][j+NY][SHADOWX+i];
+			}
+	for(k=0 ; k<SHADOWZ ; k++)
+		for (j = SHADOWY ; j<SHADOWY+NY ; j++)
+			for(i=0 ; i<SHADOWX ; i++) {
+				matrix[k][j][i] = matrix[k+NZ][j][i+NX];
+				matrix[SHADOWZ+NZ+k][j][i] = matrix[SHADOWZ+k][j][i+NX];
+				matrix[SHADOWZ+NZ+k][j][SHADOWX+NX+i] = matrix[SHADOWZ+k][j][SHADOWX+i];
+				matrix[k][j][SHADOWX+NX+i] = matrix[k+NZ][j][SHADOWX+i];
+			}
+	for(k=0 ; k<SHADOWZ ; k++)
+		for(j=0 ; j<SHADOWY ; j++)
+			for(i=SHADOWX ; i<SHADOWX+NX ; i++) {
+				matrix[k][j][i] = matrix[k+NZ][j+NY][i];
+				matrix[SHADOWZ+NZ+k][j][i] = matrix[SHADOWZ+k][j+NY][i];
+				matrix[SHADOWZ+NZ+k][SHADOWY+NY+j][i] = matrix[SHADOWZ+k][SHADOWY+j][i];
+				matrix[k][SHADOWY+NY+j][i] = matrix[k+NZ][SHADOWY+j][i];
+			}
+
+	/* Copy corners: fill the 8 corner shadow regions (wrap along all three axes) */
+	for(k=0 ; k<SHADOWZ ; k++)
+		for(j=0 ; j<SHADOWY ; j++)
+			for(i=0 ; i<SHADOWX ; i++) {
+				matrix[k][j][i] = matrix[k+NZ][j+NY][i+NX];
+				matrix[k][j][SHADOWX+NX+i] = matrix[k+NZ][j+NY][SHADOWX+i];
+				matrix[k][SHADOWY+NY+j][i] = matrix[k+NZ][SHADOWY+j][i+NX];
+				matrix[k][SHADOWY+NY+j][SHADOWX+NX+i] = matrix[k+NZ][SHADOWY+j][SHADOWX+i];
+				matrix[SHADOWZ+NZ+k][j][i] = matrix[SHADOWZ+k][j+NY][i+NX];
+				matrix[SHADOWZ+NZ+k][j][SHADOWX+NX+i] = matrix[SHADOWZ+k][j+NY][SHADOWX+i];
+				matrix[SHADOWZ+NZ+k][SHADOWY+NY+j][i] = matrix[SHADOWZ+k][SHADOWY+j][i+NX];
+				matrix[SHADOWZ+NZ+k][SHADOWY+NY+j][SHADOWX+NX+i] = matrix[SHADOWZ+k][SHADOWY+j][SHADOWX+i];
+			}
+
+        FPRINTF(stderr,"IN  Matrix:\n");
+	for(k=0 ; k<NZ + 2*SHADOWZ ; k++)
+	{
+		for(j=0 ; j<NY + 2*SHADOWY ; j++)
+		{
+			for(i=0 ; i<NX + 2*SHADOWX ; i++)
+				FPRINTF(stderr, "%5d ", matrix[k][j][i]);
+			FPRINTF(stderr,"\n");
+		}
+		FPRINTF(stderr,"\n\n");
+	}
+        FPRINTF(stderr,"\n");
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		exit(77);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	/* Declare source matrix to StarPU */
+	starpu_block_data_register(&handle, 0, (uintptr_t)matrix,
+			NX + 2*SHADOWX, (NX + 2*SHADOWX) * (NY + 2*SHADOWY),
+			NX + 2*SHADOWX, NY + 2*SHADOWY, NZ + 2*SHADOWZ,
+			sizeof(matrix[0][0][0]));
+
+	/* Declare destination matrix to StarPU */
+	starpu_block_data_register(&handle2, 0, (uintptr_t)matrix2,
+			NX + PARTSX*2*SHADOWX, (NX + PARTSX*2*SHADOWX) * (NY + PARTSY*2*SHADOWY),
+			NX + PARTSX*2*SHADOWX, NY + PARTSY*2*SHADOWY, NZ + PARTSZ*2*SHADOWZ,
+			sizeof(matrix2[0][0][0]));
+
+        /* Partition the source matrix in PARTSZ*PARTSY*PARTSX sub-matrices with shadows */
+	/* NOTE: the resulting handles should only be used in read-only mode,
+	 * as StarPU will not know how the overlapping parts would have to be
+	 * combined. */
+	struct starpu_data_filter fz =
+	{
+		.filter_func = starpu_depth_block_shadow_filter_func_block,
+		.nchildren = PARTSZ,
+		.filter_arg_ptr = (void*)(uintptr_t) SHADOWZ /* Shadow width */
+	};
+	struct starpu_data_filter fy =
+	{
+		.filter_func = starpu_vertical_block_shadow_filter_func_block,
+		.nchildren = PARTSY,
+		.filter_arg_ptr = (void*)(uintptr_t) SHADOWY /* Shadow width */
+	};
+	struct starpu_data_filter fx =
+	{
+		.filter_func = starpu_block_shadow_filter_func_block,
+		.nchildren = PARTSX,
+		.filter_arg_ptr = (void*)(uintptr_t) SHADOWX /* Shadow width */
+	};
+	starpu_data_map_filters(handle, 3, &fz, &fy, &fx);
+
+        /* Partition the destination matrix in PARTSZ*PARTSY*PARTSX sub-matrices */
+	struct starpu_data_filter fz2 =
+	{
+		.filter_func = starpu_depth_block_filter_func_block,
+		.nchildren = PARTSZ,
+	};
+	struct starpu_data_filter fy2 =
+	{
+		.filter_func = starpu_vertical_block_filter_func_block,
+		.nchildren = PARTSY,
+	};
+	struct starpu_data_filter fx2 =
+	{
+		.filter_func = starpu_block_filter_func_block,
+		.nchildren = PARTSX,
+	};
+	starpu_data_map_filters(handle2, 3, &fz2, &fy2, &fx2);
+
+        /* Submit a task on each sub-matrix */
+	/* Each task is synchronous: submission returns only once the copy is done */
+	for (k=0; k<PARTSZ; k++)
+	{
+		for (j=0; j<PARTSY; j++)
+		{
+			for (i=0; i<PARTSX; i++)
+			{
+				starpu_data_handle_t sub_handle = starpu_data_get_sub_data(handle, 3, k, j, i);
+				starpu_data_handle_t sub_handle2 = starpu_data_get_sub_data(handle2, 3, k, j, i);
+				struct starpu_task *task = starpu_task_create();
+
+				task->handles[0] = sub_handle;
+				task->handles[1] = sub_handle2;
+				task->cl = &cl;
+				task->synchronous = 1;
+
+				ret = starpu_task_submit(task);
+				if (ret == -ENODEV) goto enodev;
+				STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+			}
+		}
+	}
+
+	starpu_data_unpartition(handle, 0);
+	starpu_data_unpartition(handle2, 0);
+        starpu_data_unregister(handle);
+        starpu_data_unregister(handle2);
+	starpu_shutdown();
+
+        FPRINTF(stderr,"OUT Matrix:\n");
+	for(k=0 ; k<NZ + PARTSZ*2*SHADOWZ ; k++)
+	{
+		for(j=0 ; j<NY + PARTSY*2*SHADOWY ; j++)
+		{
+			for(i=0 ; i<NX + PARTSX*2*SHADOWX ; i++) {
+				FPRINTF(stderr, "%5d ", matrix2[k][j][i]);
+			}
+			FPRINTF(stderr,"\n");
+		}
+		FPRINTF(stderr,"\n\n");
+	}
+        FPRINTF(stderr,"\n");
+	/* Verify: each (k,j,i) destination sub-block must equal the
+	 * corresponding shadowed window of the source matrix */
+	for(k=0 ; k<PARTSZ ; k++)
+		for(j=0 ; j<PARTSY ; j++)
+			for(i=0 ; i<PARTSX ; i++)
+				for (n=0 ; n<NZ/PARTSZ + 2*SHADOWZ ; n++)
+					for (m=0 ; m<NY/PARTSY + 2*SHADOWY ; m++)
+						for (l=0 ; l<NX/PARTSX + 2*SHADOWX ; l++)
+							STARPU_ASSERT(matrix2[k*(NZ/PARTSZ+2*SHADOWZ)+n][j*(NY/PARTSY+2*SHADOWY)+m][i*(NX/PARTSX+2*SHADOWX)+l] ==
+									matrix[k*(NZ/PARTSZ)+n][j*(NY/PARTSY)+m][i*(NX/PARTSX)+l]);
+
+	return 0;
+
+enodev:
+	FPRINTF(stderr, "WARNING: No one can execute this task\n");
+	starpu_shutdown();
+	return 77;
+}

+ 3 - 2
examples/gl_interop/gl_interop.c

@@ -33,7 +33,8 @@ void dummy(void *buffers[], void *cl_arg)
 	float *v = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);
 
 	printf("Codelet running\n");
-	cudaMemset(v, 0, STARPU_VECTOR_GET_NX(buffers[0]) * sizeof(float));
+	cudaMemsetAsync(v, 0, STARPU_VECTOR_GET_NX(buffers[0]) * sizeof(float), starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	printf("Codelet done\n");
 }
 
@@ -118,7 +119,7 @@ int main(int argc, char **argv)
 	ret = starpu_task_submit(task);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 
-	/* And run the driver, which will run the task */
+	/* And run the driver inside main, which will run the task */
 	printf("running the driver\n");
 	starpu_driver_run(&drivers[0]);
 	printf("finished running the driver\n");

+ 154 - 0
examples/gl_interop/gl_interop_idle.c

@@ -0,0 +1,154 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012 Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This example demonstrates how to use StarPU combined with OpenGL rendering,
+ * which needs:
+ *
+ * - initializing GLUT first,
+ * - enabling it at initialization,
+ * - running the corresponding CUDA worker in the GLUT thread (here, the main
+ *   thread).
+ *
+ * The difference with gl_interop.c is that this version runs StarPU Tasks in
+ * the glut idle handler.
+ */
+
+#include <starpu.h>
+#include <unistd.h>
+#include <GL/glut.h>
+
+/* CUDA codelet body: zero the vector (buffers[0], STARPU_W) on the device
+ * using StarPU's per-worker stream, then wait for the memset to complete.
+ * cl_arg is unused.
+ * NOTE(review): the CUDA return codes are ignored here; acceptable for a
+ * demo, but production code should check them. */
+void dummy(void *buffers[], void *cl_arg)
+{
+	float *v = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);
+
+	printf("Codelet running\n");
+	cudaMemsetAsync(v, 0, STARPU_VECTOR_GET_NX(buffers[0]) * sizeof(float), starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+	printf("Codelet done\n");
+}
+
+/* Codelet running only on CUDA workers, writing one vector buffer */
+struct starpu_codelet cl = {
+	.where = STARPU_CUDA,
+	.cuda_funcs = { dummy, NULL },
+	.nbuffers = 1,
+	.modes = { STARPU_W },
+};
+
+/* NOTE(review): foo() appears unused in this file; confirm before removing */
+void foo(void) {
+}
+
+/* Render one frame: clear the window and draw a white diagonal line of
+ * half-length i, then flush and request a redisplay */
+void display(float i) {
+	glClear(GL_COLOR_BUFFER_BIT);
+	glColor3f(1, 1, 1);
+	glBegin(GL_LINES);
+	glVertex2f(-i, -i);
+	glVertex2f(i, i);
+	glEnd();
+	glFinish();
+	glutPostRedisplay();
+}
+
+/* CUDA device ids for which OpenGL interoperability is enabled (filled in main) */
+static int cuda_devices[] = { 0 };
+/* The single CUDA driver that main() runs by hand instead of letting StarPU
+ * launch a worker thread for it (id.cuda_id is filled in main) */
+static struct starpu_driver drivers[] = {
+	{ .type = STARPU_CUDA_WORKER }
+};
+
+/* Task-completion callback: render a shrinking line for about one second,
+ * then request driver termination, deinitialize the manually-run driver,
+ * shut StarPU down and exit the process.  This function never returns.
+ * NOTE(review): the parameter name shadows the global function foo(). */
+void callback_func(void *foo) {
+	printf("Callback running, rendering\n");
+	float i = 1.;
+	while (i > 0) {
+		usleep(100000);
+		display(i);
+		i -= 0.1;
+	}
+	printf("rendering done\n");
+
+	/* Tell it was already the last submitted task */
+	starpu_drivers_request_termination();
+
+	/* And terminate StarPU */
+	starpu_driver_deinit(&drivers[0]);
+	starpu_shutdown();
+	exit(0);
+}
+
+/* GLUT idle handler: execute one iteration of the CUDA driver loop so that
+ * StarPU tasks make progress from inside the GLUT main loop */
+static void idle(void)
+{
+	starpu_driver_run_once(&drivers[0]);
+}
+
+/* Entry point: initialize GLUT, start StarPU with CUDA/OpenGL
+ * interoperability enabled and the CUDA driver left unlaunched, submit one
+ * task, then drive the worker from the GLUT idle handler.
+ * Returns 77 (test skipped) when built without CUDA or OpenGL render support;
+ * otherwise termination happens in callback_func() via exit(0). */
+int main(int argc, char **argv)
+{
+#if !(defined(STARPU_USE_CUDA) && defined(STARPU_OPENGL_RENDER))
+	return 77;
+#else
+	struct starpu_conf conf;
+	int ret;
+	struct starpu_task *task;
+	starpu_data_handle_t handle;
+	int cuda_device = 0;
+
+	cuda_devices[0] = cuda_device;
+	drivers[0].id.cuda_id = cuda_device;
+
+	glutInit(&argc, argv);
+	glutInitDisplayMode (GLUT_SINGLE | GLUT_RGB);
+	glutInitWindowPosition(0, 0);
+	glutInitWindowSize(300,200);
+	glutCreateWindow("StarPU OpenGL interoperability test");
+	glClearColor (0.5, 0.5, 0.5, 0.0);
+
+	/* Enable OpenGL interoperability */
+	starpu_conf_init(&conf);
+	conf.ncuda = 1;
+	conf.ncpus = 0;
+	conf.nopencl = 0;
+	conf.cuda_opengl_interoperability = cuda_devices;
+	conf.n_cuda_opengl_interoperability = sizeof(cuda_devices) / sizeof(*cuda_devices);
+	conf.not_launched_drivers = drivers;
+	conf.n_not_launched_drivers = sizeof(drivers) / sizeof(*drivers);
+	ret = starpu_init(&conf);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	/* 10-float vector with no initial home node (-1): StarPU allocates it
+	 * on demand — presumably directly on the device; confirm against the
+	 * StarPU data registration documentation */
+	starpu_vector_data_register(&handle, -1, 0, 10, sizeof(float));
+
+	/* Submit just one dumb task */
+	task = starpu_task_create();
+	task->cl = &cl;
+	task->handles[0] = handle;
+	task->callback_func = callback_func;
+	task->callback_arg = NULL;
+	ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+	/* And run the driver inside main, which will run the task */
+	printf("running the driver\n");
+	/* Initialize it */
+	starpu_driver_init(&drivers[0]);
+	/* Register driver loop content as idle handler */
+	glutIdleFunc(idle);
+	/* Now run the glut loop */
+	glutMainLoop();
+	/* NOTE(review): glutMainLoop() normally does not return, so the code
+	 * below is typically unreachable — shutdown happens in callback_func() */
+	/* And deinitialize driver */
+	starpu_driver_deinit(&drivers[0]);
+	printf("finished running the driver\n");
+
+	starpu_shutdown();
+
+	return 0;
+#endif
+}

+ 5 - 29
examples/heat/dw_factolu.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2009-2013  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
@@ -32,10 +32,6 @@ unsigned *advance_22; /* array of nblocks *nblocks*nblocks */
 struct timeval start;
 struct timeval end;
 
-static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
-static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
-static unsigned finished = 0;
-
 static unsigned no_prio = 0;
 
 static struct starpu_codelet cl11 =
@@ -347,11 +343,7 @@ void dw_callback_v2_codelet_update_u11(void *argcb)
 
 	if (i == nblocks - 1) 
 	{
-		/* we are done : wake the application up  */
-		pthread_mutex_lock(&mutex);
-		finished = 1;
-		pthread_cond_signal(&cond);
-		pthread_mutex_unlock(&mutex);
+		/* we are done */
 		free(argcb);
 		return;
 	}
@@ -464,11 +456,7 @@ void dw_callback_codelet_update_u11(void *argcb)
 
 	if (args->i == args->nblocks - 1) 
 	{
-		/* we are done : wake the application up  */
-		pthread_mutex_lock(&mutex);
-		finished = 1;
-		pthread_cond_signal(&cond);
-		pthread_mutex_unlock(&mutex);
+		/* we are done */
 		free(argcb);
 		return;
 	}
@@ -641,13 +629,7 @@ void dw_codelet_facto(starpu_data_handle_t dataA, unsigned nblocks)
 	ret = starpu_task_submit(task);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 
-	/* stall the application until the end of computations */
-	pthread_mutex_lock(&mutex);
-
-	if (!finished)
-		pthread_cond_wait(&cond, &mutex);
-
-	pthread_mutex_unlock(&mutex);
+	starpu_task_wait_for_all();
 
 	gettimeofday(&end, NULL);
 
@@ -697,13 +679,7 @@ void dw_codelet_facto_v2(starpu_data_handle_t dataA, unsigned nblocks)
 		exit(0);
 	}
 
-	/* stall the application until the end of computations */
-	pthread_mutex_lock(&mutex);
-
-	if (!finished)
-		pthread_cond_wait(&cond, &mutex);
-
-	pthread_mutex_unlock(&mutex);
+	starpu_task_wait_for_all();
 
 	gettimeofday(&end, NULL);
 

+ 3 - 6
examples/heat/dw_factolu.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009, 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -22,8 +22,7 @@
 #include <string.h>
 #include <math.h>
 #include <sys/time.h>
-/* for STARPU_USE_CUDA */
-#include <starpu_config.h>
+#include <starpu.h>
 #ifdef STARPU_USE_CUDA
 #include <cuda.h>
 #include <cuda_runtime.h>
@@ -32,8 +31,6 @@
 
 #include "../common/blas.h"
 
-#include <starpu.h>
-
 #include "lu_kernels_model.h"
 
 #define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)

+ 10 - 10
examples/heat/dw_factolu_kernels.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -135,9 +135,9 @@ static inline void dw_common_cpu_codelet_update_u22(void *descr[], int s, __attr
 					right, ld12, 1.0f, center, ld22);
 			status = cublasGetError();
 			if (status != CUBLAS_STATUS_SUCCESS)
-				STARPU_ABORT();
+				STARPU_CUBLAS_REPORT_ERROR(status);
 
-			cudaThreadSynchronize();
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 			break;
 #endif
@@ -200,9 +200,9 @@ static inline void dw_common_codelet_update_u12(void *descr[], int s, __attribut
 					1.0f, sub11, ld11, sub12, ld12);
 			status = cublasGetError();
 			if (status != CUBLAS_STATUS_SUCCESS)
-				STARPU_ABORT();
+				STARPU_CUBLAS_REPORT_ERROR(status);
 
-			cudaThreadSynchronize();
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 			break;
 #endif
@@ -262,9 +262,9 @@ static inline void dw_common_codelet_update_u21(void *descr[], int s, __attribut
 			cublasStrsm('R', 'U', 'N', 'U', ny21, nx21, 1.0f, sub11, ld11, sub21, ld21);
 			status = cublasGetError();
 			if (status != CUBLAS_STATUS_SUCCESS)
-				STARPU_ABORT();
+				STARPU_CUBLAS_REPORT_ERROR(status);
 
-			cudaThreadSynchronize();
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 			break;
 #endif
@@ -344,8 +344,8 @@ static inline void dw_common_codelet_update_u11(void *descr[], int s, __attribut
 			for (z = 0; z < nx; z++)
 			{
 				float pivot;
-				cudaMemcpy(&pivot, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost);
-				cudaStreamSynchronize(0);
+				cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
+				cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 				STARPU_ASSERT(pivot != 0.0f);
 				
@@ -357,7 +357,7 @@ static inline void dw_common_codelet_update_u11(void *descr[], int s, __attribut
 								&sub11[(z+1) + (z+1)*ld],ld);
 			}
 
-			cudaThreadSynchronize();
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 			break;
 #endif

+ 1 - 2
examples/heat/dw_sparse_cg.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -28,7 +28,6 @@
 #include <pthread.h>
 #include <signal.h>
 
-#include <starpu_config.h>
 #include <starpu.h>
 
 #ifdef STARPU_USE_CUDA

+ 0 - 0
examples/heat/heat.c


Some files were not shown because too many files changed in this diff