Sfoglia il codice sorgente

Merge from trunk : several merges from r11938 to r12434

Marc Sergent 12 anni fa
parent
commit
1329f5bee3
100 ha cambiato i file con 4572 aggiunte e 2760 eliminazioni
  1. 2 0
      AUTHORS
  2. 50 18
      ChangeLog
  3. 1 40
      INSTALL
  4. 2 1
      Makefile.am
  5. 50 13
      README
  6. 92 9
      configure.ac
  7. 2 2
      doc/Makefile.am
  8. 71 59
      doc/doxygen/Makefile.am
  9. 82 20
      doc/doxygen/chapters/00introduction.doxy
  10. 59 19
      doc/doxygen/chapters/01building.doxy
  11. 2 1234
      doc/doxygen/chapters/03advanced_examples.doxy
  12. 0 552
      doc/doxygen/chapters/04optimize_performance.doxy
  13. 226 0
      doc/doxygen/chapters/05check_list_performance.doxy
  14. 438 0
      doc/doxygen/chapters/06tasks.doxy
  15. 0 114
      doc/doxygen/chapters/06tips_and_tricks.doxy
  16. 533 0
      doc/doxygen/chapters/07data_management.doxy
  17. 151 0
      doc/doxygen/chapters/08scheduling.doxy
  18. 0 0
      doc/doxygen/chapters/09scheduling_contexts.doxy
  19. 0 0
      doc/doxygen/chapters/10scheduling_context_hypervisor.doxy
  20. 53 0
      doc/doxygen/chapters/11debugging_tools.doxy
  21. 437 0
      doc/doxygen/chapters/12online_performance_tools.doxy
  22. 80 209
      doc/doxygen/chapters/05performance_feedback.doxy
  23. 229 0
      doc/doxygen/chapters/14faq.doxy
  24. 0 0
      doc/doxygen/chapters/15out_of_core.doxy
  25. 59 9
      doc/doxygen/chapters/08mpi_support.doxy
  26. 0 0
      doc/doxygen/chapters/17fft_support.doxy
  27. 0 0
      doc/doxygen/chapters/18mic_scc_support.doxy
  28. 0 0
      doc/doxygen/chapters/19c_extensions.doxy
  29. 4 0
      doc/doxygen/chapters/12socl_opencl_extensions.doxy
  30. 127 0
      doc/doxygen/chapters/21simgrid.doxy
  31. 689 0
      doc/doxygen/chapters/40environment_variables.doxy
  32. 25 1
      doc/doxygen/chapters/16configure_options.doxy
  33. 31 24
      doc/doxygen/chapters/17files.doxy
  34. 0 0
      doc/doxygen/chapters/50scaling-vector-example.doxy
  35. 0 0
      doc/doxygen/chapters/51fdl-1.3.doxy
  36. 67 0
      doc/doxygen/chapters/api/bitmap.doxy
  37. 32 1
      doc/doxygen/chapters/api/codelet_and_tasks.doxy
  38. 5 1
      doc/doxygen/chapters/api/data_management.doxy
  39. 9 1
      doc/doxygen/chapters/api/fxt_support.doxy
  40. 5 1
      doc/doxygen/chapters/api/implicit_dependencies.doxy
  41. 15 2
      doc/doxygen/chapters/api/initialization.doxy
  42. 12 3
      doc/doxygen/chapters/api/insert_task.doxy
  43. 18 1
      doc/doxygen/chapters/api/misc_helpers.doxy
  44. 35 5
      doc/doxygen/chapters/api/mpi.doxy
  45. 6 1
      doc/doxygen/chapters/api/parallel_tasks.doxy
  46. 23 3
      doc/doxygen/chapters/api/performance_model.doxy
  47. 2 2
      doc/doxygen/chapters/api/sc_hypervisor/sc_hypervisor_usage.doxy
  48. 17 1
      doc/doxygen/chapters/api/scheduling_contexts.doxy
  49. 6 5
      doc/doxygen/chapters/api/scheduling_policy.doxy
  50. 6 0
      doc/doxygen/chapters/api/standard_memory_library.doxy
  51. 65 2
      doc/doxygen/chapters/api/threads.doxy
  52. 46 0
      doc/doxygen/chapters/api/tree.doxy
  53. 29 0
      doc/doxygen/dev/sc_funcs.cocci
  54. 9 4
      doc/doxygen/dev/starpu_check_documented.py
  55. 58 45
      doc/doxygen/dev/starpu_check_undocumented.sh
  56. 4 2
      doc/doxygen/doxygen-config.cfg.in
  57. 3 1
      doc/doxygen/doxygen_filter.sh.in
  58. 73 35
      doc/doxygen/refman.tex
  59. 0 0
      doc/tutorial/hello_world_msvc.c
  60. 16 5
      examples/Makefile.am
  61. 1 5
      examples/basic_examples/dynamic_handles.c
  62. 2 1
      examples/basic_examples/vector_scal.c
  63. 1 3
      examples/basic_examples/vector_scal_cuda.cu
  64. 9 1
      examples/cholesky/cholesky.h
  65. 1 38
      examples/cholesky/cholesky_grain_tag.c
  66. 12 55
      examples/cholesky/cholesky_implicit.c
  67. 49 4
      examples/cholesky/cholesky_kernels.c
  68. 1 38
      examples/cholesky/cholesky_tag.c
  69. 1 38
      examples/cholesky/cholesky_tile_tag.c
  70. 0 0
      examples/fortran/Makefile
  71. 0 0
      examples/fortran/StarPU_fortran.h
  72. 0 0
      examples/fortran/hello.F
  73. 0 0
      examples/fortran/hello_c.c
  74. 5 2
      examples/gl_interop/gl_interop.c
  75. 5 2
      examples/gl_interop/gl_interop_idle.c
  76. 3 1
      examples/incrementer/incrementer.c
  77. 1 2
      examples/incrementer/incrementer_kernels.cu
  78. 3 7
      examples/incrementer/incrementer_kernels_opencl.c
  79. 19 37
      examples/matvecmult/matvecmult_kernel.cl
  80. 118 25
      examples/sched_ctx/parallel_code.c
  81. 2 1
      examples/stencil/Makefile.am
  82. 89 0
      examples/worker_collections/worker_list_example.c
  83. 99 0
      examples/worker_collections/worker_tree_example.c
  84. 1 0
      gcc-plugin/examples/Makefile.am
  85. 2 1
      gcc-plugin/tests/Makefile.am
  86. 5 1
      include/starpu.h
  87. 22 23
      include/starpu_bitmap.h
  88. 5 1
      include/starpu_config.h.in
  89. 5 6
      include/starpu_data.h
  90. 2 1
      include/starpu_fxt.h
  91. 2 2
      include/starpu_hash.h
  92. 4 3
      include/starpu_perfmodel.h
  93. 1 0
      include/starpu_sched_component.h
  94. 21 4
      include/starpu_sched_ctx.h
  95. 3 2
      include/starpu_scheduler.h
  96. 2 1
      include/starpu_stdlib.h
  97. 15 2
      include/starpu_task.h
  98. 2 2
      include/starpu_task_bundle.h
  99. 3 6
      include/starpu_task_list.h
  100. 0 0
      include/starpu_task_util.h

+ 2 - 0
AUTHORS

@@ -9,6 +9,8 @@ Nathalie Furmento <nathalie.furmento@labri.fr>
 David Gómez <david_gomez1380@yahoo.com.mx>
 Sylvain Henry <sylvain.henry@inria.fr>
 Mehdi Juhoor <mjuhoor@gmail.com>
+Xavier Lacoste <xavier.lacoste@inria.fr>
+Benoît Lizé <benoit.lize@gmail.com>
 Antoine Lucas <antoine.lucas.33@gmail.com>
 Brice Mortier <brice.mortier@etu.u-bordeaux1.fr>
 Damien Pasqualinotto <dam.pasqualinotto@wanadoo.fr>

+ 50 - 18
ChangeLog

@@ -1,7 +1,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2009-2013  Université de Bordeaux 1
-# Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+# Copyright (C) 2009-2014  Université de Bordeaux 1
+# Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -44,6 +44,12 @@ New features:
     its own scheduler, by coding itself each "box" it wants, or by
     combining existing boxes in StarPU to build it. Hierarchical
     schedulers have very interesting scalability properties.
+  * New functions starpu_mpi_task_build() and starpu_mpi_task_post_build()
+  * New functions starpu_pause() and starpu_resume()
+  * New codelet specific_nodes field to specify explicit target nodes for data.
+  * Use streams for GPUA->GPUB and GPUB->GPUA transfers.
+  * Add STARPU_CUDA_ASYNC and STARPU_OPENCL_ASYNC flags to allow asynchronous
+    CUDA and OpenCL kernel execution.
 
 Small features:
   * New functions starpu_data_acquire_cb_sequential_consistency() and
@@ -55,7 +61,7 @@ Small features:
     storing performance models. Available through the new option -d of
     the tool starpu_perfmodel_display
   * New batch files to execute StarPU applications under Microsoft
-    Visual Studio (They are installed in path_to_starpu/bin/mvsc)/
+    Visual Studio (They are installed in path_to_starpu/bin/msvc)/
   * Functions starpu_insert_task and starpu_mpi_insert_task are
     renamed in starpu_task_insert and starpu_mpi_task_insert. Old
     names are kept to avoid breaking old codes.
@@ -64,29 +70,34 @@ Small features:
     history-based calibrator.
   * Tasks can now have a name (via the field const char *name of
     struct starpu_task)
+  * New STARPU_EXECUTE_ON_WORKER flag to specify the worker on which
+    to execute the task.
+  * New STARPU_DISABLE_PINNING environment variable to disable host memory
+    pinning.
+  * New STARPU_DISABLE_KERNELS environment variable to disable actual kernel
+    execution.
+  * New starpu_memory_get_total function to get the size of a memory node.
+  * New starpu_parallel_task_barrier_init_n function to let a scheduler decide
+    a set of workers without going through combined workers.
+  * Allow application to provide the task footprint itself.
 
 Changes:
-  * Fix of the livelock issue discovered while executing applications
-    on a CPU+GPU cluster of machines by adding a maximum trylock
-    threshold before a blocking lock.
   * Data interfaces (variable, vector, matrix and block) now define
     pack und unpack functions
-  * Fix for properly dealing with NAN on windows systems
   * StarPU-MPI: Fix for being able to receive data which have not yet
     been registered by the application (i.e it did not call
     starpu_data_set_tag(), data are received as a raw memory)
   * StarPU-MPI: Fix for being able to receive data with the same tag
     from several nodes (see mpi/tests/gather.c)
-  * Function starpu_sched_ctx_create() now takes a variable argument
-    list to define the scheduler to be used, and the minimum and
-    maximum priority values
-  * The functions starpu_sched_set/get_min/max_priority set/get the
-    priorities of the current scheduling context, i.e the one which
-    was set by a call to starpu_sched_ctx_set_context() or the initial
-    context if the function was not called yet.
+  * StarPU-MPI: Fix overzealous allocation of memory.
+
+Small changes:
+  * Rename function starpu_trace_user_event() as
+    starpu_fxt_trace_user_event()
 
-StarPU 1.1.0 (svn revision xxxx)
+StarPU 1.1.0 (svn revision 11960)
 ==============================================
+The scheduling context release
 
 New features:
   * OpenGL interoperability support.
@@ -95,7 +106,7 @@ New features:
   * Performance models measurements can now be provided explicitly by
     applications.
   * Capability to emit communication statistics when running MPI code
-  * Add starpu_unregister_submit, starpu_data_acquire_on_node and
+  * Add starpu_data_unregister_submit, starpu_data_acquire_on_node and
     starpu_data_invalidate_submit
   * New functionnality to wrapper starpu_insert_task to pass a array of
 	data_handles via the parameter STARPU_DATA_ARRAY
@@ -121,6 +132,8 @@ New features:
         - Communication cache mechanism is enabled by default, and can
 	  only be disabled at execution time by setting the
 	  environment variable STARPU_MPI_CACHE to 0.
+        - New variable STARPU_MPI_CACHE_STATS to print statistics on
+   	  cache holding received data.
         - Initialisation functions starpu_mpi_initialize_extended()
   	  and starpu_mpi_initialize() have been made deprecated. One
 	  should now use starpu_mpi_init(int *, char ***, int). The
@@ -131,6 +144,11 @@ New features:
         - When exchanging user-defined data interfaces, the size of
 	  the data is the size returned by the pack operation, i.e
 	  data with dynamic size can now be exchanged with StarPU-MPI.
+        - New function starpu_mpi_data_register() which sets the rank
+  	  and tag of a data, and also allows to automatically clear
+	  the MPI communication cache when unregistering the data. It
+	  should be called instead of both calling
+	  starpu_data_set_tag() and starpu_data_set_rank()
   * Add experimental simgrid support, to simulate execution with various
     number of CPUs, GPUs, amount of memory, etc.
   * Add support for OpenCL simulators (which provide simulated execution time)
@@ -196,6 +214,14 @@ New features:
   * Add a watchdog which permits to easily trigger a crash when StarPU gets
     stuck.
   * Document how to migrate data over MPI.
+  * New function starpu_wakeup_worker() to be used by schedulers to
+    wake up a single worker (instead of all workers) when submitting a
+    single task.
+  * The functions starpu_sched_set/get_min/max_priority set/get the
+    priorities of the current scheduling context, i.e the one which
+    was set by a call to starpu_sched_ctx_set_context() or the initial
+    context if the function has not been called yet.
+  * Fix for properly dealing with NAN on windows systems
 
 Small features:
   * Add starpu_worker_get_by_type and starpu_worker_get_by_devid
@@ -219,11 +245,14 @@ Small features:
     storing performance models. Available through the new option -d of
     the tool starpu_perfmodel_display
   * New batch files to execute StarPU applications under Microsoft
-    Visual Studio (They are installed in path_to_starpu/bin/mvsc)/
+    Visual Studio (They are installed in path_to_starpu/bin/msvc)/
   * Add cl_arg_free, callback_arg_free, prologue_callback_arg_free fields to
     enable automatic free(cl_arg); free(callback_arg);
     free(prologue_callback_arg) on task destroy.
   * New function starpu_task_build
+  * New configure options --with-simgrid-dir
+    --with-simgrid-include-dir and --with-simgrid-lib-dir to specify
+    the location of the SimGrid library
 
 Changes:
   * Rename all filter functions to follow the pattern
@@ -291,6 +320,9 @@ Changes:
     priorities of the current scheduling context, i.e the one which
     was set by a call to starpu_sched_ctx_set_context() or the initial
     context if the function was not called yet.
+  * MPI: Fix of the livelock issue discovered while executing applications
+    on a CPU+GPU cluster of machines by adding a maximum trylock
+    threshold before a blocking lock.
 
 Small changes:
   * STARPU_NCPU should now be used instead of STARPU_NCPUS. STARPU_NCPUS is
@@ -317,7 +349,7 @@ Changes:
   * Fix generating FXT traces bigger than 64MiB.
   * Improve ENODEV error detections in StarPU FFT
 
-StarPU 1.0.2 (svn revision xxx)
+StarPU 1.0.2 (svn revision 7210)
 ==============================================
 
 Changes:

+ 1 - 40
INSTALL

@@ -1,7 +1,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 # Copyright (C) 2009-2012  Université de Bordeaux 1
-# Copyright (C) 2010, 2011, 2013  Centre National de la Recherche Scientifique
+# Copyright (C) 2010, 2011, 2013, 2014  Centre National de la Recherche Scientifique
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -19,7 +19,6 @@ Contents
 
 * Installing StarPU on a Unix machine
 * Installing StarPU on Windows
-* Running StarPU Applications on Microsoft Visual C
 
 
 
@@ -214,41 +213,3 @@ autogen.sh part.
    and set the StarPU bin directory in your path.
 
    export PATH=<StarPU installation directory>/bin:$PATH
-
-
-Running StarPU Applications on Microsoft Visual C
--------------------------------------------------
-
-Batch files are provided to run StarPU applications under Microsoft
-Visual C. They are installed in path_to_starpu/bin/mvsc.
-
-To execute a StarPU application, you first need to set the environment
-variable STARPUPATH.
-
-c:\....> cd c:\cygwin\home\ci\starpu\
-c:\....> set STARPUPATH=c:\cygwin\home\ci\starpu\
-c:\....> cd bin\mvsc
-c:\....> starpu_open.bat starpu_simple.c
-
-The batch script will run Microsoft Visual C with a basic project file
-to run the given application.
-
-The batch script starpu_clean.bat can be used to delete all
-compilation generated files.
-
-The batch script starpu_exec.bat can be used to compile and execute a
-StarPU application from the command prompt.
-
-c:\....> cd c:\cygwin\home\ci\starpu\
-c:\....> set STARPUPATH=c:\cygwin\home\ci\starpu\
-c:\....> cd bin\mvsc
-c:\....> starpu_exec.bat ..\..\..\..\examples\basic_examples\hello_world.c
-
-MVSC StarPU Execution
-...
-/out:hello_world.exe
-...
-Hello world (params = {1, 2.00000})
-Callback function got argument 0000042
-c:\....>
-

+ 2 - 1
Makefile.am

@@ -83,7 +83,8 @@ versinclude_HEADERS = 				\
 	include/starpu_driver.h			\
 	include/starpu_stdlib.h			\
 	include/starpu_thread.h			\
-	include/starpu_thread_util.h
+	include/starpu_thread_util.h		\
+	include/starpu_tree.h
 
 nodist_versinclude_HEADERS = 			\
 	include/starpu_config.h

+ 50 - 13
README

@@ -1,7 +1,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 # Copyright (C) 2009-2012  Université de Bordeaux 1
-# Copyright (C) 2010, 2011, 2013  Centre National de la Recherche Scientifique
+# Copyright (C) 2010, 2011, 2013, 2014  Centre National de la Recherche Scientifique
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -128,9 +128,46 @@ StarPU to hang or exhibit incorrect behaviour.
 
 For details on the Windows build process, see the INSTALL file.
 
-++==================++
-|| V. Documentation ||
-++==================++
+++======================================================++
+|| V. Running StarPU Applications on Microsoft Visual C ||
+++======================================================++
+
+Batch files are provided to run StarPU applications under Microsoft
+Visual C. They are installed in path_to_starpu/bin/msvc.
+
+To execute a StarPU application, you first need to set the environment
+variable STARPU_PATH.
+
+c:\....> cd c:\cygwin\home\ci\starpu\
+c:\....> set STARPU_PATH=c:\cygwin\home\ci\starpu\
+c:\....> cd bin\msvc
+c:\....> starpu_open.bat starpu_simple.c
+
+The batch script will run Microsoft Visual C with a basic project file
+to run the given application.
+
+The batch script starpu_clean.bat can be used to delete all
+compilation generated files.
+
+The batch script starpu_exec.bat can be used to compile and execute a
+StarPU application from the command prompt.
+
+c:\....> cd c:\cygwin\home\ci\starpu\
+c:\....> set STARPU_PATH=c:\cygwin\home\ci\starpu\
+c:\....> cd bin\msvc
+c:\....> starpu_exec.bat ..\..\..\..\examples\basic_examples\hello_world.c
+
+MSVC StarPU Execution
+...
+/out:hello_world.exe
+...
+Hello world (params = {1, 2.00000})
+Callback function got argument 0000042
+c:\....>
+
+++===================++
+|| VI. Documentation ||
+++===================++
 
 Texinfo documentation is available in doc/ . If LaTeX is available on the
 machine, a pdf can be generated by running
@@ -141,22 +178,22 @@ If makeinfo is available on the machine, html pages can be generated by running
 
   $ make -C doc html
 
-++============++
-|| VI. Trying ||
-++============++
+++=============++
+|| VII. Trying ||
+++=============++
 
 Some examples ready to run are installed into $prefix/lib/starpu/{examples,mpi}
 
-++==============++
-|| VII. Upgrade ||
-++==============++
+++===============++
+|| VIII. Upgrade ||
+++===============++
 
 To upgrade your source code from older version (there were quite a few
 renamings), use the tools/dev/rename.sh script
 
-++===============++
-|| VIII. Contact ||
-++===============++
+++=============++
+|| IX. Contact ||
+++=============++
 
 For any questions regarding StarPU, please contact the starpu-devel
 mailing-list at starpu-devel@lists.gforge.inria.fr .

+ 92 - 9
configure.ac

@@ -1,7 +1,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 # Copyright (C) 2009-2014  Université de Bordeaux 1
-# Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+# Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
 # Copyright (C) 2011  Télécom-SudParis
 # Copyright (C) 2011, 2012  Institut National de Recherche en Informatique et Automatique
 #
@@ -70,6 +70,8 @@ AC_PROG_CPP
 AC_PROG_SED
 AC_PROG_LN_S
 AC_PROG_F77
+AC_CHECK_PROGS(PROG_STAT,gstat stat)
+AC_CHECK_PROGS(PROG_DATE,gdate date)
 
 LT_PREREQ([2.2])
 LT_INIT([win32-dll])
@@ -197,6 +199,12 @@ AC_CHECK_FUNCS([sysconf])
 AC_CHECK_FUNC([pthread_spin_lock], have_pthread_spin_lock=yes, have_pthread_spin_lock=no)
 if test x$have_pthread_spin_lock = xyes; then
 	AC_DEFINE(HAVE_PTHREAD_SPIN_LOCK,[],[pthread_spin_lock is available])
+	AC_DEFINE(STARPU_HAVE_PTHREAD_SPIN_LOCK,[],[pthread_spin_lock is available])
+fi
+
+AC_CHECK_FUNC([pthread_barrier_init], have_pthread_barrier=yes, have_pthread_barrier=no)
+if test x$have_pthread_barrier = xyes; then
+	AC_DEFINE(STARPU_HAVE_PTHREAD_BARRIER,[],[pthread_barrier is available])
 fi
 
 # yes, that's non portable, but it's still better than sched_setaffinity
@@ -329,6 +337,20 @@ fi
 AM_CONDITIONAL([STARPU_BUILD_SC_HYPERVISOR], [test "x$build_sc_hypervisor" = "xyes"])
 AM_CONDITIONAL([STARPU_USE_SC_HYPERVISOR], [test "x$build_sc_hypervisor" = "xyes"])
 
+AC_ARG_ENABLE([sc_hypervisor_debug],
+  [AS_HELP_STRING([--enable-sc-hypervisor-debug],
+    [enable debug for resizing contexts (experimental)])],
+  [enable_sc_hypervisor_debug="yes"],
+  [enable_sc_hypervisor_debug="no"])
+
+
+AC_SUBST(STARPU_SC_HYPERVISOR_DEBUG, $enable_sc_hypervisor_debug)
+AM_CONDITIONAL([STARPU_SC_HYPERVISOR_DEBUG], [test "x$enable_sc_hypervisor_debug" = "xyes"])
+
+if test "x$enable_sc_hypervisor_debug" = "xyes"; then
+  AC_DEFINE(STARPU_SC_HYPERVISOR_DEBUG, [1], [enable debug sc_hypervisor])
+fi
+
 ###############################################################################
 #                                                                             #
 #                                 CPUs settings                               #
@@ -916,16 +938,53 @@ if test x$enable_opencl_simulator = xyes; then
 	AC_DEFINE(STARPU_OPENCL_SIMULATOR, 1, [Define this to enable using an OpenCL simulator])
 fi
 
+AC_ARG_WITH(simgrid-dir,
+	[AS_HELP_STRING([--with-simgrid-dir=<path>],
+	[specify SimGrid installation directory])],
+	[
+		simgrid_dir="$withval"
+		# in case this was not explicit yet
+		enable_simgrid=yes
+	], simgrid_dir=no)
+
+AC_ARG_WITH(simgrid-include-dir,
+	[AS_HELP_STRING([--with-simgrid-include-dir=<path>],
+	[specify where SimGrid headers are installed])],
+	[
+		simgrid_include_dir="$withval"
+		# in case this was not explicit yet
+		enable_simgrid=yes
+	], [simgrid_include_dir=no])
+
+AC_ARG_WITH(simgrid-lib-dir,
+	[AS_HELP_STRING([--with-simgrid-lib-dir=<path>],
+	[specify where SimGrid libraries are installed])],
+	[
+		simgrid_lib_dir="$withval"
+		# in case this was not explicit yet
+		enable_simgrid=yes
+	], [simgrid_lib_dir=no])
+
 AC_ARG_ENABLE(simgrid, [AS_HELP_STRING([--enable-simgrid],
 			[Enable simulating execution in simgrid])],
 			enable_simgrid=$enableval, enable_simgrid=no)
 if test x$enable_simgrid = xyes ; then
-	if test -n "$SIMGRID_CFLAGS" ; then
-		CFLAGS="$SIMGRID_CFLAGS $CFLAGS"
+   	if test -n "$SIMGRID_CFLAGS" ; then
+	   	CFLAGS="$SIMGRID_CFLAGS $CFLAGS"
 	fi
 	if test -n "$SIMGRID_LIBS" ; then
 		LDFLAGS="$SIMGRID_LIBS $LDFLAGS"
 	fi
+	if test "$simgrid_dir" != "no" ; then
+	   	CFLAGS="-I$simgrid_dir/include $CFLAGS"
+	   	LDFLAGS="-L$simgrid_dir/lib $LDFLAGS"
+	fi
+	if test "$simgrid_include_dir" != "no" ; then
+	   	CFLAGS="-I$simgrid_include_dir $CFLAGS"
+	fi
+	if test "$simgrid_lib_dir" != "no" ; then
+	   	LDFLAGS="-L$simgrid_lib_dir $LDFLAGS"
+	fi
 	AC_HAVE_LIBRARY([simgrid], [],
 		[
 			AC_MSG_ERROR(Simgrid support needs simgrid installed)
@@ -946,8 +1005,10 @@ fi
 AM_CONDITIONAL(STARPU_SIMGRID, test x$enable_simgrid = xyes)
 AC_SUBST(SIMGRID_CFLAGS)
 AC_SUBST(SIMGRID_LIBS)
+AC_MSG_CHECKING(whether SimGrid is enabled)
+AC_MSG_RESULT($enable_simgrid)
 
-AC_MSG_CHECKING(whether blocking drivers should be disabled)
+AC_MSG_CHECKING(whether blocking drivers should be enabled)
 AC_ARG_ENABLE(blocking-drivers, [AS_HELP_STRING([--enable-blocking-drivers], [enable blocking drivers])],
 				enable_blocking=$enableval, enable_blocking=no)
 AC_MSG_RESULT($enable_blocking)
@@ -1572,6 +1633,23 @@ AC_CHECK_FUNCS([clock_gettime])
 
 # Compute the maximum number of workers (we round it to 16 for alignment
 # purposes).
+if test x$enable_simgrid != xyes; then
+	if test x$enable_cpu != xyes; then
+		maxcpus=0
+	fi
+	if test x$enable_cuda != xyes; then
+		nmaxcudadev=0
+	fi
+	if test x$enable_opencl != xyes; then
+		nmaxopencldev=0
+	fi
+	if test x$enable_mic != xyes; then
+		nmaxmicthreads=0
+	fi
+	if test x$enable_rcce != xyes; then
+		nmaxsccdev=0
+	fi
+fi
 nmaxworkers=`expr 16 \* \( \( $maxcpus + $nmaxcudadev + $nmaxopencldev + $nmaxmicthreads + $nmaxsccdev + 15 \) / 16 \) `
 AC_MSG_CHECKING(Maximum number of workers)
 AC_MSG_RESULT($nmaxworkers)
@@ -1605,7 +1683,7 @@ AC_LANG_POP([C++])
 AC_MSG_CHECKING(calibration heuristic of history-based StarPU calibrator)
 AC_ARG_ENABLE(calibration-heuristic, [AS_HELP_STRING([--enable-calibration-heuristic=<number>],
 			[Define the maximum authorized deviation of StarPU history-based calibrator.])],
-			calibration_heuristic=$enableval, calibration_heuristic=10)
+			calibration_heuristic=$enableval, calibration_heuristic=50)
 AC_MSG_RESULT($calibration_heuristic)
 AC_DEFINE_UNQUOTED(STARPU_HISTORYMAXERROR, [$calibration_heuristic], [calibration heuristic value])
 
@@ -1825,6 +1903,7 @@ IS_SUPPORTED_CFLAG(-W)
 IS_SUPPORTED_CFLAG(-Wall)
 IS_SUPPORTED_CFLAG(-Wextra)
 IS_SUPPORTED_CFLAG(-Werror=implicit)
+IS_SUPPORTED_CFLAG(-Werror=implicit-function-declaration)
 
 if test "x$STARPU_DEVEL" != x; then
 	AC_DEFINE(STARPU_DEVEL, [1], [enable developer warnings])
@@ -2301,13 +2380,17 @@ AC_ARG_ENABLE(build-doc, [AS_HELP_STRING([--disable-build-doc],
 			enable_build_doc=$enableval, enable_build_doc=yes)
 
 if test "$enable_build_doc" = "yes" ; then
-   # Check whether doxygen and pdflatex are installed
+   # Check whether doxygen needed tools are installed
    AC_PATH_PROG(doxygencommand, doxygen)
    if test "$doxygencommand" = "" ; then
       	enable_build_doc="no"
    fi
-   AC_PATH_PROG(pdflatex, pdflatex)
-   if test "pdflatexcommand" = "" ; then
+   AC_PATH_PROG(pdflatexcommand, pdflatex)
+   if test "$pdflatexcommand" = "" ; then
+	enable_build_doc="no"
+   fi
+   AC_PATH_PROG(epstopdfcommand, epstopdf)
+   if test "$epstopdfcommand" = "" ; then
 	enable_build_doc="no"
    fi
 fi
@@ -2408,7 +2491,7 @@ AC_OUTPUT([
 	doc/doxygen/Makefile
 	doc/doxygen/doxygen-config.cfg
 	doc/doxygen/doxygen_filter.sh
-	tools/mvsc/starpu_var.bat
+	tools/msvc/starpu_var.bat
 ])
 
 AC_MSG_NOTICE([

+ 2 - 2
doc/Makefile.am

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2013  Centre National de la Recherche Scientifique
+# Copyright (C) 2013, 2014  Centre National de la Recherche Scientifique
 #
 # Permission is granted to copy, distribute and/or modify this document
 # under the terms of the GNU Free Documentation License, Version 1.3
@@ -15,7 +15,7 @@ endif
 
 EXTRA_DIST =    tutorial/hello_world.c \
 		tutorial/hello_world_plugin.c \
-		tutorial/hello_world_mvsc.c \
+		tutorial/hello_world_msvc.c \
 		tutorial/Makefile \
 		tutorial/README \
 		tutorial/vector_scal.c \

+ 71 - 59
doc/doxygen/Makefile.am

@@ -1,7 +1,8 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 # Copyright (C) 2009, 2011, 2013  Université de Bordeaux 1
-# Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+# Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
+# Copyright (C) 2014  Inria
 #
 # Permission is granted to copy, distribute and/or modify this document
 # under the terms of the GNU Free Documentation License, Version 1.3
@@ -28,22 +29,28 @@ chapters =	\
 	chapters/01building.doxy \
 	chapters/02basic_examples.doxy \
 	chapters/03advanced_examples.doxy \
-	chapters/04optimize_performance.doxy \
-	chapters/05performance_feedback.doxy \
-	chapters/06tips_and_tricks.doxy \
-	chapters/07out_of_core.doxy \
-	chapters/08mpi_support.doxy \
-	chapters/09fft_support.doxy \
-	chapters/10mic_scc_support.doxy \
-	chapters/11c_extensions.doxy \
-	chapters/12socl_opencl_extensions.doxy \
-	chapters/13scheduling_contexts.doxy \
-	chapters/14scheduling_context_hypervisor.doxy \
-	chapters/15environment_variables.doxy \
-	chapters/16configure_options.doxy \
-	chapters/17files.doxy \
-	chapters/18scaling-vector-example.doxy \
-	chapters/19fdl-1.3.doxy \
+	chapters/05check_list_performance.doxy \
+	chapters/06tasks.doxy \
+	chapters/07data_management.doxy \
+	chapters/08scheduling.doxy \
+	chapters/09scheduling_contexts.doxy \
+	chapters/10scheduling_context_hypervisor.doxy \
+	chapters/11debugging_tools.doxy \
+	chapters/12online_performance_tools.doxy \
+	chapters/13offline_performance_tools.doxy \
+	chapters/14faq.doxy \
+	chapters/15out_of_core.doxy \
+	chapters/16mpi_support.doxy \
+	chapters/17fft_support.doxy \
+	chapters/18mic_scc_support.doxy \
+	chapters/19c_extensions.doxy \
+	chapters/20socl_opencl_extensions.doxy \
+	chapters/21simgrid.doxy \
+	chapters/40environment_variables.doxy \
+	chapters/41configure_options.doxy \
+	chapters/45files.doxy \
+	chapters/50scaling-vector-example.doxy \
+	chapters/51fdl-1.3.doxy \
 	chapters/code/hello_pragma2.c \
 	chapters/code/hello_pragma.c \
 	chapters/code/scal_pragma.cu \
@@ -94,46 +101,49 @@ chapters =	\
 	chapters/api/versioning.doxy \
 	chapters/api/workers.doxy \
 	chapters/api/threads.doxy \
+	chapters/api/bitmap.doxy \
+	chapters/api/tree.doxy \
 	chapters/api/toolbox.doxy \
 	chapters/api/sc_hypervisor/sc_hypervisor.doxy \
 	chapters/api/sc_hypervisor/sc_hypervisor_usage.doxy
 
 starpu_config.h: $(top_srcdir)/include/starpu_config.h.in
-	sed 's/#undef \(.*\)/#define \1 1/' $< > $@
+	@$(SED) 's/#undef \(.*\)/#define \1 1/' $< > $@
 
 chapters/version.sty: $(chapters)
-	for f in $(chapters) ; do \
-                if test -f $(top_srcdir)/doc/doxygen/$$f ; then stat --format=%Y $(top_srcdir)/doc/doxygen/$$f ; fi \
+	@for f in $(chapters) ; do \
+                if test -f $(top_srcdir)/doc/doxygen/$$f ; then $(PROG_STAT) --format=%Y $(top_srcdir)/doc/doxygen/$$f ; fi \
         done | sort -r | head -1 > timestamp_sty
-	if test -s timestamp_sty ; then \
-		LC_ALL=C date --date=@`cat timestamp_sty` +"%d %B %Y" > timestamp_sty_updated ;\
-		LC_ALL=C date --date=@`cat timestamp_sty` +"%B %Y" > timestamp_sty_updated_month ;\
+	@if test -s timestamp_sty ; then \
+		LC_ALL=C $(PROG_DATE) --date=@`cat timestamp_sty` +"%d %B %Y" > timestamp_sty_updated ;\
+		LC_ALL=C $(PROG_DATE) --date=@`cat timestamp_sty` +"%B %Y" > timestamp_sty_updated_month ;\
 	fi
-	if test -s timestamp_sty_updated ; then \
-		echo "\newcommand{\STARPUUPDATED}{"`cat timestamp_sty_updated`"}" > $(top_srcdir)/doc/doxygen/chapters/version.sty;\
+	@if test -s timestamp_sty_updated ; then \
+		echo ':newcommand{:STARPUUPDATED}{'`cat timestamp_sty_updated`'}' > $(top_srcdir)/doc/doxygen/chapters/version.sty;\
 	else \
-		echo "\newcommand{\STARPUUPDATED}{unknown date}" > $(top_srcdir)/doc/doxygen/chapters/version.sty;\
+		echo ':newcommand{:STARPUUPDATED}{unknown date}' > $(top_srcdir)/doc/doxygen/chapters/version.sty;\
 	fi
-	echo "\newcommand{\STARPUVERSION}{$(VERSION)}" >> $(top_srcdir)/doc/doxygen/chapters/version.sty
-	for f in timestamp_sty timestamp_sty_updated timestamp_sty_updated_month ; do \
+	@echo ':newcommand{:STARPUVERSION}{$(VERSION)}' >> $(top_srcdir)/doc/doxygen/chapters/version.sty
+	@$(SED) -i 's/:/\\/g' $(top_srcdir)/doc/doxygen/chapters/version.sty
+	@for f in timestamp_sty timestamp_sty_updated timestamp_sty_updated_month ; do \
 		if test -f $$f ; then $(RM) $$f ; fi ;\
 	done
 
 chapters/version.html: $(chapters)
-	for f in $(chapters) ; do \
-                if test -f $(top_srcdir)/doc/doxygen/$$f ; then stat --format=%Y $(top_srcdir)/doc/doxygen/$$f ; fi \
+	@for f in $(chapters) ; do \
+                if test -f $(top_srcdir)/doc/doxygen/$$f ; then $(PROG_STAT) --format=%Y $(top_srcdir)/doc/doxygen/$$f ; fi \
         done | sort -r | head -1 > timestamp_html
-	if test -s timestamp_html ; then \
-		LC_ALL=C date --date=@`cat timestamp_html` +"%d %B %Y" > timestamp_html_updated ;\
-		LC_ALL=C date --date=@`cat timestamp_html` +"%B %Y" > timestamp_html_updated_month ;\
+	@if test -s timestamp_html ; then \
+		LC_ALL=C $(PROG_DATE) --date=@`cat timestamp_html` +"%d %B %Y" > timestamp_html_updated ;\
+		LC_ALL=C $(PROG_DATE) --date=@`cat timestamp_html` +"%B %Y" > timestamp_html_updated_month ;\
 	fi
-	echo "This manual documents the usage of StarPU version $(VERSION)." > $(top_srcdir)/doc/doxygen/chapters/version.html
-	if test -s timestamp_html_updated ; then \
+	@echo "This manual documents the usage of StarPU version $(VERSION)." > $(top_srcdir)/doc/doxygen/chapters/version.html
+	@if test -s timestamp_html_updated ; then \
 		echo "Its contents was last updated on "`cat timestamp_html_updated`"." >> $(top_srcdir)/doc/doxygen/chapters/version.html;\
 	else \
 		echo "Its contents was last updated on <em>unknown_date</em>." >> $(top_srcdir)/doc/doxygen/chapters/version.html;\
 	fi
-	for f in timestamp_html timestamp_html_updated timestamp_html_updated_month ; do \
+	@for f in timestamp_html timestamp_html_updated timestamp_html_updated_month ; do \
 		if test -f $$f ; then $(RM) $$f ; fi ;\
 	done
 
@@ -166,7 +176,8 @@ EXTRA_DIST	= 					\
 	chapters/tasks_size_overhead.eps		\
 	chapters/tasks_size_overhead.pdf		\
 	doxygen.cfg 					\
-	refman.tex
+	refman.tex					\
+	$(DOX_HTML_DIR)
 
 dox_inputs = $(DOX_CONFIG) 				\
 	$(chapters) 					\
@@ -213,30 +224,31 @@ dox_inputs = $(DOX_CONFIG) 				\
 	$(top_srcdir)/sc_hypervisor/include/sc_hypervisor_policy.h
 
 $(DOX_TAG): $(dox_inputs)
-	rm -fr $(DOX_HTML_DIR) $(DOX_LATEX_DIR)
-	$(DOXYGEN) $(DOX_CONFIG)
-	sed -i 's/ModuleDocumentation <\/li>/<a class="el" href="modules.html">Modules<\/a>/' html/index.html
-	sed -i 's/FileDocumentation <\/li>/<a class="el" href="files.html">Files<\/a>/' html/index.html
-        # comment for the line above: what we really want to do is to remove the line, but dy doing so, it avoids opening the interactive menu when browsing files
-	if test -f html/navtree.js ; then sed -i 's/\[ "Files", "Files.html", null \]/\[ "", "Files.html", null \]/' html/navtree.js ; fi
-	sed -i 's/.*"Files.html".*//' html/pages.html
-	if test -f latex/main.tex ; then mv latex/main.tex latex/index.tex ; fi
+	@rm -fr $(DOX_HTML_DIR) $(DOX_LATEX_DIR)
+	@$(DOXYGEN) $(DOX_CONFIG)
+	@$(SED) -i 's/ModuleDocumentation <\/li>/<a class="el" href="modules.html">Modules<\/a>/' html/index.html
+	@$(SED) -i 's/FileDocumentation <\/li>/<a class="el" href="files.html">Files<\/a>/' html/index.html
+        # comment for the line below: what we really want to do is to remove the line, but by doing so, it avoids opening the interactive menu when browsing files
+	@if test -f html/navtree.js ; then $(SED) -i 's/\[ "Files", "Files.html", null \]/\[ "", "Files.html", null \]/' html/navtree.js ; fi
+	@$(SED) -i 's/.*"Files.html".*//' html/pages.html
+	@if test -f latex/main.tex ; then mv latex/main.tex latex/index.tex ; fi
 
 dist_pdf_DATA = $(DOX_PDF)
 
 $(DOX_PDF): $(DOX_TAG) refman.tex
-	cp $(top_srcdir)/doc/doxygen/chapters/version.sty $(DOX_LATEX_DIR)
-	cp $(top_srcdir)/doc/doxygen/chapters/*pdf $(DOX_LATEX_DIR)
-	cd $(DOX_LATEX_DIR); \
-	rm -f *.aux *.toc *.idx *.ind *.ilg *.log *.out; \
-	sed -i -e 's/__env__/\\_Environment Variables!/' -e 's/\\-\\_\\-\\-\\_\\-env\\-\\_\\-\\-\\_\\-//' ExecutionConfigurationThroughEnvironmentVariables.tex ;\
-	sed -i -e 's/__configure__/\\_Configure Options!/' -e 's/\\-\\_\\-\\-\\_\\-configure\\-\\_\\-\\-\\_\\-//' CompilationConfiguration.tex ;\
-	sed -i s'/\\item Module\\-Documentation/\\item \\hyperlink{ModuleDocumentation}{Module Documentation}/' index.tex ;\
-	sed -i s'/\\item File\\-Documentation/\\item \\hyperlink{FileDocumentation}{File Documentation}/' index.tex ;\
-	$(PDFLATEX) refman.tex; \
-	$(MAKEINDEX) refman.idx;\
-	$(PDFLATEX) refman.tex; \
-	done=0; repeat=5; \
+	@cp $(top_srcdir)/doc/doxygen/chapters/version.sty $(DOX_LATEX_DIR)
+	@cp $(top_srcdir)/doc/doxygen/chapters/*pdf $(DOX_LATEX_DIR)
+	@echo $(PDFLATEX) $(DOX_LATEX_DIR)/refman.tex
+	@cd $(DOX_LATEX_DIR) ;\
+	rm -f *.aux *.toc *.idx *.ind *.ilg *.log *.out ;\
+	$(SED) -i -e 's/__env__/\\_Environment Variables!/' -e 's/\\-\\_\\-\\-\\_\\-env\\-\\_\\-\\-\\_\\-//' ExecutionConfigurationThroughEnvironmentVariables.tex ;\
+	$(SED) -i -e 's/__configure__/\\_Configure Options!/' -e 's/\\-\\_\\-\\-\\_\\-configure\\-\\_\\-\\-\\_\\-//' CompilationConfiguration.tex ;\
+	$(SED) -i s'/\\item Module\\-Documentation/\\item \\hyperlink{ModuleDocumentation}{Module Documentation}/' index.tex ;\
+	$(SED) -i s'/\\item File\\-Documentation/\\item \\hyperlink{FileDocumentation}{File Documentation}/' index.tex ;\
+	$(PDFLATEX) refman.tex > /dev/null ;\
+	$(MAKEINDEX) refman.idx > /dev/null 2>&1 ;\
+	$(PDFLATEX) refman.tex > /dev/null ;\
+	done=0; repeat=5 ;\
 	while test $$done = 0 -a $$repeat -gt 0; do \
            if $(EGREP) 'Rerun (LaTeX|to get cross-references right)' refman.log > /dev/null 2>&1; then \
 	       $(PDFLATEX) refman.tex; \
@@ -244,8 +256,8 @@ $(DOX_PDF): $(DOX_TAG) refman.tex
 	   else \
 	       done=1; \
 	   fi; \
-	done; \
-	mv refman.pdf ../$(DOX_PDF)
+	done
+	mv $(DOX_LATEX_DIR)/refman.pdf $(DOX_PDF)
 
 CLEANFILES = $(DOX_TAG) starpu_config.h \
     -r \

+ 82 - 20
doc/doxygen/chapters/00introduction.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
  * See the file version.doxy for copying conditions.
 */
@@ -91,14 +91,13 @@ fields that the application may use to give hints to the scheduler (such as
 priority levels).
 
 By default, task dependencies are inferred from data dependency (sequential
-coherence) by StarPU. The application can however disable sequential coherency
-for some data, and dependencies be expressed by hand.
+coherency) by StarPU. The application can however disable sequential coherency
+for some data, and dependencies can be specifically expressed.
 A task may be identified by a unique 64-bit number chosen by the application
 which we refer as a \b tag.
-Task dependencies can be enforced by hand either by the means of callback functions, by
+Task dependencies can be enforced either by the means of callback functions, by
 submitting other tasks, or by expressing dependencies
-between tags (which can thus correspond to tasks that have not been submitted
-yet).
+between tags (which can thus correspond to tasks that have not yet been submitted).
 
 // TODO insert illustration f(Ar, Brw, Cr) + ..
 // DSM
@@ -107,7 +106,7 @@ yet).
 
 Because StarPU schedules tasks at runtime, data transfers have to be
 done automatically and ``just-in-time'' between processing units,
-relieving the application programmer from explicit data transfers.
+relieving application programmers from explicit data transfers.
 Moreover, to avoid unnecessary transfers, StarPU keeps data
 where it was last needed, even if was modified there, and it
 allows multiple copies of the same data to reside at the same time on
@@ -134,8 +133,8 @@ A <b>memory node</b> can be either the main RAM, GPU-embedded memory or a disk m
 A \b bus is a link between memory nodes.
 
 A <b>data handle</b> keeps track of replicates of the same data (\b registered by the
-application) over various memory nodes. The data management library manages
-keeping them coherent.
+application) over various memory nodes. The data management library manages to
+keep them coherent.
 
 The \b home memory node of a data handle is the memory node from which the data
 was registered (usually the main memory node).
@@ -184,30 +183,94 @@ http://runtime.bordeaux.inria.fr/Publis/Keyword/STARPU.html.
 A good overview is available in the research report at
 http://hal.archives-ouvertes.fr/inria-00467677.
 
+\section StarPUApplications StarPU Applications
+
+You can first have a look at the chapters \ref BasicExamples and \ref AdvancedExamples.
+A tutorial is also installed in the directory <c>share/doc/starpu/tutorial/</c>.
+
+Many examples are also available in the StarPU sources in the directory
+<c>examples/</c>. Simple examples include:
+
+<dl>
+<dt> <c>incrementer/</c> </dt>
+<dd> Trivial incrementation test. </dd>
+<dt> <c>basic_examples/</c> </dt>
+<dd>
+        Simple documented Hello world and vector/scalar product (as
+        shown in \ref BasicExamples), matrix
+        product examples (as shown in \ref PerformanceModelExample), an example using the blocked matrix data
+        interface, an example using the variable data interface, and an example
+        using different formats on CPUs and GPUs.
+</dd>
+<dt> <c>matvecmult/</c></dt>
+<dd>
+    OpenCL example from NVidia, adapted to StarPU.
+</dd>
+<dt> <c>axpy/</c></dt>
+<dd>
+    AXPY CUBLAS operation adapted to StarPU.
+</dd>
+<dt> <c>fortran/</c> </dt>
+<dd>
+    Example of Fortran bindings.
+</dd>
+</dl>
+
+More advanced examples include:
+
+<dl>
+<dt><c>filters/</c></dt>
+<dd>
+    Examples using filters, as shown in \ref PartitioningData.
+</dd>
+<dt><c>lu/</c></dt>
+<dd>
+    LU matrix factorization, see for instance <c>xlu_implicit.c</c>
+</dd>
+<dt><c>cholesky/</c></dt>
+<dd>
+    Cholesky matrix factorization, see for instance <c>cholesky_implicit.c</c>.
+</dd>
+</dl>
+
 \section FurtherReading Further Reading
 
 The documentation chapters include
 
-<ol>
-<li> Part: Using StarPU
+<ul>
+<li> Part 1: StarPU Basics
 <ul>
 <li> \ref BuildingAndInstallingStarPU
 <li> \ref BasicExamples
+</ul>
+<li> Part 2: StarPU Quick Programming Guide
+<ul>
 <li> \ref AdvancedExamples
-<li> \ref HowToOptimizePerformanceWithStarPU
-<li> \ref PerformanceFeedback
-<li> \ref TipsAndTricksToKnowAbout
+<li> \ref CheckListWhenPerformanceAreNotThere
+</ul>
+<li> Part 3: StarPU Inside
+<ul>
+<li> \ref TasksInStarPU
+<li> \ref DataManagement
+<li> \ref Scheduling
+<li> \ref SchedulingContexts
+<li> \ref SchedulingContextHypervisor
+<li> \ref DebuggingTools
+<li> \ref OnlinePerformanceTools
+<li> \ref OfflinePerformanceTools
+<li> \ref FrequentlyAskedQuestions
+</ul>
+<li> Part 4: StarPU Extensions
+<ul>
 <li> \ref OutOfCore
 <li> \ref MPISupport
 <li> \ref FFTSupport
 <li> \ref MICSCCSupport
 <li> \ref cExtensions
 <li> \ref SOCLOpenclExtensions
-<li> \ref SchedulingContexts
-<li> \ref SchedulingContextHypervisor
+<li> \ref SimGridSupport
 </ul>
-</li>
-<li> Part: Inside StarPU
+<li> Part 5: StarPU Reference API
 <ul>
 <li> \ref ExecutionConfigurationThroughEnvironmentVariables
 <li> \ref CompilationConfiguration
@@ -220,8 +283,7 @@ The documentation chapters include
 <li> \ref FullSourceCodeVectorScal
 <li> \ref GNUFreeDocumentationLicense
 </ul>
-</ol>
-
+</ul>
 
 Make sure to have had a look at those too!
 

+ 59 - 19
doc/doxygen/chapters/01building.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
  * See the file version.doxy for copying conditions.
  */
@@ -18,7 +18,7 @@ available, simply type:
 $ apt-cache search starpu
 \endverbatim
 
-To install what you need, type:
+To install what you need, type for example:
 
 \verbatim
 $ sudo apt-get install libstarpu-1.2 libstarpu-dev
@@ -42,8 +42,7 @@ If <c>hwloc</c> is not available on your system, the option
 \ref without-hwloc "--without-hwloc" should be explicitely given when calling the
 <c>configure</c> script. If <c>hwloc</c> is installed with a <c>pkg-config</c> file,
 no option is required, it will be detected automatically, otherwise
-\ref with-hwloc "--with-hwloc" should be used to specify the location of
-<c>hwloc</c>.
+\ref with-hwloc "--with-hwloc" should be used to specify its location.
 
 \subsection GettingSources Getting Sources
 
@@ -78,7 +77,7 @@ $ svn checkout svn://scm.gforge.inria.fr/svn/starpu/trunk StarPU
 Running <c>autogen.sh</c> is not necessary when using the tarball
 releases of StarPU.  If you are using the source code from the svn
 repository, you first need to generate the configure scripts and the
-Makefiles. This requires the availability of <c>autoconf</c>,
+Makefiles. This requires the availability of <c>autoconf</c> and
 <c>automake</c> >= 2.60.
 
 \verbatim
@@ -97,7 +96,7 @@ make sure to post the content of <c>config.log</c> when reporting the issue.
 
 By default, the files produced during the compilation are placed in
 the source directory. As the compilation generates a lot of files, it
-is advised to to put them all in a separate directory. It is then
+is advised to put them all in a separate directory. It is then
 easier to cleanup, and this allows to compile several configurations
 out of the same source tree. For that, simply enter the directory
 where you want the compilation to produce its files, and invoke the
@@ -141,19 +140,18 @@ libraries names (<c>libstarpu-1.2.so</c>, <c>libstarpumpi-1.2.so</c> and
 
 \subsection SettingFlagsForCompilingLinkingAndRunningApplications Setting Flags for Compiling, Linking and Running Applications
 
-StarPU provides a pkg-config executable to obtain relevant compiler
-and linker flags.
-Compiling and linking an application against StarPU may require to use
-specific flags or libraries (for instance <c>CUDA</c> or <c>libspe2</c>).
-To this end, it is possible to use the tool <c>pkg-config</c>.
+StarPU provides a <c>pkg-config</c> executable to obtain relevant compiler
+and linker flags, as compiling and linking an application against
+StarPU may require the use of specific flags or libraries (for instance
+<c>CUDA</c> or <c>libspe2</c>).
 
 If StarPU was not installed at some standard location, the path of StarPU's
 library must be specified in the environment variable <c>PKG_CONFIG_PATH</c> so
 that <c>pkg-config</c> can find it. For example if StarPU was installed in
-<c>$prefix_dir</c>:
+<c>$STARPU_PATH</c>:
 
 \verbatim
-$ PKG_CONFIG_PATH=$PKG_CONFIG_PATH:$prefix_dir/lib/pkgconfig
+$ PKG_CONFIG_PATH=$PKG_CONFIG_PATH:$STARPU_PATH/lib/pkgconfig
 \endverbatim
 
 The flags required to compile or link against StarPU are then
@@ -182,7 +180,7 @@ It is also necessary to set the environment variable <c>LD_LIBRARY_PATH</c> to
 locate dynamic libraries at runtime.
 
 \verbatim
-$ LD_LIBRARY_PATH=$prefix_dir/lib:$LD_LIBRARY_PATH
+$ LD_LIBRARY_PATH=$STARPU_PATH/lib:$LD_LIBRARY_PATH
 \endverbatim
 
 When using a Makefile, the following lines can be added to set the
@@ -197,7 +195,7 @@ LDFLAGS         +=      $$(pkg-config --libs starpu-1.2)
 
 Basic examples using StarPU are built in the directory
 <c>examples/basic_examples/</c> (and installed in
-<c>$prefix_dir/lib/starpu/examples/</c>). You can for example run the example
+<c>$STARPU_PATH/lib/starpu/examples/</c>). You can for example run the example
 <c>vector_scal</c>.
 
 \verbatim
@@ -214,6 +212,47 @@ Please note that buses are benchmarked when StarPU is launched for the
 first time. This may take a few minutes, or less if <c>hwloc</c> is
 installed. This step is done only once per user and per machine.
 
+\subsection RunningABasicStarPUApplicationOnMicrosoft Running a Basic StarPU Application on Microsoft Visual C
+
+Batch files are provided to run StarPU applications under Microsoft
+Visual C. They are installed in <c>$STARPU_PATH/bin/msvc</c>.
+
+To execute a StarPU application, you first need to set the environment
+variable <c>STARPU_PATH</c>.
+
+\verbatim
+c:\....> cd c:\cygwin\home\ci\starpu\
+c:\....> set STARPU_PATH=c:\cygwin\home\ci\starpu\
+c:\....> cd bin\msvc
+c:\....> starpu_open.bat starpu_simple.c
+\endverbatim
+
+The batch script will run Microsoft Visual C with a basic project file
+to run the given application.
+
+The batch script <c>starpu_clean.bat</c> can be used to delete all
+compilation generated files.
+
+The batch script <c>starpu_exec.bat</c> can be used to compile and execute a
+StarPU application from the command prompt.
+
+\verbatim
+c:\....> cd c:\cygwin\home\ci\starpu\
+c:\....> set STARPU_PATH=c:\cygwin\home\ci\starpu\
+c:\....> cd bin\msvc
+c:\....> starpu_exec.bat ..\..\..\..\examples\basic_examples\hello_world.c
+\endverbatim
+
+\verbatim
+MSVC StarPU Execution
+...
+/out:hello_world.exe
+...
+Hello world (params = {1, 2.00000})
+Callback function got argument 0000042
+c:\....>
+\endverbatim
+
 \subsection KernelThreadsStartedByStarPU Kernel Threads Started by StarPU
 
 StarPU automatically binds one thread per CPU core. It does not use
@@ -261,13 +300,14 @@ $ STARPU_NCUDA=2 ./application
 \section BenchmarkingStarPU Benchmarking StarPU
 
 Some interesting benchmarks are installed among examples in
-<c>$prefix_dir/lib/starpu/examples/</c>. Make sure to try various
+<c>$STARPU_PATH/lib/starpu/examples/</c>. Make sure to try various
 schedulers, for instance <c>STARPU_SCHED=dmda</c>.
 
 \subsection TaskSizeOverhead Task Size Overhead
 
 This benchmark gives a glimpse into how long a task should be (in µs) for StarPU overhead
-to be low enough to keep efficiency.  Run <c>tasks_size_overhead.sh</c>, it will generate a plot
+to be low enough to keep efficiency.  Running
+<c>tasks_size_overhead.sh</c> generates a plot
 of the speedup of tasks of various sizes, depending on the number of CPUs being
 used.
 
@@ -286,10 +326,10 @@ multiplication using BLAS and cuBLAS. They output the obtained GFlops.
 
 \subsection CholeskyFactorization Cholesky Factorization
 
-<c>cholesky/*</c> perform a Cholesky factorization (single precision). They use different dependency primitives.
+<c>cholesky_*</c> perform a Cholesky factorization (single precision). They use different dependency primitives.
 
 \subsection LUFactorization LU Factorization
 
-<c>lu/*</c> perform an LU factorization. They use different dependency primitives.
+<c>lu_*</c> perform an LU factorization. They use different dependency primitives.
 
 */

File diff suppressed because it is too large
+ 2 - 1234
doc/doxygen/chapters/03advanced_examples.doxy


+ 0 - 552
doc/doxygen/chapters/04optimize_performance.doxy

@@ -1,552 +0,0 @@
-/*
- * This file is part of the StarPU Handbook.
- * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
- * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
- * See the file version.doxy for copying conditions.
- */
-
-/*! \page HowToOptimizePerformanceWithStarPU How To Optimize Performance With StarPU
-
-TODO: improve!
-
-Simply encapsulating application kernels into tasks already permits to
-seamlessly support CPU and GPUs at the same time. To achieve good performance, a
-few additional changes are needed.
-
-\section DataManagement Data Management
-
-When the application allocates data, whenever possible it should use
-the function starpu_malloc(), which will ask CUDA or OpenCL to make
-the allocation itself and pin the corresponding allocated memory. This
-is needed to permit asynchronous data transfer, i.e. permit data
-transfer to overlap with computations. Otherwise, the trace will show
-that the <c>DriverCopyAsync</c> state takes a lot of time, this is
-because CUDA or OpenCL then reverts to synchronous transfers.
-
-By default, StarPU leaves replicates of data wherever they were used, in case they
-will be re-used by other tasks, thus saving the data transfer time. When some
-task modifies some data, all the other replicates are invalidated, and only the
-processing unit which ran that task will have a valid replicate of the data. If the application knows
-that this data will not be re-used by further tasks, it should advise StarPU to
-immediately replicate it to a desired list of memory nodes (given through a
-bitmask). This can be understood like the write-through mode of CPU caches.
-
-\code{.c}
-starpu_data_set_wt_mask(img_handle, 1<<0);
-\endcode
-
-will for instance request to always automatically transfer a replicate into the
-main memory (node <c>0</c>), as bit <c>0</c> of the write-through bitmask is being set.
-
-\code{.c}
-starpu_data_set_wt_mask(img_handle, ~0U);
-\endcode
-
-will request to always automatically broadcast the updated data to all memory
-nodes.
-
-Setting the write-through mask to <c>~0U</c> can also be useful to make sure all
-memory nodes always have a copy of the data, so that it is never evicted when
-memory gets scarse.
-
-Implicit data dependency computation can become expensive if a lot
-of tasks access the same piece of data. If no dependency is required
-on some piece of data (e.g. because it is only accessed in read-only
-mode, or because write accesses are actually commutative), use the
-function starpu_data_set_sequential_consistency_flag() to disable
-implicit dependencies on that data.
-
-In the same vein, accumulation of results in the same data can become a
-bottleneck. The use of the mode ::STARPU_REDUX permits to optimize such
-accumulation (see \ref DataReduction). To a lesser extent, the use of
-the flag ::STARPU_COMMUTE keeps the bottleneck, but at least permits
-the accumulation to happen in any order.
-
-Applications often need a data just for temporary results.  In such a case,
-registration can be made without an initial value, for instance this produces a vector data:
-
-\code{.c}
-starpu_vector_data_register(&handle, -1, 0, n, sizeof(float));
-\endcode
-
-StarPU will then allocate the actual buffer only when it is actually needed,
-e.g. directly on the GPU without allocating in main memory.
-
-In the same vein, once the temporary results are not useful any more, the
-data should be thrown away. If the handle is not to be reused, it can be
-unregistered:
-
-\code{.c}
-starpu_unregister_submit(handle);
-\endcode
-
-actual unregistration will be done after all tasks working on the handle
-terminate.
-
-If the handle is to be reused, instead of unregistering it, it can simply be invalidated:
-
-\code{.c}
-starpu_invalidate_submit(handle);
-\endcode
-
-the buffers containing the current value will then be freed, and reallocated
-only when another task writes some value to the handle.
-
-\section TaskGranularity Task Granularity
-
-Like any other runtime, StarPU has some overhead to manage tasks. Since
-it does smart scheduling and data management, that overhead is not always
-neglectable. The order of magnitude of the overhead is typically a couple of
-microseconds, which is actually quite smaller than the CUDA overhead itself. The
-amount of work that a task should do should thus be somewhat
-bigger, to make sure that the overhead becomes neglectible. The offline
-performance feedback can provide a measure of task length, which should thus be
-checked if bad performance are observed. To get a grasp at the scalability
-possibility according to task size, one can run
-<c>tests/microbenchs/tasks_size_overhead.sh</c> which draws curves of the
-speedup of independent tasks of very small sizes.
-
-The choice of scheduler also has impact over the overhead: for instance, the
- scheduler <c>dmda</c> takes time to make a decision, while <c>eager</c> does
-not. <c>tasks_size_overhead.sh</c> can again be used to get a grasp at how much
-impact that has on the target machine.
-
-\section TaskSubmission Task Submission
-
-To let StarPU make online optimizations, tasks should be submitted
-asynchronously as much as possible. Ideally, all the tasks should be
-submitted, and mere calls to starpu_task_wait_for_all() or
-starpu_data_unregister() be done to wait for
-termination. StarPU will then be able to rework the whole schedule, overlap
-computation with communication, manage accelerator local memory usage, etc.
-
-\section TaskPriorities Task Priorities
-
-By default, StarPU will consider the tasks in the order they are submitted by
-the application. If the application programmer knows that some tasks should
-be performed in priority (for instance because their output is needed by many
-other tasks and may thus be a bottleneck if not executed early
-enough), the field starpu_task::priority should be set to transmit the
-priority information to StarPU.
-
-\section TaskSchedulingPolicy Task Scheduling Policy
-
-By default, StarPU uses the simple greedy scheduler <c>eager</c>. This is
-because it provides correct load balance even if the application codelets do not
-have performance models. If your application codelets have performance models
-(\ref PerformanceModelExample), you should change the scheduler thanks
-to the environment variable \ref STARPU_SCHED. For instance <c>export
-STARPU_SCHED=dmda</c> . Use <c>help</c> to get the list of available schedulers.
-
-The <b>eager</b> scheduler uses a central task queue, from which workers draw tasks
-to work on. This however does not permit to prefetch data since the scheduling
-decision is taken late. If a task has a non-0 priority, it is put at the front of the queue.
-
-The <b>prio</b> scheduler also uses a central task queue, but sorts tasks by
-priority (between -5 and 5).
-
-The <b>random</b> scheduler distributes tasks randomly according to assumed worker
-overall performance.
-
-The <b>ws</b> (work stealing) scheduler schedules tasks on the local worker by
-default. When a worker becomes idle, it steals a task from the most loaded
-worker.
-
-The <b>dm</b> (deque model) scheduler uses task execution performance models into account to
-perform an HEFT-similar scheduling strategy: it schedules tasks where their
-termination time will be minimal.
-
-The <b>dmda</b> (deque model data aware) scheduler is similar to dm, it also takes
-into account data transfer time.
-
-The <b>dmdar</b> (deque model data aware ready) scheduler is similar to dmda,
-it also sorts tasks on per-worker queues by number of already-available data
-buffers.
-
-The <b>dmdas</b> (deque model data aware sorted) scheduler is similar to dmda, it
-also supports arbitrary priority values.
-
-The <b>heft</b> (heterogeneous earliest finish time) scheduler is deprecated. It
-is now just an alias for <b>dmda</b>.
-
-The <b>pheft</b> (parallel HEFT) scheduler is similar to heft, it also supports
-parallel tasks (still experimental). Should not be used when several contexts using
-it are being executed simultaneously.
-
-The <b>peager</b> (parallel eager) scheduler is similar to eager, it also
-supports parallel tasks (still experimental). Should not be used when several 
-contexts using it are being executed simultaneously.
-
-
-\section PerformanceModelCalibration Performance Model Calibration
-
-Most schedulers are based on an estimation of codelet duration on each kind
-of processing unit. For this to be possible, the application programmer needs
-to configure a performance model for the codelets of the application (see
-\ref PerformanceModelExample for instance). History-based performance models
-use on-line calibration.  StarPU will automatically calibrate codelets
-which have never been calibrated yet, and save the result in
-<c>$STARPU_HOME/.starpu/sampling/codelets</c>.
-The models are indexed by machine name. To share the models between
-machines (e.g. for a homogeneous cluster), use <c>export
-STARPU_HOSTNAME=some_global_name</c>. To force continuing calibration,
-use <c>export STARPU_CALIBRATE=1</c> . This may be necessary if your application
-has not-so-stable performance. StarPU will force calibration (and thus ignore
-the current result) until 10 (<c>_STARPU_CALIBRATION_MINIMUM</c>) measurements have been
-made on each architecture, to avoid badly scheduling tasks just because the
-first measurements were not so good. Details on the current performance model status
-can be obtained from the command <c>starpu_perfmodel_display</c>: the <c>-l</c>
-option lists the available performance models, and the <c>-s</c> option permits
-to choose the performance model to be displayed. The result looks like:
-
-\verbatim
-$ starpu_perfmodel_display -s starpu_slu_lu_model_11
-performance model for cpu_impl_0
-# hash    size     flops         mean          dev           n
-914f3bef  1048576  0.000000e+00  2.503577e+04  1.982465e+02  8
-3e921964  65536    0.000000e+00  5.527003e+02  1.848114e+01  7
-e5a07e31  4096     0.000000e+00  1.717457e+01  5.190038e+00  14
-...
-\endverbatim
-
-Which shows that for the LU 11 kernel with a 1MiB matrix, the average
-execution time on CPUs was about 25ms, with a 0.2ms standard deviation, over
-8 samples. It is a good idea to check this before doing actual performance
-measurements.
-
-A graph can be drawn by using the tool <c>starpu_perfmodel_plot</c>:
-
-\verbatim
-$ starpu_perfmodel_plot -s starpu_slu_lu_model_11
-4096 16384 65536 262144 1048576 4194304 
-$ gnuplot starpu_starpu_slu_lu_model_11.gp
-$ gv starpu_starpu_slu_lu_model_11.eps
-\endverbatim
-
-\image html starpu_starpu_slu_lu_model_11.png
-\image latex starpu_starpu_slu_lu_model_11.eps "" width=\textwidth
-
-If a kernel source code was modified (e.g. performance improvement), the
-calibration information is stale and should be dropped, to re-calibrate from
-start. This can be done by using <c>export STARPU_CALIBRATE=2</c>.
-
-Note: due to CUDA limitations, to be able to measure kernel duration,
-calibration mode needs to disable asynchronous data transfers. Calibration thus
-disables data transfer / computation overlapping, and should thus not be used
-for eventual benchmarks. Note 2: history-based performance models get calibrated
-only if a performance-model-based scheduler is chosen.
-
-The history-based performance models can also be explicitly filled by the
-application without execution, if e.g. the application already has a series of
-measurements. This can be done by using starpu_perfmodel_update_history(),
-for instance:
-
-\code{.c}
-static struct starpu_perfmodel perf_model = {
-    .type = STARPU_HISTORY_BASED,
-    .symbol = "my_perfmodel",
-};
-
-struct starpu_codelet cl = {
-    .where = STARPU_CUDA,
-    .cuda_funcs = { cuda_func1, cuda_func2, NULL },
-    .nbuffers = 1,
-    .modes = {STARPU_W},
-    .model = &perf_model
-};
-
-void feed(void) {
-    struct my_measure *measure;
-    struct starpu_task task;
-    starpu_task_init(&task);
-
-    task.cl = &cl;
-
-    for (measure = &measures[0]; measure < measures[last]; measure++) {
-        starpu_data_handle_t handle;
-	starpu_vector_data_register(&handle, -1, 0, measure->size, sizeof(float));
-	task.handles[0] = handle;
-	starpu_perfmodel_update_history(&perf_model, &task,
-	                                STARPU_CUDA_DEFAULT + measure->cudadev, 0,
-	                                measure->implementation, measure->time);
-	starpu_task_clean(&task);
-	starpu_data_unregister(handle);
-    }
-}
-\endcode
-
-Measurement has to be provided in milliseconds for the completion time models,
-and in Joules for the energy consumption models.
-
-\section TaskDistributionVsDataTransfer Task Distribution Vs Data Transfer
-
-Distributing tasks to balance the load induces data transfer penalty. StarPU
-thus needs to find a balance between both. The target function that the
-scheduler <c>dmda</c> of StarPU
-tries to minimize is <c>alpha * T_execution + beta * T_data_transfer</c>, where
-<c>T_execution</c> is the estimated execution time of the codelet (usually
-accurate), and <c>T_data_transfer</c> is the estimated data transfer time. The
-latter is estimated based on bus calibration before execution start,
-i.e. with an idle machine, thus without contention. You can force bus
-re-calibration by running the tool <c>starpu_calibrate_bus</c>. The
-beta parameter defaults to <c>1</c>, but it can be worth trying to tweak it
-by using <c>export STARPU_SCHED_BETA=2</c> for instance, since during
-real application execution, contention makes transfer times bigger.
-This is of course imprecise, but in practice, a rough estimation
-already gives the good results that a precise estimation would give.
-
-\section DataPrefetch Data Prefetch
-
-The scheduling policies <c>heft</c>, <c>dmda</c> and <c>pheft</c>
-perform data prefetch (see \ref STARPU_PREFETCH):
-as soon as a scheduling decision is taken for a task, requests are issued to
-transfer its required data to the target processing unit, if needed, so that
-when the processing unit actually starts the task, its data will hopefully be
-already available and it will not have to wait for the transfer to finish.
-
-The application may want to perform some manual prefetching, for several reasons
-such as excluding initial data transfers from performance measurements, or
-setting up an initial statically-computed data distribution on the machine
-before submitting tasks, which will thus guide StarPU toward an initial task
-distribution (since StarPU will try to avoid further transfers).
-
-This can be achieved by giving the function starpu_data_prefetch_on_node()
-the handle and the desired target memory node.
-
-\section Power-basedScheduling Power-based Scheduling
-
-If the application can provide some power performance model (through
-the field starpu_codelet::power_model), StarPU will
-take it into account when distributing tasks. The target function that
-the scheduler <c>dmda</c> minimizes becomes <c>alpha * T_execution +
-beta * T_data_transfer + gamma * Consumption</c> , where <c>Consumption</c>
-is the estimated task consumption in Joules. To tune this parameter, use
-<c>export STARPU_SCHED_GAMMA=3000</c> for instance, to express that each Joule
-(i.e kW during 1000us) is worth 3000us execution time penalty. Setting
-<c>alpha</c> and <c>beta</c> to zero permits to only take into account power consumption.
-
-This is however not sufficient to correctly optimize power: the scheduler would
-simply tend to run all computations on the most energy-conservative processing
-unit. To account for the consumption of the whole machine (including idle
-processing units), the idle power of the machine should be given by setting
-<c>export STARPU_IDLE_POWER=200</c> for 200W, for instance. This value can often
-be obtained from the machine power supplier.
-
-The power actually consumed by the total execution can be displayed by setting
-<c>export STARPU_PROFILING=1 STARPU_WORKER_STATS=1</c> .
-
-On-line task consumption measurement is currently only supported through the
-<c>CL_PROFILING_POWER_CONSUMED</c> OpenCL extension, implemented in the MoviSim
-simulator. Applications can however provide explicit measurements by
-using the function starpu_perfmodel_update_history() (examplified in \ref PerformanceModelExample
-with the <c>power_model</c> performance model). Fine-grain
-measurement is often not feasible with the feedback provided by the hardware, so
-the user can for instance run a given task a thousand times, measure the global
-consumption for that series of tasks, divide it by a thousand, repeat for
-varying kinds of tasks and task sizes, and eventually feed StarPU
-with these manual measurements through starpu_perfmodel_update_history().
-
-\section StaticScheduling Static Scheduling
-
-In some cases, one may want to force some scheduling, for instance force a given
-set of tasks to GPU0, another set to GPU1, etc. while letting some other tasks
-be scheduled on any other device. This can indeed be useful to guide StarPU into
-some work distribution, while still letting some degree of dynamism. For
-instance, to force execution of a task on CUDA0:
-
-\code{.c}
-task->execute_on_a_specific_worker = 1;
-task->worker = starpu_worker_get_by_type(STARPU_CUDA_WORKER, 0);
-\endcode
-
-Note however that using scheduling contexts while statically scheduling tasks on workers
-could be tricky. Be careful to schedule the tasks exactly on the workers of the corresponding
-contexts, otherwise the workers' corresponding scheduling structures may not be allocated or
-the execution of the application may deadlock. Moreover, the hypervisor should not be used when
-statically scheduling tasks.
-
-\section Profiling Profiling
-
-A quick view of how many tasks each worker has executed can be obtained by setting
-<c>export STARPU_WORKER_STATS=1</c> This is a convenient way to check that
-execution did happen on accelerators without penalizing performance with
-the profiling overhead.
-
-A quick view of how much data transfers have been issued can be obtained by setting
-<c>export STARPU_BUS_STATS=1</c> .
-
-More detailed profiling information can be enabled by using <c>export STARPU_PROFILING=1</c> or by
-calling starpu_profiling_status_set() from the source code.
-Statistics on the execution can then be obtained by using <c>export
-STARPU_BUS_STATS=1</c> and <c>export STARPU_WORKER_STATS=1</c> .
- More details on performance feedback are provided by the next chapter.
-
-\section DetectionStuckConditions Detection Stuck Conditions
-
-It may happen that for some reason, StarPU does not make progress for a long
-period of time.  Reason are sometimes due to contention inside StarPU, but
-sometimes this is due to external reasons, such as stuck MPI driver, or CUDA
-driver, etc.
-
-<c>export STARPU_WATCHDOG_TIMEOUT=10000</c>
-
-allows to make StarPU print an error message whenever StarPU does not terminate
-any task for 10ms. In addition to that,
-
-<c>export STARPU_WATCHDOG_CRASH=1</c>
-
-triggers a crash in that condition, thus allowing to catch the situation in gdb
-etc.
-
-\section CUDA-specificOptimizations CUDA-specific Optimizations
-
-Due to CUDA limitations, StarPU will have a hard time overlapping its own
-communications and the codelet computations if the application does not use a
-dedicated CUDA stream for its computations instead of the default stream,
-which synchronizes all operations of the GPU. StarPU provides one by the use
-of starpu_cuda_get_local_stream() which can be used by all CUDA codelet
-operations to avoid this issue. For instance:
-
-\code{.c}
-func <<<grid,block,0,starpu_cuda_get_local_stream()>>> (foo, bar);
-cudaStreamSynchronize(starpu_cuda_get_local_stream());
-\endcode
-
-StarPU already does appropriate calls for the CUBLAS library.
-
-Unfortunately, some CUDA libraries do not have stream variants of
-kernels. That will lower the potential for overlapping.
-
-\section PerformanceDebugging Performance Debugging
-
-To get an idea of what is happening, a lot of performance feedback is available,
-detailed in the next chapter. The various informations should be checked for.
-
-<ul>
-<li>
-What does the Gantt diagram look like? (see \ref CreatingAGanttDiagram)
-<ul>
-  <li> If it's mostly green (tasks running in the initial context) or context specific
-  color prevailing, then the machine is properly
-  utilized, and perhaps the codelets are just slow. Check their performance, see
-  \ref PerformanceOfCodelets.
-  </li>
-  <li> If it's mostly purple (FetchingInput), tasks keep waiting for data
-  transfers, do you perhaps have far more communication than computation? Did
-  you properly use CUDA streams to make sure communication can be
-  overlapped? Did you use data-locality aware schedulers to avoid transfers as
-  much as possible?
-  </li>
-  <li> If it's mostly red (Blocked), tasks keep waiting for dependencies,
-  do you have enough parallelism? It might be a good idea to check what the DAG
-  looks like (see \ref CreatingADAGWithGraphviz).
-  </li>
-  <li> If only some workers are completely red (Blocked), for some reason the
-  scheduler didn't assign tasks to them. Perhaps the performance model is bogus,
-  check it (see \ref PerformanceOfCodelets). Do all your codelets have a
-  performance model?  When some of them don't, the schedulers switches to a
-  greedy algorithm which thus performs badly.
-  </li>
-</ul>
-</li>
-</ul>
-
-You can also use the Temanejo task debugger (see \ref UsingTheTemanejoTaskDebugger) to
-visualize the task graph more easily.
-
-\section SimulatedPerformance Simulated Performance
-
-StarPU can use Simgrid in order to simulate execution on an arbitrary
-platform.
-
-\subsection Calibration Calibration
-
-The idea is to first compile StarPU normally, and run the application,
-so as to automatically benchmark the bus and the codelets.
-
-\verbatim
-$ ./configure && make
-$ STARPU_SCHED=dmda ./examples/matvecmult/matvecmult
-[starpu][_starpu_load_history_based_model] Warning: model matvecmult
-   is not calibrated, forcing calibration for this run. Use the
-   STARPU_CALIBRATE environment variable to control this.
-$ ...
-$ STARPU_SCHED=dmda ./examples/matvecmult/matvecmult
-TEST PASSED
-\endverbatim
-
-Note that we force to use the scheduler <c>dmda</c> to generate
-performance models for the application. The application may need to be
-run several times before the model is calibrated.
-
-\subsection Simulation Simulation
-
-Then, recompile StarPU, passing \ref enable-simgrid "--enable-simgrid"
-to <c>./configure</c>, and re-run the application:
-
-\verbatim
-$ ./configure --enable-simgrid && make
-$ STARPU_SCHED=dmda ./examples/matvecmult/matvecmult
-TEST FAILED !!!
-\endverbatim
-
-It is normal that the test fails: since the computation are not actually done
-(that is the whole point of simgrid), the result is wrong, of course.
-
-If the performance model is not calibrated enough, the following error
-message will be displayed
-
-\verbatim
-$ STARPU_SCHED=dmda ./examples/matvecmult/matvecmult
-[starpu][_starpu_load_history_based_model] Warning: model matvecmult
-    is not calibrated, forcing calibration for this run. Use the
-    STARPU_CALIBRATE environment variable to control this.
-[starpu][_starpu_simgrid_execute_job][assert failure] Codelet
-    matvecmult does not have a perfmodel, or is not calibrated enough
-\endverbatim
-
-The number of devices can be chosen as usual with \ref STARPU_NCPU,
-\ref STARPU_NCUDA, and \ref STARPU_NOPENCL.  For now, only the number of
-cpus can be arbitrarily chosen. The number of CUDA and OpenCL devices have to be
-lower than the real number on the current machine.
-
-The amount of simulated GPU memory is for now unbound by default, but
-it can be chosen by hand through the \ref STARPU_LIMIT_CUDA_MEM,
-\ref STARPU_LIMIT_CUDA_devid_MEM, \ref STARPU_LIMIT_OPENCL_MEM, and
-\ref STARPU_LIMIT_OPENCL_devid_MEM environment variables.
-
-The Simgrid default stack size is small; to increase it use the
-parameter <c>--cfg=contexts/stack_size</c>, for example:
-
-\verbatim
-$ ./example --cfg=contexts/stack_size:8192
-TEST FAILED !!!
-\endverbatim
-
-Note: of course, if the application uses <c>gettimeofday</c> to make its
-performance measurements, the real time will be used, which will be bogus. To
-get the simulated time, it has to use starpu_timing_now() which returns the
-virtual timestamp in ms.
-
-\subsection SimulationOnAnotherMachine Simulation On Another Machine
-
-The simgrid support even permits to perform simulations on another machine, your
-desktop, typically. To achieve this, one still needs to perform the Calibration
-step on the actual machine to be simulated, then copy them to your desktop
-machine (the <c>$STARPU_HOME/.starpu</c> directory). One can then perform the
-Simulation step on the desktop machine, by setting the environment
-variable \ref STARPU_HOSTNAME to the name of the actual machine, to
-make StarPU use the performance models of the simulated machine even
-on the desktop machine.
-
-If the desktop machine does not have CUDA or OpenCL, StarPU is still able to
-use simgrid to simulate execution with CUDA/OpenCL devices, but the application
-source code will probably disable the CUDA and OpenCL codelets in thatcd sc
-case. Since during simgrid execution, the functions of the codelet are actually
-not called, one can use dummy functions such as the following to still permit
-CUDA or OpenCL execution:
-
-\snippet simgrid.c To be included. You should update doxygen if you see this text.
-
-*/

+ 226 - 0
doc/doxygen/chapters/05check_list_performance.doxy

@@ -0,0 +1,226 @@
+/*
+ * This file is part of the StarPU Handbook.
+ * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
+ * See the file version.doxy for copying conditions.
+ */
+
+/*! \page CheckListWhenPerformanceAreNotThere Check List When Performance Are Not There
+
+TODO: improve!
+
+Simply encapsulating application kernels into tasks already permits to
+seamlessly support CPU and GPUs at the same time. To achieve good
+performance, we give below a list of features which should be checked.
+
+\section DataRelatedFeaturesToImprovePerformance Data Related Features That May Improve Performance
+
+link to \ref DataManagement
+
+link to \ref DataPrefetch
+
+\section TaskRelatedFeaturesToImprovePerformance Task Related Features That May Improve Performance
+
+link to \ref TaskGranularity
+
+link to \ref TaskSubmission
+
+link to \ref TaskPriorities
+
+\section SchedulingRelatedFeaturesToImprovePerformance Scheduling Related Features That May Improve Performance
+
+link to \ref TaskSchedulingPolicy
+
+link to \ref TaskDistributionVsDataTransfer
+
+link to \ref Power-basedScheduling
+
+link to \ref StaticScheduling
+
+\section CUDA-specificOptimizations CUDA-specific Optimizations
+
+Due to CUDA limitations, StarPU will have a hard time overlapping its own
+communications and the codelet computations if the application does not use a
+dedicated CUDA stream for its computations instead of the default stream,
+which synchronizes all operations of the GPU. StarPU provides one by the use
+of starpu_cuda_get_local_stream() which can be used by all CUDA codelet
+operations to avoid this issue. For instance:
+
+\code{.c}
+func <<<grid,block,0,starpu_cuda_get_local_stream()>>> (foo, bar);
+cudaStreamSynchronize(starpu_cuda_get_local_stream());
+\endcode
+
+StarPU already does appropriate calls for the CUBLAS library.
+
+If the kernel can be made to only use this local stream or other self-allocated
+streams, i.e. the whole kernel submission can be made asynchronous, then
+one should enable asynchronous execution of the kernel. This means setting
+the corresponding cuda_flags[] flag in the codelet and dropping the
+cudaStreamSynchronize() call at the end of the kernel. That way, StarPU will be
+able to pipeline submitting tasks to GPUs, instead of synchronizing at each
+kernel submission. The kernel just has to make sure that StarPU can use the
+local stream to synchronize with the kernel startup and completion.
+
+Unfortunately, some CUDA libraries do not have stream variants of
+kernels. That will lower the potential for overlapping.
+
+\section OpenCL-specificOptimizations OpenCL-specific Optimizations
+
+If the kernel can be made to only use the StarPU-provided command queue or other self-allocated
+streams, i.e. the whole kernel submission can be made asynchronous, then
+one should enable asynchronous execution of the kernel. This means setting
+the corresponding opencl_flags[] flag in the codelet and dropping the
+clFinish() and starpu_opencl_collect_stats() calls at the end of the kernel.
+That way, StarPU will be able to pipeline submitting tasks to GPUs, instead of
+synchronizing at each kernel submission. The kernel just has to make sure
+that StarPU can use the command queue it has provided to synchronize with the
+kernel startup and completion.
+
+\section DetectionStuckConditions Detection Stuck Conditions
+
+It may happen that for some reason, StarPU does not make progress for a long
+period of time.  Reasons are sometimes due to contention inside StarPU, but
+sometimes this is due to external reasons, such as stuck MPI driver, or CUDA
+driver, etc.
+
+<c>export STARPU_WATCHDOG_TIMEOUT=10000</c>
+
+allows to make StarPU print an error message whenever StarPU does not terminate
+any task for 10ms. In addition to that,
+
+<c>export STARPU_WATCHDOG_CRASH=1</c>
+
+triggers a crash in that condition, thus allowing to catch the situation in gdb
+etc.
+
+\section HowToLimitMemoryPerNode How to limit memory per node
+
+TODO
+
+Talk about
+\ref STARPU_LIMIT_CUDA_devid_MEM, \ref STARPU_LIMIT_CUDA_MEM,
+\ref STARPU_LIMIT_OPENCL_devid_MEM, \ref STARPU_LIMIT_OPENCL_MEM
+and \ref STARPU_LIMIT_CPU_MEM
+
+starpu_memory_get_total()
+
+starpu_memory_get_available()
+
+\section PerformanceModelCalibration Performance Model Calibration
+
+Most schedulers are based on an estimation of codelet duration on each kind
+of processing unit. For this to be possible, the application programmer needs
+to configure a performance model for the codelets of the application (see
+\ref PerformanceModelExample for instance). History-based performance models
+use on-line calibration.  StarPU will automatically calibrate codelets
+which have never been calibrated yet, and save the result in
+<c>$STARPU_HOME/.starpu/sampling/codelets</c>.
+The models are indexed by machine name. To share the models between
+machines (e.g. for a homogeneous cluster), use <c>export
+STARPU_HOSTNAME=some_global_name</c>. To force continuing calibration,
+use <c>export STARPU_CALIBRATE=1</c> . This may be necessary if your application
+has not-so-stable performance. StarPU will force calibration (and thus ignore
+the current result) until 10 (<c>_STARPU_CALIBRATION_MINIMUM</c>) measurements have been
+made on each architecture, to avoid badly scheduling tasks just because the
+first measurements were not so good. Details on the current performance model status
+can be obtained from the command <c>starpu_perfmodel_display</c>: the <c>-l</c>
+option lists the available performance models, and the <c>-s</c> option permits
+to choose the performance model to be displayed. The result looks like:
+
+\verbatim
+$ starpu_perfmodel_display -s starpu_slu_lu_model_11
+performance model for cpu_impl_0
+# hash    size     flops         mean          dev           n
+914f3bef  1048576  0.000000e+00  2.503577e+04  1.982465e+02  8
+3e921964  65536    0.000000e+00  5.527003e+02  1.848114e+01  7
+e5a07e31  4096     0.000000e+00  1.717457e+01  5.190038e+00  14
+...
+\endverbatim
+
+Which shows that for the LU 11 kernel with a 1MiB matrix, the average
+execution time on CPUs was about 25ms, with a 0.2ms standard deviation, over
+8 samples. It is a good idea to check this before doing actual performance
+measurements.
+
+A graph can be drawn by using the tool <c>starpu_perfmodel_plot</c>:
+
+\verbatim
+$ starpu_perfmodel_plot -s starpu_slu_lu_model_11
+4096 16384 65536 262144 1048576 4194304 
+$ gnuplot starpu_starpu_slu_lu_model_11.gp
+$ gv starpu_starpu_slu_lu_model_11.eps
+\endverbatim
+
+\image html starpu_starpu_slu_lu_model_11.png
+\image latex starpu_starpu_slu_lu_model_11.eps "" width=\textwidth
+
+If a kernel source code was modified (e.g. performance improvement), the
+calibration information is stale and should be dropped, to re-calibrate from
+start. This can be done by using <c>export STARPU_CALIBRATE=2</c>.
+
+Note: due to CUDA limitations, to be able to measure kernel duration,
+calibration mode needs to disable asynchronous data transfers. Calibration thus
+disables data transfer / computation overlapping, and should thus not be used
+for final benchmarks. Note 2: history-based performance models get calibrated
+only if a performance-model-based scheduler is chosen.
+
+The history-based performance models can also be explicitly filled by the
+application without execution, if e.g. the application already has a series of
+measurements. This can be done by using starpu_perfmodel_update_history(),
+for instance:
+
+\code{.c}
+static struct starpu_perfmodel perf_model = {
+    .type = STARPU_HISTORY_BASED,
+    .symbol = "my_perfmodel",
+};
+
+struct starpu_codelet cl = {
+    .cuda_funcs = { cuda_func1, cuda_func2, NULL },
+    .nbuffers = 1,
+    .modes = {STARPU_W},
+    .model = &perf_model
+};
+
+void feed(void) {
+    struct my_measure *measure;
+    struct starpu_task task;
+    starpu_task_init(&task);
+
+    task.cl = &cl;
+
+    for (measure = &measures[0]; measure < measures[last]; measure++) {
+        starpu_data_handle_t handle;
+	starpu_vector_data_register(&handle, -1, 0, measure->size, sizeof(float));
+	task.handles[0] = handle;
+	starpu_perfmodel_update_history(&perf_model, &task,
+	                                STARPU_CUDA_DEFAULT + measure->cudadev, 0,
+	                                measure->implementation, measure->time);
+	starpu_task_clean(&task);
+	starpu_data_unregister(handle);
+    }
+}
+\endcode
+
+Measurements have to be provided in milliseconds for the completion time models,
+and in Joules for the energy consumption models.
+
+\section Profiling Profiling
+
+A quick view of how many tasks each worker has executed can be obtained by setting
+<c>export STARPU_WORKER_STATS=1</c> This is a convenient way to check that
+execution did happen on accelerators, without penalizing performance with
+the profiling overhead.
+
+A quick view of how much data transfers have been issued can be obtained by setting
+<c>export STARPU_BUS_STATS=1</c> .
+
+More detailed profiling information can be enabled by using <c>export STARPU_PROFILING=1</c> or by
+calling starpu_profiling_status_set() from the source code.
+Statistics on the execution can then be obtained by using <c>export
+STARPU_BUS_STATS=1</c> and <c>export STARPU_WORKER_STATS=1</c> .
+ More details on performance feedback are provided by the next chapter.
+
+*/

+ 438 - 0
doc/doxygen/chapters/06tasks.doxy

@@ -0,0 +1,438 @@
+/*
+ * This file is part of the StarPU Handbook.
+ * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
+ * See the file version.doxy for copying conditions.
+ */
+
+/*! \page TasksInStarPU Tasks In StarPU
+
+\section TaskGranularity Task Granularity
+
+Like any other runtime, StarPU has some overhead to manage tasks. Since
+it does smart scheduling and data management, that overhead is not always
+negligible. The order of magnitude of the overhead is typically a couple of
+microseconds, which is actually much smaller than the CUDA overhead itself. The
+amount of work that a task should do should thus be somewhat
+bigger, to make sure that the overhead becomes negligible. The offline
+performance feedback can provide a measure of task length, which should thus be
+checked if bad performance are observed. To get a grasp at the scalability
+possibility according to task size, one can run
+<c>tests/microbenchs/tasks_size_overhead.sh</c> which draws curves of the
+speedup of independent tasks of very small sizes.
+
+The choice of scheduler also has an impact on the overhead: for instance, the
+scheduler <c>dmda</c> takes time to make a decision, while <c>eager</c> does
+not. <c>tasks_size_overhead.sh</c> can again be used to get a grasp at how much
+impact that has on the target machine.
+
+\section TaskSubmission Task Submission
+
+To let StarPU make online optimizations, tasks should be submitted
+asynchronously as much as possible. Ideally, all the tasks should be
+submitted, and mere calls to starpu_task_wait_for_all() or
+starpu_data_unregister() be done to wait for
+termination. StarPU will then be able to rework the whole schedule, overlap
+computation with communication, manage accelerator local memory usage, etc.
+
+\section TaskPriorities Task Priorities
+
+By default, StarPU will consider the tasks in the order they are submitted by
+the application. If the application programmer knows that some tasks should
+be performed in priority (for instance because their output is needed by many
+other tasks and may thus be a bottleneck if not executed early
+enough), the field starpu_task::priority should be set to transmit the
+priority information to StarPU.
+
+\section SettingTheDataHandlesForATask Setting The Data Handles For A Task
+
+The number of data a task can manage is fixed by the environment variable
+\ref STARPU_NMAXBUFS which has a default value which can be changed
+through the configure option \ref enable-maxbuffers "--enable-maxbuffers".
+
+However, it is possible to define tasks managing more data by using
+the field starpu_task::dyn_handles when defining a task and the field
+starpu_codelet::dyn_modes when defining the corresponding codelet.
+
+\code{.c}
+enum starpu_data_access_mode modes[STARPU_NMAXBUFS+1] = {
+	STARPU_R, STARPU_R, ...
+};
+
+struct starpu_codelet dummy_big_cl =
+{
+	.cuda_funcs = { dummy_big_kernel, NULL },
+	.opencl_funcs = { dummy_big_kernel, NULL },
+	.cpu_funcs = { dummy_big_kernel, NULL },
+	.cpu_funcs_name = { "dummy_big_kernel", NULL },
+	.nbuffers = STARPU_NMAXBUFS+1,
+	.dyn_modes = modes
+};
+
+task = starpu_task_create();
+task->cl = &dummy_big_cl;
+task->dyn_handles = malloc(task->cl->nbuffers * sizeof(starpu_data_handle_t));
+for(i=0 ; i<task->cl->nbuffers ; i++)
+{
+	task->dyn_handles[i] = handle;
+}
+starpu_task_submit(task);
+\endcode
+
+\code{.c}
+starpu_data_handle_t *handles = malloc(dummy_big_cl.nbuffers * sizeof(starpu_data_handle_t));
+for(i=0 ; i<dummy_big_cl.nbuffers ; i++)
+{
+	handles[i] = handle;
+}
+starpu_task_insert(&dummy_big_cl,
+        	 STARPU_VALUE, &dummy_big_cl.nbuffers, sizeof(dummy_big_cl.nbuffers),
+		 STARPU_DATA_ARRAY, handles, dummy_big_cl.nbuffers,
+		 0);
+\endcode
+
+The whole code for this complex data interface is available in the
+directory <c>examples/basic_examples/dynamic_handles.c</c>.
+
+\section UsingMultipleImplementationsOfACodelet Using Multiple Implementations Of A Codelet
+
+One may want to write multiple implementations of a codelet for a single type of
+device and let StarPU choose which one to run. As an example, we will show how
+to use SSE to scale a vector. The codelet can be written as follows:
+
+\code{.c}
+#include <xmmintrin.h>
+
+void scal_sse_func(void *buffers[], void *cl_arg)
+{
+    float *vector = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);
+    unsigned int n = STARPU_VECTOR_GET_NX(buffers[0]);
+    unsigned int n_iterations = n/4;
+    if (n % 4 != 0)
+        n_iterations++;
+
+    __m128 *VECTOR = (__m128*) vector;
+    __m128 factor __attribute__((aligned(16)));
+    factor = _mm_set1_ps(*(float *) cl_arg);
+
+    unsigned int i;
+    for (i = 0; i < n_iterations; i++)
+        VECTOR[i] = _mm_mul_ps(factor, VECTOR[i]);
+}
+\endcode
+
+\code{.c}
+struct starpu_codelet cl = {
+    .cpu_funcs = { scal_cpu_func, scal_sse_func, NULL },
+    .cpu_funcs_name = { "scal_cpu_func", "scal_sse_func", NULL },
+    .nbuffers = 1,
+    .modes = { STARPU_RW }
+};
+\endcode
+
+Schedulers which are multi-implementation aware (only <c>dmda</c> and
+<c>pheft</c> for now) will use the performance models of all the
+implementations it was given, and pick the one that seems to be the fastest.
+
+\section EnablingImplementationAccordingToCapabilities Enabling Implementation According To Capabilities
+
+Some implementations may not run on some devices. For instance, some CUDA
+devices do not support double floating point precision, and thus the kernel
+execution would just fail; or the device may not have enough shared memory for
+the implementation being used. The field starpu_codelet::can_execute
+permits to express this. For instance:
+
+\code{.c}
+static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
+{
+  const struct cudaDeviceProp *props;
+  if (starpu_worker_get_type(workerid) == STARPU_CPU_WORKER)
+    return 1;
+  /* Cuda device */
+  props = starpu_cuda_get_device_properties(workerid);
+  if (props->major >= 2 || props->minor >= 3)
+    /* At least compute capability 1.3, supports doubles */
+    return 1;
+  /* Old card, does not support doubles */
+  return 0;
+}
+
+struct starpu_codelet cl = {
+    .can_execute = can_execute,
+    .cpu_funcs = { cpu_func, NULL },
+    .cpu_funcs_name = { "cpu_func", NULL },
+    .cuda_funcs = { gpu_func, NULL }
+    .nbuffers = 1,
+    .modes = { STARPU_RW }
+};
+\endcode
+
+This can be essential e.g. when running on a machine which mixes various models
+of CUDA devices, to take benefit from the new models without crashing on old models.
+
+Note: the function starpu_codelet::can_execute is called by the
+scheduler each time it tries to match a task with a worker, and should
+thus be very fast. The function starpu_cuda_get_device_properties()
+provides a quick access to CUDA properties of CUDA devices to achieve
+such efficiency.
+
+Another example is to compile CUDA code for various compute capabilities,
+resulting with two CUDA functions, e.g. <c>scal_gpu_13</c> for compute capability
+1.3, and <c>scal_gpu_20</c> for compute capability 2.0. Both functions can be
+provided to StarPU by using starpu_codelet::cuda_funcs, and
+starpu_codelet::can_execute can then be used to rule out the
+<c>scal_gpu_20</c> variant on a CUDA device which will not be able to execute it:
+
+\code{.c}
+static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
+{
+  const struct cudaDeviceProp *props;
+  if (starpu_worker_get_type(workerid) == STARPU_CPU_WORKER)
+    return 1;
+  /* Cuda device */
+  if (nimpl == 0)
+    /* Trying to execute the 1.3 capability variant, we assume it is ok in all cases.  */
+    return 1;
+  /* Trying to execute the 2.0 capability variant, check that the card can do it.  */
+  props = starpu_cuda_get_device_properties(workerid);
+  if (props->major >= 2)
+    /* At least compute capability 2.0, can run it */
+    return 1;
+  /* Old card, does not support 2.0, will not be able to execute the 2.0 variant.  */
+  return 0;
+}
+
+struct starpu_codelet cl = {
+    .can_execute = can_execute,
+    .cpu_funcs = { cpu_func, NULL },
+    .cpu_funcs_name = { "cpu_func", NULL },
+    .cuda_funcs = { scal_gpu_13, scal_gpu_20, NULL },
+    .nbuffers = 1,
+    .modes = { STARPU_RW }
+};
+\endcode
+
+Note: the most generic variant should be provided first, as some schedulers are
+not able to try the different variants.
+
+\section InsertTaskUtility Insert Task Utility
+
+StarPU provides the wrapper function starpu_task_insert() to ease
+the creation and submission of tasks.
+
+Here the implementation of the codelet:
+
+\code{.c}
+void func_cpu(void *descr[], void *_args)
+{
+        int *x0 = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
+        float *x1 = (float *)STARPU_VARIABLE_GET_PTR(descr[1]);
+        int ifactor;
+        float ffactor;
+
+        starpu_codelet_unpack_args(_args, &ifactor, &ffactor);
+        *x0 = *x0 * ifactor;
+        *x1 = *x1 * ffactor;
+}
+
+struct starpu_codelet mycodelet = {
+        .cpu_funcs = { func_cpu, NULL },
+        .cpu_funcs_name = { "func_cpu", NULL },
+        .nbuffers = 2,
+        .modes = { STARPU_RW, STARPU_RW }
+};
+\endcode
+
+And the call to the function starpu_task_insert():
+
+\code{.c}
+starpu_task_insert(&mycodelet,
+                   STARPU_VALUE, &ifactor, sizeof(ifactor),
+                   STARPU_VALUE, &ffactor, sizeof(ffactor),
+                   STARPU_RW, data_handles[0], STARPU_RW, data_handles[1],
+                   0);
+\endcode
+
+The call to starpu_task_insert() is equivalent to the following
+code:
+
+\code{.c}
+struct starpu_task *task = starpu_task_create();
+task->cl = &mycodelet;
+task->handles[0] = data_handles[0];
+task->handles[1] = data_handles[1];
+char *arg_buffer;
+size_t arg_buffer_size;
+starpu_codelet_pack_args(&arg_buffer, &arg_buffer_size,
+                    STARPU_VALUE, &ifactor, sizeof(ifactor),
+                    STARPU_VALUE, &ffactor, sizeof(ffactor),
+                    0);
+task->cl_arg = arg_buffer;
+task->cl_arg_size = arg_buffer_size;
+int ret = starpu_task_submit(task);
+\endcode
+
+Here a similar call using ::STARPU_DATA_ARRAY.
+
+\code{.c}
+starpu_task_insert(&mycodelet,
+                   STARPU_DATA_ARRAY, data_handles, 2,
+                   STARPU_VALUE, &ifactor, sizeof(ifactor),
+                   STARPU_VALUE, &ffactor, sizeof(ffactor),
+                   0);
+\endcode
+
+If some part of the task insertion depends on the value of some computation,
+the macro ::STARPU_DATA_ACQUIRE_CB can be very convenient. For
+instance, assuming that the index variable <c>i</c> was registered as handle
+<c>A_handle[i]</c>:
+
+\code{.c}
+/* Compute which portion we will work on, e.g. pivot */
+starpu_task_insert(&which_index, STARPU_W, i_handle, 0);
+
+/* And submit the corresponding task */
+STARPU_DATA_ACQUIRE_CB(i_handle, STARPU_R,
+                       starpu_task_insert(&work, STARPU_RW, A_handle[i], 0));
+\endcode
+
+The macro ::STARPU_DATA_ACQUIRE_CB submits an asynchronous request for
+acquiring data <c>i</c> for the main application, and will execute the code
+given as third parameter when it is acquired. In other words, as soon as the
+value of <c>i</c> computed by the codelet <c>which_index</c> can be read, the
+portion of code passed as third parameter of ::STARPU_DATA_ACQUIRE_CB will
+be executed, and is allowed to read from <c>i</c> to use it e.g. as an
+index. Note that this macro is only available when compiling StarPU with
+the compiler <c>gcc</c>.
+
+\section ParallelTasks Parallel Tasks
+
+StarPU can leverage existing parallel computation libraries by the means of
+parallel tasks. A parallel task is a task which gets worked on by a set of CPUs
+(called a parallel or combined worker) at the same time, by using an existing
+parallel CPU implementation of the computation to be achieved. This can also be
+useful to improve the load balance between slow CPUs and fast GPUs: since CPUs
+work collectively on a single task, the completion time of tasks on CPUs become
+comparable to the completion time on GPUs, thus relieving from granularity
+discrepancy concerns. <c>hwloc</c> support needs to be enabled to get
+good performance, otherwise StarPU will not know how to better group
+cores.
+
+Two modes of execution exist to accommodate existing usages.
+
+\subsection Fork-modeParallelTasks Fork-mode Parallel Tasks
+
+In the Fork mode, StarPU will call the codelet function on one
+of the CPUs of the combined worker. The codelet function can use
+starpu_combined_worker_get_size() to get the number of threads it is
+allowed to start to achieve the computation. The CPU binding mask for the whole
+set of CPUs is already enforced, so that threads created by the function will
+inherit the mask, and thus execute where StarPU expected, the OS being in charge
+of choosing how to schedule threads on the corresponding CPUs. The application
+can also choose to bind threads by hand, using e.g. sched_getaffinity to know
+the CPU binding mask that StarPU chose.
+
+For instance, using OpenMP (full source is available in
+<c>examples/openmp/vector_scal.c</c>):
+
+\snippet forkmode.c To be included. You should update doxygen if you see this text.
+
+Other examples include for instance calling a BLAS parallel CPU implementation
+(see <c>examples/mult/xgemm.c</c>).
+
+\subsection SPMD-modeParallelTasks SPMD-mode Parallel Tasks
+
+In the SPMD mode, StarPU will call the codelet function on
+each CPU of the combined worker. The codelet function can use
+starpu_combined_worker_get_size() to get the total number of CPUs
+involved in the combined worker, and thus the number of calls that are made in
+parallel to the function, and starpu_combined_worker_get_rank() to get
+the rank of the current CPU within the combined worker. For instance:
+
+\code{.c}
+static void func(void *buffers[], void *_args)
+{
+    unsigned i;
+    float *factor = _args;
+    struct starpu_vector_interface *vector = buffers[0];
+    unsigned n = STARPU_VECTOR_GET_NX(vector);
+    float *val = (float *)STARPU_VECTOR_GET_PTR(vector);
+
+    /* Compute slice to compute */
+    unsigned m = starpu_combined_worker_get_size();
+    unsigned j = starpu_combined_worker_get_rank();
+    unsigned slice = (n+m-1)/m;
+
+    for (i = j * slice; i < (j+1) * slice && i < n; i++)
+        val[i] *= *factor;
+}
+
+static struct starpu_codelet cl =
+{
+    .modes = { STARPU_RW },
+    .type = STARPU_SPMD,
+    .max_parallelism = INT_MAX,
+    .cpu_funcs = { func, NULL },
+    .cpu_funcs_name = { "func", NULL },
+    .nbuffers = 1,
+}
+\endcode
+
+Of course, this trivial example will not really benefit from parallel task
+execution, and was only meant to be simple to understand.  The benefit comes
+when the computation to be done is so that threads have to e.g. exchange
+intermediate results, or write to the data in a complex but safe way in the same
+buffer.
+
+\subsection ParallelTasksPerformance Parallel Tasks Performance
+
+To benefit from parallel tasks, a parallel-task-aware StarPU scheduler has to
+be used. When exposed to codelets with a flag ::STARPU_FORKJOIN or
+::STARPU_SPMD, the schedulers <c>pheft</c> (parallel-heft) and <c>peager</c>
+(parallel eager) will indeed also try to execute tasks with
+several CPUs. It will automatically try the various available combined
+worker sizes (making several measurements for each worker size) and
+thus be able to avoid choosing a large combined worker if the codelet
+does not actually scale so much.
+
+\subsection CombinedWorkers Combined Workers
+
+By default, StarPU creates combined workers according to the architecture
+structure as detected by <c>hwloc</c>. It means that for each object of the <c>hwloc</c>
+topology (NUMA node, socket, cache, ...) a combined worker will be created. If
+some nodes of the hierarchy have a big arity (e.g. many cores in a socket
+without a hierarchy of shared caches), StarPU will create combined workers of
+intermediate sizes. The variable \ref
+STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER permits to tune the maximum
+arity between levels of combined workers.
+
+The combined workers actually produced can be seen in the output of the
+tool <c>starpu_machine_display</c> (the environment variable \ref
+STARPU_SCHED has to be set to a combined worker-aware scheduler such
+as <c>pheft</c> or <c>peager</c>).
+
+\subsection ConcurrentParallelTasks Concurrent Parallel Tasks
+
+Unfortunately, many environments and libraries do not support concurrent
+calls.
+
+For instance, most OpenMP implementations (including the main ones) do not
+support concurrent <c>pragma omp parallel</c> statements without nesting them in
+another <c>pragma omp parallel</c> statement, but StarPU does not yet support
+creating its CPU workers by using such pragma.
+
+Other parallel libraries are also not safe when being invoked concurrently
+from different threads, due to the use of global variables in their sequential
+sections for instance.
+
+The solution is then to use only one combined worker at a time.  This can be
+done by setting the field starpu_conf::single_combined_worker to <c>1</c>, or
+setting the environment variable \ref STARPU_SINGLE_COMBINED_WORKER
+to <c>1</c>. StarPU will then run only one parallel task at a time (but other
+CPU and GPU tasks are not affected and can be run concurrently). The parallel
+task scheduler will however still try varying combined worker
+sizes to look for the most efficient ones.
+
+
+*/

+ 0 - 114
doc/doxygen/chapters/06tips_and_tricks.doxy

@@ -1,114 +0,0 @@
-/*
- * This file is part of the StarPU Handbook.
- * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
- * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
- * See the file version.doxy for copying conditions.
- */
-
-/*! \page TipsAndTricksToKnowAbout Tips and Tricks To Know About
-
-\section HowToInitializeAComputationLibraryOnceForEachWorker How To Initialize A Computation Library Once For Each Worker?
-
-Some libraries need to be initialized once for each concurrent instance that
-may run on the machine. For instance, a C++ computation class which is not
-thread-safe by itself, but for which several instanciated objects of that class
-can be used concurrently. This can be used in StarPU by initializing one such
-object per worker. For instance, the libstarpufft example does the following to
-be able to use FFTW on CPUs.
-
-Some global array stores the instanciated objects:
-
-\code{.c}
-fftw_plan plan_cpu[STARPU_NMAXWORKERS];
-\endcode
-
-At initialisation time of libstarpu, the objects are initialized:
-
-\code{.c}
-int workerid;
-for (workerid = 0; workerid < starpu_worker_get_count(); workerid++) {
-    switch (starpu_worker_get_type(workerid)) {
-        case STARPU_CPU_WORKER:
-            plan_cpu[workerid] = fftw_plan(...);
-            break;
-    }
-}
-\endcode
-
-And in the codelet body, they are used:
-
-\code{.c}
-static void fft(void *descr[], void *_args)
-{
-    int workerid = starpu_worker_get_id();
-    fftw_plan plan = plan_cpu[workerid];
-    ...
-
-    fftw_execute(plan, ...);
-}
-\endcode
-
-This however is not sufficient for FFT on CUDA: initialization has
-to be done from the workers themselves.  This can be done thanks to
-starpu_execute_on_each_worker().  For instance libstarpufft does the following.
-
-\code{.c}
-static void fft_plan_gpu(void *args)
-{
-    plan plan = args;
-    int n2 = plan->n2[0];
-    int workerid = starpu_worker_get_id();
-
-    cufftPlan1d(&plan->plans[workerid].plan_cuda, n, _CUFFT_C2C, 1);
-    cufftSetStream(plan->plans[workerid].plan_cuda, starpu_cuda_get_local_stream());
-}
-void starpufft_plan(void)
-{
-    starpu_execute_on_each_worker(fft_plan_gpu, plan, STARPU_CUDA);
-}
-\endcode
-
-\section HowToLimitMemoryPerNode How to limit memory per node
-
-TODO
-
-Talk about
-\ref STARPU_LIMIT_CUDA_devid_MEM, \ref STARPU_LIMIT_CUDA_MEM,
-\ref STARPU_LIMIT_OPENCL_devid_MEM, \ref STARPU_LIMIT_OPENCL_MEM
-and \ref STARPU_LIMIT_CPU_MEM
-
-starpu_memory_get_available()
-
-\section ThreadBindingOnNetBSD Thread Binding on NetBSD
-
-When using StarPU on a NetBSD machine, if the topology
-discovery library <c>hwloc</c> is used, thread binding will fail. To
-prevent the problem, you should at least use the version 1.7 of
-<c>hwloc</c>, and also issue the following call:
-
-\verbatim
-$ sysctl -w security.models.extensions.user_set_cpu_affinity=1
-\endverbatim
-
-Or add the following line in the file <c>/etc/sysctl.conf</c>
-
-\verbatim
-security.models.extensions.user_set_cpu_affinity=1
-\endverbatim
-
-\section UsingStarPUWithMKL Using StarPU With MKL 11 (Intel Composer XE 2013)
-
-Some users had issues with MKL 11 and StarPU (versions 1.1rc1 and
-1.0.5) on Linux with MKL, using 1 thread for MKL and doing all the
-parallelism using StarPU (no multithreaded tasks), setting the
-environment variable MKL_NUM_THREADS to 1, and using the threaded MKL library,
-with iomp5.
-
-Using this configuration, StarPU uses only 1 core, no matter the value of
-\ref STARPU_NCPU. The problem is actually a thread pinning issue with MKL.
-
-The solution is to set the environment variable KMP_AFFINITY to <c>disabled</c>
-(http://software.intel.com/sites/products/documentation/studio/composer/en-us/2011Update/compiler_c/optaps/common/optaps_openmp_thread_affinity.htm).
-
-*/

+ 533 - 0
doc/doxygen/chapters/07data_management.doxy

@@ -0,0 +1,533 @@
+/*
+ * This file is part of the StarPU Handbook.
+ * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
+ * See the file version.doxy for copying conditions.
+ */
+
+/*! \page DataManagement Data Management
+
+This chapter covers the data management facilities provided by StarPU, in
+particular data coherency.
+
+\section DataManagement Data Management
+
+When the application allocates data, whenever possible it should use
+the function starpu_malloc(), which will ask CUDA or OpenCL to make
+the allocation itself and pin the corresponding allocated memory. This
+is needed to permit asynchronous data transfer, i.e. permit data
+transfer to overlap with computations. Otherwise, the trace will show
+that the <c>DriverCopyAsync</c> state takes a lot of time, this is
+because CUDA or OpenCL then reverts to synchronous transfers.
+
+By default, StarPU leaves replicates of data wherever they were used, in case they
+will be re-used by other tasks, thus saving the data transfer time. When some
+task modifies some data, all the other replicates are invalidated, and only the
+processing unit which ran that task will have a valid replicate of the data. If the application knows
+that this data will not be re-used by further tasks, it should advise StarPU to
+immediately replicate it to a desired list of memory nodes (given through a
+bitmask). This can be understood like the write-through mode of CPU caches.
+
+\code{.c}
+starpu_data_set_wt_mask(img_handle, 1<<0);
+\endcode
+
+will for instance request to always automatically transfer a replicate into the
+main memory (node <c>0</c>), as bit <c>0</c> of the write-through bitmask is being set.
+
+\code{.c}
+starpu_data_set_wt_mask(img_handle, ~0U);
+\endcode
+
+will request to always automatically broadcast the updated data to all memory
+nodes.
+
+Setting the write-through mask to <c>~0U</c> can also be useful to make sure all
+memory nodes always have a copy of the data, so that it is never evicted when
+memory gets scarce.
+
+Implicit data dependency computation can become expensive if a lot
+of tasks access the same piece of data. If no dependency is required
+on some piece of data (e.g. because it is only accessed in read-only
+mode, or because write accesses are actually commutative), use the
+function starpu_data_set_sequential_consistency_flag() to disable
+implicit dependencies on that data.
+
+In the same vein, accumulation of results in the same data can become a
+bottleneck. The use of the mode ::STARPU_REDUX permits to optimize such
+accumulation (see \ref DataReduction). To a lesser extent, the use of
+the flag ::STARPU_COMMUTE keeps the bottleneck, but at least permits
+the accumulation to happen in any order.
+
+Applications often need a piece of data just for temporary results.  In such a case,
+registration can be made without an initial value, for instance this produces a vector data:
+
+\code{.c}
+starpu_vector_data_register(&handle, -1, 0, n, sizeof(float));
+\endcode
+
+StarPU will then allocate the actual buffer only when it is actually needed,
+e.g. directly on the GPU without allocating in main memory.
+
+In the same vein, once the temporary results are not useful any more, the
+data should be thrown away. If the handle is not to be reused, it can be
+unregistered:
+
+\code{.c}
+starpu_data_unregister_submit(handle);
+\endcode
+
+actual unregistration will be done after all tasks working on the handle
+terminate.
+
+If the handle is to be reused, instead of unregistering it, it can simply be invalidated:
+
+\code{.c}
+starpu_data_invalidate_submit(handle);
+\endcode
+
+the buffers containing the current value will then be freed, and reallocated
+only when another task writes some value to the handle.
+
+\section DataPrefetch Data Prefetch
+
+The scheduling policies <c>heft</c>, <c>dmda</c> and <c>pheft</c>
+perform data prefetch (see \ref STARPU_PREFETCH):
+as soon as a scheduling decision is taken for a task, requests are issued to
+transfer its required data to the target processing unit, if needed, so that
+when the processing unit actually starts the task, its data will hopefully be
+already available and it will not have to wait for the transfer to finish.
+
+The application may want to perform some manual prefetching, for several reasons
+such as excluding initial data transfers from performance measurements, or
+setting up an initial statically-computed data distribution on the machine
+before submitting tasks, which will thus guide StarPU toward an initial task
+distribution (since StarPU will try to avoid further transfers).
+
+This can be achieved by giving the function starpu_data_prefetch_on_node()
+the handle and the desired target memory node.
+
+\section PartitioningData Partitioning Data
+
+An existing piece of data can be partitioned in sub parts to be used by different tasks, for instance:
+
+\code{.c}
+int vector[NX];
+starpu_data_handle_t handle;
+
+/* Declare data to StarPU */
+starpu_vector_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)vector,
+                            NX, sizeof(vector[0]));
+
+/* Partition the vector in PARTS sub-vectors */
+starpu_data_filter f =
+{
+    .filter_func = starpu_vector_filter_block,
+    .nchildren = PARTS
+};
+starpu_data_partition(handle, &f);
+\endcode
+
+The task submission then uses the function starpu_data_get_sub_data()
+to retrieve the sub-handles to be passed as tasks parameters.
+
+\code{.c}
+/* Submit a task on each sub-vector */
+for (i=0; i<starpu_data_get_nb_children(handle); i++) {
+    /* Get subdata number i (there is only 1 dimension) */
+    starpu_data_handle_t sub_handle = starpu_data_get_sub_data(handle, 1, i);
+    struct starpu_task *task = starpu_task_create();
+
+    task->handles[0] = sub_handle;
+    task->cl = &cl;
+    task->synchronous = 1;
+    task->cl_arg = &factor;
+    task->cl_arg_size = sizeof(factor);
+
+    starpu_task_submit(task);
+}
+\endcode
+
+Partitioning can be applied several times, see
+<c>examples/basic_examples/mult.c</c> and <c>examples/filters/</c>.
+
+Wherever the whole piece of data is already available, the partitioning will
+be done in-place, i.e. without allocating new buffers but just using pointers
+inside the existing copy. This is particularly important to be aware of when
+using OpenCL, where the kernel parameters are not pointers, but handles. The
+kernel thus needs to be also passed the offset within the OpenCL buffer:
+
+\code{.c}
+void opencl_func(void *buffers[], void *cl_arg)
+{
+    cl_mem vector = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
+    unsigned offset = STARPU_VECTOR_GET_OFFSET(buffers[0]);
+
+    ...
+    clSetKernelArg(kernel, 0, sizeof(vector), &vector);
+    clSetKernelArg(kernel, 1, sizeof(offset), &offset);
+    ...
+}
+\endcode
+
+And the kernel has to shift from the pointer passed by the OpenCL driver:
+
+\code{.c}
+__kernel void opencl_kernel(__global int *vector, unsigned offset)
+{
+    vector = (__global int *)((__global char *)vector + offset);
+    ...
+}
+\endcode
+
+StarPU provides various interfaces and filters for matrices, vectors, etc.,
+but applications can also write their own data interfaces and filters, see
+<c>examples/interface</c> and <c>examples/filters/custom_mf</c> for an example.
+
+\section DataReduction Data Reduction
+
+In various cases, some piece of data is used to accumulate intermediate
+results. For instance, the dot product of a vector, maximum/minimum finding,
+the histogram of a photograph, etc. When these results are produced all over
+the machine, it would not be efficient to accumulate them in only one place,
+incurring data transmission for each contribution as well as access concurrency.
+
+StarPU provides a mode ::STARPU_REDUX, which permits to optimize
+that case: it will allocate a buffer on each memory node, and accumulate
+intermediate results there. When the data is eventually accessed in the normal
+mode ::STARPU_R, StarPU will collect the intermediate results in just one
+buffer.
+
+For this to work, the user has to use the function
+starpu_data_set_reduction_methods() to declare how to initialize these
+buffers, and how to assemble partial results.
+
+For instance, <c>cg</c> uses that to optimize its dot product: it first defines
+the codelets for initialization and reduction:
+
+\code{.c}
+struct starpu_codelet bzero_variable_cl =
+{
+        .cpu_funcs = { bzero_variable_cpu, NULL },
+        .cpu_funcs_name = { "bzero_variable_cpu", NULL },
+        .cuda_funcs = { bzero_variable_cuda, NULL },
+        .nbuffers = 1,
+}
+
+static void accumulate_variable_cpu(void *descr[], void *cl_arg)
+{
+        double *v_dst = (double *)STARPU_VARIABLE_GET_PTR(descr[0]);
+        double *v_src = (double *)STARPU_VARIABLE_GET_PTR(descr[1]);
+        *v_dst = *v_dst + *v_src;
+}
+
+static void accumulate_variable_cuda(void *descr[], void *cl_arg)
+{
+        double *v_dst = (double *)STARPU_VARIABLE_GET_PTR(descr[0]);
+        double *v_src = (double *)STARPU_VARIABLE_GET_PTR(descr[1]);
+        cublasaxpy(1, (double)1.0, v_src, 1, v_dst, 1);
+        cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}
+
+struct starpu_codelet accumulate_variable_cl =
+{
+        .cpu_funcs = { accumulate_variable_cpu, NULL },
+        .cpu_funcs_name = { "accumulate_variable_cpu", NULL },
+        .cuda_funcs = { accumulate_variable_cuda, NULL },
+        .nbuffers = 1,
+}
+\endcode
+
+and attaches them as reduction methods for its handle <c>dtq</c>:
+
+\code{.c}
+starpu_variable_data_register(&dtq_handle, -1, NULL, sizeof(type));
+starpu_data_set_reduction_methods(dtq_handle,
+        &accumulate_variable_cl, &bzero_variable_cl);
+\endcode
+
+and <c>dtq_handle</c> can now be used in mode ::STARPU_REDUX for the
+dot products with partitioned vectors:
+
+\code{.c}
+for (b = 0; b < nblocks; b++)
+    starpu_task_insert(&dot_kernel_cl,
+        STARPU_REDUX, dtq_handle,
+        STARPU_R, starpu_data_get_sub_data(v1, 1, b),
+        STARPU_R, starpu_data_get_sub_data(v2, 1, b),
+        0);
+\endcode
+
+During registration, we have here provided <c>NULL</c>, i.e. there is
+no initial value to be taken into account during reduction. StarPU
+will thus only take into account the contributions from the tasks
+<c>dot_kernel_cl</c>. Also, it will not allocate any memory for
+<c>dtq_handle</c> before tasks <c>dot_kernel_cl</c> are ready to run.
+
+If another dot product has to be performed, one could unregister
+<c>dtq_handle</c>, and re-register it. But one can also call
+starpu_data_invalidate_submit() with the parameter <c>dtq_handle</c>,
+which will clear all data from the handle, thus resetting it back to
+the initial status <c>register(NULL)</c>.
+
+The example <c>cg</c> also uses reduction for the blocked gemv kernel,
+leading to yet more relaxed dependencies and more parallelism.
+
+::STARPU_REDUX can also be passed to starpu_mpi_task_insert() in the MPI
+case. That will however not produce any MPI communication, but just pass
+::STARPU_REDUX to the underlying starpu_task_insert(). It is up to the
+application to call starpu_mpi_redux_data(), which posts tasks that will
+reduce the partial results among MPI nodes into the MPI node which owns the
+data. For instance, some hypothetical application which collects partial results
+into data <c>res</c>, then uses it for other computation, before looping again
+with a new reduction:
+
+\code{.c}
+for (i = 0; i < 100; i++) {
+    starpu_mpi_task_insert(MPI_COMM_WORLD, &init_res, STARPU_W, res, 0);
+    starpu_mpi_task_insert(MPI_COMM_WORLD, &work, STARPU_RW, A,
+               STARPU_R, B, STARPU_REDUX, res, 0);
+    starpu_mpi_redux_data(MPI_COMM_WORLD, res);
+    starpu_mpi_task_insert(MPI_COMM_WORLD, &work2, STARPU_RW, B, STARPU_R, res, 0);
+}
+\endcode
+
+\section TemporaryBuffers Temporary Buffers
+
+There are two kinds of temporary buffers: temporary data which just pass results
+from a task to another, and scratch data which are needed only internally by
+tasks.
+
+\subsection TemporaryData Temporary Data
+
+Data can sometimes be entirely produced by a task, and entirely consumed by
+another task, without the need for other parts of the application to access
+it. In such case, registration can be done without prior allocation, by using
+the special memory node number <c>-1</c>, and passing a zero pointer. StarPU will
+actually allocate memory only when the task creating the content gets scheduled,
+and destroy it on unregistration.
+
+In addition to that, it can be tedious for the application to have to unregister
+the data, since it will not use its content anyway. The unregistration can be
+done lazily by using the function starpu_data_unregister_submit(),
+which will record that no more tasks accessing the handle will be submitted, so
+that it can be freed as soon as the last task accessing it is over.
+
+The following code exemplifies both points: it registers the temporary
+data, submits three tasks accessing it, and records the data for automatic
+unregistration.
+
+\code{.c}
+starpu_vector_data_register(&handle, -1, 0, n, sizeof(float));
+starpu_task_insert(&produce_data, STARPU_W, handle, 0);
+starpu_task_insert(&compute_data, STARPU_RW, handle, 0);
+starpu_task_insert(&summarize_data, STARPU_R, handle, STARPU_W, result_handle, 0);
+starpu_data_unregister_submit(handle);
+\endcode
+
+\subsection ScratchData Scratch Data
+
+Some kernels sometimes need temporary data to achieve the computations, i.e. a
+workspace. The application could allocate it at the start of the codelet
+function, and free it at the end, but that would be costly. It could also
+allocate one buffer per worker (similarly to \ref
+HowToInitializeAComputationLibraryOnceForEachWorker), but that would
+make them systematic and permanent. A more optimized way is to use
+the data access mode ::STARPU_SCRATCH, as exemplified below, which
+provides per-worker buffers without content consistency.
+
+\code{.c}
+starpu_vector_data_register(&workspace, -1, 0, workspace_size, sizeof(float));
+for (i = 0; i < N; i++)
+    starpu_task_insert(&compute, STARPU_R, input[i],
+                       STARPU_SCRATCH, workspace, STARPU_W, output[i], 0);
+\endcode
+
+StarPU will make sure that the buffer is allocated before executing the task,
+and make this allocation per-worker: for CPU workers, notably, each worker has
+its own buffer. This means that each task submitted above will actually have its
+own workspace, which will actually be the same for all tasks running one after
+the other on the same worker. Also, if for instance GPU memory becomes scarce,
+StarPU will notice that it can free such buffers easily, since the content does
+not matter.
+
+The example <c>examples/pi</c> uses scratches for some temporary buffer.
+
+\section TheMultiformatInterface The Multiformat Interface
+
+It may be interesting to represent the same piece of data using two different
+data structures: one that would only be used on CPUs, and one that would only
+be used on GPUs. This can be done by using the multiformat interface. StarPU
+will be able to convert data from one data structure to the other when needed.
+Note that the scheduler <c>dmda</c> is the only one optimized for this
+interface. The user must provide StarPU with conversion codelets:
+
+\snippet multiformat.c To be included. You should update doxygen if you see this text.
+
+Kernels can be written almost as for any other interface. Note that
+::STARPU_MULTIFORMAT_GET_CPU_PTR shall only be used for CPU kernels. CUDA kernels
+must use ::STARPU_MULTIFORMAT_GET_CUDA_PTR, and OpenCL kernels must use
+::STARPU_MULTIFORMAT_GET_OPENCL_PTR. ::STARPU_MULTIFORMAT_GET_NX may
+be used in any kind of kernel.
+
+\code{.c}
+static void
+multiformat_scal_cpu_func(void *buffers[], void *args)
+{
+    struct point *aos;
+    unsigned int n;
+
+    aos = STARPU_MULTIFORMAT_GET_CPU_PTR(buffers[0]);
+    n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
+    ...
+}
+
+extern "C" void multiformat_scal_cuda_func(void *buffers[], void *_args)
+{
+    unsigned int n;
+    struct struct_of_arrays *soa;
+
+    soa = (struct struct_of_arrays *) STARPU_MULTIFORMAT_GET_CUDA_PTR(buffers[0]);
+    n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
+
+    ...
+}
+\endcode
+
+A full example may be found in <c>examples/basic_examples/multiformat.c</c>.
+
+\section DefiningANewDataInterface Defining A New Data Interface
+
+Let's define a new data interface to manage complex numbers.
+
+\code{.c}
+/* interface for complex numbers */
+struct starpu_complex_interface
+{
+        double *real;
+        double *imaginary;
+        int nx;
+};
+\endcode
+
+Registering such a data to StarPU is easily done using the function
+starpu_data_register(). The last
+parameter of the function, <c>interface_complex_ops</c>, will be
+described below.
+
+\code{.c}
+void starpu_complex_data_register(starpu_data_handle_t *handle,
+     unsigned home_node, double *real, double *imaginary, int nx)
+{
+        struct starpu_complex_interface complex =
+        {
+                .real = real,
+                .imaginary = imaginary,
+                .nx = nx
+        };
+
+        if (interface_complex_ops.interfaceid == STARPU_UNKNOWN_INTERFACE_ID)
+        {
+                interface_complex_ops.interfaceid = starpu_data_interface_get_next_id();
+        }
+
+        starpu_data_register(handle, home_node, &complex, &interface_complex_ops);
+}
+\endcode
+
+Different operations need to be defined for a data interface through
+the type starpu_data_interface_ops. We only define here the basic
+operations needed to run simple applications. The source code for the
+different functions can be found in the file
+<c>examples/interface/complex_interface.c</c>.
+
+\code{.c}
+static struct starpu_data_interface_ops interface_complex_ops =
+{
+        .register_data_handle = complex_register_data_handle,
+        .allocate_data_on_node = complex_allocate_data_on_node,
+        .copy_methods = &complex_copy_methods,
+        .get_size = complex_get_size,
+        .footprint = complex_footprint,
+        .interfaceid = STARPU_UNKNOWN_INTERFACE_ID,
+        .interface_size = sizeof(struct starpu_complex_interface),
+};
+\endcode
+
+Functions need to be defined to access the different fields of the
+complex interface from a StarPU data handle.
+
+\code{.c}
+double *starpu_complex_get_real(starpu_data_handle_t handle)
+{
+        struct starpu_complex_interface *complex_interface =
+          (struct starpu_complex_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+        return complex_interface->real;
+}
+
+double *starpu_complex_get_imaginary(starpu_data_handle_t handle);
+int starpu_complex_get_nx(starpu_data_handle_t handle);
+\endcode
+
+Similar functions need to be defined to access the different fields of the
+complex interface from a <c>void *</c> pointer to be used within codelet
+implementations.
+
+\snippet complex.c To be included. You should update doxygen if you see this text.
+
+Complex data interfaces can then be registered to StarPU.
+
+\code{.c}
+double real = 45.0;
+double imaginary = 12.0;
+starpu_complex_data_register(&handle1, STARPU_MAIN_RAM, &real, &imaginary, 1);
+starpu_task_insert(&cl_display, STARPU_R, handle1, 0);
+\endcode
+
+and used by codelets.
+
+\code{.c}
+void display_complex_codelet(void *descr[], __attribute__ ((unused)) void *_args)
+{
+        int nx = STARPU_COMPLEX_GET_NX(descr[0]);
+        double *real = STARPU_COMPLEX_GET_REAL(descr[0]);
+        double *imaginary = STARPU_COMPLEX_GET_IMAGINARY(descr[0]);
+        int i;
+
+        for(i=0 ; i<nx ; i++)
+        {
+                fprintf(stderr, "Complex[%d] = %3.2f + %3.2f i\n", i, real[i], imaginary[i]);
+        }
+}
+\endcode
+
+The whole code for this complex data interface is available in the
+directory <c>examples/interface/</c>.
+
+
+\section SpecifyingATargetNode Specifying a target node for task data
+
+When executing a task on a GPU for instance, StarPU would normally copy all the
+needed data for the tasks on the embedded memory of the GPU.  It may however
+happen that the task kernel would rather have some of the data kept in the
+main memory instead of copied to the GPU, a pivoting vector for instance.
+This can be achieved by setting the starpu_codelet::specific_nodes flag to
+1, and then fill the starpu_codelet::nodes array (or starpu_codelet::dyn_nodes when
+starpu_codelet::nbuffers is greater than STARPU_NMAXBUFS) with the node numbers
+where data should be copied to, or -1 to let StarPU copy it to the memory node
+where the task will be executed. For instance, with the following codelet:
+
+\code{.c}
+struct starpu_codelet cl =
+{
+	.cuda_funcs = { kernel, NULL },
+	.nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_RW},
+	.specific_nodes = 1,
+	.nodes = {STARPU_MAIN_RAM, -1},
+};
+\endcode
+
+the first data of the task will be kept in the main memory, while the second
+data will be copied to the CUDA GPU as usual.
+
+*/

+ 151 - 0
doc/doxygen/chapters/08scheduling.doxy

@@ -0,0 +1,151 @@
+/*
+ * This file is part of the StarPU Handbook.
+ * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
+ * See the file version.doxy for copying conditions.
+ */
+
+/*! \page Scheduling Scheduling
+
+\section TaskSchedulingPolicy Task Scheduling Policy
+
+By default, StarPU uses the simple greedy scheduler <c>eager</c>. This is
+because it provides correct load balance even if the application codelets do not
+have performance models. If your application codelets have performance models
+(\ref PerformanceModelExample), you should change the scheduler thanks
+to the environment variable \ref STARPU_SCHED. For instance <c>export
+STARPU_SCHED=dmda</c> . Use <c>help</c> to get the list of available schedulers.
+
+The <b>eager</b> scheduler uses a central task queue, from which workers draw tasks
+to work on. This however does not permit to prefetch data since the scheduling
+decision is taken late. If a task has a non-0 priority, it is put at the front of the queue.
+
+The <b>prio</b> scheduler also uses a central task queue, but sorts tasks by
+priority (between -5 and 5).
+
+The <b>random</b> scheduler distributes tasks randomly according to assumed worker
+overall performance.
+
+The <b>ws</b> (work stealing) scheduler schedules tasks on the local worker by
+default. When a worker becomes idle, it steals a task from the most loaded
+worker.
+
+The <b>dm</b> (deque model) scheduler takes task execution performance models into account to
+perform an HEFT-similar scheduling strategy: it schedules tasks where their
+termination time will be minimal.
+
+The <b>dmda</b> (deque model data aware) scheduler is similar to dm, it also takes
+into account data transfer time.
+
+The <b>dmdar</b> (deque model data aware ready) scheduler is similar to dmda,
+it also sorts tasks on per-worker queues by number of already-available data
+buffers.
+
+The <b>dmdas</b> (deque model data aware sorted) scheduler is similar to dmda, it
+also supports arbitrary priority values.
+
+The <b>heft</b> (heterogeneous earliest finish time) scheduler is deprecated. It
+is now just an alias for <b>dmda</b>.
+
+The <b>pheft</b> (parallel HEFT) scheduler is similar to heft, it also supports
+parallel tasks (still experimental). Should not be used when several contexts using
+it are being executed simultaneously.
+
+The <b>peager</b> (parallel eager) scheduler is similar to eager, it also
+supports parallel tasks (still experimental). Should not be used when several 
+contexts using it are being executed simultaneously.
+
+\section TaskDistributionVsDataTransfer Task Distribution Vs Data Transfer
+
+Distributing tasks to balance the load induces data transfer penalty. StarPU
+thus needs to find a balance between both. The target function that the
+scheduler <c>dmda</c> of StarPU
+tries to minimize is <c>alpha * T_execution + beta * T_data_transfer</c>, where
+<c>T_execution</c> is the estimated execution time of the codelet (usually
+accurate), and <c>T_data_transfer</c> is the estimated data transfer time. The
+latter is estimated based on bus calibration before execution start,
+i.e. with an idle machine, thus without contention. You can force bus
+re-calibration by running the tool <c>starpu_calibrate_bus</c>. The
+beta parameter defaults to <c>1</c>, but it can be worth trying to tweak it
+by using <c>export STARPU_SCHED_BETA=2</c> for instance, since during
+real application execution, contention makes transfer times bigger.
+This is of course imprecise, but in practice, a rough estimation
+already gives the good results that a precise estimation would give.
+
+\section Power-basedScheduling Power-based Scheduling
+
+If the application can provide some power performance model (through
+the field starpu_codelet::power_model), StarPU will
+take it into account when distributing tasks. The target function that
+the scheduler <c>dmda</c> minimizes becomes <c>alpha * T_execution +
+beta * T_data_transfer + gamma * Consumption</c> , where <c>Consumption</c>
+is the estimated task consumption in Joules. To tune this parameter, use
+<c>export STARPU_SCHED_GAMMA=3000</c> for instance, to express that each Joule
+(i.e. 1 kW during 1000us) is worth 3000us execution time penalty. Setting
+<c>alpha</c> and <c>beta</c> to zero permits to only take into account power consumption.
+
+This is however not sufficient to correctly optimize power: the scheduler would
+simply tend to run all computations on the most energy-conservative processing
+unit. To account for the consumption of the whole machine (including idle
+processing units), the idle power of the machine should be given by setting
+<c>export STARPU_IDLE_POWER=200</c> for 200W, for instance. This value can often
+be obtained from the machine power supplier.
+
+The power actually consumed by the total execution can be displayed by setting
+<c>export STARPU_PROFILING=1 STARPU_WORKER_STATS=1</c> .
+
+On-line task consumption measurement is currently only supported through the
+<c>CL_PROFILING_POWER_CONSUMED</c> OpenCL extension, implemented in the MoviSim
+simulator. Applications can however provide explicit measurements by
+using the function starpu_perfmodel_update_history() (exemplified in \ref PerformanceModelExample
+with the <c>power_model</c> performance model). Fine-grain
+measurement is often not feasible with the feedback provided by the hardware, so
+the user can for instance run a given task a thousand times, measure the global
+consumption for that series of tasks, divide it by a thousand, repeat for
+varying kinds of tasks and task sizes, and eventually feed StarPU
+with these manual measurements through starpu_perfmodel_update_history().
+
+\section StaticScheduling Static Scheduling
+
+In some cases, one may want to force some scheduling, for instance force a given
+set of tasks to GPU0, another set to GPU1, etc. while letting some other tasks
+be scheduled on any other device. This can indeed be useful to guide StarPU into
+some work distribution, while still letting some degree of dynamism. For
+instance, to force execution of a task on CUDA0:
+
+\code{.c}
+task->execute_on_a_specific_worker = 1;
+task->worker = starpu_worker_get_by_type(STARPU_CUDA_WORKER, 0);
+\endcode
+
+Note however that using scheduling contexts while statically scheduling tasks on workers
+could be tricky. Be careful to schedule the tasks exactly on the workers of the corresponding
+contexts, otherwise the workers' corresponding scheduling structures may not be allocated or
+the execution of the application may deadlock. Moreover, the hypervisor should not be used when
+statically scheduling tasks.
+
+\section DefiningANewSchedulingPolicy Defining A New Scheduling Policy
+
+A full example showing how to define a new scheduling policy is available in
+the StarPU sources in the directory <c>examples/scheduler/</c>.
+
+See \ref API_Scheduling_Policy
+
+\code{.c}
+static struct starpu_sched_policy dummy_sched_policy = {
+    .init_sched = init_dummy_sched,
+    .deinit_sched = deinit_dummy_sched,
+    .add_workers = dummy_sched_add_workers,
+    .remove_workers = dummy_sched_remove_workers,
+    .push_task = push_task_dummy,
+    .push_prio_task = NULL,
+    .pop_task = pop_task_dummy,
+    .post_exec_hook = NULL,
+    .pop_every_task = NULL,
+    .policy_name = "dummy",
+    .policy_description = "dummy scheduling strategy"
+};
+\endcode
+
+*/

doc/doxygen/chapters/13scheduling_contexts.doxy → doc/doxygen/chapters/09scheduling_contexts.doxy


doc/doxygen/chapters/14scheduling_context_hypervisor.doxy → doc/doxygen/chapters/10scheduling_context_hypervisor.doxy


+ 53 - 0
doc/doxygen/chapters/11debugging_tools.doxy

@@ -0,0 +1,53 @@
+/*
+ * This file is part of the StarPU Handbook.
+ * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
+ * See the file version.doxy for copying conditions.
+ */
+
+/*! \page DebuggingTools Debugging Tools
+
+StarPU provides several tools to help debugging applications. Execution traces
+can be generated and displayed graphically, see \ref
+GeneratingTracesWithFxT.
+
+Some gdb helpers are also provided to show the whole StarPU state:
+
+\verbatim
+(gdb) source tools/gdbinit
+(gdb) help starpu
+\endverbatim
+
+Valgrind can be used on StarPU: valgrind.h just needs to be found at ./configure
+time, to tell valgrind about some known false positives and disable host memory
+pinning. Other known false positives can be suppressed by giving the suppression
+files in tools/valgrind/ *.suppr to valgrind's --suppressions option.
+
+The STARPU_DISABLE_KERNELS environment variable can also be set to 1 to make
+StarPU do everything (schedule tasks, transfer memory, etc.) except actually
+calling the application-provided kernel functions, i.e. the computation will not
+happen. This permits to quickly check that the task scheme is working properly.
+
+The Temanejo task debugger can also be used, see \ref UsingTheTemanejoTaskDebugger.
+
+\section UsingTheTemanejoTaskDebugger Using The Temanejo Task Debugger
+
+StarPU can connect to Temanejo >= 1.0rc2 (see
+http://www.hlrs.de/temanejo), to permit
+nice visual task debugging. To do so, build Temanejo's <c>libayudame.so</c>,
+install <c>Ayudame.h</c> to e.g. <c>/usr/local/include</c>, apply the
+<c>tools/patch-ayudame</c> to it to fix C build, re-<c>./configure</c>, make
+sure that it found it, rebuild StarPU.  Run the Temanejo GUI, give it the path
+to your application, any options you want to pass it, the path to <c>libayudame.so</c>.
+
+Make sure to specify at least the same number of CPUs in the dialog box as your
+machine has, otherwise an error will happen during execution. Future versions
+of Temanejo should be able to tell StarPU the number of CPUs to use.
+
+Tag numbers have to be below <c>4000000000000000000ULL</c> to be usable for
+Temanejo (so as to distinguish them from tasks).
+
+
+
+*/

+ 437 - 0
doc/doxygen/chapters/12online_performance_tools.doxy

@@ -0,0 +1,437 @@
+/*
+ * This file is part of the StarPU Handbook.
+ * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
+ * See the file version.doxy for copying conditions.
+ */
+
+/*! \page OnlinePerformanceTools Online Performance Tools
+
+\section On-linePerformanceFeedback On-line Performance Feedback
+
+\subsection EnablingOn-linePerformanceMonitoring Enabling On-line Performance Monitoring
+
+In order to enable online performance monitoring, the application can
+call starpu_profiling_status_set() with the parameter
+::STARPU_PROFILING_ENABLE. It is possible to detect whether monitoring
+is already enabled or not by calling starpu_profiling_status_get().
+Enabling monitoring also reinitializes all previously collected
+feedback. The environment variable \ref STARPU_PROFILING can also be
+set to <c>1</c> to achieve the same effect. The function
+starpu_profiling_init() can also be called during the execution to
+reinitialize performance counters and to start the profiling if the
+environment variable \ref STARPU_PROFILING is set to <c>1</c>.
+
+Likewise, performance monitoring is stopped by calling
+starpu_profiling_status_set() with the parameter
+::STARPU_PROFILING_DISABLE. Note that this does not reset the
+performance counters so that the application may consult them later
+on.
+
+More details about the performance monitoring API are available in \ref API_Profiling.
+
+\subsection Per-taskFeedback Per-task Feedback
+
+If profiling is enabled, a pointer to a structure
+starpu_profiling_task_info is put in the field
+starpu_task::profiling_info when a task terminates. This structure is
+automatically destroyed when the task structure is destroyed, either
+automatically or by calling starpu_task_destroy().
+
+The structure starpu_profiling_task_info indicates the date when the
+task was submitted (starpu_profiling_task_info::submit_time), started
+(starpu_profiling_task_info::start_time), and terminated
+(starpu_profiling_task_info::end_time), relative to the initialization
+of StarPU with starpu_init(). It also specifies the identifier of the worker
+that has executed the task (starpu_profiling_task_info::workerid).
+These dates are stored as <c>timespec</c> structures which the user may convert
+into micro-seconds using the helper function
+starpu_timing_timespec_to_us().
+
+It is worth noting that the application may directly access this structure from
+the callback executed at the end of the task. The structure starpu_task
+associated to the callback currently being executed is indeed accessible with
+the function starpu_task_get_current().
+
+\subsection Per-codeletFeedback Per-codelet Feedback
+
+The field starpu_codelet::per_worker_stats is
+an array of counters. The i-th entry of the array is incremented every time a
+task implementing the codelet is executed on the i-th worker.
+This array is not reinitialized when profiling is enabled or disabled.
+
+\subsection Per-workerFeedback Per-worker Feedback
+
+The second argument returned by the function
+starpu_profiling_worker_get_info() is a structure
+starpu_profiling_worker_info that gives statistics about the specified
+worker. This structure specifies when StarPU started collecting
+profiling information for that worker
+(starpu_profiling_worker_info::start_time), the
+duration of the profiling measurement interval
+(starpu_profiling_worker_info::total_time), the time spent executing
+kernels (starpu_profiling_worker_info::executing_time), the time
+spent sleeping because there is no task to execute at all
+(starpu_profiling_worker_info::sleeping_time), and the number of tasks that were executed
+while profiling was enabled. These values give an estimation of the
+proportion of time spent doing real work, and the time spent either
+sleeping because there are not enough executable tasks or simply
+wasted in pure StarPU overhead.
+
+Calling starpu_profiling_worker_get_info() resets the profiling
+information associated to a worker.
+
+When an FxT trace is generated (see \ref GeneratingTracesWithFxT), it is also
+possible to use the tool <c>starpu_workers_activity</c> (see \ref
+MonitoringActivity) to generate a graphic showing the evolution of
+these values during the time, for the different workers.
+
+\subsection Bus-relatedFeedback Bus-related Feedback
+
+TODO: add \ref STARPU_BUS_STATS
+
+// how to enable/disable performance monitoring
+// what kind of information do we get ?
+
+The bus speed measured by StarPU can be displayed by using the tool
+<c>starpu_machine_display</c>, for instance:
+
+\verbatim
+StarPU has found:
+        3 CUDA devices
+                CUDA 0 (Tesla C2050 02:00.0)
+                CUDA 1 (Tesla C2050 03:00.0)
+                CUDA 2 (Tesla C2050 84:00.0)
+from    to RAM          to CUDA 0       to CUDA 1       to CUDA 2
+RAM     0.000000        5176.530428     5176.492994     5191.710722
+CUDA 0  4523.732446     0.000000        2414.074751     2417.379201
+CUDA 1  4523.718152     2414.078822     0.000000        2417.375119
+CUDA 2  4534.229519     2417.069025     2417.060863     0.000000
+\endverbatim
+
+\subsection StarPU-TopInterface StarPU-Top Interface
+
+StarPU-Top is an interface which remotely displays the on-line state of a StarPU
+application and permits the user to change parameters on the fly.
+
+Variables to be monitored can be registered by calling the functions
+starpu_top_add_data_boolean(), starpu_top_add_data_integer(),
+starpu_top_add_data_float(), e.g.:
+
+\code{.c}
+starpu_top_data *data = starpu_top_add_data_integer("mynum", 0, 100, 1);
+\endcode
+
+The application should then call starpu_top_init_and_wait() to give its name
+and wait for StarPU-Top to get a start request from the user. The name is used
+by StarPU-Top to quickly reload a previously-saved layout of parameter display.
+
+\code{.c}
+starpu_top_init_and_wait("the application");
+\endcode
+
+The new values can then be provided thanks to
+starpu_top_update_data_boolean(), starpu_top_update_data_integer(),
+starpu_top_update_data_float(), e.g.:
+
+\code{.c}
+starpu_top_update_data_integer(data, mynum);
+\endcode
+
+Updateable parameters can be registered thanks to starpu_top_register_parameter_boolean(), starpu_top_register_parameter_integer(), starpu_top_register_parameter_float(), e.g.:
+
+\code{.c}
+float alpha;
+starpu_top_register_parameter_float("alpha", &alpha, 0, 10, modif_hook);
+\endcode
+
+<c>modif_hook</c> is a function which will be called when the parameter is being modified, it can for instance print the new value:
+
+\code{.c}
+void modif_hook(struct starpu_top_param *d) {
+    fprintf(stderr,"%s has been modified: %f\n", d->name, alpha);
+}
+\endcode
+
+Task schedulers should notify StarPU-Top when they have decided when a task will be
+scheduled, so that it can show it in its Gantt chart, for instance:
+
+\code{.c}
+starpu_top_task_prevision(task, workerid, begin, end);
+\endcode
+
+Starting StarPU-Top (StarPU-Top is started via the binary
+<c>starpu_top</c>.) and the application can be done two ways:
+
+<ul>
+<li> The application is started by hand on some machine (and thus already
+waiting for the start event). In the Preference dialog of StarPU-Top, the SSH
+checkbox should be unchecked, and the hostname and port (default is 2011) on
+which the application is already running should be specified. Clicking on the
+connection button will thus connect to the already-running application.
+</li>
+<li> StarPU-Top is started first, and clicking on the connection button will
+start the application itself (possibly on a remote machine). The SSH checkbox
+should be checked, and a command line provided, e.g.:
+
+\verbatim
+$ ssh myserver STARPU_SCHED=dmda ./application
+\endverbatim
+
+If port 2011 of the remote machine can not be accessed directly, an ssh port bridge should be added:
+
+\verbatim
+$ ssh -L 2011:localhost:2011 myserver STARPU_SCHED=dmda ./application
+\endverbatim
+
+and "localhost" should be used as IP Address to connect to.
+</li>
+</ul>
+
+\section TaskAndWorkerProfiling Task And Worker Profiling
+
+A full example showing how to use the profiling API is available in
+the StarPU sources in the directory <c>examples/profiling/</c>.
+
+\code{.c}
+struct starpu_task *task = starpu_task_create();
+task->cl = &cl;
+task->synchronous = 1;
+/* We will destroy the task structure by hand so that we can
+ * query the profiling info before the task is destroyed. */
+task->destroy = 0;
+
+/* Submit and wait for completion (since synchronous was set to 1) */
+starpu_task_submit(task);
+
+/* The task is finished, get profiling information */
+struct starpu_profiling_task_info *info = task->profiling_info;
+
+/* How much time did it take before the task started ? */
+double delay = starpu_timing_timespec_delay_us(&info->submit_time, &info->start_time);
+
+/* How long was the task execution ? */
+double length = starpu_timing_timespec_delay_us(&info->start_time, &info->end_time);
+
+/* We don't need the task structure anymore */
+starpu_task_destroy(task);
+\endcode
+
+\code{.c}
+/* Display the occupancy of all workers during the test */
+int worker;
+for (worker = 0; worker < starpu_worker_get_count(); worker++)
+{
+        struct starpu_profiling_worker_info worker_info;
+        int ret = starpu_profiling_worker_get_info(worker, &worker_info);
+        STARPU_ASSERT(!ret);
+
+        double total_time = starpu_timing_timespec_to_us(&worker_info.total_time);
+        double executing_time = starpu_timing_timespec_to_us(&worker_info.executing_time);
+        double sleeping_time = starpu_timing_timespec_to_us(&worker_info.sleeping_time);
+        double overhead_time = total_time - executing_time - sleeping_time;
+
+        float executing_ratio = 100.0*executing_time/total_time;
+        float sleeping_ratio = 100.0*sleeping_time/total_time;
+        float overhead_ratio = 100.0 - executing_ratio - sleeping_ratio;
+
+        char workername[128];
+        starpu_worker_get_name(worker, workername, 128);
+        fprintf(stderr, "Worker %s:\n", workername);
+        fprintf(stderr, "\ttotal time: %.2lf ms\n", total_time*1e-3);
+        fprintf(stderr, "\texec time: %.2lf ms (%.2f %%)\n",
+                executing_time*1e-3, executing_ratio);
+        fprintf(stderr, "\tblocked time: %.2lf ms (%.2f %%)\n",
+                sleeping_time*1e-3, sleeping_ratio);
+        fprintf(stderr, "\toverhead time: %.2lf ms (%.2f %%)\n",
+                overhead_time*1e-3, overhead_ratio);
+}
+\endcode
+
+\section PerformanceModelExample Performance Model Example
+
+To achieve good scheduling, StarPU scheduling policies need to be able to
+estimate in advance the duration of a task. This is done by giving to codelets
+a performance model, by defining a structure starpu_perfmodel and
+providing its address in the field starpu_codelet::model. The fields
+starpu_perfmodel::symbol and starpu_perfmodel::type are mandatory, to
+give a name to the model, and the type of the model, since there are
+several kinds of performance models. For compatibility, make sure to
+initialize the whole structure to zero, either by using explicit
+memset(), or by letting the compiler implicitly do it as exemplified
+below.
+
+<ul>
+<li>
+Measured at runtime (model type ::STARPU_HISTORY_BASED). This assumes that for a
+given set of data input/output sizes, the performance will always be about the
+same. This is very true for regular kernels on GPUs for instance (<0.1% error),
+and just a bit less true on CPUs (~=1% error). This also assumes that there are
+few different sets of data input/output sizes. StarPU will then keep record of
+the average time of previous executions on the various processing units, and use
+it as an estimation. History is done per task size, by using a hash of the input
+and output sizes as an index.
+It will also save it in <c>$STARPU_HOME/.starpu/sampling/codelets</c>
+for further executions, and can be observed by using the tool
+<c>starpu_perfmodel_display</c>, or drawn by using
+the tool <c>starpu_perfmodel_plot</c> (\ref PerformanceModelCalibration).  The
+models are indexed by machine name. To
+share the models between machines (e.g. for a homogeneous cluster), use
+<c>export STARPU_HOSTNAME=some_global_name</c>. Measurements are only done
+when using a task scheduler which makes use of it, such as
+<c>dmda</c>. Measurements can also be provided explicitly by the application, by
+using the function starpu_perfmodel_update_history().
+
+The following is a small code example.
+
+If e.g. the code is recompiled with other compilation options, or several
+variants of the code are used, the symbol string should be changed to reflect
+that, in order to recalibrate a new model from zero. The symbol string can even
+be constructed dynamically at execution time, as long as this is done before
+submitting any task using it.
+
+\code{.c}
+static struct starpu_perfmodel mult_perf_model = {
+    .type = STARPU_HISTORY_BASED,
+    .symbol = "mult_perf_model"
+};
+
+struct starpu_codelet cl = {
+    .cpu_funcs = { cpu_mult, NULL },
+    .cpu_funcs_name = { "cpu_mult", NULL },
+    .nbuffers = 3,
+    .modes = { STARPU_R, STARPU_R, STARPU_W },
+    /* for the scheduling policy to be able to use performance models */
+    .model = &mult_perf_model
+};
+\endcode
+
+</li>
+<li>
+Measured at runtime and refined by regression (model types
+::STARPU_REGRESSION_BASED and ::STARPU_NL_REGRESSION_BASED). This
+still assumes performance regularity, but works 
+with various data input sizes, by applying regression over observed
+execution times. ::STARPU_REGRESSION_BASED uses an a*n^b regression
+form, ::STARPU_NL_REGRESSION_BASED uses an a*n^b+c (more precise than
+::STARPU_REGRESSION_BASED, but costs a lot more to compute).
+
+For instance,
+<c>tests/perfmodels/regression_based.c</c> uses a regression-based performance
+model for the function memset().
+
+Of course, the application has to issue
+tasks with varying size so that the regression can be computed. StarPU will not
+trust the regression unless there is at least 10% difference between the minimum
+and maximum observed input size. It can be useful to set the
+environment variable \ref STARPU_CALIBRATE to <c>1</c> and run the application
+on varying input sizes with \ref STARPU_SCHED set to <c>dmda</c> scheduler,
+so as to feed the performance model for a variety of
+inputs. The application can also provide the measurements explicitly by
+using the function starpu_perfmodel_update_history(). The tools
+<c>starpu_perfmodel_display</c> and <c>starpu_perfmodel_plot</c> can
+be used to observe how much the performance model is calibrated (\ref
+PerformanceModelCalibration); when their output look good,
+\ref STARPU_CALIBRATE can be reset to <c>0</c> to let
+StarPU use the resulting performance model without recording new measures, and
+\ref STARPU_SCHED can be set to <c>dmda</c> to benefit from the performance models. If
+the data input sizes vary a lot, it is really important to set
+\ref STARPU_CALIBRATE to <c>0</c>, otherwise StarPU will continue adding the
+measures, and result with a very big performance model, which will take a
+lot of time to load and save.
+
+For non-linear regression, since computing it
+is quite expensive, it is only done at termination of the application. This
+means that the first execution of the application will use only history-based
+performance model to perform scheduling, without using regression.
+</li>
+
+<li>
+Provided as an estimation from the application itself (model type
+::STARPU_COMMON and field starpu_perfmodel::cost_function),
+see for instance
+<c>examples/common/blas_model.h</c> and <c>examples/common/blas_model.c</c>.
+</li>
+
+<li>
+Provided explicitly by the application (model type ::STARPU_PER_ARCH):
+the fields <c>.per_arch[arch][nimpl].cost_function</c> have to be
+filled with pointers to functions which return the expected duration
+of the task in micro-seconds, one per architecture.
+</li>
+</ul>
+
+For ::STARPU_HISTORY_BASED, ::STARPU_REGRESSION_BASED, and
+::STARPU_NL_REGRESSION_BASED, the dimensions of task data (both input
+and output) are used as an index by default. ::STARPU_HISTORY_BASED uses a CRC
+hash of the dimensions as an index to distinguish histories, and
+::STARPU_REGRESSION_BASED and ::STARPU_NL_REGRESSION_BASED use the total
+size as an index for the regression.
+
+The starpu_perfmodel::size_base and starpu_perfmodel::footprint fields however
+permit the application to override that, when for instance some of the data
+do not matter for task cost (e.g. mere reference table), or when using sparse
+structures (in which case it is the number of non-zeros which matter), or when
+there is some hidden parameter such as the number of iterations, or when the
+application actually has a very good idea of the complexity of the algorithm,
+and just not the speed of the processor, etc.  The example in the directory
+<c>examples/pi</c> uses this to include the number of iterations in the base
+size. starpu_perfmodel::size_base should be used when the variance of the actual
+performance is known (i.e. bigger returned value is longer execution
+time), and thus particularly useful for ::STARPU_REGRESSION_BASED or
+::STARPU_NL_REGRESSION_BASED. starpu_perfmodel::footprint can be used when the
+variance of the actual performance is unknown (irregular performance behavior,
+etc.), and thus only useful for ::STARPU_HISTORY_BASED.
+starpu_task_data_footprint() can be used as a base and combined with other
+parameters through starpu_hash_crc32c_be for instance.
+
+StarPU will automatically determine when the performance model is calibrated,
+or rather, it will assume the performance model is calibrated until the
+application submits a task for which the performance can not be predicted. For
+::STARPU_HISTORY_BASED, StarPU will require 10 (_STARPU_CALIBRATION_MINIMUM)
+measurements for a given size before estimating that an average can be taken as
+estimation for further executions with the same size. For
+::STARPU_REGRESSION_BASED and ::STARPU_NL_REGRESSION_BASED, StarPU will require
+10 (_STARPU_CALIBRATION_MINIMUM) measurements, and that the minimum measured
+data size is smaller than 90% of the maximum measured data size (i.e. the
+measurement interval is large enough for a regression to have a meaning).
+Calibration can also be forced by setting the \ref STARPU_CALIBRATE environment
+variable to <c>1</c>, or even reset by setting it to <c>2</c>.
+
+How to use schedulers which can benefit from such performance model is explained
+in \ref TaskSchedulingPolicy.
+
+The same can be done for task power consumption estimation, by setting
+the field starpu_codelet::power_model the same way as the field
+starpu_codelet::model. Note: for now, the application has to give to
+the power consumption performance model a name which is different from
+the execution time performance model.
+
+The application can request time estimations from the StarPU performance
+models by filling a task structure as usual without actually submitting
+it. The data handles can be created by calling any of the functions
+<c>starpu_*_data_register</c> with a <c>NULL</c> pointer and <c>-1</c>
+node and the desired data sizes, and need to be unregistered as usual.
+The functions starpu_task_expected_length() and
+starpu_task_expected_power() can then be called to get an estimation
+of the task cost on a given arch. starpu_task_footprint() can also be
+used to get the footprint used for indexing history-based performance
+models. starpu_task_destroy() needs to be called to destroy the dummy
+task afterwards. See <c>tests/perfmodels/regression_based.c</c> for an example.
+
+\section DataTrace Data trace and tasks length
+It is possible to get statistics about tasks length and data size by using :
+\verbatim
+$ starpu_fxt_data_trace filename [codelet1 codelet2 ... codeletn]
+\endverbatim
+Where filename is the FxT trace file and codeletX the names of the codelets you
+want to profile (if no names are specified, <c>starpu_fxt_data_trace</c> will profile them all).
+This will create a file, <c>data_trace.gp</c> which
+can be executed to get a <c>.eps</c> image of these results. On the image, each point represents a
+task, and each color corresponds to a codelet.
+
+\image html data_trace.png
+\image latex data_trace.eps "" width=\textwidth
+
+*/

+ 80 - 209
doc/doxygen/chapters/05performance_feedback.doxy

@@ -1,211 +1,47 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
  * See the file version.doxy for copying conditions.
  */
 
-/*! \page PerformanceFeedback Performance Feedback
-
-\section UsingTheTemanejoTaskDebugger Using The Temanejo Task Debugger
-
-StarPU can connect to Temanejo >= 1.0rc2 (see
-http://www.hlrs.de/temanejo), to permit
-nice visual task debugging. To do so, build Temanejo's <c>libayudame.so</c>,
-install <c>Ayudame.h</c> to e.g. <c>/usr/local/include</c>, apply the
-<c>tools/patch-ayudame</c> to it to fix C build, re-<c>./configure</c>, make
-sure that it found it, rebuild StarPU.  Run the Temanejo GUI, give it the path
-to your application, any options you want to pass it, the path to <c>libayudame.so</c>.
-
-Make sure to specify at least the same number of CPUs in the dialog box as your
-machine has, otherwise an error will happen during execution. Future versions
-of Temanejo should be able to tell StarPU the number of CPUs to use.
-
-Tag numbers have to be below <c>4000000000000000000ULL</c> to be usable for
-Temanejo (so as to distinguish them from tasks).
-
-\section On-linePerformanceFeedback On-line Performance Feedback
-
-\subsection EnablingOn-linePerformanceMonitoring Enabling On-line Performance Monitoring
-
-In order to enable online performance monitoring, the application can
-call starpu_profiling_status_set() with the parameter
-::STARPU_PROFILING_ENABLE. It is possible to detect whether monitoring
-is already enabled or not by calling starpu_profiling_status_get().
-Enabling monitoring also reinitialize all previously collected
-feedback. The environment variable \ref STARPU_PROFILING can also be
-set to <c>1</c> to achieve the same effect. The function
-starpu_profiling_init() can also be called during the execution to
-reinitialize performance counters and to start the profiling if the
-environment variable \ref STARPU_PROFILING is set to <c>1</c>.
-
-Likewise, performance monitoring is stopped by calling
-starpu_profiling_status_set() with the parameter
-::STARPU_PROFILING_DISABLE. Note that this does not reset the
-performance counters so that the application may consult them later
-on.
-
-More details about the performance monitoring API are available in \ref API_Profiling.
-
-\subsection Per-taskFeedback Per-task Feedback
-
-If profiling is enabled, a pointer to a structure
-starpu_profiling_task_info is put in the field
-starpu_task::profiling_info when a task terminates. This structure is
-automatically destroyed when the task structure is destroyed, either
-automatically or by calling starpu_task_destroy().
-
-The structure starpu_profiling_task_info indicates the date when the
-task was submitted (starpu_profiling_task_info::submit_time), started
-(starpu_profiling_task_info::start_time), and terminated
-(starpu_profiling_task_info::end_time), relative to the initialization
-of StarPU with starpu_init(). It also specifies the identifier of the worker
-that has executed the task (starpu_profiling_task_info::workerid).
-These date are stored as <c>timespec</c> structures which the user may convert
-into micro-seconds using the helper function
-starpu_timing_timespec_to_us().
-
-It it worth noting that the application may directly access this structure from
-the callback executed at the end of the task. The structure starpu_task
-associated to the callback currently being executed is indeed accessible with
-the function starpu_task_get_current().
-
-\subsection Per-codeletFeedback Per-codelet Feedback
-
-The field starpu_codelet::per_worker_stats is
-an array of counters. The i-th entry of the array is incremented every time a
-task implementing the codelet is executed on the i-th worker.
-This array is not reinitialized when profiling is enabled or disabled.
-
-\subsection Per-workerFeedback Per-worker Feedback
-
-The second argument returned by the function
-starpu_profiling_worker_get_info() is a structure
-starpu_profiling_worker_info that gives statistics about the specified
-worker. This structure specifies when StarPU started collecting
-profiling information for that worker
-(starpu_profiling_worker_info::start_time), the
-duration of the profiling measurement interval
-(starpu_profiling_worker_info::total_time), the time spent executing
-kernels (starpu_profiling_worker_info::executing_time), the time
-spent sleeping because there is no task to execute at all
-(starpu_profiling_worker_info::sleeping_time), and the number of tasks that were executed
-while profiling was enabled. These values give an estimation of the
-proportion of time spent do real work, and the time spent either
-sleeping because there are not enough executable tasks or simply
-wasted in pure StarPU overhead.
-
-Calling starpu_profiling_worker_get_info() resets the profiling
-information associated to a worker.
-
-When an FxT trace is generated (see \ref GeneratingTracesWithFxT), it is also
-possible to use the tool <c>starpu_workers_activity</c> (see \ref
-MonitoringActivity) to generate a graphic showing the evolution of
-these values during the time, for the different workers.
-
-\subsection Bus-relatedFeedback Bus-related Feedback
-
-TODO: ajouter \ref STARPU_BUS_STATS
-
-// how to enable/disable performance monitoring
-// what kind of information do we get ?
-
-The bus speed measured by StarPU can be displayed by using the tool
-<c>starpu_machine_display</c>, for instance:
+/*! \page OfflinePerformanceTools Offline Performance Tools
 
-\verbatim
-StarPU has found:
-        3 CUDA devices
-                CUDA 0 (Tesla C2050 02:00.0)
-                CUDA 1 (Tesla C2050 03:00.0)
-                CUDA 2 (Tesla C2050 84:00.0)
-from    to RAM          to CUDA 0       to CUDA 1       to CUDA 2
-RAM     0.000000        5176.530428     5176.492994     5191.710722
-CUDA 0  4523.732446     0.000000        2414.074751     2417.379201
-CUDA 1  4523.718152     2414.078822     0.000000        2417.375119
-CUDA 2  4534.229519     2417.069025     2417.060863     0.000000
-\endverbatim
-
-\subsection StarPU-TopInterface StarPU-Top Interface
-
-StarPU-Top is an interface which remotely displays the on-line state of a StarPU
-application and permits the user to change parameters on the fly.
-
-Variables to be monitored can be registered by calling the functions
-starpu_top_add_data_boolean(), starpu_top_add_data_integer(),
-starpu_top_add_data_float(), e.g.:
-
-\code{.c}
-starpu_top_data *data = starpu_top_add_data_integer("mynum", 0, 100, 1);
-\endcode
-
-The application should then call starpu_top_init_and_wait() to give its name
-and wait for StarPU-Top to get a start request from the user. The name is used
-by StarPU-Top to quickly reload a previously-saved layout of parameter display.
-
-\code{.c}
-starpu_top_init_and_wait("the application");
-\endcode
-
-The new values can then be provided thanks to
-starpu_top_update_data_boolean(), starpu_top_update_data_integer(),
-starpu_top_update_data_float(), e.g.:
-
-\code{.c}
-starpu_top_update_data_integer(data, mynum);
-\endcode
-
-Updateable parameters can be registered thanks to starpu_top_register_parameter_boolean(), starpu_top_register_parameter_integer(), starpu_top_register_parameter_float(), e.g.:
-
-\code{.c}
-float alpha;
-starpu_top_register_parameter_float("alpha", &alpha, 0, 10, modif_hook);
-\endcode
-
-<c>modif_hook</c> is a function which will be called when the parameter is being modified, it can for instance print the new value:
-
-\code{.c}
-void modif_hook(struct starpu_top_param *d) {
-    fprintf(stderr,"%s has been modified: %f\n", d->name, alpha);
-}
-\endcode
-
-Task schedulers should notify StarPU-Top when it has decided when a task will be
-scheduled, so that it can show it in its Gantt chart, for instance:
-
-\code{.c}
-starpu_top_task_prevision(task, workerid, begin, end);
-\endcode
-
-Starting StarPU-Top (StarPU-Top is started via the binary
-<c>starpu_top</c>.) and the application can be done two ways:
+To get an idea of what is happening, a lot of performance feedback is available,
+detailed in this chapter. The various pieces of information below should be checked.
 
 <ul>
-<li> The application is started by hand on some machine (and thus already
-waiting for the start event). In the Preference dialog of StarPU-Top, the SSH
-checkbox should be unchecked, and the hostname and port (default is 2011) on
-which the application is already running should be specified. Clicking on the
-connection button will thus connect to the already-running application.
-</li>
-<li> StarPU-Top is started first, and clicking on the connection button will
-start the application itself (possibly on a remote machine). The SSH checkbox
-should be checked, and a command line provided, e.g.:
-
-\verbatim
-$ ssh myserver STARPU_SCHED=dmda ./application
-\endverbatim
-
-If port 2011 of the remote machine can not be accessed directly, an ssh port bridge should be added:
-
-\verbatim
-$ ssh -L 2011:localhost:2011 myserver STARPU_SCHED=dmda ./application
-\endverbatim
-
-and "localhost" should be used as IP Address to connect to.
+<li>
+What does the Gantt diagram look like? (see \ref CreatingAGanttDiagram)
+<ul>
+  <li> If it's mostly green (tasks running in the initial context) or context specific
+  color prevailing, then the machine is properly
+  utilized, and perhaps the codelets are just slow. Check their performance, see
+  \ref PerformanceOfCodelets.
+  </li>
+  <li> If it's mostly purple (FetchingInput), tasks keep waiting for data
+  transfers, do you perhaps have far more communication than computation? Did
+  you properly use CUDA streams to make sure communication can be
+  overlapped? Did you use data-locality aware schedulers to avoid transfers as
+  much as possible?
+  </li>
+  <li> If it's mostly red (Blocked), tasks keep waiting for dependencies,
+  do you have enough parallelism? It might be a good idea to check what the DAG
+  looks like (see \ref CreatingADAGWithGraphviz).
+  </li>
+  <li> If only some workers are completely red (Blocked), for some reason the
+  scheduler didn't assign tasks to them. Perhaps the performance model is bogus,
+  check it (see \ref PerformanceOfCodelets). Do all your codelets have a
+  performance model?  When some of them don't, the scheduler switches to a
+  greedy algorithm which thus performs badly.
+  </li>
+</ul>
 </li>
 </ul>
 
+You can also use the Temanejo task debugger (see \ref UsingTheTemanejoTaskDebugger) to
+visualize the task graph more easily.
+
 \section Off-linePerformanceFeedback Off-line Performance Feedback
 
 \subsection GeneratingTracesWithFxT Generating Traces With FxT
@@ -492,6 +328,55 @@ execution time.
 \ref TheoreticalLowerBoundOnExecutionTimeExample provides an example on how to
 use this.
 
+\section TheoreticalLowerBoundOnExecutionTimeExample Theoretical Lower Bound On Execution Time Example
+
+For kernels with history-based performance models (and provided that
+they are completely calibrated), StarPU can very easily provide a
+theoretical lower bound for the execution time of a whole set of
+tasks. See for instance <c>examples/lu/lu_example.c</c>: before
+submitting tasks, call the function starpu_bound_start(), and after
+complete execution, call starpu_bound_stop().
+starpu_bound_print_lp() or starpu_bound_print_mps() can then be used
+to output a Linear Programming problem corresponding to the schedule
+of your tasks. Run it through <c>lp_solve</c> or any other linear
+programming solver, and that will give you a lower bound for the total
+execution time of your tasks. If StarPU was compiled with the library
+<c>glpk</c> installed, starpu_bound_compute() can be used to solve it
+immediately and get the optimized minimum, in ms. Its parameter
+<c>integer</c> allows to decide whether integer resolution should be
+computed and returned.
+
+The <c>deps</c> parameter tells StarPU whether to take tasks, implicit
+data, and tag dependencies into account. Tags released in a callback
+or similar are not taken into account, only tags associated with a task are.
+It must be understood that the linear programming
+problem size is quadratic with the number of tasks and thus the time to solve it
+will be very long; it could take minutes for just a few dozen tasks. You should
+probably use <c>lp_solve -timeout 1 test.pl -wmps test.mps</c> to convert the
+problem to MPS format and then use a better solver, <c>glpsol</c> might be
+better than <c>lp_solve</c> for instance (the <c>--pcost</c> option may be
+useful), but sometimes doesn't manage to converge. <c>cbc</c> might look
+slower, but it is parallel. For <c>lp_solve</c>, be sure to try at least all the
+<c>-B</c> options. For instance, we often just use <c>lp_solve -cc -B1 -Bb
+-Bg -Bp -Bf -Br -BG -Bd -Bs -BB -Bo -Bc -Bi</c> , and the <c>-gr</c> option can
+also be quite useful. The resulting schedule can be observed by using
+the tool <c>starpu_lp2paje</c>, which converts it into the Paje
+format.
+
+Data transfer time can only be taken into account when <c>deps</c> is set. Only
+data transfers inferred from implicit data dependencies between tasks are taken
+into account. Other data transfers are assumed to be completely overlapped.
+
+Setting <c>deps</c> to 0 will only take into account the actual computations
+on processing units. It however still properly takes into account the varying
+performances of kernels and processing units, which is quite more accurate than
+just comparing StarPU performances with the fastest of the kernels being used.
+
+The <c>prio</c> parameter tells StarPU whether to simulate taking into account
+the priorities as the StarPU scheduler would, i.e. schedule prioritized
+tasks before less prioritized tasks, to check to what extent this results
+in a less optimal solution. This increases the computation time even more.
+
 \section MemoryFeedback Memory Feedback
 
 It is possible to enable memory statistics. To do so, you need to pass
@@ -592,20 +477,6 @@ Computation took (in ms)
 Synthetic GFlops : 44.21
 \endverbatim
 
-\section DataTrace Data trace and tasks length
-It is possible to get statistics about tasks length and data size by using :
-\verbatim
-$starpu_fxt_data_trace filename [codelet1 codelet2 ... codeletn]
-\endverbatim
-Where filename is the FxT trace file and codeletX the names of the codelets you 
-want to profile (if no names are specified, starpu_fxt_data_trace will use them all). 
-This will create a file, <c>data_trace.gp</c> which
-can be plotted to get a .eps image of these results. On the image, each point represents a 
-task, and each color corresponds to a codelet.
-
-\image html data_trace.png
-\image latex data_trace.eps "" width=\textwidth
-
 // TODO: data transfer stats are similar to the ones displayed when
 // setting STARPU_BUS_STATS
 

+ 229 - 0
doc/doxygen/chapters/14faq.doxy

@@ -0,0 +1,229 @@
+/*
+ * This file is part of the StarPU Handbook.
+ * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
+ * See the file version.doxy for copying conditions.
+ */
+
+/*! \page FrequentlyAskedQuestions Frequently Asked Questions
+
+\section HowToInitializeAComputationLibraryOnceForEachWorker How To Initialize A Computation Library Once For Each Worker?
+
+Some libraries need to be initialized once for each concurrent instance that
+may run on the machine. For instance, a C++ computation class which is not
+thread-safe by itself, but for which several instantiated objects of that class
+can be used concurrently. This can be used in StarPU by initializing one such
+object per worker. For instance, the libstarpufft example does the following to
+be able to use FFTW on CPUs.
+
+A global array stores the instantiated objects:
+
+\code{.c}
+fftw_plan plan_cpu[STARPU_NMAXWORKERS];
+\endcode
+
+At initialisation time of libstarpu, the objects are initialized:
+
+\code{.c}
+int workerid;
+for (workerid = 0; workerid < starpu_worker_get_count(); workerid++) {
+    switch (starpu_worker_get_type(workerid)) {
+        case STARPU_CPU_WORKER:
+            plan_cpu[workerid] = fftw_plan(...);
+            break;
+    }
+}
+\endcode
+
+And in the codelet body, they are used:
+
+\code{.c}
+static void fft(void *descr[], void *_args)
+{
+    int workerid = starpu_worker_get_id();
+    fftw_plan plan = plan_cpu[workerid];
+    ...
+
+    fftw_execute(plan, ...);
+}
+\endcode
+
+This however is not sufficient for FFT on CUDA: initialization has
+to be done from the workers themselves.  This can be done thanks to
+starpu_execute_on_each_worker().  For instance libstarpufft does the following.
+
+\code{.c}
+static void fft_plan_gpu(void *args)
+{
+    plan plan = args;
+    int n2 = plan->n2[0];
+    int workerid = starpu_worker_get_id();
+
+    cufftPlan1d(&plan->plans[workerid].plan_cuda, n2, _CUFFT_C2C, 1);
+    cufftSetStream(plan->plans[workerid].plan_cuda, starpu_cuda_get_local_stream());
+}
+void starpufft_plan(void)
+{
+    starpu_execute_on_each_worker(fft_plan_gpu, plan, STARPU_CUDA);
+}
+\endcode
+
+\section UsingTheDriverAPI Using The Driver API
+
+\ref API_Running_Drivers
+
+\code{.c}
+int ret;
+struct starpu_driver = {
+    .type = STARPU_CUDA_WORKER,
+    .id.cuda_id = 0
+};
+ret = starpu_driver_init(&d);
+if (ret != 0)
+    error();
+while (some_condition) {
+    ret = starpu_driver_run_once(&d);
+    if (ret != 0)
+        error();
+}
+ret = starpu_driver_deinit(&d);
+if (ret != 0)
+    error();
+\endcode
+
+To add a new kind of device to the structure starpu_driver, one needs to:
+<ol>
+<li> Add a member to the union starpu_driver::id
+</li>
+<li> Modify the internal function <c>_starpu_launch_drivers()</c> to
+make sure the driver is not always launched.
+</li>
+<li> Modify the function starpu_driver_run() so that it can handle
+another kind of architecture.
+</li>
+<li> Write the new function <c>_starpu_run_foobar()</c> in the
+corresponding driver.
+</li>
+</ol>
+
+\section On-GPURendering On-GPU Rendering
+
+Graphical-oriented applications need to draw the result of their computations,
+typically on the very GPU where these happened. Technologies such as OpenGL/CUDA
+interoperability permit to let CUDA directly work on the OpenGL buffers, making
+them thus immediately ready for drawing, by mapping OpenGL buffer, textures or
+renderbuffer objects into CUDA.  CUDA however imposes some technical
+constraints: peer memcpy has to be disabled, and the thread that runs OpenGL has
+to be the one that runs CUDA computations for that GPU.
+
+To achieve this with StarPU, pass the option
+\ref disable-cuda-memcpy-peer "--disable-cuda-memcpy-peer"
+to <c>./configure</c> (TODO: make it dynamic), OpenGL/GLUT has to be initialized
+first, and the interoperability mode has to
+be enabled by using the field
+starpu_conf::cuda_opengl_interoperability, and the driver loop has to
+be run by the application, by using the field
+starpu_conf::not_launched_drivers to prevent StarPU from running it in
+a separate thread, and by using starpu_driver_run() to run the loop.
+The examples <c>gl_interop</c> and <c>gl_interop_idle</c> show how it
+articulates in a simple case, where rendering is done in task
+callbacks. The former uses <c>glutMainLoopEvent</c> to make GLUT
+progress from the StarPU driver loop, while the latter uses
+<c>glutIdleFunc</c> to make StarPU progress from the GLUT main loop.
+
+Then, to use an OpenGL buffer as a CUDA data, StarPU simply needs to be given
+the CUDA pointer at registration, for instance:
+
+\code{.c}
+/* Get the CUDA worker id */
+for (workerid = 0; workerid < starpu_worker_get_count(); workerid++)
+        if (starpu_worker_get_type(workerid) == STARPU_CUDA_WORKER)
+                break;
+
+/* Build a CUDA pointer pointing at the OpenGL buffer */
+cudaGraphicsResourceGetMappedPointer((void**)&output, &num_bytes, resource);
+
+/* And register it to StarPU */
+starpu_vector_data_register(&handle, starpu_worker_get_memory_node(workerid),
+                            output, num_bytes / sizeof(float4), sizeof(float4));
+
+/* The handle can now be used as usual */
+starpu_task_insert(&cl, STARPU_RW, handle, 0);
+
+/* ... */
+
+/* This gets back data into the OpenGL buffer */
+starpu_data_unregister(handle);
+\endcode
+
+and display it e.g. in the callback function.
+
+\section UsingStarPUWithMKL Using StarPU With MKL 11 (Intel Composer XE 2013)
+
+Some users had issues with MKL 11 and StarPU (versions 1.1rc1 and
+1.0.5) on Linux with MKL, using 1 thread for MKL and doing all the
+parallelism using StarPU (no multithreaded tasks), setting the
+environment variable MKL_NUM_THREADS to 1, and using the threaded MKL library,
+with iomp5.
+
+Using this configuration, StarPU uses only 1 core, no matter the value of
+\ref STARPU_NCPU. The problem is actually a thread pinning issue with MKL.
+
+The solution is to set the environment variable KMP_AFFINITY to <c>disabled</c>
+(http://software.intel.com/sites/products/documentation/studio/composer/en-us/2011Update/compiler_c/optaps/common/optaps_openmp_thread_affinity.htm).
+
+\section ThreadBindingOnNetBSD Thread Binding on NetBSD
+
+When using StarPU on a NetBSD machine, if the topology
+discovery library <c>hwloc</c> is used, thread binding will fail. To
+prevent the problem, you should at least use the version 1.7 of
+<c>hwloc</c>, and also issue the following call:
+
+\verbatim
+$ sysctl -w security.models.extensions.user_set_cpu_affinity=1
+\endverbatim
+
+Or add the following line in the file <c>/etc/sysctl.conf</c>
+
+\verbatim
+security.models.extensions.user_set_cpu_affinity=1
+\endverbatim
+
+
+\section PauseResume Interleaving StarPU and non-StarPU code
+
+If your application only partially uses StarPU, and you do not want to
+call starpu_init() / starpu_shutdown() at the beginning/end
+of each section, StarPU workers will poll for work between the
+sections. To avoid this behavior, you can "pause" StarPU with the 
+starpu_pause() function. This will prevent the StarPU workers from
+accepting new work (tasks that are already in progress will not be
+frozen), and stop them from polling for more work.
+
+Note that this does not prevent you from submitting new tasks, but
+they won't execute until starpu_resume() is called. Also note
+that StarPU must not be paused when you call starpu_shutdown(), and
+that this function pair works in a push/pull manner, i.e. you need to
+match the number of calls to these functions to clear their effect.
+
+
+One way to use these functions could be:
+\code{.c}
+starpu_init(NULL);
+starpu_pause(); // To submit all the tasks without a single one executing
+submit_some_tasks();
+starpu_resume(); // The tasks start executing
+
+
+starpu_task_wait_for_all();
+starpu_pause(); // Stop the workers from polling
+
+// Non-StarPU code
+
+starpu_resume();
+// ...
+starpu_shutdown();
+\endcode
+
+*/

doc/doxygen/chapters/07out_of_core.doxy → doc/doxygen/chapters/15out_of_core.doxy


+ 59 - 9
doc/doxygen/chapters/08mpi_support.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
  * See the file version.doxy for copying conditions.
  */
@@ -234,10 +234,13 @@ The list of functions is described in \ref MPIInsertTask "MPI Insert Task".
 
 Here an stencil example showing how to use starpu_mpi_task_insert(). One
 first needs to define a distribution function which specifies the
-locality of the data. Note that that distribution information needs to
-be given to StarPU by calling starpu_data_set_rank(). A MPI tag
-should also be defined for each data handle by calling
-starpu_data_set_tag().
+locality of the data. Note that the data needs to be registered to MPI
+by calling starpu_mpi_data_register(). This function allows to set
+the distribution information and the MPI tag which should be used when
+communicating the data. The function starpu_mpi_data_register() should
+be prefered to starpu_data_set_rank() and starpu_data_set_tag() as
+it also allows to automatically clear the MPI communication cache
+when unregistering the data.
 
 \code{.c}
 /* Returns the MPI node number where data is */
@@ -284,8 +287,7 @@ data which will be needed by the tasks that we will execute.
                 /* I know it's useless to allocate anything for this */
                 data_handles[x][y] = NULL;
             if (data_handles[x][y]) {
-                starpu_data_set_rank(data_handles[x][y], mpi_rank);
-                starpu_data_set_tag(data_handles[x][y], x*X+y);
+                starpu_mpi_data_register(data_handles[x][y], x*X+y, mpi_rank);
             }
         }
     }
@@ -318,6 +320,55 @@ application can prune the task for loops according to the data distribution,
 so as to only submit tasks on nodes which have to care about them (either to
 execute them, or to send the required data).
 
+A function starpu_mpi_task_build() is also provided with the aim to
+only construct the task structure. All MPI nodes need to call the
+function, only the node which is to execute the task will return a
+valid task structure. Following the execution of the task, all nodes
+need to call the function starpu_mpi_task_post_build() -- with the same
+list of arguments as starpu_mpi_task_build() -- to post all the
+necessary data communications.
+
+\code{.c}
+struct starpu_task *task;
+task = starpu_mpi_task_build(MPI_COMM_WORLD, &cl,
+                             STARPU_RW, data_handles[0],
+                             STARPU_R, data_handles[1],
+                             0);
+if (task) starpu_task_submit(task);
+starpu_mpi_task_post_build(MPI_COMM_WORLD, &cl,
+                           STARPU_RW, data_handles[0],
+                           STARPU_R, data_handles[1],
+                           0);
+\endcode
+
+\section MPICache MPI cache support
+
+StarPU-MPI automatically optimizes duplicate data transmissions: if an MPI
+node B needs a piece of data D from MPI node A for several tasks, only one
+transmission of D will take place from A to B, and the value of D will be kept
+on B as long as no task modifies D.
+
+If a task modifies D, B will wait for all tasks which need the previous value of
+D, before invalidating the value of D. As a consequence, it releases the memory
+occupied by D. Whenever a task running on B needs the new value of D, allocation
+will take place again to receive it.
+
+Since tasks can be submitted dynamically, StarPU-MPI can not know whether the
+current value of data D will again be used by a newly-submitted task before
+being modified by another newly-submitted task, so until a task is submitted to
+modify the current value, it can not decide by itself whether to flush the cache
+or not.  The application can however explicitly tell StarPU-MPI to flush the
+cache by calling starpu_mpi_cache_flush() or starpu_mpi_cache_flush_all_data(),
+for instance in case the data will not be used at all any more (see for instance
+the cholesky example in mpi/examples/matrix_decomposition), or at least not in
+the close future. If a newly-submitted task actually needs the value again,
+another transmission of D will be initiated from A to B.
+
+The whole caching behavior can be disabled thanks to the \ref STARPU_MPI_CACHE
+environment variable. The variable \ref STARPU_MPI_CACHE_STATS can be set to 1
+to enable the runtime to display messages when data are added or removed
+from the cache holding the received data.
+
 \section MPIMigration MPI Data migration
 
 The application can dynamically change its mind about the data distribution, to
@@ -409,8 +460,7 @@ for(x = 0; x < nblocks ;  x++)
         data_handles[x] = NULL;
     }
     if (data_handles[x]) {
-        starpu_data_set_rank(data_handles[x], mpi_rank);
-        starpu_data_set_tag(data_handles[x], x*nblocks+y);
+        starpu_mpi_data_register(data_handles[x], x*nblocks+y, mpi_rank);
     }
 }
 

doc/doxygen/chapters/09fft_support.doxy → doc/doxygen/chapters/17fft_support.doxy


doc/doxygen/chapters/10mic_scc_support.doxy → doc/doxygen/chapters/18mic_scc_support.doxy


doc/doxygen/chapters/11c_extensions.doxy → doc/doxygen/chapters/19c_extensions.doxy


+ 4 - 0
doc/doxygen/chapters/12socl_opencl_extensions.doxy

@@ -70,4 +70,8 @@ Number of platforms:	2
 $
 \endverbatim
 
+To enable the use of CPU cores via OpenCL, one can set the STARPU_OPENCL_ON_CPUS
+environment variable to 1 and STARPU_NCPUS to 0 (to avoid using CPUs both via
+the OpenCL driver and the normal CPU driver).
+
 */

+ 127 - 0
doc/doxygen/chapters/21simgrid.doxy

@@ -0,0 +1,127 @@
+/*
+ * This file is part of the StarPU Handbook.
+ * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
+ * See the file version.doxy for copying conditions.
+ */
+
+/*! \page SimGridSupport SimGrid Support
+
+StarPU can use Simgrid in order to simulate execution on an arbitrary
+platform.
+
+\section SimGridPreparing Preparing Your Application For Simulation
+
+There are a few technical details which need to be handled for an application to
+be simulated through Simgrid.
+
+If the application uses <c>gettimeofday</c> to make its
+performance measurements, the real time will be used, which will be bogus. To
+get the simulated time, it has to use starpu_timing_now() which returns the
+virtual timestamp in us.
+
+For some technical reason, the application's .c file which contains main() has
+to be recompiled with starpu.h, which in the simgrid case will #define main()
+into starpu_main(), and it is libstarpu which will provide the real main() and
+call the application's main().
+
+To be able to test with crazy data sizes, one may want to only allocate
+application data if STARPU_SIMGRID is not defined.  Passing a NULL pointer to
+starpu_data_register functions is fine, data will never be read/written to by
+StarPU in Simgrid mode anyway.
+
+To be able to run the application with e.g. CUDA simulation on a system which
+does not have CUDA installed, one can fill the cuda_funcs with (void*)1, to
+express that there is a CUDA implementation, even if one does not actually
+provide it. StarPU will never actually run it in Simgrid mode anyway.
+
+\section Calibration Calibration
+
+The idea is to first compile StarPU normally, and run the application,
+so as to automatically benchmark the bus and the codelets.
+
+\verbatim
+$ ./configure && make
+$ STARPU_SCHED=dmda ./examples/matvecmult/matvecmult
+[starpu][_starpu_load_history_based_model] Warning: model matvecmult
+   is not calibrated, forcing calibration for this run. Use the
+   STARPU_CALIBRATE environment variable to control this.
+$ ...
+$ STARPU_SCHED=dmda ./examples/matvecmult/matvecmult
+TEST PASSED
+\endverbatim
+
+Note that we force to use the scheduler <c>dmda</c> to generate
+performance models for the application. The application may need to be
+run several times before the model is calibrated.
+
+\section Simulation Simulation
+
+Then, recompile StarPU, passing \ref enable-simgrid "--enable-simgrid"
+to <c>./configure</c>.
+
+\verbatim
+$ ./configure --enable-simgrid
+\endverbatim
+
+To specify the location of SimGrid, you can either set the environment
+variables SIMGRID_CFLAGS and SIMGRID_LIBS, or use the configure
+options \ref with-simgrid-dir "--with-simgrid-dir",
+\ref with-simgrid-include-dir "--with-simgrid-include-dir" and
+\ref with-simgrid-lib-dir "--with-simgrid-lib-dir", for example
+
+\verbatim
+$ ./configure --with-simgrid-dir=/opt/local/simgrid
+\endverbatim
+
+You can then re-run the application.
+
+\verbatim
+$ make
+$ STARPU_SCHED=dmda ./examples/matvecmult/matvecmult
+TEST FAILED !!!
+\endverbatim
+
+It is normal that the test fails: since the computation are not actually done
+(that is the whole point of simgrid), the result is wrong, of course.
+
+If the performance model is not calibrated enough, the following error
+message will be displayed
+
+\verbatim
+$ STARPU_SCHED=dmda ./examples/matvecmult/matvecmult
+[starpu][_starpu_load_history_based_model] Warning: model matvecmult
+    is not calibrated, forcing calibration for this run. Use the
+    STARPU_CALIBRATE environment variable to control this.
+[starpu][_starpu_simgrid_execute_job][assert failure] Codelet
+    matvecmult does not have a perfmodel, or is not calibrated enough
+\endverbatim
+
+The number of devices can be chosen as usual with \ref STARPU_NCPU, \ref
+STARPU_NCUDA, and \ref STARPU_NOPENCL, and the amount of GPU memory
+with \ref STARPU_LIMIT_CUDA_MEM, \ref STARPU_LIMIT_CUDA_devid_MEM, \ref
+STARPU_LIMIT_OPENCL_MEM, and \ref STARPU_LIMIT_OPENCL_devid_MEM.
+
+\section SimulationOnAnotherMachine Simulation On Another Machine
+
+The simgrid support even permits to perform simulations on another machine, your
+desktop, typically. To achieve this, one still needs to perform the Calibration
+step on the actual machine to be simulated, then copy them to your desktop
+machine (the <c>$STARPU_HOME/.starpu</c> directory). One can then perform the
+Simulation step on the desktop machine, by setting the environment
+variable \ref STARPU_HOSTNAME to the name of the actual machine, to
+make StarPU use the performance models of the simulated machine even
+on the desktop machine.
+
+If the desktop machine does not have CUDA or OpenCL, StarPU is still able to
+use simgrid to simulate execution with CUDA/OpenCL devices, but the application
+source code will probably disable the CUDA and OpenCL codelets in that
+case. Since during simgrid execution, the functions of the codelet are actually
+not called, one can use dummy functions such as the following to still permit
+CUDA or OpenCL execution:
+
+\snippet simgrid.c To be included. You should update doxygen if you see this text.
+
+
+*/

+ 689 - 0
doc/doxygen/chapters/40environment_variables.doxy

@@ -0,0 +1,689 @@
+/*
+ * This file is part of the StarPU Handbook.
+ * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
+ * See the file version.doxy for copying conditions.
+ */
+
+/*! \page ExecutionConfigurationThroughEnvironmentVariables Execution Configuration Through Environment Variables
+
+The behavior of the StarPU library and tools may be tuned thanks to
+the following environment variables.
+
+\section ConfiguringWorkers Configuring Workers
+
+<dl>
+
+<dt>STARPU_NCPU</dt>
+<dd>
+\anchor STARPU_NCPU
+\addindex __env__STARPU_NCPU
+Specify the number of CPU workers (thus not including workers
+dedicated to control accelerators). Note that by default, StarPU will
+not allocate more CPU workers than there are physical CPUs, and that
+some CPUs are used to control the accelerators.
+</dd>
+
+<dt>STARPU_NCPUS</dt>
+<dd>
+\anchor STARPU_NCPUS
+\addindex __env__STARPU_NCPUS
+This variable is deprecated. You should use \ref STARPU_NCPU.
+</dd>
+
+<dt>STARPU_NCUDA</dt>
+<dd>
+\anchor STARPU_NCUDA
+\addindex __env__STARPU_NCUDA
+Specify the number of CUDA devices that StarPU can use. If
+\ref STARPU_NCUDA is lower than the number of physical devices, it is
+possible to select which CUDA devices should be used by the means of the
+environment variable \ref STARPU_WORKERS_CUDAID. By default, StarPU will
+create as many CUDA workers as there are CUDA devices.
+</dd>
+
+<dt>STARPU_NOPENCL</dt>
+<dd>
+\anchor STARPU_NOPENCL
+\addindex __env__STARPU_NOPENCL
+OpenCL equivalent of the environment variable \ref STARPU_NCUDA.
+</dd>
+
+<dt>STARPU_NMICDEVS</dt>
+<dd>
+\anchor STARPU_NMICDEVS
+\addindex __env__STARPU_NMICDEVS
+MIC equivalent of the environment variable \ref STARPU_NCUDA.
+</dd>
+
+<dt>STARPU_OPENCL_ON_CPUS</dt>
+<dd>
+\anchor STARPU_OPENCL_ON_CPUS
+\addindex __env__STARPU_OPENCL_ON_CPUS
+By default, the OpenCL driver only enables GPU and accelerator
+devices. By setting the environment variable \ref
+STARPU_OPENCL_ON_CPUS to 1, the OpenCL driver will also enable CPU
+devices.
+</dd>
+
+<dt>STARPU_OPENCL_ONLY_ON_CPUS</dt>
+<dd>
+\anchor STARPU_OPENCL_ONLY_ON_CPUS
+\addindex __env__STARPU_OPENCL_ONLY_ON_CPUS
+By default, the OpenCL driver enables GPU and accelerator
+devices. By setting the environment variable \ref
+STARPU_OPENCL_ONLY_ON_CPUS to 1, the OpenCL driver will ONLY enable
+CPU devices.
+</dd>
+
+<dt>STARPU_NMIC</dt>
+<dd>
+\anchor STARPU_NMIC
+\addindex __env__STARPU_NMIC
+MIC equivalent of the environment variable \ref STARPU_NCUDA.
+</dd>
+
+<dt>STARPU_NSCC</dt>
+<dd>
+\anchor STARPU_NSCC
+\addindex __env__STARPU_NSCC
+SCC equivalent of the environment variable \ref STARPU_NCUDA.
+</dd>
+
+<dt>STARPU_WORKERS_NOBIND</dt>
+<dd>
+\anchor STARPU_WORKERS_NOBIND
+\addindex __env__STARPU_WORKERS_NOBIND
+Setting it to non-zero will prevent StarPU from binding its threads to
+CPUs. This is for instance useful when running the testsuite in parallel.
+</dd>
+
+<dt>STARPU_WORKERS_CPUID</dt>
+<dd>
+\anchor STARPU_WORKERS_CPUID
+\addindex __env__STARPU_WORKERS_CPUID
+Passing an array of integers (starting from 0) in \ref STARPU_WORKERS_CPUID
+specifies on which logical CPU the different workers should be
+bound. For instance, if <c>STARPU_WORKERS_CPUID = "0 1 4 5"</c>, the first
+worker will be bound to logical CPU #0, the second CPU worker will be bound to
+logical CPU #1 and so on.  Note that the logical ordering of the CPUs is either
+determined by the OS, or provided by the library <c>hwloc</c> in case it is
+available.
+
+Note that the first workers correspond to the CUDA workers, then come the
+OpenCL workers, and finally the CPU workers. For example if
+we have <c>STARPU_NCUDA=1</c>, <c>STARPU_NOPENCL=1</c>, <c>STARPU_NCPU=2</c>
+and <c>STARPU_WORKERS_CPUID = "0 2 1 3"</c>, the CUDA device will be controlled
+by logical CPU #0, the OpenCL device will be controlled by logical CPU #2, and
+the logical CPUs #1 and #3 will be used by the CPU workers.
+
+If the number of workers is larger than the array given in \ref
+STARPU_WORKERS_CPUID, the workers are bound to the logical CPUs in a
+round-robin fashion: if <c>STARPU_WORKERS_CPUID = "0 1"</c>, the first
+and the third (resp. second and fourth) workers will be put on CPU #0
+(resp. CPU #1).
+
+This variable is ignored if the field
+starpu_conf::use_explicit_workers_bindid passed to starpu_init() is
+set.
+
+</dd>
+
+<dt>STARPU_WORKERS_CUDAID</dt>
+<dd>
+\anchor STARPU_WORKERS_CUDAID
+\addindex __env__STARPU_WORKERS_CUDAID
+Similarly to the \ref STARPU_WORKERS_CPUID environment variable, it is
+possible to select which CUDA devices should be used by StarPU. On a machine
+equipped with 4 GPUs, setting <c>STARPU_WORKERS_CUDAID = "1 3"</c> and
+<c>STARPU_NCUDA=2</c> specifies that 2 CUDA workers should be created, and that
+they should use CUDA devices #1 and #3 (the logical ordering of the devices is
+the one reported by CUDA).
+
+This variable is ignored if the field
+starpu_conf::use_explicit_workers_cuda_gpuid passed to starpu_init()
+is set.
+</dd>
+
+<dt>STARPU_WORKERS_OPENCLID</dt>
+<dd>
+\anchor STARPU_WORKERS_OPENCLID
+\addindex __env__STARPU_WORKERS_OPENCLID
+OpenCL equivalent of the \ref STARPU_WORKERS_CUDAID environment variable.
+
+This variable is ignored if the field
+starpu_conf::use_explicit_workers_opencl_gpuid passed to starpu_init()
+is set.
+</dd>
+
+<dt>STARPU_WORKERS_MICID</dt>
+<dd>
+\anchor STARPU_WORKERS_MICID
+\addindex __env__STARPU_WORKERS_MICID
+MIC equivalent of the \ref STARPU_WORKERS_CUDAID environment variable.
+
+This variable is ignored if the field
+starpu_conf::use_explicit_workers_mic_deviceid passed to starpu_init()
+is set.
+</dd>
+
+<dt>STARPU_WORKERS_SCCID</dt>
+<dd>
+\anchor STARPU_WORKERS_SCCID
+\addindex __env__STARPU_WORKERS_SCCID
+SCC equivalent of the \ref STARPU_WORKERS_CUDAID environment variable.
+
+This variable is ignored if the field
+starpu_conf::use_explicit_workers_scc_deviceid passed to starpu_init()
+is set.
+</dd>
+
+<dt>STARPU_SINGLE_COMBINED_WORKER</dt>
+<dd>
+\anchor STARPU_SINGLE_COMBINED_WORKER
+\addindex __env__STARPU_SINGLE_COMBINED_WORKER
+If set, StarPU will create several workers which won't be able to work
+concurrently. It will by default create combined workers which size goes from 1
+to the total number of CPU workers in the system. \ref STARPU_MIN_WORKERSIZE
+and \ref STARPU_MAX_WORKERSIZE can be used to change this default.
+</dd>
+
+<dt>STARPU_MIN_WORKERSIZE</dt>
+<dd>
+\anchor STARPU_MIN_WORKERSIZE
+\addindex __env__STARPU_MIN_WORKERSIZE
+\ref STARPU_MIN_WORKERSIZE
+permits to specify the minimum size of the combined workers (instead of the default 2)
+</dd>
+
+<dt>STARPU_MAX_WORKERSIZE</dt>
+<dd>
+\anchor STARPU_MAX_WORKERSIZE
+\addindex __env__STARPU_MAX_WORKERSIZE
+\ref STARPU_MAX_WORKERSIZE
+permits to specify the maximum size of the combined workers (instead of the
+number of CPU workers in the system)
+</dd>
+
+<dt>STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER</dt>
+<dd>
+\anchor STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER
+\addindex __env__STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER
+Let the user decide how many elements are allowed between combined workers
+created from hwloc information. For instance, in the case of sockets with 6
+cores without shared L2 caches, if \ref STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER is
+set to 6, no combined worker will be synthesized beyond one for the socket
+and one per core. If it is set to 3, 3 intermediate combined workers will be
+synthesized, to divide the socket cores into 3 chunks of 2 cores. If it set to
+2, 2 intermediate combined workers will be synthesized, to divide the socket
+cores into 2 chunks of 3 cores, and then 3 additional combined workers will be
+synthesized, to divide the former synthesized workers into a bunch of 2 cores,
+and the remaining core (for which no combined worker is synthesized since there
+is already a normal worker for it).
+
+The default, 2, thus makes StarPU tend to build binary trees of combined
+workers.
+</dd>
+
+<dt>STARPU_DISABLE_ASYNCHRONOUS_COPY</dt>
+<dd>
+\anchor STARPU_DISABLE_ASYNCHRONOUS_COPY
+\addindex __env__STARPU_DISABLE_ASYNCHRONOUS_COPY
+Disable asynchronous copies between CPU and GPU devices.
+The AMD implementation of OpenCL is known to
+fail when copying data asynchronously. When using this implementation,
+it is therefore necessary to disable asynchronous data transfers.
+</dd>
+
+<dt>STARPU_DISABLE_ASYNCHRONOUS_CUDA_COPY</dt>
+<dd>
+\anchor STARPU_DISABLE_ASYNCHRONOUS_CUDA_COPY
+\addindex __env__STARPU_DISABLE_ASYNCHRONOUS_CUDA_COPY
+Disable asynchronous copies between CPU and CUDA devices.
+</dd>
+
+<dt>STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY</dt>
+<dd>
+\anchor STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY
+\addindex __env__STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY
+Disable asynchronous copies between CPU and OpenCL devices.
+The AMD implementation of OpenCL is known to
+fail when copying data asynchronously. When using this implementation,
+it is therefore necessary to disable asynchronous data transfers.
+</dd>
+
+<dt>STARPU_DISABLE_ASYNCHRONOUS_MIC_COPY</dt>
+<dd>
+\anchor STARPU_DISABLE_ASYNCHRONOUS_MIC_COPY
+\addindex __env__STARPU_DISABLE_ASYNCHRONOUS_MIC_COPY
+Disable asynchronous copies between CPU and MIC devices.
+</dd>
+
+<dt>STARPU_ENABLE_CUDA_GPU_GPU_DIRECT</dt>
+<dd>
+\anchor STARPU_ENABLE_CUDA_GPU_GPU_DIRECT
+\addindex __env__STARPU_ENABLE_CUDA_GPU_GPU_DIRECT
+Enable (1) or Disable (0) direct CUDA transfers from GPU to GPU, without copying
+through RAM. The default is Enabled.
+This permits to test the performance effect of GPU-Direct.
+</dd>
+
+<dt>STARPU_DISABLE_PINNING</dt>
+<dd>
+\anchor STARPU_DISABLE_PINNING
+\addindex __env__STARPU_DISABLE_PINNING
+Disable (1) or Enable (0) pinning host memory allocated through starpu_malloc
+and friends.  The default is Enabled.
+This permits to test the performance effect of memory pinning.
+</dd>
+
+</dl>
+
+\section ConfiguringTheSchedulingEngine Configuring The Scheduling Engine
+
+<dl>
+
+<dt>STARPU_SCHED</dt>
+<dd>
+\anchor STARPU_SCHED
+\addindex __env__STARPU_SCHED
+Choose between the different scheduling policies proposed by StarPU: work
+stealing, random, greedy, with performance models, etc.
+
+Use <c>STARPU_SCHED=help</c> to get the list of available schedulers.
+</dd>
+
+<dt>STARPU_CALIBRATE</dt>
+<dd>
+\anchor STARPU_CALIBRATE
+\addindex __env__STARPU_CALIBRATE
+If this variable is set to 1, the performance models are calibrated during
+the execution. If it is set to 2, the previous values are dropped to restart
+calibration from scratch. Setting this variable to 0 disables calibration,
+which is the default behaviour.
+
+Note: this currently only applies to <c>dm</c> and <c>dmda</c> scheduling policies.
+</dd>
+
+<dt>STARPU_BUS_CALIBRATE</dt>
+<dd>
+\anchor STARPU_BUS_CALIBRATE
+\addindex __env__STARPU_BUS_CALIBRATE
+If this variable is set to 1, the bus is recalibrated during initialization.
+</dd>
+
+<dt>STARPU_PREFETCH</dt>
+<dd>
+\anchor STARPU_PREFETCH
+\addindex __env__STARPU_PREFETCH
+This variable indicates whether data prefetching should be enabled (0 means
+that it is disabled). If prefetching is enabled, when a task is scheduled to be
+executed e.g. on a GPU, StarPU will request an asynchronous transfer in
+advance, so that data is already present on the GPU when the task starts. As a
+result, computation and data transfers are overlapped.
+Note that prefetching is enabled by default in StarPU.
+</dd>
+
+<dt>STARPU_SCHED_ALPHA</dt>
+<dd>
+\anchor STARPU_SCHED_ALPHA
+\addindex __env__STARPU_SCHED_ALPHA
+To estimate the cost of a task StarPU takes into account the estimated
+computation time (obtained thanks to performance models). The alpha factor is
+the coefficient to be applied to it before adding it to the communication part.
+</dd>
+
+<dt>STARPU_SCHED_BETA</dt>
+<dd>
+\anchor STARPU_SCHED_BETA
+\addindex __env__STARPU_SCHED_BETA
+To estimate the cost of a task StarPU takes into account the estimated
+data transfer time (obtained thanks to performance models). The beta factor is
+the coefficient to be applied to it before adding it to the computation part.
+</dd>
+
+<dt>STARPU_SCHED_GAMMA</dt>
+<dd>
+\anchor STARPU_SCHED_GAMMA
+\addindex __env__STARPU_SCHED_GAMMA
+Define the execution time penalty of a joule (\ref Power-basedScheduling).
+</dd>
+
+<dt>STARPU_IDLE_POWER</dt>
+<dd>
+\anchor STARPU_IDLE_POWER
+\addindex __env__STARPU_IDLE_POWER
+Define the idle power of the machine (\ref Power-basedScheduling).
+</dd>
+
+<dt>STARPU_PROFILING</dt>
+<dd>
+\anchor STARPU_PROFILING
+\addindex __env__STARPU_PROFILING
+Enable on-line performance monitoring (\ref EnablingOn-linePerformanceMonitoring).
+</dd>
+
+</dl>
+
+\section Extensions Extensions
+
+<dl>
+
+<dt>SOCL_OCL_LIB_OPENCL</dt>
+<dd>
+\anchor SOCL_OCL_LIB_OPENCL
+\addindex __env__SOCL_OCL_LIB_OPENCL
+The SOCL test suite is only run when the environment variable \ref
+SOCL_OCL_LIB_OPENCL is defined. It should contain the location
+of the file <c>libOpenCL.so</c> of the OCL ICD implementation.
+</dd>
+
+<dt>OCL_ICD_VENDORS</dt>
+<dd>
+\anchor OCL_ICD_VENDORS
+\addindex __env__OCL_ICD_VENDORS
+When using SOCL with OpenCL ICD
+(https://forge.imag.fr/projects/ocl-icd/), this variable may be used
+to point to the directory where ICD files are installed. The default
+directory is <c>/etc/OpenCL/vendors</c>. StarPU installs ICD
+files in the directory <c>$prefix/share/starpu/opencl/vendors</c>.
+</dd>
+
+<dt>STARPU_COMM_STATS</dt>
+<dd>
+\anchor STARPU_COMM_STATS
+\addindex __env__STARPU_COMM_STATS
+Communication statistics for starpumpi (\ref MPISupport)
+will be enabled when the environment variable \ref STARPU_COMM_STATS
+is defined to a value other than 0.
+</dd>
+
+<dt>STARPU_MPI_CACHE</dt>
+<dd>
+\anchor STARPU_MPI_CACHE
+\addindex __env__STARPU_MPI_CACHE
+Communication cache for starpumpi (\ref MPISupport) will be
+disabled when the environment variable \ref STARPU_MPI_CACHE is set
+to 0. It is enabled by default or for any other values of the variable
+\ref STARPU_MPI_CACHE.
+</dd>
+
+<dt>STARPU_MPI_CACHE_STATS</dt>
+<dd>
+\anchor STARPU_MPI_CACHE_STATS
+\addindex __env__STARPU_MPI_CACHE_STATS
+When set to 1, statistics are enabled for the communication cache (\ref MPISupport). For now,
+it prints messages on the standard output when data are added or removed from the received
+communication cache.
+</dd>
+
+</dl>
+
+\section MiscellaneousAndDebug Miscellaneous And Debug
+
+<dl>
+
+<dt>STARPU_HOME</dt>
+<dd>
+\anchor STARPU_HOME
+\addindex __env__STARPU_HOME
+This specifies the main directory in which StarPU stores its
+configuration files. The default is <c>$HOME</c> on Unix environments,
+and <c>$USERPROFILE</c> on Windows environments.
+</dd>
+
+<dt>STARPU_HOSTNAME</dt>
+<dd>
+\anchor STARPU_HOSTNAME
+\addindex __env__STARPU_HOSTNAME
+When set, force the hostname to be used when dealing with performance model
+files. Models are indexed by machine name. When running for example on
+a homogenenous cluster, it is possible to share the models between
+machines by setting <c>export STARPU_HOSTNAME=some_global_name</c>.
+</dd>
+
+<dt>STARPU_OPENCL_PROGRAM_DIR</dt>
+<dd>
+\anchor STARPU_OPENCL_PROGRAM_DIR
+\addindex __env__STARPU_OPENCL_PROGRAM_DIR
+This specifies the directory where the OpenCL codelet source files are
+located. The function starpu_opencl_load_program_source() looks
+for the codelet in the current directory, in the directory specified
+by the environment variable \ref STARPU_OPENCL_PROGRAM_DIR, in the
+directory <c>share/starpu/opencl</c> of the installation directory of
+StarPU, and finally in the source directory of StarPU.
+</dd>
+
+<dt>STARPU_SILENT</dt>
+<dd>
+\anchor STARPU_SILENT
+\addindex __env__STARPU_SILENT
+This variable allows to disable verbose mode at runtime when StarPU
+has been configured with the option \ref enable-verbose "--enable-verbose". It also
+disables the display of StarPU information and warning messages.
+</dd>
+
+<dt>STARPU_LOGFILENAME</dt>
+<dd>
+\anchor STARPU_LOGFILENAME
+\addindex __env__STARPU_LOGFILENAME
+This variable specifies in which file the debugging output should be saved to.
+</dd>
+
+<dt>STARPU_FXT_PREFIX</dt>
+<dd>
+\anchor STARPU_FXT_PREFIX
+\addindex __env__STARPU_FXT_PREFIX
+This variable specifies in which directory to save the trace generated if FxT is enabled. It needs to have a trailing '/' character.
+</dd>
+
+<dt>STARPU_LIMIT_CUDA_devid_MEM</dt>
+<dd>
+\anchor STARPU_LIMIT_CUDA_devid_MEM
+\addindex __env__STARPU_LIMIT_CUDA_devid_MEM
+This variable specifies the maximum number of megabytes that should be
+available to the application on the CUDA device with the identifier
+<c>devid</c>. This variable is intended to be used for experimental
+purposes as it emulates devices that have a limited amount of memory.
+When defined, the variable overwrites the value of the variable
+\ref STARPU_LIMIT_CUDA_MEM.
+</dd>
+
+<dt>STARPU_LIMIT_CUDA_MEM</dt>
+<dd>
+\anchor STARPU_LIMIT_CUDA_MEM
+\addindex __env__STARPU_LIMIT_CUDA_MEM
+This variable specifies the maximum number of megabytes that should be
+available to the application on each CUDA devices. This variable is
+intended to be used for experimental purposes as it emulates devices
+that have a limited amount of memory.
+</dd>
+
+<dt>STARPU_LIMIT_OPENCL_devid_MEM</dt>
+<dd>
+\anchor STARPU_LIMIT_OPENCL_devid_MEM
+\addindex __env__STARPU_LIMIT_OPENCL_devid_MEM
+This variable specifies the maximum number of megabytes that should be
+available to the application on the OpenCL device with the identifier
+<c>devid</c>. This variable is intended to be used for experimental
+purposes as it emulates devices that have a limited amount of memory.
+When defined, the variable overwrites the value of the variable
+\ref STARPU_LIMIT_OPENCL_MEM.
+</dd>
+
+<dt>STARPU_LIMIT_OPENCL_MEM</dt>
+<dd>
+\anchor STARPU_LIMIT_OPENCL_MEM
+\addindex __env__STARPU_LIMIT_OPENCL_MEM
+This variable specifies the maximum number of megabytes that should be
+available to the application on each OpenCL devices. This variable is
+intended to be used for experimental purposes as it emulates devices
+that have a limited amount of memory.
+</dd>
+
+<dt>STARPU_LIMIT_CPU_MEM</dt>
+<dd>
+\anchor STARPU_LIMIT_CPU_MEM
+\addindex __env__STARPU_LIMIT_CPU_MEM
+This variable specifies the maximum number of megabytes that should be
+available to the application on each CPU device. This variable is
+intended to be used for experimental purposes as it emulates devices
+that have a limited amount of memory.
+</dd>
+
+<dt>STARPU_GENERATE_TRACE</dt>
+<dd>
+\anchor STARPU_GENERATE_TRACE
+\addindex __env__STARPU_GENERATE_TRACE
+When set to <c>1</c>, this variable indicates that StarPU should automatically
+generate a Paje trace when starpu_shutdown() is called.
+</dd>
+
+<dt>STARPU_MEMORY_STATS</dt>
+<dd>
+\anchor STARPU_MEMORY_STATS
+\addindex __env__STARPU_MEMORY_STATS
+When set to 0, disable the display of memory statistics on data which
+have not been unregistered at the end of the execution (\ref MemoryFeedback).
+</dd>
+
+<dt>STARPU_BUS_STATS</dt>
+<dd>
+\anchor STARPU_BUS_STATS
+\addindex __env__STARPU_BUS_STATS
+When defined, statistics about data transfers will be displayed when calling
+starpu_shutdown() (\ref Profiling).
+</dd>
+
+<dt>STARPU_WORKER_STATS</dt>
+<dd>
+\anchor STARPU_WORKER_STATS
+\addindex __env__STARPU_WORKER_STATS
+When defined, statistics about the workers will be displayed when calling
+starpu_shutdown() (\ref Profiling). When combined with the
+environment variable \ref STARPU_PROFILING, it displays the power
+consumption (\ref Power-basedScheduling).
+</dd>
+
+<dt>STARPU_STATS</dt>
+<dd>
+\anchor STARPU_STATS
+\addindex __env__STARPU_STATS
+When set to 0, data statistics will not be displayed at the
+end of the execution of an application (\ref DataStatistics).
+</dd>
+
+<dt>STARPU_WATCHDOG_TIMEOUT</dt>
+<dd>
+\anchor STARPU_WATCHDOG_TIMEOUT
+\addindex __env__STARPU_WATCHDOG_TIMEOUT
+When set to a value other than 0, allows to make StarPU print an error
+message whenever StarPU does not terminate any task for 10ms. Should
+be used in combination with \ref STARPU_WATCHDOG_CRASH (see \ref
+DetectionStuckConditions).
+</dd>
+
+<dt>STARPU_WATCHDOG_CRASH</dt>
+<dd>
+\anchor STARPU_WATCHDOG_CRASH
+\addindex __env__STARPU_WATCHDOG_CRASH
+When set to a value other than 0, it triggers a crash when the watch
+dog is reached, thus allowing to catch the situation in gdb, etc
+(see \ref DetectionStuckConditions)
+</dd>
+
+<dt>STARPU_DISABLE_KERNELS</dt>
+<dd>
+\anchor STARPU_DISABLE_KERNELS
+\addindex __env__STARPU_DISABLE_KERNELS
+When set to a value other than 1, it disables actually calling the kernel
+functions, thus allowing to quickly check that the task scheme is working
+properly, without performing the actual application-provided computation.
+</dd>
+
+<dt>STARPU_HISTORY_MAX_ERROR</dt>
+<dd>
+\anchor STARPU_HISTORY_MAX_ERROR
+\addindex __env__STARPU_HISTORY_MAX_ERROR
+History-based performance models will drop measurements which are really far
+from the measured average. This specifies the allowed variation. The default is
+50 (%), i.e. the measurement is allowed to be x1.5 faster or /1.5 slower than the
+average.
+</dd>
+
+</dl>
+
+\section ConfiguringTheHypervisor Configuring The Hypervisor
+
+<dl>
+
+<dt>SC_HYPERVISOR_POLICY</dt>
+<dd>
+\anchor SC_HYPERVISOR_POLICY
+\addindex __env__SC_HYPERVISOR_POLICY
+Choose between the different resizing policies proposed by StarPU for the hypervisor: 
+idle, app_driven, feft_lp, teft_lp, ispeed_lp, throughput_lp etc.
+
+Use <c>SC_HYPERVISOR_POLICY=help</c> to get the list of available policies for the hypervisor
+</dd>
+
+<dt>SC_HYPERVISOR_TRIGGER_RESIZE</dt>
+<dd>
+\anchor SC_HYPERVISOR_TRIGGER_RESIZE
+\addindex __env__SC_HYPERVISOR_TRIGGER_RESIZE
+Choose how the hypervisor should be triggered: <c>speed</c> if the resizing algorithm should
+be called whenever the speed of the context does not correspond to an optimal precomputed value,
+<c>idle</c> if the resizing algorithm should be called whenever the workers are idle for a period
+longer than the value indicated when configuring the hypervisor.
+</dd>
+
+<dt>SC_HYPERVISOR_START_RESIZE</dt>
+<dd>
+\anchor SC_HYPERVISOR_START_RESIZE
+\addindex __env__SC_HYPERVISOR_START_RESIZE
+Indicate the moment when the resizing should be available. The value corresponds to the percentage
+of the total time of execution of the application. The default value is the resizing frame.
+</dd>
+
+<dt>SC_HYPERVISOR_MAX_SPEED_GAP</dt>
+<dd>
+\anchor SC_HYPERVISOR_MAX_SPEED_GAP
+\addindex __env__SC_HYPERVISOR_MAX_SPEED_GAP
+Indicate the ratio of speed difference between contexts that should trigger the hypervisor.
+This situation may occur only when a theoretical speed could not be computed and the hypervisor
+has no value to compare the speed to. Otherwise the resizing of a context is not influenced
+by the speed of the other contexts, but only by the value that a context should have.
+</dd>
+
+<dt>SC_HYPERVISOR_STOP_PRINT</dt>
+<dd>
+\anchor SC_HYPERVISOR_STOP_PRINT
+\addindex __env__SC_HYPERVISOR_STOP_PRINT
+By default the values of the speed of the workers are printed during the execution
+of the application. If the value 1 is given to this environment variable this printing
+is not done.
+</dd>
+
+<dt>SC_HYPERVISOR_LAZY_RESIZE</dt>
+<dd>
+\anchor SC_HYPERVISOR_LAZY_RESIZE
+\addindex __env__SC_HYPERVISOR_LAZY_RESIZE
+By default the hypervisor resizes the contexts in a lazy way, that is workers are first added to a new context
+before being removed from the previous one. Once these workers are clearly taken into account
+in the new context (a task was popped there), they are removed from the previous one. However, if the application
+would like the change in the distribution of workers to take effect right away, this variable should be set to 0.
+</dd>
+
+<dt>SC_HYPERVISOR_SAMPLE_CRITERIA</dt>
+<dd>
+\anchor SC_HYPERVISOR_SAMPLE_CRITERIA
+\addindex __env__SC_HYPERVISOR_SAMPLE_CRITERIA
+By default the hypervisor uses a sample of flops when computing the speed of the contexts and of the workers.
+If this variable is set to <c>time</c> the hypervisor uses a sample of time (10% of an approximation of the total
+execution time of the application)
+</dd>
+
+</dl>
+
+*/

+ 25 - 1
doc/doxygen/chapters/16configure_options.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
  * See the file version.doxy for copying conditions.
  */
@@ -526,6 +526,30 @@ export SIMGRID_LIBS="-L/usr/local/simgrid/lib -lsimgrid"
 
 </dd>
 
+<dt>--with-simgrid-dir</dt>
+<dd>
+\anchor with-simgrid-dir
+\addindex __configure__--with-simgrid-dir
+Similar to the option \ref enable-simgrid "--enable-simgrid" but also
+allows to specify the location to the SimGrid library.
+</dd>
+
+<dt>--with-simgrid-include-dir</dt>
+<dd>
+\anchor with-simgrid-include-dir
+\addindex __configure__--with-simgrid-include-dir
+Similar to the option \ref enable-simgrid "--enable-simgrid" but also
+allows to specify the location to the SimGrid include directory.
+</dd>
+
+<dt>--with-simgrid-lib-dir</dt>
+<dd>
+\anchor with-simgrid-lib-dir
+\addindex __configure__--with-simgrid-lib-dir
+Similar to the option \ref enable-simgrid "--enable-simgrid" but also
+allows to specify the location to the SimGrid lib directory.
+</dd>
+
 <dt>--enable-calibration-heuristic</dt>
 <dd>
 \anchor enable-calibration-heuristic

+ 31 - 24
doc/doxygen/chapters/17files.doxy

@@ -1,52 +1,59 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
  * See the file version.doxy for copying conditions.
 */
 
 /*! \page Files Files
 
-\file starpu_deprecated_api.h
+\file starpu_config.h
+
 \file starpu.h
+\file starpu_bitmap.h
+\file starpu_bound.h
+\file starpu_cublas.h
+\file starpu_cuda.h
 \file starpu_data_filters.h
+\file starpu_data.h
 \file starpu_data_interfaces.h
+\file starpu_deprecated_api.h
 \file starpu_disk.h
-\file starpu_worker.h
-\file starpu_task.h
-\file starpu_task_bundle.h
-\file starpu_task_list.h
-\file starpu_task_util.h
-\file starpu_data.h
-\file starpu_perfmodel.h
-\file starpu_util.h
+\file starpu_driver.h
+\file starpu_expert.h
 \file starpu_fxt.h
-\file starpu_cuda.h
-\file starpu_opencl.h
-\file starpu_sink.h
+\file starpu_hash.h
 \file starpu_mic.h
-\file starpu_scc.h
-\file starpu_expert.h
+\file starpu_opencl.h
+\file starpu_perfmodel.h
 \file starpu_profiling.h
-\file starpu_bound.h
-\file starpu_scheduler.h
+\file starpu_rand.h
+\file starpu_scc.h
 \file starpu_sched_ctx.h
 \file starpu_sched_ctx_hypervisor.h
-\file starpu_top.h
-\file starpu_hash.h
-\file starpu_rand.h
-\file starpu_cublas.h
-\file starpu_driver.h
+\file starpu_scheduler.h
+\file starpu_sink.h
 \file starpu_stdlib.h
+\file starpu_task_bundle.h
+\file starpu_task.h
+\file starpu_task_list.h
+\file starpu_task_util.h
 \file starpu_thread.h
 \file starpu_thread_util.h
+\file starpu_top.h
+\file starpu_tree.h
+\file starpu_util.h
+\file starpu_worker.h
+
 \file starpu_mpi.h
-\file sc_hypervisor.h
+
 \file sc_hypervisor_config.h
+\file sc_hypervisor.h
 \file sc_hypervisor_lp.h
 \file sc_hypervisor_monitoring.h
 \file sc_hypervisor_policy.h
-\file starpu_config.h
+
+\file starpufft.h
 
 */

doc/doxygen/chapters/18scaling-vector-example.doxy → doc/doxygen/chapters/50scaling-vector-example.doxy


doc/doxygen/chapters/19fdl-1.3.doxy → doc/doxygen/chapters/51fdl-1.3.doxy


+ 67 - 0
doc/doxygen/chapters/api/bitmap.doxy

@@ -0,0 +1,67 @@
+/*
+ * This file is part of the StarPU Handbook.
+ * Copyright (C) 2014  Centre National de la Recherche Scientifique
+ * See the file version.doxy for copying conditions.
+ */
+
+/*! \defgroup API_Bitmap  Bitmap
+
+\brief This section describes the bitmap facilities provided by StarPU.
+
+\fn struct starpu_bitmap *starpu_bitmap_create(void)
+\ingroup API_Bitmap
+todo
+
+\fn void starpu_bitmap_destroy(struct starpu_bitmap *b)
+\ingroup API_Bitmap
+todo
+
+\fn void starpu_bitmap_set(struct starpu_bitmap *b, int e)
+\ingroup API_Bitmap
+todo
+
+\fn void starpu_bitmap_unset(struct starpu_bitmap *b, int e)
+\ingroup API_Bitmap
+todo
+
+\fn void starpu_bitmap_unset_all(struct starpu_bitmap *b)
+\ingroup API_Bitmap
+todo
+
+\fn int starpu_bitmap_get(struct starpu_bitmap *b, int e)
+\ingroup API_Bitmap
+todo
+
+\fn void starpu_bitmap_unset_and(struct starpu_bitmap *a, struct starpu_bitmap *b, struct starpu_bitmap *c)
+\ingroup API_Bitmap
+Basically compute starpu_bitmap_unset_all(a) ; a = b & c;
+
+\fn void starpu_bitmap_or(struct starpu_bitmap *a, struct starpu_bitmap *b)
+\ingroup API_Bitmap
+Basically compute a |= b
+
+\fn int starpu_bitmap_and_get(struct starpu_bitmap *b1, struct starpu_bitmap *b2, int e)
+\ingroup API_Bitmap
+return 1 iff e set in b1 AND e set in b2
+
+\fn int starpu_bitmap_cardinal(struct starpu_bitmap *b)
+\ingroup API_Bitmap
+todo
+
+\fn int starpu_bitmap_first(struct starpu_bitmap *b)
+\ingroup API_Bitmap
+return the index of first bit, -1 if none
+
+\fn int starpu_bitmap_last(struct starpu_bitmap *b)
+\ingroup API_Bitmap
+todo
+
+\fn int starpu_bitmap_next(struct starpu_bitmap *b, int e)
+\ingroup API_Bitmap
+return the index of bit right after e, -1 if none
+
+\fn int starpu_bitmap_has_next(struct starpu_bitmap *b, int e)
+\ingroup API_Bitmap
+todo
+
+*/

+ 32 - 1
doc/doxygen/chapters/api/codelet_and_tasks.doxy

@@ -223,6 +223,10 @@ If the field starpu_codelet::where is set, then the field
 starpu_codelet::cuda_funcs is ignored if ::STARPU_CUDA does not appear
 in the field starpu_codelet::where, it must be non-null otherwise.
 
+\var starpu_codelet::cuda_flags
+Optional array of flags for CUDA execution. They specify some semantic details
+about CUDA kernel execution, such as asynchronous execution.
+
 \var starpu_codelet::opencl_funcs
 Optional array of function pointers to the OpenCL implementations of
 the codelet. It must be terminated by a NULL value. The functions
@@ -235,6 +239,10 @@ starpu_codelet::opencl_funcs is ignored if ::STARPU_OPENCL does not
 appear in the field starpu_codelet::where, it must be non-null
 otherwise.
 
+\var starpu_codelet::opencl_flags
+Optional array of flags for OpenCL execution. They specify some semantic details
+about OpenCL kernel execution, such as asynchronous execution.
+
 \var starpu_codelet::mic_funcs
 Optional array of function pointers to a function which returns the
 MIC implementation of the codelet. It must be terminated by a NULL
 insufficient, this value can be set with the configure option
 
 \var starpu_codelet::dyn_modes
 Is an array of ::starpu_data_access_mode. It describes the required
-access modes to the data neeeded by the codelet (e.g. ::STARPU_RW).
+access modes to the data needed by the codelet (e.g. ::STARPU_RW).
 The number of entries in this array must be specified in the field
 starpu_codelet::nbuffers. This field should be used for codelets having a
 number of data buffers greater than \ref STARPU_NMAXBUFS (see \ref
 SettingTheDataHandlesForATask). When defining a codelet, one
 should either define this field or the field starpu_codelet::modes defined above.
 
+\var starpu_codelet::specific_nodes
+Default value is 0. If this flag is set, StarPU will not systematically
+send all data to the memory node where the task will be executing, it
+will read the starpu_codelet::nodes or starpu_codelet::dyn_nodes array to
+determine, for each data, whether to send it on the memory node where the task
+will be executing (-1), or on a specific node (!= -1).
+
+\var starpu_codelet::nodes
+Optional field. When starpu_codelet::specific_nodes is 1, this specifies
+the memory nodes where each data should be sent to for task execution.
+The number of entries in this array is starpu_codelet::nbuffers, and should
+not exceed \ref STARPU_NMAXBUFS.
+
+\var starpu_codelet::dyn_nodes
+Optional field. When starpu_codelet::specific_nodes is 1, this specifies
+the memory nodes where each data should be sent to for task execution.
+The number of entries in this array is starpu_codelet::nbuffers.
+This field should be used for codelets having a
+number of data buffers greater than \ref STARPU_NMAXBUFS (see \ref
+SettingTheDataHandlesForATask). When defining a codelet, one
+should either define this field or the field starpu_codelet::nodes defined
+above.
+
 \var starpu_codelet::model
 Optional pointer to the task duration performance model associated to
 this codelet. This optional field is ignored when set to <c>NULL</c> or when

+ 5 - 1
doc/doxygen/chapters/api/data_management.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
  * See the file version.doxy for copying conditions.
  */
@@ -192,6 +192,10 @@ accessed in the mode ::STARPU_REDUX. Per-worker buffers will be initialized with
 the codelet \p init_cl, and reduction between per-worker buffers will be
 done with the codelet \p redux_cl.
 
+\fn struct starpu_data_interface_ops* starpu_data_get_interface_ops(starpu_data_handle_t handle)
+\ingroup API_Data_Management
+todo
+
 @name Access registered data from the application
 \ingroup API_Data_Management
 

+ 9 - 1
doc/doxygen/chapters/api/fxt_support.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
  * See the file version.doxy for copying conditions.
  */
@@ -77,5 +77,13 @@ starpu_shutdown(). starpu_fxt_stop_profiling() can however be used to
 stop it earlier. starpu_fxt_start_profiling() can then be called to
 start recording it again, etc.
 
+\fn void starpu_fxt_write_data_trace(char *filename_in)
+\ingroup API_FxT_Support
+todo
+
+\fn void starpu_fxt_trace_user_event(unsigned long code)
+\ingroup API_FxT_Support
+Add an event in the execution trace if FxT is enabled.
+
 */
 

+ 5 - 1
doc/doxygen/chapters/api/implicit_dependencies.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
  * See the file version.doxy for copying conditions.
  */
@@ -39,4 +39,8 @@ consistency mode set using this function has the priority over the
 default mode which can be set with
 starpu_data_set_default_sequential_consistency_flag().
 
+\fn unsigned starpu_data_get_sequential_consistency_flag(starpu_data_handle_t handle)
+\ingroup API_Implicit_Data_Dependencies
+Get the data consistency mode associated to the data handle \p handle
+
 */

+ 15 - 2
doc/doxygen/chapters/api/initialization.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
  * See the file version.doxy for copying conditions.
  */
@@ -132,7 +132,7 @@ implementations) however do not support concurrent calls to
 parallel code. In such case, setting this flag makes StarPU
 only start one parallel task at a time (but other CPU and
 GPU tasks are not affected and can be run concurrently).
-The parallel task scheduler will however still however
+The parallel task scheduler will however
 still try varying combined worker sizes to look for the
 most efficient ones. This can also be specified with the environment
 variable \ref STARPU_SINGLE_COMBINED_WORKER.
@@ -241,6 +241,19 @@ This is StarPU termination method. It must be called at the end of the
 application: statistics and other post-mortem debugging information
 are not guaranteed to be available until this method has been called.
 
+\fn void starpu_pause(void)
+\ingroup API_Initialization_and_Termination
+This call is used to suspend the processing of new tasks by
+workers. It can be used in a program where StarPU is used during only
+a part of the execution. Without this call, the workers continue to
+poll for new tasks in a tight loop, wasting CPU time. The symmetric
+call to starpu_resume() should be used to unfreeze the workers.
+
+\fn void starpu_resume(void)
+\ingroup API_Initialization_and_Termination
+This is the symmetrical call to starpu_pause(), used to resume
+the workers polling for new tasks.
+
 \fn int starpu_asynchronous_copy_disabled(void)
 \ingroup API_Initialization_and_Termination
 Return 1 if asynchronous data transfers between CPU and accelerators

+ 12 - 3
doc/doxygen/chapters/api/insert_task.doxy

@@ -1,16 +1,16 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
  * See the file version.doxy for copying conditions.
  */
 
 /*! \defgroup API_Insert_Task Insert_Task
 
-\def starpu_insert_task
+\fn starpu_insert_task(struct starpu_codelet *cl, ...)
 \ingroup API_Insert_Task
-Convenience macro for the function starpu_task_insert() which used to be called starpu_insert_task.
+This function does the same as the function starpu_task_insert(). It has been kept to avoid breaking old codes.
 
 \fn int starpu_task_insert(struct starpu_codelet *cl, ...)
 \ingroup API_Insert_Task
@@ -23,6 +23,9 @@ The arguments following the codelet can be of the following types:
 ::STARPU_REDUX an access mode followed by a data handle;
 <li> ::STARPU_DATA_ARRAY followed by an array of data handles and its
 number of elements;
+<li> ::STARPU_EXECUTE_ON_WORKER followed by an integer value
+specifying the worker on which to execute the task (as specified by
+starpu_task::execute_on_a_specific_worker)
 <li> the specific values ::STARPU_VALUE, ::STARPU_CALLBACK,
 ::STARPU_CALLBACK_ARG, ::STARPU_CALLBACK_WITH_ARG, ::STARPU_PRIORITY,
 ::STARPU_TAG, ::STARPU_FLOPS, ::STARPU_SCHED_CTX followed by the
 be followed by an integer defining a priority level
 \ingroup API_Insert_Task
 TODO
 
+\def STARPU_EXECUTE_ON_WORKER
+\ingroup API_Insert_Task
+this macro is used when calling starpu_task_insert(), and must be
+followed by an integer value specifying the worker on which to execute
+the task (as specified by starpu_task::execute_on_a_specific_worker)
+
 \def STARPU_TAG
 \ingroup API_Insert_Task
 this macro is used when calling starpu_task_insert(), and must be followed by a tag.

+ 18 - 1
doc/doxygen/chapters/api/misc_helpers.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
  * See the file version.doxy for copying conditions.
  */
@@ -32,5 +32,22 @@ until the function has been executed on every appropriate processing
 units, so that it may not be called from a callback function for
 instance.
 
+\fn void starpu_execute_on_each_worker_ex(void (*func)(void *), void *arg, uint32_t where, const char *name)
+\ingroup API_Miscellaneous_Helpers
+Same as starpu_execute_on_each_worker(), except that the task name is
+specified in the argument \p name.
+
+\fn void starpu_execute_on_specific_workers(void (*func)(void*), void *arg, unsigned num_workers, unsigned *workers, const char *name);
+\ingroup API_Miscellaneous_Helpers
+Call \p func(\p arg) on every worker in the \p workers array. \p
+num_workers indicates the number of workers in this array.  This
+function is synchronous, but the different workers may execute the
+function in parallel.
+
+\fn double starpu_timing_now(void)
+\ingroup API_Miscellaneous_Helpers
+Return the current date in micro-seconds.
+
+
 */
 

+ 35 - 5
doc/doxygen/chapters/api/mpi.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
  * See the file version.doxy for copying conditions.
  */
@@ -163,19 +163,27 @@ source using the n-th message tag of the array \p mpi_tag within the
 n-th communicator of the array \p comm. On completion of the all the
 requests, \p tag is unlocked.
 
+\fn int starpu_mpi_get_communication_tag(void)
+\ingroup API_MPI_Support
+todo
+
+\fn void starpu_mpi_set_communication_tag(int tag)
+\ingroup API_MPI_Support
+todo
+
 @name Communication Cache
 \ingroup API_MPI_Support
 
 \fn void starpu_mpi_cache_flush(MPI_Comm comm, starpu_data_handle_t data_handle)
 \ingroup API_MPI_Support
 Clear the send and receive communication cache for the data
-\p data_handle. The function has to be called synchronously by all the
+\p data_handle and invalidate the value. The function has to be called synchronously by all the
 MPI nodes. The function does nothing if the cache mechanism is
 disabled (see \ref STARPU_MPI_CACHE).
 
 \fn void starpu_mpi_cache_flush_all_data(MPI_Comm comm)
 \ingroup API_MPI_Support
-Clear the send and receive communication cache for all data. The
+Clear the send and receive communication cache for all data and invalidate their values. The
 function has to be called synchronously by all the MPI nodes. The
 function does nothing if the cache mechanism is disabled (see
 \ref STARPU_MPI_CACHE).
@@ -192,6 +200,12 @@ Tell StarPU-MPI which MPI tag to use when exchanging the data.
 \ingroup API_MPI_Support
 Returns the MPI tag to be used when exchanging the data.
 
+\fn int starpu_mpi_data_register(starpu_data_handle_t handle, int tag, int rank)
+\ingroup API_MPI_Support
+Calling this function should be preferred to calling both
+starpu_data_set_rank() and starpu_data_set_tag() as it also allows to
+automatically clear the MPI communication cache when unregistering the data.
+
 \fn int starpu_data_set_rank(starpu_data_handle_t handle, int rank)
 \ingroup API_MPI_Support
 Tell StarPU-MPI which MPI node "owns" a given data, that is, the node
@@ -214,9 +228,9 @@ this macro is used when calling starpu_mpi_task_insert(), and must be
 followed by a data handle to specify that the node owning the given
 data will execute the codelet.
 
-\def starpu_mpi_insert_task
+\fn int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 \ingroup API_MPI_Support
-Convenience macro for the function starpu_mpi_task_insert() which used to be called starpu_mpi_insert_task.
+This function does the same as the function starpu_mpi_task_insert(). It has been kept to avoid breaking old codes.
 
 \fn int starpu_mpi_task_insert(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 \ingroup API_MPI_Support
@@ -256,6 +270,22 @@ The algorithm also includes a communication cache mechanism that
 allows not to send data twice to the same MPI node, unless the data
 has been modified. The cache can be disabled (see \ref STARPU_MPI_CACHE).
 
+\fn struct starpu_task *starpu_mpi_task_build(MPI_Comm comm, struct starpu_codelet *codelet, ...)
+\ingroup API_MPI_Support
+Create a task corresponding to codelet with the following arguments.
+The argument list must be zero-terminated. The function performs the
+first two steps of the function starpu_mpi_task_insert(). Only the MPI
+node selected in the first step of the algorithm will return a valid
+task structure which can then be submitted. The function
+starpu_mpi_task_post_build() MUST be called after the submission of
+the task, with the SAME list of arguments.
+
+\fn int starpu_mpi_task_post_build(MPI_Comm comm, struct starpu_codelet *codelet, ...)
+\ingroup API_MPI_Support
+This function MUST be called after a call to starpu_mpi_task_build(),
+with the SAME list of arguments. It performs the fourth -- last -- step of the algorithm described in
+starpu_mpi_task_insert().
+
 \fn void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle, int node)
 \ingroup API_MPI_Support
 Transfer data \p data_handle to MPI node \p node, sending it from its

+ 6 - 1
doc/doxygen/chapters/api/parallel_tasks.doxy

@@ -45,7 +45,12 @@ workers
 \fn void starpu_parallel_task_barrier_init(struct starpu_task *task, int workerid)
 \ingroup API_Parallel_Tasks
 Initialise the barrier for the parallel task, and dispatch the task
-between the different combined workers.
+between the different workers of the given combined worker.
+
+\fn void starpu_parallel_task_barrier_init_n(struct starpu_task *task, int worker_size)
+\ingroup API_Parallel_Tasks
+Initialise the barrier for the parallel task, to be pushed to \e worker_size
+workers (without having to specify a given combined worker).
 
 */
 

+ 23 - 3
doc/doxygen/chapters/api/performance_model.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
  * See the file version.doxy for copying conditions.
  */
@@ -103,8 +103,12 @@ must return a task duration estimation in micro-seconds.
 \var starpu_perfmodel::size_base
 Used by ::STARPU_HISTORY_BASED, ::STARPU_REGRESSION_BASED and
 ::STARPU_NL_REGRESSION_BASED. If not NULL, takes a task and
-implementation number, and returns the size to be used as index for
-history and regression.
+implementation number, and returns the size to be used as index to distinguish
+histories and as a base for regressions.
+\var starpu_perfmodel::footprint
+Used by ::STARPU_HISTORY_BASED. If not NULL, takes a task and returns the
+footprint to be used as index to distinguish histories. The default is to use
+the starpu_task_data_footprint function.
 \var starpu_perfmodel::per_arch
 Used by ::STARPU_PER_ARCH: array of structures starpu_per_arch_perfmodel
 \var starpu_perfmodel::is_loaded
@@ -207,6 +211,14 @@ in bytes
 \var starpu_perfmodel_history_entry::flops
 Provided by the application
 
+\fn void starpu_perfmodel_init(struct starpu_perfmodel *model)
+\ingroup API_Performance_Model
+todo
+
+\fn void starpu_perfmodel_init_with_file(FILE*f, struct starpu_perfmodel *model)
+\ingroup API_Performance_Model
+todo
+
 \fn int starpu_perfmodel_load_symbol(const char *symbol, struct starpu_perfmodel *model)
 \ingroup API_Performance_Model
 loads a given performance model. The model structure has to be
@@ -223,6 +235,10 @@ through the function starpu_perfmodel_load_symbol()
 \ingroup API_Performance_Model
 returns the path to the debugging information for the performance model.
 
+\fn char* starpu_perfmodel_get_archtype_name(enum starpu_worker_archtype archtype)
+\ingroup API_Performance_Model
+todo
+
 \fn void starpu_perfmodel_get_arch_name(struct starpu_perfmodel_arch *arch, char *archname, size_t maxlen, unsigned nimpl)
 \ingroup API_Performance_Model
 returns the architecture name for \p arch
@@ -276,4 +292,8 @@ Return the latency of data transfer between two memory nodes
 \ingroup API_Performance_Model
 Return the estimated time to transfer a given size between two memory nodes.
 
+\fn double starpu_permodel_history_based_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, uint32_t footprint)
+\ingroup API_Performance_Model
+todo
+
 */

+ 2 - 2
doc/doxygen/chapters/api/sc_hypervisor/sc_hypervisor_usage.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011, 2012, 2013 Institut National de Recherche en Informatique et Automatique
  * See the file version.doxy for copying conditions.
  */
@@ -49,7 +49,7 @@ Whenever we want to exclude
 contexts from the resizing process we have to unregister them from the
 hypervisor.
 
-\fn void sc_hypervisor_resize_ctxs(int *sched_ctxs, int nsched_ctxs , int *workers, int nworkers)
+\fn void sc_hypervisor_resize_ctxs(unsigned *sched_ctxs, int nsched_ctxs , int *workers, int nworkers)
 \ingroup API_SC_Hypervisor_usage
 Requires reconsidering the distribution of resources over the indicated scheduling contexts
 

+ 17 - 1
doc/doxygen/chapters/api/scheduling_contexts.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
  * See the file version.doxy for copying conditions.
  */
@@ -214,6 +214,14 @@ policy of the given scheduler context.
 Returns the current maximum priority level supported by the scheduling
 policy of the given scheduler context.
 
+\fn int starpu_sched_ctx_min_priority_is_set(unsigned sched_ctx_id)
+\ingroup API_Scheduling_Contexts
+todo
+
+\fn int starpu_sched_ctx_max_priority_is_set(unsigned sched_ctx_id)
+\ingroup API_Scheduling_Contexts
+todo
+
 @name Scheduling Context Worker Collection
 \ingroup API_Scheduling_Contexts
 
@@ -264,4 +272,12 @@ assigned to.
 \ingroup API_Scheduling_Contexts
 execute any parallel code on the workers of the sched_ctx (workers are blocked)
 
+\fn int starpu_sched_ctx_get_nready_tasks(unsigned sched_ctx_id)
+\ingroup API_Scheduling_Contexts
+todo
+
+\fn double starpu_sched_ctx_get_nready_flops(unsigned sched_ctx_id)
+\ingroup API_Scheduling_Contexts
+todo
+
 */

+ 6 - 5
doc/doxygen/chapters/api/scheduling_policy.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
  * See the file version.doxy for copying conditions.
  */
@@ -126,13 +126,14 @@ Check if the worker specified by workerid can execute the codelet.
 Schedulers need to call it before assigning a task to a worker,
 otherwise the task may fail to execute.
 
-\fn double starpu_timing_now(void)
+\fn uint32_t starpu_task_footprint(struct starpu_perfmodel *model, struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl)
 \ingroup API_Scheduling_Policy
-Return the current date in micro-seconds.
+Returns the footprint for a given task, taking into account user-provided
+perfmodel footprint or size_base functions.
 
-\fn uint32_t starpu_task_footprint(struct starpu_perfmodel *model, struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl)
+\fn uint32_t starpu_task_data_footprint(struct starpu_task *task)
 \ingroup API_Scheduling_Policy
-Returns the footprint for a given task
+Returns the raw footprint for the data of a given task (without taking into account user-provided functions).
 
 \fn double starpu_task_expected_length(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl)
 \ingroup API_Scheduling_Policy

+ 6 - 0
doc/doxygen/chapters/api/standard_memory_library.doxy

@@ -65,6 +65,12 @@ This function frees memory by specifying its size. The given
 flags should be consistent with the ones given to starpu_malloc_flags()
 when allocating the memory.
 
+\fn ssize_t starpu_memory_get_total(unsigned node)
+\ingroup API_Standard_Memory_Library
+If a memory limit is defined on the given node (see Section \ref
+HowToLimitMemoryPerNode), return the total amount of memory
+on the node. Otherwise return -1.
+
 \fn ssize_t starpu_memory_get_available(unsigned node)
 \ingroup API_Standard_Memory_Library
 If a memory limit is defined on the given node (see Section \ref

+ 65 - 2
doc/doxygen/chapters/api/threads.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
  * See the file version.doxy for copying conditions.
  */
@@ -11,7 +11,7 @@
 \brief This section describes the thread facilities provided
 by StarPU. The thread function are either implemented on top of the
 pthread library or the Simgrid library when the simulated performance
-mode is enabled (\ref SimulatedPerformance).
+mode is enabled (\ref SimGridSupport).
 
 \def STARPU_PTHREAD_CREATE_ON
 \ingroup API_Threads
@@ -140,6 +140,12 @@ terminate.  If that thread has already terminated, then the function
 returns immediately. The thread specified by \p thread must be
 joinable.
 
+\fn int starpu_pthread_exit(void *retval)
+\ingroup API_Threads
+This function terminates the calling thread and returns a value via
+\p retval that (if the thread is joinable) is available to another thread
+in the same process that calls starpu_pthread_join().
+
 \fn int starpu_pthread_attr_init(starpu_pthread_attr_t *attr)
 \ingroup API_Threads
 This function initializes the thread attributes object pointed to by
@@ -214,6 +220,22 @@ enable-fxt-lock "--enable-fxt-lock" is enabled.
 \ingroup API_Threads
 This macro initializes the mutex given in parameter.
 
+\fn int starpu_pthread_mutexattr_gettype(const starpu_pthread_mutexattr_t *attr, int *type)
+\ingroup API_Threads
+todo
+
+\fn int starpu_pthread_mutexattr_settype(starpu_pthread_mutexattr_t *attr, int type)
+\ingroup API_Threads
+todo
+
+\fn int starpu_pthread_mutexattr_destroy(starpu_pthread_mutexattr_t *attr)
+\ingroup API_Threads
+todo
+
+\fn int starpu_pthread_mutexattr_init(starpu_pthread_mutexattr_t *attr)
+\ingroup API_Threads
+todo
+
 \fn int starpu_pthread_key_create(starpu_pthread_key_t *key, void (*destr_function) (void *))
 \ingroup API_Threads
 This function allocates a new TSD key. The key is stored in the
@@ -295,12 +317,53 @@ This function is the same as starpu_pthread_mutex_destroy().
 \ingroup API_Threads
 This function is the same as starpu_pthread_mutex_lock().
 
+\fn int starpu_pthread_rwlock_tryrdlock(starpu_pthread_rwlock_t *rwlock)
+\ingroup API_Threads
+todo
+
 \fn starpu_pthread_rwlock_wrlock(starpu_pthread_rwlock_t *rwlock)
 \ingroup API_Threads
 This function is the same as starpu_pthread_mutex_lock().
 
+\fn int starpu_pthread_rwlock_trywrlock(starpu_pthread_rwlock_t *rwlock)
+\ingroup API_Threads
+todo
+
 \fn starpu_pthread_rwlock_unlock(starpu_pthread_rwlock_t *rwlock)
 \ingroup API_Threads
 This function is the same as starpu_pthread_mutex_unlock().
 
+\fn int starpu_pthread_barrier_init(starpu_pthread_barrier_t *barrier, const starpu_pthread_barrierattr_t *attr, unsigned count)
+\ingroup API_Threads
+todo
+
+\fn int starpu_pthread_barrier_destroy(starpu_pthread_barrier_t *barrier)
+\ingroup API_Threads
+todo
+
+\fn int starpu_pthread_barrier_wait(starpu_pthread_barrier_t *barrier)
+\ingroup API_Threads
+todo
+
+\fn int starpu_pthread_spin_init(starpu_pthread_spinlock_t *lock, int pshared)
+\ingroup API_Threads
+todo
+
+\fn int starpu_pthread_spin_destroy(starpu_pthread_spinlock_t *lock)
+\ingroup API_Threads
+todo
+
+\fn int starpu_pthread_spin_lock(starpu_pthread_spinlock_t *lock)
+\ingroup API_Threads
+todo
+
+\fn int starpu_pthread_spin_trylock(starpu_pthread_spinlock_t *lock)
+\ingroup API_Threads
+todo
+
+\fn int starpu_pthread_spin_unlock(starpu_pthread_spinlock_t *lock)
+\ingroup API_Threads
+todo
+
+
 */

+ 46 - 0
doc/doxygen/chapters/api/tree.doxy

@@ -0,0 +1,46 @@
+/*
+ * This file is part of the StarPU Handbook.
+ * Copyright (C) 2014  Centre National de la Recherche Scientifique
+ * See the file version.doxy for copying conditions.
+ */
+
+/*! \defgroup API_Tree Tree
+
+\brief This section describes the tree facilities provided by StarPU.
+
+\struct starpu_tree
+\ingroup API_Tree
+\var starpu_tree::nodes
+todo
+\var starpu_tree::father
+todo
+\var starpu_tree::arity
+todo
+\var starpu_tree::id
+todo
+\var starpu_tree::level
+todo
+\var starpu_tree::is_pu
+todo
+
+\fn void starpu_tree_reset_visited(struct starpu_tree *tree, int *visited)
+\ingroup API_Tree
+todo
+
+\fn void starpu_tree_insert(struct starpu_tree *tree, int id, int level, int is_pu, int arity, struct starpu_tree *father)
+\ingroup API_Tree
+todo
+
+\fn struct starpu_tree *starpu_tree_get(struct starpu_tree *tree, int id)
+\ingroup API_Tree
+todo
+
+\fn struct starpu_tree *starpu_tree_get_neighbour(struct starpu_tree *tree, struct starpu_tree *node, int *visited, int *present)
+\ingroup API_Tree
+todo
+
+\fn int starpu_tree_free(struct starpu_tree *tree)
+\ingroup API_Tree
+todo
+
+*/

+ 29 - 0
doc/doxygen/dev/sc_funcs.cocci

@@ -0,0 +1,29 @@
+// StarPU --- Runtime system for heterogeneous multicore architectures.
+//
+// Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
+// Copyright (C) 2014 Centre National de la Recherche Scientifique
+//
+// StarPU is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as published by
+// the Free Software Foundation; either version 2.1 of the License, or (at
+// your option) any later version.
+//
+// StarPU is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+//
+// See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+@scfunc@
+position p;
+type t;
+identifier f =~ "sc";
+@@
+
+t f@p( ... );
+
+@ script:python @
+p << scfunc.p;
+f << scfunc.f;
+@@
+print "%s,%s:%s" % (f,p[0].file,p[0].line)

+ 9 - 4
doc/doxygen/dev/starpu_check_documented.py

@@ -1,6 +1,7 @@
 #!/usr/bin/python
 
 import os
+import sys
 
 class bcolors:
     FAILURE = '\033[91m'
@@ -22,17 +23,21 @@ def loadFunctionsAndDatatypes(flist, dtlist, fname):
 functions = []
 datatypes = []
 
-for docfile in os.listdir('chapters/api'):
+dirname=os.path.dirname(sys.argv[0])
+docfile_dir=dirname+"/../chapters/api/"
+
+for docfile in os.listdir(docfile_dir):
     if docfile.count(".doxy"):
-        loadFunctionsAndDatatypes(functions, datatypes, "chapters/api/"+docfile)
+        loadFunctionsAndDatatypes(functions, datatypes, docfile_dir+docfile)
 
+incfiles=dirname+"/../../../include/*.h " + dirname + "/../../../mpi/include/*.h " + dirname + "/../../../starpufft/*h " + dirname + "/../../../sc_hypervisor/include/*.h " + dirname + "/../../../include/starpu_config.h.in"
 for function in functions:
-    x = os.system("fgrep -l \"" + function[0] + "\" ../../include/*.h ../../mpi/include/*.h ../../starpufft/*h ../../sc_hypervisor/include/*.h > /dev/null")
+    x = os.system("fgrep -l \"" + function[0] + "\" " + incfiles + " > /dev/null")
     if x != 0:
         print "Function <" + bcolors.FAILURE + function[0] + bcolors.NORMAL + "> documented in <" + function[1] + "> does not exist in StarPU's API"
 
 for datatype in datatypes:
-    x = os.system("fgrep -l \"" + datatype[0] + "\" ../../include/*.h ../../mpi/include/*.h ../../starpufft/*h ../../sc_hypervisor/include/*.h > /dev/null")
+    x = os.system("fgrep -l \"" + datatype[0] + "\" " + incfiles + " > /dev/null")
     if x != 0:
         print "Datatype <" + bcolors.FAILURE + datatype[0] + bcolors.NORMAL + "> documented in <" + datatype[1] + "> does not exist in StarPU's API"
 

+ 58 - 45
doc/doxygen/dev/starpu_check_undocumented.sh

@@ -4,7 +4,7 @@
 
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2011, 2012, 2013 Centre National de la Recherche Scientifique
+# Copyright (C) 2011, 2012, 2013, 2014 Centre National de la Recherche Scientifique
 # Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -22,57 +22,70 @@ stcolor=$(tput sgr0)
 redcolor=$(tput setaf 1)
 greencolor=$(tput setaf 2)
 
-H_FILES=$(find ../../include ../../mpi/include -name '*.h')
+dirname=$(dirname $0)
 
-functions=$(spatch -very_quiet -sp_file ./dev/starpu_funcs.cocci $H_FILES)
-for func in $functions ; do
+STARPU_H_FILES=$(find $dirname/../../../include $dirname/../../../mpi/include -name '*.h')
+SC_H_FILES=$(find $dirname/../../../sc_hypervisor/include -name '*.h')
+SRC="$dirname/../../../src $dirname/../../../mpi/src $dirname/../../../sc_hypervisor/src"
+
+if [ "$1" == "--func" ] || [ "$1" == "" ] ; then
+    starpu_functions=$(spatch -very_quiet -sp_file $dirname/starpu_funcs.cocci $STARPU_H_FILES)
+    sc_functions=$(spatch -very_quiet -sp_file $dirname/sc_funcs.cocci $SC_H_FILES)
+    for func in $starpu_functions $sc_functions ; do
 	fname=$(echo $func|awk -F ',' '{print $1}')
 	location=$(echo $func|awk -F ',' '{print $2}')
-	x=$(grep "$fname(" chapters/api/*.doxy | grep "\\fn")
+	x=$(grep "$fname(" $dirname/../chapters/api/*.doxy | grep "\\fn")
 	if test "$x" == "" ; then
-		echo "function ${redcolor}${fname}${stcolor} at location ${redcolor}$location${stcolor} is not (or incorrectly) documented"
-#	else
-#		echo "function ${greencolor}${fname}${stcolor} at location $location is correctly documented"
+	    echo "function ${redcolor}${fname}${stcolor} at location ${redcolor}$location${stcolor} is not (or incorrectly) documented"
+	    #	else
+	    #		echo "function ${greencolor}${fname}${stcolor} at location $location is correctly documented"
 	fi
-done
-
-echo
-
-structs=$(grep "struct starpu" $H_FILES | grep -v "[;|,|(|)]" | awk '{print $2}')
-for struct in $structs ; do
-    x=$(grep -F "\\struct $struct" chapters/api/*.doxy)
-    if test "$x" == "" ; then
-	echo "struct ${redcolor}${struct}${stcolor} is not (or incorrectly) documented"
-    fi
-done
-
-echo
+    done
+    echo
+fi
 
-enums=$(grep "enum starpu" $H_FILES | grep -v "[;|,|(|)]" | awk '{print $2}')
-for enum in $enums ; do
-    x=$(grep -F "\\enum $enum" chapters/api/*.doxy)
-    if test "$x" == "" ; then
-	echo "enum ${redcolor}${enum}${stcolor} is not (or incorrectly) documented"
-    fi
-done
-
-echo
+if [ "$1" == "--struct" ] || [ "$1" == "" ] ; then
+    starpu_structs=$(grep "struct starpu" $STARPU_H_FILES | grep -v "[;|,|(|)]" | awk '{print $2}')
+    sc_structs=$(grep "struct sc" $SC_H_FILES | grep -v "[;|,|(|)]" | awk '{print $2}')
+    for struct in $starpu_structs $sc_structs ; do
+	x=$(grep -F "\\struct $struct" $dirname/../chapters/api/*.doxy)
+	if test "$x" == "" ; then
+	    echo "struct ${redcolor}${struct}${stcolor} is not (or incorrectly) documented"
+	fi
+    done
+    echo
+fi
 
-macros=$(grep "define\b" $H_FILES |grep -v deprecated|grep "#" | grep -v "__" | sed 's/#[ ]*/#/g' | awk '{print $2}' | awk -F'(' '{print $1}' | sort|uniq)
-for macro in $macros ; do
-    x=$(grep -F "\\def $macro" chapters/api/*.doxy)
-    if test "$x" == "" ; then
-	echo "macro ${redcolor}${macro}${stcolor} is not (or incorrectly) documented"
-    fi
-done
+if [ "$1" == "--enum" ] || [ "$1" == "" ] ; then
+    starpu_enums=$(grep "enum starpu" $STARPU_H_FILES | grep -v "[;|,|(|)]" | awk '{print $2}')
+    sc_enums=$(grep "enum sc" $SC_H_FILES | grep -v "[;|,|(|)]" | awk '{print $2}')
+    for enum in $starpu_enums $sc_enums ; do
+	x=$(grep -F "\\enum $enum" $dirname/../chapters/api/*.doxy)
+	if test "$x" == "" ; then
+	    echo "enum ${redcolor}${enum}${stcolor} is not (or incorrectly) documented"
+	fi
+    done
+    echo
+fi
 
-echo
+if [ "$1" == "--macro" ] || [ "$1" == "" ] ; then
+    macros=$(grep "define\b" $STARPU_H_FILES $SC_H_FILES |grep -v deprecated|grep "#" | grep -v "__" | sed 's/#[ ]*/#/g' | awk '{print $2}' | awk -F'(' '{print $1}' | sort|uniq)
+    for macro in $macros ; do
+	x=$(grep -F "\\def $macro" $dirname/../chapters/api/*.doxy)
+	if test "$x" == "" ; then
+	    echo "macro ${redcolor}${macro}${stcolor} is not (or incorrectly) documented"
+	fi
+    done
+    echo
+fi
 
-variables=$(grep --exclude-dir=.svn -rs -E "(getenv|get_env)" src/| tr ' ' '\012'|grep -E "(getenv|get_env)" | grep "\"" | sed 's/.*("//' | sed 's/").*//'|sort|uniq)
-for variable in $variables ; do
-    x=$(grep "$variable" chapters/environment_variables.doxy | grep "\\anchor")
-    if test "$x" == "" ; then
-	echo "variable ${redcolor}${variable}${stcolor} is not (or incorrectly) documented"
-    fi
-done
+if [ "$1" == "--var" ] || [ "$1" == "" ] ; then
+    variables=$(grep --exclude-dir=.svn -rs -E "(getenv|get_env)" $SRC| tr ' ' '\012'|grep -E "(getenv|get_env)" | grep "\"" | sed 's/.*("//' | sed 's/").*//'|sort|uniq)
+    for variable in $variables ; do
+	x=$(grep "$variable" $dirname/../chapters/40environment_variables.doxy | grep "\\anchor")
+	if test "$x" == "" ; then
+	    echo "variable ${redcolor}${variable}${stcolor} is not (or incorrectly) documented"
+	fi
+    done
+fi
 

+ 4 - 2
doc/doxygen/doxygen-config.cfg.in

@@ -1,7 +1,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 # Copyright (C) 2009-2013  Université de Bordeaux 1
-# Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+# Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
 # Copyright (C) 2011  Télécom-SudParis
 # Copyright (C) 2011, 2012  Institut National de Recherche en Informatique et Automatique
 #
@@ -20,14 +20,15 @@ INPUT                  = @top_srcdir@/doc/doxygen/chapters \
 		       	 @top_srcdir@/doc/doxygen/chapters/api \
 		       	 @top_srcdir@/doc/doxygen/chapters/api/sc_hypervisor \
                          @top_builddir@/doc/doxygen/starpu_config.h \
+			 @top_srcdir@/include/starpu_bitmap.h \
 	 		 @top_srcdir@/include/starpu_bound.h \
 			 @top_srcdir@/include/starpu_cublas.h \
 			 @top_srcdir@/include/starpu_cuda.h \
 			 @top_srcdir@/include/starpu_data_filters.h \
 			 @top_srcdir@/include/starpu_data.h \
 			 @top_srcdir@/include/starpu_data_interfaces.h \
-			 @top_srcdir@/include/starpu_disk.h \
 			 @top_srcdir@/include/starpu_deprecated_api.h \
+			 @top_srcdir@/include/starpu_disk.h \
 			 @top_srcdir@/include/starpu_driver.h \
 			 @top_srcdir@/include/starpu_expert.h \
 			 @top_srcdir@/include/starpu_fxt.h \
@@ -51,6 +52,7 @@ INPUT                  = @top_srcdir@/doc/doxygen/chapters \
 			 @top_srcdir@/include/starpu_thread.h \
 			 @top_srcdir@/include/starpu_thread_util.h \
 			 @top_srcdir@/include/starpu_top.h \
+			 @top_srcdir@/include/starpu_tree.h \
 			 @top_srcdir@/include/starpu_util.h \
 			 @top_srcdir@/include/starpu_worker.h \
 			 @top_srcdir@/include/starpu_sched_component.h \

+ 3 - 1
doc/doxygen/doxygen_filter.sh.in

@@ -3,6 +3,8 @@
 if [ "$(basename $1)" == "starpufft.h" ] ; then
     gcc -E $1 -I @top_srcdir@/include/ -I @top_builddir@/include/ |grep starpufft
 else
-    sed -e 's/STARPU_DEPRECATED//' $1 | sed 's/\/\/.*//'
+    # the macro STARPU_DEPRECATED needs to be removed as it is not properly processed by doxygen
+    # lines starting with // in the doxygen input files are considered as comments to be removed
+    sed -e 's/STARPU_DEPRECATED//' $1 | sed 's/^\/\/.*//'
 fi
 

+ 73 - 35
doc/doxygen/refman.tex

@@ -68,7 +68,7 @@ was last updated on \STARPUUPDATED.\\
 
 Copyright © 2009–2013 Université de Bordeaux 1\\
 
-Copyright © 2010-2013 Centre National de la Recherche Scientifique\\
+Copyright © 2010-2014 Centre National de la Recherche Scientifique\\
 
 Copyright © 2011, 2012 Institut National de Recherche en Informatique et Automatique\\
 
@@ -94,7 +94,7 @@ Documentation License”.
 \hypertarget{index}{}
 \input{index}
 
-\part{Using StarPU}
+\part{StarPU Basics}
 
 \chapter{Building and Installing StarPU}
 \label{BuildingAndInstallingStarPU}
@@ -106,33 +106,77 @@ Documentation License”.
 \hypertarget{BasicExamples}{}
 \input{BasicExamples}
 
+\part{StarPU Quick Programming Guide}
+
 \chapter{Advanced Examples}
 \label{AdvancedExamples}
 \hypertarget{AdvancedExamples}{}
 \input{AdvancedExamples}
 
-\chapter{How To Optimize Performance With StarPU}
-\label{HowToOptimizePerformanceWithStarPU}
-\hypertarget{HowToOptimizePerformanceWithStarPU}{}
-\input{HowToOptimizePerformanceWithStarPU}
+\chapter{Check List When Performance Are Not There}
+\label{CheckListWhenPerformanceAreNotThere}
+\hypertarget{CheckListWhenPerformanceAreNotThere}{}
+\input{CheckListWhenPerformanceAreNotThere}
+
+\part{StarPU Inside}
+
+\chapter{Tasks In StarPU}
+\label{TasksInStarPU}
+\hypertarget{TasksInStarPU}{}
+\input{TasksInStarPU}
+
+\chapter{Data Management}
+\label{DataManagement}
+\hypertarget{DataManagement}{}
+\input{DataManagement}
+
+\chapter{Scheduling}
+\label{Scheduling}
+\hypertarget{Scheduling}{}
+\input{Scheduling}
+
+\chapter{Scheduling Contexts}
+\label{SchedulingContexts}
+\hypertarget{SchedulingContexts}{}
+\input{SchedulingContexts}
+
+\chapter{Scheduling Context Hypervisor}
+\label{SchedulingContextHypervisor}
+\hypertarget{SchedulingContextHypervisor}{}
+\input{SchedulingContextHypervisor}
+
+\chapter{Modularized Scheduler}
+\label{ModularizedScheduler}
+\hypertarget{ModularizedScheduler}{}
+\input{ModularizedScheduler}
+
+\chapter{Debugging Tools}
+\label{DebuggingTools}
+\hypertarget{DebuggingTools}{}
+\input{DebuggingTools}
+
+\chapter{Online Performance Tools}
+\label{OnlinePerformanceTools}
+\hypertarget{OnlinePerformanceTools}{}
+\input{OnlinePerformanceTools}
+
+\chapter{Offline Performance Tools}
+\label{OfflinePerformanceTools}
+\hypertarget{OfflinePerformanceTools}{}
+\input{OfflinePerformanceTools}
 
-\chapter{Performance Feedback}
-\label{PerformanceFeedback}
-\hypertarget{PerformanceFeedback}{}
-\input{PerformanceFeedback}
+\chapter{Frequently Asked Questions}
+\label{FrequentlyAskedQuestions}
+\hypertarget{FrequentlyAskedQuestions}{}
+\input{FrequentlyAskedQuestions}
 
-\chapter{Tips and Tricks To Know About}
-\label{TipsAndTricksToKnowAbout}
-\hypertarget{TipsAndTricksToKnowAbout}{}
-\input{TipsAndTricksToKnowAbout}
+\part{StarPU Extensions}
 
 \chapter{Out Of Core}
 \label{OutOfCore}
 \hypertarget{OutOfCore}{}
 \input{OutOfCore}
 
-
-
 \chapter{MPI Support}
 \label{MPISupport}
 \hypertarget{MPISupport}{}
@@ -158,22 +202,12 @@ Documentation License”.
 \hypertarget{SOCLOpenclExtensions}{}
 \input{SOCLOpenclExtensions}
 
-\chapter{Scheduling Contexts}
-\label{SchedulingContexts}
-\hypertarget{SchedulingContexts}{}
-\input{SchedulingContexts}
+\chapter{SimGrid Support}
+\label{SimGridSupport}
+\hypertarget{SimGridSupport}{}
+\input{SimGridSupport}
 
-\chapter{Scheduling Context Hypervisor}
-\label{SchedulingContextHypervisor}
-\hypertarget{SchedulingContextHypervisor}{}
-\input{SchedulingContextHypervisor}
-
-\chapter{Modularized Scheduler}
-\label{ModularizedScheduler}
-\hypertarget{ModularizedScheduler}{}
-\input{ModularizedScheduler}
-
-\part{Inside StarPU}
+\part{StarPU Reference API}
 
 \chapter{Execution Configuration Through Environment Variables}
 \label{ExecutionConfigurationThroughEnvironmentVariables}
@@ -197,6 +231,7 @@ Documentation License”.
 \input{group__API__Standard__Memory__Library}
 \input{group__API__Toolbox}
 \input{group__API__Threads}
+\input{group__API__Bitmap}
 \input{group__API__Workers__Properties}
 \input{group__API__Data__Management}
 \input{group__API__Data__Interfaces}
@@ -226,6 +261,7 @@ Documentation License”.
 \input{group__API__StarPUTop__Interface}
 \input{group__API__Scheduling__Contexts}
 \input{group__API__Scheduling__Policy}
+\input{group__API__Tree}
 \input{group__API__SC__Hypervisor__usage}
 \input{group__API__SC__Hypervisor}
 \input{group__API__Modularized__Scheduler}
@@ -238,6 +274,7 @@ Documentation License”.
 \hypertarget{FileDocumentation}{}
 
 \input{starpu_8h}
+\input{starpu__bitmap_8h}
 \input{starpu__bound_8h}
 \input{starpu__config_8h}
 \input{starpu__cublas_8h}
@@ -269,6 +306,7 @@ Documentation License”.
 \input{starpu__thread_8h}
 \input{starpu__thread__util_8h}
 \input{starpu__top_8h}
+\input{starpu__tree_8h}
 \input{starpu__util_8h}
 \input{starpu__worker_8h}
 \input{starpu__mpi_8h}
@@ -283,10 +321,6 @@ Documentation License”.
 \hypertarget{deprecated}{}
 \input{deprecated}
 
-
-\addcontentsline{toc}{chapter}{Index}
-\printindex
-
 \part{Appendix}
 
 \chapter{Full Source Code for the ’Scaling a Vector’ Example}
@@ -299,4 +333,8 @@ Documentation License”.
 \hypertarget{GNUFreeDocumentationLicense}{}
 \input{GNUFreeDocumentationLicense}
 
+\part{Index}
+\addcontentsline{toc}{chapter}{Index}
+\printindex
+
 \end{document}

doc/tutorial/hello_world_mvsc.c → doc/tutorial/hello_world_msvc.c


+ 16 - 5
examples/Makefile.am

@@ -1,7 +1,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 # Copyright (C) 2009-2013  Université de Bordeaux 1
-# Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+# Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
 # Copyright (C) 2011  Télécom-SudParis
 # Copyright (C) 2011-2012  INRIA
 #
@@ -57,7 +57,7 @@ EXTRA_DIST = 					\
 	reductions/dot_product_opencl_kernels.cl	\
 	scheduler/schedulers.sh
 
-CLEANFILES = *.gcno *.gcda *.linkinfo
+CLEANFILES = *.gcno *.gcda *.linkinfo *.mod starpu_idle_microsec.log
 
 if STARPU_USE_CUDA
 
@@ -98,7 +98,7 @@ noinst_HEADERS = 				\
 	common/blas.h				\
 	mult/simple.h				\
 	mult/double.h				\
-	fortran/bindings/StarPU_fortran.h	\
+	fortran/StarPU_fortran.h		\
 	ppm_downscaler/ppm_downscaler.h		\
 	ppm_downscaler/yuv_downscaler.h		\
 	spmv/matrix_market/mmio.h		\
@@ -188,6 +188,8 @@ examplebin_PROGRAMS +=				\
 	sched_ctx/parallel_code			\
 	sched_ctx/dummy_sched_with_ctx		\
 	sched_ctx/prio				\
+	worker_collections/worker_tree_example  \
+	worker_collections/worker_list_example  \
 	reductions/dot_product			\
 	reductions/minmax_reduction		\
 	mandelbrot/mandelbrot			\
@@ -200,7 +202,8 @@ examplebin_PROGRAMS +=				\
 
 if STARPU_HAVE_F77_H
 examplebin_PROGRAMS +=				\
-	basic_examples/vector_scal_fortran
+	basic_examples/vector_scal_fortran	\
+	fortran/hello
 endif
 endif
 
@@ -264,6 +267,8 @@ STARPU_EXAMPLES +=				\
 	sched_ctx/sched_ctx			\
 	sched_ctx/prio				\
 	sched_ctx/dummy_sched_with_ctx		\
+	worker_collections/worker_tree_example  \
+	worker_collections/worker_list_example  \
 	reductions/dot_product			\
 	reductions/minmax_reduction
 
@@ -274,7 +279,8 @@ endif
 
 if STARPU_HAVE_F77_H
 STARPU_EXAMPLES +=				\
-	basic_examples/vector_scal_fortran
+	basic_examples/vector_scal_fortran	\
+	fortran/hello
 endif
 
 if !NO_BLAS_LIB
@@ -348,6 +354,11 @@ basic_examples_vector_scal_fortran_SOURCES +=	\
 basic_examples_vector_scal_fortran_LDADD =	\
 	$(STARPU_CUDA_FORTRAN_LDFLAGS)
 endif
+
+fortran_hello_SOURCES	=		\
+	fortran/hello_c.c		\
+	fortran/hello.F			\
+	fortran/StarPU_fortran.h
 endif
 
 #######################

+ 1 - 5
examples/basic_examples/dynamic_handles.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2013, 2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -52,8 +52,6 @@ static void dummy_big_kernel(void *descr[], void *cl_arg)
 
 static struct starpu_codelet dummy_small_cl =
 {
-	.cuda_funcs = {dummy_small_kernel, NULL},
-	.opencl_funcs = {dummy_small_kernel, NULL},
 	.cpu_funcs = {dummy_small_kernel, NULL},
 	.modes = {STARPU_RW},
 	.nbuffers = 1
@@ -61,8 +59,6 @@ static struct starpu_codelet dummy_small_cl =
 
 struct starpu_codelet dummy_big_cl =
 {
-	.cuda_funcs = {dummy_big_kernel, NULL},
-	.opencl_funcs = {dummy_big_kernel, NULL},
 	.cpu_funcs = {dummy_big_kernel, NULL},
 	.nbuffers = STARPU_NMAXBUFS+1
 };

+ 2 - 1
examples/basic_examples/vector_scal.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -83,6 +83,7 @@ static struct starpu_codelet cl =
 #ifdef STARPU_USE_CUDA
 	/* CUDA implementation of the codelet */
 	.cuda_funcs = {scal_cuda_func, NULL},
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 	/* OpenCL implementation of the codelet */

+ 1 - 3
examples/basic_examples/vector_scal_cuda.cu

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2014  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -42,6 +42,4 @@ extern "C" void scal_cuda_func(void *buffers[], void *_args)
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
         vector_mult_cuda<<<nblocks,threads_per_block,0,starpu_cuda_get_local_stream()>>>(n, val, *factor);
-
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 9 - 1
examples/cholesky/cholesky.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2013  Université de Bordeaux 1
+ * Copyright (C) 2009-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -128,6 +128,14 @@ static unsigned with_noctxs = 0;
 static unsigned chole1 = 0;
 static unsigned chole2 = 0;
 
+struct starpu_perfmodel chol_model_11;
+struct starpu_perfmodel chol_model_21;
+struct starpu_perfmodel chol_model_22;
+
+struct starpu_codelet cl11;
+struct starpu_codelet cl21;
+struct starpu_codelet cl22;
+
 void chol_cpu_codelet_update_u11(void **, void *);
 void chol_cpu_codelet_update_u21(void **, void *);
 void chol_cpu_codelet_update_u22(void **, void *);

+ 1 - 38
examples/cholesky/cholesky_grain_tag.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2013  Université de Bordeaux 1
+ * Copyright (C) 2009-2014  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
@@ -18,10 +18,6 @@
 
 #include "cholesky.h"
 
-struct starpu_perfmodel chol_model_11;
-struct starpu_perfmodel chol_model_21;
-struct starpu_perfmodel chol_model_22;
-
 /*
  *	Some useful functions
  */
@@ -40,17 +36,6 @@ static struct starpu_task *create_task(starpu_tag_t id)
  *	Create the codelets
  */
 
-static struct starpu_codelet cl11 =
-{
-	.modes = { STARPU_RW },
-	.cpu_funcs = {chol_cpu_codelet_update_u11, NULL},
-#ifdef STARPU_USE_CUDA
-	.cuda_funcs = {chol_cublas_codelet_update_u11, NULL},
-#endif
-	.nbuffers = 1,
-	.model = &chol_model_11
-};
-
 static struct starpu_task * create_task_11(starpu_data_handle_t dataA, unsigned k, unsigned reclevel)
 {
 /*	FPRINTF(stdout, "task 11 k = %d TAG = %llx\n", k, (TAG11(k))); */
@@ -77,17 +62,6 @@ static struct starpu_task * create_task_11(starpu_data_handle_t dataA, unsigned
 	return task;
 }
 
-static struct starpu_codelet cl21 =
-{
-	.modes = { STARPU_R, STARPU_RW },
-	.cpu_funcs = {chol_cpu_codelet_update_u21, NULL},
-#ifdef STARPU_USE_CUDA
-	.cuda_funcs = {chol_cublas_codelet_update_u21, NULL},
-#endif
-	.nbuffers = 2,
-	.model = &chol_model_21
-};
-
 static int create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned j, unsigned reclevel)
 {
 	int ret;
@@ -123,17 +97,6 @@ static int create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned j, un
 	return ret;
 }
 
-static struct starpu_codelet cl22 =
-{
-	.modes = { STARPU_R, STARPU_R, STARPU_RW },
-	.cpu_funcs = {chol_cpu_codelet_update_u22, NULL},
-#ifdef STARPU_USE_CUDA
-	.cuda_funcs = {chol_cublas_codelet_update_u22, NULL},
-#endif
-	.nbuffers = 3,
-	.model = &chol_model_22
-};
-
 static int create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned i, unsigned j, unsigned reclevel)
 {
 	int ret;

+ 12 - 55
examples/cholesky/cholesky_implicit.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2013  Université de Bordeaux 1
+ * Copyright (C) 2009-2014  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
@@ -18,55 +18,6 @@
 
 #include "cholesky.h"
 #include "../sched_ctx_utils/sched_ctx_utils.h"
-/*
- *	Create the codelets
- */
-struct starpu_perfmodel chol_model_11;
-struct starpu_perfmodel chol_model_21;
-struct starpu_perfmodel chol_model_22;
-
-static struct starpu_codelet cl11 =
-{
-	.type = STARPU_SEQ,
-	.cpu_funcs = {chol_cpu_codelet_update_u11, NULL},
-#ifdef STARPU_USE_CUDA
-	.cuda_funcs = {chol_cublas_codelet_update_u11, NULL},
-#elif defined(STARPU_SIMGRID)
-	.cuda_funcs = {(void*)1, NULL},
-#endif
-	.nbuffers = 1,
-	.modes = {STARPU_RW},
-	.model = &chol_model_11
-};
-
-static struct starpu_codelet cl21 =
-{
-	.type = STARPU_SEQ,
-	.cpu_funcs = {chol_cpu_codelet_update_u21, NULL},
-#ifdef STARPU_USE_CUDA
-	.cuda_funcs = {chol_cublas_codelet_update_u21, NULL},
-#elif defined(STARPU_SIMGRID)
-	.cuda_funcs = {(void*)1, NULL},
-#endif
-	.nbuffers = 2,
-	.modes = {STARPU_R, STARPU_RW},
-	.model = &chol_model_21
-};
-
-static struct starpu_codelet cl22 =
-{
-	.type = STARPU_SEQ,
-	.max_parallelism = INT_MAX,
-	.cpu_funcs = {chol_cpu_codelet_update_u22, NULL},
-#ifdef STARPU_USE_CUDA
-	.cuda_funcs = {chol_cublas_codelet_update_u22, NULL},
-#elif defined(STARPU_SIMGRID)
-	.cuda_funcs = {(void*)1, NULL},
-#endif
-	.nbuffers = 3,
-	.modes = {STARPU_R, STARPU_R, STARPU_RW},
-	.model = &chol_model_22
-};
 
 /*
  *	code to bootstrap the factorization
@@ -90,10 +41,12 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 
 	int prio_level = noprio?STARPU_DEFAULT_PRIO:STARPU_MAX_PRIO;
 
-	start = starpu_timing_now();
-
 	if (bound || bound_lp || bound_mps)
 		starpu_bound_start(bound_deps, 0);
+	starpu_fxt_start_profiling();
+
+	start = starpu_timing_now();
+
 	/* create all the DAG nodes */
 	for (k = 0; k < nblocks; k++)
 	{
@@ -132,7 +85,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 								 STARPU_PRIORITY, ((i == k+1) && (j == k+1))?prio_level:STARPU_DEFAULT_PRIO,
 								 STARPU_R, sdataki,
 								 STARPU_R, sdatakj,
-								 STARPU_RW, sdataij,
+								 STARPU_RW | STARPU_COMMUTE, sdataij,
 								 STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
 								 0);
 					if (ret == -ENODEV) return 77;
@@ -143,11 +96,13 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 	}
 
 	starpu_task_wait_for_all();
-	if (bound || bound_lp || bound_mps)
-		starpu_bound_stop();
 
 	end = starpu_timing_now();
 
+	starpu_fxt_stop_profiling();
+	if (bound || bound_lp || bound_mps)
+		starpu_bound_stop();
+
 	double timing = end - start;
 
 	double flop = FLOPS_SPOTRF(n);
@@ -174,6 +129,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 		{
 			double res;
 			starpu_bound_compute(&res, NULL, 0);
+			FPRINTF(stderr, "Theoretical makespan: %2.2f\n", res);
 			FPRINTF(stderr, "Theoretical GFlops: %2.2f\n", (flop/res/1000000.0f));
 		}
 	}
@@ -344,6 +300,7 @@ int main(int argc, char **argv)
 
 	int ret;
 	ret = starpu_init(NULL);
+	starpu_fxt_stop_profiling();
 
 	if (ret == -ENODEV)
                 return 77;

+ 49 - 4
examples/cholesky/cholesky_kernels.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010, 2011-2012  Université de Bordeaux 1
+ * Copyright (C) 2009-2012, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -74,7 +74,6 @@ static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, STAR
 		cublasSgemm('n', 't', dy, dx, dz, 
 				-1.0f, left, ld21, right, ld12, 
 				 1.0f, center, ld22);
-		cudaStreamSynchronize(starpu_cuda_get_local_stream());
 #endif
 
 	}
@@ -119,7 +118,6 @@ static inline void chol_common_codelet_update_u21(void *descr[], int s, STARPU_A
 #ifdef STARPU_USE_CUDA
 		case 1:
 			cublasStrsm('R', 'L', 'T', 'N', nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
 			break;
 #endif
 		default:
@@ -193,7 +191,6 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, STARPU_A
 				fprintf(stderr, "Error in Magma: %d\n", ret);
 				STARPU_ABORT();
 			}
-			cudaError_t cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
-			STARPU_ASSERT(!cures);
 			}
 #else
@@ -246,3 +243,51 @@ void chol_cublas_codelet_update_u11(void *descr[], void *_args)
 	chol_common_codelet_update_u11(descr, 1, _args);
 }
 #endif/* STARPU_USE_CUDA */
+
+struct starpu_codelet cl11 =
+{
+	.type = STARPU_SEQ,
+	.cpu_funcs = {chol_cpu_codelet_update_u11, NULL},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {chol_cublas_codelet_update_u11, NULL},
+#elif defined(STARPU_SIMGRID)
+	.cuda_funcs = {(void*)1, NULL},
+#endif
+#ifdef STARPU_HAVE_MAGMA
+	.cuda_flags = {STARPU_CUDA_ASYNC},
+#endif
+	.nbuffers = 1,
+	.modes = { STARPU_RW },
+	.model = &chol_model_11
+};
+
+struct starpu_codelet cl21 =
+{
+	.type = STARPU_SEQ,
+	.cpu_funcs = {chol_cpu_codelet_update_u21, NULL},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {chol_cublas_codelet_update_u21, NULL},
+#elif defined(STARPU_SIMGRID)
+	.cuda_funcs = {(void*)1, NULL},
+#endif
+	.cuda_flags = {STARPU_CUDA_ASYNC},
+	.nbuffers = 2,
+	.modes = { STARPU_R, STARPU_RW },
+	.model = &chol_model_21
+};
+
+struct starpu_codelet cl22 =
+{
+	.type = STARPU_SEQ,
+	.max_parallelism = INT_MAX,
+	.cpu_funcs = {chol_cpu_codelet_update_u22, NULL},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {chol_cublas_codelet_update_u22, NULL},
+#elif defined(STARPU_SIMGRID)
+	.cuda_funcs = {(void*)1, NULL},
+#endif
+	.cuda_flags = {STARPU_CUDA_ASYNC},
+	.nbuffers = 3,
+	.modes = { STARPU_R, STARPU_R, STARPU_RW | STARPU_COMMUTE },
+	.model = &chol_model_22
+};

+ 1 - 38
examples/cholesky/cholesky_tag.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2013  Université de Bordeaux 1
+ * Copyright (C) 2009-2014  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
@@ -19,10 +19,6 @@
 #include "cholesky.h"
 #include <starpu_perfmodel.h>
 
-struct starpu_perfmodel chol_model_11;
-struct starpu_perfmodel chol_model_21;
-struct starpu_perfmodel chol_model_22;
-
 /*
  *	Some useful functions
  */
@@ -41,17 +37,6 @@ static struct starpu_task *create_task(starpu_tag_t id)
  *	Create the codelets
  */
 
-static struct starpu_codelet cl11 =
-{
-	.modes = { STARPU_RW },
-	.cpu_funcs = {chol_cpu_codelet_update_u11, NULL},
-#ifdef STARPU_USE_CUDA
-	.cuda_funcs = {chol_cublas_codelet_update_u11, NULL},
-#endif
-	.nbuffers = 1,
-	.model = &chol_model_11
-};
-
 static struct starpu_task * create_task_11(starpu_data_handle_t dataA, unsigned k)
 {
 /*	FPRINTF(stdout, "task 11 k = %d TAG = %llx\n", k, (TAG11(k))); */
@@ -79,17 +64,6 @@ static struct starpu_task * create_task_11(starpu_data_handle_t dataA, unsigned
 	return task;
 }
 
-static struct starpu_codelet cl21 =
-{
-	.modes = { STARPU_R, STARPU_RW },
-	.cpu_funcs = {chol_cpu_codelet_update_u21, NULL},
-#ifdef STARPU_USE_CUDA
-	.cuda_funcs = {chol_cublas_codelet_update_u21, NULL},
-#endif
-	.nbuffers = 2,
-	.model = &chol_model_21
-};
-
 static void create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned j)
 {
 	struct starpu_task *task = create_task(TAG21(k, j));
@@ -127,17 +101,6 @@ static void create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned j)
 
 }
 
-static struct starpu_codelet cl22 =
-{
-	.modes = { STARPU_R, STARPU_R, STARPU_RW },
-	.cpu_funcs = {chol_cpu_codelet_update_u22, NULL},
-#ifdef STARPU_USE_CUDA
-	.cuda_funcs = {chol_cublas_codelet_update_u22, NULL},
-#endif
-	.nbuffers = 3,
-	.model = &chol_model_22
-};
-
 static void create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned i, unsigned j)
 {
 /*	FPRINTF(stdout, "task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j)); */

+ 1 - 38
examples/cholesky/cholesky_tile_tag.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2013  Université de Bordeaux 1
+ * Copyright (C) 2009-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -17,10 +17,6 @@
 
 #include "cholesky.h"
 
-struct starpu_perfmodel chol_model_11;
-struct starpu_perfmodel chol_model_21;
-struct starpu_perfmodel chol_model_22;
-
 /* A [ y ] [ x ] */
 float *A[NMAXBLOCKS][NMAXBLOCKS];
 starpu_data_handle_t A_state[NMAXBLOCKS][NMAXBLOCKS];
@@ -43,17 +39,6 @@ static struct starpu_task *create_task(starpu_tag_t id)
  *	Create the codelets
  */
 
-static struct starpu_codelet cl11 =
-{
-	.modes = { STARPU_RW },
-	.cpu_funcs = {chol_cpu_codelet_update_u11, NULL},
-#ifdef STARPU_USE_CUDA
-	.cuda_funcs = {chol_cublas_codelet_update_u11, NULL},
-#endif
-	.nbuffers = 1,
-	.model = &chol_model_11
-};
-
 static struct starpu_task * create_task_11(unsigned k, unsigned nblocks)
 {
 /*	FPRINTF(stdout, "task 11 k = %d TAG = %llx\n", k, (TAG11(k))); */
@@ -80,17 +65,6 @@ static struct starpu_task * create_task_11(unsigned k, unsigned nblocks)
 	return task;
 }
 
-static struct starpu_codelet cl21 =
-{
-	.modes = { STARPU_R, STARPU_RW },
-	.cpu_funcs = {chol_cpu_codelet_update_u21, NULL},
-#ifdef STARPU_USE_CUDA
-	.cuda_funcs = {chol_cublas_codelet_update_u21, NULL},
-#endif
-	.nbuffers = 2,
-	.model = &chol_model_21
-};
-
 static int create_task_21(unsigned k, unsigned j)
 {
 	int ret;
@@ -126,17 +100,6 @@ static int create_task_21(unsigned k, unsigned j)
 	return ret;
 }
 
-static struct starpu_codelet cl22 =
-{
-	.modes = { STARPU_R, STARPU_R, STARPU_RW },
-	.cpu_funcs = {chol_cpu_codelet_update_u22, NULL},
-#ifdef STARPU_USE_CUDA
-	.cuda_funcs = {chol_cublas_codelet_update_u22, NULL},
-#endif
-	.nbuffers = 3,
-	.model = &chol_model_22
-};
-
 static int create_task_22(unsigned k, unsigned i, unsigned j)
 {
 	int ret;

examples/fortran/bindings/Makefile → examples/fortran/Makefile


examples/fortran/bindings/StarPU_fortran.h → examples/fortran/StarPU_fortran.h


examples/fortran/bindings/hello.F → examples/fortran/hello.F


examples/fortran/bindings/hello_c.c → examples/fortran/hello_c.c


+ 5 - 2
examples/gl_interop/gl_interop.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2012 Université de Bordeaux 1
+ * Copyright (C) 2012-2013 Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -26,7 +26,9 @@
 
 #include <starpu.h>
 #include <unistd.h>
-#include <GL/glut.h>
+
+#if (defined(STARPU_USE_CUDA) && defined(STARPU_OPENGL_RENDER))
+#include <GL/freeglut.h>
 
 void dummy(void *buffers[], void *cl_arg)
 {
@@ -72,6 +74,7 @@ void callback_func(void *foo) {
 	/* Tell it was already the last submitted task */
 	starpu_drivers_request_termination();
 }
+#endif
 
 int main(int argc, char **argv)
 {

+ 5 - 2
examples/gl_interop/gl_interop_idle.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2012 Université de Bordeaux 1
+ * Copyright (C) 2012-2013 Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -29,7 +29,9 @@
 
 #include <starpu.h>
 #include <unistd.h>
-#include <GL/glut.h>
+
+#if (defined(STARPU_USE_CUDA) && defined(STARPU_OPENGL_RENDER))
+#include <GL/freeglut.h>
 
 void dummy(void *buffers[], void *cl_arg)
 {
@@ -89,6 +91,7 @@ static void idle(void)
 {
 	starpu_driver_run_once(&drivers[0]);
 }
+#endif
 
 int main(int argc, char **argv)
 {

+ 3 - 1
examples/incrementer/incrementer.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010-2011, 2013  Université de Bordeaux 1
+ * Copyright (C) 2009-2011, 2013-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -69,9 +69,11 @@ int main(int argc, char **argv)
 		.cpu_funcs_name = {"cpu_codelet", NULL},
 #ifdef STARPU_USE_CUDA
 		.cuda_funcs = {cuda_codelet, NULL},
+		.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 #ifdef STARPU_USE_OPENCL
 		.opencl_funcs = {opencl_codelet, NULL},
+		.opencl_flags = {STARPU_OPENCL_ASYNC},
 #endif
 		.nbuffers = 1,
 		.modes = {STARPU_RW},

+ 1 - 2
examples/incrementer/incrementer_kernels.cu

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -31,5 +31,4 @@ extern "C" void cuda_codelet(void *descr[], void *_args)
 	float *val = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 
 	cuda_incrementer<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 3 - 7
examples/incrementer/incrementer_kernels_opencl.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
- * Copyright (C) 2011  Université de Bordeaux 1
+ * Copyright (C) 2011, 2014  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -47,11 +47,7 @@ void opencl_codelet(void *descr[], void *_args)
 
 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
 		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
-	}
-
-	clFinish(queue);
-	starpu_opencl_collect_stats(event);
-	clReleaseEvent(event);
 
-	starpu_opencl_release_kernel(kernel);
+		starpu_opencl_release_kernel(kernel);
+	}
 }

+ 19 - 37
examples/matvecmult/matvecmult_kernel.cl

@@ -1,49 +1,31 @@
 /*
- * Copyright 1993-2009 NVIDIA Corporation.  All rights reserved.
+ * StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * NVIDIA Corporation and its licensors retain all intellectual property and
- * proprietary rights in and to this software and related documentation.
- * Any use, reproduction, disclosure, or distribution of this software
- * and related documentation without an express license agreement from
- * NVIDIA Corporation is strictly prohibited.
+ * Copyright (C) 2014  Université de Bordeaux 1
  *
- * Please refer to the applicable NVIDIA end user license agreement (EULA)
- * associated with this source code for terms and conditions that govern
- * your use of this NVIDIA software.
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
  *
- */
-
-/* Matrix-vector multiplication: W = M * V.
- * Device code.
- *
- * This sample implements matrix-vector multiplication.
- * It has been written for clarity of exposition to illustrate various OpenCL
- * programming principles and optimizatoins, not with the goal of providing
- * the most performant generic kernel for matrix-vector multiplication.
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  *
- * CUBLAS provides high-performance matrix-vector multiplication on GPU.
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-__kernel void matVecMult(
-                         __global float* M,
-                         __global float* V,
-                         int width, int height,
-                         __global float* W
-                         )
+__kernel void matVecMult(const __global float *A, const __global float *X, int n, int m, __global float *Y)
 {
-        // Row index
-        uint y = get_global_id(0);
-        if (y < height)
+	const int i = get_global_id(0);
+	if (i < m)
 	{
-                // Row pointer
-                const __global float* row = M + y * width;
+		float val = 0;
+		int j;
 
-                // Compute dot product
-                float dotProduct = 0;
-                for (int x = 0; x < width; ++x)
-                        dotProduct += row[x] * V[x];
+		for (j = 0; j < n; j++)
+		       val += A[i*n+j] * X[j];
 
-                // Write result to global memory
-                W[y] = dotProduct;
-        }
+		Y[i] = val;
+	}
 }

+ 118 - 25
examples/sched_ctx/parallel_code.c

@@ -16,23 +16,49 @@
  */
 
 #include <starpu.h>
+#include <omp.h>
 
 #ifdef STARPU_QUICK_CHECK
 #define NTASKS 64
 #else
-#define NTASKS 1000
+#define NTASKS 10
 #endif
 
-int tasks_executed = 0;
+int tasks_executed[2];
 starpu_pthread_mutex_t mut;
 
-static void sched_ctx_func(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg STARPU_ATTRIBUTE_UNUSED)
+int parallel_code(int sched_ctx)
 {
-	starpu_pthread_mutex_lock(&mut);
-	tasks_executed++;
-	starpu_pthread_mutex_unlock(&mut);
+	int i;
+	int t = 0;
+	int *cpuids = NULL;
+	int ncpuids = 0;
+	starpu_sched_ctx_get_available_cpuids(sched_ctx, &cpuids, &ncpuids);
+
+//	printf("execute task of %d threads \n", ncpuids);
+	omp_set_nested(1);
+#pragma omp parallel num_threads(1)
+	{
+#pragma omp parallel num_threads(ncpuids)
+		{
+			starpu_sched_ctx_bind_current_thread_to_cpuid(cpuids[omp_get_thread_num()]);
+// 			printf("cpu = %d ctx%d nth = %d\n", sched_getcpu(), sched_ctx, omp_get_num_threads());
+#pragma omp for
+			for(i = 0; i < NTASKS; i++)
+				t++;
+		}
+	}
+	free(cpuids);
+	return t;
+}
+
+static void sched_ctx_func(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg)
+{
+	unsigned sched_ctx = (unsigned)arg;
+	tasks_executed[sched_ctx-1] = parallel_code(sched_ctx);
 }
 
+
 static struct starpu_codelet sched_ctx_codelet =
 {
 	.cpu_funcs = {sched_ctx_func, NULL},
@@ -43,15 +69,10 @@ static struct starpu_codelet sched_ctx_codelet =
 	.name = "sched_ctx"
 };
 
-int parallel_code(int nprocs)
+void *th(void* p)
 {
-	int i;
-	int tasks = 0;
-#pragma omp parallel for num_threads(nprocs)
-	for (i = 0; i < NTASKS; i++) 
-		tasks++;
-
-	return tasks;
+	unsigned sched_ctx = (unsigned)p;
+	tasks_executed[sched_ctx-1] = (int)starpu_sched_ctx_exec_parallel_code((void*)parallel_code, (void*)sched_ctx, sched_ctx); 
 }
 
 int main(int argc, char **argv)
@@ -67,12 +88,12 @@ int main(int argc, char **argv)
 	starpu_pthread_mutex_init(&mut, NULL);
 	int nprocs1 = 1;
 	int nprocs2 = 1;
-	int procs1[20], procs2[20];
-	procs1[0] = 0;
-	procs2[0] = 0;
+	int *procs1, *procs2;
 
 #ifdef STARPU_USE_CPU
 	unsigned ncpus =  starpu_cpu_worker_get_count();
+	procs1 = (int*)malloc(ncpus*sizeof(int));
+	procs2 = (int*)malloc(ncpus*sizeof(int));
 	starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, procs1, ncpus);
 
 	nprocs1 = ncpus/2;
@@ -80,14 +101,60 @@ int main(int argc, char **argv)
 	int j, k = 0;
 	for(j = nprocs1; j < nprocs1+nprocs2; j++)
 		procs2[k++] = j;
+#else
+	procs1 = (int*)malloc(nprocs1*sizeof(int));
+	procs2 = (int*)malloc(nprocs2*sizeof(int));
+	procs1[0] = 0;
+	procs2[0] = 0;
+
 #endif
 
+	int p;
+	for(p = 0; p <nprocs1; p++)
+		printf("w %d in ctx 1 \n", procs1[p]);
+
+	for(p = 0; p <nprocs2; p++)
+		printf("w %d in ctx 2 \n", procs2[p]);
+
 	/*create contexts however you want*/
 	unsigned sched_ctx1 = starpu_sched_ctx_create(procs1, nprocs1, "ctx1", STARPU_SCHED_CTX_POLICY_NAME, "dmda", 0);
 	unsigned sched_ctx2 = starpu_sched_ctx_create(procs2, nprocs2, "ctx2", STARPU_SCHED_CTX_POLICY_NAME, "dmda", 0);
 
 	/*indicate what to do with the resources when context 2 finishes (it depends on your application)*/
-	starpu_sched_ctx_set_inheritor(sched_ctx2, sched_ctx1);
+//	starpu_sched_ctx_set_inheritor(sched_ctx2, sched_ctx1);
+
+	int nprocs3 = nprocs1/2;
+	int nprocs4 = nprocs1/2;
+	int nprocs5 = nprocs2/2;
+	int nprocs6 = nprocs2/2;
+	int procs3[nprocs3];
+	int procs4[nprocs4];
+	int procs5[nprocs5];
+	int procs6[nprocs6];
+
+	k = 0;
+	for(j = 0; j < nprocs3; j++)
+		procs3[k++] = procs1[j];
+	k = 0;
+	for(j = nprocs3; j < nprocs3+nprocs4; j++)
+		procs4[k++] = procs1[j];
+
+	k = 0;
+	for(j = 0; j < nprocs5; j++)
+		procs5[k++] = procs2[j];
+	k = 0;
+	for(j = nprocs5; j < nprocs5+nprocs6; j++)
+		procs6[k++] = procs2[j];
+
+	int master3 = starpu_sched_ctx_book_workers_for_task(sched_ctx1, procs3, nprocs3);
+	int master4 = starpu_sched_ctx_book_workers_for_task(sched_ctx1, procs4, nprocs4);
+
+	int master5 = starpu_sched_ctx_book_workers_for_task(sched_ctx2, procs5, nprocs5);
+	int master6 = starpu_sched_ctx_book_workers_for_task(sched_ctx2, procs6, nprocs6);
+
+/* 	int master1 = starpu_sched_ctx_book_workers_for_task(sched_ctx1, procs1, nprocs1); */
+/* 	int master2 = starpu_sched_ctx_book_workers_for_task(sched_ctx2, procs2, nprocs2); */
+
 
 	int i;
 	for (i = 0; i < ntasks; i++)
@@ -95,7 +162,7 @@ int main(int argc, char **argv)
 		struct starpu_task *task = starpu_task_create();
 
 		task->cl = &sched_ctx_codelet;
-		task->cl_arg = NULL;
+		task->cl_arg = sched_ctx1;
 
 		/*submit tasks to context*/
 		ret = starpu_task_submit_to_ctx(task,sched_ctx1);
@@ -103,23 +170,49 @@ int main(int argc, char **argv)
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	}
 
+	for (i = 0; i < ntasks; i++)
+	{
+		struct starpu_task *task = starpu_task_create();
+
+		task->cl = &sched_ctx_codelet;
+		task->cl_arg = sched_ctx2;
+
+		/*submit tasks to context*/
+		ret = starpu_task_submit_to_ctx(task,sched_ctx2);
+
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+
 	/* tell starpu when you finished submitting tasks to this context
 	   in order to allow moving resources from this context to the inheritor one
 	   when its corresponding tasks finished executing */
 
-	starpu_sched_ctx_finished_submit(sched_ctx1);
 
-	/* execute an openmp code */
-	int ret_ntasks = (int)starpu_sched_ctx_exec_parallel_code((void*)parallel_code, (void*)nprocs2, sched_ctx2);
-	starpu_sched_ctx_finished_submit(sched_ctx2);
 
 	/* wait for all tasks at the end*/
 	starpu_task_wait_for_all();
 
+/* 	starpu_sched_ctx_unbook_workers_for_task(sched_ctx1, master1); */
+/* 	starpu_sched_ctx_unbook_workers_for_task(sched_ctx2, master2); */
+
+	starpu_sched_ctx_unbook_workers_for_task(sched_ctx1, master3);
+	starpu_sched_ctx_unbook_workers_for_task(sched_ctx1, master4);
+
+	starpu_sched_ctx_unbook_workers_for_task(sched_ctx2, master5);
+	starpu_sched_ctx_unbook_workers_for_task(sched_ctx2, master6);
+
+	pthread_t mp[2];
+	pthread_create(&mp[0], NULL, th, sched_ctx1);
+	pthread_create(&mp[1], NULL, th, sched_ctx2);
+
+	pthread_join(mp[0], NULL);
+	pthread_join(mp[1], NULL);
+
 	starpu_sched_ctx_delete(sched_ctx1);
 	starpu_sched_ctx_delete(sched_ctx2);
-	printf("ctx%d: tasks starpu executed %d out of %d\n", sched_ctx1, tasks_executed, ntasks);
-	printf("ctx%d: tasks openmp executed %d out of %d\n", sched_ctx2, ret_ntasks, NTASKS);
+	printf("ctx%d: tasks starpu executed %d out of %d\n", sched_ctx1, tasks_executed[0], NTASKS);
+	printf("ctx%d: tasks starpu executed %d out of %d\n", sched_ctx2, tasks_executed[1], NTASKS);
 	starpu_shutdown();
 
 	return 0;

+ 2 - 1
examples/stencil/Makefile.am

@@ -122,7 +122,8 @@ EXTRA_DIST = $(outs) results run README
 
 pics: $(outs:.out=.xpm)
 
-CLEANFILES = *.xpm
+CLEANFILES = *.xpm starpu_idle_microsec.log
+
 
 .out.out2:
 	grep '^|' $< | tr -d ' ' > $@

+ 89 - 0
examples/worker_collections/worker_list_example.c

@@ -0,0 +1,89 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <sys/time.h>
+
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
+int main()
+{
+	starpu_init(NULL);
+
+	int procs[STARPU_NMAXWORKERS];
+	unsigned ncpus =  starpu_cpu_worker_get_count();
+        starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, procs, ncpus);
+
+	struct starpu_worker_collection *co = (struct starpu_worker_collection*)malloc(sizeof(struct starpu_worker_collection));
+	co->has_next = worker_list.has_next;
+	co->get_next = worker_list.get_next;
+	co->add = worker_list.add;
+	co->remove = worker_list.remove;
+	co->init = worker_list.init;
+	co->deinit = worker_list.deinit;
+	co->init_iterator = worker_list.init_iterator;
+	co->type = STARPU_WORKER_LIST;
+
+	FPRINTF(stderr, "ncpus %d \n", ncpus);
+
+	struct timeval start_time;
+        struct timeval end_time;
+        gettimeofday(&start_time, NULL);
+
+	co->init(co);
+
+	gettimeofday(&end_time, NULL);
+
+        long diff_s = end_time.tv_sec  - start_time.tv_sec;
+        long diff_us = end_time.tv_usec  - start_time.tv_usec;
+
+	float timing = (float)(diff_s*1000000 + diff_us)/1000;
+
+	int i;
+	for(i = 0; i < ncpus; i++)
+	{
+		int added = co->add(co, procs[i]);
+		FPRINTF(stderr, "added proc %d to the tree \n", added);
+	}
+
+	struct starpu_sched_ctx_iterator it;
+        if(co->init_iterator)
+                co->init_iterator(co, &it);
+
+	int pu;
+	while(co->has_next(co, &it))
+	{
+		pu = co->get_next(co, &it);
+		FPRINTF(stderr, "pu = %d out of %d workers \n", pu, co->nworkers);
+	}
+
+	for(i = 0; i < 6; i++)
+	{
+		co->remove(co, i);
+		FPRINTF(stderr, "remove %d out of %d workers\n", i, co->nworkers);
+	}
+
+	while(co->has_next(co, &it))
+	{
+		pu = co->get_next(co, &it);
+		FPRINTF(stderr, "pu = %d out of %d workers \n", pu, co->nworkers);
+	}
+
+	FPRINTF(stderr, "timing init = %lf \n", timing);
+	co->deinit(co);
+	starpu_shutdown();
+}

+ 99 - 0
examples/worker_collections/worker_tree_example.c

@@ -0,0 +1,99 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <sys/time.h>
+
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
+#if !defined(STARPU_HAVE_HWLOC)
+#warning hwloc is not enabled. Skipping test
+int main(int argc, char **argv)
+{
+	return 77;
+}
+#else
+
+int main()
+{
+	starpu_init(NULL);
+
+	int procs[STARPU_NMAXWORKERS];
+	unsigned ncpus =  starpu_cpu_worker_get_count();
+        starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, procs, ncpus);
+
+	struct starpu_worker_collection *co = (struct starpu_worker_collection*)malloc(sizeof(struct starpu_worker_collection));
+	co->has_next = worker_tree.has_next;
+	co->get_next = worker_tree.get_next;
+	co->add = worker_tree.add;
+	co->remove = worker_tree.remove;
+	co->init = worker_tree.init;
+	co->deinit = worker_tree.deinit;
+	co->init_iterator = worker_tree.init_iterator;
+	co->type = STARPU_WORKER_TREE;
+
+	FPRINTF(stderr, "ncpus %d \n", ncpus);
+
+	struct timeval start_time;
+        struct timeval end_time;
+        gettimeofday(&start_time, NULL);
+
+	co->init(co);
+
+	gettimeofday(&end_time, NULL);
+
+        long diff_s = end_time.tv_sec  - start_time.tv_sec;
+        long diff_us = end_time.tv_usec  - start_time.tv_usec;
+
+	float timing = (float)(diff_s*1000000 + diff_us)/1000;
+
+	int i;
+	for(i = 0; i < ncpus; i++)
+	{
+		int added = co->add(co, procs[i]);
+//		FPRINTF(stderr, "added proc %d to the tree \n", added);
+	}
+
+	struct starpu_sched_ctx_iterator it;
+        if(co->init_iterator)
+                co->init_iterator(co, &it);
+
+	int pu;
+	while(co->has_next(co, &it))
+	{
+		pu = co->get_next(co, &it);
+//		FPRINTF(stderr, "pu = %d out of %d workers \n", pu, co->nworkers);
+	}
+
+	for(i = 0; i < 6; i++)
+	{
+		co->remove(co, i);
+//		FPRINTF(stderr, "remove %d out of %d workers\n", i, co->nworkers);
+	}
+
+	while(co->has_next(co, &it))
+	{
+		pu = co->get_next(co, &it);
+//		FPRINTF(stderr, "pu = %d out of %d workers \n", pu, co->nworkers);
+	}
+
+	FPRINTF(stderr, "timing init = %lf \n", timing);
+
+	co->deinit(co);
+	starpu_shutdown();
+}
+#endif

+ 1 - 0
gcc-plugin/examples/Makefile.am

@@ -41,6 +41,7 @@ endif
 
 endif
 
+CLEANFILES = starpu_idle_microsec.log
 examplebin_PROGRAMS =			\
 	hello-starpu 			\
 	matrix-mult			\

+ 2 - 1
gcc-plugin/tests/Makefile.am

@@ -89,7 +89,8 @@ CLEANFILES = *.gimple *.o			\
   unregister					\
   heap-allocated				\
   acquire					\
-  opencl
+  opencl					\
+  starpu_idle_microsec.log
 
 
 EXTRA_DIST += ./run-test.in			\

+ 5 - 1
include/starpu.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009-2013  Université de Bordeaux 1
- * Copyright (C) 2010-2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010-2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -65,6 +65,7 @@ typedef UINT_PTR uintptr_t;
 #include <starpu_top.h>
 #include <starpu_fxt.h>
 #include <starpu_driver.h>
+#include <starpu_tree.h>
 
 #ifdef __cplusplus
 extern "C"
@@ -130,6 +131,9 @@ int starpu_init(struct starpu_conf *conf) STARPU_WARN_UNUSED_RESULT;
 
 int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv);
 
+void starpu_pause(void);
+void starpu_resume(void);
+
 void starpu_shutdown(void);
 
 void starpu_topology_print(FILE *f);

+ 22 - 23
include/starpu_bitmap.h

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2013  Simon Archipoff
+ * Copyright (C) 2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -16,34 +17,32 @@
 
 #ifndef __STARPU_BITMAP_H__
 #define __STARPU_BITMAP_H__
-struct starpu_bitmap * starpu_bitmap_create(void);
-void starpu_bitmap_destroy(struct starpu_bitmap *);
 
-void starpu_bitmap_set(struct starpu_bitmap *, int);
-void starpu_bitmap_unset(struct starpu_bitmap *, int);
-void starpu_bitmap_unset_all(struct starpu_bitmap *);
-
-int starpu_bitmap_get(struct starpu_bitmap *, int);
+#ifdef __cplusplus
+extern "C"
+{
+#endif
 
-/* basicaly compute starpu_bitmap_unset_all(a) ; a = b & c; */
-void starpu_bitmap_unset_and(struct starpu_bitmap * a, struct starpu_bitmap * b, struct starpu_bitmap * c);
+struct starpu_bitmap *starpu_bitmap_create(void);
+void starpu_bitmap_destroy(struct starpu_bitmap *b);
 
-/* this is basically compute a |= b;*/
-void starpu_bitmap_or(struct starpu_bitmap * a,
-		       struct starpu_bitmap * b);
+void starpu_bitmap_set(struct starpu_bitmap *b, int e);
+void starpu_bitmap_unset(struct starpu_bitmap *b, int e);
+void starpu_bitmap_unset_all(struct starpu_bitmap *b);
 
-//return 1 iff e set in b1 AND e set in b2
-int starpu_bitmap_and_get(struct starpu_bitmap * b1,
-			   struct starpu_bitmap * b2,
-			   int e);
+int starpu_bitmap_get(struct starpu_bitmap *b, int e);
+void starpu_bitmap_unset_and(struct starpu_bitmap *a, struct starpu_bitmap *b, struct starpu_bitmap *c);
+void starpu_bitmap_or(struct starpu_bitmap *a, struct starpu_bitmap *b);
+int starpu_bitmap_and_get(struct starpu_bitmap *b1, struct starpu_bitmap *b2, int e);
+int starpu_bitmap_cardinal(struct starpu_bitmap *b);
 
-int starpu_bitmap_cardinal(struct starpu_bitmap *);
+int starpu_bitmap_first(struct starpu_bitmap *b);
+int starpu_bitmap_last(struct starpu_bitmap *b);
+int starpu_bitmap_next(struct starpu_bitmap *b, int e);
+int starpu_bitmap_has_next(struct starpu_bitmap *b, int e);
 
-//return the index of first bit, -1 if none
-int starpu_bitmap_first(struct starpu_bitmap *);
-int starpu_bitmap_last(struct starpu_bitmap *);
-//return the index of bit right after e, -1 if none
-int starpu_bitmap_next(struct starpu_bitmap *, int e);
-int starpu_bitmap_has_next(struct starpu_bitmap * b, int e);
+#ifdef __cplusplus
+}
+#endif
 
 #endif

+ 5 - 1
include/starpu_config.h.in

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2013  Université de Bordeaux 1
+ * Copyright (C) 2009-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -57,6 +57,7 @@
 #undef STARPU_HAVE_SYNC_FETCH_AND_ADD
 #undef STARPU_HAVE_SYNC_FETCH_AND_OR
 #undef STARPU_HAVE_SYNC_LOCK_TEST_AND_SET
+#undef STARPU_HAVE_SYNC_SYNCHRONIZE
 
 #undef STARPU_MODEL_DEBUG
 #undef STARPU_NO_ASSERT
@@ -79,6 +80,7 @@
 #undef STARPU_MAXIMPLEMENTATIONS
 #undef STARPU_MAXMPKERNELS
 #undef STARPU_USE_SC_HYPERVISOR
+#undef STARPU_SC_HYPERVISOR_DEBUG
 #undef STARPU_HAVE_GLPK_H
 
 #undef STARPU_HAVE_LIBNUMA
@@ -120,5 +122,7 @@ struct timespec
 #undef STARPU_USE_TOP
 
 #undef STARPU_HAVE_HWLOC
+#undef STARPU_HAVE_PTHREAD_SPIN_LOCK
+#undef STARPU_HAVE_PTHREAD_BARRIER
 
 #endif

+ 5 - 6
include/starpu_data.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -57,11 +57,11 @@ void starpu_data_invalidate_submit(starpu_data_handle_t handle);
 void starpu_data_advise_as_important(starpu_data_handle_t handle, unsigned is_important);
 
 int starpu_data_acquire(starpu_data_handle_t handle, enum starpu_data_access_mode mode);
-int starpu_data_acquire_on_node(starpu_data_handle_t handle, unsigned node, enum starpu_data_access_mode mode);
+int starpu_data_acquire_on_node(starpu_data_handle_t handle, int node, enum starpu_data_access_mode mode);
 int starpu_data_acquire_cb(starpu_data_handle_t handle, enum starpu_data_access_mode mode, void (*callback)(void *), void *arg);
-int starpu_data_acquire_on_node_cb(starpu_data_handle_t handle, unsigned node, enum starpu_data_access_mode mode, void (*callback)(void *), void *arg);
+int starpu_data_acquire_on_node_cb(starpu_data_handle_t handle, int node, enum starpu_data_access_mode mode, void (*callback)(void *), void *arg);
 int starpu_data_acquire_cb_sequential_consistency(starpu_data_handle_t handle, enum starpu_data_access_mode mode, void (*callback)(void *), void *arg, int sequential_consistency);
-int starpu_data_acquire_on_node_cb_sequential_consistency(starpu_data_handle_t handle, unsigned node, enum starpu_data_access_mode mode, void (*callback)(void *), void *arg, int sequential_consistency);
+int starpu_data_acquire_on_node_cb_sequential_consistency(starpu_data_handle_t handle, int node, enum starpu_data_access_mode mode, void (*callback)(void *), void *arg, int sequential_consistency);
 
 #ifdef __GCC__
 #  define STARPU_DATA_ACQUIRE_CB(handle, mode, code) do \
@@ -77,7 +77,7 @@ int starpu_data_acquire_on_node_cb_sequential_consistency(starpu_data_handle_t h
 #endif
 
 void starpu_data_release(starpu_data_handle_t handle);
-void starpu_data_release_on_node(starpu_data_handle_t handle, unsigned node);
+void starpu_data_release_on_node(starpu_data_handle_t handle, int node);
 
 void starpu_data_display_memory_stats();
 
@@ -125,7 +125,6 @@ int starpu_data_get_rank(starpu_data_handle_t handle);
 
 int starpu_data_set_tag(starpu_data_handle_t handle, int tag);
 int starpu_data_get_tag(starpu_data_handle_t handle);
-starpu_data_handle_t starpu_data_get_data_handle_from_tag(int tag);
 struct starpu_data_interface_ops* starpu_data_get_interface_ops(starpu_data_handle_t handle);
 
 unsigned starpu_data_test_if_allocated_on_node(starpu_data_handle_t handle, unsigned memory_node);

+ 2 - 1
include/starpu_fxt.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2011, 2013  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2013, 2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -66,6 +66,7 @@ void starpu_fxt_generate_trace(struct starpu_fxt_options *options);
 void starpu_fxt_start_profiling(void);
 void starpu_fxt_stop_profiling(void);
 void starpu_fxt_write_data_trace(char *filename_in);
+void starpu_fxt_trace_user_event(unsigned long code);
 
 #ifdef __cplusplus
 }

+ 2 - 2
include/starpu_hash.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2013  Université de Bordeaux 1
+ * Copyright (C) 2009-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -36,4 +36,4 @@ uint32_t starpu_hash_crc32c_string(const char *str, uint32_t inputcrc);
 }
 #endif
 
-#endif // __STARPU_HASH_H__
+#endif /* __STARPU_HASH_H__ */

+ 4 - 3
include/starpu_perfmodel.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  *
@@ -38,8 +38,8 @@ struct starpu_data_descr;
 struct starpu_perfmodel_arch
 {
 	enum starpu_worker_archtype type;
-	int devid;
-	int ncore;
+	int devid;	/* identifier of the precise device */
+	int ncore;	/* number of execution in parallel, minus 1 */
 };
 
 struct starpu_perfmodel_history_entry
@@ -122,6 +122,7 @@ struct starpu_perfmodel
 	double (*cost_function)(struct starpu_task *, unsigned nimpl);
 
 	size_t (*size_base)(struct starpu_task *, unsigned nimpl);
+	uint32_t (*footprint)(struct starpu_task *);
 
 	struct starpu_perfmodel_per_arch**** per_arch; /*STARPU_MAXIMPLEMENTATIONS*/
 

+ 1 - 0
include/starpu_sched_component.h

@@ -18,6 +18,7 @@
 #define __STARPU_SCHED_COMPONENT_H__
 #include <starpu.h>
 #include <common/starpu_spinlock.h>
+#include <common/fxt.h>
 #include <starpu_bitmap.h>
 
 #ifdef STARPU_HAVE_HWLOC

+ 21 - 4
include/starpu_sched_ctx.h

@@ -28,10 +28,11 @@ extern "C"
 #define STARPU_SCHED_CTX_POLICY_STRUCT		 (2<<16)
 #define STARPU_SCHED_CTX_POLICY_MIN_PRIO	 (3<<16)
 #define STARPU_SCHED_CTX_POLICY_MAX_PRIO	 (4<<16)
+#define STARPU_SCHED_CTX_HIERARCHY_LEVEL         (5<<16)
 
 unsigned starpu_sched_ctx_create(int *workerids_ctx, int nworkers_ctx, const char *sched_ctx_name, ...);
 
-unsigned starpu_sched_ctx_create_inside_interval(const char *policy_name, const char *sched_name, int min_ncpus, int max_ncpus, int min_ngpus, int max_ngpus, unsigned allow_overlap);
+unsigned starpu_sched_ctx_create_inside_interval(const char *policy_name, const char *sched_ctx_name, int min_ncpus, int max_ncpus, int min_ngpus, int max_ngpus, unsigned allow_overlap);
 
 void starpu_sched_ctx_register_close_callback(unsigned sched_ctx_id, void (*close_callback)(unsigned sched_ctx_id, void* args), void *args);
 
@@ -43,6 +44,10 @@ void starpu_sched_ctx_delete(unsigned sched_ctx_id);
 
 void starpu_sched_ctx_set_inheritor(unsigned sched_ctx_id, unsigned inheritor);
 
+unsigned starpu_sched_ctx_get_inheritor(unsigned sched_ctx_id);
+
+unsigned starpu_sched_ctx_get_hierarchy_level(unsigned sched_ctx_id);
+
 void starpu_sched_ctx_set_context(unsigned *sched_ctx_id);
 
 unsigned starpu_sched_ctx_get_context(void);
@@ -102,15 +107,27 @@ void *starpu_sched_ctx_get_policy_data(unsigned sched_ctx_id);
 
 void *starpu_sched_ctx_exec_parallel_code(void* (*func)(void*), void *param, unsigned sched_ctx_id);
 
-int starpu_get_nready_tasks_of_sched_ctx(unsigned sched_ctx_id);
+int starpu_sched_ctx_get_nready_tasks(unsigned sched_ctx_id);
 
-double starpu_get_nready_flops_of_sched_ctx(unsigned sched_ctx_id);
+double starpu_sched_ctx_get_nready_flops(unsigned sched_ctx_id);
 
 void starpu_sched_ctx_set_priority(int *workers, int nworkers, unsigned sched_ctx_id, unsigned priority);
 
+void starpu_sched_ctx_set_priority_on_level(int* workers_to_add, unsigned nworkers_to_add, unsigned sched_ctx, unsigned priority);
+
+unsigned starpu_sched_ctx_get_priority(int worker, unsigned sched_ctx_id);
+
+void starpu_sched_ctx_get_available_cpuids(unsigned sched_ctx_id, int **cpuids, int *ncpuids);
+
+void starpu_sched_ctx_bind_current_thread_to_cpuid(unsigned cpuid);
+
+int starpu_sched_ctx_book_workers_for_task(unsigned sched_ctx_id, int *workerids, int nworkers);
+
+void starpu_sched_ctx_unbook_workers_for_task(unsigned sched_ctx_id, int master);
+
 #ifdef STARPU_USE_SC_HYPERVISOR
 void starpu_sched_ctx_call_pushed_task_cb(int workerid, unsigned sched_ctx_id);
-#endif //STARPU_USE_SC_HYPERVISOR
+#endif /* STARPU_USE_SC_HYPERVISOR */
 
 #ifdef __cplusplus
 }

+ 3 - 2
include/starpu_scheduler.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -33,7 +33,7 @@ struct starpu_sched_policy
 	void (*deinit_sched)(unsigned sched_ctx_id);
 
 	int (*push_task)(struct starpu_task *);
-	void (*push_task_notify)(struct starpu_task *, int workerid, unsigned sched_ctx_id);
+	void (*push_task_notify)(struct starpu_task *, int workerid, int perf_workerid, unsigned sched_ctx_id);
 	struct starpu_task *(*pop_task)(unsigned sched_ctx_id);
 	struct starpu_task *(*pop_every_task)(unsigned sched_ctx_id);
 
@@ -69,6 +69,7 @@ int starpu_get_prefetch_flag(void);
 int starpu_prefetch_task_input_on_node(struct starpu_task *task, unsigned node);
 
 uint32_t starpu_task_footprint(struct starpu_perfmodel *model, struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl);
+uint32_t starpu_task_data_footprint(struct starpu_task *task);
 double starpu_task_expected_length(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl);
 double starpu_worker_get_relative_speedup(struct starpu_perfmodel_arch *perf_arch);
 double starpu_task_expected_data_transfer_time(unsigned memory_node, struct starpu_task *task);

+ 2 - 1
include/starpu_stdlib.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -36,6 +36,7 @@ int starpu_free(void *A);
 int starpu_malloc_flags(void **A, size_t dim, int flags);
 int starpu_free_flags(void *A, size_t dim, int flags);
 
+starpu_ssize_t starpu_memory_get_total(unsigned node);
 starpu_ssize_t starpu_memory_get_available(unsigned node);
 
 #ifdef __cplusplus

+ 15 - 2
include/starpu_task.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011  INRIA
@@ -41,9 +41,12 @@ extern "C"
 #define STARPU_MIC	((1ULL)<<7)
 #define STARPU_SCC	((1ULL)<<8)
 
+#define STARPU_CUDA_ASYNC	(1<<0)
+#define STARPU_OPENCL_ASYNC	(1<<0)
+
 enum starpu_codelet_type
 {
-	STARPU_SEQ,
+	STARPU_SEQ = 0,
 	STARPU_SPMD,
 	STARPU_FORKJOIN
 };
@@ -90,7 +93,9 @@ struct starpu_codelet
 
 	starpu_cpu_func_t cpu_funcs[STARPU_MAXIMPLEMENTATIONS];
 	starpu_cuda_func_t cuda_funcs[STARPU_MAXIMPLEMENTATIONS];
+	char cuda_flags[STARPU_MAXIMPLEMENTATIONS];
 	starpu_opencl_func_t opencl_funcs[STARPU_MAXIMPLEMENTATIONS];
+	char opencl_flags[STARPU_MAXIMPLEMENTATIONS];
 	starpu_mic_func_t mic_funcs[STARPU_MAXIMPLEMENTATIONS];
 	starpu_scc_func_t scc_funcs[STARPU_MAXIMPLEMENTATIONS];
 
@@ -100,6 +105,10 @@ struct starpu_codelet
 	enum starpu_data_access_mode modes[STARPU_NMAXBUFS];
 	enum starpu_data_access_mode *dyn_modes;
 
+	unsigned specific_nodes;
+	int nodes[STARPU_NMAXBUFS];
+	int *dyn_nodes;
+
 	struct starpu_perfmodel *model;
 	struct starpu_perfmodel *power_model;
 
@@ -211,6 +220,9 @@ struct starpu_task
 #define STARPU_CODELET_GET_MODE(codelet, i) ((codelet->dyn_modes) ? codelet->dyn_modes[i] : codelet->modes[i])
 #define STARPU_CODELET_SET_MODE(codelet, mode, i) do { if (codelet->dyn_modes) codelet->dyn_modes[i] = mode; else codelet->modes[i] = mode; } while(0)
 
+#define STARPU_CODELET_GET_NODE(codelet, i) ((codelet->dyn_nodes) ? codelet->dyn_nodes[i] : codelet->nodes[i])
+#define STARPU_CODELET_SET_NODE(codelet, __node, i) do { if (codelet->dyn_nodes) codelet->dyn_nodes[i] = __node; else codelet->nodes[i] = __node; } while(0)
+
 void starpu_tag_declare_deps(starpu_tag_t id, unsigned ndeps, ...);
 void starpu_tag_declare_deps_array(starpu_tag_t id, unsigned ndeps, starpu_tag_t *array);
 
@@ -254,6 +266,7 @@ void starpu_codelet_display_stats(struct starpu_codelet *cl);
 struct starpu_task *starpu_task_get_current(void);
 
 void starpu_parallel_task_barrier_init(struct starpu_task *task, int workerid);
+void starpu_parallel_task_barrier_init_n(struct starpu_task *task, int worker_size);
 
 struct starpu_task *starpu_task_dup(struct starpu_task *task);
 

+ 2 - 2
include/starpu_task_bundle.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2014  Université de Bordeaux 1
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2012  Inria
  *
@@ -40,4 +40,4 @@ void starpu_task_bundle_close(starpu_task_bundle_t bundle);
 }
 #endif
 
-#endif // __STARPU_TASK_BUNDLE_H__
+#endif /* __STARPU_TASK_BUNDLE_H__ */

+ 3 - 6
include/starpu_task_list.h

@@ -39,8 +39,7 @@ void starpu_task_list_init(struct starpu_task_list *list)
 }
 
 static STARPU_INLINE
-void starpu_task_list_push_front(struct starpu_task_list *list,
-				struct starpu_task *task)
+void starpu_task_list_push_front(struct starpu_task_list *list, struct starpu_task *task)
 {
 	if (list->tail == NULL)
 	{
@@ -57,8 +56,7 @@ void starpu_task_list_push_front(struct starpu_task_list *list,
 }
 
 static STARPU_INLINE
-void starpu_task_list_push_back(struct starpu_task_list *list,
-				struct starpu_task *task)
+void starpu_task_list_push_back(struct starpu_task_list *list, struct starpu_task *task)
 {
 	if (list->head == NULL)
 	{
@@ -93,8 +91,7 @@ int starpu_task_list_empty(struct starpu_task_list *list)
 }
 
 static STARPU_INLINE
-void starpu_task_list_erase(struct starpu_task_list *list,
-				struct starpu_task *task)
+void starpu_task_list_erase(struct starpu_task_list *list, struct starpu_task *task)
 {
 	struct starpu_task *p = task->prev;
 

+ 0 - 0
include/starpu_task_util.h


Some files were not shown because too many files changed in this diff