瀏覽代碼

Merge remote-tracking branch 'svn/trunk' into multiple_regression

# Conflicts (simple comment):
#	src/util/starpu_task_insert.c
Luka Stanisic 8 年之前
父節點
當前提交
74fad1f7f7
共有 100 個檔案被更改,包括 2596 次插入和 970 次刪除
  1. 1 0
      AUTHORS
  2. 53 13
      ChangeLog
  3. 4 2
      Makefile.am
  4. 3 1
      README
  5. 6 3
      autogen.sh
  6. 188 51
      configure.ac
  7. 62 52
      doc/doxygen/Makefile.am
  8. 25 5
      doc/doxygen/chapters/00introduction.doxy
  9. 17 14
      doc/doxygen/chapters/01building.doxy
  10. 12 15
      doc/doxygen/chapters/02basic_examples.doxy
  11. 0 59
      doc/doxygen/chapters/11debugging_tools.doxy
  12. 0 0
      doc/doxygen/chapters/201_advanced_examples.doxy
  13. 105 35
      doc/doxygen/chapters/05check_list_performance.doxy
  14. 53 16
      doc/doxygen/chapters/06tasks.doxy
  15. 28 24
      doc/doxygen/chapters/07data_management.doxy
  16. 94 15
      doc/doxygen/chapters/08scheduling.doxy
  17. 59 39
      doc/doxygen/chapters/09scheduling_contexts.doxy
  18. 29 25
      doc/doxygen/chapters/10scheduling_context_hypervisor.doxy
  19. 53 54
      doc/doxygen/chapters/modularized_scheduler.doxy
  20. 110 0
      doc/doxygen/chapters/360_debugging_tools.doxy
  21. 72 21
      doc/doxygen/chapters/12online_performance_tools.doxy
  22. 78 28
      doc/doxygen/chapters/13offline_performance_tools.doxy
  23. 7 7
      doc/doxygen/chapters/14faq.doxy
  24. 4 4
      doc/doxygen/chapters/15out_of_core.doxy
  25. 126 20
      doc/doxygen/chapters/16mpi_support.doxy
  26. 0 0
      doc/doxygen/chapters/420_fft_support.doxy
  27. 23 7
      doc/doxygen/chapters/18mic_scc_support.doxy
  28. 0 0
      doc/doxygen/chapters/440_c_extensions.doxy
  29. 226 0
      doc/doxygen/chapters/450_native_fortran_support.doxy
  30. 3 3
      doc/doxygen/chapters/20socl_opencl_extensions.doxy
  31. 60 18
      doc/doxygen/chapters/21simgrid.doxy
  32. 0 0
      doc/doxygen/chapters/480_openmp_runtime_support.doxy
  33. 67 22
      doc/doxygen/chapters/23clustering_a_machine.doxy
  34. 157 28
      doc/doxygen/chapters/40environment_variables.doxy
  35. 15 5
      doc/doxygen/chapters/41configure_options.doxy
  36. 5 1
      doc/doxygen/chapters/45files.doxy
  37. 0 0
      doc/doxygen/chapters/601_scaling_vector_example.doxy
  38. 0 0
      doc/doxygen/chapters/610_fdl_1_3.doxy
  39. 41 30
      doc/doxygen/chapters/api/codelet_and_tasks.doxy
  40. 12 12
      doc/doxygen/chapters/api/data_interfaces.doxy
  41. 11 4
      doc/doxygen/chapters/api/data_management.doxy
  42. 6 0
      doc/doxygen/chapters/api/explicit_dependencies.doxy
  43. 10 0
      doc/doxygen/chapters/api/fxt_support.doxy
  44. 2 2
      doc/doxygen/chapters/api/implicit_dependencies.doxy
  45. 11 11
      doc/doxygen/chapters/api/initialization.doxy
  46. 21 5
      doc/doxygen/chapters/api/insert_task.doxy
  47. 9 7
      doc/doxygen/chapters/api/modularized_scheduler.doxy
  48. 40 4
      doc/doxygen/chapters/api/mpi.doxy
  49. 1 1
      doc/doxygen/chapters/api/multiformat_data_interface.doxy
  50. 6 2
      doc/doxygen/chapters/api/opencl_extensions.doxy
  51. 5 0
      doc/doxygen/chapters/api/openmp_runtime_support.doxy
  52. 33 41
      doc/doxygen/chapters/api/performance_model.doxy
  53. 4 4
      doc/doxygen/chapters/api/profiling.doxy
  54. 5 1
      doc/doxygen/chapters/api/sc_hypervisor/sc_hypervisor.doxy
  55. 3 3
      doc/doxygen/chapters/api/sc_hypervisor/sc_hypervisor_usage.doxy
  56. 12 2
      doc/doxygen/chapters/api/scheduling_contexts.doxy
  57. 40 7
      doc/doxygen/chapters/api/scheduling_policy.doxy
  58. 30 10
      doc/doxygen/chapters/api/standard_memory_library.doxy
  59. 2 2
      doc/doxygen/chapters/api/task_bundles.doxy
  60. 5 1
      doc/doxygen/chapters/api/task_lists.doxy
  61. 27 21
      doc/doxygen/chapters/api/threads.doxy
  62. 3 3
      doc/doxygen/chapters/api/tree.doxy
  63. 17 1
      doc/doxygen/chapters/api/workers.doxy
  64. 35 0
      doc/doxygen/chapters/code/nf_initexit.f90
  65. 2 3
      doc/doxygen/chapters/code/scal_pragma.cu
  66. 24 9
      doc/doxygen/dev/starpu_check_documented.py
  67. 68 0
      doc/doxygen/dev/starpu_check_refs.sh
  68. 2 2
      doc/doxygen/dev/starpu_check_undocumented.sh
  69. 7 2
      doc/doxygen/doxygen-config.cfg.in
  70. 13 7
      doc/doxygen/refman.tex
  71. 2 3
      doc/tutorial/vector_scal_plugin_cuda.cu
  72. 150 25
      examples/Makefile.am
  73. 2 1
      examples/axpy/axpy_opencl.c
  74. 2 2
      examples/basic_examples/block_opencl.c
  75. 1 1
      examples/basic_examples/multiformat_conversion_codelets_opencl.c
  76. 2 1
      examples/basic_examples/multiformat_opencl.c
  77. 8 1
      examples/basic_examples/variable.c
  78. 2 2
      examples/basic_examples/variable_kernels_opencl.c
  79. 4 4
      examples/basic_examples/vector_scal.c
  80. 2 2
      examples/basic_examples/vector_scal_opencl.c
  81. 28 4
      examples/binary/binary.c
  82. 3 1
      examples/cg/cg.c
  83. 1 1
      examples/cg/cg_kernels.c
  84. 10 19
      examples/cholesky/cholesky_grain_tag.c
  85. 8 10
      examples/cholesky/cholesky_implicit.c
  86. 11 19
      examples/cholesky/cholesky_tag.c
  87. 4 8
      examples/cholesky/cholesky_tile_tag.c
  88. 3 2
      examples/filters/custom_mf/conversion_opencl.c
  89. 3 2
      examples/filters/custom_mf/custom_interface.c
  90. 3 2
      examples/filters/custom_mf/custom_opencl.c
  91. 2 2
      examples/filters/fblock_opencl.c
  92. 1 1
      examples/filters/fmatrix.c
  93. 5 2
      examples/filters/shadow2d.c
  94. 5 1
      examples/heat/dw_factolu.c
  95. 4 4
      examples/heat/dw_factolu_grain.c
  96. 31 31
      examples/heat/dw_factolu_kernels.c
  97. 2 2
      examples/heat/dw_sparse_cg.c
  98. 31 31
      examples/heat/heat.c
  99. 14 2
      examples/incrementer/incrementer.c
  100. 0 0
      examples/incrementer/incrementer_kernels_opencl.c

+ 1 - 0
AUTHORS

@@ -29,6 +29,7 @@ Marc Sergent <marc.sergent@inria.fr>
 Anthony Simonet <anthony.simonet@etu.u-bordeaux.fr>
 Luka Stanisic <luka.stanisic@imag.fr>
 Ludovic Stordeur <ludovic.stordeur@inria.fr>
+Guillaume Sylvand <guillaume.sylvand@airbus.com>
 François Tessier <francois.tessier@inria.fr>
 Samuel Thibault <samuel.thibault@labri.fr>
 Pierre-André Wacrenier <wacrenier@labri.fr>

+ 53 - 13
ChangeLog

@@ -1,7 +1,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2009-2015  Université de Bordeaux
-# Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015  CNRS
+# Copyright (C) 2009-2016  Université de Bordeaux
+# Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
 # Copyright (C) 2014 INRIA
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -19,14 +19,20 @@ StarPU 1.3.0 (svn revision xxxx)
 ==============================================
 
 New features:
-  * Enable anticipative writeback by default.
   * New scheduler with heterogeneous priorities
   * Support priorities for data transfers.
 
 Changes:
   * Vastly improve simgrid simulation time.
 
-StarPU 1.2.0 (svn revision xxxx)
+StarPU 1.2.1 (svn revision xxx)
+==============================================
+New features:
+  * Add starpu_fxt_trace_user_event_string.
+  * Add starpu_tasks_rec_complete tool to add estimation times in tasks.rec
+    files.
+
+StarPU 1.2.0 (svn revision 18521)
 ==============================================
 
 New features:
@@ -61,7 +67,6 @@ New features:
 	  basic tracing of communications.
 	- New function starpu_mpi_init_comm() which allows to specify
 	  a MPI communicator.
-
   * New STARPU_COMMUTE flag which can be passed along STARPU_W or STARPU_RW to
     let starpu commute write accesses.
   * Out-of-core support, through registration of disk areas as additional memory
@@ -96,15 +101,16 @@ New features:
     only when the bus is idle.
   * Make starpu_data_prefetch_on_node not forcibly flush data out, introduce
     starpu_data_fetch_on_node for that.
+  * Add data access arbiters, to improve parallelism of concurrent data
+    accesses, notably with STARPU_COMMUTE.
   * Anticipative writeback, to flush dirty data asynchronously before the
     GPU device is full. Disabled by default. Use STARPU_MINIMUM_CLEAN_BUFFERS
     and STARPU_TARGET_CLEAN_BUFFERS to enable it.
   * Add starpu_data_wont_use to advise that a piece of data will not be used
     in the close future.
+  * Enable anticipative writeback by default.
   * New scheduler 'dmdasd' that considers priority when deciding on
     which worker to schedule
-  * Add data access arbiters, to improve parallelism of concurrent data
-    accesses, notably with STARPU_COMMUTE.
   * Add the capability to define specific MPI datatypes for
     StarPU user-defined interfaces.
   * Add tasks.rec trace output to make scheduling analysis easier.
@@ -113,6 +119,14 @@ New features:
   * Generate animated html trace of modular schedulers.
   * Add asynchronous partition planning. It only supports coherency through
     the home node of data for now.
+  * Add STARPU_MALLOC_SIMULATION_FOLDED flag to save memory when simulating.
+  * Include application threads in the trace.
+  * Add starpu_task_get_task_scheduled_succs to get successors of a task.
+  * Add graph inspection facility for schedulers.
+  * New STARPU_LOCALITY flag to mark data which should be taken into account
+    by schedulers for improving locality.
+  * Experimental support for data locality in ws and lws.
+  * Add a preliminary framework for native Fortran support for StarPU
 
 Small features:
   * Tasks can now have a name (via the field const char *name of
@@ -155,21 +169,45 @@ Small features:
     its own allocation to the reclaiming engine.
   * Add STARPU_SIMGRID_CUDA_MALLOC_COST and STARPU_SIMGRID_CUDA_QUEUE_COST to
     disable CUDA costs simulation in simgrid mode.
-  * Add starpu_memory_pin and starpu_memory_unpin to pin memory allocated
-    another way than starpu_malloc.
-  * Add STARPU_NOWHERE to create synchronization tasks with data.
-  * Document how to switch between differents views of the same data.
-  * Add STARPU_NAME to specify a task name from a starpu_task_insert call.
   * Add starpu_task_get_task_succs to get the list of children of a given
     task.
   * Add starpu_malloc_on_node_flags, starpu_free_on_node_flags, and
     starpu_malloc_on_node_set_default_flags to control the allocation flags
     used for allocations done by starpu.
+  * Ranges can be provided in STARPU_WORKERS_CPUID
+  * Add starpu_fxt_autostart_profiling to be able to avoid autostart.
+  * Add arch_cost_function perfmodel function field.
+  * Add STARPU_TASK_BREAK_ON_SCHED, STARPU_TASK_BREAK_ON_PUSH, and
+  STARPU_TASK_BREAK_ON_POP environment variables to debug schedulers.
+  * Add starpu_sched_display tool.
+  * Add starpu_memory_pin and starpu_memory_unpin to pin memory allocated
+    another way than starpu_malloc.
+  * Add STARPU_NOWHERE to create synchronization tasks with data.
+  * Document how to switch between differents views of the same data.
+  * Add STARPU_NAME to specify a task name from a starpu_task_insert call.
   * Add configure option to disable fortran --disable-fortran
   * Add configure option to give path for smpirun executable --with-smpirun
   * Add configure option to disable the build of tests --disable-build-tests
   * Add starpu-all-tasks debugging support
-  * Ranges can be provided in STARPU_WORKERS_CPUID
+  * New function
+    void starpu_opencl_load_program_source_malloc(const char *source_file_name, char **located_file_name, char **located_dir_name, char **opencl_program_source)
+    which allocates the pointers located_file_name, located_dir_name
+    and opencl_program_source.
+  * Add submit_hook and do_schedule scheduler methods.
+  * Add starpu_sleep.
+  * Add starpu_task_list_ismember.
+  * Add _starpu_fifo_pop_this_task.
+  * Add STARPU_MAX_MEMORY_USE environment variable.
+  * Add starpu_worker_get_id_check().
+  * New function starpu_mpi_wait_for_all(MPI_Comm comm) that allows to
+    wait until all StarPU tasks and communications for the given
+    communicator are completed.
+  * New function starpu_codelet_unpack_args_and_copyleft() which
+    allows to copy in a new buffer values which have not been unpacked by
+    the current call
+  * Add STARPU_CODELET_SIMGRID_EXECUTE flag.
+  * Add STARPU_CL_ARGS flag to starpu_task_insert() and
+    starpu_mpi_task_insert() functions call
 
 Changes:
   * Data interfaces (variable, vector, matrix and block) now define
@@ -185,6 +223,8 @@ Changes:
 Small changes:
   * Rename function starpu_trace_user_event() as
     starpu_fxt_trace_user_event()
+  * "power" is renamed into "energy" wherever it applies, notably energy
+    consumption performance models
   * Update starpu_task_build() to set starpu_task::cl_arg_free to 1 if
     some arguments of type ::STARPU_VALUE are given.
   * Simplify performance model loading API

+ 4 - 2
Makefile.am

@@ -1,8 +1,9 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2009-2015  Université de Bordeaux
+# Copyright (C) 2009-2016  Université de Bordeaux
 # Copyright (C) 2010, 2011, 2012, 2013, 2015  CNRS
 # Copyright (C) 2014  INRIA
+# Copyright (C) 2016  Inria
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -96,6 +97,7 @@ versinclude_HEADERS = 				\
 	include/starpu_tree.h			\
 	include/starpu_simgrid_wrap.h		\
 	include/starpu_mod.f90			\
+	include/fstarpu_mod.f90			\
 	include/starpu_clusters_util.h
 
 nodist_versinclude_HEADERS = 			\
@@ -157,7 +159,7 @@ else
 txtdir = ${docdir}
 endif
 txt_DATA = AUTHORS COPYING.LGPL README STARPU-REVISION
-EXTRA_DIST = AUTHORS COPYING.LGPL README STARPU-VERSION STARPU-REVISION build-aux/svn2cl.xsl mic-configure
+EXTRA_DIST = autogen.sh AUTHORS COPYING.LGPL README STARPU-VERSION STARPU-REVISION build-aux/svn2cl.xsl mic-configure
 
 DISTCLEANFILES = STARPU-REVISION
 

+ 3 - 1
README

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2009-2012  Université de Bordeaux
+# Copyright (C) 2009-2012, 2016  Université de Bordeaux
 # Copyright (C) 2010, 2011, 2013, 2014, 2015  CNRS
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -105,6 +105,8 @@ The password is 'anonsvn'
 +---------------------------
 | IV.a. For svn version only
 
+Please skip this step if you are building from a tarball.
+
   $ ./autogen.sh
 
 +-----------------------

+ 6 - 3
autogen.sh

@@ -2,7 +2,7 @@
 
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # 
-# Copyright (C) 2010  Université de Bordeaux
+# Copyright (C) 2010, 2016  Université de Bordeaux
 # Copyright (C) 2010  CNRS
 # 
 # StarPU is free software; you can redistribute it and/or modify
@@ -21,8 +21,11 @@ then
 	# Perhaps we are on a Mac
 	if ! glibtool --version > /dev/null
 	then
-	echo "GNU Libtool is missing, please install it."
-	exit 1
+		echo "GNU Libtool is missing, please install it and fix the PATH to it."
+		exit 1
+	else
+		export LIBTOOL=glibtool
+		export LIBTOOLIZE=glibtoolize
 	fi
 fi
 autoreconf -ivf -I m4

+ 188 - 51
configure.ac

@@ -1,10 +1,10 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 # Copyright (C) 2009-2016  Université de Bordeaux
-# Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015  CNRS
+# Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
 # Copyright (C) 2011  Télécom-SudParis
 # Copyright (C) 2011, 2012, 2014  INRIA
-# Copyright (C) 2015  Inria
+# Copyright (C) 2015, 2016  Inria
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -82,6 +82,10 @@ AC_CHECK_PROGS(PROG_STAT,gstat stat)
 AC_CHECK_PROGS(PROG_DATE,gdate date)
 AC_OPENMP
 
+if test x$enable_perf_debug = xyes; then
+    enable_shared=no
+fi
+
 LT_PREREQ([2.2])
 LT_INIT([win32-dll])
 
@@ -1071,7 +1075,7 @@ if test x$enable_simgrid = xyes ; then
 		]
 	)
 	AC_CHECK_HEADERS([simgrid/msg.h], [AC_DEFINE([STARPU_HAVE_SIMGRID_MSG_H], [1], [Define to 1 if you have msg.h in simgrid/.])])
-   	AC_CHECK_FUNCS([MSG_process_join MSG_get_as_by_name MSG_environment_get_routing_root xbt_mutex_try_acquire smpi_process_set_user_data])
+   	AC_CHECK_FUNCS([MSG_process_join MSG_process_attach MSG_get_as_by_name MSG_environment_get_routing_root MSG_host_get_speed xbt_mutex_try_acquire smpi_process_set_user_data])
 	AC_CHECK_FUNCS([xbt_barrier_init], [AC_DEFINE([STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT], [1], [Define to 1 if you have the `xbt_barrier_init' function.])])
 	AC_CHECK_DECLS([smpi_process_set_user_data], [], [], [[#include <smpi/smpi.h>]])
 	AC_CHECK_FUNCS([SIMIX_process_get_code], [AC_DEFINE([STARPU_SIMGRID_HAVE_SIMIX_PROCESS_GET_CODE], [1], [Define to 1 if you have the `SIMIX_process_get_code' function.])])
@@ -1092,6 +1096,20 @@ if test x$enable_simgrid = xyes ; then
 	AC_DEFINE(STARPU_SIMGRID, 1, [Define this to enable simgrid execution])
 	# We won't bind or detect anything
 	with_hwloc=no
+
+	# Simgrid 3.12 & 3.13 need -std=c++11 to be able to build anything in C++...
+	AC_LANG_PUSH([C++])
+	AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+			  #ifdef HAVE_SIMGRID_MSG_H
+			  #include <simgrid/msg.h>
+			  #include <simgrid/host.h>
+			  #else
+			  #include <msg/msg.h>
+			  #endif
+			  ]])],,
+			  CXXFLAGS="-std=c++11 $CXXFLAGS"
+			  NVCCFLAGS="-std=c++11 $NVCCFLAGS")
+	AC_LANG_POP([C++])
 fi
 AM_CONDITIONAL(STARPU_SIMGRID, test x$enable_simgrid = xyes)
 AC_SUBST(SIMGRID_CFLAGS)
@@ -1133,6 +1151,12 @@ AM_CONDITIONAL(MLR_MODEL, test x$enable_mlr_model = xyes)
 #                                                                             #
 ###############################################################################
 
+# ignore these otions, only meant for mic-configure, but also passed here.
+AC_ARG_ENABLE(native-mic)
+AC_ARG_WITH(compiler)
+AC_ARG_WITH(mic-param)
+AC_ARG_WITH(host-param)
+
 AC_MSG_CHECKING(maximum number of MIC devices)
 AC_ARG_ENABLE(maxmicdev, [AS_HELP_STRING([--enable-maxmicdev=<number>],
 			[maximum number of MIC devices])],
@@ -1314,13 +1338,16 @@ if test x$enable_mic = xyes ; then
 	AC_MSG_ERROR([cannot find MIC's SCIF runtime])
     fi
 
-    STARPU_CHECK_COI_RUNTIME($coi_dir, $coi_include_dir, $coi_lib_dir, coi_host)
-
-    # Host runtime is not compatible, we are probably cross-compiling
-    # Let's have a look for the device runtime which lib has a different name
-    if test "$have_valid_coi" = "no" ; then
+    case $host_vendor in
+	*1om)
+	    # We are cross-compiling.
+	    # Let's have a look for the device runtime which lib has a different name
 	    STARPU_CHECK_COI_RUNTIME($coi_dir, $coi_include_dir, $coi_lib_dir, coi_device)
-    fi
+	    ;;
+	*)
+	    STARPU_CHECK_COI_RUNTIME($coi_dir, $coi_include_dir, $coi_lib_dir, coi_host)
+	    ;;
+    esac
 
     if test "$have_valid_coi" = "no" ; then
 	AC_MSG_ERROR([cannot find MIC's COI runtime])
@@ -1479,13 +1506,25 @@ if test x$enable_debug = xyes; then
 	CFLAGS="$CFLAGS -O0"
 	CXXFLAGS="$CXXFLAGS -O0"
 	enable_spinlock_check=yes
+	if test x$GCC = xyes; then
+		if test x$starpu_windows != xyes ; then
+			CFLAGS="$CFLAGS -fstack-protector-all"
+			CXXFLAGS="$CXXFLAGS -fstack-protector-all"
+		fi
+		CPPFLAGS="$CPPFLAGS -D_FORTIFY_SOURCE=2"
+	fi
 else
 	CFLAGS="-O3 $CFLAGS"
 	CXXFLAGS="-O3 $CXXFLAGS"
 fi
-CFLAGS+=" -gdwarf-2 -g3 "
-CXXFLAGS+=" -gdwarf-2 -g3 "
-LDFLAGS+=" -gdwarf-2 -g3 "
+if test x$GCC = xyes; then
+	CFLAGS+=" -gdwarf-2"
+	CXXFLAGS+=" -gdwarf-2"
+	LDFLAGS+=" -gdwarf-2"
+fi
+CFLAGS+=" -g3 "
+CXXFLAGS+=" -g3 "
+LDFLAGS+=" -g3 "
 
 if test x$enable_spinlock_check = xyes; then
 	AC_DEFINE(STARPU_SPINLOCK_CHECK, [1], [check spinlock use])
@@ -1524,8 +1563,15 @@ if test x$enable_coverage = xyes; then
 	CFLAGS="${CFLAGS} --coverage"
 	CXXFLAGS="${CXXFLAGS} --coverage"
 	LDFLAGS="${LDFLAGS} --coverage"
+	LIBS="${LIBS} -lgcov"
 fi
 
+AC_MSG_CHECKING(whether coverity mode should be enabled)
+AC_ARG_ENABLE(coverity, [AS_HELP_STRING([--enable-coverity], [enable coverity mode])],
+			enable_coverity=$enableval, enable_coverity=no)
+AC_MSG_RESULT($enable_coverity)
+AM_CONDITIONAL(STARPU_COVERITY, test x$enable_coverity = xyes)
+
 # shall we use FxT to generate trace of the execution ?
 AC_MSG_CHECKING(whether FxT traces should be generated)
 AC_ARG_WITH(fxt, [AS_HELP_STRING([--with-fxt[=<dir>]], [generate fxt traces])],
@@ -1587,6 +1633,12 @@ if test x$use_fxt = xyes; then
 	AC_CHECK_DECLS([fut_set_filename])
 	CFLAGS="$save_CFLAGS"
 
+	if test x$enable_simgrid = xyes -a x$enable_shared = xno ; then
+                # simgrid's SMPI needs fxt to be linked in statically for
+                # variable privatization to work
+		FXT_LIBS="$(pkg-config --variable=libdir fxt)/libfxt.a -Wl,--as-needed $(pkg-config --libs --static fxt) -Wl,--no-as-needed"
+	fi
+
         AC_ARG_ENABLE(paje-codelet-details, [AS_HELP_STRING([--enable-paje-codelet-details],
 			[enable details about codelets in the paje trace])],
 			enable_paje_codelet_details=$enableval, enable_paje_codelet_details=no)
@@ -1620,8 +1672,8 @@ AC_MSG_RESULT($enable_perf_debug)
 AC_SUBST(STARPU_PERF_DEBUG, $enable_perf_debug)
 if test x$enable_perf_debug = xyes; then
 	AC_DEFINE(STARPU_PERF_DEBUG, [1], [enable performance debug])
-	CPPFLAGS="${CPPFLAGS} -pg -fprofile-arcs "
-	LDFLAGS="${LDFLAGS} -pg -fprofile-arcs "
+	CPPFLAGS="${CPPFLAGS} -pg "
+	LDFLAGS="${LDFLAGS} -pg "
 fi
 
 AC_MSG_CHECKING(whether performance model debugging should be enabled)
@@ -1787,11 +1839,17 @@ AC_MSG_RESULT($maximplementations)
 AC_DEFINE_UNQUOTED(STARPU_MAXIMPLEMENTATIONS, [$maximplementations],
 		[maximum number of implementations])
 
+# Enable LevelDB support if requested and the lib is found
+AC_ARG_ENABLE(leveldb, [AS_HELP_STRING([--enable-leveldb],
+				   [Enable linking with LevelDB if available])],
+				   enable_leveldb=$enableval, enable_leveldb=no)
+if  test x$enable_leveldb = xyes; then
 AC_LANG_PUSH([C++])
 AC_CHECK_HEADERS([leveldb/db.h], [AC_DEFINE([STARPU_HAVE_LEVELDB], [1], [Define to 1 if you have the <leveldb/db.h> header file.])])
 STARPU_HAVE_LIBRARY(LEVELDB, [leveldb])
-AM_CONDITIONAL(STARPU_HAVE_LEVELDB, test "x$ac_cv_lib_leveldb_main" = "xyes")
 AC_LANG_POP([C++])
+fi
+AM_CONDITIONAL(STARPU_HAVE_LEVELDB, test  "x$enable_leveldb" = "xyes" -a "x$ac_cv_lib_leveldb_main" = "xyes")
 
 # Defines the calibration heuristic for the history-based calibration of StarPU
 AC_MSG_CHECKING(calibration heuristic of history-based StarPU calibrator)
@@ -1924,6 +1982,13 @@ else
 	running_mpi_check=no
 fi
 
+AC_ARG_WITH(mpiexec-args, [AS_HELP_STRING([--with-mpiexec-args[=<arguments to give when running mpiexec>]],
+			[Arguments for mpiexec])],
+	[
+		mpiexec_args=$withval
+	])
+AC_SUBST(MPIEXEC_ARGS,$mpiexec_args)
+
 AC_ARG_ENABLE(mpi-progression-hook, [AS_HELP_STRING([--enable-mpi-progression-hook],
 				   [Enable StarPU MPI activity polling method])],
 				   enable_mpi_progression_hook=$enableval, enable_mpi_progression_hook=no)
@@ -2069,10 +2134,87 @@ AC_SUBST([pkglibdir])
 AC_ARG_ENABLE(fortran, [AS_HELP_STRING([--disable-fortran],
 			[disable build of fortran examples])],
 			enable_build_fortran=$enableval, enable_build_fortran=yes)
-if test "x$FC" != "x" -a "x$enable_build_fortran" = "xyes" ; then
-  AC_DEFINE(STARPU_HAVE_FC, [], [Define this if a Fortran compiler is available])
+if test "x$FC" != "x"; then
+	if $FC --version|grep -q 'GNU Fortran'; then
+		gfortran_fc_version=`$FC --version|head -1|sed 's/.*)//;s/^.*\([[0-9]][[0-9]]*\)\.\([[0-9]][[0-9]]*\)\.\([[0-9]][[0-9]]*\).*/\1.\2.\3/'`
+		gfortran_maj_version=`echo $gfortran_fc_version|cut -d. -f1`
+		gfortran_min_version=`echo $gfortran_fc_version|cut -d. -f2`
+
+		if test $gfortran_maj_version -lt 4 -o \( $gfortran_maj_version -eq 4 -a $gfortran_min_version -lt 9 \) ; then
+			AC_MSG_WARN([GFortran $gfortran_fc_version too old, version >= 4.9.x needed, Fortran examples will not be built])
+			enable_build_fortran="no"
+		fi
+	else
+		if $FC -V 2>&1|grep -q 'Intel(R) Fortran'; then
+			ifort_fc_version=`$FC -V 2>&1 |head -1|sed 's/.*Version //;s/ Build.*//'`
+			ifort_maj_version=`echo $ifort_fc_version|cut -d. -f1`
+
+			if test $ifort_maj_version -lt 16; then
+				AC_MSG_WARN([Intel Fortran compiler $ifort_fc_version too old, version >= 2016.x needed, Fortran examples will not be built])
+				enable_build_fortran="no"
+			fi
+		else
+			AC_MSG_WARN(Fortran compiler has not been tested for StarPU native Fortran support)
+		fi
+	fi
+	if test "x$enable_build_fortran" = "xyes" ; then
+		AC_DEFINE(STARPU_HAVE_FC, [], [Define this if a Fortran compiler is available])
+		if test x$use_mpi = xyes; then
+			AC_ARG_WITH(mpifort, [AS_HELP_STRING([--with-mpifort[=<path to mpifort>]],
+				    [Path of the mpifort compiler])],
+				    [
+				     if test x$withval = xyes; then
+					     AC_MSG_ERROR(--with-mpifort must be given a pathname)
+					     else
+						     mpifort_path=$withval
+					     fi
+					     ],
+					     [
+					      if test x$enable_simgrid = xyes ; then
+						      DEFAULT_MPIFORT=smpifort
+					      else
+						      DEFAULT_MPIFORT=mpif90
+					      fi
+					      # nothing was specified: default value is used
+					      AC_PATH_PROG(mpifort_path, $DEFAULT_MPIFORT, [no], [$(dirname $mpicc_path):$simgrid_dir/bin:$PATH])
+					      ])
+
+			# We test if the MPIFORT compiler exists
+			if test ! -x $mpifort_path; then
+				#MPIFORT does not exists or is not executable
+				AC_MSG_RESULT(The mpifort compiler '$mpifort_path' does not have the execute permission)
+				use_mpi_fort=no
+			else
+				OLD_CC=$CC
+				CC=$mpicc_path
+				AC_LINK_IFELSE(
+						AC_LANG_PROGRAM(
+							[[#include <mpi.h>]],
+							[[AC_LANG_SOURCE([return MPI_Comm_f2c(0);])]]
+							),
+						[use_mpi_fort=yes],
+						[use_mpi_fort=no]
+						)
+				CC=$OLD_CC
+				if test "x$use_mpi_fort" = xyes; then
+					AC_DEFINE([HAVE_MPI_COMM_F2C], 1, [Function MPI_Comm_f2c is available])
+					AC_MSG_CHECKING(mpifort path)
+					AC_MSG_RESULT($mpifort_path)
+					AC_SUBST(MPIFORT, $mpifort_path)
+				fi
+			fi
+		fi
+	fi
+fi
+if test "x$enable_build_fortran" = "xyes" ; then
+   if test "x$FC" = "x" ; then
+      enable_build_fortran="no"
+   fi
 fi
+
 AM_CONDITIONAL([STARPU_HAVE_FC], [test "x$FC" != "x" -a "x$enable_build_fortran" = "xyes"])
+AM_CONDITIONAL([STARPU_HAVE_F77], [test "x$F77" != "x" -a "x$enable_build_fortran" = "xyes"])
+AM_CONDITIONAL([STARPU_HAVE_MPIFORT], [test "x$use_mpi_fort" = "xyes"])
 
 ###############################################################################
 #                                                                             #
@@ -2152,7 +2294,7 @@ AM_CONDITIONAL([RUN_GCC_PLUGIN_TESTS],
 ###############################################################################
 
 AC_ARG_ENABLE(openmp, [AS_HELP_STRING([--enable-openmp],
-			[build the OpenMP runtime support (experimental)])],
+			[build the OpenMP runtime support])],
 			enable_openmp=$enableval, enable_openmp=no)
 
 AC_MSG_CHECKING(for OpenMP runtime support)
@@ -2202,7 +2344,7 @@ AM_CONDITIONAL([STARPU_USE_SOCL], [test "x$build_socl" = "xyes"])
 
 if test "$build_socl" = "yes" ; then
    AC_CHECK_FUNCS([clGetExtensionFunctionAddressForPlatform])
-   if test -n "$SOCL_OCL_LIB_OPENCL" -a -f $SOCL_OCL_LIB_OPENCL ; then
+   if test -n "$SOCL_OCL_LIB_OPENCL" -a -f "$SOCL_OCL_LIB_OPENCL" ; then
       run_socl_check=yes
       SOCL_OCL_LIB_OPENCL_DIR=$(dirname $SOCL_OCL_LIB_OPENCL)
       AC_SUBST(SOCL_OCL_LIB_OPENCL_DIR)
@@ -2453,20 +2595,25 @@ AM_CONDITIONAL(BUILD_STARPUFFT_EXAMPLES, [test x$enable_starpufft_examples = xye
 # hwloc                                  #
 ##########################################
 
+have_valid_hwloc=no
+SAVED_LDFLAGS="${LDFLAGS}"
+SAVED_CPPFLAGS="${CPPFLAGS}"
+SAVED_PKG_CONFIG_PATH="$PKG_CONFIG_PATH"
 AC_ARG_WITH([hwloc],
 	[AS_HELP_STRING([--without-hwloc], [Disable hwloc (enabled by default)])],
 	[
 		if test x$withval != xno; then
 			if test "$withval" = "yes" ; then
-				use_hwloc_from_system=yes
 				use_hwloc=yes
 			else
 				# use specified path
-				use_hwloc_from_system=no
 				if test ! -d "$withval" ; then
 				   AC_MSG_ERROR("Directory specified for hwloc <$withval> does not exist")
 				fi
-				hwloc_dir=$withval
+				if test ! -d "$withval/lib/pkgconfig" ; then
+				   AC_MSG_ERROR("Hwloc directory <$withval> does not have a subdirectory lib/pkgconfig")
+				fi
+				export PKG_CONFIG_PATH=$withval/lib/pkgconfig:$PKG_CONFIG_PATH
 				use_hwloc=yes
 			fi
 		else
@@ -2475,28 +2622,10 @@ AC_ARG_WITH([hwloc],
 	],
 	[
 		use_hwloc=maybe
-		use_hwloc_from_system=yes
 	])
-SAVED_LDFLAGS="${LDFLAGS}"
-SAVED_CPPFLAGS="${CPPFLAGS}"
-AS_IF([test "$use_hwloc" = "no"],
-  [have_valid_hwloc=no],
-  [AS_IF([test "$use_hwloc_from_system" = "yes"],
-  	  [PKG_CHECK_MODULES([HWLOC],[hwloc], [
-	      	have_valid_hwloc=yes
-            have_pkgconfig_hwloc=yes],
-		[
-            have_valid_hwloc=no
-            have_pkgconfig_hwloc=no])
-	  ],
-	  #else
-	  [have_pkgconfig_hwloc=no
-	   CPPFLAGS="${SAVED_CPPFLAGS} -I$hwloc_dir/include"
-	   AC_CHECK_HEADER([hwloc.h],[have_valid_hwloc=yes],[have_valid_hwloc=no])
-	   LDFLAGS="${SAVED_LDFLAGS} -L$hwloc_dir/lib"
-       AC_HAVE_LIBRARY([hwloc],[have_valid_hwloc=yes],[have_valid_hwloc=no])
-	  ])
-  ])
+AS_IF([test "$use_hwloc" != "no"],
+      [PKG_CHECK_MODULES([HWLOC],[hwloc], [have_valid_hwloc=yes], [have_valid_hwloc=no])]
+     )
 AM_CONDITIONAL(STARPU_HAVE_HWLOC, test "x$have_valid_hwloc" = "xyes")
 # in case hwloc was explicitely required, but is not available, this is an error
 AS_IF([test "$use_hwloc" = "yes" -a "$have_valid_hwloc" = "no"],
@@ -2504,28 +2633,25 @@ AS_IF([test "$use_hwloc" = "yes" -a "$have_valid_hwloc" = "no"],
      )
 # in case hwloc is not available but was not explicitely disabled, this is an error
 AS_IF([test "$have_valid_hwloc" = "no" -a "$use_hwloc" != "no"],
-      [AC_MSG_ERROR([hwloc was not found on your system. If the target machine is hyperthreaded the performance may be impacted a lot.  It is strongly recommended to install hwloc. However, if you really want to use StarPU without enabling hwloc, please restart configure by specifying the option '--without-hwloc'.])]
+      [AC_MSG_ERROR([libhwloc was not found on your system. If the target machine is hyperthreaded the performance may be impacted a lot.  It is strongly recommended to install libhwloc. However, if you really want to use StarPU without enabling libhwloc, please restart configure by specifying the option '--without-hwloc'.])]
      )
 
+LDFLAGS="${HWLOC_LIBS} ${SAVED_LDFLAGS}"
+CPPFLAGS="${HWLOC_CFLAGS} ${SAVED_CPPFLAGS}"
+
 AS_IF([test "$have_valid_hwloc" = "yes"],
       [AC_DEFINE([STARPU_HAVE_HWLOC], [1], [Define to 1 if you have the hwloc library.])
        HWLOC_REQUIRES=hwloc
        AC_SUBST([STARPU_HAVE_HWLOC], [1])
        AC_CHECK_DECLS([hwloc_cuda_get_device_osdev_by_index], [], [], [[#include <hwloc/cuda.h>]])
-       AS_IF([test "$have_pkgconfig_hwloc" = "no"],
-             [HWLOC_CFLAGS="-I$hwloc_dir/include"
-          HWLOC_LIBS="-L$hwloc_dir/lib -lhwloc"
-          # TODO: either detect this or make the API compatible with older hwloc versions
-          ],)
       ])
 
-LDFLAGS="${HWLOC_LIBS} ${SAVED_LDFLAGS}"
-CPPFLAGS="${HWLOC_CFLAGS} ${SAVED_CPPFLAGS}"
 AC_CHECK_FUNCS([hwloc_topology_dup])
 AM_CONDITIONAL(STARPU_HWLOC_HAVE_TOPOLOGY_DUP, test $ac_cv_func_hwloc_topology_dup = yes)
 
 LDFLAGS="${SAVED_LDFLAGS}"
 CPPFLAGS="${SAVED_CPPFLAGS}"
+export PKG_CONFIG_PATH=$SAVED_PKG_CONFIG_PATH
 
 AC_MSG_CHECKING(whether hwloc should be used)
 AC_MSG_RESULT($have_valid_hwloc)
@@ -2662,6 +2788,15 @@ AC_CONFIG_COMMANDS([executable-scripts], [
   chmod +x tools/starpu_paje_sort
   chmod +x tools/starpu_smpirun
   chmod +x doc/doxygen/doxygen_filter.sh
+  mkdir -p tests/microbenchs
+  test -e tests/microbenchs/tasks_size_overhead.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/tasks_size_overhead.sh tests/microbenchs/
+  test -e tests/microbenchs/tasks_size_overhead.gp || ln -sf $ac_abs_top_srcdir/tests/microbenchs/tasks_size_overhead.gp tests/microbenchs/
+  test -e tests/microbenchs/microbench.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/microbench.sh tests/microbenchs/
+  test -e tests/microbenchs/parallel_dependent_homogeneous_tasks_data.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_dependent_homogeneous_tasks_data.sh tests/microbenchs/
+  test -e tests/microbenchs/parallel_independent_heterogeneous_tasks_data.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_independent_heterogeneous_tasks_data.sh tests/microbenchs/
+  test -e tests/microbenchs/parallel_independent_heterogeneous_tasks.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_independent_heterogeneous_tasks.sh tests/microbenchs/
+  test -e tests/microbenchs/parallel_independent_homogeneous_tasks_data.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_independent_homogeneous_tasks_data.sh tests/microbenchs/
+  test -e tests/microbenchs/parallel_independent_homogeneous_tasks.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_independent_homogeneous_tasks.sh tests/microbenchs/
 ])
 
 # Create links to ICD files in build/socl/vendors directory. SOCL will use this
@@ -2806,6 +2941,8 @@ AC_MSG_NOTICE([
                Scheduler Hypervisor:                        $build_sc_hypervisor
                simgrid enabled:                             $enable_simgrid
                ayudame enabled:                             $ac_cv_header_Ayudame_h
+	       Native fortran support:                      $enable_build_fortran
+	       Native MPI fortran support:                  $use_mpi_fort
 ])
 
 if test "$build_socl" = "yes" -a "$run_socl_check" = "no" ; then

+ 62 - 52
doc/doxygen/Makefile.am

@@ -1,7 +1,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 # Copyright (C) 2009, 2011, 2013-2014  Université de Bordeaux
-# Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015  CNRS
+# Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
 # Copyright (C) 2014  INRIA
 #
 # Permission is granted to copy, distribute and/or modify this document
@@ -25,35 +25,36 @@ DOX_PDF = starpu.pdf
 DOX_TAG = starpu.tag
 
 chapters =	\
-	chapters/00introduction.doxy \
-	chapters/01building.doxy \
-	chapters/02basic_examples.doxy \
-	chapters/03advanced_examples.doxy \
-	chapters/05check_list_performance.doxy \
-	chapters/06tasks.doxy \
-	chapters/07data_management.doxy \
-	chapters/08scheduling.doxy \
-	chapters/09scheduling_contexts.doxy \
-	chapters/10scheduling_context_hypervisor.doxy \
-	chapters/11debugging_tools.doxy \
-	chapters/12online_performance_tools.doxy \
-	chapters/13offline_performance_tools.doxy \
-	chapters/14faq.doxy \
-	chapters/15out_of_core.doxy \
-	chapters/16mpi_support.doxy \
-	chapters/17fft_support.doxy \
-	chapters/18mic_scc_support.doxy \
-	chapters/19c_extensions.doxy \
-	chapters/20socl_opencl_extensions.doxy \
-	chapters/21simgrid.doxy \
-	chapters/22openmp_runtime_support.doxy \
-	chapters/23clustering_a_machine.doxy \
-	chapters/40environment_variables.doxy \
-	chapters/41configure_options.doxy \
-	chapters/45files.doxy \
-	chapters/50scaling-vector-example.doxy \
-	chapters/51fdl-1.3.doxy \
-	chapters/modularized_scheduler.doxy \
+	chapters/000_introduction.doxy		\
+	chapters/101_building.doxy		\
+	chapters/110_basic_examples.doxy		\
+	chapters/201_advanced_examples.doxy		\
+	chapters/210_check_list_performance.doxy		\
+	chapters/301_tasks.doxy		\
+	chapters/310_data_management.doxy		\
+	chapters/320_scheduling.doxy		\
+	chapters/330_scheduling_contexts.doxy		\
+	chapters/340_scheduling_context_hypervisor.doxy		\
+	chapters/350_modularized_scheduler.doxy		\
+	chapters/360_debugging_tools.doxy		\
+	chapters/370_online_performance_tools.doxy		\
+	chapters/380_offline_performance_tools.doxy		\
+	chapters/390_faq.doxy		\
+	chapters/401_out_of_core.doxy		\
+	chapters/410_mpi_support.doxy		\
+	chapters/420_fft_support.doxy		\
+	chapters/430_mic_scc_support.doxy		\
+	chapters/440_c_extensions.doxy		\
+	chapters/450_native_fortran_support.doxy		\
+	chapters/460_socl_opencl_extensions.doxy		\
+	chapters/470_simgrid.doxy		\
+	chapters/480_openmp_runtime_support.doxy		\
+	chapters/490_clustering_a_machine.doxy		\
+	chapters/501_environment_variables.doxy		\
+	chapters/510_configure_options.doxy		\
+	chapters/520_files.doxy		\
+	chapters/601_scaling_vector_example.doxy		\
+	chapters/610_fdl_1_3.doxy		\
 	chapters/code/hello_pragma2.c \
 	chapters/code/hello_pragma.c \
 	chapters/code/scal_pragma.cu \
@@ -71,6 +72,7 @@ chapters =	\
 	chapters/code/vector_scal_opencl_codelet.cl \
 	chapters/code/disk_copy.c \
 	chapters/code/disk_compute.c \
+	chapters/code/nf_initexit.f90 \
 	chapters/api/codelet_and_tasks.doxy \
 	chapters/api/cuda_extensions.doxy \
 	chapters/api/data_interfaces.doxy \
@@ -206,40 +208,48 @@ dox_inputs = $(DOX_CONFIG) 				\
 	chapters/version.sty				\
 	chapters/version.html				\
 	$(top_srcdir)/include/starpu.h			\
+	$(top_srcdir)/include/starpu_bitmap.h		\
+	$(top_srcdir)/include/starpu_bound.h		\
+	$(top_srcdir)/include/starpu_clusters_util.h	\
+	$(top_srcdir)/include/starpu_cublas.h		\
+	$(top_srcdir)/include/starpu_cuda.h		\
 	$(top_srcdir)/include/starpu_data_filters.h	\
+	$(top_srcdir)/include/starpu_data.h		\
 	$(top_srcdir)/include/starpu_data_interfaces.h	\
+	$(top_srcdir)/include/starpu_deprecated_api.h	\
 	$(top_srcdir)/include/starpu_disk.h		\
-	$(top_srcdir)/include/starpu_worker.h		\
-	$(top_srcdir)/include/starpu_task.h		\
-	$(top_srcdir)/include/starpu_task_bundle.h	\
-	$(top_srcdir)/include/starpu_task_list.h	\
-	$(top_srcdir)/include/starpu_task_util.h	\
-	$(top_srcdir)/include/starpu_data.h		\
-	$(top_srcdir)/include/starpu_perfmodel.h	\
-	$(top_srcdir)/include/starpu_util.h		\
+	$(top_srcdir)/include/starpu_driver.h		\
+	$(top_srcdir)/include/starpu_expert.h		\
 	$(top_srcdir)/include/starpu_fxt.h		\
-	$(top_srcdir)/include/starpu_cuda.h		\
+	$(top_srcdir)/include/starpu_hash.h		\
+	$(top_srcdir)/include/starpu_mic.h		\
+	$(top_srcdir)/include/starpu_mod.f90		\
 	$(top_srcdir)/include/starpu_opencl.h		\
 	$(top_srcdir)/include/starpu_openmp.h		\
-	$(top_srcdir)/include/starpu_sink.h		\
-	$(top_srcdir)/include/starpu_mic.h		\
-	$(top_srcdir)/include/starpu_scc.h		\
-	$(top_srcdir)/include/starpu_expert.h		\
+	$(top_srcdir)/include/starpu_perfmodel.h	\
 	$(top_srcdir)/include/starpu_profiling.h	\
-	$(top_srcdir)/include/starpu_bound.h		\
-	$(top_srcdir)/include/starpu_scheduler.h	\
-	$(top_srcdir)/include/starpu_sched_ctx.h	\
-	$(top_srcdir)/include/starpu_clusters_util.h			\
-	$(top_srcdir)/include/starpu_sched_ctx_hypervisor.h		\
-	$(top_srcdir)/include/starpu_top.h		\
-	$(top_srcdir)/include/starpu_hash.h		\
 	$(top_srcdir)/include/starpu_rand.h		\
-	$(top_srcdir)/include/starpu_cublas.h		\
-	$(top_srcdir)/include/starpu_driver.h		\
+	$(top_srcdir)/include/starpu_scc.h		\
+	$(top_srcdir)/include/starpu_sched_component.h	\
+	$(top_srcdir)/include/starpu_sched_ctx.h	\
+	$(top_srcdir)/include/starpu_sched_ctx_hypervisor.h	\
+	$(top_srcdir)/include/starpu_scheduler.h	\
+	$(top_srcdir)/include/starpu_simgrid_wrap.h	\
+	$(top_srcdir)/include/starpu_sink.h		\
 	$(top_srcdir)/include/starpu_stdlib.h		\
+	$(top_srcdir)/include/starpu_task_bundle.h	\
+	$(top_srcdir)/include/starpu_task.h		\
+	$(top_srcdir)/include/starpu_task_list.h	\
+	$(top_srcdir)/include/starpu_task_util.h	\
 	$(top_srcdir)/include/starpu_thread.h		\
 	$(top_srcdir)/include/starpu_thread_util.h	\
+	$(top_srcdir)/include/starpu_top.h		\
+	$(top_srcdir)/include/starpu_tree.h		\
+	$(top_srcdir)/include/starpu_util.h		\
+	$(top_srcdir)/include/starpu_worker.h		\
+	$(top_srcdir)/include/fstarpu_mod.f90		\
 	$(top_srcdir)/mpi/include/starpu_mpi.h 		\
+	$(top_srcdir)/mpi/include/fstarpu_mpi_mod.f90		\
 	$(top_srcdir)/sc_hypervisor/include/sc_hypervisor.h 		\
 	$(top_srcdir)/sc_hypervisor/include/sc_hypervisor_config.h 	\
 	$(top_srcdir)/sc_hypervisor/include/sc_hypervisor_lp.h		\

+ 25 - 5
doc/doxygen/chapters/00introduction.doxy

@@ -1,8 +1,8 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
- * Copyright (C) 2011, 2012 INRIA
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
+ * Copyright (C) 2011, 2012, 2016 INRIA
  * See the file version.doxy for copying conditions.
 */
 
@@ -153,7 +153,7 @@ them.
 
 A <b>performance model</b> is a (dynamic or static) model of the performance of a
 given codelet. Codelets can have execution time performance model as well as
-power consumption performance models.
+energy consumption performance models.
 
 A data \b interface describes the layout of the data: for a vector, a pointer
 for the start, the number of elements and the size of elements ; for a matrix, a
@@ -178,7 +178,7 @@ unregister it.
 \section ResearchPapers Research Papers
 
 Research papers about StarPU can be found at
-http://runtime.bordeaux.inria.fr/Publis/Keyword/STARPU.html.
+http://starpu.gforge.inria.fr/publications/.
 
 A good overview is available in the research report at
 http://hal.archives-ouvertes.fr/inria-00467677.
@@ -194,6 +194,7 @@ Many examples are also available in the StarPU sources in the directory
 <dl>
 <dt> <c>incrementer/</c> </dt>
 <dd> Trivial incrementation test. </dd>
+
 <dt> <c>basic_examples/</c> </dt>
 <dd>
         Simple documented Hello world and vector/scalar product (as
@@ -202,17 +203,30 @@ Many examples are also available in the StarPU sources in the directory
         interface, an example using the variable data interface, and an example
         using different formats on CPUs and GPUs.
 </dd>
+
 <dt> <c>matvecmult/</c></dt>
 <dd>
     OpenCL example from NVidia, adapted to StarPU.
 </dd>
+
 <dt> <c>axpy/</c></dt>
 <dd>
     AXPY CUBLAS operation adapted to StarPU.
 </dd>
+
+<dt> <c>native_fortran/</c> </dt>
+<dd>
+    Example of using StarPU's native Fortran support.
+</dd>
+
+<dt> <c>fortran90/</c> </dt>
+<dd>
+    Example of Fortran 90 bindings, using C marshalling wrappers.
+</dd>
+
 <dt> <c>fortran/</c> </dt>
 <dd>
-    Example of Fortran bindings.
+    Example of Fortran 77 bindings, using C marshalling wrappers.
 </dd>
 </dl>
 
@@ -223,10 +237,12 @@ More advanced examples include:
 <dd>
     Examples using filters, as shown in \ref PartitioningData.
 </dd>
+
 <dt><c>lu/</c></dt>
 <dd>
     LU matrix factorization, see for instance <c>xlu_implicit.c</c>
 </dd>
+
 <dt><c>cholesky/</c></dt>
 <dd>
     Cholesky matrix factorization, see for instance <c>cholesky_implicit.c</c>.
@@ -255,6 +271,7 @@ The documentation chapters include
 <li> \ref Scheduling
 <li> \ref SchedulingContexts
 <li> \ref SchedulingContextHypervisor
+<li> \ref ModularizedScheduler
 <li> \ref DebuggingTools
 <li> \ref OnlinePerformanceTools
 <li> \ref OfflinePerformanceTools
@@ -267,8 +284,11 @@ The documentation chapters include
 <li> \ref FFTSupport
 <li> \ref MICSCCSupport
 <li> \ref cExtensions
+<li> \ref NativeFortranSupport
 <li> \ref SOCLOpenclExtensions
 <li> \ref SimGridSupport
+<li> \ref OpenMPRuntimeSupport
+<li> \ref ClusteringAMachine
 </ul>
 <li> Part 5: StarPU Reference API
 <ul>

+ 17 - 14
doc/doxygen/chapters/01building.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
  * Copyright (C) 2011, 2012 INRIA
  * See the file version.doxy for copying conditions.
  */
@@ -35,26 +35,27 @@ can be used to install StarPU.
 The <c>hwloc</c> (http://www.open-mpi.org/software/hwloc) topology
 discovery library is not mandatory to use StarPU but strongly
 recommended.  It allows for topology aware scheduling, which improves
-performance.  <c>hwloc</c> is available in major free operating system
+performance.  <c>libhwloc</c> is available in major free operating system
 distributions, and for most operating systems.
 
-If <c>hwloc</c> is not available on your system, the option
+If <c>libhwloc</c> is not available on your system, the option
 \ref without-hwloc "--without-hwloc" should be explicitely given when calling the
-<c>configure</c> script. If <c>hwloc</c> is installed with a <c>pkg-config</c> file,
-no option is required, it will be detected automatically, otherwise
-\ref with-hwloc "--with-hwloc" should be used to specify its location.
+<c>configure</c> script. If <c>libhwloc</c> is installed in a standard
+location, no option is required, it will be detected automatically,
+otherwise \ref with-hwloc "--with-hwloc=<directory>" should be used to specify its
+location.
 
 \subsection GettingSources Getting Sources
 
 StarPU's sources can be obtained from the download page of
-the StarPU website (http://runtime.bordeaux.inria.fr/StarPU/files/).
+the StarPU website (http://starpu.gforge.inria.fr/files/).
 
 All releases and the development tree of StarPU are freely available
 on INRIA's gforge under the LGPL license. Some releases are available
 under the BSD license.
 
 The latest release can be downloaded from the INRIA's gforge (http://gforge.inria.fr/frs/?group_id=1570) or
-directly from the StarPU download page (http://runtime.bordeaux.inria.fr/StarPU/files/).
+directly from the StarPU download page (http://starpu.gforge.inria.fr/files/).
 
 The latest nightly snapshot can be downloaded from the StarPU gforge website (http://starpu.gforge.inria.fr/testing/).
 
@@ -92,7 +93,8 @@ $ ./configure
 \endverbatim
 
 If <c>configure</c> does not detect some software or produces errors, please
-make sure to post the content of <c>config.log</c> when reporting the issue.
+make sure to post the contents of the file <c>config.log</c> when
+reporting the issue.
 
 By default, the files produced during the compilation are placed in
 the source directory. As the compilation generates a lot of files, it
@@ -218,7 +220,7 @@ When StarPU is used for the first time, the directory
 that directory (\ref STARPU_HOME).
 
 Please note that buses are benchmarked when StarPU is launched for the
-first time. This may take a few minutes, or less if <c>hwloc</c> is
+first time. This may take a few minutes, or less if <c>libhwloc</c> is
 installed. This step is done only once per user and per machine.
 
 \subsection RunningABasicStarPUApplicationOnMicrosoft Running a Basic StarPU Application on Microsoft Visual C
@@ -227,7 +229,7 @@ Batch files are provided to run StarPU applications under Microsoft
 Visual C. They are installed in <c>$STARPU_PATH/bin/msvc</c>.
 
 To execute a StarPU application, you first need to set the environment
-variable <c>STARPU_PATH</c>.
+variable \ref STARPU_PATH.
 
 \verbatim
 c:\....> cd c:\cygwin\home\ci\starpu\
@@ -346,8 +348,9 @@ multiplication using BLAS and cuBLAS. They output the obtained GFlops.
 It can also be convenient to try simulated benchmarks, if you want to give a try
 at CPU-GPU scheduling without actually having a GPU at hand. This can be done by
 using the simgrid version of StarPU: first install the simgrid simulator from
-http://simgrid.gforge.inria.fr/ , then configure StarPU with \ref enable-simgrid "--enable-simgrid"
-and rebuild and install it, and then you can simulate the performance for a
+http://simgrid.gforge.inria.fr/ (we tested with simgrid 3.11, 3.12 and 3.13, other versions
+may have compatibility issues), then configure StarPU with \ref enable-simgrid
+"--enable-simgrid" and rebuild and install it, and then you can simulate the performance for a
 few virtualized systems shipped along StarPU: attila, mirage, idgraf, and sirocco.
 
 For instance:
@@ -363,7 +366,7 @@ system. It will be interesting to try with different matrix sizes and
 schedulers.
 
 Performance models are available for cholesky_*, lu_*, *gemm, with block sizes
-320, 640, or 960, and for stencil with block size 128x128x128, 192x192x192, and
+320, 640, or 960 (plus 1440 for sirocco), and for stencil with block size 128x128x128, 192x192x192, and
 256x256x256.
 
 */

+ 12 - 15
doc/doxygen/chapters/02basic_examples.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2016  CNRS
  * Copyright (C) 2011, 2012 INRIA
  * See the file version.doxy for copying conditions.
  */
@@ -65,13 +65,13 @@ may contain an implementation of the same kernel on different architectures
 (e.g. CUDA, x86, ...). For compatibility, make sure that the whole
 structure is properly initialized to zero, either by using the
 function starpu_codelet_init(), or by letting the
-compiler implicitly do it as examplified above.
+compiler implicitly do it as exemplified below.
 
 The field starpu_codelet::nbuffers specifies the number of data buffers that are
 manipulated by the codelet: here the codelet does not access or modify any data
 that is controlled by our data management library.
 
-We create a codelet which may only be executed on the CPUs. When a CPU
+We create a codelet which may only be executed on CPUs. When a CPU
 core will execute a codelet, it will call the function
 <c>cpu_func</c>, which \em must have the following prototype:
 
@@ -100,17 +100,15 @@ struct starpu_codelet cl =
 \subsection SubmittingATask Submitting A Task
 
 Before submitting any tasks to StarPU, starpu_init() must be called. The
-<c>NULL</c> argument specifies that we use the default configuration. Tasks cannot
-be submitted after the termination of StarPU by a call to
-starpu_shutdown().
+<c>NULL</c> argument specifies that we use the default configuration.
+Tasks can then be submitted until the termination of StarPU -- done by a
+call to starpu_shutdown().
 
-In the example above, a task structure is allocated by a call to
-starpu_task_create(). This function only allocates and fills the
-corresponding structure with the default settings, but it does not
+In the example below, a task structure is allocated by a call to
+starpu_task_create(). This function allocates and fills the
+task structure with its default settings, it does not
 submit the task to StarPU.
 
-// not really clear ;)
-
 The field starpu_task::cl is a pointer to the codelet which the task will
 execute: in other words, the codelet structure describes which computational
 kernel should be offloaded on the different architectures, and the task
@@ -323,8 +321,7 @@ additional examples, is available in the directory <c>gcc-plugin/examples</c>
 of the StarPU distribution. These extensions map directly
 to StarPU's main concepts: tasks, task implementations for CPU,
 OpenCL, or CUDA, and registered data buffers. The standard C version
-that uses StarPU's standard C programming interface is given in \ref
-VectorScalingUsingStarPUAPI.
+that uses StarPU's standard C programming interface is given in \ref VectorScalingUsingStarPUAPI.
 
 First of all, the vector-scaling task and its simple CPU implementation
 has to be defined:
@@ -530,9 +527,9 @@ starpu_vector_data_register(&vector_handle, STARPU_MAIN_RAM, (uintptr_t)vector,
 \endcode
 
 The first argument, called the <b>data handle</b>, is an opaque pointer which
-designates the array in StarPU. This is also the structure which is used to
+designates the array within StarPU. This is also the structure which is used to
 describe which data is used by a task. The second argument is the node number
-where the data originally resides. Here it is STARPU_MAIN_RAM since the array <c>vector</c> is in
+where the data originally resides. Here it is ::STARPU_MAIN_RAM since the array <c>vector</c> is in
 the main memory. Then comes the pointer <c>vector</c> where the data can be found in main memory,
 the number of elements in the vector and the size of each element.
 The following shows how to construct a StarPU task that will manipulate the

+ 0 - 59
doc/doxygen/chapters/11debugging_tools.doxy

@@ -1,59 +0,0 @@
-/*
- * This file is part of the StarPU Handbook.
- * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
- * Copyright (C) 2011, 2012 INRIA
- * See the file version.doxy for copying conditions.
- */
-
-/*! \page DebuggingTools Debugging Tools
-
-StarPU provides several tools to help debugging applications. Execution traces
-can be generated and displayed graphically, see \ref
-GeneratingTracesWithFxT.
-
-Some gdb helpers are also provided to show the whole StarPU state:
-
-\verbatim
-(gdb) source tools/gdbinit
-(gdb) help starpu
-\endverbatim
-
-Valgrind can be used on StarPU: valgrind.h just needs to be found at ./configure
-time, to tell valgrind about some known false positives and disable host memory
-pinning. Other known false positives can be suppressed by giving the suppression
-files in tools/valgrind/ *.suppr to valgrind's --suppressions option.
-
-The environment variable \ref STARPU_DISABLE_KERNELS can also be set to 1 to make
-StarPU do everything (schedule tasks, transfer memory, etc.) except actually
-calling the application-provided kernel functions, i.e. the computation will not
-happen. This permits to quickly check that the task scheme is working properly.
-
-The Temanejo task debugger can also be used, see \ref UsingTheTemanejoTaskDebugger.
-
-\section UsingTheTemanejoTaskDebugger Using The Temanejo Task Debugger
-
-StarPU can connect to Temanejo >= 1.0rc2 (see
-http://www.hlrs.de/temanejo), to permit
-nice visual task debugging. To do so, build Temanejo's <c>libayudame.so</c>,
-install <c>Ayudame.h</c> to e.g. <c>/usr/local/include</c>, apply the
-<c>tools/patch-ayudame</c> to it to fix C build, re-<c>./configure</c>, make
-sure that it found it, rebuild StarPU.  Run the Temanejo GUI, give it the path
-to your application, any options you want to pass it, the path to <c>libayudame.so</c>.
-
-It permits to visualize the task graph, add breakpoints, continue execution
-task-by-task, and run gdb on a given task, etc.
-
-\image html temanejo.png
-\image latex temanejo.png "" width=\textwidth
-
-Make sure to specify at least the same number of CPUs in the dialog box as your
-machine has, otherwise an error will happen during execution. Future versions
-of Temanejo should be able to tell StarPU the number of CPUs to use.
-
-Tag numbers have to be below <c>4000000000000000000ULL</c> to be usable for
-Temanejo (so as to distinguish them from tasks).
-
-
-
-*/

doc/doxygen/chapters/03advanced_examples.doxy → doc/doxygen/chapters/201_advanced_examples.doxy


+ 105 - 35
doc/doxygen/chapters/05check_list_performance.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
  * Copyright (C) 2011, 2012 INRIA
  * See the file version.doxy for copying conditions.
  */
@@ -10,10 +10,16 @@
 
 TODO: improve!
 
-Simply encapsulating application kernels into tasks already permits to
-seamlessly support CPU and GPUs at the same time. To achieve good
+To achieve good
 performance, we give below a list of features which should be checked.
 
+\section ConfigurationImprovePerformance Configuration That May Improve Performance
+
+The \ref enable-fast "--enable-fast" configuration option disables all
+assertions. This makes StarPU more performant for really small tasks by
+disabling all sanity checks. Only use this for measurements and production, not for development, since this will drop all basic checks.
+
+
 \section DataRelatedFeaturesToImprovePerformance Data Related Features That May Improve Performance
 
 link to \ref DataManagement
@@ -34,7 +40,7 @@ link to \ref TaskSchedulingPolicy
 
 link to \ref TaskDistributionVsDataTransfer
 
-link to \ref Power-basedScheduling
+link to \ref Energy-basedScheduling
 
 link to \ref StaticScheduling
 
@@ -57,7 +63,7 @@ kernels. That will lower the potential for overlapping.
 
 Calling starpu_cublas_init() makes StarPU already do appropriate calls for the
 CUBLAS library. Some libraries like Magma may however change the current stream,
-one then has to call cublasSetKernelStream(starpu_cuda_get_local_stream()); at
+one then has to call <c>cublasSetKernelStream(starpu_cuda_get_local_stream())</c> at
 the beginning of the codelet to make sure that CUBLAS is really using the proper
 stream.
 
@@ -65,7 +71,7 @@ If the kernel can be made to only use this local stream or other self-allocated
 streams, i.e. the whole kernel submission can be made asynchronous, then
 one should enable asynchronous execution of the kernel.  That means setting
 the flag ::STARPU_CUDA_ASYNC in the corresponding field starpu_codelet::cuda_flags, and dropping the
-cudaStreamSynchronize() call at the end of the cuda_func function, so that it
+<c>cudaStreamSynchronize()</c> call at the end of the <c>cuda_func</c> function, so that it
 returns immediately after having queued the kernel to the local stream. That way, StarPU will be
 able to submit and complete data transfers while kernels are executing, instead of only at each
 kernel submission. The kernel just has to make sure that StarPU can use the
@@ -83,7 +89,7 @@ If the kernel can be made to only use the StarPU-provided command queue or other
 queues, i.e. the whole kernel submission can be made asynchronous, then
 one should enable asynchronous execution of the kernel. This means setting
 the flag ::STARPU_OPENCL_ASYNC in the corresponding field starpu_codelet::opencl_flags and dropping the
-clFinish() and starpu_opencl_collect_stats() calls at the end of the kernel, so
+<c>clFinish()</c> and starpu_opencl_collect_stats() calls at the end of the kernel, so
 that it returns immediately after having queued the kernel to the provided queue.
 That way, StarPU will be able to submit and complete data transfers while kernels are executing, instead of
 only at each kernel submission. The kernel just has to make sure
@@ -100,15 +106,15 @@ driver, etc.
 <c>export STARPU_WATCHDOG_TIMEOUT=10000</c> (\ref STARPU_WATCHDOG_TIMEOUT)
 
 allows to make StarPU print an error message whenever StarPU does not terminate
-any task for 10ms. In addition to that,
+any task for 10ms, but lets the application continue normally. In addition to that,
 
 <c>export STARPU_WATCHDOG_CRASH=1</c> (\ref STARPU_WATCHDOG_CRASH)
 
-raises SIGABRT in that condition, thus allowing to catch the situation in gdb.
-It can also be useful to type "handle SIGABRT nopass" in gdb to be able to let
+raises <c>SIGABRT</c> in that condition, thus allowing to catch the situation in gdb.
+It can also be useful to type <c>handle SIGABRT nopass</c> in <c>gdb</c> to be able to let
 the process continue, after inspecting the state of the process.
 
-\section HowToLimitMemoryPerNode How to limit memory used by StarPU and cache buffer allocations
+\section HowToLimitMemoryPerNode How to Limit Memory Used By StarPU And Cache Buffer Allocations
 
 By default, StarPU makes sure to use at most 90% of the memory of GPU devices,
 moving data in and out of the device as appropriate and with prefetch and
@@ -118,9 +124,9 @@ memory gets tight. This also means that by default StarPU will not cache buffer
 allocations in main memory, since it does not know how much of the system memory
 it can afford.
 
-In the case of GPUs, the \ref STARPU_LIMIT_CUDA_MEM, \ref
-STARPU_LIMIT_CUDA_devid_MEM, \ref STARPU_LIMIT_OPENCL_MEM, and \ref
-STARPU_LIMIT_OPENCL_devid_MEM environment variables can be used to control how
+In the case of GPUs, the \ref STARPU_LIMIT_CUDA_MEM, \ref STARPU_LIMIT_CUDA_devid_MEM,
+\ref STARPU_LIMIT_OPENCL_MEM, and \ref STARPU_LIMIT_OPENCL_devid_MEM environment variables
+can be used to control how
 much (in MiB) of the GPU device memory should be used at most by StarPU (their
 default values are 90% of the available memory).
 
@@ -133,27 +139,36 @@ involved, or if allocation fragmentation can become a problem), and when using
 
 It should be noted that by default only buffer allocations automatically
 done by StarPU are accounted here, i.e. allocations performed through
-<c>starpu_malloc_on_node()</c> which are used by the data interfaces
+starpu_malloc_on_node() which are used by the data interfaces
 (matrix, vector, etc.).  This does not include allocations performed by
 the application through e.g. malloc(). It does not include allocations
-performed through <c>starpu_malloc()</c> either, only allocations
-performed explicitly with the \ref STARPU_MALLOC_COUNT flag (i.e. through
-<c>starpu_malloc_flags(STARPU_MALLOC_COUNT)</c>) are taken into account.  If the
+performed through starpu_malloc() either, only allocations
+performed explicitly with the \ref STARPU_MALLOC_COUNT flag, i.e. by calling
+
+\code{.c}
+starpu_malloc_flags(STARPU_MALLOC_COUNT)
+\endcode
+
+are taken into account.  If the
 application wants to make StarPU aware of its own allocations, so that StarPU
 knows precisely how much data is allocated, and thus when to evict allocation
-caches or data out to the disk, \ref starpu_memory_allocate can be used to
-specify an amount of memory to be accounted for. \ref starpu_memory_deallocate
+caches or data out to the disk, starpu_memory_allocate() can be used to
+specify an amount of memory to be accounted for. starpu_memory_deallocate()
 can be used to account freed memory back. Those can for instance be used by data
-interfaces with dynamic data buffers: instead of using starpu_malloc_on_node,
+interfaces with dynamic data buffers: instead of using starpu_malloc_on_node(),
 they would dynamically allocate data with malloc/realloc, and notify starpu of
-the delta thanks to starpu_memory_allocate and starpu_memory_deallocate calls.
+the delta thanks to starpu_memory_allocate() and starpu_memory_deallocate() calls.
 
-\ref starpu_memory_get_total and \ref starpu_memory_get_available
+starpu_memory_get_total() and starpu_memory_get_available()
 can be used to get an estimation of how much memory is available.
-\ref starpu_memory_wait_available can also be used to block until an
-amount of memory becomes available (but it may be preferrable to use
-<c>starpu_memory_allocate(STARPU_MEMORY_WAIT)</c> to reserve that amount
-immediately).
+starpu_memory_wait_available() can also be used to block until an
+amount of memory becomes available, but it may be preferable to call
+
+\code{.c}
+starpu_memory_allocate(STARPU_MEMORY_WAIT)
+\endcode
+
+to reserve that amount immediately.
 
 \section HowToReduceTheMemoryFootprintOfInternalDataStructures How To Reduce The Memory Footprint Of Internal Data Structures
 
@@ -175,10 +190,33 @@ execution. For example, in the Cholesky factorization (dense linear algebra
 application), the GEMM task uses up to 3 buffers, so it is possible to set the
 maximum number of task buffers to 3 to run a Cholesky factorization on StarPU.
 
-\section HowtoReuseMemory How to reuse memory
+The size of the various structures of StarPU can be printed by 
+<c>tests/microbenchs/display_structures_size</c>.
+
+It is also often useless to submit *all* the tasks at the same time. One can
+make the starpu_task_submit() function block when a reasonable given number of
+tasks have been submitted, by setting the \ref STARPU_LIMIT_MIN_SUBMITTED_TASKS and
+\ref STARPU_LIMIT_MAX_SUBMITTED_TASKS environment variables, for instance:
+
+<c>
+export STARPU_LIMIT_MAX_SUBMITTED_TASKS=10000
+
+export STARPU_LIMIT_MIN_SUBMITTED_TASKS=9000
+</c>
+
+to make StarPU block submission when 10000 tasks are submitted, and unblock
+submission when only 9000 tasks are still submitted, i.e. 1000 tasks have
+completed among the 10000 that were submitted when submission was blocked. Of
+course this may reduce parallelism if the threshold is set too low. The precise
+balance depends on the application task graph.
+
+An idea of how much memory is used for tasks and data handles can be obtained by
+setting the \ref STARPU_MAX_MEMORY_USE environment variable to <c>1</c>.
+
+\section HowtoReuseMemory How To Reuse Memory
 
 When your application needs to allocate more data than the available amount of
-memory usable by StarPU (given by \ref starpu_memory_get_available() ), the
+memory usable by StarPU (given by starpu_memory_get_available()), the
 allocation cache system can reuse data buffers used by previously executed
 tasks. For that system to work with MPI tasks, you need to submit tasks progressively instead
 of as soon as possible, because in the case of MPI receives, the allocation cache check for reusing data
@@ -186,16 +224,16 @@ buffers will be done at submission time, not at execution time.
 
 You have two options to control the task submission flow. The first one is by
 controlling the number of submitted tasks during the whole execution. This can
-be done whether by setting the environment variables \ref
-STARPU_LIMIT_MAX_NSUBMITTED_TASKS and \ref STARPU_LIMIT_MIN_NSUBMITTED_TASKS to
+be done either by setting the environment variables
+\ref STARPU_LIMIT_MAX_SUBMITTED_TASKS and \ref STARPU_LIMIT_MIN_SUBMITTED_TASKS to
 tell StarPU when to stop submitting tasks and when to wake up and submit tasks
-again, or by explicitely calling \ref starpu_task_wait_for_n_submitted() in
+again, or by explicitly calling starpu_task_wait_for_n_submitted() in
 your application code for finest grain control (for example, between two
 iterations of a submission loop).
 
 The second option is to control the memory size of the allocation cache. This
-can be done in the application by using jointly \ref
-starpu_memory_get_available() and \ref starpu_memory_wait_available() to submit
+can be done in the application by using jointly
+starpu_memory_get_available() and starpu_memory_wait_available() to submit
 tasks only when there is enough memory space to allocate the data needed by the
 task, i.e when enough data are available for reuse in the allocation cache.
 
@@ -216,7 +254,7 @@ has not-so-stable performance. StarPU will force calibration (and thus ignore
 the current result) until 10 (<c>_STARPU_CALIBRATION_MINIMUM</c>) measurements have been
 made on each architecture, to avoid badly scheduling tasks just because the
 first measurements were not so good. Details on the current performance model status
-can be obtained from the command <c>starpu_perfmodel_display</c>: the <c>-l</c>
+can be obtained from the tool <c>starpu_perfmodel_display</c>: the <c>-l</c>
 option lists the available performance models, and the <c>-s</c> option permits
 to choose the performance model to be displayed. The result looks like:
 
@@ -312,4 +350,36 @@ Statistics on the execution can then be obtained by using <c>export
 STARPU_BUS_STATS=1</c> and <c>export STARPU_WORKER_STATS=1</c> .
  More details on performance feedback are provided in the next chapter.
 
+\section OverheadProfiling Overhead Profiling
+
+\ref OfflinePerformanceTools can already provide an idea of to what extent and
+which parts of StarPU bring overhead to the execution time. To get a more precise
+analysis of the parts of StarPU which bring the most overhead, <c>gprof</c> can be used.
+
+First, recompile and reinstall StarPU with <c>gprof</c> support:
+
+\code
+./configure --enable-perf-debug --disable-shared --disable-build-tests --disable-build-examples
+\endcode
+
+Make sure not to leave a dynamic version of StarPU in the target path: remove
+any remaining <c>libstarpu-*.so</c>
+
+Then relink your application with the static StarPU library, make sure that
+running <c>ldd</c> on your application does not mention any libstarpu
+(i.e. it's really statically-linked).
+
+\code
+gcc test.c -o test $(pkg-config --cflags starpu-1.3) $(pkg-config --libs starpu-1.3)
+\endcode
+
+Now you can run your application, and a <c>gmon.out</c> file should appear in the
+current directory, you can process it by running <c>gprof</c> on your application:
+
+\code
+gprof ./test
+\endcode
+
+That will dump an analysis of the time spent in StarPU functions.
+
 */

+ 53 - 16
doc/doxygen/chapters/06tasks.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
  * Copyright (C) 2011, 2012 INRIA
  * See the file version.doxy for copying conditions.
  */
@@ -95,12 +95,12 @@ starpu_task_insert(&dummy_big_cl,
 The whole code for this complex data interface is available in the
 directory <c>examples/basic_examples/dynamic_handles.c</c>.
 
-\section SettingVariableDataHandlesForATask Setting a Variable number Data Handles For a Task
+\section SettingVariableDataHandlesForATask Setting a Variable Number Of Data Handles For a Task
 
 Normally, the number of data handles given to a task is fixed in the
 starpu_codelet::nbuffers codelet field. This field can however be set to
-STARPU_VARIABLE_NBUFFERS, in which case the starpu_task::nbuffers task field
-must be set, and the starpu_task::modes field (or starpu_task_dyn_modes field,
+\ref STARPU_VARIABLE_NBUFFERS, in which case the starpu_task::nbuffers task field
+must be set, and the starpu_task::modes field (or starpu_task::dyn_modes field,
 see \ref SettingManyDataHandlesForATask) should be used to specify the modes for
 the handles.
 
@@ -294,7 +294,8 @@ And the call to the function starpu_task_insert():
 starpu_task_insert(&mycodelet,
                    STARPU_VALUE, &ifactor, sizeof(ifactor),
                    STARPU_VALUE, &ffactor, sizeof(ffactor),
-                   STARPU_RW, data_handles[0], STARPU_RW, data_handles[1],
+                   STARPU_RW, data_handles[0],
+		   STARPU_RW, data_handles[1],
                    0);
 \endcode
 
@@ -338,7 +339,9 @@ starpu_task_insert(&which_index, STARPU_W, i_handle, 0);
 
 /* And submit the corresponding task */
 STARPU_DATA_ACQUIRE_CB(i_handle, STARPU_R,
-                       starpu_task_insert(&work, STARPU_RW, A_handle[i], 0));
+                       starpu_task_insert(&work,
+		                          STARPU_RW, A_handle[i],
+					  0));
 \endcode
 
 The macro ::STARPU_DATA_ACQUIRE_CB submits an asynchronous request for
@@ -350,6 +353,41 @@ be executed, and is allowed to read from <c>i</c> to use it e.g. as an
 index. Note that this macro is only available when compiling StarPU with
 the compiler <c>gcc</c>.
 
+There are several ways of calling the function starpu_codelet_unpack_args().
+
+\code{.c}
+void func_cpu(void *descr[], void *_args)
+{
+        int ifactor;
+        float ffactor;
+
+        starpu_codelet_unpack_args(_args, &ifactor, &ffactor);
+}
+\endcode
+
+\code{.c}
+void func_cpu(void *descr[], void *_args)
+{
+        int ifactor;
+        float ffactor;
+
+        starpu_codelet_unpack_args(_args, &ifactor, NULL);
+        starpu_codelet_unpack_args(_args, &ifactor, &ffactor);
+}
+\endcode
+
+\code{.c}
+void func_cpu(void *descr[], void *_args)
+{
+        int ifactor;
+        float ffactor;
+	char buffer[100];
+
+        starpu_codelet_unpack_args_and_copyleft(_args, buffer, 100, &ifactor, NULL);
+        starpu_codelet_unpack_args(buffer, &ffactor);
+}
+\endcode
+
 \section GettingTaskChildren Getting Task Children
 
 It may be interesting to get the list of tasks which depend on a given task,
@@ -385,7 +423,7 @@ allowed to start to achieve the computation. The CPU binding mask for the whole
 set of CPUs is already enforced, so that threads created by the function will
 inherit the mask, and thus execute where StarPU expected, the OS being in charge
 of choosing how to schedule threads on the corresponding CPUs. The application
-can also choose to bind threads by hand, using e.g. sched_getaffinity to know
+can also choose to bind threads by hand, using e.g. <c>sched_getaffinity</c> to know
 the CPU binding mask that StarPU chose.
 
 For instance, using OpenMP (full source is available in
@@ -458,13 +496,12 @@ structure as detected by <c>hwloc</c>. It means that for each object of the <c>h
 topology (NUMA node, socket, cache, ...) a combined worker will be created. If
 some nodes of the hierarchy have a big arity (e.g. many cores in a socket
 without a hierarchy of shared caches), StarPU will create combined workers of
-intermediate sizes. The variable \ref
-STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER permits to tune the maximum
-arity between levels of combined workers.
+intermediate sizes. The variable \ref STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER
+permits to tune the maximum arity between levels of combined workers.
 
 The combined workers actually produced can be seen in the output of the
-tool <c>starpu_machine_display</c> (the environment variable \ref
-STARPU_SCHED has to be set to a combined worker-aware scheduler such
+tool <c>starpu_machine_display</c> (the environment variable
+\ref STARPU_SCHED has to be set to a combined worker-aware scheduler such
 as <c>pheft</c> or <c>peager</c>).
 
 \subsection ConcurrentParallelTasks Concurrent Parallel Tasks
@@ -489,7 +526,7 @@ CPU and GPU tasks are not affected and can be run concurrently). The parallel
 task scheduler will however still try varying combined worker
 sizes to look for the most efficient ones.
 
-\subsection SynchronizationTasks Synchronization tasks
+\subsection SynchronizationTasks Synchronization Tasks
 
 For the application's convenience, it may be useful to define tasks which do not
 actually make any computation, but bear for instance dependencies between other
@@ -498,13 +535,13 @@ tasks or tags, or to be submitted in callbacks, etc.
 The obvious way is of course to make kernel functions empty, but such task will
 thus have to wait for a worker to become ready, transfer data, etc.
 
-A much lighter way to define a synchronization task is to set its <c>cl</c>
+A much lighter way to define a synchronization task is to set its starpu_task::cl
 field to <c>NULL</c>. The task will thus be a mere synchronization point,
 without any data access or execution content: as soon as its dependencies become
 available, it will terminate, call the callbacks, and release dependencies.
 
-An intermediate solution is to define a codelet with its <c>where</c> field set
-to STARPU_NOWHERE, for instance this:
+An intermediate solution is to define a codelet with its
+starpu_codelet::where field set to \ref STARPU_NOWHERE, for instance:
 
 \code{.c}
 struct starpu_codelet {

+ 28 - 24
doc/doxygen/chapters/07data_management.doxy

@@ -1,14 +1,14 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
  * Copyright (C) 2011, 2012 INRIA
  * See the file version.doxy for copying conditions.
  */
 
 /*! \page DataManagement Data Management
 
-intro qui parle de coherency entre autres
+TODO: intro qui parle de coherency entre autres
 
 \section DataManagement Data Management
 
@@ -111,7 +111,7 @@ starpu_data_idle_prefetch_on_node() variant can be used to issue the transfer
 only when the bus is idle.
 
 Conversely, one can advise StarPU that some data will not be useful in the
-close future by calling starpu_data_wont_use. StarPU will then write its value
+close future by calling starpu_data_wont_use(). StarPU will then write its value
 back to its home node, and evict it from GPUs when room is needed.
 
 \section PartitioningData Partitioning Data
@@ -194,7 +194,7 @@ but applications can also write their own data interfaces and filters, see
 \section AsynchronousPartitioning Asynchronous Partitioning
 
 The partitioning functions described in the previous section are synchronous:
-starpu_data_partition and starpu_data_unpartition both wait for all the tasks
+starpu_data_partition() and starpu_data_unpartition() both wait for all the tasks
 currently working on the data.  This can be a bottleneck for the application.
 
 An asynchronous API also exists, it works only on handles with sequential
@@ -217,20 +217,22 @@ struct starpu_data_filter f_vert =
 starpu_data_partition_plan(handle, &f_vert, vert_handle);
 \endcode
 
-starpu_data_partition_plan returns the handles for the partition in vert_handle.
+starpu_data_partition_plan() returns the handles for the partition in <c>vert_handle</c>.
 
-One can submit tasks working on the main handle, but not yet on the vert_handle
+One can submit tasks working on the main handle, but not yet on the <c>vert_handle</c>
 handles. Now we submit the partitioning:
 
 \code{.c}
 starpu_data_partition_submit(handle, PARTS, vert_handle);
 \endcode
 
-And now we can submit tasks working on vert_handle handles (and not on the main
+And now we can submit tasks working on <c>vert_handle</c> handles (and not on the main
 handle any more). Eventually we want to work on the main handle again, so we
 submit the unpartitioning:
 
+\code{.c}
 starpu_data_unpartition_submit(handle, PARTS, vert_handle, -1);
+\endcode
 
 And now we can submit tasks working on the main handle again.
 
@@ -242,7 +244,7 @@ to submit unpartitioning (to get back to the initial handle) before submitting
 another partitioning.
 
 It is also possible to activate several partitioning at the same time, in
-read-only mode, by using starpu_data_partition_readonly_submit.  A complete
+read-only mode, by using starpu_data_partition_readonly_submit(). A complete
 example is available in <c>examples/filters/fmultiple_submit_readonly.c</c>.
 
 \section ManualPartitioning Manual Partitioning
@@ -441,16 +443,16 @@ properly be serialized against accesses with this flag. For instance:
         0);
 \endcode
 
-The two tasks running cl2 will be able to commute: depending on whether the
-value of handle1 or handle2 becomes available first, the corresponding task
-running cl2 will start first. The task running cl1 will however always be run
-before them, and the task running cl3 will always be run after them.
+The two tasks running <c>cl2</c> will be able to commute: depending on whether the
+value of <c>handle1</c> or <c>handle2</c> becomes available first, the corresponding task
+running <c>cl2</c> will start first. The task running <c>cl1</c> will however always be run
+before them, and the task running <c>cl3</c> will always be run after them.
 
 If a lot of tasks use the commute access on the same set of data and a lot of
 them are ready at the same time, it may become interesting to use an arbiter,
-see \ref ConcurrentDataAccess .
+see \ref ConcurrentDataAccess.
 
-\section ConcurrentDataAccess Concurrent Data accesses
+\section ConcurrentDataAccess Concurrent Data Accesses
 
 When several tasks are ready and will work on several data, StarPU is faced with
 the classical Dining Philosophers problem, and has to determine the order in
@@ -472,9 +474,9 @@ be avoided by using several arbiters, thus separating sets of data for which
 arbitration will be done.  If a task accesses data from different arbiters, it
 will acquire them arbiter by arbiter, in arbiter pointer value order.
 
-See the tests/datawizard/test_arbiter.cpp example.
+See the <c>tests/datawizard/test_arbiter.cpp</c> example.
 
-Arbiters however do not support the STARPU_REDUX flag yet.
+Arbiters however do not support the ::STARPU_REDUX flag yet.
 
 \section TemporaryBuffers Temporary Buffers
 
@@ -519,11 +521,13 @@ codelet is needed).
 Some kernels sometimes need temporary data to achieve the computations, i.e. a
 workspace. The application could allocate it at the start of the codelet
 function, and free it at the end, but that would be costly. It could also
-allocate one buffer per worker (similarly to \ref
-HowToInitializeAComputationLibraryOnceForEachWorker), but that would
+allocate one buffer per worker (similarly to \ref HowToInitializeAComputationLibraryOnceForEachWorker),
+but that would
 make them systematic and permanent. A more  optimized way is to use
 the data access mode ::STARPU_SCRATCH, as exemplified below, which
-provides per-worker buffers without content consistency.
+provides per-worker buffers without content consistency. The buffer is
+registered only once, using memory node <c>-1</c>, i.e. the application didn't allocate
+memory for it, and StarPU will allocate it on demand at task execution.
 
 \code{.c}
 starpu_vector_data_register(&workspace, -1, 0, sizeof(float));
@@ -536,7 +540,7 @@ StarPU will make sure that the buffer is allocated before executing the task,
 and make this allocation per-worker: for CPU workers, notably, each worker has
 its own buffer. This means that each task submitted above will actually have its
 own workspace, which will actually be the same for all tasks running one after
-the other on the same worker. Also, if for instance GPU memory becomes scarce,
+the other on the same worker. Also, if for instance memory becomes scarce,
 StarPU will notice that it can free such buffers easily, since the content does
 not matter.
 
@@ -694,16 +698,16 @@ The whole code for this complex data interface is available in the
 directory <c>examples/interface/</c>.
 
 
-\section SpecifyingATargetNode Specifying a target node for task data
+\section SpecifyingATargetNode Specifying A Target Node For Task Data
 
 When executing a task on a GPU for instance, StarPU would normally copy all the
 needed data for the tasks on the embedded memory of the GPU.  It may however
 happen that the task kernel would rather have some of the data kept in the
 main memory instead of copied in the GPU, a pivoting vector for instance.
 This can be achieved by setting the starpu_codelet::specific_nodes flag to
-1, and then fill the starpu_codelet::nodes array (or starpu_codelet::dyn_nodes when
-starpu_codelet::nbuffers is greater than STARPU_NMAXBUFS) with the node numbers
-where data should be copied to, or -1 to let StarPU copy it to the memory node
+<c>1</c>, and then fill the starpu_codelet::nodes array (or starpu_codelet::dyn_nodes when
+starpu_codelet::nbuffers is greater than \ref STARPU_NMAXBUFS) with the node numbers
+where data should be copied to, or <c>-1</c> to let StarPU copy it to the memory node
 where the task will be executed. For instance, with the following codelet:
 
 \code{.c}

+ 94 - 15
doc/doxygen/chapters/08scheduling.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
  * Copyright (C) 2011, 2012 INRIA
  * See the file version.doxy for copying conditions.
  */
@@ -93,38 +93,38 @@ latter is estimated based on bus calibration before execution start,
 i.e. with an idle machine, thus without contention. You can force bus
 re-calibration by running the tool <c>starpu_calibrate_bus</c>. The
 beta parameter defaults to <c>1</c>, but it can be worth trying to tweak it
-by using <c>export STARPU_SCHED_BETA=2</c> for instance, since during
+by using <c>export STARPU_SCHED_BETA=2</c> (\ref STARPU_SCHED_BETA) for instance, since during
 real application execution, contention makes transfer times bigger.
 This is of course imprecise, but in practice, a rough estimation
 already gives the good results that a precise estimation would give.
 
-\section Power-basedScheduling Power-based Scheduling
+\section Energy-basedScheduling Energy-based Scheduling
 
-If the application can provide some power performance model (through
-the field starpu_codelet::power_model), StarPU will
+If the application can provide some energy consumption performance model (through
+the field starpu_codelet::energy_model), StarPU will
 take it into account when distributing tasks. The target function that
 the scheduler <c>dmda</c> minimizes becomes <c>alpha * T_execution +
 beta * T_data_transfer + gamma * Consumption</c> , where <c>Consumption</c>
 is the estimated task consumption in Joules. To tune this parameter, use
-<c>export STARPU_SCHED_GAMMA=3000</c> for instance, to express that each Joule
+<c>export STARPU_SCHED_GAMMA=3000</c> (\ref STARPU_SCHED_GAMMA) for instance, to express that each Joule
 (i.e kW during 1000us) is worth 3000us execution time penalty. Setting
-<c>alpha</c> and <c>beta</c> to zero permits to only take into account power consumption.
+<c>alpha</c> and <c>beta</c> to zero permits to only take into account energy consumption.
 
-This is however not sufficient to correctly optimize power: the scheduler would
+This is however not sufficient to correctly optimize energy: the scheduler would
 simply tend to run all computations on the most energy-conservative processing
 unit. To account for the consumption of the whole machine (including idle
 processing units), the idle power of the machine should be given by setting
-<c>export STARPU_IDLE_POWER=200</c> for 200W, for instance. This value can often
+<c>export STARPU_IDLE_POWER=200</c> (\ref STARPU_IDLE_POWER) for 200W, for instance. This value can often
 be obtained from the machine power supplier.
 
-The power actually consumed by the total execution can be displayed by setting
+The energy actually consumed by the total execution can be displayed by setting
 <c>export STARPU_PROFILING=1 STARPU_WORKER_STATS=1</c> .
 
 On-line task consumption measurement is currently only supported through the
 <c>CL_PROFILING_POWER_CONSUMED</c> OpenCL extension, implemented in the MoviSim
 simulator. Applications can however provide explicit measurements by
 using the function starpu_perfmodel_update_history() (exemplified in \ref PerformanceModelExample
-with the <c>power_model</c> performance model). Fine-grain
+with the <c>energy_model</c> performance model). Fine-grain
 measurement is often not feasible with the feedback provided by the hardware, so
 the user can for instance run a given task a thousand times, measure the global
 consumption for that series of tasks, divide it by a thousand, repeat for
@@ -170,7 +170,7 @@ statically scheduling tasks.
 A full example showing how to define a new scheduling policy is available in
 the StarPU sources in the directory <c>examples/scheduler/</c>.
 
-See \ref API_Scheduling_Policy
+The scheduler has to provide methods:
 
 \code{.c}
 static struct starpu_sched_policy dummy_sched_policy = {
@@ -179,13 +179,92 @@ static struct starpu_sched_policy dummy_sched_policy = {
     .add_workers = dummy_sched_add_workers,
     .remove_workers = dummy_sched_remove_workers,
     .push_task = push_task_dummy,
-    .push_prio_task = NULL,
     .pop_task = pop_task_dummy,
-    .post_exec_hook = NULL,
-    .pop_every_task = NULL,
     .policy_name = "dummy",
     .policy_description = "dummy scheduling strategy"
 };
 \endcode
 
+The idea is that when a task becomes ready for execution, the
+starpu_sched_policy::push_task method is called. When a worker is idle, the
+starpu_sched_policy::pop_task method is called to get a task. It is up to the
+scheduler to implement what is between. A simple eager scheduler is for instance
+to make starpu_sched_policy::push_task push the task to a global list, and make
+starpu_sched_policy::pop_task pop from that list.
+
+The \ref starpu_sched_policy section provides the exact rules that govern the
+methods of the policy.
+
+Make sure to have a look at the \ref API_Scheduling_Policy section, which
+provides a list of the available functions for writing advanced schedulers, such
+as starpu_task_expected_length(), starpu_task_expected_data_transfer_time(),
+starpu_task_expected_energy(), etc. Other
+useful functions include starpu_transfer_bandwidth(), starpu_transfer_latency(),
+starpu_transfer_predict(), ...
+
+Usual functions can also be used on tasks, for instance one can do
+
+\code{.c}
+size = 0;
+write = 0;
+if (task->cl)
+    for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task); i++)
+    {
+        starpu_data_handle_t data = STARPU_TASK_GET_HANDLE(task, i);
+	size_t datasize = starpu_data_get_size(data);
+        size += datasize;
+	if (STARPU_TASK_GET_MODE(task, i) & STARPU_W)
+	    write += datasize;
+    }
+\endcode
+
+And various queues can be used in schedulers. A variety of examples of
+schedulers can be read in <c>src/sched_policies</c>, for
+instance <c>random_policy.c</c>, <c>eager_central_policy.c</c>,
+<c>work_stealing_policy.c</c>
+
+\section GraphScheduling Graph-based Scheduling
+
+For performance reasons, most of the schedulers shipped with StarPU use simple
+list-scheduling heuristics, assuming that the application has already set
+priorities.  That is why they do their scheduling between when tasks become
+available for execution and when a worker becomes idle, without looking at the
+task graph.
+
+Other heuristics can however look at the task graph. Recording the task graph
+is expensive, so it is not available by default, the scheduling heuristic has
+to set _starpu_graph_record to 1 from the initialization function, to make it
+available. Then the <c>_starpu_graph*</c> functions can be used.
+
+<c>src/sched_policies/graph_test_policy.c</c> is an example of simple greedy
+policy which automatically computes priorities by bottom-up rank.
+
+The idea is that while the application submits tasks, they are only pushed
+to a bag of tasks. When the application is finished with submitting tasks,
+it calls starpu_do_schedule() (or starpu_task_wait_for_all(), which calls
+starpu_do_schedule()), and the starpu_sched_policy::do_schedule method of the
+scheduler is called. This method calls _starpu_graph_compute_depths to compute
+the bottom-up ranks, and then uses these ranks to set priorities over tasks.
+
+It then has two priority queues, one for CPUs, and one for GPUs, and uses a dumb
+heuristic based on the duration of the task over CPUs and GPUs to decide between
+the two queues. CPU workers can then pop from the CPU priority queue, and GPU
+workers from the GPU priority queue.
+
+\section DebuggingScheduling Debugging Scheduling
+
+All the \ref OnlinePerformanceTools and \ref OfflinePerformanceTools can
+be used to get information about how well the execution proceeded, and thus the
+overall quality of the execution.
+
+Precise debugging can also be performed by using the
+\ref STARPU_TASK_BREAK_ON_SCHED, \ref STARPU_TASK_BREAK_ON_PUSH, and
+\ref STARPU_TASK_BREAK_ON_POP environment variables. By setting the job_id of a task
+in these environment variables, StarPU will raise <c>SIGTRAP</c> when the task is being
+scheduled, pushed, or popped by the scheduler. That means that when one notices
+that a task is being scheduled in a seemingly odd way, one can just reexecute
+the application in a debugger, with some of those variables set, and the
+execution will stop exactly at the scheduling points of that task, thus allowing
+to inspect the scheduler state, etc.
+
 */

+ 59 - 39
doc/doxygen/chapters/09scheduling_contexts.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
 //  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2016  CNRS
  * Copyright (C) 2011, 2012 INRIA
  * See the file version.doxy for copying conditions.
  */
@@ -14,10 +14,11 @@ TODO: improve!
 
 Scheduling contexts represent abstracts sets of workers that allow the
 programmers to control the distribution of computational resources
-(i.e. CPUs and GPUs) to concurrent parallel kernels. The main goal is
+(i.e. CPUs and GPUs) to concurrent kernels. The main goal is
 to minimize interferences between the execution of multiple parallel
 kernels, by partitioning the underlying pool of workers using
-contexts.
+contexts. Scheduling contexts additionally allow a user to make use of
+a different scheduling policy depending on the target resource set.
 
 
 \section CreatingAContext Creating A Context
@@ -25,36 +26,41 @@ contexts.
 By default, the application submits tasks to an initial context, which
 disposes of all the computation resources available to StarPU (all
 the workers). If the application programmer plans to launch several
-parallel kernels simultaneously, by default these kernels will be
+kernels simultaneously, by default these kernels will be
 executed within this initial context, using a single scheduler
 policy(see \ref TaskSchedulingPolicy). Meanwhile, if the application
 programmer is aware of the demands of these kernels and of the
 specificity of the machine used to execute them, the workers can be
 divided between several contexts. These scheduling contexts will
 isolate the execution of each kernel and they will permit the use of a
-scheduling policy proper to each one of them. 
-
-Scheduling Contexts may be created in two ways: either the programmers indicates
-the set of workers corresponding to each context (providing he knows the 
-identifiers of the workers running within StarPU), or the programmer
-does not provide any worker list and leaves the Hypervisor assign
-workers to each context according to their needs (\ref SchedulingContextHypervisor)
-
-Both cases require a call to the function <c>starpu_sched_ctx_create</c>, which 
-requires as input the worker list (the exact list or a NULL pointer) and the scheduling
-policy. The latter one can be a character list corresponding to the name of a StarPU
-predefined policy or the pointer to a custom policy. The function returns 
-an identifier of the context created which you will use to
-indicate the context you want to submit the tasks to.
-
+scheduling policy proper to each one of them.
+
+Scheduling Contexts may be created in two ways: either the programmer
+indicates the set of workers corresponding to each context (providing
+he knows the identifiers of the workers running within StarPU), or the
+programmer does not provide any worker list and lets the Hypervisor
+assign workers to each context according to their needs (\ref
+SchedulingContextHypervisor).
+
+Both cases require a call to the function
+starpu_sched_ctx_create(), which requires as input the worker
+list (the exact list or a <c>NULL</c> pointer), the amount of workers
+(or <c>-1</c> to designate all workers on the platform) and a list of
+optional parameters such as the scheduling policy, terminated by a
+<c>0</c>. The scheduling policy can be a character list corresponding
+to the name of a StarPU predefined policy or the pointer to a custom
+policy. The function returns an identifier of the context created
+which you will use to indicate the context you want to submit the
+tasks to.
 
 \code{.c}
 /* the list of resources the context will manage */
 int workerids[3] = {1, 3, 10};
 
-/* indicate the scheduling policy to be used within the context, the list of
-   workers assigned to it, the number of workers, the name of the context */
-int id_ctx = starpu_sched_ctx_create("dmda", workerids, 3, "my_ctx");
+/* indicate the list of workers assigned to it, the number of workers,
+the name of the context and the scheduling policy to be used within
+the context */
+int id_ctx = starpu_sched_ctx_create(workerids, 3, "my_ctx", STARPU_SCHED_CTX_POLICY_NAME, "dmda", 0);
 
 /* let StarPU know that the following tasks will be submitted to this context */
 starpu_sched_ctx_set_task_context(id);
@@ -66,11 +72,36 @@ starpu_task_submit(task);
 Note: Parallel greedy and parallel heft scheduling policies do not support the existence of several disjoint contexts on the machine.
 Combined workers are constructed depending on the entire topology of the machine, not only the one belonging to a context.
 
+\subsection CreatingAContextWithTheDefaultBehavior Creating A Context With The Default Behavior
+
+If <b>no scheduling policy</b> is specified when creating the context,
+it will be used as <b>another type of resource</b>: a cluster. A
+cluster is a context without scheduler (eventually delegated to
+another runtime). For more information see \ref ClusteringAMachine. It
+is therefore <b>mandatory</b> to stipulate a scheduler to use the
+contexts in this traditional way.
+
+To create a <b>context</b> with the default scheduler, that is either
+controlled through the environment variable <c>STARPU_SCHED</c> or the
+StarPU default scheduler, one can explicitly use the option <c>STARPU_SCHED_CTX_POLICY_NAME, NULL</c> as in the following example:
+
+\code{.c}
+/* the list of resources the context will manage */
+int workerids[3] = {1, 3, 10};
+
+/* indicate the list of workers assigned to it, the number of workers,
+and use the default scheduling policy. */
+int id_ctx = starpu_sched_ctx_create(workerids, 3, "my_ctx", STARPU_SCHED_CTX_POLICY_NAME, NULL, 0);
+
+/* .... */
+\endcode
+
+
 \section ModifyingAContext Modifying A Context
 
-A scheduling context can be modified dynamically. The applications may
+A scheduling context can be modified dynamically. The application may
 change its requirements during the execution and the programmer can
-add additional workers to a context or remove if no longer needed. In
+add additional workers to a context or remove those no longer needed. In
 the following example we have two scheduling contexts
 <c>sched_ctx1</c> and <c>sched_ctx2</c>. After executing a part of the
 tasks some of the workers of <c>sched_ctx1</c> will be moved to
@@ -90,13 +121,13 @@ starpu_sched_ctx_remove_workers(workerids, 3, sched_ctx1);
 \section SubmittingTasksToAContext Submitting Tasks To A Context
 The application may submit tasks to several contexts either 
 simultaneously or sequentially. If several threads of submission
-are used the function <c>starpu_sched_ctx_set_context</c> may be called just
-before <c>starpu_task_submit</c>. Thus StarPU considers that 
+are used the function starpu_sched_ctx_set_context() may be called just
+before starpu_task_submit(). Thus StarPU considers that 
 the current thread will submit tasks to the corresponding context.
  
 When the application may not assign a thread of submission to each
 context, the id of the context must be indicated by using the
-function <c>starpu_task_submit_to_ctx</c> or the field <c>STARPU_SCHED_CTX</c> 
+function starpu_task_submit_to_ctx() or the field \ref STARPU_SCHED_CTX 
 for starpu_task_insert().
 
 \section DeletingAContext Deleting A Context
@@ -144,17 +175,6 @@ the contexts these tasks start being submitted. However, if resources
 are never allocated to the context the program will not terminate. 
 If these tasks have low
 priority the programmer can forbid the application to submit them
-by calling the function <c>starpu_sched_ctx_stop_task_submission()</c>.
-
-\section ContextsSharingWorkers Contexts Sharing Workers
-
-Contexts may share workers when a single context cannot execute
-efficiently enough alone on these workers or when the application
-decides to express a hierarchy of contexts. The workers apply an
-alogrithm of ``Round-Robin'' to chose the context on which they will
-``pop'' next. By using the function
-<c>starpu_sched_ctx_set_turn_to_other_ctx</c>, the programmer can impose
-the <c>workerid</c> to ``pop'' in the context <c>sched_ctx_id</c>
-next.
+by calling the function starpu_sched_ctx_stop_task_submission().
 
 */

+ 29 - 25
doc/doxygen/chapters/10scheduling_context_hypervisor.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
  * Copyright (C) 2011, 2012 INRIA
  * See the file version.doxy for copying conditions.
  */
@@ -13,12 +13,12 @@
 StarPU proposes a platform to construct Scheduling Contexts, to
 delete and modify them dynamically. A parallel kernel, can thus
 be isolated into a scheduling context and interferences between
-several parallel kernels are avoided. If the user knows exactly how
-many workers each scheduling context needs, he can assign them to the
+several parallel kernels are avoided. If users know exactly how
+many workers each scheduling context needs, they can assign them to the
 contexts at their creation time or modify them during the execution of
 the program.
 
-The Scheduling Context Hypervisor Plugin is available for the users
+The Scheduling Context Hypervisor Plugin is available for users
 who do not dispose of a regular parallelism, who cannot know in
 advance the exact size of the context and need to resize the contexts
 according to the behavior of the parallel kernels.
@@ -46,23 +46,23 @@ The hypervisor resizes only the registered contexts.
 
 The runtime provides the hypervisor with information concerning the
 behavior of the resources and the application. This is done by using
-the <c>performance_counters</c> which represent callbacks indicating 
-when the resources are idle or not efficient, when the application 
+the <c>performance_counters</c> which represent callbacks indicating
+when the resources are idle or not efficient, when the application
+submits tasks or when it becomes too slow.
 
 \section TriggerTheHypervisor Trigger the Hypervisor
 
-The resizing is triggered either when the application requires it 
-(<c> sc_hypervisor_resize_ctxs </c>) or
+The resizing is triggered either when the application requires it
+(sc_hypervisor_resize_ctxs()) or
 when the initial distribution of resources alters the performance of
 the application (the application is too slow or the resources are idle
-for too long time). If the environment 
-variable <c>SC_HYPERVISOR_TRIGGER_RESIZE</c> is set to <c>speed</c> 
+for too long time). If the environment
+variable \ref SC_HYPERVISOR_TRIGGER_RESIZE is set to <c>speed</c>
 the monitored speed of the contexts is compared to a theoretical value
 computed with a linear program, and the resizing is triggered
-whenever the two values do not correspond. Otherwise, if the environment 
+whenever the two values do not correspond. Otherwise, if the environment
 variable is set to <c>idle</c> the hypervisor triggers the resizing algorithm
-whenever the workers are idle for a period longer than the threshold 
+whenever the workers are idle for a period longer than the threshold
 indicated by the programmer. When this
 happens, different resizing strategies are applied that target minimizing
 the total execution of the application, the instant speed or the idle
@@ -72,9 +72,9 @@ time of the resources.
 
 The plugin proposes several strategies for resizing the scheduling context.
 
-The <b>Application driven</b> strategy uses the user's input concerning the moment when he wants to resize the contexts.
-Thus, the users tags the task that should trigger the resizing
-process. We can set directly the field starpu_task::hypervisor_tag or
+The <b>Application driven</b> strategy uses users' input concerning the moment when they want to resize the contexts.
+Thus, users tag the task that should trigger the resizing
+process. One can set directly the field starpu_task::hypervisor_tag or
 use the macro ::STARPU_HYPERVISOR_TAG in the function
 starpu_task_insert().
 
@@ -91,13 +91,13 @@ starpu_task_insert(&codelet,
                     0);
 \endcode
 
-Then the user has to indicate that when a task with the specified tag is executed the contexts should resize.
+Then users have to indicate that when a task with the specified tag is executed the contexts should resize.
 
 \code{.c}
 sc_hypervisor_resize(sched_ctx, 2);
 \endcode
 
-The user can use the same tag to change the resizing configuration of the contexts if he considers it necessary.
+Users can use the same tag to change the resizing configuration of the contexts if they consider it necessary.
 
 \code{.c}
 sc_hypervisor_ctl(sched_ctx,
@@ -109,7 +109,7 @@ sc_hypervisor_ctl(sched_ctx,
 
 
 The <b>Idleness</b> based strategy moves workers unused in a certain context to another one needing them.
-(see \ref UsersInputInTheResizingProcess "Users’ Input In The Resizing Process")
+(see \ref API_SC_Hypervisor_usage)
 
 \code{.c}
 int workerids[3] = {1, 3, 10};
@@ -122,13 +122,17 @@ sc_hypervisor_ctl(sched_ctx_id,
 
 The <b>Gflops rate</b> based strategy resizes the scheduling contexts such that they all finish at the same time.
 The speed of each of them is computed and once one of them is significantly slower the resizing process is triggered.
-In order to do these computations the user has to input the total number of instructions needed to be executed by the
+In order to do these computations users have to input the total number of instructions needed to be executed by the
 parallel kernels and the number of instructions to be executed by each
 task.
 
 The number of flops to be executed by a context are passed as
  parameter when they are registered to the hypervisor,
- (<c>sc_hypervisor_register_ctx(sched_ctx_id, flops)</c>) and the one
+\code{.c}
+sc_hypervisor_register_ctx(sched_ctx_id, flops)
+\endcode
+
+and the one
  to be executed by each task are passed when the task is submitted.
  The corresponding field is starpu_task::flops and the corresponding
  macro in the function starpu_task_insert() is ::STARPU_FLOPS
@@ -154,12 +158,12 @@ such that the application finishes in a minimum amount of time. As for the <b>Gf
 strategy the programmers has to indicate the total number of flops to be executed
 when registering the context. This number of flops may be updated dynamically during the execution
 of the application whenever this information is not very accurate from the beginning.
-The function <c>sc_hypervisor_update_diff_total_flop </c> is called in order add or remove
+The function sc_hypervisor_update_diff_total_flops() is called in order to add or to remove
 a difference to the flops left to be executed.
-Tasks are provided also the number of flops corresponding to each one of them. During the 
+Tasks are also provided with the number of flops corresponding to each one of them. During the
 execution of the application the hypervisor monitors the consumed flops and recomputes
 the time left and the number of resources to use. The speed of each type of resource
-is (re)evaluated and inserter in the linear program in order to better adapt to the 
+is (re)evaluated and inserted in the linear program in order to better adapt to the
 needs of the application.
 
 The <b>Teft</b> strategy uses a linear program too, that considers all the types of tasks
@@ -170,12 +174,12 @@ in order to have good predictions of the execution time of each type of task.
 The types of tasks may be determined directly by the hypervisor when they are submitted.
 However there are applications that do not expose all the graph of tasks from the beginning.
 In this case in order to let the hypervisor know about all the tasks the function
-<c> sc_hypervisor_set_type_of_task </c> will just inform the hypervisor about future tasks
+sc_hypervisor_set_type_of_task() will just inform the hypervisor about future tasks
 without submitting them right away.
 
 The <b>Ispeed </b> strategy divides the execution of the application in several frames.
 For each frame the hypervisor computes the speed of the contexts and tries making them
-run at the same speed. The strategy requires less contribution from the user as
+run at the same speed. The strategy requires less contribution from users as
 the hypervisor requires only the size of the frame in terms of flops.
 
 \code{.c}

+ 53 - 54
doc/doxygen/chapters/modularized_scheduler.doxy

@@ -9,8 +9,8 @@
 
 \section Introduction
 
-StarPU's Modularized Schedulers are made of individual Scheduling Components 
-Modularizedly assembled as a Scheduling Tree. Each Scheduling Component has an 
+StarPU's Modularized Schedulers are made of individual Scheduling Components
+Modularizedly assembled as a Scheduling Tree. Each Scheduling Component has a
 unique purpose, such as prioritizing tasks or mapping tasks over resources.
 A typical Scheduling Tree is shown below.
 
@@ -21,30 +21,30 @@ A typical Scheduling Tree is shown below.
                                   v
                             Fifo_Component
                                 |  ^
-                                |  |        
+                                |  |
                                 v  |
                            Eager_Component
                                 |  ^
-                                |  |    
+                                |  |
                                 v  |
                  --------><--------------><--------
                  |  ^                          |  ^
-                 |  |                          |  |        
+                 |  |                          |  |
                  v  |                          v  |
              Fifo_Component                 Fifo_Component
                  |  ^                          |  ^
-                 |  |                          |  |        
+                 |  |                          |  |
                  v  |                          v  |
             Worker_Component               Worker_Component
 </pre>
 
 When a task is pushed by StarPU in a Modularized Scheduler, the task moves from
 a Scheduling Component to an other, following the hierarchy of the
-Scheduling Tree, and is stored in one of the Scheduling Components of the 
+Scheduling Tree, and is stored in one of the Scheduling Components of the
 strategy.
 When a worker wants to pop a task from the Modularized Scheduler, the
-corresponding Worker Component of the Scheduling Tree tries to pull a task from 
-its parents, following the hierarchy, and gives it to the worker if it succeded 
+corresponding Worker Component of the Scheduling Tree tries to pull a task from
+its parents, following the hierarchy, and gives it to the worker if it succeeded
 to get one.
 
 
@@ -52,7 +52,7 @@ to get one.
 
 \subsection ExistingModularizedSchedulers Existing Modularized Schedulers
 
-StarPU is currently shipped with the following pre-defined Modularized 
+StarPU is currently shipped with the following pre-defined Modularized
 Schedulers :
 
 - Eager-based Schedulers (with/without prefetching) : \n
@@ -60,11 +60,11 @@ Naive scheduler, which tries to map a task on the first available resource
 it finds.
 
 - Prio-based Schedulers (with/without prefetching) : \n
-Similar to Eager-Based Schedulers. Can handle tasks which have a defined 
+Similar to Eager-Based Schedulers. Can handle tasks which have a defined
 priority and schedule them accordingly.
 
 - Random-based Schedulers (with/without prefetching) : \n
-Selects randomly a resource to be mapped on for each task. 
+Selects randomly a resource to be mapped on for each task.
 
 - HEFT Scheduler : \n
 Heterogeneous Earliest Finish Time Scheduler.
@@ -73,8 +73,8 @@ defined performance model (\ref PerformanceModelCalibration)
 to work efficiently, but can handle tasks without a performance
 model.
 
-It is currently needed to set the environment variable \ref STARPU_SCHED 
-to use those Schedulers. Modularized Schedulers' naming is tree-*
+To use one of these schedulers, one can set the environment variable \ref STARPU_SCHED.
+All modularized schedulers are named following the RE <c>tree-*</c>
 
 \subsection ExampleTreeEagerPrefetchingStrategy An Example : The Tree-Eager-Prefetching Strategy
 
@@ -89,7 +89,7 @@ to use those Schedulers. Modularized Schedulers' naming is tree-*
                                 v  |
                           Eager_Component
                                 |  ^
-                                |  |    
+                                |  |
                                 v  |
               --------><-------------------><---------
               |  ^                                |  ^
@@ -104,22 +104,22 @@ to use those Schedulers. Modularized Schedulers' naming is tree-*
 
 \subsection Interface
 
-Each Scheduling Component must follow the following pre-defined Interface 
+Each Scheduling Component must follow the following pre-defined Interface
 to be able to interact with other Scheduling Components.
 
 	- Push (Caller_Component, Child_Component, Task) \n
-	The calling Scheduling Component transfers a task to its 
-	Child Component. When the Push function returns, the task no longer 
-	belongs to the calling Component. The Modularized Schedulers' 
+	The calling Scheduling Component transfers a task to its
+	Child Component. When the Push function returns, the task no longer
+	belongs to the calling Component. The Modularized Schedulers'
 	model relies on this function to perform prefetching.
 
 	- Pull (Caller_Component, Parent_Component)  ->  Task \n
 	The calling Scheduling Component requests a task from
-	its Parent Component. When the Pull function ends, the returned 
+	its Parent Component. When the Pull function ends, the returned
 	task belongs to the calling Component.
 
 	- Can_Push (Caller_Component, Parent_Component) \n
-	The calling Scheduling Component notifies its Parent Component that 
+	The calling Scheduling Component notifies its Parent Component that
 	it is ready to accept new tasks.
 
 	- Can_Pull (Caller_Component, Child_Component) \n
@@ -127,13 +127,13 @@ to be able to interact with other Scheduling Components.
 	that it is ready to give new tasks.
 
 
-\section BuildAModularizedScheduler Build a Modularized Scheduler
+\section BuildAModularizedScheduler Building a Modularized Scheduler
 
 \subsection PreImplementedComponents Pre-implemented Components
 
-StarPU is currently shipped with the following four Scheduling Components : 
+StarPU is currently shipped with the following four Scheduling Components :
 
-	- Flow-control Components : Fifo, Prio \n 
+	- Flow-control Components : Fifo, Prio \n
 	Components which store tasks. They can also prioritize them if
 	they have a defined priority. It is possible to define a threshold
 	for those Components following two criteria : the number of tasks
@@ -148,19 +148,19 @@ StarPU is currently shipped with the following four Scheduling Components :
 	Each Worker Component models a concrete worker.
 
 	- Special-Purpose Components : Perfmodel_Select, Best_Implementation \n
-	Components dedicated to original purposes. The Perfmodel_Select 
-	Component decides which Resource-Mapping Component should be used to 
+	Components dedicated to original purposes. The Perfmodel_Select
+	Component decides which Resource-Mapping Component should be used to
 	schedule a task. The Best_Implementation Component chooses which
 	implementation of a task should be used on the chosen resource.
 
 \subsection ProgressionAndValidationRules Progression And Validation Rules
 
-Some rules must be followed to ensure the correctness of a Modularized 
+Some rules must be followed to ensure the correctness of a Modularized
 Scheduler :
 
-	- At least one Flow-control Component without threshold per Worker Component 
-	is needed in a Modularized Scheduler, to store incoming tasks from StarPU 
-	and to give tasks to Worker Components who asks for it. It is possible to 
+	- At least one Flow-control Component without threshold per Worker Component
+	is needed in a Modularized Scheduler, to store incoming tasks from StarPU
+	and to give tasks to Worker Components who ask for it. It is possible to
 	use one Flow-control Component per Worker Component, or one for all Worker
 	Components, depending on how the Scheduling Tree is defined.
 
@@ -168,7 +168,7 @@ Scheduler :
 	Scheduler. Resource-Mapping Components are the only ones who can make
 	scheduling choices, and so the only ones who can have several children.
 
-\subsection ImplementAModularizedScheduler Implement a Modularized Scheduler
+\subsection ImplementAModularizedScheduler Implementing a Modularized Scheduler
 
 The following code shows how the Tree-Eager-Prefetching Scheduler
 shown in Section \ref ExampleTreeEagerPrefetchingStrategy is implemented :
@@ -188,7 +188,7 @@ static void initialize_eager_prefetching_center_policy(unsigned sched_ctx_id)
     (sched_ctx_id, STARPU_WORKER_LIST);
 
   /* Create the Scheduling Tree */
-  struct starpu_sched_tree * t = 
+  struct starpu_sched_tree * t =
     starpu_sched_tree_create(sched_ctx_id);
 
   /* The Root Component is a Flow-control Fifo Component */
@@ -199,16 +199,16 @@ static void initialize_eager_prefetching_center_policy(unsigned sched_ctx_id)
   struct starpu_sched_component * eager_component =
     starpu_sched_component_eager_create(NULL);
 
-  /* Create links between Components : the Eager Component is the child 
+  /* Create links between Components : the Eager Component is the child
    * of the Root Component */
   t->root->add_child
     (t->root, eager_component);
   eager_component->add_father
     (eager_component, t->root);
 
-  /* A task threshold is set for the Flow-control Components which will 
-   * be connected to Worker Components. By doing so, this Modularized 
-   * Scheduler will be able to perform some prefetching on the resources 
+  /* A task threshold is set for the Flow-control Components which will
+   * be connected to Worker Components. By doing so, this Modularized
+   * Scheduler will be able to perform some prefetching on the resources
    */
   struct starpu_sched_component_fifo_data fifo_data =
   {
@@ -218,11 +218,11 @@ static void initialize_eager_prefetching_center_policy(unsigned sched_ctx_id)
 
   unsigned i;
   for(i = 0;
-    i < starpu_worker_get_count() + 
+    i < starpu_worker_get_count() +
     starpu_combined_worker_get_count();
     i++)
   {
-    /* Each Worker Component has a Flow-control Fifo Component as 
+    /* Each Worker Component has a Flow-control Fifo Component as
      * father */
     struct starpu_sched_component * worker_component =
 	  starpu_sched_component_worker_get(i);
@@ -233,8 +233,8 @@ static void initialize_eager_prefetching_center_policy(unsigned sched_ctx_id)
     worker_component->add_father
       (worker_component, fifo_component);
 
-    /* Each Flow-control Fifo Component associated to a Worker 
-     * Component is linked to the Eager Component as one of its 
+    /* Each Flow-control Fifo Component associated to a Worker
+     * Component is linked to the Eager Component as one of its
      * children */
     eager_component->add_child
       (eager_component, fifo_component);
@@ -276,7 +276,7 @@ struct starpu_sched_policy _starpu_sched_tree_eager_prefetching_policy =
 };
 \endcode
 
-\section WriteASchedulingComponent Write a Scheduling Component
+\section WriteASchedulingComponent Writing a Scheduling Component
 
 \subsection GenericSchedulingComponent Generic Scheduling Component
 
@@ -284,10 +284,10 @@ Each Scheduling Component is instantiated from a Generic Scheduling Component,
 which implements a generic version of the Interface. The generic implementation
 of Pull, Can_Pull and Can_Push functions are recursive calls to their parents
 (respectively to their children). However, as a Generic Scheduling Component does
-not know how much children it will have when it will be instantiated, it does 
+not know how many children it will have when it will be instantiated, it does
 not implement the Push function.
 
-\subsection InstantiationRedefineInterface Instantiation : Redefine the Interface
+\subsection InstantiationRedefineInterface Instantiation : Redefining the Interface
 
 A Scheduling Component must implement all the functions of the Interface. It is
 so necessary to implement a Push function to instantiate a Scheduling Component.
 to the Scheduling Component they are implementing, it is possible to reimplement
 all the functions of the Interface. For example, a Flow-control Component
 reimplements the Pull and the Can_Push functions of the Interface, allowing it
 to catch the generic recursive calls of these functions. The Pull function of
-a Flow-control Component can, for example, pop a task from the local storage 
+a Flow-control Component can, for example, pop a task from the local storage
 queue of the Component, and give it to the calling Component which asks for it.
 
 \subsection DetailedProgressionAndValidationRules Detailed Progression and Validation Rules
@@ -307,18 +307,18 @@ queue of the Component, and give it to the calling Component which asks for it.
 	Areas in the Scheduling Tree.
 
 	- A Pump is the engine source of the Scheduler : it pushes/pulls tasks
-	to/from a Scheduling Component to an other. Native Pumps of a Scheduling 
-	Tree are located at the root of the Tree (incoming Push calls from StarPU), 
-	and at the leafs of the Tree (Pop calls coming from StarPU Workers). 
-	Pre-implemented Scheduling Components currently shipped with Pumps are 
-	Flow-Control Components and the Resource-Mapping Component Heft, within 
+	to/from a Scheduling Component to an other. Native Pumps of a Scheduling
+	Tree are located at the root of the Tree (incoming Push calls from StarPU),
+	and at the leafs of the Tree (Pop calls coming from StarPU Workers).
+	Pre-implemented Scheduling Components currently shipped with Pumps are
+	Flow-Control Components and the Resource-Mapping Component Heft, within
 	their defined Can_Push functions.
 
-	- A correct Scheduling Tree requires a Pump per Scheduling Area and per 
-	Execution Flow. 
+	- A correct Scheduling Tree requires a Pump per Scheduling Area and per
+	Execution Flow.
 
 
-The Tree-Eager-Prefetching Scheduler shown in Section 
+The Tree-Eager-Prefetching Scheduler shown in Section
 \ref ExampleTreeEagerPrefetchingStrategy follows the previous assumptions :
 
 <pre>
@@ -335,7 +335,7 @@ The Tree-Eager-Prefetching Scheduler shown in Section
                                         v  |
  Area 2                           Eager_Component
                                         |  ^
-                                        |  |    
+                                        |  |
                                         v  |
                       --------><-------------------><---------
                       |  ^                                |  ^
@@ -350,4 +350,3 @@ The Tree-Eager-Prefetching Scheduler shown in Section
 </pre>
 
 */
-

+ 110 - 0
doc/doxygen/chapters/360_debugging_tools.doxy

@@ -0,0 +1,110 @@
+/*
+ * This file is part of the StarPU Handbook.
+ * Copyright (C) 2009--2011  Universit@'e de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
+ * Copyright (C) 2011, 2012 INRIA
+ * See the file version.doxy for copying conditions.
+ */
+
+/*! \page DebuggingTools Debugging Tools
+
+StarPU provides several tools to help debugging applications. Execution traces
+can be generated and displayed graphically, see \ref GeneratingTracesWithFxT.
+
+\section DebuggingInGeneral TroubleShooting In General
+
+Generally-speaking, if you have troubles, pass \ref enable-debug "--enable-debug" to
+<c>./configure</c> to enable some checks which impact performance, but will
+catch common issues, possibly earlier than the actual problem you are observing,
+which may just be a consequence of a bug that happened earlier. If your program
+is valgrind-safe, you can use it, see \ref UsingOtherDebugger.
+
+Then, if your program crashes with an assertion error, a segfault, etc. you can send us the result of
+
+\verbatim
+thread apply all bt
+\endverbatim
+
+run in <c>gdb</c> at the point of the crash.
+
+In case your program just hangs, but it may also be useful in case of a crash
+too, it helps to source <c>gdbinit</c> as described in the next section to be
+able to run and send us the output of the following commands:
+
+\verbatim
+starpu-workers
+starpu-tasks
+starpu-print-requests
+starpu-print-prequests
+starpu-print-frrequests
+starpu-print-irrequests
+\endverbatim
+
+To give us an idea of what is happening within StarPU. If the outputs are not too long, you can even run
+
+\verbatim
+starpu-all-tasks
+starpu-print-all-tasks
+starpu-print-datas-summary
+starpu-print-datas
+\endverbatim
+
+\section UsingGdb Using The Gdb Debugger
+
+Some <c>gdb</c> helpers are provided to show the whole StarPU state:
+
+\verbatim
+(gdb) source tools/gdbinit
+(gdb) help starpu
+\endverbatim
+
+For instance,
+<ul>
+<li> one can print all tasks with <c>starpu-print-all-tasks</c>, </li>
+<li> print all datas with <c>starpu-print-datas</c>, </li>
+<li> print all pending data transfers with <c>starpu-print-prequests</c>, <c>starpu-print-requests</c>, <c>starpu-print-frequests</c>, <c>starpu-print-irequests</c>,</li>
+<li> print pending MPI requests with <c>starpu-mpi-print-detached-requests</c></li>
+</ul>
+
+Some functions can only work if \ref enable-debug "--enable-debug"
+was passed to <c>./configure</c>
+(because they impact performance)
+
+\section UsingOtherDebugger Using Other Debugging Tools
+
+Valgrind can be used on StarPU: valgrind.h just needs to be found at <c>./configure</c>
+time, to tell valgrind about some known false positives and disable host memory
+pinning. Other known false positives can be suppressed by giving the suppression
+files in <c>tools/valgrind/*.suppr</c> to valgrind's <c>--suppressions</c> option.
+
+The environment variable \ref STARPU_DISABLE_KERNELS can also be set to <c>1</c> to make
+StarPU does everything (schedule tasks, transfer memory, etc.) except actually
+calling the application-provided kernel functions, i.e. the computation will not
+happen. This permits to quickly check that the task scheme is working properly.
+
+\section UsingTheTemanejoTaskDebugger Using The Temanejo Task Debugger
+
+StarPU can connect to Temanejo >= 1.0rc2 (see
+http://www.hlrs.de/temanejo), to permit
+nice visual task debugging. To do so, build Temanejo's <c>libayudame.so</c>,
+install <c>Ayudame.h</c> to e.g. <c>/usr/local/include</c>, apply the
+<c>tools/patch-ayudame</c> to it to fix C build, re-<c>./configure</c>, make
+sure that it found it, rebuild StarPU.  Run the Temanejo GUI, give it the path
+to your application, any options you want to pass it, the path to <c>libayudame.so</c>.
+
+It permits to visualize the task graph, add breakpoints, continue execution
+task-by-task, and run <c>gdb</c> on a given task, etc.
+
+\image html temanejo.png
+\image latex temanejo.png "" width=\textwidth
+
+Make sure to specify at least the same number of CPUs in the dialog box as your
+machine has, otherwise an error will happen during execution. Future versions
+of Temanejo should be able to tell StarPU the number of CPUs to use.
+
+Tag numbers have to be below <c>4000000000000000000ULL</c> to be usable for
+Temanejo (so as to distinguish them from tasks).
+
+
+
+*/

+ 72 - 21
doc/doxygen/chapters/12online_performance_tools.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
  * Copyright (C) 2011, 2012 INRIA
  * See the file version.doxy for copying conditions.
  */
@@ -57,8 +57,8 @@ the function starpu_task_get_current().
 \subsection Per-codeletFeedback Per-codelet Feedback
 
 The field starpu_codelet::per_worker_stats is
-an array of counters. The i-th entry of the array is incremented every time a
-task implementing the codelet is executed on the i-th worker.
+an array of counters. The <c>i</c>-th entry of the array is incremented every time a
+task implementing the codelet is executed on the <c>i</c>-th worker.
 This array is not reinitialized when profiling is enabled or disabled.
 
 \subsection Per-workerFeedback Per-worker Feedback
@@ -82,15 +82,43 @@ wasted in pure StarPU overhead.
 Calling starpu_profiling_worker_get_info() resets the profiling
 information associated to a worker.
 
+To easily display all this information, the environment variable
+\ref STARPU_WORKER_STATS can be set to <c>1</c> (in addition to setting
+\ref STARPU_PROFILING to 1). A summary will then be displayed at program termination:
+
+\verbatim
+Worker stats:
+CUDA 0.0 (4.7 GiB)
+	480 task(s)
+	total: 1574.82 ms executing: 1510.72 ms sleeping: 0.00 ms overhead 64.10 ms
+	325.217970 GFlop/s
+
+CPU 0
+	22 task(s)
+	total: 1574.82 ms executing: 1364.81 ms sleeping: 0.00 ms overhead 210.01 ms
+	7.512057 GFlop/s
+
+CPU 1
+	14 task(s)
+	total: 1574.82 ms executing: 1500.13 ms sleeping: 0.00 ms overhead 74.69 ms
+	6.675853 GFlop/s
+
+CPU 2
+	14 task(s)
+	total: 1574.82 ms executing: 1553.12 ms sleeping: 0.00 ms overhead 21.70 ms
+	7.152886 GFlop/s
+\endverbatim
+
+The number of GFlops is available because the starpu_task::flops field of the
+tasks was filled (or \ref STARPU_FLOPS used in starpu_task_insert()).
+
 When an FxT trace is generated (see \ref GeneratingTracesWithFxT), it is also
-possible to use the tool <c>starpu_workers_activity</c> (see \ref
-MonitoringActivity) to generate a graphic showing the evolution of
+possible to use the tool <c>starpu_workers_activity</c> (see
+\ref MonitoringActivity) to generate a graphic showing the evolution of
 these values during the time, for the different workers.
 
 \subsection Bus-relatedFeedback Bus-related Feedback
 
-TODO: ajouter \ref STARPU_BUS_STATS
-
 // how to enable/disable performance monitoring
 // what kind of information do we get ?
 
@@ -110,6 +138,27 @@ CUDA 1  4523.718152     2414.078822     0.000000        2417.375119
 CUDA 2  4534.229519     2417.069025     2417.060863     0.000000
 \endverbatim
 
+Statistics about the data transfers which were performed and temporal average
+of bandwidth usage can be obtained by setting the environment variable
+\ref STARPU_BUS_STATS to <c>1</c>; a summary will then be displayed at program termination:
+
+\verbatim
+Data transfer stats:
+	RAM 0 -> CUDA 0	319.92 MB	213.10 MB/s	(transfers : 91 - avg 3.52 MB)
+	CUDA 0 -> RAM 0	214.45 MB	142.85 MB/s	(transfers : 61 - avg 3.52 MB)
+	RAM 0 -> CUDA 1	302.34 MB	201.39 MB/s	(transfers : 86 - avg 3.52 MB)
+	CUDA 1 -> RAM 0	133.59 MB	88.99 MB/s	(transfers : 38 - avg 3.52 MB)
+	CUDA 0 -> CUDA 1	144.14 MB	96.01 MB/s	(transfers : 41 - avg 3.52 MB)
+	CUDA 1 -> CUDA 0	130.08 MB	86.64 MB/s	(transfers : 37 - avg 3.52 MB)
+	RAM 0 -> CUDA 2	312.89 MB	208.42 MB/s	(transfers : 89 - avg 3.52 MB)
+	CUDA 2 -> RAM 0	133.59 MB	88.99 MB/s	(transfers : 38 - avg 3.52 MB)
+	CUDA 0 -> CUDA 2	151.17 MB	100.69 MB/s	(transfers : 43 - avg 3.52 MB)
+	CUDA 2 -> CUDA 0	105.47 MB	70.25 MB/s	(transfers : 30 - avg 3.52 MB)
+	CUDA 1 -> CUDA 2	175.78 MB	117.09 MB/s	(transfers : 50 - avg 3.52 MB)
+	CUDA 2 -> CUDA 1	203.91 MB	135.82 MB/s	(transfers : 58 - avg 3.52 MB)
+Total transfers: 2.27 GB
+\endverbatim
+
 \subsection StarPU-TopInterface StarPU-Top Interface
 
 StarPU-Top is an interface which remotely displays the on-line state of a StarPU
@@ -162,7 +211,7 @@ starpu_top_task_prevision(task, workerid, begin, end);
 \endcode
 
 Starting StarPU-Top (StarPU-Top is started via the binary
-<c>starpu_top</c>.) and the application can be done two ways:
+<c>starpu_top</c>) and the application can be done in two ways:
 
 <ul>
 <li> The application is started by hand on some machine (and thus already
@@ -286,8 +335,8 @@ using the function starpu_perfmodel_update_history().
 The following is a small code example.
 
 If e.g. the code is recompiled with other compilation options, or several
-variants of the code are used, the symbol string should be changed to reflect
-that, in order to recalibrate a new model from zero. The symbol string can even
+variants of the code are used, the <c>symbol</c> string should be changed to reflect
+that, in order to recalibrate a new model from zero. The <c>symbol</c> string can even
 be constructed dynamically at execution time, as long as this is done before
 submitting any task using it.
 
@@ -311,10 +360,10 @@ struct starpu_codelet cl = {
 <li>
 Measured at runtime and refined by regression (model types
 ::STARPU_REGRESSION_BASED and ::STARPU_NL_REGRESSION_BASED). This
-still assumes performance regularity, but works 
+still assumes performance regularity, but works
 with various data input sizes, by applying regression over observed
-execution times. ::STARPU_REGRESSION_BASED uses an a*n^b regression
-form, ::STARPU_NL_REGRESSION_BASED uses an a*n^b+c (more precise than
+execution times. ::STARPU_REGRESSION_BASED uses an <c>a*n^b</c> regression
+form, ::STARPU_NL_REGRESSION_BASED uses an <c>a*n^b+c</c> (more precise than
 ::STARPU_REGRESSION_BASED, but costs a lot more to compute).
 
 For instance,
@@ -331,8 +380,8 @@ so as to feed the performance model for a variety of
 inputs. The application can also provide the measurements explictly by
 using the function starpu_perfmodel_update_history(). The tools
 <c>starpu_perfmodel_display</c> and <c>starpu_perfmodel_plot</c> can
-be used to observe how much the performance model is calibrated (\ref
-PerformanceModelCalibration); when their output look good,
+be used to observe how much the performance model is calibrated
+(\ref PerformanceModelCalibration); when their output looks good,
 \ref STARPU_CALIBRATE can be reset to <c>0</c> to let
 StarPU use the resulting performance model without recording new measures, and
 \ref STARPU_SCHED can be set to <c>dmda</c> to benefit from the performance models. If
@@ -356,9 +405,11 @@ see for instance
 
 <li>
 Provided explicitly by the application (model type ::STARPU_PER_ARCH):
+either field starpu_perfmodel::arch_cost_function, or
 the fields <c>.per_arch[arch][nimpl].cost_function</c> have to be
 filled with pointers to functions which return the expected duration
-of the task in micro-seconds, one per architecture.
+of the task in micro-seconds, one per architecture, see for instance
+<c>tests/datawizard/locality.c</c>
 </li>
 </ul>
 
@@ -384,7 +435,7 @@ time), and thus particularly useful for ::STARPU_REGRESSION_BASED or
 variance of the actual performance is unknown (irregular performance behavior,
 etc.), and thus only useful for ::STARPU_HISTORY_BASED.
 starpu_task_data_footprint() can be used as a base and combined with other
-parameters through starpu_hash_crc32c_be for instance.
+parameters through starpu_hash_crc32c_be() for instance.
 
 StarPU will automatically determine when the performance model is calibrated,
 or rather, it will assume the performance model is calibrated until the
@@ -402,10 +453,10 @@ variable to <c>1</c>, or even reset by setting it to <c>2</c>.
 How to use schedulers which can benefit from such performance model is explained
 in \ref TaskSchedulingPolicy.
 
-The same can be done for task power consumption estimation, by setting
-the field starpu_codelet::power_model the same way as the field
+The same can be done for task energy consumption estimation, by setting
+the field starpu_codelet::energy_model the same way as the field
 starpu_codelet::model. Note: for now, the application has to give to
-the power consumption performance model a name which is different from
+the energy consumption performance model a name which is different from
 the execution time performance model.
 
 The application can request time estimations from the StarPU performance
@@ -414,7 +465,7 @@ it. The data handles can be created by calling any of the functions
 <c>starpu_*_data_register</c> with a <c>NULL</c> pointer and <c>-1</c>
 node and the desired data sizes, and need to be unregistered as usual.
 The functions starpu_task_expected_length() and
-starpu_task_expected_power() can then be called to get an estimation
+starpu_task_expected_energy() can then be called to get an estimation
 of the task cost on a given arch. starpu_task_footprint() can also be
 used to get the footprint used for indexing history-based performance
 models. starpu_task_destroy() needs to be called to destroy the dummy

+ 78 - 28
doc/doxygen/chapters/13offline_performance_tools.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
  * Copyright (C) 2011, 2012 INRIA
  * See the file version.doxy for copying conditions.
  */
@@ -90,8 +90,8 @@ starpu_shutdown(). The trace is a binary file whose name has the form
 <c>/tmp/</c> directory by default, or by the directory specified by
 the environment variable \ref STARPU_FXT_PREFIX.
 
-The additional configure option \ref enable-fxt-lock "--enable-fxt-lock" can 
-be used to generate trace events which describes the locks behaviour during 
+The additional configure option \ref enable-fxt-lock "--enable-fxt-lock" can
+be used to generate trace events which describes the locks behaviour during
 the execution.
 
 \subsection CreatingAGanttDiagram Creating a Gantt Diagram
@@ -123,7 +123,9 @@ Details of the codelet execution can be obtained by passing
 configuring StarPU and using a recent enough version of ViTE (at least
 r1430).
 
-In the MPI execution case, collect the trace files from the MPI nodes, and
+In the MPI execution case, \ref STARPU_GENERATE_TRACE will not work as expected
+(each node will try to generate paje.trace, thus mixing outputs...), you have to
+collect the trace files from the MPI nodes, and
 specify them all on the command <c>starpu_fxt_tool</c>, for instance:
 
 \verbatim
@@ -134,12 +136,12 @@ By default, all tasks are displayed using a green color. To display tasks with
 varying colors, pass option <c>-c</c> to <c>starpu_fxt_tool</c>.
 
 To identify tasks precisely, the application can set the starpu_task::tag_id field of the
-task (or use STARPU_TAG_ONLY when using starpu_task_insert()), and with a recent
-enough version of vite (>= r1430) and the
+task (or use \ref STARPU_TAG_ONLY when using starpu_task_insert()), and with a recent
+enough version of ViTE (>= r1430) and the
 \ref enable-paje-codelet-details "--enable-paje-codelet-details"
 StarPU configure option, the value of the tag will show up in the trace.
 
-It can also set the starpu_task::name field of the task (or use STARPU_NAME)
+It can also set the starpu_task::name field of the task (or use \ref STARPU_NAME)
 when using starpu_task_insert()), to replace in traces the name of the codelet
 with an arbitrarily chosen name.
 
@@ -149,7 +151,7 @@ Traces can also be inspected by hand by using the tool <c>fxt_print</c>, for ins
 $ fxt_print -o -f /tmp/prof_file_something
 \endverbatim
 
-Timings are in nanoseconds (while timings as seen in <c>vite</c> are in milliseconds).
+Timings are in nanoseconds (while timings as seen in ViTE are in milliseconds).
 
 \subsection CreatingADAGWithGraphviz Creating a DAG With Graphviz
 
@@ -168,7 +170,7 @@ graphical output of the graph by using the graphviz library:
 $ dot -Tpdf dag.dot -o output.pdf
 \endverbatim
 
-\subsection TraceTaskDetails Getting task details
+\subsection TraceTaskDetails Getting Task Details
 
 When the FxT trace file <c>prof_file_something</c> has been generated, details on the
 executed tasks can be retrieved by calling:
@@ -180,9 +182,19 @@ $ starpu_fxt_tool -i /tmp/prof_file_something
 This will create a <c>tasks.rec</c> file in the current directory.  This file
 is in the recutils format, i.e. <c>Field: value</c> lines, and empty lines to
 separate each task.  This can be used as a convenient input for various ad-hoc
-analysis tools.  The performance models can be opened for instance by using
-\ref starpu_perfmodel_load_symbol and then using
-\ref starpu_perfmodel_history_based_expected_perf
+analysis tools. By default it only contains information about the actual
+execution. Performance models can be obtained by running
+<c>starpu_tasks_rec_complete</c> on it:
+
+\verbatim
+$ starpu_tasks_rec_complete tasks.rec tasks2.rec
+\endverbatim
+
+which will add <c>EstimatedTime</c> lines which contain the performance
+model-estimated time (in µs) for each worker starting from 0. Since it needs
+the performance models, it needs to be run the same way as the application
+execution, or at least with <c>STARPU_HOSTNAME</c> set to the hostname of the
+machine used for execution, to get the performance models of that machine.
 
 \subsection MonitoringActivity Monitoring Activity
 
@@ -217,7 +229,7 @@ evolution of the number of tasks available in the system during the execution.
 Ready tasks are shown in black, and tasks that are submitted but not
 schedulable yet are shown in grey.
 
-\subsection Animation Getting modular schedular animation
+\subsection Animation Getting Modular Scheduler Animation
 
 When using modular schedulers (i.e. schedulers which use a modular architecture,
 and whose name start with "modular-"), the command
@@ -230,6 +242,31 @@ will also produce a <c>trace.html</c> file which can be viewed in a
 javascript-enabled web browser. It shows the flow of tasks between the
 components of the modular scheduler.
 
+\subsection LimitingScopeTrace Limiting The Scope Of The Trace
+
+For computing statistics, it is useful to limit the trace to a given portion of
+the time of the whole execution. This can be achieved by calling
+
+\code{.c}
+starpu_fxt_autostart_profiling(0)
+\endcode
+
+before calling starpu_init(), to
+prevent tracing from starting immediately. Then
+
+\code{.c}
+starpu_fxt_start_profiling();
+\endcode
+
+and
+
+\code{.c}
+starpu_fxt_stop_profiling();
+\endcode
+
+can be used around the portion of code to be traced. This will show up as marks
+in the trace, and states of workers will only show up for that portion.
+
 \section PerformanceOfCodelets Performance Of Codelets
 
 The performance model of codelets (see \ref PerformanceModelExample)
@@ -303,8 +340,9 @@ run with the tool <c>gnuplot</c>, which shows the corresponding curve.
 \image html starpu_non_linear_memset_regression_based.png
 \image latex starpu_non_linear_memset_regression_based.eps "" width=\textwidth
 
-When the field starpu_task::flops is set, <c>starpu_perfmodel_plot</c> can
-directly draw a GFlops curve, by simply adding the <c>-f</c> option:
+When the field starpu_task::flops is set (or \ref STARPU_FLOPS is passed to
+starpu_task_insert()), <c>starpu_perfmodel_plot</c> can directly draw a GFlops
+curve, by simply adding the <c>-f</c> option:
 
 \verbatim
 $ starpu_perfmodel_plot -f -s chol_model_11
@@ -356,16 +394,17 @@ histogram of the codelet execution time distribution.
 \image html distrib_data_histo.png
 \image latex distrib_data_histo.eps "" width=\textwidth
 
-\section TraceStatistics Trace statistics
+\section TraceStatistics Trace Statistics
 
 More than just codelet performance, it is interesting to get statistics over all
 kinds of StarPU states (allocations, data transfers, etc.). This is particularly
 useful to check what may have gone wrong in the accuracy of the simgrid
 simulation.
 
-This requires the <c>R</c> statistical tool, with the plyr, ggplot2 and
-data.table packages. If your system distribution does not have packages for
-these, one can fetch them from CRAN:
+This requires the <c>R</c> statistical tool, with the <c>plyr</c>,
+<c>ggplot2</c> and <c>data.table</c> packages. If your system
+distribution does not have packages for these, one can fetch them from
+<c>CRAN</c>:
 
 \verbatim
 $ R
@@ -375,10 +414,10 @@ $ R
 > install.packages("knitr")
 \endverbatim
 
-The pj_dump tool from pajeng is also needed (see
+The <c>pj_dump</c> tool from <c>pajeng</c> is also needed (see
 https://github.com/schnorr/pajeng)
 
-One can then get textual or .csv statistics over the trace states:
+One can then get textual or <c>.csv</c> statistics over the trace states:
 
 \verbatim
 $ starpu_paje_state_stats -v native.trace simgrid.trace
@@ -390,12 +429,23 @@ $ starpu_paje_state_stats -v native.trace simgrid.trace
 $ starpu_paje_state_stats native.trace simgrid.trace
 \endverbatim
 
-An other way to get statistics of StarPU states (without installing R and
-pj_dump) is to use the starpu_trace_state_stats.py script which parses the
-generated trace.rec file instead of the paje.trace file. The output is similar
+An other way to get statistics of StarPU states (without installing <c>R</c> and
+<c>pj_dump</c>) is to use the <c>starpu_trace_state_stats.py</c> script which parses the
+generated <c>trace.rec</c> file instead of the <c>paje.trace</c> file. The output is similar
 to the previous script but it doesn't need any dependencies.
 
-Here's an example how to use it:
+The different prefixes used in <c>trace.rec</c> are:
+
+\verbatim
+E: Event type
+N: Event name
+C: Event category
+W: Worker ID
+T: Thread ID
+S: Start time
+\endverbatim
+
+Here's an example on how to use it:
 
 \verbatim
 $ python starpu_trace_state_stats.py trace.rec | column -t -s ","
@@ -406,7 +456,7 @@ $ python starpu_trace_state_stats.py trace.rec | column -t -s ","
 "chol_model_22"  165	Task	64712.07
 \endverbatim
 
-starpu_trace_state_stats.py can also be used to compute the different
+<c>starpu_trace_state_stats.py</c> can also be used to compute the different
 efficiencies. Refer to the usage description to show some examples.
 
 And one can plot histograms of execution times, of several states for instance:
@@ -428,7 +478,7 @@ $ starpu_paje_summary native.trace simgrid.trace
 it includes gantt charts, execution summaries, as well as state duration charts
 and time distribution histograms.
 
-Other external Pajé analysis tools can be used on these traces, one just needs
+Other external Paje analysis tools can be used on these traces, one just needs
 to sort the traces by timestamp order (which not guaranteed to make recording
 more efficient):
 
@@ -467,7 +517,7 @@ execution time of your tasks. If StarPU was compiled with the library
 <c>glpk</c> installed, starpu_bound_compute() can be used to solve it
 immediately and get the optimized minimum, in ms. Its parameter
 <c>integer</c> allows to decide whether integer resolution should be
-computed and returned 
+computed and returned
 
 The <c>deps</c> parameter tells StarPU whether to take tasks, implicit
 data, and tag dependencies into account. Tags released in a callback

+ 7 - 7
doc/doxygen/chapters/14faq.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
  * Copyright (C) 2011, 2012 INRIA
  * See the file version.doxy for copying conditions.
  */
@@ -14,7 +14,7 @@ Some libraries need to be initialized once for each concurrent instance that
 may run on the machine. For instance, a C++ computation class which is not
 thread-safe by itself, but for which several instanciated objects of that class
 can be used concurrently. This can be used in StarPU by initializing one such
-object per worker. For instance, the libstarpufft example does the following to
+object per worker. For instance, the <c>libstarpufft</c> example does the following to
 be able to use FFTW on CPUs.
 
 Some global array stores the instanciated objects:
@@ -51,7 +51,7 @@ static void fft(void *descr[], void *_args)
 
 This however is not sufficient for FFT on CUDA: initialization has
 to be done from the workers themselves.  This can be done thanks to
-starpu_execute_on_each_worker().  For instance libstarpufft does the following.
+starpu_execute_on_each_worker().  For instance <c>libstarpufft</c> does the following.
 
 \code{.c}
 static void fft_plan_gpu(void *args)
@@ -164,10 +164,10 @@ and display it e.g. in the callback function.
 Some users had issues with MKL 11 and StarPU (versions 1.1rc1 and
 1.0.5) on Linux with MKL, using 1 thread for MKL and doing all the
 parallelism using StarPU (no multithreaded tasks), setting the
-environment variable MKL_NUM_THREADS to 1, and using the threaded MKL library,
-with iomp5.
+environment variable <c>MKL_NUM_THREADS</c> to <c>1</c>, and using the threaded MKL library,
+with <c>iomp5</c>.
 
-Using this configuration, StarPU uses only 1 core, no matter the value of
+Using this configuration, StarPU only uses 1 core, no matter the value of
 \ref STARPU_NCPU. The problem is actually a thread pinning issue with MKL.
 
 The solution is to set the environment variable KMP_AFFINITY to <c>disabled</c>
@@ -204,7 +204,7 @@ frozen), and stop them from polling for more work.
 Note that this does not prevent you from submitting new tasks, but
 they won't execute until starpu_resume() is called. Also note
 that StarPU must not be paused when you call starpu_shutdown(), and
-that this function pair works in a push/pull manner, ie you need to
+that this function pair works in a push/pull manner, i.e. you need to
 match the number of calls to these functions to clear their effect.
 
 

+ 4 - 4
doc/doxygen/chapters/15out_of_core.doxy

@@ -13,7 +13,7 @@ When using StarPU, one may need to store more data than what the main memory
 disk and to use it.
 
 The principle is that one first registers a disk location, seen by StarPU as
-a void*, which can be for instance a Unix path for the stdio or unistd case,
+a <c>void*</c>, which can be for instance a Unix path for the stdio or unistd case,
 or a database file path for a leveldb case, etc. The disk backend opens this
 place with the plug method.
 
@@ -21,9 +21,9 @@ If the disk backend provides an alloc method, StarPU can then start using it
 to allocate room and store data there with the write method, without user
 intervention.
 
-The user can also use starpu_disk_open to explicitly open an object within the
+The user can also use starpu_disk_open() to explicitly open an object within the
 disk, e.g. a file name in the stdio or unistd cases, or a database key in the
-leveldb case, and then use starpu_*_register functions to turn it into a StarPU
+leveldb case, and then use <c>starpu_*_register</c> functions to turn it into a StarPU
 data handle. StarPU will then automatically read and write data as appropriate.
 
 \section UseANewDiskMemory Use a new disk memory
@@ -65,7 +65,7 @@ There are various ways to operate a disk memory node, described by the structure
 starpu_disk_ops. For instance, the variable #starpu_disk_unistd_ops
 uses read/write functions.
 
-All structures are in \ref API_Out_Of_Core .
+All structures are in \ref API_Out_Of_Core.
 
 \section ExampleDiskCopy Examples: disk_copy
 

+ 126 - 20
doc/doxygen/chapters/16mpi_support.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
  * Copyright (C) 2011, 2012 INRIA
  * See the file version.doxy for copying conditions.
  */
@@ -10,7 +10,7 @@
 
 The integration of MPI transfers within task parallelism is done in a
 very natural way by the means of asynchronous interactions between the
-application and StarPU.  This is implemented in a separate libstarpumpi library
+application and StarPU.  This is implemented in a separate <c>libstarpumpi</c> library
 which basically provides "StarPU" equivalents of <c>MPI_*</c> functions, where
 <c>void *</c> buffers are replaced with ::starpu_data_handle_t, and all
 GPU-RAM-NIC transfers are handled efficiently by StarPU-MPI.  The user has to
@@ -21,6 +21,89 @@ An MPI Insert Task function provides an even more seamless transition to a
 distributed application, by automatically issuing all required data transfers
 according to the task graph and an application-provided distribution.
 
+\section ExampleDocumentation Example used in this documentation
+
+The example below will be used as the base for this documentation. It
+initializes a token on node 0, and the token is passed from node to node,
+incremented by one on each step. The code is not using StarPU yet.
+
+\code{.c}
+    for (loop = 0; loop < nloops; loop++) {
+        int tag = loop*size + rank;
+
+        if (loop == 0 && rank == 0)
+        {
+            token = 0;
+            fprintf(stdout, "Start with token value %d\n", token);
+        }
+        else
+        {
+            MPI_Recv(&token, 1, MPI_INT, (rank+size-1)%size, tag, MPI_COMM_WORLD);
+        }
+
+        token++;
+
+        if (loop == last_loop && rank == last_rank)
+        {
+            fprintf(stdout, "Finished: token value %d\n", token);
+        }
+        else
+        {
+            MPI_Send(&token, 1, MPI_INT, (rank+1)%size, tag+1, MPI_COMM_WORLD);
+        }
+    }
+\endcode
+
+\section NotUsingMPISupport About not using the MPI support
+
+Although StarPU provides MPI support, the application programmer may want to
+keep his MPI communications as they are for a start, and only delegate task
+execution to StarPU.  This is possible by just using starpu_data_acquire(), for
+instance:
+
+\code{.c}
+    for (loop = 0; loop < nloops; loop++) {
+        int tag = loop*size + rank;
+
+	/* Acquire the data to be able to write to it */
+	starpu_data_acquire(token_handle, STARPU_W);
+        if (loop == 0 && rank == 0)
+        {
+            token = 0;
+            fprintf(stdout, "Start with token value %d\n", token);
+        }
+        else
+        {
+            MPI_Recv(&token, 1, MPI_INT, (rank+size-1)%size, tag, MPI_COMM_WORLD);
+        }
+	starpu_data_release(token_handle);
+
+        /* Task delegation to StarPU to increment the token. The execution might
+         * be performed on a CPU, a GPU, etc. */
+        increment_token();
+
+	/* Acquire the update data to be able to read from it */
+	starpu_data_acquire(token_handle, STARPU_R);
+        if (loop == last_loop && rank == last_rank)
+        {
+            fprintf(stdout, "Finished: token value %d\n", token);
+        }
+        else
+        {
+            MPI_Send(&token, 1, MPI_INT, (rank+1)%size, tag+1, MPI_COMM_WORLD);
+        }
+	starpu_data_release(token_handle);
+    }
+\endcode
+
+In that case, <c>libstarpumpi</c> is not needed. One can also use <c>MPI_Isend()</c> and
+<c>MPI_Irecv()</c>, by calling starpu_data_release() after <c>MPI_Wait()</c> or <c>MPI_Test()</c>
+have notified completion.
+
+It is however better to use <c>libstarpumpi</c>, to save the application from having to
+synchronize with starpu_data_acquire(), and instead just submit all tasks and
+communications asynchronously, and wait for the overall completion.
+
 \section SimpleExample Simple Example
 
 The flags required to compile or link against the MPI layer are
@@ -31,9 +114,6 @@ $ pkg-config --cflags starpumpi-1.3  # options for the compiler
 $ pkg-config --libs starpumpi-1.3    # options for the linker
 \endverbatim
 
-You also need pass the option <c>--static</c> if the application is to
-be linked statically.
-
 \code{.c}
 void increment_token(void)
 {
@@ -65,8 +145,10 @@ int main(int argc, char **argv)
 
         if (loop == 0 && rank == 0)
         {
+            starpu_data_acquire(token_handle, STARPU_W);
             token = 0;
             fprintf(stdout, "Start with token value %d\n", token);
+            starpu_data_release(token_handle);
         }
         else
         {
@@ -101,6 +183,11 @@ int main(int argc, char **argv)
     }
 \endcode
 
+We have here replaced <c>MPI_Recv()</c> and <c>MPI_Send()</c> with starpu_mpi_irecv_detached()
+and starpu_mpi_isend_detached(), which just submit the communication to be
+performed. The only remaining synchronization with starpu_data_acquire() is at
+the beginning and the end.
+
 \section PointToPointCommunication Point To Point Communication
 
 The standard point to point communications of MPI have been
@@ -109,7 +196,7 @@ the DSM provided by StarPU. A MPI request will only be submitted when
 the data is available in the main memory of the node submitting the
 request.
 
-There is two types of asynchronous communications: the classic
+There are two types of asynchronous communications: the classic
 asynchronous communications and the detached communications. The
 classic asynchronous communications (starpu_mpi_isend() and
 starpu_mpi_irecv()) need to be followed by a call to
@@ -146,7 +233,7 @@ does the following:
 <li> it polls the <em>ready requests list</em>. For all the ready
 requests, the appropriate function is called to post the corresponding
 MPI call. For example, an initial call to starpu_mpi_isend() will
-result in a call to <c>MPI_Isend</c>. If the request is marked as
+result in a call to <c>MPI_Isend()</c>. If the request is marked as
 detached, the request will then be added in the <em>detached requests
 list</em>.
 </li>
@@ -154,7 +241,7 @@ list</em>.
 </li>
 <li> it polls the <em>detached requests list</em>. For all the detached
 requests, it tests its completion of the MPI request by calling
-<c>MPI_Test</c>. On completion, the data handle is released, and if a
+<c>MPI_Test()</c>. On completion, the data handle is released, and if a
 callback was defined, it is called.
 </li>
 <li> finally, it checks if a data envelope has been received. If so,
@@ -169,17 +256,23 @@ processed in the first step of the next loop.
 </li>
 </ol>
 
-\ref MPIPtpCommunication "Communication" gives the list of all the
+\ref MPIPtpCommunication gives the list of all the
 point to point communications defined in StarPU-MPI.
 
 \section ExchangingUserDefinedDataInterface Exchanging User Defined Data Interface
 
-New data interfaces defined as explained in \ref
-DefiningANewDataInterface can also be used within StarPU-MPI and
+New data interfaces defined as explained in \ref DefiningANewDataInterface
+can also be used within StarPU-MPI and
 exchanged between nodes. Two functions need to be defined through the
 type starpu_data_interface_ops. The function
 starpu_data_interface_ops::pack_data takes a handle and returns a
-contiguous memory buffer allocated with starpu_malloc_flags(ptr, size, 0) along with its size where data to be conveyed
+contiguous memory buffer allocated with
+
+\code{.c}
+starpu_malloc_flags(ptr, size, 0)
+\endcode
+
+along with its size where data to be conveyed
 to another node should be copied. The reversed operation is
 implemented in the function starpu_data_interface_ops::unpack_data which
 takes a contiguous memory buffer and recreates the data handle.
@@ -287,7 +380,7 @@ exchange the content of the handle. All MPI nodes then process the whole task
 graph, and StarPU automatically determines which node actually execute which
 task, and trigger the required MPI transfers.
 
-The list of functions is described in \ref MPIInsertTask "MPI Insert Task".
+The list of functions is described in \ref MPIInsertTask.
 
 Here an stencil example showing how to use starpu_mpi_task_insert(). One
 first needs to define a distribution function which specifies the
@@ -411,7 +504,7 @@ the cost of task submission.
 A function starpu_mpi_task_build() is also provided with the aim to
 only construct the task structure. All MPI nodes need to call the
 function, only the node which is to execute the task will return a
-valid task structure, others will return NULL. That node must submit that task.
+valid task structure, others will return <c>NULL</c>. That node must submit that task.
 All nodes then need to call the function starpu_mpi_task_post_build() -- with the same
 list of arguments as starpu_mpi_task_build() -- to post all the
 necessary data communications.
@@ -448,7 +541,7 @@ modify the current value, it can not decide by itself whether to flush the cache
 or not.  The application can however explicitly tell StarPU-MPI to flush the
 cache by calling starpu_mpi_cache_flush() or starpu_mpi_cache_flush_all_data(),
 for instance in case the data will not be used at all any more (see for instance
-the cholesky example in mpi/examples/matrix_decomposition), or at least not in
+the cholesky example in <c>mpi/examples/matrix_decomposition</c>), or at least not in
 the close future. If a newly-submitted task actually needs the value again,
 another transmission of D will be initiated from A to B.  A mere
 starpu_mpi_cache_flush_all_data() can for instance be added at the end of the whole
@@ -459,7 +552,7 @@ for the data deallocation will be the same, but it will additionally release som
 pressure from the StarPU-MPI cache hash table during task submission.
 
 The whole caching behavior can be disabled thanks to the \ref STARPU_MPI_CACHE
-environment variable. The variable \ref STARPU_MPI_CACHE_STATS can be set to 1
+environment variable. The variable \ref STARPU_MPI_CACHE_STATS can be set to <c>1</c>
 to enable the runtime to display messages when data are added or removed
 from the cache holding the received data.
 
@@ -484,9 +577,7 @@ migrate the data, and register the new location.
                                               (uintptr_t)NULL, sizeof(unsigned));
             if (data_handles[x][y]) {
                 /* Migrate the data */
-                starpu_mpi_get_data_on_node_detached(MPI_COMM_WORLD, data_handles[x][y], mpi_rank, NULL, NULL);
-                /* And register the new rank of the matrix */
-                starpu_mpi_data_set_rank(data_handles[x][y], mpi_rank);
+                starpu_mpi_data_migrate(MPI_COMM_WORLD, data_handles[x][y], mpi_rank);
             }
         }
     }
@@ -522,7 +613,7 @@ latest value on the original home node.
 
 \section MPICollective MPI Collective Operations
 
-The functions are described in \ref MPICollectiveOperations "MPI Collective Operations".
+The functions are described in \ref MPICollectiveOperations.
 
 \code{.c}
 if (rank == root)
@@ -577,6 +668,21 @@ starpu_mpi_gather_detached(data_handles, nblocks, 0, MPI_COMM_WORLD);
 
 */
 
+Other collective operations would be easy to define, just ask starpu-devel for
+them!
+
+\section MPIDebug Debugging MPI
+
+Communication trace will be enabled when the environment variable
+\ref STARPU_MPI_COMM is set to 1, and StarPU has been configured with the
+option \ref enable-verbose "--enable-verbose".
+
+Statistics will be enabled for the communication cache when the
+environment variable \ref STARPU_MPI_CACHE_STATS is set to 1. It
+prints messages on the standard output when data are added or removed
+from the received communication cache.
+
+
 \section MPIExamples More MPI examples
 
 MPI examples are available in the StarPU source code in mpi/examples:

doc/doxygen/chapters/17fft_support.doxy → doc/doxygen/chapters/420_fft_support.doxy


+ 23 - 7
doc/doxygen/chapters/18mic_scc_support.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2016  CNRS
  * Copyright (C) 2011, 2012 INRIA
  * See the file version.doxy for copying conditions.
  */
@@ -13,15 +13,31 @@
 SCC support just needs the presence of the RCCE library.
 
 MIC Xeon Phi support actually needs two compilations of StarPU, one for the host and one for
-the device. The PATH environment variable has to include the path to the
-cross-compilation toolchain, for instance <c>/usr/linux-k1om-4.7/bin</c>
+the device. The <c>PATH</c> environment variable has to include the path to the
+cross-compilation toolchain, for instance <c>/usr/linux-k1om-4.7/bin</c>.
+The <c>SINK_PKG_CONFIG_PATH</c> environment variable should include the path to the
+cross-compiled <c>hwloc.pc</c>.
 The script <c>mic-configure</c> can then be used to achieve the two compilations: it basically
 calls <c>configure</c> as appropriate from two new directories: <c>build_mic</c> and
 <c>build_host</c>. <c>make</c> and <c>make install</c> can then be used as usual and will
 recurse into both directories. If different configuration options are needed
-for the host and for the mic, one can use <c>--with-host-param=--with-fxt</b>
+for the host and for the mic, one can use <c>--with-host-param=--with-fxt</c>
 for instance to specify the <c>--with-fxt</c> option for the host only, or
-<c>--with-mic-param=--with-fxt</b> for the mic only.
+<c>--with-mic-param=--with-fxt</c> for the mic only.
+
+One can also run StarPU just natively on the Xeon Phi, i.e. it will only run
+directly on the Phi without any exchange with the host CPU. The binaries in
+<c>build_mic</c> can be run that way.
+
+For MPI support, you will probably have to specify different MPI compiler path
+or option for the host and the device builds, for instance:
+
+<c>./mic-configure --with-mic-param=--with-mpicc="/.../mpiicc -mmic" --with-host-param=--with-mpicc=/.../mpiicc</c>
+
+In case you have troubles with the coi or scif libraries (the Intel paths are
+really not standard, it seems...), you can still make a build in native mode
+only, by using <c>mic-configure --enable-native-mic</c> (and notably without
+<c>--enable-mic</c> since in that case we don't need mic offloading support).
 
 \section PortingApplicationsToMICSCC Porting Applications To MIC Xeon Phi / SCC
 
@@ -45,8 +61,8 @@ MIC programs are started from the host. StarPU automatically
 starts the same program on MIC devices. It however needs to get
 the MIC-cross-built binary. It will look for the file given by the
 environment variable \ref STARPU_MIC_SINK_PROGRAM_NAME or in the
-directory given by the environment variable \ref
-STARPU_MIC_SINK_PROGRAM_PATH, or in the field
+directory given by the environment variable \ref STARPU_MIC_SINK_PROGRAM_PATH,
+or in the field
 starpu_conf::mic_sink_program_path. It will also look in the current
 directory for the same binary name plus the suffix <c>-mic</c> or
 <c>_mic</c>.

doc/doxygen/chapters/19c_extensions.doxy → doc/doxygen/chapters/440_c_extensions.doxy


+ 226 - 0
doc/doxygen/chapters/450_native_fortran_support.doxy

@@ -0,0 +1,226 @@
+/*
+ * This file is part of the StarPU Handbook.
+ * Copyright (C) 2014, 2016 INRIA
+ * See the file version.doxy for copying conditions.
+ */
+
+/*! \page NativeFortranSupport The StarPU Native Fortran Support
+
+StarPU provides the necessary routines and support to natively access
+most of its functionalities from Fortran 2008+ codes.
+
+All symbols (functions, constants) are defined in <c>fstarpu_mod.f90</c>.
+Every symbol of the Native Fortran support API is prefixed by
+<c>fstarpu_</c>. 
+
+Note: Mixing uses of <c>fstarpu_</c> and <c>starpu_</c>
+symbols in the same Fortran code has unspecified behaviour.
+See \ref APIMIX for a discussion about valid and unspecified
+combinations.
+
+\section Implementation Implementation Details and Specificities
+
+\subsection Prerequisites Prerequisites
+
+The Native Fortran support relies on Fortran 2008 specific constructs,
+as well as on the support of interoperability of assumed-shape arrays
+introduced as part of Fortran's Technical Specification ISO/IEC TS 29113:2012,
+for which no equivalent are available in previous versions of the
+standard. It has currently been tested successfully with GNU GFortran 4.9,
+GFortran 5.x, GFortran 6.x and the Intel Fortran Compiler >= 2016. It is known
+not to work with GNU GFortran < 4.9, Intel Fortran Compiler < 2016.
+
+\subsection Configuration Configuration
+
+The Native Fortran API is enabled and its companion
+<c>fstarpu_mod.f90</c> Fortran module source file is installed
+by default when a Fortran compiler is found, unless the detected Fortran
+compiler is known not to support the requirements for the Native Fortran
+API. The support can be disabled through the configure option \ref
+disable-fortran "--disable-fortran". Conditional compiled source codes
+may check for the availability of the Native Fortran Support by testing
+whether the preprocessor macro <c>STARPU_HAVE_FC</c> is defined or not.
+
+\subsection Examples Examples
+
+Several examples using the Native Fortran API are provided in
+StarPU's <c>examples/native_fortran/</c> examples directory, to showcase
+the Fortran flavor of various basic and more advanced StarPU features.
+
+\subsection AppCompile Compiling a Native Fortran Application
+
+The Fortran module <c>fstarpu_mod.f90</c> installed in StarPU's
+<c>include/</c> directory provides all the necessary API definitions. It
+must be compiled with the same compiler (same vendor, same version) as
+the application itself, and the resulting <c>fstarpu_mod.o</c> object
+file must be linked with the application executable.
+
+Each example provided in StarPU's <c>examples/native_fortran/</c>
+examples directory comes with its own dedicated Makefile for out-of-tree
+build. Such example Makefiles may be used as starting points for
+building application codes with StarPU.
+
+\section Idioms Fortran Translation for Common StarPU API Idioms
+
+All these examples assume that the standard Fortran module <c>iso_c_binding</c>
+is in use.
+
+- Specifying a <c>NULL</c> pointer
+\code{.f90}
+        type(c_ptr) :: my_ptr  ! variable to store the pointer
+        ! [...]
+        my_ptr = C_NULL_PTR    ! assign standard constant for NULL ptr
+\endcode
+- Obtaining a pointer to some object:
+\code{.f90}
+        real(8), dimension(:), allocatable, target :: va
+        type(c_ptr) :: p_va  ! variable to store a pointer to array va
+        ! [...]
+        p_va = c_loc(va)
+\endcode
+- Obtaining a pointer to some subroutine:
+\code{.f90}
+        ! pointed routine definition
+        recursive subroutine myfunc () bind(C)
+        ! [...]
+        type(c_funptr) :: p_fun  ! variable to store the routine pointer
+        ! [...]
+        p_fun = c_funloc(myfunc)
+\endcode
+- Obtaining the size of some object:
+\code{.f90}
+        real(8) :: a
+        integer(c_size_t) :: sz_a  ! variable to store the size of a
+        ! [...]
+        sz_a = c_sizeof(a)
+\endcode
+- Obtaining the length of an array dimension:
+\code{.f90}
+        real(8), dimension(:,:), allocatable, target :: vb
+        integer(c_int) :: ln_vb_1  ! variable to store the length of vb's dimension 1
+        integer(c_int) :: ln_vb_2  ! variable to store the length of vb's dimension 2
+        ! [...]
+        ln_vb_1 = 1+ubound(vb,1)-lbound(vb,1)  ! get length of dimension 1 of vb
+        ln_vb_2 = 1+ubound(vb,2)-lbound(vb,2)  ! get length of dimension 2 of vb
+\endcode
+- Specifying a string constant:
+\code{.f90}
+        type(c_ptr) :: my_cl  ! a StarPU codelet
+        ! [...]
+
+        ! set the name of a codelet to string 'my_codelet':
+        call fstarpu_codelet_set_name(my_cl, C_CHAR_"my_codelet"//C_NULL_CHAR)
+
+        ! note: using the C_CHAR_ prefix and the //C_NULL_CHAR concatenation at the end ensures
+        ! that the string constant is properly '\0' terminated, and compatible with StarPU's
+        ! internal C routines
+        !
+        ! note: plain Fortran string constants are not '\0' terminated, and as such, must not be
+        ! passed to StarPU routines.
+\endcode
+
+- Combining multiple flag constants with a bitwise 'or':
+\code{.f90}
+        type(c_ptr) :: my_cl  ! a pointer for the codelet structure
+        ! [...]
+
+        ! add a managed buffer to a codelet, specifying both the Read/Write access mode and the Locality hint
+        call fstarpu_codelet_add_buffer(my_cl, FSTARPU_RW.ior.FSTARPU_LOCALITY)
+\endcode
+
+\section InitExit Uses, Initialization and Shutdown
+
+The snippet below show an example of minimal StarPU code using the
+Native Fortran support. The program should <c>use</c> the standard
+module <c>iso_c_binding</c> as well as StarPU's <c>fstarpu_mod</c>. The
+StarPU runtime engine is initialized with a call to function
+<c>fstarpu_init</c>, which returns an integer status of 0 if successful
+or non-0 otherwise. Eventually, a call to <c>fstarpu_shutdown</c> ends
+the runtime engine and frees all internal StarPU data structures.
+
+\snippet nf_initexit.f90 To be included. You should update doxygen if you see this text.
+
+\section InsertTask Fortran Flavor of StarPU's Variadic Insert_task
+
+Fortran does not have a construction similar to C variadic functions on which
+<c>starpu_insert_task</c> relies at the time of this writing. However, Fortran's variable
+length arrays of <c>c_ptr</c> elements enable to emulate much of the
+convenience of C's variadic functions. This is the approach retained for
+implementing <c>fstarpu_insert_task</c>.
+
+The general syntax for using <c>fstarpu_insert_task</c> is as follows:
+\code{.f90}
+        call fstarpu_insert_task((/ <codelet ptr>       &
+            [, <access mode flags>, <data handle>]*     &
+            [, <argument type constant>, <argument>]*   &
+            , C_NULL_PTR /))
+\endcode
+
+There is thus a unique array argument <c>(/ ... /)</c> passed to
+<c>fstarpu_insert_task</c> which itself contains the task settings.
+Each element of the array must be of type <c>type(c_ptr)</c>.
+The last element of the array must be <c>C_NULL_PTR</c>.
+
+Example extracted from nf_vector.f90:
+\code{.f90}
+        call fstarpu_insert_task((/ cl_vec,          &    ! codelet
+            FSTARPU_R, dh_va,                        &    ! a first data handle
+            FSTARPU_RW.ior.FSTARPU_LOCALITY, dh_vb,  &    ! a second data handle
+            C_NULL_PTR /))                                ! no more args
+\endcode
+
+\section Structs Functions and Subroutines Expecting Data Structures Arguments
+
+Several StarPU structures that are expected to be passed to the C API,
+are replaced by function/subroutine wrapper sets to allocate, set fields
+and free such structure. This strategy has been preferred over defining
+native Fortran equivalent of such structures using Fortran's derived
+types, to avoid potential layout mismatch between C and Fortran StarPU
+data structures. Examples of such data structures wrappers include
+<c>fstarpu_conf_allocate</c> and alike, <c>fstarpu_codelet_allocate</c>
+and alike, <c>fstarpu_data_filter_allocate</c> and alike.
+
+Here is an example of allocating, filling and deallocating a codelet
+structure:
+\code{.f90}
+        ! a pointer for the codelet structure
+        type(c_ptr) :: cl_vec
+        ! [...]
+        ! allocate an empty codelet structure
+        cl_vec = fstarpu_codelet_allocate()
+        ! add a CPU implementation function to the codelet
+        call fstarpu_codelet_add_cpu_func(cl_vec, C_FUNLOC(cl_cpu_func_vec))
+        ! set the codelet name
+        call fstarpu_codelet_set_name(cl_vec, C_CHAR_"my_vec_codelet"//C_NULL_CHAR)
+        ! add a Read-only mode data buffer to the codelet
+        call fstarpu_codelet_add_buffer(cl_vec, FSTARPU_R)
+        ! add a Read-Write mode data buffer to the codelet
+        call fstarpu_codelet_add_buffer(cl_vec, FSTARPU_RW.ior.FSTARPU_LOCALITY)
+        ! [...]
+        ! free codelet structure
+        call fstarpu_codelet_free(cl_vec)
+\endcode
+
+\section Notes Additional Notes about the Native Fortran Support
+\subsection OldFortran Using StarPU with Older Fortran Compilers
+
+When using older compilers, Fortran applications may still interoperate
+with StarPU using C marshalling functions as exemplified in StarPU's
+<c>examples/fortran/</c> and <c>examples/fortran90/</c> example
+directories, though the process will be less convenient.
+
+\subsection APIMIX Valid API Mixes and Language Mixes
+
+Mixing uses of
+<c>fstarpu_</c> and <c>starpu_</c> symbols in the same
+Fortran code has unspecified behaviour. Using <c>fstarpu_</c>
+symbols in C code has unspecified behaviour.
+
+For multi-language applications using both C and Fortran source files:
+
+- C source files must use <c>starpu_</c> symbols exclusively
+- Fortran sources must uniformly use either <c>fstarpu_</c> symbols
+exclusively, or <c>starpu_</c> symbols exclusively. Every other
+combination has unspecified behaviour.
+
+*/

+ 3 - 3
doc/doxygen/chapters/20socl_opencl_extensions.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2016  CNRS
  * Copyright (C) 2011, 2012 INRIA
  * See the file version.doxy for copying conditions.
  */
@@ -75,8 +75,8 @@ Number of platforms:	2
 $
 \endverbatim
 
-To enable the use of CPU cores via OpenCL, one can set the STARPU_OPENCL_ON_CPUS
-environment variable to 1 and STARPU_NCPUS to 0 (to avoid using CPUs both via
+To enable the use of CPU cores via OpenCL, one can set the \ref STARPU_OPENCL_ON_CPUS
+environment variable to 1 and \ref STARPU_NCPUS to 0 (to avoid using CPUs both via
 the OpenCL driver and the normal CPU driver).
 
 */

+ 60 - 18
doc/doxygen/chapters/21simgrid.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
  * Copyright (C) 2011, 2012 INRIA
  * See the file version.doxy for copying conditions.
  */
@@ -9,9 +9,10 @@
 /*! \page SimGridSupport SimGrid Support
 
 StarPU can use Simgrid in order to simulate execution on an arbitrary
-platform.
+platform. This was tested with simgrid 3.11, 3.12 and 3.13, other versions may have
+compatibility issues.
 
-\section Preparing Preparing your application for simulation.
+\section Preparing Preparing Your Application For Simulation
 
 There are a few technical details which need to be handled for an application to
 be simulated through Simgrid.
@@ -27,14 +28,15 @@ into starpu_main(), and it is libstarpu which will provide the real main() and
 will call the application's main().
 
 To be able to test with crazy data sizes, one may want to only allocate
-application data if STARPU_SIMGRID is not defined.  Passing a NULL pointer to
+application data if STARPU_SIMGRID is not defined.  Passing a <c>NULL</c> pointer to
 starpu_data_register functions is fine, data will never be read/written to by
 StarPU in Simgrid mode anyway.
 
 To be able to run the application with e.g. CUDA simulation on a system which
 does not have CUDA installed, one can fill the cuda_funcs with (void*)1, to
 express that there is a CUDA implementation, even if one does not actually
-provide it. StarPU will never actually run it in Simgrid mode anyway.
+provide it. StarPU will not actually run it in Simgrid mode anyway by default
+(unless the ::STARPU_CODELET_SIMGRID_EXECUTE flag is set in the codelet)
 
 \section Calibration Calibration
 
@@ -98,10 +100,10 @@ $ STARPU_SCHED=dmda ./examples/matvecmult/matvecmult
     matvecmult does not have a perfmodel, or is not calibrated enough
 \endverbatim
 
-The number of devices can be chosen as usual with \ref STARPU_NCPU, \ref
-STARPU_NCUDA, and \ref STARPU_NOPENCL, and the amount of GPU memory
-with \ref STARPU_LIMIT_CUDA_MEM, \ref STARPU_LIMIT_CUDA_devid_MEM, \ref
-STARPU_LIMIT_OPENCL_MEM, and \ref STARPU_LIMIT_OPENCL_devid_MEM.
+The number of devices can be chosen as usual with \ref STARPU_NCPU,
+\ref STARPU_NCUDA, and \ref STARPU_NOPENCL, and the amount of GPU memory
+with \ref STARPU_LIMIT_CUDA_MEM, \ref STARPU_LIMIT_CUDA_devid_MEM,
+\ref STARPU_LIMIT_OPENCL_MEM, and \ref STARPU_LIMIT_OPENCL_devid_MEM.
 
 \section SimulationOnAnotherMachine Simulation On Another Machine
 
@@ -118,25 +120,32 @@ If the desktop machine does not have CUDA or OpenCL, StarPU is still able to
 use simgrid to simulate execution with CUDA/OpenCL devices, but the application
 source code will probably disable the CUDA and OpenCL codelets in that
 case. Since during simgrid execution, the functions of the codelet are actually
-not called, one can use dummy functions such as the following to still permit
-CUDA or OpenCL execution:
+not called by default, one can use dummy functions such as the following to
+still permit CUDA or OpenCL execution.
 
-\section SimulationExamples Simulation examples
+\section SimulationExamples Simulation Examples
 
 StarPU ships a few performance models for a couple of systems: attila,
 mirage, idgraf, and sirocco. See section \ref SimulatedBenchmarks for the details.
 
-\section Tweaking simulation
+\section FakeSimulations Simulations On Fake Machines
+
+It is possible to build fake machines which do not exist, by modifying the
+platform file in <c>$STARPU_HOME/.starpu/sampling/bus/machine.platform.xml</c>
+by hand: one can add more CPUs, add GPUs (but the performance model file has to
+be extended as well), change the available GPU memory size, PCI memory bandwidth, etc.
+
+\section TweakingSimulation Tweaking Simulation
 
 The simulation can be tweaked, to be able to tune it between a very accurate
 simulation and a very simple simulation (which is thus close to scheduling
-theory results), see the \ref STARPU_SIMGRID_CUDA_MALLOC_COST and \ref
-STARPU_SIMGRID_CUDA_QUEUE_COST environment variables.
+theory results), see the \ref STARPU_SIMGRID_CUDA_MALLOC_COST and
+\ref STARPU_SIMGRID_CUDA_QUEUE_COST environment variables.
 
-\section MPI applications
+\section SimulationMPIApplications MPI Applications
 
 StarPU-MPI applications can also be run in simgrid mode. It needs to be compiled
-with smpicc, and run using the starpu_smpirun script, for instance:
+with smpicc, and run using the <c>starpu_smpirun</c> script, for instance:
 
 \verbatim
 $ STARPU_SCHED=dmda starpu_smpirun -platform cluster.xml -hostfile hostfile ./mpi/tests/pingpong
@@ -147,7 +156,7 @@ list of MPI nodes to be used. StarPU currently only supports homogeneous MPI
 clusters: for each MPI node it will just replicate the architecture referred by
 \ref STARPU_HOSTNAME.
 
-\section Debugging applications
+\section SimulationDebuggingApplications Debugging Applications
 
 By default, simgrid uses its own implementation of threads, which prevents gdb
 from being able to inspect stacks of all threads.  To be able to fully debug an
@@ -157,5 +166,38 @@ able to manipulate as usual.
 
 \snippet simgrid.c To be included. You should update doxygen if you see this text.
 
+\section SimulationMemoryUsage Memory Usage
+
+Since kernels are not actually run and data transfers are not actually
+performed, the data memory does not actually need to be allocated.  This allows
+for instance to simulate the execution of applications processing very big data
+on a small laptop.
+
+The application can for instance pass <c>1</c> (or whatever bogus pointer)
+to starpu data registration functions, instead of allocating data. This will
+however require the application to take care of not trying to access the data,
+and will not work in MPI mode, which performs transfers.
+
+Another way is to pass the \ref STARPU_MALLOC_SIMULATION_FOLDED flag to the
+starpu_malloc_flags() function. This will make it allocate a memory area which
+one can read/write, but optimized so that this does not actually consume
+memory. Of course, the values read from such area will be bogus, but this allows
+the application to keep e.g. data load, store, initialization as it is, and also
+work in MPI mode.
+
+Note however that notably Linux kernels refuse obvious memory overcommitting by
+default, so a single allocation can typically not be bigger than the amount of
+physical memory, see https://www.kernel.org/doc/Documentation/vm/overcommit-accounting
+This prevents for instance from allocating a single huge matrix. Allocating a
+huge matrix in several tiles is not a problem, however. <c>sysctl
+vm.overcommit_memory=1</c> can also be used to allow such overcommit.
+
+Note however that this folding is done by remapping the same file several times,
+and Linux kernels will also refuse to create too many memory areas. <c>sysctl
+vm.max_map_count</c> can be used to check and change the default (65535). By
+default, StarPU uses a 1MiB file, so it hopefully fits in the CPU cache. This
+however limits the amount of such folded memory to a bit below 64GiB. The
+\ref STARPU_MALLOC_SIMULATION_FOLD environment variable can be used to increase the
+size of the file.
 
 */

doc/doxygen/chapters/22openmp_runtime_support.doxy → doc/doxygen/chapters/480_openmp_runtime_support.doxy


+ 67 - 22
doc/doxygen/chapters/23clustering_a_machine.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2015 Universit@'e de Bordeaux
- * Copyright (C) 2015 CNRS
+ * Copyright (C) 2015, 2016 CNRS
  * Copyright (C) 2015 INRIA
  * See the file version.doxy for copying conditions.
  */
@@ -46,7 +46,7 @@ object, of the type <c>hwloc_obj_type_t</c>. More can be found in the
 documentation</a>.
 
 Once a cluster is created, the full machine is represented with an opaque
-structure named <c>starpu_cluster_machine</c>. This can be printed to show the
+structure starpu_cluster_machine. This can be printed to show the
 current machine state.
 
 \code{.c}
@@ -96,31 +96,31 @@ threads (in pink).
 \image latex parallel_worker2.eps "StarPU with an OpenMP cluster" width=0.3\textwidth
 \image html parallel_worker2.png "StarPU with an OpenMP cluster"
 
-Finally, the following code shows how to create OpenMP cooperate with StarPU
+Finally, the following code shows how to force OpenMP to cooperate with StarPU
 and create the aforementioned OpenMP threads constrained in the cluster's
 resources set:
 \code{.c}
 void starpu_openmp_prologue(void * sched_ctx_id)
-        int sched_ctx = *(int*)sched_ctx_id;
-		int *cpuids = NULL;
-		int ncpuids = 0;
-		int workerid = starpu_worker_get_id();
-
-        //we can target only CPU workers
-		if (starpu_worker_get_type(workerid) == STARPU_CPU_WORKER)
-		{
-                //grab all the ids inside the cluster
-				starpu_sched_ctx_get_available_cpuids(sched_ctx, &cpuids, &ncpuids);
-                //set the number of threads
-				omp_set_num_threads(ncpuids);
+  int sched_ctx = *(int*)sched_ctx_id;
+  int *cpuids = NULL;
+  int ncpuids = 0;
+  int workerid = starpu_worker_get_id();
+
+  //we can target only CPU workers
+  if (starpu_worker_get_type(workerid) == STARPU_CPU_WORKER)
+  {
+    //grab all the ids inside the cluster
+    starpu_sched_ctx_get_available_cpuids(sched_ctx, &cpuids, &ncpuids);
+    //set the number of threads
+    omp_set_num_threads(ncpuids);
 #pragma omp parallel
-                {
-                        //bind each threads to its respective resource
-						starpu_sched_ctx_bind_current_thread_to_cpuid(cpuids[omp_get_thread_num()]);
-                }
-                free(cpuids);
-		}
-		return;
+    {
+      //bind each threads to its respective resource
+      starpu_sched_ctx_bind_current_thread_to_cpuid(cpuids[omp_get_thread_num()]);
+    }
+    free(cpuids);
+  }
+  return;
 }
 \endcode
 
@@ -162,4 +162,49 @@ clusters = starpu_cluster_machine(HWLOC_OBJ_SOCKET,
                                   0);
 \endcode
 
+\section ClustersWithSchedulingContextsAPI Clusters With Scheduling Contexts API
+
+As previously mentioned, the cluster API is implemented
+on top of \ref SchedulingContexts. Its main addition is to ease the
+creation of a machine CPU partition with no overlapping by using
+HwLoc, whereas scheduling contexts can use any number of any
+resources.
+
+It is therefore possible, but not recommended, to create clusters
+using the scheduling contexts API. This can be useful mostly in the
+most complex machine configurations where the user has to dimension
+precisely clusters by hand using his own algorithm.
+
+\code{.c}
+/* the list of resources the context will manage */
+int workerids[3] = {1, 3, 10};
+
+/* indicate the list of workers assigned to it, the number of workers,
+the name of the context and the scheduling policy to be used within
+the context */
+int id_ctx = starpu_sched_ctx_create(workerids, 3, "my_ctx", 0);
+
+/* let StarPU know that the following tasks will be submitted to this context */
+starpu_sched_ctx_set_task_context(id);
+
+task->prologue_callback_pop_func=runtime_interface_function_here;
+
+/* submit the task to StarPU */
+starpu_task_submit(task);
+\endcode
+
+As this example illustrates, creating a context without scheduling
+policy will create a cluster. The important change is that the user
+will have to specify an interface function between the two runtimes he
+plans to use. This can be done in the
+<c>prologue_callback_pop_func</c> field of the task. Such a function
+can be similar to the OpenMP thread team creation one.
+
+Note that the OpenMP mode is the default one both for clusters and
+contexts. The result of a cluster creation is a woken up master worker
+and sleeping "slaves" which allow the master to run tasks on their
+resources. To create a cluster with woken up workers one can use the
+flag \ref STARPU_SCHED_CTX_AWAKE_WORKERS with the scheduling context
+API and \ref STARPU_CLUSTER_AWAKE_WORKERS with the cluster API as
+parameter to the creation function.
+
 */

+ 157 - 28
doc/doxygen/chapters/40environment_variables.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
  * Copyright (C) 2011, 2012 INRIA
  * See the file version.doxy for copying conditions.
  */
@@ -85,9 +85,8 @@ execution of all tasks.
 \anchor STARPU_OPENCL_ON_CPUS
 \addindex __env__STARPU_OPENCL_ON_CPUS
 By default, the OpenCL driver only enables GPU and accelerator
-devices. By setting the environment variable \ref
-STARPU_OPENCL_ON_CPUS to 1, the OpenCL driver will also enable CPU
-devices.
+devices. By setting the environment variable \ref STARPU_OPENCL_ON_CPUS
+to 1, the OpenCL driver will also enable CPU devices.
 </dd>
 
 <dt>STARPU_OPENCL_ONLY_ON_CPUS</dt>
@@ -95,9 +94,8 @@ devices.
 \anchor STARPU_OPENCL_ONLY_ON_CPUS
 \addindex __env__STARPU_OPENCL_ONLY_ON_CPUS
 By default, the OpenCL driver enables GPU and accelerator
-devices. By setting the environment variable \ref
-STARPU_OPENCL_ONLY_ON_CPUS to 1, the OpenCL driver will ONLY enable
-CPU devices.
+devices. By setting the environment variable \ref STARPU_OPENCL_ONLY_ON_CPUS
+to 1, the OpenCL driver will ONLY enable CPU devices.
 </dd>
 
 <dt>STARPU_NMIC</dt>
@@ -153,8 +151,8 @@ and <c>STARPU_WORKERS_CPUID = "0 2 1 3"</c>, the CUDA device will be controlled
 by logical CPU #0, the OpenCL device will be controlled by logical CPU #2, and
 the logical CPUs #1 and #3 will be used by the CPU workers.
 
-If the number of workers is larger than the array given in \ref
-STARPU_WORKERS_CPUID, the workers are bound to the logical CPUs in a
+If the number of workers is larger than the array given in
+\ref STARPU_WORKERS_CPUID, the workers are bound to the logical CPUs in a
 round-robin fashion: if <c>STARPU_WORKERS_CPUID = "0 1"</c>, the first
 and the third (resp. second and fourth) workers will be put on CPU #0
 (resp. CPU #1).
@@ -214,6 +212,13 @@ starpu_conf::use_explicit_workers_scc_deviceid passed to starpu_init()
 is set.
 </dd>
 
+<dt>STARPU_WORKER_TREE</dt>
+<dd>
+\anchor STARPU_WORKER_TREE
+\addindex __env__STARPU_WORKER_TREE
+Define to 1 to enable the tree iterator in schedulers.
+</dd>
+
 <dt>STARPU_SINGLE_COMBINED_WORKER</dt>
 <dd>
 \anchor STARPU_SINGLE_COMBINED_WORKER
@@ -313,6 +318,27 @@ and friends.  The default is Enabled.
 This permits to test the performance effect of memory pinning.
 </dd>
 
+<dt>STARPU_MIC_SINK_PROGRAM_NAME</dt>
+<dd>
+\anchor STARPU_MIC_SINK_PROGRAM_NAME
+\addindex __env__STARPU_MIC_SINK_PROGRAM_NAME
+todo
+</dd>
+
+<dt>STARPU_MIC_SINK_PROGRAM_PATH</dt>
+<dd>
+\anchor STARPU_MIC_SINK_PROGRAM_PATH
+\addindex __env__STARPU_MIC_SINK_PROGRAM_PATH
+todo
+</dd>
+
+<dt>STARPU_MIC_PROGRAM_PATH</dt>
+<dd>
+\anchor STARPU_MIC_PROGRAM_PATH
+\addindex __env__STARPU_MIC_PROGRAM_PATH
+todo
+</dd>
+
 </dl>
 
 \section ConfiguringTheSchedulingEngine Configuring The Scheduling Engine
@@ -329,6 +355,20 @@ random, stealing, greedy, with performance models, etc.
 Use <c>STARPU_SCHED=help</c> to get the list of available schedulers.
 </dd>
 
+<dt>STARPU_MIN_PRIO</dt>
+<dd>
+\anchor STARPU_MIN_PRIO_env
+\addindex __env__STARPU_MIN_PRIO
+Set the mininum priority used by priorities-aware schedulers.
+</dd>
+
+<dt>STARPU_MAX_PRIO</dt>
+<dd>
+\anchor STARPU_MAX_PRIO_env
+\addindex __env__STARPU_MAX_PRIO
+Set the maximum priority used by priorities-aware schedulers.
+</dd>
+
 <dt>STARPU_CALIBRATE</dt>
 <dd>
 \anchor STARPU_CALIBRATE
@@ -390,14 +430,14 @@ the coefficient to be applied to it before adding it to the computation part.
 <dd>
 \anchor STARPU_SCHED_GAMMA
 \addindex __env__STARPU_SCHED_GAMMA
-Define the execution time penalty of a joule (\ref Power-basedScheduling).
+Define the execution time penalty of a joule (\ref Energy-basedScheduling).
 </dd>
 
 <dt>STARPU_IDLE_POWER</dt>
 <dd>
 \anchor STARPU_IDLE_POWER
 \addindex __env__STARPU_IDLE_POWER
-Define the idle power of the machine (\ref Power-basedScheduling).
+Define the idle power of the machine (\ref Energy-basedScheduling).
 </dd>
 
 <dt>STARPU_PROFILING</dt>
@@ -417,8 +457,8 @@ Enable on-line performance monitoring (\ref EnablingOn-linePerformanceMonitoring
 <dd>
 \anchor SOCL_OCL_LIB_OPENCL
 \addindex __env__SOCL_OCL_LIB_OPENCL
-THE SOCL test suite is only run when the environment variable \ref
-SOCL_OCL_LIB_OPENCL is defined. It should contain the location
+THE SOCL test suite is only run when the environment variable
+\ref SOCL_OCL_LIB_OPENCL is defined. It should contain the location
 of the file <c>libOpenCL.so</c> of the OCL ICD implementation.
 </dd>
 
@@ -487,6 +527,30 @@ When set to 1 (which is the default), CUDA task and transfer queueing costs are
 taken into account in simgrid mode.
 </dd>
 
+<dt>STARPU_PCI_FLAT</dt>
+<dd>
+\anchor STARPU_PCI_FLAT
+\addindex __env__STARPU_PCI_FLAT
+When unset or set to 0, the platform file created for simgrid will
+contain PCI bandwidths and routes.
+</dd>
+
+<dt>STARPU_SIMGRID_QUEUE_MALLOC_COST</dt>
+<dd>
+\anchor STARPU_SIMGRID_QUEUE_MALLOC_COST
+\addindex __env__STARPU_SIMGRID_QUEUE_MALLOC_COST
+When unset or set to 1, simulate within simgrid the GPU transfer queueing.
+</dd>
+
+<dt>STARPU_MALLOC_SIMULATION_FOLD</dt>
+<dd>
+\anchor STARPU_MALLOC_SIMULATION_FOLD
+\addindex __env__STARPU_MALLOC_SIMULATION_FOLD
+This defines the size of the file used for folding virtual allocation, in
+MiB. The default is 1, thus allowing 64GiB virtual memory when Linux's
+<c>sysctl vm.max_map_count</c> value is the default 65535.
+</dd>
+
 </dl>
 
 \section MiscellaneousAndDebug Miscellaneous And Debug
@@ -502,6 +566,15 @@ configuration files. The default is <c>$HOME</c> on Unix environments,
 and <c>$USERPROFILE</c> on Windows environments.
 </dd>
 
+<dt>STARPU_PATH</dt>
+<dd>
+\anchor STARPU_PATH
+\addindex __env__STARPU_PATH
+Only used on Windows environments.
+This specifies the main directory in which StarPU is installed
+(\ref RunningABasicStarPUApplicationOnMicrosoft).
+</dd>
+
 <dt>STARPU_PERF_MODEL_DIR</dt>
 <dd>
 \anchor STARPU_PERF_MODEL_DIR
@@ -670,25 +743,25 @@ This specifies then size to be used by StarPU to push data when the main
 memory is getting full. The default is unlimited.
 </dd>
 
-<dt>STARPU_LIMIT_MAX_NSUBMITTED_TASKS</dt>
+<dt>STARPU_LIMIT_MAX_SUBMITTED_TASKS</dt>
 <dd>
-\anchor STARPU_LIMIT_MAX_NSUBMITTED_TASKS
-\addindex __env__STARPU_LIMIT_MAX_NSUBMITTED_TASKS    
+\anchor STARPU_LIMIT_MAX_SUBMITTED_TASKS
+\addindex __env__STARPU_LIMIT_MAX_SUBMITTED_TASKS    
 This variable allows the user to control the task submission flow by specifying
 to StarPU a maximum number of submitted tasks allowed at a given time, i.e. when
 this limit is reached task submission becomes blocking until enough tasks have
-completed, specified by STARPU_LIMIT_MIN_NSUBMITTED_TASKS.
+completed, specified by \ref STARPU_LIMIT_MIN_SUBMITTED_TASKS.
 Setting it enables allocation cache buffer reuse in main memory.
 </dd>
 
-<dt>STARPU_LIMIT_MIN_NSUBMITTED_TASKS</dt>
+<dt>STARPU_LIMIT_MIN_SUBMITTED_TASKS</dt>
 <dd>
-\anchor STARPU_LIMIT_MIN_NSUBMITTED_TASKS
-\addindex __env__STARPU_LIMIT_MIN_NSUBMITTED_TASKS    
+\anchor STARPU_LIMIT_MIN_SUBMITTED_TASKS
+\addindex __env__STARPU_LIMIT_MIN_SUBMITTED_TASKS    
 This variable allows the user to control the task submission flow by specifying
 to StarPU a submitted task threshold to wait before unblocking task submission. This
-variable has to be used in conjunction with \ref
-STARPU_LIMIT_MAX_NSUBMITTED_TASKS which puts the task submission thread to
+variable has to be used in conjunction with \ref STARPU_LIMIT_MAX_SUBMITTED_TASKS
+which puts the task submission thread to
 sleep.  Setting it enables allocation cache buffer reuse in main memory.
 </dd>
 
@@ -724,6 +797,14 @@ When set to 0, disable the display of memory statistics on data which
 have not been unregistered at the end of the execution (\ref MemoryFeedback).
 </dd>
 
+<dt>STARPU_MAX_MEMORY_USE</dt>
+<dd>
+\anchor STARPU_MAX_MEMORY_USE
+\addindex __env__STARPU_MAX_MEMORY_USE
+When set to 1, display at the end of the execution the maximum memory used by
+StarPU for internal data structures during execution.
+</dd>
+
 <dt>STARPU_BUS_STATS</dt>
 <dd>
 \anchor STARPU_BUS_STATS
@@ -738,8 +819,8 @@ starpu_shutdown() (\ref Profiling).
 \addindex __env__STARPU_WORKER_STATS
 When defined, statistics about the workers will be displayed when calling
 starpu_shutdown() (\ref Profiling). When combined with the
-environment variable \ref STARPU_PROFILING, it displays the power
-consumption (\ref Power-basedScheduling).
+environment variable \ref STARPU_PROFILING, it displays the energy
+consumption (\ref Energy-basedScheduling).
 </dd>
 
 <dt>STARPU_STATS</dt>
@@ -755,9 +836,10 @@ end of the execution of an application (\ref DataStatistics).
 \anchor STARPU_WATCHDOG_TIMEOUT
 \addindex __env__STARPU_WATCHDOG_TIMEOUT
 When set to a value other than 0, allows to make StarPU print an error
-message whenever StarPU does not terminate any task for the given time (in µs). Should
-be used in combination with \ref STARPU_WATCHDOG_CRASH (see \ref
-DetectionStuckConditions).
+message whenever StarPU does not terminate any task for the given time (in µs),
+but lets the application continue normally. Should
+be used in combination with \ref STARPU_WATCHDOG_CRASH
+(see \ref DetectionStuckConditions).
 </dd>
 
 <dt>STARPU_WATCHDOG_CRASH</dt>
@@ -769,6 +851,35 @@ dog is reached, thus allowing to catch the situation in gdb, etc
 (see \ref DetectionStuckConditions)
 </dd>
 
+<dt>STARPU_TASK_BREAK_ON_SCHED</dt>
+<dd>
+\anchor STARPU_TASK_BREAK_ON_SCHED
+\addindex __env__STARPU_TASK_BREAK_ON_SCHED
+When this variable contains a job id, StarPU will raise SIGTRAP when the task
+with that job id is being scheduled by the scheduler (at a scheduler-specific
+point), which will be nicely caught by debuggers.
+This only works for schedulers which have such a scheduling point defined
+(see \ref DebuggingScheduling)
+</dd>
+
+<dt>STARPU_TASK_BREAK_ON_PUSH</dt>
+<dd>
+\anchor STARPU_TASK_BREAK_ON_PUSH
+\addindex __env__STARPU_TASK_BREAK_ON_PUSH
+When this variable contains a job id, StarPU will raise SIGTRAP when the task
+with that job id is being pushed to the scheduler, which will be nicely caught by debuggers
+(see \ref DebuggingScheduling)
+</dd>
+
+<dt>STARPU_TASK_BREAK_ON_POP</dt>
+<dd>
+\anchor STARPU_TASK_BREAK_ON_POP
+\addindex __env__STARPU_TASK_BREAK_ON_POP
+When this variable contains a job id, StarPU will raise SIGTRAP when the task
+with that job id is being popped from the scheduler, which will be nicely caught by debuggers
+(see \ref DebuggingScheduling)
+</dd>
+
 <dt>STARPU_DISABLE_KERNELS</dt>
 <dd>
 \anchor STARPU_DISABLE_KERNELS
@@ -795,7 +906,25 @@ average.
 The random scheduler and some examples use random numbers for their own
 working. Depending on the examples, the seed is by default just always 0 or
 the current time() (unless simgrid mode is enabled, in which case it is always
-0). STARPU_RAND_SEED allows to set the seed to a specific value.
+0). \ref STARPU_RAND_SEED allows to set the seed to a specific value.
+</dd>
+
+<dt>STARPU_IDLE_TIME</dt>
+<dd>
+\anchor STARPU_IDLE_TIME
+\addindex __env__STARPU_IDLE_TIME
+When set to a value being a valid filename, a corresponding file
+will be created when shutting down StarPU. The file will contain the
+sum of all the workers' idle time.
+</dd>
+
+<dt>STARPU_GLOBAL_ARBITER</dt>
+<dd>
+\anchor STARPU_GLOBAL_ARBITER
+\addindex __env__STARPU_GLOBAL_ARBITER
+When set to a positive value, StarPU will create an arbiter, which
+implements an advanced but centralized management of concurrent data
+accesses (see \ref ConcurrentDataAccess).
 </dd>
 
 </dl>

+ 15 - 5
doc/doxygen/chapters/41configure_options.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
  * Copyright (C) 2011, 2012 INRIA
  * See the file version.doxy for copying conditions.
  */
@@ -442,12 +442,22 @@ Enable the use of OpenGL for the rendering of some examples.
 // TODO: rather default to enabled when detected
 </dd>
 
-<dt>--enable-blas-lib</dt>
+<dt>--enable-blas-lib=<c>prefix</c></dt>
 <dd>
 \anchor enable-blas-lib
 \addindex __configure__--enable-blas-lib
-Specify the blas library to be used by some of the examples. The
-library has to be 'atlas' or 'goto'.
+Specify the blas library to be used by some of the examples. Libraries available:
+- none [default]: no BLAS library is used
+- atlas: use ATLAS library
+- goto: use GotoBLAS library
+- mkl: use MKL library (you may need to set specific CFLAGS and LDFLAGS with --with-mkl-cflags and --with-mkl-ldflags)
+</dd>
+
+<dt>--enable-leveldb</dt>
+<dd>
+\anchor enable-leveldb
+\addindex __configure__--enable-leveldb
+Enable linking with LevelDB if available
 </dd>
 
 <dt>--disable-starpufft</dt>
@@ -543,7 +553,7 @@ Disable the build of tests.
 <dd>
 \anchor enable-sc-hypervisor
 \addindex __configure__--enable-sc-hypervisor
-Enable the Scheduling Context Hypervisor plugin(\ref SchedulingContextHypervisor).
+Enable the Scheduling Context Hypervisor plugin (\ref SchedulingContextHypervisor).
 By default, it is disabled.
 </dd>
 

+ 5 - 1
doc/doxygen/chapters/45files.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
  * Copyright (C) 2011, 2012 INRIA
  * See the file version.doxy for copying conditions.
 */
@@ -13,6 +13,7 @@
 \file starpu.h
 \file starpu_bitmap.h
 \file starpu_bound.h
+\file starpu_clusters_util.h
 \file starpu_cublas.h
 \file starpu_cuda.h
 \file starpu_data_filters.h
@@ -25,15 +26,18 @@
 \file starpu_fxt.h
 \file starpu_hash.h
 \file starpu_mic.h
+\file starpu_mod.f90
 \file starpu_opencl.h
 \file starpu_openmp.h
 \file starpu_perfmodel.h
 \file starpu_profiling.h
 \file starpu_rand.h
 \file starpu_scc.h
+\file starpu_sched_component.h
 \file starpu_sched_ctx.h
 \file starpu_sched_ctx_hypervisor.h
 \file starpu_scheduler.h
+\file starpu_simgrid_wrap.h
 \file starpu_sink.h
 \file starpu_stdlib.h
 \file starpu_task_bundle.h

doc/doxygen/chapters/50scaling-vector-example.doxy → doc/doxygen/chapters/601_scaling_vector_example.doxy


doc/doxygen/chapters/51fdl-1.3.doxy → doc/doxygen/chapters/610_fdl_1_3.doxy


+ 41 - 30
doc/doxygen/chapters/api/codelet_and_tasks.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
  * Copyright (C) 2011, 2012 INRIA
  * See the file version.doxy for copying conditions.
  */
@@ -133,6 +133,9 @@ Value to be set in starpu_codelet::cuda_flags to allow asynchronous CUDA kernel
 \def STARPU_OPENCL_ASYNC
 Value to be set in starpu_codelet::opencl_flags to allow asynchronous OpenCL kernel execution.
 
+\def STARPU_CODELET_SIMGRID_EXECUTE
+Value to be set in starpu_codelet::flags to execute the codelet functions even in simgrid mode.
+
 \typedef starpu_cpu_func_t
 \ingroup API_Codelet_And_Tasks
 CPU implementation of a codelet.
@@ -157,7 +160,7 @@ SCC implementation of a codelet.
 \ingroup API_Codelet_And_Tasks
 MIC kernel for a codelet
 
-\typedef *starpu_scc_kernel_t
+\typedef starpu_scc_kernel_t
 \ingroup API_Codelet_And_Tasks
 SCC kernel for a codelet
 
@@ -309,8 +312,8 @@ Is an array of ::starpu_data_access_mode. It describes the required
 access modes to the data needed by the codelet (e.g. ::STARPU_RW).
 The number of entries in this array must be specified in the field
 starpu_codelet::nbuffers. This field should be used for codelets having a
-number of datas greater than \ref STARPU_NMAXBUFS (see \ref
-SettingManyDataHandlesForATask). When defining a codelet, one
+number of datas greater than \ref STARPU_NMAXBUFS
+(see \ref SettingManyDataHandlesForATask). When defining a codelet, one
 should either define this field or the field starpu_codelet::modes defined above.
 
 \var unsigned starpu_codelet::specific_nodes
@@ -331,8 +334,8 @@ Optional field. When starpu_codelet::specific_nodes is 1, this specifies
 the memory nodes where each data should be sent to for task execution.
 The number of entries in this array is starpu_codelet::nbuffers.
 This field should be used for codelets having a
-number of datas greater than \ref STARPU_NMAXBUFS (see \ref
-SettingManyDataHandlesForATask). When defining a codelet, one
+number of datas greater than \ref STARPU_NMAXBUFS
+(see \ref SettingManyDataHandlesForATask). When defining a codelet, one
 should either define this field or the field starpu_codelet::nodes defined
 above.
 
@@ -341,8 +344,8 @@ Optional pointer to the task duration performance model associated to
 this codelet. This optional field is ignored when set to <c>NULL</c> or when
 its field starpu_perfmodel::symbol is not set.
 
-\var struct starpu_perfmodel *starpu_codelet::power_model
-Optional pointer to the task power consumption performance model
+\var struct starpu_perfmodel *starpu_codelet::energy_model
+Optional pointer to the task energy consumption performance model
 associated to this codelet. This optional field is ignored when set to
 <c>NULL</c> or when its field starpu_perfmodel::field is not set. In
 the case of parallel codelets, this has to account for all processing
@@ -358,11 +361,14 @@ starpu_codelet_display_stats() for details).
 Optional name of the codelet. This can be useful for debugging
 purposes.
 
+\var const char *starpu_codelet::flags
+Various flags for the codelet.
+
 \fn void starpu_codelet_init(struct starpu_codelet *cl)
 \ingroup API_Codelet_And_Tasks
 Initialize \p cl with default values. Codelets should
-preferably be initialized statically as shown in \ref
-DefiningACodelet. However such a initialisation is not always
+preferably be initialized statically as shown in
+\ref DefiningACodelet. However such an initialisation is not always
 possible, e.g. when using C++.
 
 \struct starpu_data_descr
@@ -442,14 +448,18 @@ It is an array of ::starpu_data_access_mode. It describes the required
 access modes to the data needed by the codelet (e.g. ::STARPU_RW).
 The number of entries in this array must be specified in the field
 starpu_codelet::nbuffers. This field should be used for codelets having a
-number of datas greater than \ref STARPU_NMAXBUFS (see \ref
-SettingManyDataHandlesForATask). When defining a codelet, one
+number of datas greater than \ref STARPU_NMAXBUFS
+(see \ref SettingManyDataHandlesForATask). When defining a codelet, one
 should either define this field or the field starpu_task::modes defined above.
 
 \var void *starpu_task::cl_arg
 Optional pointer which is passed to the codelet through the second
 argument of the codelet implementation (e.g. starpu_codelet::cpu_func
 or starpu_codelet::cuda_func). The default value is <c>NULL</c>.
+starpu_codelet_pack_args() and starpu_codelet_unpack_args() are helpers that
+can be used to respectively pack and unpack data into and from it, but the
+application can manage it any way, the only requirement is that the size of the
+data must be set in starpu_task::cl_arg_size.
 
 \var size_t starpu_task::cl_arg_size
 Optional field. For some specific drivers, the pointer
@@ -669,15 +679,15 @@ with the function starpu_task_init() function.
 \def STARPU_TASK_GET_NBUFFERS(task)
 \ingroup API_Codelet_And_Tasks
 Return the number of buffers for this task, i.e. starpu_codelet::nbuffers, or
-starpu_task::nbuffers if the former is STARPU_VARIABLE_BUFFERS.
+starpu_task::nbuffers if the former is STARPU_VARIABLE_NBUFFERS.
 
 \def STARPU_TASK_GET_HANDLE(task, i)
 \ingroup API_Codelet_And_Tasks
 Return the \p i th data handle of the given task. If the task
 is defined with a static or dynamic number of handles, will either
 return the \p i th element of the field starpu_task::handles or the \p
-i th element of the field starpu_task::dyn_handles (see \ref
-SettingManyDataHandlesForATask)
+i th element of the field starpu_task::dyn_handles
+(see \ref SettingManyDataHandlesForATask)
 
 \def STARPU_TASK_SET_HANDLE(task, handle, i)
 \ingroup API_Codelet_And_Tasks
@@ -685,8 +695,8 @@ Set the \p i th data handle of the given task with the given
 data handle. If the task is defined with a static or dynamic number of
 handles, will either set the \p i th element of the field
 starpu_task::handles or the \p i th element of the field
-starpu_task::dyn_handles (see \ref
-SettingManyDataHandlesForATask)
+starpu_task::dyn_handles
+(see \ref SettingManyDataHandlesForATask)
 
 \def STARPU_CODELET_GET_MODE(codelet, i)
 \ingroup API_Codelet_And_Tasks
@@ -694,8 +704,8 @@ Return the access mode of the \p i th data handle of the given
 codelet. If the codelet is defined with a static or dynamic number of
 handles, will either return the \p i th element of the field
 starpu_codelet::modes or the \p i th element of the field
-starpu_codelet::dyn_modes (see \ref
-SettingManyDataHandlesForATask)
+starpu_codelet::dyn_modes
+(see \ref SettingManyDataHandlesForATask)
 
 \def STARPU_CODELET_SET_MODE(codelet, mode, i)
 \ingroup API_Codelet_And_Tasks
@@ -703,8 +713,8 @@ Set the access mode of the \p i th data handle of the given
 codelet. If the codelet is defined with a static or dynamic number of
 handles, will either set the \p i th element of the field
 starpu_codelet::modes or the \p i th element of the field
-starpu_codelet::dyn_modes (see \ref
-SettingManyDataHandlesForATask)
+starpu_codelet::dyn_modes
+(see \ref SettingManyDataHandlesForATask)
 
 \def STARPU_TASK_GET_MODE(task, i)
 \ingroup API_Codelet_And_Tasks
@@ -712,8 +722,8 @@ Return the access mode of the \p i th data handle of the given
 task. If the task is defined with a static or dynamic number of
 handles, will either return the \p i th element of the field
 starpu_task::modes or the \p i th element of the field
-starpu_task::dyn_modes (see \ref
-SettingManyDataHandlesForATask)
+starpu_task::dyn_modes
+(see \ref SettingManyDataHandlesForATask)
 
 \def STARPU_TASK_SET_MODE(task, mode, i)
 \ingroup API_Codelet_And_Tasks
@@ -721,8 +731,8 @@ Set the access mode of the \p i th data handle of the given
 task. If the task is defined with a static or dynamic number of
 handles, will either set the \p i th element of the field
 starpu_task::modes or the \p i th element of the field
-starpu_task::dyn_modes (see \ref
-SettingManyDataHandlesForATask)
+starpu_task::dyn_modes
+(see \ref SettingManyDataHandlesForATask)
 
 \fn struct starpu_task *starpu_task_create(void)
 \ingroup API_Codelet_And_Tasks
@@ -802,14 +812,15 @@ created automatically by StarPU.
 
 \fn int starpu_task_wait_for_all(void)
 \ingroup API_Codelet_And_Tasks
-This function blocks until all the tasks that were submitted
-(to the current context or the global one if there aren't any) are
-terminated. It does not destroy these tasks.
+This function blocks until all the tasks that were submitted (to the
+current context or the global one if there aren't any) are terminated.
+It does not destroy these tasks.
 
 \fn int starpu_task_wait_for_all_in_ctx(unsigned sched_ctx_id)
 \ingroup API_Codelet_And_Tasks
-This function waits until all the tasks that were already
-submitted to the context \p sched_ctx_id have been executed
+This function waits until all the tasks
+that were already submitted to the context \p sched_ctx_id have been
+executed.
 
 \fn int starpu_task_wait_for_n_submitted(unsigned n)
 \ingroup API_Codelet_And_Tasks

+ 12 - 12
doc/doxygen/chapters/api/data_interfaces.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
  * Copyright (C) 2011, 2012 INRIA
  * See the file version.doxy for copying conditions.
  */
@@ -245,7 +245,7 @@ by the application internally: this makes it possible to forbid the
 concurrent execution of different tasks accessing the same <c>void</c> data
 in read-write concurrently. 
 
-\fn void starpu_variable_data_register(starpu_data_handle_t *handle, unsigned home_node, uintptr_t ptr, size_t size)
+\fn void starpu_variable_data_register(starpu_data_handle_t *handle, int home_node, uintptr_t ptr, size_t size)
 \ingroup API_Data_Interfaces
 Register the \p size byte element pointed to by \p ptr, which is
 typically a scalar, and initialize \p handle to represent this data item.
@@ -263,7 +263,7 @@ Register into the \p handle that to store data on node \p node it should use the
 buffer located at \p ptr, or device handle \p dev_handle and offset \p offset
 (for OpenCL, notably)
 
-\fn void starpu_vector_data_register(starpu_data_handle_t *handle, unsigned home_node, uintptr_t ptr, uint32_t nx, size_t elemsize)
+\fn void starpu_vector_data_register(starpu_data_handle_t *handle, int home_node, uintptr_t ptr, uint32_t nx, size_t elemsize)
 \ingroup API_Data_Interfaces
 Register the \p nx elemsize-byte elements pointed to by \p ptr and initialize \p handle to represent it.
 
@@ -280,7 +280,7 @@ Register into the \p handle that to store data on node \p node it should use the
 buffer located at \p ptr, or device handle \p dev_handle and offset \p offset
 (for OpenCL, notably)
 
-\fn void starpu_matrix_data_register(starpu_data_handle_t *handle, unsigned home_node, uintptr_t ptr, uint32_t ld, uint32_t nx, uint32_t ny, size_t elemsize)
+\fn void starpu_matrix_data_register(starpu_data_handle_t *handle, int home_node, uintptr_t ptr, uint32_t ld, uint32_t nx, uint32_t ny, size_t elemsize)
 \ingroup API_Data_Interfaces
 Register the \p nx x \p  ny 2D matrix of \p elemsize-byte elements pointed
 by \p ptr and initialize \p handle to represent it. \p ld specifies the number
@@ -301,7 +301,7 @@ Register into the \p handle that to store data on node \p node it should use the
 buffer located at \p ptr, or device handle \p dev_handle and offset \p offset
 (for OpenCL, notably), with \p ld elements between rows.
 
-\fn void starpu_block_data_register(starpu_data_handle_t *handle, unsigned home_node, uintptr_t ptr, uint32_t ldy, uint32_t ldz, uint32_t nx, uint32_t ny, uint32_t nz, size_t elemsize)
+\fn void starpu_block_data_register(starpu_data_handle_t *handle, int home_node, uintptr_t ptr, uint32_t ldy, uint32_t ldz, uint32_t nx, uint32_t ny, uint32_t nz, size_t elemsize)
 \ingroup API_Data_Interfaces
 Register the \p nx x \p ny x \p nz 3D matrix of \p elemsize byte elements
 pointed by \p ptr and initialize \p handle to represent it. Again, \p ldy and
@@ -321,7 +321,7 @@ Register into the \p handle that to store data on node \p node it should use the
 buffer located at \p ptr, or device handle \p dev_handle and offset \p offset
 (for OpenCL, notably), with \p ldy elements between rows and \p ldz elements between z planes.
 
-\fn void starpu_bcsr_data_register(starpu_data_handle_t *handle, unsigned home_node, uint32_t nnz, uint32_t nrow, uintptr_t nzval, uint32_t *colind, uint32_t *rowptr, uint32_t firstentry, uint32_t r, uint32_t c, size_t elemsize)
+\fn void starpu_bcsr_data_register(starpu_data_handle_t *handle, int home_node, uint32_t nnz, uint32_t nrow, uintptr_t nzval, uint32_t *colind, uint32_t *rowptr, uint32_t firstentry, uint32_t r, uint32_t c, size_t elemsize)
 \ingroup API_Data_Interfaces
 This variant of starpu_data_register() uses the BCSR (Blocked
 Compressed Sparse Row Representation) sparse matrix interface.
@@ -333,12 +333,12 @@ blocks), \p colind[i] is the block-column index for block i in \p nzval,
 \p firstentry is the index of the first entry of the given arrays
 (usually 0 or 1). 
 
-\fn void starpu_csr_data_register(starpu_data_handle_t *handle, unsigned home_node, uint32_t nnz, uint32_t nrow, uintptr_t nzval, uint32_t *colind, uint32_t *rowptr, uint32_t firstentry, size_t elemsize)
+\fn void starpu_csr_data_register(starpu_data_handle_t *handle, int home_node, uint32_t nnz, uint32_t nrow, uintptr_t nzval, uint32_t *colind, uint32_t *rowptr, uint32_t firstentry, size_t elemsize)
 \ingroup API_Data_Interfaces
 This variant of starpu_data_register() uses the CSR (Compressed
 Sparse Row Representation) sparse matrix interface. TODO
 
-\fn void starpu_coo_data_register(starpu_data_handle_t *handleptr, unsigned home_node, uint32_t nx, uint32_t ny, uint32_t n_values, uint32_t *columns, uint32_t *rows, uintptr_t values, size_t elemsize);
+\fn void starpu_coo_data_register(starpu_data_handle_t *handleptr, int home_node, uint32_t nx, uint32_t ny, uint32_t n_values, uint32_t *columns, uint32_t *rows, uintptr_t values, size_t elemsize);
 \ingroup API_Data_Interfaces
 Register the \p nx x \p ny 2D matrix given in the COO format, using the
 \p columns, \p rows, \p values arrays, which must have \p n_values elements of
@@ -353,8 +353,8 @@ Return the interface associated with \p handle on \p memory_node.
 
 Each data interface is provided with a set of field access functions.
 The ones using a void * parameter aimed to be used in codelet
-implementations (see for example the code in \ref
-VectorScalingUsingStarPUAPI).
+implementations (see for example the code in
+\ref VectorScalingUsingStarPUAPI).
 
 \fn void *starpu_data_handle_to_pointer(starpu_data_handle_t handle, unsigned node)
 \ingroup API_Data_Interfaces
@@ -1012,8 +1012,8 @@ designated by \p interface.
 @name Defining Interface
 \ingroup API_Data_Interfaces
 
-Applications can provide their own interface as shown in \ref
-DefiningANewDataInterface.
+Applications can provide their own interface as shown in
+\ref DefiningANewDataInterface.
 
 \fn uintptr_t starpu_malloc_on_node_flags(unsigned dst_node, size_t size, int flags)
 \ingroup API_Data_Interfaces

+ 11 - 4
doc/doxygen/chapters/api/data_management.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
  * Copyright (C) 2011, 2012 INRIA
  * See the file version.doxy for copying conditions.
  */
@@ -9,8 +9,8 @@
 /*! \defgroup API_Data_Management Data Management
 
 \brief This section describes the data management facilities provided
-by StarPU. We show how to use existing data interfaces in \ref
-API_Data_Interfaces, but developers can design their own data interfaces if
+by StarPU. We show how to use existing data interfaces in
+\ref API_Data_Interfaces, but developers can design their own data interfaces if
 required.
 
 \typedef starpu_data_handle_t
@@ -69,6 +69,13 @@ reads or non-commutative writes).
 \ingroup API_Data_Management
 used in starpu_mpi_insert_task() to specify the data has to be sent
 using a synchronous and non-blocking mode (see starpu_mpi_issend())
+\var starpu_data_access_mode::STARPU_LOCALITY
+\ingroup API_Data_Management
+used to tell the scheduler which data is the most important for the task, and
+should thus be used to try to group tasks on the same core or cache, etc. For
+now only the ws and lws schedulers take this flag into account, and only when
+rebuilt with the USE_LOCALITY flag defined in the
+src/sched_policies/work_stealing_policy.c source code.
 
 @name Basic Data Management API
 \ingroup API_Data_Management
@@ -101,7 +108,7 @@ data to StarPU, the specified memory node indicates where the piece of
 data initially resides (we also call this memory node the home node of
 a piece of data).
 
-\fn void starpu_data_register(starpu_data_handle_t *handleptr, unsigned home_node, void *data_interface, struct starpu_data_interface_ops *ops)
+\fn void starpu_data_register(starpu_data_handle_t *handleptr, int home_node, void *data_interface, struct starpu_data_interface_ops *ops)
 \ingroup API_Data_Management
 Register a piece of data into the handle located at the
 \p handleptr address. The \p data_interface buffer contains the initial

+ 6 - 0
doc/doxygen/chapters/api/explicit_dependencies.doxy

@@ -32,6 +32,12 @@ This function can only be called if \p task has not completed yet, otherwise
 the results are undefined. The result may also be outdated if some additional
 dependency has been added in the meanwhile.
 
+\fn int starpu_task_get_task_scheduled_succs(struct starpu_task *task, unsigned ndeps, struct starpu_task *task_array[])
+\ingroup API_Explicit_Dependencies
+This behaves like starpu_task_get_task_succs(), except that it only reports
+tasks which will go through the scheduler, thus avoiding tasks with no codelet,
+or with explicit placement.
+
 \typedef starpu_tag_t
 \ingroup API_Explicit_Dependencies
 This type defines a task logical identifier. It is possible to

+ 10 - 0
doc/doxygen/chapters/api/fxt_support.doxy

@@ -77,6 +77,12 @@ starpu_shutdown(). starpu_fxt_stop_profiling() can however be used to
 stop it earlier. starpu_fxt_start_profiling() can then be called to
 start recording it again, etc.
 
+\fn void starpu_fxt_autostart_profiling(int autostart)
+\ingroup API_FxT_Support
+Determines whether profiling should be started by starpu_init(), or only when
+starpu_fxt_start_profiling() is called. \e autostart should be 1 to do so, or 0 to
+prevent it.
+
 \fn void starpu_fxt_write_data_trace(char *filename_in)
 \ingroup API_FxT_Support
 todo
@@ -85,5 +91,9 @@ todo
 \ingroup API_FxT_Support
 Add an event in the execution trace if FxT is enabled.
 
+\fn void starpu_fxt_trace_user_event_string(const char *s)
+\ingroup API_FxT_Support
+Add a string event in the execution trace if FxT is enabled.
+
 */
 

+ 2 - 2
doc/doxygen/chapters/api/implicit_dependencies.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
  * Copyright (C) 2011, 2012 INRIA
  * See the file version.doxy for copying conditions.
  */
@@ -18,7 +18,7 @@ task that access it in write mode, dependencies will be added between
 the two first tasks and the third one. Implicit data dependencies are
 also inserted in the case of data accesses from the application.
 
-\fn starpu_data_set_default_sequential_consistency_flag(unsigned flag)
+\fn void starpu_data_set_default_sequential_consistency_flag(unsigned flag)
 \ingroup API_Implicit_Data_Dependencies
 Set the default sequential consistency flag. If a non-zero
 value is passed, a sequential data consistency will be enforced for

+ 11 - 11
doc/doxygen/chapters/api/initialization.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
  * Copyright (C) 2011, 2012 INRIA
  * See the file version.doxy for copying conditions.
  */
@@ -69,8 +69,8 @@ provided by the hwloc library in case it is available.
 If this flag is set, the CUDA workers will be attached to the CUDA
 devices specified in the starpu_conf::workers_cuda_gpuid array.
 Otherwise, StarPU affects the CUDA devices in a round-robin fashion.
-This can also be specified with the environment variable \ref
-STARPU_WORKERS_CUDAID. (default = 0)
+This can also be specified with the environment variable
+\ref STARPU_WORKERS_CUDAID. (default = 0)
 \var unsigned starpu_conf::workers_cuda_gpuid[STARPU_NMAXWORKERS]
 If the starpu_conf::use_explicit_workers_cuda_gpuid flag is set, this
 array contains the logical identifiers of the CUDA devices (as used by
@@ -79,8 +79,8 @@ cudaGetDevice()).
 If this flag is set, the OpenCL workers will be attached to the OpenCL
 devices specified in the starpu_conf::workers_opencl_gpuid array.
 Otherwise, StarPU affects the OpenCL devices in a round-robin fashion.
-This can also be specified with the environment variable \ref
-STARPU_WORKERS_OPENCLID. (default = 0)
+This can also be specified with the environment variable
+\ref STARPU_WORKERS_OPENCLID. (default = 0)
 \var unsigned starpu_conf::workers_opencl_gpuid[STARPU_NMAXWORKERS]
 If the starpu_conf::use_explicit_workers_opencl_gpuid flag is set,
 this array contains the logical identifiers of the OpenCL devices to
@@ -89,8 +89,8 @@ be used.
 If this flag is set, the MIC workers will be attached to the MIC
 devices specified in the array starpu_conf::workers_mic_deviceid.
 Otherwise, StarPU affects the MIC devices in a round-robin fashion.
-This can also be specified with the environment variable \ref
-STARPU_WORKERS_MICID.
+This can also be specified with the environment variable
+\ref STARPU_WORKERS_MICID.
 (default = 0)
 \var unsigned starpu_conf::workers_mic_deviceid[STARPU_NMAXWORKERS]
 If the flag starpu_conf::use_explicit_workers_mic_deviceid is set, the
@@ -103,8 +103,8 @@ devices specified in the array starpu_conf::workers_scc_deviceid.
 If the flag starpu_conf::use_explicit_workers_scc_deviceid is set, the
 array contains the logical identifiers of the SCC devices to be used.
 Otherwise, StarPU affects the SCC devices in a round-robin fashion.
-This can also be specified with the environment variable \ref
-STARPU_WORKERS_SCCID.
+This can also be specified with the environment variable
+\ref STARPU_WORKERS_SCCID.
 
 \var int starpu_conf::bus_calibrate
 If this flag is set, StarPU will recalibrate the bus.  If this value
@@ -141,8 +141,8 @@ host program location.
 \var int starpu_conf::disable_asynchronous_copy
 This flag should be set to 1 to disable
 asynchronous copies between CPUs and all accelerators. This
-can also be specified with the environment variable \ref
-STARPU_DISABLE_ASYNCHRONOUS_COPY. The
+can also be specified with the environment variable
+\ref STARPU_DISABLE_ASYNCHRONOUS_COPY. The
 AMD implementation of OpenCL is known to fail when copying
 data asynchronously. When using this implementation, it is
 therefore necessary to disable asynchronous data transfers.

+ 21 - 5
doc/doxygen/chapters/api/insert_task.doxy

@@ -1,14 +1,14 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
  * Copyright (C) 2011, 2012 INRIA
  * See the file version.doxy for copying conditions.
  */
 
 /*! \defgroup API_Insert_Task Insert_Task
 
-\fn starpu_insert_task(struct starpu_codelet *cl, ...)
+\fn int starpu_insert_task(struct starpu_codelet *cl, ...)
 \ingroup API_Insert_Task
 This function does the same as the function starpu_task_insert(). It has been kept to avoid breaking old codes.
 
@@ -31,8 +31,8 @@ specifying the worker on which to execute the task (as specified by
 starpu_task::execute_on_a_specific_worker)
 <li> the specific values ::STARPU_VALUE, ::STARPU_CALLBACK,
 ::STARPU_CALLBACK_ARG, ::STARPU_CALLBACK_WITH_ARG, ::STARPU_PRIORITY,
-::STARPU_TAG, ::STARPU_TAG_ONLY, ::STARPU_FLOPS, ::STARPU_SCHED_CTX followed by the
-appropriated objects as defined elsewhere.
+::STARPU_TAG, ::STARPU_TAG_ONLY, ::STARPU_FLOPS, ::STARPU_SCHED_CTX, ::STARPU_CL_ARGS
+followed by the appropriated objects as defined elsewhere.
 </ul>
 
 When using ::STARPU_DATA_ARRAY, the access mode of the data handles is
@@ -51,6 +51,14 @@ this macro is used when calling starpu_task_insert(), and must
 be followed by a pointer to a constant value and the size of the
 constant
 
+\def STARPU_CL_ARGS
+\ingroup API_Insert_Task
+this macro is used when calling starpu_task_insert(), and must
+be followed by a memory buffer containing the arguments to be given to
+the task, and by the size of the arguments. The memory buffer should
+be the result of a previous call to starpu_codelet_pack_args(), and will be
+freed (i.e. starpu_task::cl_arg_free will be set to 1)
+
 \def STARPU_CALLBACK
 \ingroup API_Insert_Task
 this macro is used when calling starpu_task_insert(), and must
@@ -131,7 +139,15 @@ starpu_codelet_unpack_args().
 \fn void starpu_codelet_unpack_args(void *cl_arg, ...)
 \ingroup API_Insert_Task
 Retrieve the arguments of type ::STARPU_VALUE associated to a
-task automatically created using the function starpu_task_insert().
+task automatically created using the function starpu_task_insert(). If
+some parameter is NULL, unpacking will stop there and ignore the remaining
+parameters.
+
+\fn void starpu_codelet_unpack_args_and_copyleft(void *cl_arg, void *buffer, size_t buffer_size, ...)
+\ingroup API_Insert_Task
+Similar to starpu_codelet_unpack_args(), but if some parameter is
+NULL, copy the part of \p cl_arg that has not been read into \p buffer,
+which can then be used in a later call to one of the unpack functions.
 
 \fn struct starpu_task *starpu_task_build(struct starpu_codelet *cl, ...)
 \ingroup API_Insert_Task

+ 9 - 7
doc/doxygen/chapters/api/modularized_scheduler.doxy

@@ -2,7 +2,7 @@
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2013        Simon Archipoff
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2014, 2015        CNRS
+ * Copyright (C) 2014, 2015, 2016        CNRS
  * Copyright (C) 2013, 2014  INRIA
  * See the file version.doxy for copying conditions.
  */
@@ -150,7 +150,7 @@ The actual scheduler
 \ingroup API_Modularized_Scheduler
 	 compatibility with starpu_sched_policy interface
 
-\fn struct starpu_task *starpu_sched_tree_pop_task()
+\fn struct starpu_task *starpu_sched_tree_pop_task(unsigned sched_ctx)
 \ingroup API_Modularized_Scheduler
 	 compatibility with starpu_sched_policy interface
 
@@ -169,7 +169,7 @@ The actual scheduler
 @name Generic Scheduling Component API
 \ingroup API_Modularized_Scheduler
 
-\fn struct starpu_sched_component *starpu_sched_component_create(struct starpu_sched_tree *tree)
+\fn struct starpu_sched_component *starpu_sched_component_create(struct starpu_sched_tree *tree, const char *name)
 \ingroup API_Modularized_Scheduler
 	 allocate and initialize component field with defaults values :
 	.pop_task make recursive call on father
@@ -441,10 +441,12 @@ todo
 \ingroup API_Modularized_Scheduler
 	 this function build a scheduler for \p sched_ctx_id according to \p s and the hwloc topology of the machine.
 
-\fn int starpu_sched_component_push_task(struct starpu_sched_component *component, struct starpu_task *task);
-	Push a task to a component. This is a helper for <c>component->push_task(component, task)</c> plus tracing.
+\fn int starpu_sched_component_push_task(struct starpu_sched_component *from, struct starpu_sched_component *to, struct starpu_task *task)
+\ingroup API_Modularized_Scheduler
+Push a task to a component. This is a helper for <c>component->push_task(component, task)</c> plus tracing.
 
-\fn struct starpu_task *starpu_sched_component_pull_task(struct starpu_sched_component *component);
-	Pull a task from a component. This is a helper for <c>component->pull_task(component)</c> plus tracing.
+\fn struct starpu_task *starpu_sched_component_pull_task(struct starpu_sched_component *from, struct starpu_sched_component *to)
+\ingroup API_Modularized_Scheduler
+Pull a task from a component. This is a helper for <c>component->pull_task(component)</c> plus tracing.
 
 */

+ 40 - 4
doc/doxygen/chapters/api/mpi.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
  * Copyright (C) 2011, 2012 INRIA
  * See the file version.doxy for copying conditions.
  */
@@ -23,6 +23,7 @@ Initializes the starpumpi library with the given communicator.
 \p initialize_mpi indicates if MPI should be initialized or not by StarPU.
 If the value is not 0, MPI will be initialized by calling
 <c>MPI_Init_Thread(argc, argv, MPI_THREAD_SERIALIZED, ...)</c>.
+starpu_init() must be called before starpu_mpi_init_comm().
 
 \fn int starpu_mpi_init(int *argc, char ***argv, int initialize_mpi)
 \ingroup API_MPI_Support
@@ -56,6 +57,22 @@ the array \p comm_amounts which must have a size greater or equal to
 the world size. Communications statistics must be enabled (see
 \ref STARPU_COMM_STATS).
 
+\fn int starpu_mpi_comm_size(MPI_Comm comm, int *size)
+\ingroup API_MPI_Support
+Return in \p size the size of the communicator \p comm
+
+\fn int starpu_mpi_comm_rank(MPI_Comm comm, int *rank)
+\ingroup API_MPI_Support
+Return in \p rank the rank of the calling process in the communicator \p comm
+
+\fn int starpu_mpi_world_rank(void)
+\ingroup API_MPI_Support
+Return the rank of the calling process in the communicator MPI_COMM_WORLD
+
+\fn int starpu_mpi_world_size(void)
+\ingroup API_MPI_Support
+Return the size of the communicator MPI_COMM_WORLD
+
 @name Communication
 \anchor MPIPtpCommunication
 \ingroup API_MPI_Support
@@ -156,6 +173,10 @@ operation.
 Blocks the caller until all group members of the communicator \p comm
 have called it.
 
+\fn int starpu_mpi_wait_for_all(MPI_Comm comm)
+\ingroup API_MPI_Support
+Wait until all StarPU tasks and communications for the given communicator are completed.
+
 \fn int starpu_mpi_isend_detached_unlock_tag(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm, starpu_tag_t tag)
 \ingroup API_MPI_Support
 Posts a standard-mode, non blocking send of \p data_handle to the node
@@ -270,7 +291,7 @@ Symbol kept for backward compatibility. Calling function starpu_mpi_data_set_ran
 \ingroup API_MPI_Support
 Return the rank of the given data.
 
-\def starpu_data_get_rank(handle)
+\def starpu_data_get_rank(starpu_data_handle_t handle)
 \ingroup API_MPI_Support
 Return the rank of the given data.
 Symbol kept for backward compatibility. Calling function starpu_mpi_data_get_rank
@@ -279,11 +300,20 @@ Symbol kept for backward compatibility. Calling function starpu_mpi_data_get_ran
 \ingroup API_MPI_Support
 Return the tag of the given data.
 
-\def starpu_data_get_tag(handle)
+\def starpu_data_get_tag(starpu_data_handle_t handle)
 \ingroup API_MPI_Support
 Return the tag of the given data.
 Symbol kept for backward compatibility. Calling function starpu_mpi_data_get_tag
 
+\fn void starpu_mpi_data_migrate(MPI_Comm comm, starpu_data_handle_t handle, int new_rank)
+\ingroup API_MPI_Support
+Migrate the data onto the \p new_rank MPI node. This means both transferring
+the data to node \p new_rank if it hasn't been transferred already, and setting
+the home node of the data to the new node. Further data transfers triggered by
+starpu_mpi_task_insert() will be done from that new node. This function thus
+needs to be called on all nodes which have registered the data. This also
+flushes the cache for this data to avoid incoherencies.
+
 \def STARPU_EXECUTE_ON_NODE
 \ingroup API_MPI_Support
 this macro is used when calling starpu_mpi_task_insert(), and must be
@@ -296,6 +326,12 @@ this macro is used when calling starpu_mpi_task_insert(), and must be
 followed by a data handle to specify that the node owning the given
 data will execute the codelet.
 
+\def STARPU_NODE_SELECTION_POLICY
+\ingroup API_MPI_Support
+this macro is used when calling starpu_mpi_task_insert(), and must be
+followed by an identifier to a node selection policy. This is needed when several
+nodes own data in ::STARPU_W mode.
+
 \fn int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 \ingroup API_MPI_Support
 This function does the same as the function starpu_mpi_task_insert(). It has been kept to avoid breaking old codes.
@@ -321,7 +357,7 @@ The internal algorithm is as follows:
         Find out which MPI node is going to execute the codelet.
         <ul>
             <li>If there is only one node owning data in ::STARPU_W mode, it will be selected;
-            <li>If there is several nodes owning data in ::STARPU_W node, a node will be selected according to a given node selection policy (see ::STARPU_NODE_SELECTION_POLICY or starpu_mpi_node_selection_set_current_policy())
+            <li>If there are several nodes owning data in ::STARPU_W mode, a node will be selected according to a given node selection policy (see ::STARPU_NODE_SELECTION_POLICY or starpu_mpi_node_selection_set_current_policy())
             <li>The argument ::STARPU_EXECUTE_ON_NODE followed by an integer can be used to specify the node;
             <li>The argument ::STARPU_EXECUTE_ON_DATA followed by a data handle can be used to specify that the node owing the given data will execute the codelet.
         </ul>

+ 1 - 1
doc/doxygen/chapters/api/multiformat_data_interface.doxy

@@ -43,7 +43,7 @@ todo
 \var uint32_t starpu_multiformat_interface::nx
 \var struct starpu_multiformat_data_interface_ops *starpu_multiformat_interface::ops
 
-\fn void starpu_multiformat_data_register(starpu_data_handle_t *handle, unsigned home_node, void *ptr, uint32_t nobjects, struct starpu_multiformat_data_interface_ops *format_ops)
+\fn void starpu_multiformat_data_register(starpu_data_handle_t *handle, int home_node, void *ptr, uint32_t nobjects, struct starpu_multiformat_data_interface_ops *format_ops)
 \ingroup API_Multiformat_Data_Interface
 Register a piece of data that can be represented in different
 ways, depending upon the processing unit that manipulates it. It

+ 6 - 2
doc/doxygen/chapters/api/opencl_extensions.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
  * Copyright (C) 2011, 2012 INRIA
  * See the file version.doxy for copying conditions.
  */
@@ -117,6 +117,10 @@ has been located on the system, \p located_dir_name the directory
 where it has been located. Otherwise, they are both set to the empty
 string.
 
+\fn void starpu_opencl_load_program_source_malloc(const char *source_file_name, char **located_file_name, char **located_dir_name, char **opencl_program_source)
+\ingroup API_OpenCL_Extensions
+Similar to function starpu_opencl_load_program_source() but it allocates the buffers located_file_name, located_dir_name and opencl_program_source.
+
 \fn int starpu_opencl_compile_opencl_from_file(const char *source_file_name, const char *build_options)
 \ingroup API_OpenCL_Extensions
 Compile the OpenCL kernel stored in the file \p source_file_name
@@ -163,7 +167,7 @@ This function allows to collect statistics on a kernel execution.
 After termination of the kernels, the OpenCL codelet should call this
 function to pass it the event returned by clEnqueueNDRangeKernel, to
 let StarPU collect statistics about the kernel execution (used cycles,
-consumed power).
+consumed energy).
 
 @name OpenCL utilities
 \ingroup API_OpenCL_Extensions

+ 5 - 0
doc/doxygen/chapters/api/openmp_runtime_support.doxy

@@ -802,6 +802,11 @@ This function returns the team number of the calling thread.
 \ingroup API_OpenMP_Runtime_Support
 This function checks whether the current device is the initial device or not.
 
 \return <c>!0</c> if called from the host device.
 \return <c>0</c> otherwise.
+
+\fn int starpu_omp_get_max_task_priority(void)
+\ingroup API_OpenMP_Runtime_Support
+The omp_get_max_task_priority routine returns the maximum value that can be
+specified in the priority clause.
 

+ 33 - 41
doc/doxygen/chapters/api/performance_model.doxy

@@ -1,52 +1,13 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
  * Copyright (C) 2011, 2012 INRIA
  * See the file version.doxy for copying conditions.
  */
 
 /*! \defgroup API_Performance_Model Performance Model
 
-\enum starpu_perfmodel_archtype
-\ingroup API_Performance_Model
-Enumerates the various types of architectures.
-
-it is possible that we have multiple versions of the same kind of
-workers, for instance multiple GPUs or even different CPUs within
-the same machine so we do not use the archtype enum type directly
-for performance models.
-
-<ul>
-<li> CPU types range within ::STARPU_CPU_DEFAULT (1 CPU),
-::STARPU_CPU_DEFAULT+1 (2 CPUs), ... ::STARPU_CPU_DEFAULT +
-STARPU_MAXCPUS - 1 (STARPU_MAXCPUS CPUs).
-</li>
-<li> CUDA types range within ::STARPU_CUDA_DEFAULT (GPU number 0),
-::STARPU_CUDA_DEFAULT + 1 (GPU number 1), ..., ::STARPU_CUDA_DEFAULT +
-STARPU_MAXCUDADEVS - 1 (GPU number STARPU_MAXCUDADEVS - 1).
-</li>
-<li> OpenCL types range within ::STARPU_OPENCL_DEFAULT (GPU number
-0), ::STARPU_OPENCL_DEFAULT + 1 (GPU number 1), ...,
-::STARPU_OPENCL_DEFAULT + STARPU_MAXOPENCLDEVS - 1 (GPU number
-STARPU_MAXOPENCLDEVS - 1).
-</ul>
-\var starpu_perfmodel_archtype::STARPU_CPU_DEFAULT
-\ingroup API_Performance_Model
-CPU combined workers between 0 and STARPU_MAXCPUS-1
-\var starpu_perfmodel_archtype::STARPU_CUDA_DEFAULT
-\ingroup API_Performance_Model
-CUDA workers
-\var starpu_perfmodel_archtype::STARPU_OPENCL_DEFAULT
-\ingroup API_Performance_Model
-OpenCL workers
-\var starpu_perfmodel_archtype::STARPU_MIC_DEFAULT
-\ingroup API_Performance_Model
-MIC workers
-\var starpu_perfmodel_archtype::STARPU_SCC_DEFAULT
-\ingroup API_Performance_Model
-SCC workers
-
 \enum starpu_perfmodel_type
 \ingroup API_Performance_Model
 TODO
@@ -66,6 +27,24 @@ Automatic linear regression-based cost model  (alpha * size ^ beta)
 \ingroup API_Performance_Model
 Automatic non-linear regression-based cost model (a * size ^ b + c)
 
+\struct starpu_perfmodel_device
+todo
+\ingroup API_Performance_Model
+\var enum starpu_worker_archtype starpu_perfmodel_device::type
+is the type of the device
+\var int starpu_perfmodel_device::devid
+is the identifier of the precise device
+\var int starpu_perfmodel_device::ncore
+is the number of executions in parallel, minus 1
+
+\struct starpu_perfmodel_arch
+todo
+\ingroup API_Performance_Model
+\var int starpu_perfmodel_arch::ndevices
+is the number of the devices for the given arch
+\var struct starpu_perfmodel_device *starpu_perfmodel_arch::devices
+is the list of the devices for the given arch
+
 \struct starpu_perfmodel
 Contains all information about a performance model. At least the
 type and symbol fields have to be filled when defining a performance
@@ -81,7 +60,9 @@ is the type of performance model
 ::STARPU_NL_REGRESSION_BASED: No other fields needs to be provided,
 this is purely history-based.
 </li>
-<li> ::STARPU_PER_ARCH: field starpu_perfmodel::per_arch has to be
+<li> ::STARPU_PER_ARCH: either field starpu_perfmodel::arch_cost_function has to be
+filled with a function that returns the cost in micro-seconds on the arch given
+as parameter, or field starpu_perfmodel::per_arch has to be
 filled with functions which return the cost in micro-seconds.
 </li>
 <li> ::STARPU_COMMON: field starpu_perfmodel::cost_function has to be
@@ -97,6 +78,9 @@ be ignored.
 \var double (*starpu_perfmodel::cost_function)(struct starpu_task *, unsigned nimpl)
 Used by ::STARPU_COMMON: takes a task and implementation number, and
 must return a task duration estimation in micro-seconds.
+\var double (*starpu_perfmodel::arch_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch* arch, unsigned nimpl)
+Used by ::STARPU_COMMON: takes a task, an arch and implementation number, and
+must return a task duration estimation in micro-seconds on that arch.
 \var size_t (*starpu_perfmodel::size_base)(struct starpu_task *, unsigned nimpl)
 Used by ::STARPU_HISTORY_BASED, ::STARPU_REGRESSION_BASED and
 ::STARPU_NL_REGRESSION_BASED. If not NULL, takes a task and
@@ -262,6 +246,10 @@ todo
 \ingroup API_Performance_Model
 todo
 
+\fn int starpu_perfmodel_print_estimations(struct starpu_perfmodel *model, uint32_t footprint, FILE *output)
+\ingroup API_Performance_Model
+todo
+
 \fn void starpu_bus_print_bandwidth(FILE *f)
 \ingroup API_Performance_Model
 prints a matrix of bus bandwidths on \p f.
@@ -270,6 +258,10 @@ prints a matrix of bus bandwidths on \p f.
 \ingroup API_Performance_Model
 prints the affinity devices on \p f.
 
+\fn void starpu_bus_print_filenames(FILE *f)
+\ingroup API_Performance_Model
+prints on \p f the name of the files containing the matrix of bus bandwidths, the affinity devices and the latency.
+
 \fn void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned cpuid, unsigned nimpl, double measured);
 \ingroup API_Performance_Model
 This feeds the performance model model with an explicit

+ 4 - 4
doc/doxygen/chapters/api/profiling.doxy

@@ -61,8 +61,8 @@ Number of cycles used by the task, only available in the MoviSim
 \var uint64_t starpu_profiling_task_info::stall_cycles
 Number of cycles stalled within the task, only available in the MoviSim
 
-\var double starpu_profiling_task_info::power_consumed
-Power consumed by the task, only available in the MoviSim
+\var double starpu_profiling_task_info::energy_consumed
+Energy consumed by the task, only available in the MoviSim
 
 \struct starpu_profiling_worker_info
 This structure contains the profiling information associated to
@@ -83,8 +83,8 @@ starpu_profiling_worker_get_info()
         Number of cycles used by the worker, only available in the MoviSim
 \var uint64_t starpu_profiling_worker_info::stall_cycles
         Number of cycles stalled within the worker, only available in the MoviSim
-\var double starpu_profiling_worker_info::power_consumed
-        Power consumed by the worker, only available in the MoviSim
+\var double starpu_profiling_worker_info::energy_consumed
+        Energy consumed by the worker, only available in the MoviSim
 
 \struct starpu_profiling_bus_info
 todo

+ 5 - 1
doc/doxygen/chapters/api/sc_hypervisor/sc_hypervisor.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2016  CNRS
  * Copyright (C) 2011, 2012, 2013 INRIA
  * See the file version.doxy for copying conditions.
  */
@@ -149,6 +149,10 @@ The quantity of data(in bytes) needed by the task to execute
 \var sc_hypervisor_policy_task_pool::next
 Other task kinds
 
+\def STARPU_HYPERVISOR_TAG
+\ingroup API_SC_Hypervisor
+todo
+
 \fn void sc_hypervisor_post_resize_request(unsigned sched_ctx, int task_tag)
 \ingroup API_SC_Hypervisor
 Requires resizing the context \p sched_ctx whenever a task tagged with the id \p task_tag

+ 3 - 3
doc/doxygen/chapters/api/sc_hypervisor/sc_hypervisor_usage.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
  * Copyright (C) 2011, 2012, 2013 INRIA
  * See the file version.doxy for copying conditions.
  */
@@ -152,8 +152,8 @@ workers are not allowed to be moved from the context.
 This macro is used when calling sc_hypervisor_ctl() and must be
 followed by 1 argument (int) that indicated the minimum number of
 tasks that have to be executed before the context could be resized.
-This parameter is ignored for the Application Driven strategy (see \ref 
-ResizingStrategies) where the user indicates exactly when the resize
+This parameter is ignored for the Application Driven strategy (see
+\ref ResizingStrategies) where the user indicates exactly when the resize
 should be done.
 
 \def SC_HYPERVISOR_NEW_WORKERS_MAX_IDLE

+ 12 - 2
doc/doxygen/chapters/api/scheduling_contexts.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
  * Copyright (C) 2011, 2012 INRIA
  * See the file version.doxy for copying conditions.
  */
@@ -37,6 +37,11 @@ hypervisor how the application and the resources are executing.
 @name Scheduling Contexts Basic API
 \ingroup API_Scheduling_Contexts
 
+\def STARPU_NMAX_SCHED_CTXS
+\ingroup API_Scheduling_Policy
+Define the maximum number of scheduling contexts managed by StarPU. The default value can be
+modified at configure by using the option \ref enable-max-sched-ctxs "--enable-max-sched-ctxs".
+
 \fn unsigned starpu_sched_ctx_create(int *workerids_ctx, int nworkers_ctx, const char *sched_ctx_name, ...)
 \ingroup API_Scheduling_Contexts
 This function creates a scheduling context with the given parameters
@@ -88,6 +93,11 @@ minimum scheduler priority value.
 This macro is used when calling starpu_sched_ctx_create() to specify a
 maximum scheduler priority value.
 
+\def STARPU_SCHED_CTX_POLICY_INIT
+\ingroup API_Scheduling_Contexts
+This macro is used when calling starpu_sched_ctx_create() to specify a
+function pointer allowing to initialize the scheduling policy.
+
 \fn unsigned starpu_sched_ctx_create_inside_interval(const char *policy_name, const char *sched_ctx_name, int min_ncpus, int max_ncpus, int min_ngpus, int max_ngpus, unsigned allow_overlap)
 \ingroup API_Scheduling_Contexts
 Create a context indicating an approximate interval of resources
@@ -168,7 +178,7 @@ Return 1 if the worker belongs to the context and 0 otherwise
 \ingroup API_Scheduling_Contexts
 Return the workerid if the worker belongs to the context and -1 otherwise.
 If the thread calling this function is not a worker the function returns -1
-as it calls the function \ref starpu_worker_get_id()
+as it calls the function starpu_worker_get_id().
 
 \fn unsigned starpu_sched_ctx_overlapping_ctxs_on_worker(int workerid)
 \ingroup API_Scheduling_Contexts

+ 40 - 7
doc/doxygen/chapters/api/scheduling_policy.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
  * Copyright (C) 2011, 2012 INRIA
  * See the file version.doxy for copying conditions.
  */
@@ -13,18 +13,45 @@
 implement custom policies to address specific problems. The API
 described below allows users to write their own scheduling policy.
 
+\def STARPU_MAXIMPLEMENTATIONS
+\ingroup API_Scheduling_Policy
+Define the maximum number of implementations per architecture. The default value can be modified at
+configure by using the option \ref enable-maximplementations "--enable-maximplementations".
+
 \struct starpu_sched_policy
 \ingroup API_Scheduling_Policy
 This structure contains all the methods that implement a
 scheduling policy. An application may specify which scheduling
 strategy in the field starpu_conf::sched_policy passed to the function
 starpu_init().
+
+For each task going through the scheduler, the following methods get called in the given order:
+
+<ul>
+<li>starpu_sched_policy::submit_hook when the task is submitted</li>
+<li>starpu_sched_policy::push_task when the task becomes ready. The scheduler is here <b>given</b> the task</li>
+<li>starpu_sched_policy::pop_task when a worker is idle. The scheduler here <b>gives</b> back the task to the core</li>
+<li>starpu_sched_policy::pre_exec_hook right before the worker actually starts the task computation (after transferring any missing data).</li>
+<li>starpu_sched_policy::post_exec_hook right after the worker actually completed the task computation.</li>
+</ul>
+
+For each task not going through the scheduler (because starpu_task::execute_on_a_specific_worker was set), these get called:
+
+<ul>
+<li>starpu_sched_policy::submit_hook when the task is submitted</li>
+<li>starpu_sched_policy::push_task_notify when the task becomes ready. This is just a notification, the scheduler does not have to do anything about the task.</li>
+<li>starpu_sched_policy::pre_exec_hook right before the worker actually starts the task computation (after transferring any missing data).</li>
+<li>starpu_sched_policy::post_exec_hook right after the worker actually completed the task computation.</li>
+</ul>
+
+
 \var void (*starpu_sched_policy::init_sched)(unsigned sched_ctx_id)
-        Initialize the scheduling policy.
+        Initialize the scheduling policy, called before any other method.
 \var void (*starpu_sched_policy::deinit_sched)(unsigned sched_ctx_id)
-        Cleanup the scheduling policy.
+        Cleanup the scheduling policy, called after all other methods.
 \var int (*starpu_sched_policy::push_task)(struct starpu_task *)
-        Insert a task into the scheduler.
+        Insert a task into the scheduler, called when the task becomes ready for
+        execution.
 \var void (*starpu_sched_policy::push_task_notify)(struct starpu_task *, int workerid, int perf_workerid, unsigned sched_ctx_id)
         Notify the scheduler that a task was pushed on a given worker.
 	This method is called when a task that was explicitly
@@ -44,11 +71,17 @@ starpu_init().
 	chained by the means of the field starpu_task::prev and
 	starpu_task::next). The mutex associated to the worker is
 	already taken when this method is called. This is currently
-	not used.
+	not used and can be discarded.
+\var void (*starpu_sched_policy::submit_hook)(struct starpu_task *)
+        Optional field. This method is called when a task is submitted.
 \var void (*starpu_sched_policy::pre_exec_hook)(struct starpu_task *)
         Optional field. This method is called every time a task is starting.
 \var void (*starpu_sched_policy::post_exec_hook)(struct starpu_task *)
         Optional field. This method is called every time a task has been executed.
+\var void (*starpu_sched_policy::do_schedule)(unsigned sched_ctx_id)
+        Optional field. This method is called when it is a good time to start
+        scheduling tasks. This is notably called when the application calls
+        starpu_task_wait_for_all() or starpu_do_schedule() explicitly.
 \var void (*starpu_sched_policy::add_workers)(unsigned sched_ctx_id, int *workerids, unsigned nworkers)
         Initialize scheduling structures corresponding to each worker used by the policy.
 \var void (*starpu_sched_policy::remove_workers)(unsigned sched_ctx_id, int *workerids, unsigned nworkers)
@@ -171,9 +204,9 @@ Returns expected data transfer time in micro-seconds.
 \ingroup API_Scheduling_Policy
 Predict the transfer time (in micro-seconds) to move \p handle to a memory node
 
-\fn double starpu_task_expected_power(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl)
+\fn double starpu_task_expected_energy(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl)
 \ingroup API_Scheduling_Policy
-Returns expected power consumption in J
+Returns expected energy consumption in J
 
 \fn double starpu_task_expected_conversion_time(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl)
 \ingroup API_Scheduling_Policy

+ 30 - 10
doc/doxygen/chapters/api/standard_memory_library.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2015  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016  CNRS
  * Copyright (C) 2011, 2012 INRIA
  * See the file version.doxy for copying conditions.
  */
@@ -45,6 +45,15 @@ according to the STARPU_MINIMUM_AVAILABLE_MEM and STARPU_TARGET_AVAILABLE_MEM
 environment variables. If STARPU_MEMORY_WAIT is set, no overflowing will happen,
 starpu_malloc_flags() will wait for other eviction mechanisms to release enough memory.
 
+\def STARPU_MALLOC_SIMULATION_FOLDED
+\ingroup API_Standard_Memory_Library
+Value passed to the function starpu_malloc_flags() to indicate that when
+StarPU is using simgrid, the allocation can be "folded", i.e. a memory area is
+allocated, but its content is actually a replicate of the same memory area, to
+avoid having to actually allocate that much memory. This thus allows to have a
+memory area that does not actually consume memory, which one can read from
+and write to normally, but from which reads will return bogus values.
+
 \fn int starpu_malloc_flags(void **A, size_t dim, int flags)
 \ingroup API_Standard_Memory_Library
 Performs a memory allocation based on the constraints defined
@@ -59,7 +68,8 @@ constraints.
 
 \fn int starpu_malloc(void **A, size_t dim)
 \ingroup API_Standard_Memory_Library
-This function allocates data of the given size in main memory.
+This function allocates data of the given size \p dim in main memory, and
+returns the pointer to the allocated data through \p A.
 It will also try to pin it in CUDA or OpenCL, so that data transfers
 from this buffer can be asynchronous, and thus permit data transfer
 and computation overlapping. The allocated buffer must be freed thanks
@@ -89,20 +99,30 @@ starpu_memory_pin(). Returns 0 on success, -1 on error.
 
 \fn ssize_t starpu_memory_get_total(unsigned node)
 \ingroup API_Standard_Memory_Library
-If a memory limit is defined on the given node (see Section \ref
-HowToLimitMemoryPerNode), return the amount of total memory
+If a memory limit is defined on the given node (see Section
+\ref HowToLimitMemoryPerNode), return the amount of total memory
 on the node. Otherwise return -1.
 
+\fn ssize_t starpu_memory_get_total_all_nodes()
+\ingroup API_Standard_Memory_Library
+Return the amount of total memory on all memory nodes for which a memory limit
+is defined (see Section \ref HowToLimitMemoryPerNode).
+
 \fn ssize_t starpu_memory_get_available(unsigned node)
 \ingroup API_Standard_Memory_Library
-If a memory limit is defined on the given node (see Section \ref
-HowToLimitMemoryPerNode), return the amount of available memory
+If a memory limit is defined on the given node (see Section
+\ref HowToLimitMemoryPerNode), return the amount of available memory
 on the node. Otherwise return -1.
 
+\fn ssize_t starpu_memory_get_available_all_nodes()
+\ingroup API_Standard_Memory_Library
+Return the amount of available memory on all memory nodes for which a memory limit
+is defined (see Section \ref HowToLimitMemoryPerNode).
+
 \fn int starpu_memory_allocate(unsigned node, size_t size, int flags)
 \ingroup API_Standard_Memory_Library
-If a memory limit is defined on the given node (see Section \ref
-HowToLimitMemoryPerNode), try to allocate some of it. This does not actually
+If a memory limit is defined on the given node (see Section
+\ref HowToLimitMemoryPerNode), try to allocate some of it. This does not actually
 allocate memory, but only accounts for it. This can be useful when the
 application allocates data another way, but want StarPU to be aware of the
 allocation size e.g. for memory reclaiming.
@@ -112,8 +132,8 @@ STARPU_MEMORY_OVERFLOW to change this.
 
 \fn void starpu_memory_deallocate(unsigned node, size_t size)
 \ingroup API_Standard_Memory_Library
-If a memory limit is defined on the given node (see Section \ref
-HowToLimitMemoryPerNode), free some of it. This does not actually free memory,
+If a memory limit is defined on the given node (see Section
+\ref HowToLimitMemoryPerNode), free some of it. This does not actually free memory,
 but only accounts for it, like starpu_memory_allocate(). The amount does not
 have to be exactly the same as what was passed to starpu_memory_allocate(),
 only the eventual amount needs to be the same, i.e. one call to

+ 2 - 2
doc/doxygen/chapters/api/task_bundles.doxy

@@ -48,9 +48,9 @@ it when possible.
 \ingroup API_Task_Bundles
 Return the expected duration of \p bundle in micro-seconds.
 
-\fn double starpu_task_bundle_expected_power(starpu_task_bundle_t bundle, struct starpu_perfmodel_arch *arch, unsigned nimpl)
+\fn double starpu_task_bundle_expected_energy(starpu_task_bundle_t bundle, struct starpu_perfmodel_arch *arch, unsigned nimpl)
 \ingroup API_Task_Bundles
-Return the expected power consumption of \p bundle in J.
+Return the expected energy consumption of \p bundle in J.
 
 \fn double starpu_task_bundle_expected_data_transfer_time(starpu_task_bundle_t bundle, unsigned memory_node)
 \ingroup API_Task_Bundles

+ 5 - 1
doc/doxygen/chapters/api/task_lists.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
  * Copyright (C) 2011, 2012 INRIA
  * See the file version.doxy for copying conditions.
  */
@@ -64,5 +64,9 @@ Get the end of \p list.
 \ingroup API_Task_Lists
 Get the next task of \p list. This is not erase-safe.
 
+\fn int starpu_task_list_ismember(struct starpu_task_list *list, struct starpu_task *look)
+\ingroup API_Task_Lists
+Test whether the given task \p look is contained in \p list.
+
 */
 

+ 27 - 21
doc/doxygen/chapters/api/threads.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
  * Copyright (C) 2011, 2012 INRIA
  * See the file version.doxy for copying conditions.
  */
@@ -193,8 +193,8 @@ function returns immediately. If the mutex is already locked by
 another thread, the function suspends the calling thread until the
 mutex is unlocked.
 
-This function also produces trace when the configure option \ref
-enable-fxt-lock "--enable-fxt-lock" is enabled.
+This function also produces trace when the configure option
+\ref enable-fxt-lock "--enable-fxt-lock" is enabled.
 
 \fn int starpu_pthread_mutex_unlock(starpu_pthread_mutex_t *mutex)
 \ingroup API_Threads
@@ -202,8 +202,8 @@ This function unlocks the given mutex. The mutex is assumed to be
 locked and owned by the calling thread on entrance to
 starpu_pthread_mutex_unlock().
 
-This function also produces trace when the configure option \ref
-enable-fxt-lock "--enable-fxt-lock" is enabled.
+This function also produces trace when the configure option
+\ref enable-fxt-lock "--enable-fxt-lock" is enabled.
 
 \fn int starpu_pthread_mutex_trylock(starpu_pthread_mutex_t *mutex)
 \ingroup API_Threads
@@ -213,8 +213,8 @@ already locked by another thread (or by the calling thread in the case
 of a ``fast''  mutex). Instead, the function returns immediately with
 the error code EBUSY.
 
-This function also produces trace when the configure option \ref
-enable-fxt-lock "--enable-fxt-lock" is enabled.
+This function also produces trace when the configure option
+\ref enable-fxt-lock "--enable-fxt-lock" is enabled.
 
 \typedef STARPU_PTHREAD_MUTEX_INITIALIZER
 \ingroup API_Threads
@@ -253,7 +253,7 @@ the key.
 This function changes the value associated with \p key in the calling
 thread, storing the given \p pointer instead.
 
-\fn  *starpu_pthread_getspecific(starpu_pthread_key_t key)
+\fn void *starpu_pthread_getspecific(starpu_pthread_key_t key)
 \ingroup API_Threads
 This function returns the value associated with \p key on success, and
 NULL on error.
@@ -262,25 +262,25 @@ NULL on error.
 \ingroup API_Threads
 This macro initializes the condition variable given in parameter.
 
-\fn starpu_pthread_cond_init(starpu_pthread_cond_t *cond, starpu_pthread_condattr_t *cond_attr)
+\fn int starpu_pthread_cond_init(starpu_pthread_cond_t *cond, starpu_pthread_condattr_t *cond_attr)
 \ingroup API_Threads
 This function initializes the condition variable \p cond, using the
 condition attributes specified in \p cond_attr, or default attributes
 if \p cond_attr is NULL.
 
-\fn starpu_pthread_cond_signal(starpu_pthread_cond_t *cond)
+\fn int starpu_pthread_cond_signal(starpu_pthread_cond_t *cond)
 \ingroup API_Threads
 This function restarts one of the threads that are waiting on the
 condition variable \p cond. If no threads are waiting on \p cond,
 nothing happens. If several threads are waiting on \p cond, exactly
 one is restarted, but it is not specified which.
 
-\fn starpu_pthread_cond_broadcast(starpu_pthread_cond_t *cond)
+\fn int starpu_pthread_cond_broadcast(starpu_pthread_cond_t *cond)
 \ingroup API_Threads
 This function restarts all the threads that are waiting on the
 condition variable \p cond. Nothing happens if no threads are waiting on cond.
 
-\fn starpu_pthread_cond_wait(starpu_pthread_cond_t *cond, starpu_pthread_mutex_t *mutex)
+\fn int starpu_pthread_cond_wait(starpu_pthread_cond_t *cond, starpu_pthread_mutex_t *mutex)
 \ingroup API_Threads
 This function atomically unlocks the mutex (as per
 starpu_pthread_mutex_unlock()) and waits for the condition variable \p cond
@@ -290,30 +290,30 @@ be locked by the calling thread on entrance to
 starpu_pthread_cond_wait(). Before returning to the calling thread, the
 function re-acquires mutex (as per starpu_pthread_mutex_lock()).
 
-This function also produces trace when the configure option \ref
-enable-fxt-lock "--enable-fxt-lock" is enabled.
+This function also produces trace when the configure option
+\ref enable-fxt-lock "--enable-fxt-lock" is enabled.
 
-\fn starpu_pthread_cond_timedwait(starpu_pthread_cond_t *cond, starpu_pthread_mutex_t *mutex, const struct timespec *abstime)
+\fn int starpu_pthread_cond_timedwait(starpu_pthread_cond_t *cond, starpu_pthread_mutex_t *mutex, const struct timespec *abstime)
 \ingroup API_Threads
 This function atomically unlocks \p mutex and waits on \p cond, as
 starpu_pthread_cond_wait() does, but it also bounds the duration of
 the wait.
 
-\fn starpu_pthread_cond_destroy(starpu_pthread_cond_t *cond)
+\fn int starpu_pthread_cond_destroy(starpu_pthread_cond_t *cond)
 \ingroup API_Threads
 This function destroys a condition variable, freeing the resources it
 might hold. No threads must be waiting on the condition variable on
 entrance to the function.
 
-\fn starpu_pthread_rwlock_init(starpu_pthread_rwlock_t *rwlock, const starpu_pthread_rwlockattr_t *attr)
+\fn int starpu_pthread_rwlock_init(starpu_pthread_rwlock_t *rwlock, const starpu_pthread_rwlockattr_t *attr)
 \ingroup API_Threads
 This function is the same as starpu_pthread_mutex_init().
 
-\fn starpu_pthread_rwlock_destroy(starpu_pthread_rwlock_t *rwlock)
+\fn int starpu_pthread_rwlock_destroy(starpu_pthread_rwlock_t *rwlock)
 \ingroup API_Threads
 This function is the same as starpu_pthread_mutex_destroy().
 
-\fn starpu_pthread_rwlock_rdlock(starpu_pthread_rwlock_t *rwlock)
+\fn int starpu_pthread_rwlock_rdlock(starpu_pthread_rwlock_t *rwlock)
 \ingroup API_Threads
 This function is the same as starpu_pthread_mutex_lock().
 
@@ -321,7 +321,7 @@ This function is the same as starpu_pthread_mutex_lock().
 \ingroup API_Threads
 todo
 
-\fn starpu_pthread_rwlock_wrlock(starpu_pthread_rwlock_t *rwlock)
+\fn int starpu_pthread_rwlock_wrlock(starpu_pthread_rwlock_t *rwlock)
 \ingroup API_Threads
 This function is the same as starpu_pthread_mutex_lock().
 
@@ -329,7 +329,7 @@ This function is the same as starpu_pthread_mutex_lock().
 \ingroup API_Threads
 todo
 
-\fn starpu_pthread_rwlock_unlock(starpu_pthread_rwlock_t *rwlock)
+\fn int starpu_pthread_rwlock_unlock(starpu_pthread_rwlock_t *rwlock)
 \ingroup API_Threads
 This function is the same as starpu_pthread_mutex_unlock().
 
@@ -365,5 +365,11 @@ todo
 \ingroup API_Threads
 todo
 
+\fn void starpu_sleep(float nb_sec)
+\ingroup API_Threads
+This is the same as calling Unix's sleep function, except that it takes a float
+to allow sub-second sleeping, and when StarPU is compiled in simgrid mode it
+does not really sleep but just makes simgrid record that the thread has taken
+some time to sleep.
 
 */

+ 3 - 3
doc/doxygen/chapters/api/tree.doxy

@@ -1,6 +1,6 @@
 /*
  * This file is part of the StarPU Handbook.
- * Copyright (C) 2014  CNRS
+ * Copyright (C) 2014, 2016  CNRS
  * See the file version.doxy for copying conditions.
  */
 
@@ -23,7 +23,7 @@ todo
 \var int starpu_tree::is_pu
 todo
 
-\fn void starpu_tree_reset_visited(struct starpu_tree *tree, int *visited)
+\fn void starpu_tree_reset_visited(struct starpu_tree *tree, char *visited)
 \ingroup API_Tree
 todo
 
@@ -35,7 +35,7 @@ todo
 \ingroup API_Tree
 todo
 
-\fn struct starpu_tree *starpu_tree_get_neighbour(struct starpu_tree *tree, struct starpu_tree *node, int *visited, int *present)
+\fn struct starpu_tree *starpu_tree_get_neighbour(struct starpu_tree *tree, struct starpu_tree *node, char *visited, char *present)
 \ingroup API_Tree
 todo
 

+ 17 - 1
doc/doxygen/chapters/api/workers.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
  * Copyright (C) 2011, 2012 INRIA
  * See the file version.doxy for copying conditions.
  */
@@ -12,6 +12,17 @@
 \ingroup API_Workers_Properties
 Define the maximum number of workers managed by StarPU.
 
+\def STARPU_MAXCPUS
+\ingroup API_Workers_Properties
+Define the maximum number of CPU workers managed by StarPU. The default value can be modified at
+configure by using the option \ref enable-maxcpus "--enable-maxcpus".
+
+\def STARPU_MAXNODES
+\ingroup API_Workers_Properties
+Define the maximum number of memory nodes managed by StarPU. The default value can be modified at
+configure by using the option \ref enable-maxnodes "--enable-maxnodes". Reducing it
+makes it possible to considerably reduce the memory used by StarPU data structures.
+
 \enum starpu_node_kind
 \ingroup API_Workers_Properties
 TODO
@@ -154,6 +165,11 @@ the one associated to the calling thread. The returned value is either
 from the application outside a task or a callback), or an integer
 between 0 and starpu_worker_get_count() - 1.
 
+\fn unsigned starpu_worker_get_id_check(void)
+\ingroup API_Workers_Properties
+This is the same as starpu_worker_get_id(), but aborts when called from outside a
+worker (i.e. when starpu_worker_get_id() would return -1).
+
 \fn int starpu_worker_get_ids_by_type(enum starpu_worker_archtype type, int *workerids, int maxsize)
 \ingroup API_Workers_Properties
 This function gets the list of identifiers of workers with the

+ 35 - 0
doc/doxygen/chapters/code/nf_initexit.f90

@@ -0,0 +1,35 @@
+! StarPU --- Runtime system for heterogeneous multicore architectures.
+!
+! Copyright (C) 2016  Inria
+!
+! StarPU is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at
+! your option) any later version.
+!
+! StarPU is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+!
+! See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+! [To be included. You should update doxygen if you see this text.]
+program nf_initexit
+        use iso_c_binding       ! C interfacing module
+        use fstarpu_mod         ! StarPU interfacing module
+        implicit none           ! Fortran recommended best practice
+
+        integer(c_int) :: err   ! return status for fstarpu_init
+
+        ! initialize StarPU with default settings
+        err = fstarpu_init(C_NULL_PTR)
+        if (err /= 0) then
+                stop 1          ! StarPU initialization failure
+        end if
+
+        ! - add StarPU Native Fortran API calls here
+
+        ! shut StarPU down
+        call fstarpu_shutdown()
+end program nf_initexit
+! [To be included. You should update doxygen if you see this text.]

+ 2 - 3
doc/doxygen/chapters/code/scal_pragma.cu

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2013  CNRS
- * Copyright (C) 2010-2013  Université de Bordeaux
+ * Copyright (C) 2010-2013, 2016  Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -31,8 +31,7 @@ vector_mult_cuda (unsigned n, float *val, float factor)
 }
 
 /* Definition of the task implementation declared in the C file. */
-extern "C" void
-vector_scal_cuda (size_t size, float vector[], float factor)
+extern "C" void vector_scal_cuda (size_t size, float vector[], float factor)
 {
   unsigned threads_per_block = 64;
   unsigned nblocks = (size + threads_per_block - 1) / threads_per_block;

+ 24 - 9
doc/doxygen/dev/starpu_check_documented.py

@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/python3
 
 import os
 import sys
@@ -7,17 +7,27 @@ class bcolors:
     FAILURE = '\033[91m'
     NORMAL = '\033[0m'
 
-def loadFunctionsAndDatatypes(flist, dtlist, fname):
-    f = open(fname, 'r')
+def list_files(directory):
+    return list(map(lambda a : directory+a, list(filter(lambda a:a.count(".h") and not a.count("starpu_deprecated_api.h"),os.listdir(directory)))))
+
+def loadFunctionsAndDatatypes(flist, dtlist, file_name):
+    f = open(file_name, 'r')
     for line in f:
         mline = line[:-1]
         if mline.count("\\fn"):
             if mline.count("fft") == 0:
                 func = mline.replace("\\fn ", "")
-                flist.append(list([func, fname]))
+                l = func.split("(")[0].split()
+                func_name = l[len(l)-1].replace("*", "")
+                flist.append(list([func, func_name, file_name]))
         if mline.count("\\struct ") or mline.count("\\def ") or mline.count("\\typedef ") or mline.count("\\enum "):
             datatype = mline.replace("\\struct ", "").replace("\\def ", "").replace("\\typedef ", "").replace("\\enum ","")
-            dtlist.append(list([datatype, fname]))
+            l = datatype.split("(")
+            if len(l) > 1:
+                datatype_name = l[0]
+            else:
+                datatype_name = datatype
+            dtlist.append(list([datatype, datatype_name, file_name]))
     f.close()
 
 functions = []
@@ -30,14 +40,19 @@ for docfile in os.listdir(docfile_dir):
     if docfile.count(".doxy"):
         loadFunctionsAndDatatypes(functions, datatypes, docfile_dir+docfile)
 
-incfiles=dirname+"/../../../include/*.h " + dirname + "/../../../mpi/include/*.h " + dirname + "/../../../starpufft/include/*.h " + dirname + "/../../../sc_hypervisor/include/*.h " + dirname + "/../../../include/starpu_config.h.in"
+list_incfiles = [dirname + "/../../../include/starpu_config.h.in"]
+for d in [dirname+"/../../../include/", dirname + "/../../../mpi/include/", dirname + "/../../../starpufft/include/", dirname + "/../../../sc_hypervisor/include/"]:
+    list_incfiles.extend(list_files(d))
+incfiles=" ".join(list_incfiles)
+
 for function in functions:
     x = os.system("sed 's/ *STARPU_ATTRIBUTE_UNUSED *//g' " + incfiles + "| sed 's/ STARPU_WARN_UNUSED_RESULT//g' | fgrep \"" + function[0] + "\" > /dev/null")
     if x != 0:
-        print "Function <" + bcolors.FAILURE + function[0] + bcolors.NORMAL + "> documented in <" + function[1] + "> does not exist in StarPU's API"
+        print("Function <" + bcolors.FAILURE + function[0] + bcolors.NORMAL + "> documented in <" + function[2] + "> does not exist in StarPU's API")
+        os.system("grep " + function[1] + " " + dirname+"/../../../include/starpu_deprecated_api.h")
 
 for datatype in datatypes:
     x = os.system("fgrep -l \"" + datatype[0] + "\" " + incfiles + " > /dev/null")
     if x != 0:
-        print "Datatype <" + bcolors.FAILURE + datatype[0] + bcolors.NORMAL + "> documented in <" + datatype[1] + "> does not exist in StarPU's API"
-
+        print("Datatype <" + bcolors.FAILURE + datatype[0] + bcolors.NORMAL + "> documented in <" + datatype[2] + "> does not exist in StarPU's API")
+        os.system("grep " + datatype[1] + " " + dirname+"/../../../include/starpu_deprecated_api.h")

+ 68 - 0
doc/doxygen/dev/starpu_check_refs.sh

@@ -0,0 +1,68 @@
+#!/bin/bash
+#
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2016 CNRS
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+stcolor=$(tput sgr0)
+redcolor=$(tput setaf 1)
+greencolor=$(tput setaf 2)
+
+dirname=$(dirname $0)
+
+STARPU_H_FILES=$(find $dirname/../../../include $dirname/../../../mpi/include -name '*.h')
+SC_H_FILES=$(find $dirname/../../../sc_hypervisor/include -name '*.h')
+SRC="$dirname/../../../src $dirname/../../../mpi/src $dirname/../../../sc_hypervisor/src"
+
+#grep --exclude-dir=.svn --exclude-dir=.git --binary-files=without-match -rsF "\ref" $dirname/../chapters|grep -v "\\ref [a-zA-Z]"
+#echo continue && read
+
+GREP="grep --exclude-dir=.svn --exclude-dir=.git --binary-files=without-match -rsF"
+
+REFS=$($GREP "\ref" $dirname/../chapters| tr ':' '\012' | tr '.' '\012'  | tr ',' '\012'  | tr '(' '\012' | tr ')' '\012' | tr ' ' '\012'|grep -F '\ref' -A1 | grep -v '^--$' | sed 's/\\ref/=\\ref/' | tr '\012' ':' | tr '=' '\012' | sort | uniq)
+find $dirname/../chapters -name "*doxy" -exec cat {} \; > /tmp/DOXYGEN_$$
+cat $dirname/../refman.tex >> /tmp/DOXYGEN_$$
+
+for r in $REFS
+do
+    ref=$(echo $r | sed 's/\\ref:\(.*\):/\1/')
+    n=$($GREP -crs "section $ref" /tmp/DOXYGEN_$$)
+    if test $n -eq 0
+    then
+	n=$($GREP -crs "anchor $ref" /tmp/DOXYGEN_$$)
+	if test $n -eq 0
+	then
+	    n=$($GREP -crs "ingroup $ref" /tmp/DOXYGEN_$$)
+	    if test $n -eq 0
+	    then
+		n=$($GREP -crs "def $ref" /tmp/DOXYGEN_$$)
+		if test $n -eq 0
+		then
+		    n=$($GREP -crs "struct $ref" /tmp/DOXYGEN_$$)
+		    if test $n -eq 0
+		    then
+			if test $n -eq 0
+			then
+			    n=$($GREP -crs "label{$ref" /tmp/DOXYGEN_$$)
+			    if test $n -eq 0
+			    then
+				echo $ref missing
+			    fi
+			fi
+		    fi
+		fi
+	    fi
+	fi
+    fi
+done

+ 2 - 2
doc/doxygen/dev/starpu_check_undocumented.sh

@@ -4,7 +4,7 @@
 
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2011, 2012, 2013, 2014 CNRS
+# Copyright (C) 2011, 2012, 2013, 2014, 2016 CNRS
 # Copyright (C) 2011 INRIA
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -94,7 +94,7 @@ fi
 if [ "$1" == "--var" ] || [ "$1" == "" ] ; then
     variables=$(grep --exclude-dir=.svn -rs -E "(getenv|get_env)" $SRC| tr ' ' '\012'|grep -E "(getenv|get_env)" | grep "\"" | sed 's/.*("//' | sed 's/").*//'|tr -d '",'|sort|uniq)
     for variable in $variables ; do
-	x=$(grep "$variable" $dirname/../chapters/40environment_variables.doxy | grep "\\anchor")
+	x=$(grep "$variable" $dirname/../chapters/501_environment_variables.doxy | grep "\\anchor")
 	if test "$x" == "" ; then
 	    echo "variable ${redcolor}${variable}${stcolor} is not (or incorrectly) documented"
 	fi

+ 7 - 2
doc/doxygen/doxygen-config.cfg.in

@@ -1,7 +1,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 # Copyright (C) 2009-2013  Université de Bordeaux
-# Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
+# Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
 # Copyright (C) 2011  Télécom-SudParis
 # Copyright (C) 2011, 2012  INRIA
 #
@@ -22,6 +22,7 @@ INPUT                  = @top_srcdir@/doc/doxygen/chapters \
                          @top_builddir@/doc/doxygen/starpu_config.h \
 			 @top_srcdir@/include/starpu_bitmap.h \
 	 		 @top_srcdir@/include/starpu_bound.h \
+	 		 @top_srcdir@/include/starpu_clusters_util.h \
 			 @top_srcdir@/include/starpu_cublas.h \
 			 @top_srcdir@/include/starpu_cuda.h \
 			 @top_srcdir@/include/starpu_data_filters.h \
@@ -35,15 +36,18 @@ INPUT                  = @top_srcdir@/doc/doxygen/chapters \
 			 @top_srcdir@/include/starpu.h \
 			 @top_srcdir@/include/starpu_hash.h \
 			 @top_srcdir@/include/starpu_mic.h \
+			 @top_srcdir@/include/starpu_mod.f90 \
 			 @top_srcdir@/include/starpu_opencl.h \
 			 @top_srcdir@/include/starpu_openmp.h \
 			 @top_srcdir@/include/starpu_perfmodel.h \
 			 @top_srcdir@/include/starpu_profiling.h \
 			 @top_srcdir@/include/starpu_rand.h \
 			 @top_srcdir@/include/starpu_scc.h \
+			 @top_srcdir@/include/starpu_sched_component.h \
 			 @top_srcdir@/include/starpu_sched_ctx.h \
 			 @top_srcdir@/include/starpu_sched_ctx_hypervisor.h \
 			 @top_srcdir@/include/starpu_scheduler.h \
+			 @top_srcdir@/include/starpu_simgrid_wrap.h \
 			 @top_srcdir@/include/starpu_sink.h \
 			 @top_srcdir@/include/starpu_stdlib.h \
 			 @top_srcdir@/include/starpu_task_bundle.h \
@@ -56,8 +60,9 @@ INPUT                  = @top_srcdir@/doc/doxygen/chapters \
 			 @top_srcdir@/include/starpu_tree.h \
 			 @top_srcdir@/include/starpu_util.h \
 			 @top_srcdir@/include/starpu_worker.h \
-			 @top_srcdir@/include/starpu_sched_component.h \
+			 @top_srcdir@/include/fstarpu_mod.f90 \
 			 @top_srcdir@/mpi/include/ \
+			 @top_srcdir@/mpi/include/fstarpu_mpi_mod.f90 \
 			 @top_srcdir@/starpufft/include/starpufft.h \
 			 @top_srcdir@/sc_hypervisor/include
 

+ 13 - 7
doc/doxygen/refman.tex

@@ -20,9 +20,9 @@ was last updated on \STARPUUPDATED.\\
 
 Copyright © 2009–2013 Université de Bordeaux\\
 
-Copyright © 2010-2015 CNRS
+Copyright © 2010-2016 CNRS
 
-Copyright © 2011, 2012 INRIA
+Copyright © 2011, 2012, 2016 INRIA
 
 \medskip
 
@@ -97,11 +97,6 @@ Documentation License”.
 \hypertarget{SchedulingContextHypervisor}{}
 \input{SchedulingContextHypervisor}
 
-\chapter{Clustering A Machine}
-\label{ClusteringAMachine}
-\hypertarget{ClusteringAMachine}{}
-\input{ClusteringAMachine}
-
 \chapter{Modularized Scheduler}
 \label{ModularizedScheduler}
 \hypertarget{ModularizedScheduler}{}
@@ -154,6 +149,11 @@ Documentation License”.
 \hypertarget{cExtensions}{}
 \input{cExtensions}
 
+\chapter{Native Fortran Support}
+\label{NativeFortranSupport}
+\hypertarget{NativeFortranSupport}{}
+\input{NativeFortranSupport}
+
 \chapter{SOCL OpenCL Extensions}
 \label{SOCLOpenclExtensions}
 \hypertarget{SOCLOpenclExtensions}{}
@@ -169,6 +169,11 @@ Documentation License”.
 \hypertarget{OpenMPRuntimeSupport}{}
 \input{OpenMPRuntimeSupport}
 
+\chapter{Clustering a Machine}
+\label{ClusteringAMachine}
+\hypertarget{ClusteringAMachine}{}
+\input{ClusteringAMachine}
+
 \part{StarPU Reference API}
 
 \chapter{Execution Configuration Through Environment Variables}
@@ -239,6 +244,7 @@ Documentation License”.
 \input{starpu_8h}
 \input{starpu__bitmap_8h}
 \input{starpu__bound_8h}
+\input{starpu__clusters__util_8h}
 \input{starpu__config_8h}
 \input{starpu__cublas_8h}
 \input{starpu__cuda_8h}

+ 2 - 3
doc/tutorial/vector_scal_plugin_cuda.cu

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2012 INRIA
  * Copyright (C) 2010, 2011, 2013  CNRS
- * Copyright (C) 2010  Université de Bordeaux
+ * Copyright (C) 2010, 2016  Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -31,8 +31,7 @@ vector_mult_cuda (unsigned int n, float *val, float factor)
     val[i] *= factor;
 }
 
-extern "C" void
-vector_scal_cuda (unsigned int size, float vector[], float factor)
+extern "C" void vector_scal_cuda (unsigned int size, float vector[], float factor)
 {
   unsigned threads_per_block = 64;
   unsigned nblocks = (size + threads_per_block - 1) / threads_per_block;

+ 150 - 25
examples/Makefile.am

@@ -1,10 +1,10 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2009-2015  Université de Bordeaux
+# Copyright (C) 2009-2016  Université de Bordeaux
 # Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015  CNRS
 # Copyright (C) 2011  Télécom-SudParis
 # Copyright (C) 2011-2012  INRIA
-# Copyright (C) 2015  Inria
+# Copyright (C) 2015-2016  Inria
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -17,9 +17,18 @@
 #
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
+include $(top_srcdir)/starpu.mk
+
+if STARPU_SIMGRID
+STARPU_PERF_MODEL_DIR=$(abs_top_srcdir)/tools/perfmodels/sampling
+STARPU_HOSTNAME=mirage
+export STARPU_PERF_MODEL_DIR
+export STARPU_HOSTNAME
+endif
+
 AM_CFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(GLOBAL_AM_CFLAGS) -Wno-unused
 AM_CXXFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(GLOBAL_AM_CXXFLAGS) -Wno-unused
-LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ $(MAGMA_LIBS) $(HWLOC_LIBS) @LIBS@
+LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ $(MAGMA_LIBS) $(HWLOC_LIBS) @LIBS@ $(FXT_LIBS)
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include
 AM_LDFLAGS = @STARPU_EXPORT_DYNAMIC@ $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)
 
@@ -63,16 +72,21 @@ EXTRA_DIST = 					\
 	reductions/dot_product.h	\
 	reductions/dot_product_opencl_kernels.cl	\
 	scheduler/schedulers.sh				\
-	scheduler/schedulers_context.sh
+	scheduler/schedulers_context.sh			\
+	fortran/Makefile
 
 CLEANFILES = *.gcno *.gcda *.linkinfo *.mod starpu_idle_microsec.log
 
 if STARPU_USE_CUDA
 
+if STARPU_COVERITY
+include $(top_srcdir)/starpu-mynvcc.mk
+else
 NVCCFLAGS += --compiler-options -fno-strict-aliasing  -I$(top_srcdir)/include/ -I$(top_builddir)/include/ -I$(top_srcdir)/examples/  $(HWLOC_CFLAGS)
 
 .cu.o:
 	$(NVCC) $< -c -o $@ $(NVCCFLAGS)
+endif
 
 endif
 
@@ -170,7 +184,14 @@ endif
 # Applications which should only be compiled are added directly in examplebin_PROGRAMS
 # see for instance mandelbrot/mandelbrot
 
-STARPU_EXAMPLES =				\
+STARPU_EXAMPLES =
+
+STARPU_EXAMPLES +=				\
+	sched_ctx/prio				\
+	worker_collections/worker_list_example
+
+if !STARPU_SIMGRID
+STARPU_EXAMPLES +=				\
 	basic_examples/hello_world		\
 	basic_examples/vector_scal		\
 	basic_examples/mult			\
@@ -202,40 +223,54 @@ STARPU_EXAMPLES =				\
 	scheduler/dummy_sched			\
 	scheduler/heteroprio_test		\
 	sched_ctx/sched_ctx			\
-	sched_ctx/prio				\
+	sched_ctx/two_cpu_contexts		\
 	sched_ctx/dummy_sched_with_ctx		\
 	worker_collections/worker_tree_example  \
-	worker_collections/worker_list_example  \
 	reductions/dot_product			\
 	reductions/minmax_reduction
+endif
 
 if !STARPU_SIMGRID
 STARPU_EXAMPLES +=				\
 	scheduler/dummy_sched
 
+if STARPU_HAVE_F77
 if STARPU_HAVE_F77_H
 STARPU_EXAMPLES +=				\
-	basic_examples/vector_scal_fortran	\
 	fortran/hello
 endif
 
+STARPU_EXAMPLES +=				\
+	basic_examples/vector_scal_fortran
+endif
+
 if STARPU_HAVE_FC
 if !STARPU_SANITIZE
 STARPU_EXAMPLES +=				\
-	fortran90/f90_example
+	fortran90/f90_example			\
+	native_fortran/nf_vector		\
+	native_fortran/nf_matrix		\
+	native_fortran/nf_example		\
+	native_fortran/nf_dynbuf		\
+	native_fortran/nf_varbuf		\
+	native_fortran/nf_sched_ctx		\
+	native_fortran/nf_partition
 endif
 endif
 endif
 
 if !NO_BLAS_LIB
 STARPU_EXAMPLES +=				\
-	axpy/axpy				\
 	mult/sgemm 				\
 	mult/dgemm				\
 	cholesky/cholesky_tag			\
 	cholesky/cholesky_tile_tag		\
+	cholesky/cholesky_implicit
+
+if !STARPU_SIMGRID
+STARPU_EXAMPLES +=				\
+	axpy/axpy				\
 	cholesky/cholesky_grain_tag		\
-	cholesky/cholesky_implicit		\
 	lu/lu_example_float			\
 	lu/lu_example_double			\
 	lu/lu_implicit_example_float		\
@@ -244,6 +279,9 @@ STARPU_EXAMPLES +=				\
 	cg/cg					\
 	pipeline/pipeline
 endif
+endif
+
+if !STARPU_SIMGRID
 
 if MKL_BLAS_LIB
 STARPU_EXAMPLES +=				\
@@ -259,11 +297,14 @@ STARPU_EXAMPLES +=				\
 endif
 
 if !STARPU_SIMGRID
+if STARPU_HAVE_F77
 if STARPU_HAVE_F77_H
 STARPU_EXAMPLES +=				\
-	basic_examples/vector_scal_fortran	\
 	fortran/hello
 endif
+STARPU_EXAMPLES +=				\
+	basic_examples/vector_scal_fortran
+endif
 endif
 
 if STARPU_HAVE_OPENMP
@@ -286,6 +327,8 @@ endif
 endif
 endif
 
+endif !STARPU_SIMGRID
+
 ##################
 # Basic examples #
 ##################
@@ -314,7 +357,7 @@ nobase_STARPU_OPENCL_DATA_DATA += \
 	basic_examples/vector_scal_opencl_kernel.cl
 endif
 
-if STARPU_HAVE_F77_H
+if STARPU_HAVE_F77
 basic_examples_vector_scal_fortran_SOURCES =	\
 	basic_examples/vector_scal_fortran.F	\
 	basic_examples/vector_scal_c.c		\
@@ -327,11 +370,13 @@ basic_examples_vector_scal_fortran_LDADD =	\
 	$(STARPU_CUDA_FORTRAN_LDFLAGS)
 endif
 
+if STARPU_HAVE_F77_H
 fortran_hello_SOURCES	=		\
 	fortran/hello_c.c		\
 	fortran/hello.F			\
 	fortran/StarPU_fortran.h
 endif
+endif
 
 if STARPU_HAVE_FC
 fortran90_f90_example_SOURCES =	\
@@ -341,6 +386,42 @@ fortran90_f90_example_SOURCES =	\
 	fortran90/mod_compute.f90	\
 	fortran90/marshalling.c		\
 	fortran90/f90_example.f90
+
+native_fortran_nf_vector_SOURCES =	\
+	native_fortran/nf_codelets.f90		\
+	$(top_srcdir)/include/fstarpu_mod.f90	\
+	native_fortran/nf_vector.f90
+
+native_fortran_nf_matrix_SOURCES =	\
+	native_fortran/nf_codelets.f90		\
+	$(top_srcdir)/include/fstarpu_mod.f90	\
+	native_fortran/nf_matrix.f90
+
+native_fortran_nf_example_SOURCES =	\
+	native_fortran/nf_types.f90		\
+	native_fortran/nf_compute.f90		\
+	$(top_srcdir)/include/fstarpu_mod.f90	\
+	native_fortran/nf_example.f90
+
+native_fortran_nf_dynbuf_SOURCES =	\
+	native_fortran/nf_dynbuf_cl.f90		\
+	$(top_srcdir)/include/fstarpu_mod.f90	\
+	native_fortran/nf_dynbuf.f90
+
+native_fortran_nf_varbuf_SOURCES =	\
+	native_fortran/nf_varbuf_cl.f90		\
+	$(top_srcdir)/include/fstarpu_mod.f90	\
+	native_fortran/nf_varbuf.f90
+
+native_fortran_nf_sched_ctx_SOURCES =	\
+	native_fortran/nf_sched_ctx_cl.f90		\
+	$(top_srcdir)/include/fstarpu_mod.f90	\
+	native_fortran/nf_sched_ctx.f90
+
+native_fortran_nf_partition_SOURCES =	\
+	native_fortran/nf_partition_cl.f90		\
+	$(top_srcdir)/include/fstarpu_mod.f90	\
+	native_fortran/nf_partition.f90
 endif
 
 #######################
@@ -947,18 +1028,6 @@ sched_ctx_parallel_tasks_reuse_handle_CFLAGS = \
 
 endif
 
-showcheck:
-	-cat $(TEST_LOGS) /dev/null
-	! grep -q "ERROR: AddressSanitizer: " $(TEST_LOGS) /dev/null
-	! grep -q "WARNING: AddressSanitizer: " $(TEST_LOGS) /dev/null
-	! grep -q "ERROR: ThreadSanitizer: " $(TEST_LOGS) /dev/null
-	! grep -q "WARNING: ThreadSanitizer: " $(TEST_LOGS) /dev/null
-	RET=0 ; \
-	for i in $(SUBDIRS) ; do \
-		make -C $$i showcheck || RET=1 ; \
-	done ; \
-	exit $$RET
-
 if STARPU_HAVE_FC
 # Fortran90 example
 # - list explicit dependences to control proper module files generation
@@ -977,4 +1046,60 @@ mod_compute.o: $(top_srcdir)/examples/fortran90/mod_compute.f90 mod_types.mod mo
 
 f90_example.o: $(top_srcdir)/examples/fortran90/f90_example.f90 $(top_srcdir)/examples/fortran90/marshalling.c mod_types.mod mod_interface.mod mod_compute.mod starpu_mod.mod
 	$(AM_V_FC)$(FC) $(fortran90_f90_example_FCFLAGS) $(FCFLAGS) -c -o $@ `test -f 'fortran90/f90_example.f90' || echo '$(srcdir)/'`fortran90/f90_example.f90
+
+# Native Fortran example
+# - list explicit dependences to control proper module files generation
+# - the overriding rule fully disables the corresponing default rule, thus
+#   the default rule body must be copied entirely
+nf_types.mod: nf_types.o
+nf_compute.mod: nf_compute.o
+fstarpu_mod.mod: fstarpu_mod.o
+nf_codelets.mod: nf_codelets.o
+nf_dynbuf_cl.mod: nf_dynbuf_cl.o
+nf_varbuf_cl.mod: nf_varbuf_cl.o
+nf_sched_ctx_cl.mod: nf_sched_ctx_cl.o
+nf_partition_cl.mod: nf_partition_cl.o
+
+fstarpu_mod.o: $(top_srcdir)/include/fstarpu_mod.f90
+	$(AM_V_FC)$(FC) $(native_fortran_nf_vector_FCFLAGS) $(FCFLAGS) -c -o $@ '$(top_srcdir)/'include/fstarpu_mod.f90
+
+nf_codelets.o: $(top_srcdir)/examples/native_fortran/nf_codelets.f90 fstarpu_mod.mod
+	$(AM_V_FC)$(FC) $(native_fortran_nf_vector_FCFLAGS) $(FCFLAGS) -c -o $@ `test -f 'native_fortran/nf_codelets.f90' || echo '$(srcdir)/'`native_fortran/nf_codelets.f90
+
+nf_vector.o: $(top_srcdir)/examples/native_fortran/nf_vector.f90 nf_codelets.mod fstarpu_mod.mod
+	$(AM_V_FC)$(FC) $(native_fortran_nf_vector_FCFLAGS) $(FCFLAGS) -c -o $@ `test -f 'native_fortran/nf_vector.f90' || echo '$(srcdir)/'`native_fortran/nf_vector.f90
+
+nf_matrix.o: $(top_srcdir)/examples/native_fortran/nf_matrix.f90 nf_codelets.mod fstarpu_mod.mod
+	$(AM_V_FC)$(FC) $(native_fortran_nf_matrix_FCFLAGS) $(FCFLAGS) -c -o $@ `test -f 'native_fortran/nf_matrix.f90' || echo '$(srcdir)/'`native_fortran/nf_matrix.f90
+
+nf_compute.o: $(top_srcdir)/examples/native_fortran/nf_compute.f90 nf_types.mod fstarpu_mod.mod
+	$(AM_V_FC)$(FC) $(native_fortran_nf_example_FCFLAGS) $(FCFLAGS) -c -o $@ `test -f 'native_fortran/nf_compute.f90' || echo '$(srcdir)/'`native_fortran/nf_compute.f90
+
+nf_example.o: $(top_srcdir)/examples/native_fortran/nf_example.f90 nf_types.mod nf_compute.mod fstarpu_mod.mod
+	$(AM_V_FC)$(FC) $(native_fortran_nf_example_FCFLAGS) $(FCFLAGS) -c -o $@ `test -f 'native_fortran/nf_example.f90' || echo '$(srcdir)/'`native_fortran/nf_example.f90
+
+nf_dynbuf_cl.o: $(top_srcdir)/examples/native_fortran/nf_dynbuf_cl.f90 nf_types.mod fstarpu_mod.mod
+	$(AM_V_FC)$(FC) $(native_fortran_nf_dynbuf_FCFLAGS) $(FCFLAGS) -c -o $@ `test -f 'native_fortran/nf_dynbuf_cl.f90' || echo '$(srcdir)/'`native_fortran/nf_dynbuf_cl.f90
+
+nf_dynbuf.o: $(top_srcdir)/examples/native_fortran/nf_dynbuf.f90 nf_types.mod nf_dynbuf_cl.mod fstarpu_mod.mod
+	$(AM_V_FC)$(FC) $(native_fortran_nf_dynbuf_FCFLAGS) $(FCFLAGS) -c -o $@ `test -f 'native_fortran/nf_dynbuf.f90' || echo '$(srcdir)/'`native_fortran/nf_dynbuf.f90
+
+nf_varbuf_cl.o: $(top_srcdir)/examples/native_fortran/nf_varbuf_cl.f90 nf_types.mod fstarpu_mod.mod
+	$(AM_V_FC)$(FC) $(native_fortran_nf_varbuf_FCFLAGS) $(FCFLAGS) -c -o $@ `test -f 'native_fortran/nf_varbuf_cl.f90' || echo '$(srcdir)/'`native_fortran/nf_varbuf_cl.f90
+
+nf_varbuf.o: $(top_srcdir)/examples/native_fortran/nf_varbuf.f90 nf_types.mod nf_varbuf_cl.mod fstarpu_mod.mod
+	$(AM_V_FC)$(FC) $(native_fortran_nf_varbuf_FCFLAGS) $(FCFLAGS) -c -o $@ `test -f 'native_fortran/nf_varbuf.f90' || echo '$(srcdir)/'`native_fortran/nf_varbuf.f90
+
+nf_sched_ctx_cl.o: $(top_srcdir)/examples/native_fortran/nf_sched_ctx_cl.f90 nf_types.mod fstarpu_mod.mod
+	$(AM_V_FC)$(FC) $(native_fortran_nf_sched_ctx_FCFLAGS) $(FCFLAGS) -c -o $@ `test -f 'native_fortran/nf_sched_ctx_cl.f90' || echo '$(srcdir)/'`native_fortran/nf_sched_ctx_cl.f90
+
+nf_sched_ctx.o: $(top_srcdir)/examples/native_fortran/nf_sched_ctx.f90 nf_types.mod nf_sched_ctx_cl.mod fstarpu_mod.mod
+	$(AM_V_FC)$(FC) $(native_fortran_nf_sched_ctx_FCFLAGS) $(FCFLAGS) -c -o $@ `test -f 'native_fortran/nf_sched_ctx.f90' || echo '$(srcdir)/'`native_fortran/nf_sched_ctx.f90
+
+nf_partition_cl.o: $(top_srcdir)/examples/native_fortran/nf_partition_cl.f90 nf_types.mod fstarpu_mod.mod
+	$(AM_V_FC)$(FC) $(native_fortran_nf_partition_FCFLAGS) $(FCFLAGS) -c -o $@ `test -f 'native_fortran/nf_partition_cl.f90' || echo '$(srcdir)/'`native_fortran/nf_partition_cl.f90
+
+nf_partition.o: $(top_srcdir)/examples/native_fortran/nf_partition.f90 nf_types.mod nf_partition_cl.mod fstarpu_mod.mod
+	$(AM_V_FC)$(FC) $(native_fortran_nf_partition_FCFLAGS) $(FCFLAGS) -c -o $@ `test -f 'native_fortran/nf_partition.f90' || echo '$(srcdir)/'`native_fortran/nf_partition.f90
+
 endif

+ 2 - 1
examples/axpy/axpy_opencl.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2012 INRIA
+ * Copyright (C) 2016  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -34,7 +35,7 @@ void axpy_opencl(void *buffers[], void *_args)
 	cl_mem x = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
 	cl_mem y = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[1]);
 
-	id = starpu_worker_get_id();
+	id = starpu_worker_get_id_check();
 	devid = starpu_worker_get_devid(id);
 
 	err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, "_axpy_opencl", devid);

+ 2 - 2
examples/basic_examples/block_opencl.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2016  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -42,7 +42,7 @@ void opencl_codelet(void *descr[], void *_args)
         int ldz = (int) STARPU_BLOCK_GET_LDZ(descr[0]);
         float *multiplier = (float *)_args;
 
-        id = starpu_worker_get_id();
+        id = starpu_worker_get_id_check();
         devid = starpu_worker_get_devid(id);
 
         err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_code, "block", devid);

+ 1 - 1
examples/basic_examples/multiformat_conversion_codelets_opencl.c

@@ -31,7 +31,7 @@ void cpu_to_opencl_opencl_func(void *buffers[], void *args)
 	cl_mem src = (cl_mem) STARPU_MULTIFORMAT_GET_CPU_PTR(buffers[0]);
 	cl_mem dst = (cl_mem) STARPU_MULTIFORMAT_GET_OPENCL_PTR(buffers[0]);
 
-	id = starpu_worker_get_id();
+	id = starpu_worker_get_id_check();
 	devid = starpu_worker_get_devid(id);
 
 	err = starpu_opencl_load_kernel(&kernel,

+ 2 - 1
examples/basic_examples/multiformat_opencl.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011  INRIA
+ * Copyright (C) 2016  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -30,7 +31,7 @@ void multiformat_scal_opencl_func(void *buffers[], void *args)
 	unsigned n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
 	cl_mem val = (cl_mem)STARPU_MULTIFORMAT_GET_OPENCL_PTR(buffers[0]);
 
-	id = starpu_worker_get_id();
+	id = starpu_worker_get_id_check();
 	devid = starpu_worker_get_devid(id);
 
 	err = starpu_opencl_load_kernel(&kernel,

+ 8 - 1
examples/basic_examples/variable.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011, 2013, 2015  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2013, 2015-2016  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -16,10 +16,17 @@
  */
 
 #include <starpu.h>
+#include <config.h>
 
 #define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
 
+#ifdef STARPU_QUICK_CHECK
+static unsigned niter = 500;
+#elif !defined(STARPU_LONG_CHECK)
+static unsigned niter = 5000;
+#else
 static unsigned niter = 50000;
+#endif
 
 extern void cpu_codelet(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args);
 

+ 2 - 2
examples/basic_examples/variable_kernels_opencl.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2012  CNRS
+ * Copyright (C) 2010, 2012, 2016  CNRS
  * Copyright (C) 2011  Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -26,7 +26,7 @@ void opencl_codelet(void *descr[], void *_args)
 	cl_event event;
 	int id, devid, err;
 
-        id = starpu_worker_get_id();
+        id = starpu_worker_get_id_check();
         devid = starpu_worker_get_devid(id);
 
 	err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, "variable", devid);

+ 4 - 4
examples/basic_examples/vector_scal.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011, 2012, 2013, 2015  CNRS
- * Copyright (C) 2010-2015  Université de Bordeaux
+ * Copyright (C) 2010-2016  Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -44,10 +44,10 @@ static struct starpu_perfmodel vector_scal_model =
 	.symbol = "vector_scal"
 };
 
-static struct starpu_perfmodel vector_scal_power_model =
+static struct starpu_perfmodel vector_scal_energy_model =
 {
 	.type = STARPU_HISTORY_BASED,
-	.symbol = "vector_scal_power"
+	.symbol = "vector_scal_energy"
 };
 
 static struct starpu_codelet cl =
@@ -93,7 +93,7 @@ static struct starpu_codelet cl =
 	.nbuffers = 1,
 	.modes = {STARPU_RW},
 	.model = &vector_scal_model,
-	.power_model = &vector_scal_power_model
+	.energy_model = &vector_scal_energy_model
 };
 
 #ifdef STARPU_USE_OPENCL

+ 2 - 2
examples/basic_examples/vector_scal_opencl.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2012, 2013  CNRS
+ * Copyright (C) 2010, 2012, 2013, 2016  CNRS
  * Copyright (C) 2010  INRIA
  * Copyright (C) 2011, 2014  Université de Bordeaux
  *
@@ -38,7 +38,7 @@ void scal_opencl_func(void *buffers[], void *_args)
 	/* OpenCL copy of the vector pointer */
 	cl_mem val = (cl_mem)STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
 
-	id = starpu_worker_get_id();
+	id = starpu_worker_get_id_check();
 	devid = starpu_worker_get_devid(id);
 
 	err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, "vector_mult_opencl", devid);

+ 28 - 4
examples/binary/binary.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010-2011, 2013-2015  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -39,7 +39,7 @@ struct starpu_codelet cl =
 	.modes = {STARPU_RW}
 };
 
-int compute(char *file_name, int load_as_file)
+int compute(char *file_name, int load_as_file, int with_malloc)
 {
 	float float_array[4] STARPU_ATTRIBUTE_ALIGNED(16) = { 0.0f, 0.0f, 0.0f, 0.0f};
 	starpu_data_handle_t float_array_handle;
@@ -61,6 +61,20 @@ int compute(char *file_name, int load_as_file)
 		ret = starpu_opencl_load_binary_opencl(file_name, &opencl_program);
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_binary_opencl");
 	}
+	else if (with_malloc)
+	{
+		char *located_file_name;
+		char *located_dir_name;
+		char *opencl_program_source;
+		starpu_opencl_load_program_source_malloc(file_name, &located_file_name, &located_dir_name, &opencl_program_source);
+		ret = starpu_opencl_compile_opencl_from_string(opencl_program_source, "incrementer", NULL);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_compile_opencl_from_file");
+		ret = starpu_opencl_load_binary_opencl("incrementer", &opencl_program);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_binary_opencl");
+		free(located_file_name);
+		free(located_dir_name);
+		free(opencl_program_source);
+	}
 	else
 	{
 		char located_file_name[1024];
@@ -89,6 +103,10 @@ int compute(char *file_name, int load_as_file)
 	/* update the array in RAM */
 	starpu_data_unregister(float_array_handle);
 
+#ifdef STARPU_USE_OPENCL
+	starpu_opencl_unload_opencl(&opencl_program);
+#endif
+
 	FPRINTF(stderr, "array -> %f, %f, %f, %f\n", float_array[0], float_array[1], float_array[2], float_array[3]);
 
 	if (float_array[0] != niter || float_array[0] != float_array[1] + float_array[2] + float_array[3])
@@ -118,9 +136,15 @@ int main(int argc, char **argv)
 	}
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-	ret = compute("examples/incrementer/incrementer_kernels_opencl_kernel.cl", 1);
+	ret = compute("examples/incrementer/incrementer_kernels_opencl_kernel.cl", 1, -1);
 	if (ret == 0)
-		ret = compute("examples/incrementer/incrementer_kernels_opencl_kernel.cl", 0);
+		ret = compute("examples/incrementer/incrementer_kernels_opencl_kernel.cl", 0, 0);
+	else
+		FPRINTF(stderr, "Error when calling compute %d\n", ret);
+	if (ret == 0)
+	     ret = compute("examples/incrementer/incrementer_kernels_opencl_kernel.cl", 0, 1);
+	else
+		FPRINTF(stderr, "Error when calling compute %d\n", ret);
 
 	starpu_shutdown();
 	return ret;

+ 3 - 1
examples/cg/cg.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012, 2014-2015  Université de Bordeaux
+ * Copyright (C) 2010-2012, 2014-2016  Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -77,6 +77,8 @@ static starpu_data_handle_t A_handle, b_handle, x_handle;
 static TYPE *A, *b, *x;
 
 #ifdef STARPU_QUICK_CHECK
+static int i_max = 10;
+#elif !defined(STARPU_LONG_CHECK)
 static int i_max = 100;
 #else
 static int i_max = 1000;

+ 1 - 1
examples/cg/cg_kernels.c

@@ -478,7 +478,7 @@ int gemv_kernel(starpu_data_handle_t v1,
 						 STARPU_R,	starpu_data_get_sub_data(v2, 1, b1),
 						 STARPU_VALUE,	&one,	sizeof(one),
 						 STARPU_VALUE,	&p2,	sizeof(p2),
-						 STARPU_TAG_ONLY, (starpu_tag_t) (b2 * nblocks + b1),
+						 STARPU_TAG_ONLY, ((starpu_tag_t)b2) * nblocks + b1,
 						 0);
 			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 		}

+ 10 - 19
examples/cholesky/cholesky_grain_tag.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2015  Université de Bordeaux
+ * Copyright (C) 2009-2016  Université de Bordeaux
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012  CNRS
  *
@@ -267,6 +267,7 @@ static int cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 static void initialize_system(float **A, unsigned dim, unsigned pinned)
 {
 	int ret;
+	int flags = STARPU_MALLOC_SIMULATION_FOLDED;
 
 #ifdef STARPU_HAVE_MAGMA
 	magma_init();
@@ -289,16 +290,9 @@ static void initialize_system(float **A, unsigned dim, unsigned pinned)
 
 	starpu_cublas_init();
 
-#ifndef STARPU_SIMGRID
 	if (pinned)
-	{
-		starpu_malloc((void **)A, dim*dim*sizeof(float));
-	}
-	else
-	{
-		*A = malloc(dim*dim*sizeof(float));
-	}
-#endif
+		flags |= STARPU_MALLOC_PINNED;
+	starpu_malloc_flags((void **)A, dim*dim*sizeof(float), flags);
 }
 
 int cholesky_grain(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned nbigblocks, unsigned pinned)
@@ -323,16 +317,13 @@ int cholesky_grain(float *matA, unsigned size, unsigned ld, unsigned nblocks, un
 	return ret;
 }
 
-static void shutdown_system(float **matA, unsigned pinned)
+static void shutdown_system(float **matA, unsigned dim, unsigned pinned)
 {
+	int flags = STARPU_MALLOC_SIMULATION_FOLDED;
 	if (pinned)
-	{
-	     starpu_free(*matA);
-	}
-	else
-	{
-	     free(*matA);
-	}
+		flags |= STARPU_MALLOC_PINNED;
+
+	starpu_free_flags(*matA, dim*dim*sizeof(float), flags);
 
 	starpu_cublas_shutdown();
 	starpu_shutdown();
@@ -433,6 +424,6 @@ int main(int argc, char **argv)
 	free(test_mat);
 #endif
 
-	shutdown_system(&mat, pinned);
+	shutdown_system(&mat, size, pinned);
 	return ret;
 }

+ 8 - 10
examples/cholesky/cholesky_implicit.c

@@ -1,8 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2015  Université de Bordeaux
+ * Copyright (C) 2009-2016  Université de Bordeaux
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010, 2011, 2012, 2013  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2016  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -188,12 +188,11 @@ static int cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 
 static void execute_cholesky(unsigned size, unsigned nblocks)
 {
-	int ret;
 	float *mat = NULL;
 	unsigned i,j;
 
+	starpu_malloc_flags((void **)&mat, (size_t)size*size*sizeof(float), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
 #ifndef STARPU_SIMGRID
-	starpu_malloc((void **)&mat, (size_t)size*size*sizeof(float));
 	for (i = 0; i < size; i++)
 	{
 		for (j = 0; j < size; j++)
@@ -225,7 +224,7 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 	}
 #endif
 
-	ret = cholesky(mat, size, size, nblocks);
+	cholesky(mat, size, size, nblocks);
 
 #ifdef PRINT_OUTPUT
 	FPRINTF(stdout, "Results :\n");
@@ -303,7 +302,7 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 	        }
 		free(test_mat);
 	}
-	starpu_free(mat);
+	starpu_free_flags(mat, (size_t)size*size*sizeof(float), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
 }
 
 int main(int argc, char **argv)
@@ -324,10 +323,9 @@ int main(int argc, char **argv)
 
 	int ret;
 	ret = starpu_init(NULL);
-	starpu_fxt_stop_profiling();
+	//starpu_fxt_stop_profiling();
 
-	if (ret == -ENODEV)
-                return 77;
+	if (ret == -ENODEV) return 77;
         STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 #ifdef STARPU_USE_CUDA
@@ -359,5 +357,5 @@ int main(int argc, char **argv)
 	starpu_cublas_shutdown();
 	starpu_shutdown();
 
-	return ret;
+	return 0;
 }

+ 11 - 19
examples/cholesky/cholesky_tag.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2015  Université de Bordeaux
+ * Copyright (C) 2009-2016  Université de Bordeaux
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012, 2013  CNRS
  *
@@ -230,6 +230,7 @@ static void _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 static int initialize_system(float **A, unsigned dim, unsigned pinned)
 {
 	int ret;
+	int flags = STARPU_MALLOC_SIMULATION_FOLDED;
 
 #ifdef STARPU_HAVE_MAGMA
 	magma_init();
@@ -252,16 +253,10 @@ static int initialize_system(float **A, unsigned dim, unsigned pinned)
 
 	starpu_cublas_init();
 
-#ifndef STARPU_SIMGRID
 	if (pinned)
-	{
-		starpu_malloc((void **)A, (size_t)dim*dim*sizeof(float));
-	}
-	else
-	{
-		*A = malloc(dim*dim*sizeof(float));
-	}
-#endif
+		flags |= STARPU_MALLOC_PINNED;
+	starpu_malloc_flags((void **)A, dim*dim*sizeof(float), flags);
+
 	return 0;
 }
 
@@ -294,16 +289,13 @@ static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 	starpu_data_unregister(dataA);
 }
 
-static void shutdown_system(float **matA, unsigned pinned)
+static void shutdown_system(float **matA, unsigned dim, unsigned pinned)
 {
+	int flags = STARPU_MALLOC_SIMULATION_FOLDED;
 	if (pinned)
-	{
-		starpu_free(*matA);
-	}
-	else
-	{
-		free(*matA);
-	}
+		flags |= STARPU_MALLOC_PINNED;
+
+	starpu_free_flags(*matA, dim*dim*sizeof(float), flags);
 
 	starpu_cublas_shutdown();
 	starpu_shutdown();
@@ -404,6 +396,6 @@ int main(int argc, char **argv)
 	free(test_mat);
 #endif
 
-	shutdown_system(&mat, pinned);
+	shutdown_system(&mat, size, pinned);
 	return 0;
 }

+ 4 - 8
examples/cholesky/cholesky_tile_tag.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2015  Université de Bordeaux
+ * Copyright (C) 2009-2016  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -249,21 +249,17 @@ int main(int argc, char **argv)
 
 	starpu_cublas_init();
 
-#ifndef STARPU_SIMGRID
 	for (y = 0; y < nblocks; y++)
 	for (x = 0; x < nblocks; x++)
 	{
 		if (x <= y)
 		{
-#ifdef STARPU_HAVE_POSIX_MEMALIGN
-			posix_memalign((void **)&A[y][x], 128, BLOCKSIZE*BLOCKSIZE*sizeof(float));
-#else
-			A[y][x] = malloc(BLOCKSIZE*BLOCKSIZE*sizeof(float));
-#endif
+			starpu_malloc_flags((void **)&A[y][x], BLOCKSIZE*BLOCKSIZE*sizeof(float), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
 			assert(A[y][x]);
 		}
 	}
 
+#ifndef STARPU_SIMGRID
 	/* create a simple definite positive symetric matrix example
 	 *
 	 *	Hilbert matrix : h(i,j) = 1/(i+j+1) ( + n In to make is stable ) 
@@ -304,7 +300,7 @@ int main(int argc, char **argv)
 		if (x <= y)
 		{
 			starpu_data_unregister(A_state[y][x]);
-			free(A[y][x]);
+			starpu_free_flags(A[y][x], BLOCKSIZE*BLOCKSIZE*sizeof(float), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
 		}
 	}
 

+ 3 - 2
examples/filters/custom_mf/conversion_opencl.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2012 INRIA
+ * Copyright (C) 2016 CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -34,7 +35,7 @@ void cpu_to_opencl_opencl_func(void *buffers[], void *args)
 	struct point *aop;
 	aop = (struct point *) CUSTOM_GET_CPU_PTR(buffers[0]);
 
-	id = starpu_worker_get_id();
+	id = starpu_worker_get_id_check();
 	devid = starpu_worker_get_devid(id);
 
 	err = starpu_opencl_load_kernel(&kernel,
@@ -56,7 +57,7 @@ void cpu_to_opencl_opencl_func(void *buffers[], void *args)
 		STARPU_OPENCL_REPORT_ERROR(err);
 		assert(0);
 	}
-	
+
 
 	{
 		size_t global=n;

+ 3 - 2
examples/filters/custom_mf/custom_interface.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2012 INRIA
+ * Copyright (C) 2016  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -413,7 +414,7 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node,
 	 */
 	cl_context context;
 	cl_command_queue queue;
-	int id = starpu_worker_get_id();
+	int id = starpu_worker_get_id_check();
 	int devid = starpu_worker_get_devid(id);
 	starpu_opencl_get_queue(devid, &queue);
 	starpu_opencl_get_context(devid, &context);
@@ -456,7 +457,7 @@ static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node,
 	 */
 	cl_context context;
 	cl_command_queue queue;
-	int id = starpu_worker_get_id();
+	int id = starpu_worker_get_id_check();
 	int devid = starpu_worker_get_devid(id);
 	starpu_opencl_get_queue(devid, &queue);
 	starpu_opencl_get_context(devid, &context);

+ 3 - 2
examples/filters/custom_mf/custom_opencl.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2012 INRIA
+ * Copyright (C) 2016 CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -33,7 +34,7 @@ void custom_scal_opencl_func(void *buffers[], void *args)
 	struct point *aop;
 	aop = (struct point *) CUSTOM_GET_CPU_PTR(buffers[0]);
 
-	id = starpu_worker_get_id();
+	id = starpu_worker_get_id_check();
 	devid = starpu_worker_get_devid(id);
 
 	err = starpu_opencl_load_kernel(&kernel,
@@ -55,7 +56,7 @@ void custom_scal_opencl_func(void *buffers[], void *args)
 		STARPU_OPENCL_REPORT_ERROR(err);
 		assert(0);
 	}
-	
+
 
 	{
 		size_t global=n;

+ 2 - 2
examples/filters/fblock_opencl.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011, 2012, 2013  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2016  CNRS
  * Copyright (C) 2011, 2014-2015  Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -46,7 +46,7 @@ void opencl_func(void *buffers[], void *cl_arg)
         unsigned ldy = STARPU_BLOCK_GET_LDY(buffers[0]);
         unsigned ldz = STARPU_BLOCK_GET_LDZ(buffers[0]);
 
-	id = starpu_worker_get_id();
+	id = starpu_worker_get_id_check();
 	devid = starpu_worker_get_devid(id);
 
 	err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, "fblock_opencl", devid);

+ 1 - 1
examples/filters/fmatrix.c

@@ -119,7 +119,7 @@ int main(int argc, char **argv)
                 for(i=0 ; i<NX ; i++)
 		{
                         FPRINTF(stderr, "%4d ", matrix[(j*NX)+i]);
-			if (matrix[(j*NX)+i] != n*12)
+			if (matrix[(j*NX)+i] != (int) n*12)
 			{
 				FPRINTF(stderr, "Incorrect result %4d != %4d", matrix[(j*NX)+i], n*12);
 				ret=1;

+ 5 - 2
examples/filters/shadow2d.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2012-2014  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2015  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -121,6 +121,8 @@ void cpu_func(void *buffers[], void *cl_arg)
 #ifdef STARPU_USE_CUDA
 void cuda_func(void *buffers[], void *cl_arg)
 {
+	cudaError_t cures;
+
         /* length of the shadowed source matrix */
         unsigned ld = STARPU_MATRIX_GET_LD(buffers[0]);
         unsigned n = STARPU_MATRIX_GET_NX(buffers[0]);
@@ -138,7 +140,8 @@ void cuda_func(void *buffers[], void *cl_arg)
 	/* If things go right, sizes should match */
 	STARPU_ASSERT(n == n2);
 	STARPU_ASSERT(m == m2);
-	cudaMemcpy2DAsync(val2, ld2*sizeof(*val2), val, ld*sizeof(*val), n*sizeof(*val), m, cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
+	cures = cudaMemcpy2DAsync(val2, ld2*sizeof(*val2), val, ld*sizeof(*val), n*sizeof(*val), m, cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
+        if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
 }
 #endif
 

+ 5 - 1
examples/heat/dw_factolu.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2009-2015  Université de Bordeaux
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010, 2011, 2012  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2016  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -546,6 +546,8 @@ void dw_callback_codelet_update_u11(void *argcb)
 			ret = starpu_task_submit(task21);
 			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 		}
+
+		free(remaining);
 	}
 }
 
@@ -636,6 +638,8 @@ void dw_callback_codelet_update_u12_21(void *argcb)
 				STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 			}
 		}
+
+		free(remaining);
 	}
 }
 

+ 4 - 4
examples/heat/dw_factolu_grain.c

@@ -1,8 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010-2011, 2014-2015  Université de Bordeaux
+ * Copyright (C) 2009, 2010-2011, 2014-2016  Université de Bordeaux
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010, 2011, 2012  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2016  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -316,7 +316,7 @@ static void dw_factoLU_grain_inner(float *matA, unsigned size, unsigned inner_si
 		 */
 
 		unsigned ndeps_tags = (nblocks - maxk)*(nblocks - maxk);
-		starpu_tag_t *tag_array = malloc(ndeps_tags*sizeof(starpu_tag_t));
+		starpu_tag_t *tag_array = calloc(ndeps_tags, sizeof(starpu_tag_t));
 		STARPU_ASSERT(tag_array);
 
 		unsigned ind = 0;
@@ -326,7 +326,7 @@ static void dw_factoLU_grain_inner(float *matA, unsigned size, unsigned inner_si
 			tag_array[ind++] = TAG22(maxk-1, i, j, tag_prefix);
 		}
 
-		starpu_tag_wait_array(ndeps_tags, tag_array);
+		starpu_tag_wait_array(ind, tag_array);
 
 		free(tag_array);
 

+ 31 - 31
examples/heat/dw_factolu_kernels.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010-2012, 2014-2015  Université de Bordeaux
- * Copyright (C) 2010, 2011  CNRS
+ * Copyright (C) 2010, 2011, 2016  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -41,7 +41,7 @@ void display_stat_heat(void)
 	unsigned worker;
 	for (worker = 0; worker < nworkers; worker++)
 	{
-		count_total_per_worker[worker] = count_11_per_worker[worker] 
+		count_total_per_worker[worker] = count_11_per_worker[worker]
 					+ count_12_per_worker[worker]
 					+ count_21_per_worker[worker]
 					+ count_22_per_worker[worker];
@@ -59,7 +59,7 @@ void display_stat_heat(void)
 		{
 			char name[32];
 			starpu_worker_get_name(worker, name, 32);
-			
+
 			FPRINTF(stderr, "\t\t%s -> %u / %u (%2.2f %%)\n", name, count_11_per_worker[worker], count_11_total, (100.0*count_11_per_worker[worker])/count_11_total);
 		}
 	}
@@ -71,12 +71,12 @@ void display_stat_heat(void)
 		{
 			char name[32];
 			starpu_worker_get_name(worker, name, 32);
-			
+
 			FPRINTF(stderr, "\t\t%s -> %u / %u (%2.2f %%)\n", name, count_12_per_worker[worker], count_12_total, (100.0*count_12_per_worker[worker])/count_12_total);
 		}
 	}
-	
-	
+
+
 	FPRINTF(stderr, "\t21 (TRSM)\n");
 	for (worker = 0; worker < nworkers; worker++)
 	{
@@ -84,11 +84,11 @@ void display_stat_heat(void)
 		{
 			char name[32];
 			starpu_worker_get_name(worker, name, 32);
-			
+
 			FPRINTF(stderr, "\t\t%s -> %u / %u (%2.2f %%)\n", name, count_21_per_worker[worker], count_21_total, (100.0*count_21_per_worker[worker])/count_21_total);
 		}
 	}
-	
+
 	FPRINTF(stderr, "\t22 (SGEMM)\n");
 	for (worker = 0; worker < nworkers; worker++)
 	{
@@ -96,14 +96,14 @@ void display_stat_heat(void)
 		{
 			char name[32];
 			starpu_worker_get_name(worker, name, 32);
-			
+
 			FPRINTF(stderr, "\t\t%s -> %u / %u (%2.2f %%)\n", name, count_22_per_worker[worker], count_22_total, (100.0*count_22_per_worker[worker])/count_22_total);
 		}
 	}
 }
 
 /*
- *   U22 
+ *   U22
  */
 
 static inline void dw_common_cpu_codelet_update_u22(void *descr[], int s, STARPU_ATTRIBUTE_UNUSED void *_args)
@@ -127,7 +127,7 @@ static inline void dw_common_cpu_codelet_update_u22(void *descr[], int s, STARPU
 	switch (s)
 	{
 		case 0:
-			STARPU_SGEMM("N", "N",	dy, dx, dz, 
+			STARPU_SGEMM("N", "N",	dy, dx, dz,
 				-1.0f, left, ld21, right, ld12,
 					     1.0f, center, ld22);
 			break;
@@ -152,7 +152,7 @@ void dw_cpu_codelet_update_u22(void *descr[], void *_args)
 {
 	dw_common_cpu_codelet_update_u22(descr, 0, _args);
 
-	int id = starpu_worker_get_id();
+	int id = starpu_worker_get_id_check();
 	count_22_per_worker[id]++;
 }
 
@@ -161,7 +161,7 @@ void dw_cublas_codelet_update_u22(void *descr[], void *_args)
 {
 	dw_common_cpu_codelet_update_u22(descr, 1, _args);
 
-	int id = starpu_worker_get_id();
+	int id = starpu_worker_get_id_check();
 	count_22_per_worker[id]++;
 }
 #endif /* STARPU_USE_CUDA */
@@ -175,7 +175,7 @@ static inline void dw_common_codelet_update_u12(void *descr[], int s, STARPU_ATT
 	float *sub11;
 	float *sub12;
 
-	sub11 = (float *)STARPU_MATRIX_GET_PTR(descr[0]);	
+	sub11 = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
 	sub12 = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
 
 	unsigned ld11 = STARPU_MATRIX_GET_LD(descr[0]);
@@ -183,7 +183,7 @@ static inline void dw_common_codelet_update_u12(void *descr[], int s, STARPU_ATT
 
 	unsigned nx12 = STARPU_MATRIX_GET_NX(descr[1]);
 	unsigned ny12 = STARPU_MATRIX_GET_NY(descr[1]);
-	
+
 #ifdef STARPU_USE_CUDA
 	cublasStatus status;
 #endif
@@ -215,7 +215,7 @@ void dw_cpu_codelet_update_u12(void *descr[], void *_args)
 {
 	dw_common_codelet_update_u12(descr, 0, _args);
 
-	int id = starpu_worker_get_id();
+	int id = starpu_worker_get_id_check();
 	count_12_per_worker[id]++;
 }
 
@@ -224,12 +224,12 @@ void dw_cublas_codelet_update_u12(void *descr[], void *_args)
 {
 	 dw_common_codelet_update_u12(descr, 1, _args);
 
-	int id = starpu_worker_get_id();
+	int id = starpu_worker_get_id_check();
 	count_12_per_worker[id]++;
 }
 #endif /* STARPU_USE_CUDA */
 
-/* 
+/*
  * U21
  */
 
@@ -246,7 +246,7 @@ static inline void dw_common_codelet_update_u21(void *descr[], int s, STARPU_ATT
 
 	unsigned nx21 = STARPU_MATRIX_GET_NX(descr[1]);
 	unsigned ny21 = STARPU_MATRIX_GET_NY(descr[1]);
-	
+
 #ifdef STARPU_USE_CUDA
 	cublasStatus status;
 #endif
@@ -275,7 +275,7 @@ void dw_cpu_codelet_update_u21(void *descr[], void *_args)
 {
 	dw_common_codelet_update_u21(descr, 0, _args);
 
-	int id = starpu_worker_get_id();
+	int id = starpu_worker_get_id_check();
 	count_21_per_worker[id]++;
 }
 
@@ -284,10 +284,10 @@ void dw_cublas_codelet_update_u21(void *descr[], void *_args)
 {
 	dw_common_codelet_update_u21(descr, 1, _args);
 
-	int id = starpu_worker_get_id();
+	int id = starpu_worker_get_id_check();
 	count_21_per_worker[id]++;
 }
-#endif 
+#endif
 
 /*
  *	U11
@@ -304,15 +304,15 @@ static inline void debug_print(float *tab, unsigned ld, unsigned n)
 		}
 		FPRINTF(stderr, "\n");
 	}
-	
+
 	FPRINTF(stderr, "\n");
 }
 
-static inline void dw_common_codelet_update_u11(void *descr[], int s, STARPU_ATTRIBUTE_UNUSED void *_args) 
+static inline void dw_common_codelet_update_u11(void *descr[], int s, STARPU_ATTRIBUTE_UNUSED void *_args)
 {
 	float *sub11;
 
-	sub11 = (float *)STARPU_MATRIX_GET_PTR(descr[0]); 
+	sub11 = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
 
 	unsigned long nx = STARPU_MATRIX_GET_NX(descr[0]);
 	unsigned long ld = STARPU_MATRIX_GET_LD(descr[0]);
@@ -327,9 +327,9 @@ static inline void dw_common_codelet_update_u11(void *descr[], int s, STARPU_ATT
 				float pivot;
 				pivot = sub11[z+z*ld];
 				STARPU_ASSERT(pivot != 0.0f);
-		
+
 				STARPU_SSCAL(nx - z - 1, (1.0f/pivot), &sub11[z+(z+1)*ld], ld);
-		
+
 				STARPU_SGER(nx - z - 1, nx - z - 1, -1.0f,
 						&sub11[z+(z+1)*ld], ld,
 						&sub11[(z+1)+z*ld], 1,
@@ -345,9 +345,9 @@ static inline void dw_common_codelet_update_u11(void *descr[], int s, STARPU_ATT
 				cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 				STARPU_ASSERT(pivot != 0.0f);
-				
+
 				cublasSscal(nx - z - 1, 1.0f/pivot, &sub11[z+(z+1)*ld], ld);
-				
+
 				cublasSger(nx - z - 1, nx - z - 1, -1.0f,
 								&sub11[z+(z+1)*ld], ld,
 								&sub11[(z+1)+z*ld], 1,
@@ -369,7 +369,7 @@ void dw_cpu_codelet_update_u11(void *descr[], void *_args)
 {
 	dw_common_codelet_update_u11(descr, 0, _args);
 
-	int id = starpu_worker_get_id();
+	int id = starpu_worker_get_id_check();
 	count_11_per_worker[id]++;
 }
 
@@ -378,7 +378,7 @@ void dw_cublas_codelet_update_u11(void *descr[], void *_args)
 {
 	dw_common_codelet_update_u11(descr, 1, _args);
 
-	int id = starpu_worker_get_id();
+	int id = starpu_worker_get_id_check();
 	count_11_per_worker[id]++;
 }
 #endif /* STARPU_USE_CUDA */

+ 2 - 2
examples/heat/dw_sparse_cg.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010, 2011, 2015  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2016  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -204,7 +204,7 @@ void launch_new_cg_iteration(struct cg_problem *problem)
 
 	unsigned iter = problem->i;
 
-	unsigned long long maskiter = (iter*1024);
+	unsigned long long maskiter = ((unsigned long long)iter*1024);
 
 	/* q = A d */
 	struct starpu_task *task4 = create_task(maskiter | 4UL);

+ 31 - 31
examples/heat/heat.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010, 2012, 2015  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2016  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -131,7 +131,7 @@ static void parse_args(int argc, char **argv)
 
 
 /*
- * The Finite element method code 
+ * The Finite element method code
  *
  *   B              C
  *	**********
@@ -365,38 +365,38 @@ static void solve_system(unsigned size, unsigned subsize, float *result, int *Re
 		LUB = malloc(subsize*sizeof(float));
 	}
 
-		/* L */
-		STARPU_STRSV("L", "N", "N", subsize, A, subsize, B, 1);
-	
-		/* U */
-	        STARPU_STRSV("U", "N", "U", subsize, A, subsize, B, 1);
-	
-		STARPU_ASSERT(DIM == size);
-	
+	/* L */
+	STARPU_STRSV("L", "N", "N", subsize, A, subsize, B, 1);
+
+	/* U */
+	STARPU_STRSV("U", "N", "U", subsize, A, subsize, B, 1);
+
+	STARPU_ASSERT(DIM == size);
+
 	if (check)
 	{
 		/* compute the error on (LUB - savedB) which should be 0 */
-	
+
 		/* LUB = B */
 		memcpy(LUB, B, subsize*sizeof(float));
-	
-	
+
+
 		/* LUB = U * LUB */
 		STARPU_STRMV("U", "N", "U", subsize, A, subsize, LUB, 1);
-		
+
 		/* LUB = L * LUB */
 		STARPU_STRMV("L", "N", "N", subsize, A, subsize, LUB, 1);
-	
+
 		/* LUB -= B */
 		STARPU_SAXPY(subsize, -1.0f, savedB, 1, LUB, 1);
-	
+
 		/* check if LUB is close to the 0 vector */
 		int maxind = STARPU_ISAMAX(subsize, LUB, 1);
 		FPRINTF(stderr, "max error (LUX - B) = %e\n",LUB[maxind - 1]);
 
 		float sum = STARPU_SASUM(subsize, LUB, 1);
 		FPRINTF(stderr,"avg. error %e\n", sum/subsize);
-	
+
 		free(LUB);
 		free(savedB);
 	}
@@ -430,7 +430,7 @@ unsigned compute_pivot_array(int *RefArray, int *RefArrayBack, unsigned size)
 	/* first inner nodes */
 	for (theta = 1; theta < ntheta - 1 ; theta++)
 	{
-		for (thick = 1; thick < nthick - 1; thick++) 
+		for (thick = 1; thick < nthick - 1; thick++)
 		{
 			/* inner nodes are unknown */
 			RefArrayBack[NODE_NUMBER(theta, thick)] = index;
@@ -447,7 +447,7 @@ unsigned compute_pivot_array(int *RefArray, int *RefArrayBack, unsigned size)
 		/* Lower boundary "South" */
 		RefArrayBack[NODE_NUMBER(theta, 0)] = index;
 		RefArray[index++] = NODE_NUMBER(theta, 0);
-		
+
 		/* Upper boundary "North" */
 		RefArrayBack[NODE_NUMBER(theta, nthick-1)] = index;
 		RefArray[index++] = NODE_NUMBER(theta, nthick-1);
@@ -494,7 +494,7 @@ void build_mesh(point *mesh)
 				case 1:
 					mesh[NODE_NUMBER(theta,thick)].x =
 							-100 + RMIN+((RMAX-RMIN)*theta)/(ntheta - 1);
-					mesh[NODE_NUMBER(theta,thick)].y = 
+					mesh[NODE_NUMBER(theta,thick)].y =
 							RMIN+((RMAX-RMIN)*thick)/(nthick - 1);
 					break;
 				case 2:
@@ -527,7 +527,7 @@ static unsigned long build_neighbour_vector(unsigned long*neighbours, unsigned n
 				if ((former_theta + dtheta) >= 0 && (former_theta + dtheta) <= (int)ntheta )
 				{
 					/* we got a possible neighbour */
-					unsigned pnode = 
+					unsigned pnode =
 						NODE_NUMBER((former_theta + dtheta), (former_thick + dthick));
 
 					neighbours[nneighbours++] = TRANSLATEBACK(pnode);
@@ -602,7 +602,7 @@ static void build_sparse_stiffness_matrix_B(point *pmesh, float *B, float *Bform
 
 		for (neighbour = 0; neighbour < nneighbours; neighbour++)
 		{
-			unsigned n = neighbours[neighbour]; 
+			unsigned n = neighbours[neighbour];
 			if (n >= newsize)
 			{
 				B[j] -= compute_A_value(TRANSLATE(n), TRANSLATE(j), pmesh)*Bformer[TRANSLATE(n)];
@@ -611,7 +611,7 @@ static void build_sparse_stiffness_matrix_B(point *pmesh, float *B, float *Bform
 	}
 }
 
-static unsigned build_sparse_stiffness_matrix_A(point *pmesh, float **nzval, uint32_t **colind, 
+static unsigned build_sparse_stiffness_matrix_A(point *pmesh, float **nzval, uint32_t **colind,
 						uint32_t *rowptr, unsigned newsize, int *RefArray, int *RefArrayBack)
 {
 	unsigned j;
@@ -641,12 +641,12 @@ static unsigned build_sparse_stiffness_matrix_A(point *pmesh, float **nzval, uin
 			{
 
 				val = compute_A_value(TRANSLATE(j), TRANSLATE(nodeneighbour), pmesh);
-	
+
 				if (val != 0.0f)
 				{
 					*nzval = realloc(*nzval, (pos+1)*sizeof(float));
 					*colind = realloc(*colind, (pos+1)*sizeof(uint32_t));
-	
+
 					(*nzval)[pos] = val;
 					(*colind)[pos] = nodeneighbour;
 
@@ -714,13 +714,13 @@ int main(int argc, char **argv)
 
 	build_mesh(pmesh);
 
-	/* now simplify that problem given the boundary conditions 
+	/* now simplify that problem given the boundary conditions
 	 * to do so, we remove the already known variables from the system
 	 * by pivoting the various know variable, RefArray keep track of that
-	 * pivoting */ 
+	 * pivoting */
 	newsize = compute_pivot_array(RefArray, RefArrayBack, DIM);
 
-	/* we can either use a direct method (LU decomposition here) or an 
+	/* we can either use a direct method (LU decomposition here) or an
 	 * iterative method (conjugate gradient here) */
 	if (use_cg)
 	{
@@ -748,17 +748,17 @@ int main(int argc, char **argv)
 		{
 			result[TRANSLATE(i)] = B[i];
 		}
-	
+
 		for (i = newsize ; i < DIM; i++)
 		{
 			result[TRANSLATE(i)] = Bformer[TRANSLATE(i)];
 		}
-	
+
 	}
 	else
 	{
 
-		/* unfortunately CUDA does not allow late memory registration, 
+		/* unfortunately CUDA does not allow late memory registration,
 		 * we need to do the malloc using CUDA itself ... */
 		initialize_system(&A, &B, newsize, pinned);
 

+ 14 - 2
examples/incrementer/incrementer.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2011, 2013-2015  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013  CNRS
+ * Copyright (C) 2009-2011, 2013-2016  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013, 2016  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -19,8 +19,16 @@
  * This is just a small example which increments two values of a vector several times.
  */
 #include <starpu.h>
+#include <config.h>
 
+#ifdef STARPU_QUICK_CHECK
+static unsigned niter = 500;
+#elif !defined(STARPU_LONG_CHECK)
+static unsigned niter = 5000;
+#else
 static unsigned niter = 50000;
+#endif
+
 #define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
 
 #ifdef STARPU_USE_CUDA
@@ -113,6 +121,10 @@ int main(int argc, char **argv)
 
 	end = starpu_timing_now();
 
+#ifdef STARPU_USE_OPENCL
+	starpu_opencl_unload_opencl(&opencl_program);
+#endif
+
 	FPRINTF(stderr, "array -> %f, %f, %f, %f\n", float_array[0],
                 float_array[1], float_array[2], float_array[3]);
 

+ 0 - 0
examples/incrementer/incrementer_kernels_opencl.c


Some files were not shown because too many files changed in this diff