Prechádzať zdrojové kódy

Merge @10584:11363 and several bug fixes resulting from the merge

Marc Sergent 11 rokov pred
rodič
commit
c23d69272f
100 zmenil súbory, kde vykonal 96708 pridanie a 984 odobranie
  1. 2 3
      AUTHORS
  2. 35 2
      ChangeLog
  3. 21 5
      INSTALL
  4. 8 10
      Makefile.am
  5. 153 29
      configure.ac
  6. 30 0
      doc/Makefile.am
  7. 37 30
      doc/doxygen/Makefile.am
  8. 22 22
      doc/doxygen/chapters/advanced_examples.doxy
  9. 54 7
      doc/doxygen/chapters/api/codelet_and_tasks.doxy
  10. 6 0
      doc/doxygen/chapters/api/cuda_extensions.doxy
  11. 27 1
      doc/doxygen/chapters/api/data_interfaces.doxy
  12. 1 1
      doc/doxygen/chapters/api/data_management.doxy
  13. 31 6
      doc/doxygen/chapters/api/data_out_of_core.doxy
  14. 22 11
      doc/doxygen/chapters/api/insert_task.doxy
  15. 5 0
      doc/doxygen/chapters/api/mic_extensions.doxy
  16. 14 4
      doc/doxygen/chapters/api/mpi.doxy
  17. 4 0
      doc/doxygen/chapters/api/multiformat_data_interface.doxy
  18. 10 0
      doc/doxygen/chapters/api/opencl_extensions.doxy
  19. 15 15
      doc/doxygen/chapters/api/performance_model.doxy
  20. 15 0
      doc/doxygen/chapters/api/profiling.doxy
  21. 198 0
      doc/doxygen/chapters/api/sc_hypervisor/sc_hypervisor.doxy
  22. 191 0
      doc/doxygen/chapters/api/sc_hypervisor/sc_hypervisor_usage.doxy
  23. 5 0
      doc/doxygen/chapters/api/scc_extensions.doxy
  24. 0 304
      doc/doxygen/chapters/api/scheduling_context_hypervisor.doxy
  25. 90 89
      doc/doxygen/chapters/api/scheduling_contexts.doxy
  26. 11 15
      doc/doxygen/chapters/api/scheduling_policy.doxy
  27. 10 0
      doc/doxygen/chapters/api/standard_memory_library.doxy
  28. 2 2
      doc/doxygen/chapters/api/task_bundles.doxy
  29. 306 0
      doc/doxygen/chapters/api/threads.doxy
  30. 111 0
      doc/doxygen/chapters/api/toolbox.doxy
  31. 47 0
      doc/doxygen/chapters/api/workers.doxy
  32. 1 1
      doc/doxygen/chapters/code/disk_copy.c
  33. 7 0
      doc/doxygen/chapters/configure_options.doxy
  34. 94199 0
      doc/doxygen/chapters/data_trace.eps
  35. BIN
      doc/doxygen/chapters/data_trace.pdf
  36. BIN
      doc/doxygen/chapters/data_trace.png
  37. 106 0
      doc/doxygen/chapters/environment_variables.doxy
  38. 1 0
      doc/doxygen/chapters/files.doxy
  39. 22 4
      doc/doxygen/chapters/mpi_support.doxy
  40. 13 4
      doc/doxygen/chapters/optimize_performance.doxy
  41. 20 2
      doc/doxygen/chapters/performance_feedback.doxy
  42. 98 25
      doc/doxygen/chapters/scheduling_context_hypervisor.doxy
  43. 49 25
      doc/doxygen/chapters/scheduling_contexts.doxy
  44. 2 0
      doc/doxygen/doxygen-config.cfg.in
  45. 2 1
      doc/doxygen/doxygen.cfg
  46. 5 4
      doc/doxygen/refman.tex
  47. 77 0
      doc/tutorial/hello_world_mvsc.c
  48. 10 6
      examples/Makefile.am
  49. 4 4
      examples/basic_examples/dynamic_handles.c
  50. 1 1
      examples/binary/binary.c
  51. 104 0
      examples/callback/prologue.c
  52. 12 12
      examples/cg/cg_kernels.c
  53. 11 3
      examples/cholesky/cholesky.h
  54. 14 0
      examples/cholesky/cholesky_grain_tag.c
  55. 20 7
      examples/cholesky/cholesky_implicit.c
  56. 19 39
      examples/cholesky/cholesky_models.c
  57. 15 0
      examples/cholesky/cholesky_tag.c
  58. 14 0
      examples/cholesky/cholesky_tile_tag.c
  59. 1 1
      examples/cpp/incrementer_cpp.cpp
  60. 26 0
      examples/heat/dw_factolu.c
  61. 0 5
      examples/heat/dw_factolu.h
  62. 5 0
      examples/heat/dw_factolu_grain.c
  63. 5 0
      examples/heat/dw_factolu_tag.c
  64. 26 84
      examples/heat/lu_kernels_model.c
  65. 20 0
      examples/heat/lu_kernels_model.h
  66. 14 16
      examples/interface/complex.c
  67. 3 1
      examples/interface/complex_codelet.h
  68. 4 2
      examples/lu/lu_example.c
  69. 3 3
      examples/mandelbrot/mandelbrot.c
  70. 8 8
      examples/pipeline/pipeline.c
  71. 1 1
      examples/sched_ctx/dummy_sched_with_ctx.c
  72. 2 2
      examples/sched_ctx/parallel_code.c
  73. 60 0
      examples/sched_ctx/prio.c
  74. 3 3
      examples/sched_ctx/sched_ctx.c
  75. 2 2
      examples/sched_ctx_utils/sched_ctx_utils.c
  76. 4 2
      examples/stencil/stencil-kernels.c
  77. 7 1
      examples/stencil/stencil-tasks.c
  78. 6 6
      gcc-plugin/src/tasks.c
  79. 4 4
      gcc-plugin/tests/base.c
  80. 2 2
      gcc-plugin/tests/lib-user.c
  81. 10 10
      gcc-plugin/tests/mocks.h
  82. 3 3
      gcc-plugin/tests/opencl.c
  83. 2 2
      gcc-plugin/tests/output-pointer.c
  84. 4 4
      gcc-plugin/tests/pointers.c
  85. 1 3
      include/starpu.h
  86. 0 2
      include/starpu_data.h
  87. 6 1
      include/starpu_data_interfaces.h
  88. 1 1
      include/starpu_deprecated_api.h
  89. 15 9
      include/starpu_disk.h
  90. 3 2
      include/starpu_fxt.h
  91. 25 41
      include/starpu_perfmodel.h
  92. 1 0
      include/starpu_profiling.h
  93. 27 42
      include/starpu_sched_ctx.h
  94. 51 0
      include/starpu_sched_ctx_hypervisor.h
  95. 12 7
      include/starpu_scheduler.h
  96. 7 0
      include/starpu_task.h
  97. 18 13
      include/starpu_task_util.h
  98. 5 5
      include/starpu_thread.h
  99. 2 2
      include/starpu_thread_util.h
  100. 0 0
      include/starpu_util.h

+ 2 - 3
AUTHORS

@@ -1,4 +1,3 @@
-Simon Archipoff <simon.archipoff@etu.u-bordeaux1.fr>
 Cédric Augonnet <cedric.augonnet@inria.fr>
 William Braik <wbraik@gmail.com>
 Jérôme Clet-Ortega <jerome.clet-ortega@labri.fr>
@@ -9,11 +8,9 @@ Jean-Marie Couteyen <jm.couteyen@gmail.com>
 Nathalie Furmento <nathalie.furmento@labri.fr>
 David Gómez <david_gomez1380@yahoo.com.mx>
 Sylvain Henry <sylvain.henry@inria.fr>
-Andra Hugo <andra.hugo@inria.fr>
 Mehdi Juhoor <mjuhoor@gmail.com>
 Antoine Lucas <antoine.lucas.33@gmail.com>
 Brice Mortier <brice.mortier@etu.u-bordeaux1.fr>
-Joris Pablo <joris.pablo@orange.fr>
 Damien Pasqualinotto <dam.pasqualinotto@wanadoo.fr>
 Nguyen Quôc-Dinh <nguyen.quocdinh@gmail.com>
 Cyril Roelandt <cyril.roelandt@inria.fr>
@@ -23,3 +20,5 @@ Ludovic Stordeur <ludovic.stordeur@inria.fr>
 François Tessier <francois.tessier@inria.fr>
 Samuel Thibault <samuel.thibault@labri.fr>
 Pierre-André Wacrenier <wacrenier@labri.fr>
+Andra Hugo <andra.hugo@inria.fr>
+Joris Pablo <joris.pablo@orange.fr>

+ 35 - 2
ChangeLog

@@ -40,15 +40,39 @@ New features:
     handle (sequential consistency will be enabled or disabled based
     on the value of the function parameter and the value of the
     sequential consistency defined for the given data)
+  * Performance models files are now stored in a directory whose name
+    include the version of the performance model format. The version
+    number is also written in the file itself.
+    When updating the format, the internal variable
+    _STARPU_PERFMODEL_VERSION should be updated. It is then possible
+    to switch easily between differents versions of StarPU having
+    different performance model formats.
+  * Tasks can now define a optional prologue callback which is executed
+    on the host when the task becomes ready for execution, before getting
+    scheduled.
+  * Small CUDA allocations (<= 4MiB) are now batched to avoid the huge
+    cudaMalloc overhead.
+  * Prefetching is now done for all schedulers when it can be done whatever
+    the scheduling decision.
 
 Small features:
-  * Add cl_arg_free field to enable automatic free(cl_arg) on task
-    destroy.
   * New functions starpu_data_acquire_cb_sequential_consistency() and
     starpu_data_acquire_on_node_cb_sequential_consistency() which allows
     to enable or disable sequential consistency
   * New configure option --enable-fxt-lock which enables additional
     trace events focused on locks behaviour during the execution
+  * New function starpu_perfmodel_directory() to print directory
+    storing performance models. Available through the new option -d of
+    the tool starpu_perfmodel_display
+  * New batch files to execute StarPU applications under Microsoft
+    Visual Studio (They are installed in path_to_starpu/bin/mvsc)/
+  * Add cl_arg_free, callback_arg_free, prologue_callback_arg_free fields to
+    enable automatic free(cl_arg); free(callback_arg);
+    free(prologue_callback_arg) on task destroy.
+  * New function starpu_task_build
+  * Functions starpu_insert_task and starpu_mpi_insert_task are
+    renamed in starpu_task_insert and starpu_mpi_task_insert. Old
+    names are kept to avoid breaking old codes.
 
 Changes:
   * Fix of the livelock issue discovered while executing applications
@@ -56,6 +80,15 @@ Changes:
     threshold before a blocking lock.
   * Data interfaces (variable, vector, matrix and block) now define
     pack und unpack functions
+  * Fix for properly dealing with NAN on windows systems
+  * StarPU-MPI: Fix for being able to receive data which have not yet
+    been registered by the application (i.e it did not call
+    starpu_data_set_tag(), data are received as a raw memory)
+  * StarPU-MPI: Fix for being able to receive data with the same tag
+    from several nodes (see mpi/tests/gather.c)
+  * Function starpu_sched_ctx_create() now takes a variable argument
+    list to define the scheduler to be used, and the minimum and
+    maximum priority values
 
 StarPU 1.1.0 (svn revision xxxx)
 ==============================================

+ 21 - 5
INSTALL

@@ -220,15 +220,15 @@ Running StarPU Applications on Microsoft Visual C
 -------------------------------------------------
 
 Batch files are provided to run StarPU applications under Microsoft
-Visual C. They are installed in path_to_starpu/bin/msvc.
+Visual C. They are installed in path_to_starpu/bin/mvsc.
 
 To execute a StarPU application, you first need to set the environment
 variable STARPUPATH.
 
-cd c:\cygwin\home\ci\starpu\
-set STARPUPATH=c:\cygwin\home\ci\starpu\
-cd bin\msvc
-starpu_exec.bat starpu_simple.c
+c:\....> cd c:\cygwin\home\ci\starpu\
+c:\....> set STARPUPATH=c:\cygwin\home\ci\starpu\
+c:\....> cd bin\mvsc
+c:\....> starpu_open.bat starpu_simple.c
 
 The batch script will run Microsoft Visual C with a basic project file
 to run the given application.
@@ -236,3 +236,19 @@ to run the given application.
 The batch script starpu_clean.bat can be used to delete all
 compilation generated files.
 
+The batch script starpu_exec.bat can be used to compile and execute a
+StarPU application from the command prompt.
+
+c:\....> cd c:\cygwin\home\ci\starpu\
+c:\....> set STARPUPATH=c:\cygwin\home\ci\starpu\
+c:\....> cd bin\mvsc
+c:\....> starpu_exec.bat ..\..\..\..\examples\basic_examples\hello_world.c
+
+MVSC StarPU Execution
+...
+/out:hello_world.exe
+...
+Hello world (params = {1, 2.00000})
+Callback function got argument 0000042
+c:\....>
+

+ 8 - 10
Makefile.am

@@ -19,10 +19,7 @@ CLEANFILES = *.gcno *.gcda *.linkinfo
 
 SUBDIRS = src
 SUBDIRS += tools tests
-
-if BUILD_DOC
-SUBDIRS += doc/doxygen
-endif
+SUBDIRS += doc
 
 if USE_MPI
 SUBDIRS += mpi
@@ -49,7 +46,7 @@ SUBDIRS += sc_hypervisor
 endif
 
 pkgconfigdir = $(libdir)/pkgconfig
-pkgconfig_DATA = libstarpu.pc starpu-1.0.pc starpu-1.1.pc
+pkgconfig_DATA = libstarpu.pc starpu-1.0.pc starpu-1.1.pc starpu-1.2.pc
 
 versincludedir = $(includedir)/starpu/$(STARPU_EFFECTIVE_VERSION)
 versinclude_HEADERS = 				\
@@ -76,6 +73,7 @@ versinclude_HEADERS = 				\
 	include/starpu_bound.h			\
 	include/starpu_scheduler.h		\
 	include/starpu_sched_ctx.h		\
+	include/starpu_sched_ctx_hypervisor.h	\
 	include/starpu_top.h			\
 	include/starpu_deprecated_api.h         \
 	include/starpu_hash.h			\
@@ -100,14 +98,14 @@ all-local:
 	cd starpu-top ; $(QMAKE) ; $(MAKE)
 clean-local:
 	cd starpu-top ; $(QMAKE) ; $(MAKE) clean ; $(RM) Makefile
-	$(RM) starpu-top/starpu_top.1 starpu-top/starpu_top
+	$(RM) starpu-top/starpu_top.1 starpu-top/starpu_top$(EXEEXT)
 # TODO: resources
 install-exec-local:
 	$(MKDIR_P) $(DESTDIR)$(bindir)
-	$(INSTALL_STRIP_PROGRAM) starpu-top/starpu_top $(DESTDIR)$(bindir)
+	-$(INSTALL_STRIP_PROGRAM) starpu-top/starpu_top$(EXEEXT) $(DESTDIR)$(bindir)
 uninstall-local:
-	$(RM) $(DESTDIR)$(bindir)/starpu_top
-	$(RM) starpu-top/starpu_top
+	$(RM) $(DESTDIR)$(bindir)/starpu_top$(EXEEXT)
+	$(RM) starpu-top/starpu_top$(EXEEXT)
 	$(RM) starpu-top/Makefile
 
 if STARPU_HAVE_HELP2MAN
@@ -124,7 +122,7 @@ else
 txtdir = ${docdir}
 endif
 txt_DATA = AUTHORS COPYING.LGPL README STARPU-REVISION
-EXTRA_DIST = AUTHORS COPYING.LGPL README STARPU-VERSION STARPU-REVISION build-aux/svn2cl.xsl
+EXTRA_DIST = AUTHORS COPYING.LGPL README STARPU-VERSION STARPU-REVISION build-aux/svn2cl.xsl mic-configure
 
 DISTCLEANFILES = STARPU-REVISION
 

+ 153 - 29
configure.ac

@@ -54,8 +54,8 @@ AC_CANONICAL_SYSTEM
 dnl Automake 1.11 introduced `silent-rules' and `color-tests'.  Use them
 dnl when they're available.
 m4_ifdef([AM_SILENT_RULES],
-  [AM_INIT_AUTOMAKE([1.11 -Wall -Werror foreign silent-rules color-tests parallel-tests])],
-  [AM_INIT_AUTOMAKE([1.10 -Wall -Werror foreign])])
+  [AM_INIT_AUTOMAKE([1.11 -Wall foreign silent-rules color-tests parallel-tests])],
+  [AM_INIT_AUTOMAKE([1.10 -Wall foreign])])
 
 m4_ifdef([AM_SILENT_RULES],
   [AM_SILENT_RULES(yes)])
@@ -255,8 +255,20 @@ AM_CONDITIONAL([STARPU_LONG_CHECK], [test "x$enable_long_check" = "xyes"])
 
 AC_CHECK_HEADERS([malloc.h], [AC_DEFINE([STARPU_HAVE_MALLOC_H], [1], [Define to 1 if you have the <malloc.h> header file.])])
 
-AC_CHECK_HEADERS([valgrind/valgrind.h], [AC_DEFINE([STARPU_HAVE_VALGRIND_H], [1], [Define to 1 if you have the <valgrind/valgrind.h> header file.])])
-AC_CHECK_HEADERS([valgrind/helgrind.h], [AC_DEFINE([STARPU_HAVE_HELGRIND_H], [1], [Define to 1 if you have the <valgrind/helgrind.h> header file.])])
+AC_ARG_ENABLE(valgrind, [AS_HELP_STRING([--disable-valgrind],
+				   [Do not check the availability of valgrind.h and helgrind.h])],
+				   enable_valgrind=$enableval, enable_valgrind=yes)
+if test "$enable_valgrind" != "no" ; then
+   AC_CHECK_HEADERS([valgrind/valgrind.h], [AC_DEFINE([STARPU_HAVE_VALGRIND_H], [1], [Define to 1 if you have the <valgrind/valgrind.h> header file.])])
+   AC_CHECK_HEADERS([valgrind/helgrind.h], [AC_DEFINE([STARPU_HAVE_HELGRIND_H], [1], [Define to 1 if you have the <valgrind/helgrind.h> header file.])])
+fi
+if test "$enable_valgrind" = "full" ; then
+   AC_DEFINE([STARPU_VALGRIND_FULL], [1], [Define to 1 to disable STARPU_SKIP_IF_VALGRIND when running tests.])
+fi
+
+AC_CHECK_FUNC([sched_yield], [AC_DEFINE([STARPU_HAVE_SCHED_YIELD], [1], [Define to 1 if the function sched_yield is available.])])
+
+AC_CHECK_HEADERS([aio.h])
 
 # This defines HAVE_SYNC_VAL_COMPARE_AND_SWAP
 STARPU_CHECK_SYNC_VAL_COMPARE_AND_SWAP
@@ -342,11 +354,6 @@ if test x$enable_cpu = xyes; then
 	AC_DEFINE(STARPU_USE_CPU, [1], [CPU driver is activated])
 fi
 
-# How many parallel worker can we support ?
-nmaxcombinedworkers=`expr 2 \* $maxcpus`
-AC_DEFINE_UNQUOTED(STARPU_NMAX_COMBINEDWORKERS,
-	[$nmaxcombinedworkers], [Maximum number of worker combinations])
-
 ###############################################################################
 #                                                                             #
 #                                 CUDA settings                               #
@@ -497,7 +504,7 @@ if test x$enable_cuda = xyes -o x$enable_cuda = xmaybe; then
 	STARPU_CHECK_CUDA("$CUDA_ROOT", "$CUDA_PATH", "$CUDA_INC_PATH", "$CUDA_LIB_PATH")
     fi
     if test "$have_valid_cuda" = "no" ; then
-	for f in "/usr/local/cuda" "/c/cuda" "/cygdrive/c/cuda" "/opt/cuda" "$CUDA_ROOT" "$CUDA_PATH" "$CUDA_INC_PATH/.." "$CUDA_INSTALL_PATH" "$CUDA_TOOLKIT"; do
+	for f in "/usr/local/cuda" "/c/cuda" "/cygdrive/c/cuda" "/opt/cuda" "$CUDA_ROOT" "$CUDA_PATH" "$CUDA_INC_PATH/.." "$CUDA_INC/.." "$CUDA_BIN/.." "$CUDA_SDK/.." "$CUDA_INSTALL_PATH" "$CUDA_TOOLKIT"; do
 	    if test -n "$f" ; then
 		STARPU_CHECK_CUDA("$f", "no", "no")
 		if test "$have_valid_cuda" = "yes" ; then
@@ -945,7 +952,7 @@ AC_ARG_ENABLE(blocking-drivers, [AS_HELP_STRING([--enable-blocking-drivers], [en
 				enable_blocking=$enableval, enable_blocking=no)
 AC_MSG_RESULT($enable_blocking)
 
-if test x$enable_blocking = xno -a x$enable_simgrid != xyes ; then
+if test x$enable_blocking = xno ; then
 	AC_DEFINE(STARPU_NON_BLOCKING_DRIVERS, [1], [drivers must progress])
 fi
 
@@ -967,7 +974,7 @@ AC_DEFINE_UNQUOTED(STARPU_MAXMICDEVS, [$nmaxmicdev],
 AC_MSG_CHECKING(maximum number of MIC threads)
 AC_ARG_ENABLE(maxmicthreads, [AS_HELP_STRING([--enable-maxmicthreads=<number>],
 			[maximum number of MIC threads])],
-			nmaxmicthreads=$enableval, nmaxmicthreads=128)
+			nmaxmicthreads=$enableval, nmaxmicthreads=940)
 AC_MSG_RESULT($nmaxmicthread)
 
 AC_DEFINE_UNQUOTED(STARPU_MAXMICCORES, [$nmaxmicthreads],
@@ -996,7 +1003,6 @@ AC_DEFUN([STARPU_CHECK_COI_RUNTIME],
     __coi_dir=$1
     __coi_include_dir=$2
     __coi_lib_dir=$3
-    __coi_lib_name=$4
 
     if test "$__coi_dir" != "no" -a "$__coi_dir" != "" ; then
 	AC_MSG_CHECKING(whether MIC's COI runtime is available in $__coi_dir)
@@ -1025,14 +1031,14 @@ AC_DEFUN([STARPU_CHECK_COI_RUNTIME],
     AC_CHECK_HEADER([source/COIEngine_source.h],[have_valid_coi=yes],[have_valid_coi=no])
 
     if test "$have_valid_coi" = "yes" ; then
-	AC_HAVE_LIBRARY([$__coi_lib_name],[have_valid_coi=yes],[have_valid_coi=no])
+	AC_HAVE_LIBRARY([$4],[have_valid_coi=yes],[have_valid_coi=no])
 
         if test "$have_valid_coi" = "no" ; then
             if test "$3" = "no" -a "$__coi_dir" != "no" ; then
 		# ${__coi_dir}/lib didn't work, let's try with lib64
                 __coi_lib_dir="$__coi_dir/lib64"
 		LDFLAGS="${SAVED_LDFLAGS} -L$__coi_lib_dir"
-	        AC_HAVE_LIBRARY([$__coi_lib_name],[have_valid_coi=yes],[have_valid_coi=no])
+	        AC_HAVE_LIBRARY([$4],[have_valid_coi=yes],[have_valid_coi=no])
             fi
         fi
     fi
@@ -1041,8 +1047,89 @@ AC_DEFUN([STARPU_CHECK_COI_RUNTIME],
         STARPU_COI_CPPFLAGS="-I$__coi_include_dir"
     fi
 
-    if test "$have_valid_coi" = "yes" -a "$__coi_lib_dir" != "no"; then
-        STARPU_COI_LDFLAGS="-L$__coi_lib_dir -l$__coi_lib_name"
+    if test "$have_valid_coi" = "yes" ; then
+        if test "$__coi_lib_dir" != "no"; then
+	    STARPU_COI_LDFLAGS="-L$__coi_lib_dir"
+        fi
+	STARPU_COI_LDFLAGS="${STARPU_COI_LDFLAGS} -l$4"
+    fi
+
+    CPPFLAGS="${SAVED_CPPFLAGS}"
+    LDFLAGS="${SAVED_LDFLAGS}"
+])
+
+AC_ARG_WITH(scif-dir,
+	[AS_HELP_STRING([--with-scif-dir=<path>],
+	[specify the MIC's SCIF installation directory])],
+	[scif_dir="$withval"],
+	[scif_dir=no])
+
+AC_ARG_WITH(scif-include-dir,
+	[AS_HELP_STRING([--with-scif-include-dir=<path>],
+	[specify where the MIC's SCIF headers are installed])],
+	[scif_include_dir="$withval"],
+	[scif_include_dir=no])
+
+AC_ARG_WITH(scif-lib-dir,
+	[AS_HELP_STRING([--with-scif-lib-dir=<path>],
+	[specify where the MIC's SCIF libraries are installed])],
+	[scif_lib_dir="$withval"],
+	[scif_lib_dir=no])
+
+AC_DEFUN([STARPU_CHECK_SCIF_RUNTIME],
+[
+    __scif_dir=$1
+    __scif_include_dir=$2
+    __scif_lib_dir=$3
+
+    if test "$__scif_dir" != "no" -a "$__scif_dir" != "" ; then
+	AC_MSG_CHECKING(whether MIC's SCIF runtime is available in $__scif_dir)
+    else
+	AC_MSG_CHECKING(whether MIC's SCIF runtime is available)
+    fi
+    AC_MSG_RESULT()
+
+    if test "$__scif_include_dir" = "no" -a "$__scif_dir" != "no" ; then
+        __scif_include_dir="${__scif_dir}/include"
+    fi
+    if test "$__scif_lib_dir" = "no" -a "$__scif_dir" != "no" ; then
+        __scif_lib_dir="${__scif_dir}/lib"
+    fi
+
+    SAVED_CPPFLAGS="$CPPFLAGS"
+    SAVED_LDFLAGS="$LDFLAGS"
+
+    if test "$__scif_include_dir" != "no" ; then
+        CPPFLAGS="${CPPFLAGS} -I$__scif_include_dir"
+    fi
+    if test "$__scif_lib_dir" != "no" ; then
+	LDFLAGS="${LDFLAGS} -L$__scif_lib_dir"
+    fi
+
+#    AC_CHECK_HEADER([source/SCIFEngine_source.h],[have_valid_scif=yes],[have_valid_scif=no])
+
+#    if test "$have_valid_scif" = "yes" ; then
+	AC_HAVE_LIBRARY([scif],[have_valid_scif=yes],[have_valid_scif=no])
+
+        if test "$have_valid_scif" = "no" ; then
+            if test "$3" = "no" -a "$__scif_dir" != "no" ; then
+		# ${__scif_dir}/lib didn't work, let's try with lib64
+                __scif_lib_dir="$__scif_dir/lib64"
+		LDFLAGS="${SAVED_LDFLAGS} -L$__scif_lib_dir"
+	        AC_HAVE_LIBRARY([scif],[have_valid_scif=yes],[have_valid_scif=no])
+            fi
+        fi
+#    fi
+
+    if test "$have_valid_scif" = "yes" -a "$__scif_include_dir" != "no"; then
+        STARPU_SCIF_CPPFLAGS="-I$__scif_include_dir"
+    fi
+
+    if test "$have_valid_scif" = "yes" ; then
+        if test "$__scif_lib_dir" != "no"; then
+	    STARPU_SCIF_LDFLAGS="-L$__scif_lib_dir"
+        fi
+	STARPU_SCIF_LDFLAGS="${STARPU_SCIF_LDFLAGS} -lscif"
     fi
 
     CPPFLAGS="${SAVED_CPPFLAGS}"
@@ -1051,20 +1138,27 @@ AC_DEFUN([STARPU_CHECK_COI_RUNTIME],
 
 if test x$enable_mic = xyes ; then
 
-    STARPU_CHECK_COI_RUNTIME($coi_dir, $coi_include_dir, $coi_lib_dir, "coi_host")
+    STARPU_CHECK_COI_RUNTIME($coi_dir, $coi_include_dir, $coi_lib_dir, coi_host)
 
     # Host runtime is not compatible, we are probably cross-compiling
     # Let's have a look for the device runtime which lib has a different name
     if test "$have_valid_coi" = "no" ; then
-	    STARPU_CHECK_COI_RUNTIME($coi_dir, $coi_include_dir, $coi_lib_dir, "coi_device")
+	    STARPU_CHECK_COI_RUNTIME($coi_dir, $coi_include_dir, $coi_lib_dir, coi_device)
     fi
 
+    STARPU_CHECK_SCIF_RUNTIME($scif_dir, $scif_include_dir, $scif_lib_dir)
+
     if test "$have_valid_coi" = "no" ; then
 	AC_MSG_ERROR([cannot find MIC's COI runtime])
     fi
+    if test "$have_valid_scif" = "no" ; then
+	AC_MSG_ERROR([cannot find MIC's SCIF runtime])
+    fi
 
     AC_SUBST(STARPU_COI_CPPFLAGS)
     AC_SUBST(STARPU_COI_LDFLAGS)
+    AC_SUBST(STARPU_SCIF_CPPFLAGS)
+    AC_SUBST(STARPU_SCIF_LDFLAGS)
 fi
 
 ###############################################################################
@@ -1207,14 +1301,20 @@ AC_ARG_ENABLE(debug, [AS_HELP_STRING([--enable-debug], [enable debug mode])],
 			enable_debug=$enableval, enable_debug=no)
 AC_MSG_RESULT($enable_debug)
 
+AC_ARG_ENABLE(spinlock_check, [AS_HELP_STRING([--enable-spinlock-check], [enable spinlock check])], enable_spinlock_check=$enableval, enable_spinlock_check=no)
+
 if test x$enable_debug = xyes; then
 	CFLAGS="$CFLAGS -O0"
-	AC_DEFINE(STARPU_SPINLOCK_CHECK, [1], [check spinlock use])
+	enable_spinlock_check=yes
 else
 	CFLAGS="-O3 $CFLAGS"
 fi
 CFLAGS+=" -gdwarf-2 -g3 "
 
+if test x$enable_spinlock_check = xyes; then
+	AC_DEFINE(STARPU_SPINLOCK_CHECK, [1], [check spinlock use])
+fi
+
 AC_MSG_CHECKING(whether extra checks should be performed)
 AC_ARG_ENABLE(fast, [AS_HELP_STRING([--enable-fast],
 			[do not enforce assertions])],
@@ -1242,6 +1342,7 @@ AC_SUBST(COVERAGE, $enable_coverage)
 AM_CONDITIONAL(STARPU_COVERAGE_ENABLED, [test "x$enable_coverage" = "xyes"])
 if test x$enable_coverage = xyes; then
 	CFLAGS="${CFLAGS} --coverage"
+	CXXFLAGS="${CXXFLAGS} --coverage"
 	LDFLAGS="${LDFLAGS} --coverage"
 fi
 
@@ -1476,6 +1577,15 @@ AC_MSG_CHECKING(Maximum number of workers)
 AC_MSG_RESULT($nmaxworkers)
 AC_DEFINE_UNQUOTED(STARPU_NMAXWORKERS, [$nmaxworkers], [Maximum number of workers])
 
+# Computes the maximun number of combined worker
+nmaxcombinedworkers=`expr $maxcpus + $nmaxmicthreads`  
+AC_MSG_CHECKING(Maximum number of workers combinations)
+AC_MSG_RESULT($nmaxcombinedworkers)
+AC_DEFINE_UNQUOTED(STARPU_NMAX_COMBINEDWORKERS,
+	[$nmaxcombinedworkers], [Maximum number of worker combinations])
+
+
+
 # Computes the maximum number of implementations per arch
 AC_MSG_CHECKING(maximum number of implementations)
 AC_ARG_ENABLE(maximplementations, [AS_HELP_STRING([--enable-maximplementations=<number>],
@@ -1485,6 +1595,13 @@ AC_MSG_RESULT($maximplementations)
 AC_DEFINE_UNQUOTED(STARPU_MAXIMPLEMENTATIONS, [$maximplementations],
 		[maximum number of implementations])
 
+AC_LANG_PUSH([C++])
+AC_CHECK_HEADERS([leveldb/db.h], [AC_DEFINE([STARPU_HAVE_LEVELDB], [1], [Define to 1 if you have the <leveldb/db.h> header file.])])
+STARPU_HAVE_LIBRARY(LEVELDB, [leveldb])
+AM_CONDITIONAL(STARPU_HAVE_LEVELDB, test "x$ac_cv_lib_leveldb_main" = "xyes")
+AC_LANG_POP([C++])
+
+
 ###############################################################################
 #                                                                             #
 #                                    MPI                                      #
@@ -2175,22 +2292,24 @@ AC_ARG_ENABLE(build-doc, [AS_HELP_STRING([--disable-build-doc],
 			[disable building of documentation])],
 			enable_build_doc=$enableval, enable_build_doc=yes)
 
-# Check whether doxygen and pdflatex are installed
-AC_PATH_PROG(doxygencommand, doxygen)
-if test "$doxygencommand" = "" ; then
-	enable_build_doc="no"
-else
-	DOXYGEN_VERSION_MAJOR=`$doxygencommand --version| cut -d '.' -f1`
+if test "$enable_build_doc" = "yes" ; then
+   # Check whether doxygen and pdflatex are installed
+   AC_PATH_PROG(doxygencommand, doxygen)
+   if test "$doxygencommand" = "" ; then
+      	enable_build_doc="no"
+   else
+   	DOXYGEN_VERSION_MAJOR=`$doxygencommand --version| cut -d '.' -f1`
 	DOXYGEN_VERSION_MINOR=`$doxygencommand --version| cut -d '.' -f2`
 	if test $DOXYGEN_VERSION_MAJOR -ge 1 -a $DOXYGEN_VERSION_MINOR -ge 8 ; then
 	   	enable_build_doc="yes"
 	else
 	   	enable_build_doc="no"
 	fi
-fi
-AC_PATH_PROG(pdflatex, pdflatex)
-if test "pdflatexcommand" = "" ; then
+   fi
+   AC_PATH_PROG(pdflatex, pdflatex)
+   if test "pdflatexcommand" = "" ; then
 	enable_build_doc="no"
+   fi
 fi
 AC_MSG_CHECKING(whether documentation should be compiled)
 AC_MSG_RESULT($enable_build_doc)
@@ -2256,13 +2375,16 @@ AC_OUTPUT([
 	libstarpu.pc
 	starpu-1.0.pc
 	starpu-1.1.pc
+	starpu-1.2.pc
 	mpi/libstarpumpi.pc
 	mpi/starpumpi-1.0.pc
 	mpi/starpumpi-1.1.pc
+	mpi/starpumpi-1.2.pc
 	starpufft/Makefile
 	starpufft/libstarpufft.pc
 	starpufft/starpufft-1.0.pc
 	starpufft/starpufft-1.1.pc
+	starpufft/starpufft-1.2.pc
 	examples/Makefile
 	examples/stencil/Makefile
 	tests/Makefile
@@ -2282,9 +2404,11 @@ AC_OUTPUT([
 	sc_hypervisor/Makefile
 	sc_hypervisor/src/Makefile
 	sc_hypervisor/examples/Makefile
+	doc/Makefile
 	doc/doxygen/Makefile
 	doc/doxygen/doxygen-config.cfg
 	doc/doxygen/doxygen_filter.sh
+	tools/mvsc/starpu_var.bat
 ])
 
 AC_MSG_NOTICE([

+ 30 - 0
doc/Makefile.am

@@ -0,0 +1,30 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2013  Centre National de la Recherche Scientifique
+#
+# Permission is granted to copy, distribute and/or modify this document
+# under the terms of the GNU Free Documentation License, Version 1.3
+# or any later version published by the Free Software Foundation;
+# with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts.
+#
+# See the GNU Free Documentation License in COPYING.GFDL for more details.
+
+if BUILD_DOC
+SUBDIRS = doxygen
+endif
+
+EXTRA_DIST =    tutorial/hello_world.c \
+		tutorial/hello_world_plugin.c \
+		tutorial/hello_world_mvsc.c \
+		tutorial/Makefile \
+		tutorial/README \
+		tutorial/vector_scal.c \
+		tutorial/vector_scal_cpu.c \
+		tutorial/vector_scal_cuda.cu \
+		tutorial/vector_scal_opencl.c \
+		tutorial/vector_scal_opencl_kernel.cl \
+		tutorial/vector_scal_plugin.c \
+		tutorial/vector_scal_plugin_cuda.cu
+
+txtdir = ${docdir}/tutorial
+txt_DATA = $(EXTRA_DIST)

+ 37 - 30
doc/doxygen/Makefile.am

@@ -86,7 +86,6 @@ chapters =	\
 	chapters/api/performance_model.doxy \
 	chapters/api/profiling.doxy \
 	chapters/api/running_driver.doxy \
-	chapters/api/scheduling_context_hypervisor.doxy \
 	chapters/api/scheduling_contexts.doxy \
 	chapters/api/scheduling_policy.doxy \
 	chapters/api/standard_memory_library.doxy \
@@ -94,61 +93,68 @@ chapters =	\
 	chapters/api/task_lists.doxy \
 	chapters/api/top.doxy \
 	chapters/api/versioning.doxy \
-	chapters/api/workers.doxy
+	chapters/api/workers.doxy \
+	chapters/api/threads.doxy \
+	chapters/api/toolbox.doxy \
+	chapters/api/sc_hypervisor/sc_hypervisor.doxy \
+	chapters/api/sc_hypervisor/sc_hypervisor_usage.doxy
 
 starpu_config.h: $(top_srcdir)/include/starpu_config.h.in
 	sed 's/#undef \(.*\)/#define \1 1/' $< > $@
 
 chapters/version.sty: $(chapters)
-	@for f in $(chapters) ; do \
-                if test -f $(top_srcdir)/doc/doxygen/$$f ; then stat --format=%Y $(top_srcdir)/doc/doxygen/$$f 2>/dev/null ; fi \
-        done | sort -r | head -1 > timestamp
-	@if test -s timestamp ; then \
-		LC_ALL=C date --date=@`cat timestamp` +"%d %B %Y" > timestamp_updated 2>/dev/null;\
-		LC_ALL=C date --date=@`cat timestamp` +"%B %Y" > timestamp_updated_month 2>/dev/null;\
+	for f in $(chapters) ; do \
+                if test -f $(top_srcdir)/doc/doxygen/$$f ; then stat --format=%Y $(top_srcdir)/doc/doxygen/$$f ; fi \
+        done | sort -r | head -1 > timestamp_sty
+	if test -s timestamp_sty ; then \
+		LC_ALL=C date --date=@`cat timestamp_sty` +"%d %B %Y" > timestamp_sty_updated ;\
+		LC_ALL=C date --date=@`cat timestamp_sty` +"%B %Y" > timestamp_sty_updated_month ;\
 	fi
-	@if test -s timestamp_updated ; then \
-		echo "\newcommand{\STARPUUPDATED}{"`cat timestamp_updated`"}" > $(top_srcdir)/doc/doxygen/chapters/version.sty;\
+	if test -s timestamp_sty_updated ; then \
+		echo "\newcommand{\STARPUUPDATED}{"`cat timestamp_sty_updated`"}" > $(top_srcdir)/doc/doxygen/chapters/version.sty;\
 	else \
 		echo "\newcommand{\STARPUUPDATED}{unknown date}" > $(top_srcdir)/doc/doxygen/chapters/version.sty;\
 	fi
-	@echo "\newcommand{\STARPUVERSION}{$(VERSION)}" >> $(top_srcdir)/doc/doxygen/chapters/version.sty
-	@for f in timestamp timestamp_updated timestamp_updated_month ; do \
+	echo "\newcommand{\STARPUVERSION}{$(VERSION)}" >> $(top_srcdir)/doc/doxygen/chapters/version.sty
+	for f in timestamp_sty timestamp_sty_updated timestamp_sty_updated_month ; do \
 		if test -f $$f ; then $(RM) $$f ; fi ;\
 	done
 
 chapters/version.html: $(chapters)
-	@for f in $(chapters) ; do \
-                if test -f $(top_srcdir)/doc/doxygen/$$f ; then stat --format=%Y $(top_srcdir)/doc/doxygen/$$f 2>/dev/null ; fi \
-        done | sort -r | head -1 > timestamp
-	@if test -s timestamp ; then \
-		LC_ALL=C date --date=@`cat timestamp` +"%d %B %Y" > timestamp_updated 2>/dev/null;\
-		LC_ALL=C date --date=@`cat timestamp` +"%B %Y" > timestamp_updated_month 2>/dev/null;\
+	for f in $(chapters) ; do \
+                if test -f $(top_srcdir)/doc/doxygen/$$f ; then stat --format=%Y $(top_srcdir)/doc/doxygen/$$f ; fi \
+        done | sort -r | head -1 > timestamp_html
+	if test -s timestamp_html ; then \
+		LC_ALL=C date --date=@`cat timestamp_html` +"%d %B %Y" > timestamp_html_updated ;\
+		LC_ALL=C date --date=@`cat timestamp_html` +"%B %Y" > timestamp_html_updated_month ;\
 	fi
-	@echo "This manual documents the usage of StarPU version $(VERSION)." > $(top_srcdir)/doc/doxygen/chapters/version.html
-	@if test -s timestamp_updated ; then \
-		echo "Its contents was last updated on "`cat timestamp_updated`"." >> $(top_srcdir)/doc/doxygen/chapters/version.html;\
+	echo "This manual documents the usage of StarPU version $(VERSION)." > $(top_srcdir)/doc/doxygen/chapters/version.html
+	if test -s timestamp_html_updated ; then \
+		echo "Its contents was last updated on "`cat timestamp_html_updated`"." >> $(top_srcdir)/doc/doxygen/chapters/version.html;\
 	else \
 		echo "Its contents was last updated on <em>unknown_date</em>." >> $(top_srcdir)/doc/doxygen/chapters/version.html;\
 	fi
-	@for f in timestamp timestamp_updated timestamp_updated_month ; do \
+	for f in timestamp_html timestamp_html_updated timestamp_html_updated_month ; do \
 		if test -f $$f ; then $(RM) $$f ; fi ;\
 	done
 
-EXTRA_DIST	= 		\
-	$(chapters) 		\
-	chapters/version.sty	\
-	chapters/version.html	\
-	chapters/tasks_size_overhead.png	\
-	chapters/tasks_size_overhead.eps	\
-	chapters/tasks_size_overhead.pdf	\
+EXTRA_DIST	= 					\
+	$(chapters) 					\
+	chapters/version.sty				\
+	chapters/version.html				\
+	chapters/data_trace.eps				\
+	chapters/data_trace.pdf				\
+	chapters/data_trace.png				\
 	chapters/starpu_non_linear_memset_regression_based.png	\
 	chapters/starpu_non_linear_memset_regression_based.eps	\
 	chapters/starpu_non_linear_memset_regression_based.pdf	\
 	chapters/starpu_starpu_slu_lu_model_11.png	\
 	chapters/starpu_starpu_slu_lu_model_11.eps	\
 	chapters/starpu_starpu_slu_lu_model_11.pdf	\
-	doxygen.cfg 		\
+	chapters/tasks_size_overhead.png		\
+	chapters/tasks_size_overhead.eps		\
+	chapters/tasks_size_overhead.pdf		\
+	doxygen.cfg 					\
 	refman.tex
 
 dox_inputs = $(DOX_CONFIG) 				\
@@ -180,6 +186,7 @@ dox_inputs = $(DOX_CONFIG) 				\
 	$(top_srcdir)/include/starpu_scheduler.h	\
 	$(top_srcdir)/include/starpu_sched_node.h	\
 	$(top_srcdir)/include/starpu_sched_ctx.h	\
+	$(top_srcdir)/include/starpu_sched_ctx_hypervisor.h		\
 	$(top_srcdir)/include/starpu_top.h		\
 	$(top_srcdir)/include/starpu_hash.h		\
 	$(top_srcdir)/include/starpu_rand.h		\

+ 22 - 22
doc/doxygen/chapters/advanced_examples.doxy

@@ -399,11 +399,11 @@ the number of iterations in the base.
 StarPU will automatically determine when the performance model is calibrated,
 or rather, it will assume the performance model is calibrated until the
 application submits a task for which the performance can not be predicted. For
-::STARPU_HISTORY_BASED, StarPU will require 10 (::_STARPU_CALIBRATION_MINIMUM)
+::STARPU_HISTORY_BASED, StarPU will require 10 (_STARPU_CALIBRATION_MINIMUM)
 measurements for a given size before estimating that an average can be taken as
 estimation for further executions with the same size. For
 ::STARPU_REGRESSION_BASED and ::STARPU_NL_REGRESSION_BASED, StarPU will require
-10 (::_STARPU_CALIBRATION_MINIMUM) measurements, and that the minimum measured
+10 (_STARPU_CALIBRATION_MINIMUM) measurements, and that the minimum measured
 data size is smaller than 90% of the maximum measured data size (i.e. the
 measurement interval is large enough for a regression to have a meaning).
 Calibration can also be forced by setting the \ref STARPU_CALIBRATE environment
@@ -481,7 +481,7 @@ to a less optimal solution. This increases even more computation time.
 
 \section InsertTaskUtility Insert Task Utility
 
-StarPU provides the wrapper function starpu_insert_task() to ease
+StarPU provides the wrapper function starpu_task_insert() to ease
 the creation and submission of tasks.
 
 Here the implementation of the codelet:
@@ -508,17 +508,17 @@ struct starpu_codelet mycodelet = {
 };
 \endcode
 
-And the call to the function starpu_insert_task():
+And the call to the function starpu_task_insert():
 
 \code{.c}
-starpu_insert_task(&mycodelet,
+starpu_task_insert(&mycodelet,
                    STARPU_VALUE, &ifactor, sizeof(ifactor),
                    STARPU_VALUE, &ffactor, sizeof(ffactor),
                    STARPU_RW, data_handles[0], STARPU_RW, data_handles[1],
                    0);
 \endcode
 
-The call to starpu_insert_task() is equivalent to the following
+The call to starpu_task_insert() is equivalent to the following
 code:
 
 \code{.c}
@@ -540,7 +540,7 @@ int ret = starpu_task_submit(task);
 Here a similar call using ::STARPU_DATA_ARRAY.
 
 \code{.c}
-starpu_insert_task(&mycodelet,
+starpu_task_insert(&mycodelet,
                    STARPU_DATA_ARRAY, data_handles, 2,
                    STARPU_VALUE, &ifactor, sizeof(ifactor),
                    STARPU_VALUE, &ffactor, sizeof(ffactor),
@@ -554,11 +554,11 @@ instance, assuming that the index variable <c>i</c> was registered as handle
 
 \code{.c}
 /* Compute which portion we will work on, e.g. pivot */
-starpu_insert_task(&which_index, STARPU_W, i_handle, 0);
+starpu_task_insert(&which_index, STARPU_W, i_handle, 0);
 
 /* And submit the corresponding task */
 STARPU_DATA_ACQUIRE_CB(i_handle, STARPU_R,
-                       starpu_insert_task(&work, STARPU_RW, A_handle[i], 0));
+                       starpu_task_insert(&work, STARPU_RW, A_handle[i], 0));
 \endcode
 
 The macro ::STARPU_DATA_ACQUIRE_CB submits an asynchronous request for
@@ -637,7 +637,7 @@ dot products with partitioned vectors:
 
 \code{.c}
 for (b = 0; b < nblocks; b++)
-    starpu_insert_task(&dot_kernel_cl,
+    starpu_task_insert(&dot_kernel_cl,
         STARPU_REDUX, dtq_handle,
         STARPU_R, starpu_data_get_sub_data(v1, 1, b),
         STARPU_R, starpu_data_get_sub_data(v2, 1, b),
@@ -659,9 +659,9 @@ the initial status <c>register(NULL)</c>.
 The example <c>cg</c> also uses reduction for the blocked gemv kernel,
 leading to yet more relaxed dependencies and more parallelism.
 
-::STARPU_REDUX can also be passed to starpu_mpi_insert_task() in the MPI
+::STARPU_REDUX can also be passed to starpu_mpi_task_insert() in the MPI
 case. That will however not produce any MPI communication, but just pass
-::STARPU_REDUX to the underlying starpu_insert_task(). It is up to the
+::STARPU_REDUX to the underlying starpu_task_insert(). It is up to the
 application to call starpu_mpi_redux_data(), which posts tasks that will
 reduce the partial results among MPI nodes into the MPI node which owns the
 data. For instance, some hypothetical application which collects partial results
@@ -670,11 +670,11 @@ with a new reduction:
 
 \code{.c}
 for (i = 0; i < 100; i++) {
-    starpu_mpi_insert_task(MPI_COMM_WORLD, &init_res, STARPU_W, res, 0);
-    starpu_mpi_insert_task(MPI_COMM_WORLD, &work, STARPU_RW, A,
+    starpu_mpi_task_insert(MPI_COMM_WORLD, &init_res, STARPU_W, res, 0);
+    starpu_mpi_task_insert(MPI_COMM_WORLD, &work, STARPU_RW, A,
                STARPU_R, B, STARPU_REDUX, res, 0);
     starpu_mpi_redux_data(MPI_COMM_WORLD, res);
-    starpu_mpi_insert_task(MPI_COMM_WORLD, &work2, STARPU_RW, B, STARPU_R, res, 0);
+    starpu_mpi_task_insert(MPI_COMM_WORLD, &work2, STARPU_RW, B, STARPU_R, res, 0);
 }
 \endcode
 
@@ -705,9 +705,9 @@ unregistration.
 
 \code{.c}
 starpu_vector_data_register(&handle, -1, 0, n, sizeof(float));
-starpu_insert_task(&produce_data, STARPU_W, handle, 0);
-starpu_insert_task(&compute_data, STARPU_RW, handle, 0);
-starpu_insert_task(&summarize_data, STARPU_R, handle, STARPU_W, result_handle, 0);
+starpu_task_insert(&produce_data, STARPU_W, handle, 0);
+starpu_task_insert(&compute_data, STARPU_RW, handle, 0);
+starpu_task_insert(&summarize_data, STARPU_R, handle, STARPU_W, result_handle, 0);
 starpu_data_unregister_submit(handle);
 \endcode
 
@@ -725,7 +725,7 @@ provides per-worker buffers without content consistency.
 \code{.c}
 starpu_vector_data_register(&workspace, -1, 0, sizeof(float));
 for (i = 0; i < N; i++)
-    starpu_insert_task(&compute, STARPU_R, input[i],
+    starpu_task_insert(&compute, STARPU_R, input[i],
                        STARPU_SCRATCH, workspace, STARPU_W, output[i], 0);
 \endcode
 
@@ -1028,7 +1028,7 @@ starpu_vector_data_register(&handle, starpu_worker_get_memory_node(workerid),
                             output, num_bytes / sizeof(float4), sizeof(float4));
 
 /* The handle can now be used as usual */
-starpu_insert_task(&cl, STARPU_RW, handle, 0);
+starpu_task_insert(&cl, STARPU_RW, handle, 0);
 
 /* ... */
 
@@ -1122,7 +1122,7 @@ Complex data interfaces can then be registered to StarPU.
 \code{.c}
 double real = 45.0;
 double imaginary = 12.0;starpu_complex_data_register(&handle1, STARPU_MAIN_RAM, &real, &imaginary, 1);
-starpu_insert_task(&cl_display, STARPU_R, handle1, 0);
+starpu_task_insert(&cl_display, STARPU_R, handle1, 0);
 \endcode
 
 and used by codelets.
@@ -1186,7 +1186,7 @@ for(i=0 ; i<dummy_big_cl.nbuffers ; i++)
 {
 	handles[i] = handle;
 }
-starpu_insert_task(&dummy_big_cl,
+starpu_task_insert(&dummy_big_cl,
         	 STARPU_VALUE, &dummy_big_cl.nbuffers, sizeof(dummy_big_cl.nbuffers),
 		 STARPU_DATA_ARRAY, handles, dummy_big_cl.nbuffers,
 		 0);

+ 54 - 7
doc/doxygen/chapters/api/codelet_and_tasks.doxy

@@ -56,7 +56,6 @@ The task is waiting for a task.
 \ingroup API_Codelet_And_Tasks
 The task is waiting for some data.
 
-
 \def STARPU_CPU
 \ingroup API_Codelet_And_Tasks
 This macro is used when setting the field starpu_codelet::where
@@ -110,26 +109,40 @@ this macro indicates the codelet will have several implementations.
 The use of this macro is deprecated. One should always only define the
 field starpu_codelet::opencl_funcs.
 
-\def starpu_cpu_func_t
+\def STARPU_NMAXBUFS
+\ingroup API_Codelet_And_Tasks
+Defines the maximum number of buffers that tasks will be able to take
+as parameters. The default value is 8, it can be changed by using the
+configure option \ref enable-maxbuffers "--enable-maxbuffers".
+
+\typedef starpu_cpu_func_t
 \ingroup API_Codelet_And_Tasks
 CPU implementation of a codelet.
 
-\def starpu_cuda_func_t
+\typedef starpu_cuda_func_t
 \ingroup API_Codelet_And_Tasks
 CUDA implementation of a codelet.
 
-\def starpu_opencl_func_t
+\typedef starpu_opencl_func_t
 \ingroup API_Codelet_And_Tasks
 OpenCL implementation of a codelet.
 
-\def starpu_mic_func_t
+\typedef starpu_mic_func_t
 \ingroup API_Codelet_And_Tasks
 MIC implementation of a codelet.
 
-\def starpu_scc_func_t
+\typedef starpu_scc_func_t
 \ingroup API_Codelet_And_Tasks
 SCC implementation of a codelet.
 
+\typedef starpu_mic_kernel_t
+\ingroup API_Codelet_And_Tasks
+MIC kernel for a codelet
+
+\typedef starpu_scc_kernel_t
+\ingroup API_Codelet_And_Tasks
+SCC kernel for a codelet
+
 \struct starpu_codelet
 The codelet structure describes a kernel that is possibly
 implemented on various targets. For compatibility, make sure to
@@ -137,7 +150,7 @@ initialize the whole structure to zero, either by using explicit
 memset, or the function starpu_codelet_init(), or by letting the
 compiler implicitly do it in e.g. static storage case.
 \ingroup API_Codelet_And_Tasks
-\var starpu_codelet::where.
+\var starpu_codelet::where
 Optional field to indicate which types of processing units are able to
 execute the codelet. The different values ::STARPU_CPU, ::STARPU_CUDA,
 ::STARPU_OPENCL can be combined to specify on which types of processing
@@ -393,6 +406,33 @@ Optional field, the default value is <c>NULL</c>. This is the pointer
 passed to the callback function. This field is ignored if the field
 starpu_task::callback_func is set to <c>NULL</c>.
 
+\var starpu_task::callback_arg_free
+Optional field. In case starpu_task::callback_arg was allocated by the
+application through <c>malloc()</c>, setting starpu_task::callback_arg_free
+to 1 makes StarPU automatically call <c>free(callback_arg)</c> when
+destroying the task.
+
+\var starpu_task::prologue_callback_func
+Optional field, the default value is <c>NULL</c>. This is a function
+pointer of prototype <c>void (*f)(void *)</c> which specifies a
+possible callback. 
+If this pointer is non-null, the callback function
+is executed on the host when the task becomes ready for execution,
+before getting scheduled. The callback is passed the
+value contained in the starpu_task::prologue_callback_arg field. No callback is
+executed if the field is set to NULL.
+
+\var starpu_task::prologue_callback_arg
+Optional field, the default value is <c>NULL</c>. This is the pointer
+passed to the prologue callback function. This field is ignored if the field
+starpu_task::prologue_callback_func is set to <c>NULL</c>.
+
+\var starpu_task::prologue_callback_arg_free
+Optional field. In case starpu_task::prologue_callback_arg was allocated by the
+application through <c>malloc()</c>, setting starpu_task::prologue_callback_arg_free
+to 1 makes StarPU automatically call <c>free(prologue_callback_arg)</c> when
+destroying the task.
+
 \var starpu_task::use_tag
 Optional field, the default value is 0. If set, this flag indicates
 that the task should be associated with the tag contained in the
@@ -636,6 +676,13 @@ starpu_task_submit() can be called from anywhere, including codelet
 functions and callbacks, provided that the field
 starpu_task::synchronous is set to 0.
 
+\fn int starpu_task_submit_to_ctx(struct starpu_task *task, unsigned sched_ctx_id)
+\ingroup API_Codelet_And_Tasks
+This function submits a task to StarPU in the context <c>sched_ctx_id</c>.
+By default starpu_task_submit() submits the task to a global context that is
+created automatically by StarPU.
+
+
 \fn int starpu_task_wait_for_all(void)
 \ingroup API_Codelet_And_Tasks
 This function blocks until all the tasks that were submitted

+ 6 - 0
doc/doxygen/chapters/api/cuda_extensions.doxy

@@ -14,6 +14,12 @@ This macro is defined when StarPU has been installed with CUDA
 support. It should be used in your code to detect the availability of
 CUDA as shown in \ref FullSourceCodeVectorScal.
 
+\def STARPU_MAXCUDADEVS
+\ingroup API_CUDA_Extensions
+This macro defines the maximum number of CUDA devices that are
+supported by StarPU.
+
+
 \fn cudaStream_t starpu_cuda_get_local_stream(void)
 \ingroup API_CUDA_Extensions
 This function gets the current worker’s CUDA stream. StarPU

+ 27 - 1
doc/doxygen/chapters/api/data_interfaces.doxy

@@ -186,6 +186,32 @@ to manage asynchronicity. This must return -EAGAIN if any of the
 starpu_interface_copy() calls has returned -EAGAIN (i.e. at least some
 transfer is still ongoing), and return 0 otherwise.
 
+\enum starpu_data_interface_id
+\ingroup API_Data_Interfaces
+Identifier for all predefined StarPU data interfaces
+\var starpu_data_interface_id::STARPU_UNKNOWN_INTERFACE_ID
+Unknown interface
+\var starpu_data_interface_id::STARPU_MATRIX_INTERFACE_ID
+Identifier for the matrix data interface
+\var starpu_data_interface_id::STARPU_BLOCK_INTERFACE_ID
+Identifier for block data interface
+\var starpu_data_interface_id::STARPU_VECTOR_INTERFACE_ID
+Identifier for the vector data interface
+\var starpu_data_interface_id::STARPU_CSR_INTERFACE_ID
+Identifier for the csr data interface
+\var starpu_data_interface_id::STARPU_BCSR_INTERFACE_ID
+Identifier for the bcsr data interface
+\var starpu_data_interface_id::STARPU_VARIABLE_INTERFACE_ID
+Identifier for the variable data interface
+\var starpu_data_interface_id::STARPU_VOID_INTERFACE_ID
+Identifier for the void data interface
+\var starpu_data_interface_id::STARPU_MULTIFORMAT_INTERFACE_ID
+Identifier for the multiformat data interface
+\var starpu_data_interface_id::STARPU_COO_INTERFACE_ID
+Identifier for the coo data interface
+\var starpu_data_interface_id::STARPU_MAX_INTERFACE_ID
+Maximum number of data interfaces
+
 @name Registering Data
 \ingroup API_Data_Interfaces
 
@@ -726,7 +752,7 @@ addition to this.
 Return a pointer to the row pointer array of the matrix
 designated by \p interface.
 
-\def STARPU_CSR_GET_ROWPTR_DEV_HANDLE(interface)
+\def STARPU_BCSR_GET_ROWPTR_DEV_HANDLE(interface)
 \ingroup API_Data_Interfaces
 Return a device handle for the row pointer array of the matrix
 designated by \p interface. The offset documented below has to be used in

+ 1 - 1
doc/doxygen/chapters/api/data_management.doxy

@@ -251,7 +251,7 @@ This is the same as starpu_data_acquire_cb(), except that the
 data will be available on the given memory node instead of main
 memory.
 
-\int int starpu_data_acquire_on_node_cb_sequential_consistency(starpu_data_handle_t handle, unsigned node, enum starpu_data_access_mode mode, void (*callback)(void *), void *arg, int sequential_consistency)
+\fn int starpu_data_acquire_on_node_cb_sequential_consistency(starpu_data_handle_t handle, unsigned node, enum starpu_data_access_mode mode, void (*callback)(void *), void *arg, int sequential_consistency)
 \ingroup API_Data_Management
 This is the same as starpu_data_acquire_cb_sequential_consistency(), except that the
 data will be available on the given memory node instead of main

+ 31 - 6
doc/doxygen/chapters/api/data_out_of_core.doxy

@@ -7,25 +7,43 @@
 
 /*! \defgroup API_Out_Of_Core Out Of Core
 
-
-
 \struct starpu_disk_ops
 \ingroup API_Out_Of_Core
 This is a set of functions to manipulate datas on disk.
+\var starpu_disk_ops::alloc
+Create a new location for data
+\var starpu_disk_ops::free
+Free allocated data
+\var starpu_disk_ops::open
+Open an existing location of data
+\var starpu_disk_ops::close
+Close a location of data without deleting it
+\var starpu_disk_ops::read
+Read data
+\var starpu_disk_ops::write
+Write data
+\var starpu_disk_ops::plug
+Connect a disk memory
+\var starpu_disk_ops::unplug
+Disconnect a disk memory
+\var starpu_disk_ops::copy
+Copy disk to disk
+\var starpu_disk_ops::bandwidth
+Measure the bandwidth and the latency for the disk
 
-\fn int starpu_disk_register(struct starpu_disk_ops *func, void *parameter, size_t size) 
+\fn int starpu_disk_register(struct starpu_disk_ops *func, void *parameter, size_t size)
 \ingroup API_Out_Of_Core
 Register a disk memory node with a set of functions to manipulate datas. <br />
 SUCCESS: return the disk node. <br />
 FAIL: return an error code. <br />
 The \p size must be at least 1 MB !
 
-\fn void * starpu_disk_open (unsigned node, void *pos, size_t size)
+\fn void *starpu_disk_open(unsigned node, void *pos, size_t size)
 \ingroup API_Out_Of_Core
 Add an existing file memory in a disk node. The \p pos is defined in the starpu_disk_ops. \p size: this is a size of your file.
 \p pos is the name of the file.
 
-\fn void starpu_disk_close (unsigned node, void *obj, size_t size)
+\fn void starpu_disk_close(unsigned node, void *obj, size_t size)
 \ingroup API_Out_Of_Core
 Close an existing file memory opened with starpu_disk_open.
 
@@ -33,6 +51,7 @@ Close an existing file memory opened with starpu_disk_open.
 \ingroup API_Out_Of_Core
 This set uses the stdio library (fwrite, fread...) to read/write on disk. <br />
 <strong>Warning: It creates one file per allocation !</strong>  <br />
+It doesn't support asynchronous transfers.
 
 \var starpu_disk_unistd_ops
 \ingroup API_Out_Of_Core
@@ -43,6 +62,12 @@ This set uses the unistd library (write, read...) to read/write on disk. <br />
 \ingroup API_Out_Of_Core
 This set uses the unistd library (write, read...) to read/write on disk with the O_DIRECT flag. <br />
 <strong>Warning: It creates one file per allocation !</strong>  <br />
-Only available on Linux.
+Only available on Linux systems.
+
+\var starpu_disk_leveldb_ops
+\ingroup API_Out_Of_Core
+This set uses the leveldb library created by Google. <br />
+See: https://code.google.com/p/leveldb/ <br />
+It doesn't support asynchronous transfers.
 
 */

+ 22 - 11
doc/doxygen/chapters/api/insert_task.doxy

@@ -8,12 +8,16 @@
 
 /*! \defgroup API_Insert_Task Insert_Task
 
-\fn int starpu_insert_task(struct starpu_codelet *cl, ...)
+\def starpu_insert_task
+\ingroup API_Insert_Task
+Convenience macro for the function starpu_task_insert() which used to be called starpu_insert_task.
+
+\fn int starpu_task_insert(struct starpu_codelet *cl, ...)
 \ingroup API_Insert_Task
 Create and submit a task corresponding to \p cl with the
 following arguments. The argument list must be zero-terminated.
 
-The arguments following the codelets can be of the following types:
+The arguments following the codelet can be of the following types:
 <ul>
 <li> ::STARPU_R, ::STARPU_W, ::STARPU_RW, ::STARPU_SCRATCH,
 ::STARPU_REDUX an access mode followed by a data handle;
@@ -35,18 +39,18 @@ implementation to retrieve them.
 
 \def STARPU_VALUE
 \ingroup API_Insert_Task
-this macro is used when calling starpu_insert_task(), and must
+this macro is used when calling starpu_task_insert(), and must
 be followed by a pointer to a constant value and the size of the
 constant
 
 \def STARPU_CALLBACK
 \ingroup API_Insert_Task
-this macro is used when calling starpu_insert_task(), and must
+this macro is used when calling starpu_task_insert(), and must
 be followed by a pointer to a callback function
 
 \def STARPU_CALLBACK_WITH_ARG
 \ingroup API_Insert_Task
-this macro is used when calling starpu_insert_task(), and must
+this macro is used when calling starpu_task_insert(), and must
 be followed by two pointers: one to a callback function, and the other
 to be given as an argument to the callback function; this is
 equivalent to using both ::STARPU_CALLBACK and
@@ -54,13 +58,13 @@ equivalent to using both ::STARPU_CALLBACK and
 
 \def STARPU_CALLBACK_ARG
 \ingroup API_Insert_Task
-this macro is used when calling starpu_insert_task(), and must
+this macro is used when calling starpu_task_insert(), and must
 be followed by a pointer to be given as an argument to the callback
 function
 
 \def STARPU_PRIORITY
 \ingroup API_Insert_Task
-this macro is used when calling starpu_insert_task(), and must
+this macro is used when calling starpu_task_insert(), and must
 be followed by a integer defining a priority level
 
 \def STARPU_DATA_ARRAY
@@ -69,18 +73,18 @@ TODO
 
 \def STARPU_TAG
 \ingroup API_Insert_Task
-this macro is used when calling starpu_insert_task(), and must be followed by a tag.
+this macro is used when calling starpu_task_insert(), and must be followed by a tag.
 
 \def STARPU_FLOPS
 \ingroup API_Insert_Task
-this macro is used when calling starpu_insert_task(), and must
+this macro is used when calling starpu_task_insert(), and must
 be followed by an amount of floating point operations, as a double.
 Users <b>MUST</b> explicitly cast into double, otherwise parameter
 passing will not work.
 
 \def STARPU_SCHED_CTX
 \ingroup API_Insert_Task
-this macro is used when calling starpu_insert_task(), and must
+this macro is used when calling starpu_task_insert(), and must
 be followed by the id of the scheduling context to which we want to
 submit the task.
 
@@ -93,6 +97,13 @@ starpu_codelet_unpack_args().
 \fn void starpu_codelet_unpack_args(void *cl_arg, ...)
 \ingroup API_Insert_Task
 Retrieve the arguments of type ::STARPU_VALUE associated to a
-task automatically created using the function starpu_insert_task().
+task automatically created using the function starpu_task_insert().
+
+\fn struct starpu_task *starpu_task_build(struct starpu_codelet *cl, ...)
+\ingroup API_Insert_Task
+Create a task corresponding to \p cl with the following arguments.
+The argument list must be zero-terminated. The arguments
+following the codelet are the same as the ones for the function
+starpu_task_insert().
 
 */

+ 5 - 0
doc/doxygen/chapters/api/mic_extensions.doxy

@@ -13,6 +13,11 @@
 This macro is defined when StarPU has been installed with MIC support.
 It should be used in your code to detect the availability of MIC.
 
+\def STARPU_MAXMICDEVS
+\ingroup API_MIC_Extensions
+This macro defines the maximum number of MIC devices that are
+supported by StarPU.
+
 \typedef starpu_mic_func_symbol_t
 \ingroup API_MIC_Extensions
 Type for MIC function symbols

+ 14 - 4
doc/doxygen/chapters/api/mpi.doxy

@@ -11,6 +11,12 @@
 @name Initialisation
 \ingroup API_MPI_Support
 
+\def STARPU_USE_MPI
+\ingroup API_MPI_Support
+This macro is defined when StarPU has been installed with MPI
+support. It should be used in your code to detect the availability of
+MPI.
+
 \fn int starpu_mpi_init(int *argc, char ***argv, int initialize_mpi)
 \ingroup API_MPI_Support
 Initializes the starpumpi library. \p initialize_mpi indicates if MPI
@@ -198,23 +204,27 @@ Returns the last value set by starpu_data_set_rank().
 
 \def STARPU_EXECUTE_ON_NODE
 \ingroup API_MPI_Support
-this macro is used when calling starpu_mpi_insert_task(), and must be
+this macro is used when calling starpu_mpi_task_insert(), and must be
 followed by a integer value which specified the node on which to
 execute the codelet.
 
 \def STARPU_EXECUTE_ON_DATA
 \ingroup API_MPI_Support
-this macro is used when calling starpu_mpi_insert_task(), and must be
+this macro is used when calling starpu_mpi_task_insert(), and must be
 followed by a data handle to specify that the node owning the given
 data will execute the codelet.
 
-\fn int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
+\def starpu_mpi_insert_task
+\ingroup API_MPI_Support
+Convenience macro for the function starpu_mpi_task_insert() which used to be called starpu_mpi_insert_task.
+
+\fn int starpu_mpi_task_insert(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 \ingroup API_MPI_Support
 Create and submit a task corresponding to codelet with the following
 arguments. The argument list must be zero-terminated.
 
 The arguments following the codelets are the same types as for the
-function starpu_insert_task(). The extra argument
+function starpu_task_insert(). The extra argument
 ::STARPU_EXECUTE_ON_NODE followed by an integer allows to specify the
 MPI node to execute the codelet. It is also possible to specify that
 the node owning a specific data will execute the codelet, by using

+ 4 - 0
doc/doxygen/chapters/api/multiformat_data_interface.doxy

@@ -64,6 +64,10 @@ returns the local pointer to the data with CUDA format.
 \ingroup API_Multiformat_Data_Interface
 returns the local pointer to the data with OpenCL format.
 
+\def STARPU_MULTIFORMAT_GET_MIC_PTR(interface)
+\ingroup API_Multiformat_Data_Interface
+returns the local pointer to the data with MIC format.
+
 \def STARPU_MULTIFORMAT_GET_NX(interface)
 \ingroup API_Multiformat_Data_Interface
 returns the number of elements in the data.

+ 10 - 0
doc/doxygen/chapters/api/opencl_extensions.doxy

@@ -14,6 +14,16 @@ This macro is defined when StarPU has been installed with
 OpenCL support. It should be used in your code to detect the
 availability of OpenCL as shown in \ref FullSourceCodeVectorScal.
 
+\def STARPU_MAXOPENCLDEVS
+\ingroup API_OpenCL_Extensions
+This macro defines the maximum number of OpenCL devices that are
+supported by StarPU.
+
+\def STARPU_OPENCL_DATADIR
+\ingroup API_OpenCL_Extensions
+This macro defines the directory in which the OpenCL codelets of the
+applications provided with StarPU have been installed.
+
 \struct starpu_opencl_program
 \ingroup API_OpenCL_Extensions
 Stores the OpenCL programs as compiled for the different OpenCL

+ 15 - 15
doc/doxygen/chapters/api/performance_model.doxy

@@ -219,15 +219,15 @@ external tools that should read the performance model files.
 unloads the given model which has been previously loaded
 through the function starpu_perfmodel_load_symbol()
 
-\fn void starpu_perfmodel_debugfilepath(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, char *path, size_t maxlen, unsigned nimpl)
+\fn void starpu_perfmodel_debugfilepath(struct starpu_perfmodel *model, struct starpu_perfmodel_arch *arch, char *path, size_t maxlen, unsigned nimpl)
 \ingroup API_Performance_Model
 returns the path to the debugging information for the performance model.
 
-\fn void starpu_perfmodel_get_arch_name(enum starpu_perfmodel_archtype arch, char *archname, size_t maxlen, unsigned nimpl)
+\fn void starpu_perfmodel_get_arch_name(struct starpu_perfmodel_arch *arch, char *archname, size_t maxlen, unsigned nimpl)
 \ingroup API_Performance_Model
 returns the architecture name for \p arch
 
-\fn enum starpu_perfmodel_archtype starpu_worker_get_perf_archtype(int workerid)
+\fn struct starpu_perfmodel_arch *starpu_worker_get_perf_archtype(int workerid)
 \ingroup API_Performance_Model
 returns the architecture type of a given worker.
 
@@ -235,7 +235,11 @@ returns the architecture type of a given worker.
 \ingroup API_Performance_Model
 prints a list of all performance models on \p output
 
-\fn void starpu_perfmodel_print(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, unsigned nimpl, char *parameter, uint32_t *footprint, FILE *output)
+\fn void starpu_perfmodel_directory(FILE *output)
+\ingroup API_Performance_Model
+prints the directory name storing performance models on \p output
+
+\fn void starpu_perfmodel_print(struct starpu_perfmodel *model, struct starpu_perfmodel_arch *arch, unsigned nimpl, char *parameter, uint32_t *footprint, FILE *output)
 \ingroup API_Performance_Model
 todo
 
@@ -251,7 +255,7 @@ prints a matrix of bus bandwidths on \p f.
 \ingroup API_Performance_Model
 prints the affinity devices on \p f.
 
-\fn void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned cpuid, unsigned nimpl, double measured);
+\fn void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned cpuid, unsigned nimpl, double measured);
 \ingroup API_Performance_Model
 This feeds the performance model model with an explicit
 measurement measured (in µs), in addition to measurements done by StarPU
@@ -260,20 +264,16 @@ existing set of measurements done in good conditions, that StarPU
 could benefit from instead of doing on-line measurements. And example
 of use can be seen in \ref PerformanceModelExample.
 
-\fn double starpu_get_bandwidth_RAM_CUDA(unsigned cudadev)
+\fn double starpu_transfer_bandwidth(unsigned src_node, unsigned dst_node)
 \ingroup API_Performance_Model
-Used to compute the execution time of tasks
+Return the bandwidth of data transfer between two memory nodes
 
-\fn double starpu_get_latency_RAM_CUDA(unsigned cudadev)
+\fn double starpu_transfer_latency(unsigned src_node, unsigned dst_node)
 \ingroup API_Performance_Model
-Used to compute the execution time of tasks
-
-\fn double starpu_get_bandwidth_CUDA_RAM(unsigned cudadev)
-\ingroup API_Performance_Mode
-Used to compute the execution time of tasks
+Return the latency of data transfer between two memory nodes
 
-\fn double starpu_get_latency_CUDA_RAM(unsigned cudadev)
+\fn double starpu_transfer_predict(unsigned src_node, unsigned dst_node, size_t size)
 \ingroup API_Performance_Model
-Used to compute the execution time of tasks
+Return the estimated time to transfer a given size between two memory nodes.
 
 */

+ 15 - 0
doc/doxygen/chapters/api/profiling.doxy

@@ -98,6 +98,16 @@ todo
 \var starpu_profiling_bus_info::transfer_count
         Number of transfers during profiling.
 
+\typedef STARPU_PROFILING_DISABLE
+\ingroup API_Profiling
+This value is used when calling the function
+starpu_profiling_status_set() to disable profiling.
+
+\typedef STARPU_PROFILING_ENABLE
+\ingroup API_Profiling
+This value is used when calling the function
+starpu_profiling_status_set() to enable profiling.
+
 \fn int starpu_profiling_status_set(int status)
 \ingroup API_Profiling
 This function sets the profiling status. Profiling is activated
@@ -114,6 +124,11 @@ previous status is returned.
 Return the current profiling status or a negative value in case
 there was an error.
 
+\fn void starpu_profiling_init(void)
+\ingroup API_Profiling
+This function resets performance counters and enables profiling if the
+environment variable \ref STARPU_PROFILING is set to a positive value.
+
 \fn void starpu_profiling_set_id(int new_id)
 \ingroup API_Profiling
 This function sets the ID used for profiling trace filename. It

+ 198 - 0
doc/doxygen/chapters/api/sc_hypervisor/sc_hypervisor.doxy

@@ -0,0 +1,198 @@
+/*
+ * This file is part of the StarPU Handbook.
+ * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012, 2013 Institut National de Recherche en Informatique et Automatique
+ * See the file version.doxy for copying conditions.
+ */
+
+/*! \defgroup API_SC_Hypervisor Scheduling Context Hypervisor - Building a new resizing policy
+
+\struct sc_hypervisor_policy
+\ingroup API_SC_Hypervisor
+This structure contains all the methods that implement a hypervisor resizing policy.
+\var sc_hypervisor_policy::name
+        Indicates the name of the policy, if there is not a custom policy, the policy corresponding to this name will be used by the hypervisor
+\var sc_hypervisor_policy::custom
+        Indicates whether the policy is custom or not
+\var sc_hypervisor_policy::size_ctxs
+	Distribute workers to contexts even at the beginning of the program
+\var sc_hypervisor_policy::resize_ctxs
+	Require explicit resizing
+\var sc_hypervisor_policy::handle_idle_cycle
+        It is called whenever the indicated worker executes another idle cycle in sched_ctx
+\var sc_hypervisor_policy::handle_pushed_task
+        It is called whenever a task is pushed on the worker’s queue corresponding to the context sched_ctx
+\var sc_hypervisor_policy::handle_poped_task
+        It is called whenever a task is popped from the worker’s queue corresponding to the context sched_ctx; the hypervisor takes a resizing decision when another task was popped from this worker in this ctx
+\var sc_hypervisor_policy::handle_idle_end
+        It is called whenever a task is executed on the indicated worker and context after a long period of idle time
+\var sc_hypervisor_policy::handle_post_exec_hook
+        It is called whenever a tag task has just been executed. The table of resize requests is provided as well as the tag
+\var sc_hypervisor_policy::handle_submitted_job
+	The hypervisor takes a decision when a job was submitted in this ctx
+\var sc_hypervisor_policy::end_ctx
+	The hypervisor takes a decision when a certain ctx was deleted
+
+\struct sc_hypervisor_policy_config
+\ingroup API_SC_Hypervisor
+This structure contains all configuration information of a
+context. It contains configuration information for each context, which
+can be used to construct new resize strategies.
+\var sc_hypervisor_policy_config::min_nworkers
+        Indicates the minimum number of workers needed by the context
+\var sc_hypervisor_policy_config::max_nworkers
+        Indicates the maximum number of workers needed by the context
+\var sc_hypervisor_policy_config::granularity
+        Indicates the workers granularity of the context
+\var sc_hypervisor_policy_config::priority
+        Indicates the priority of each worker in the context
+\var sc_hypervisor_policy_config::max_idle
+        Indicates the maximum idle time accepted before a resize is triggered
+\var sc_hypervisor_policy_config::min_working
+	Indicates that underneath this limit the priority of the worker is reduced
+\var sc_hypervisor_policy_config::fixed_workers
+        Indicates which workers can be moved and which ones are fixed
+\var sc_hypervisor_policy_config::new_workers_max_idle
+        Indicates the maximum idle time accepted before a resize is triggered for the workers that just arrived in the new context
+\var sc_hypervisor_policy_config::ispeed_w_sample
+         Indicates the sample used to compute the instant speed per worker
+\var sc_hypervisor_policy_config::ispeed_ctx_sample
+        Indicates the sample used to compute the instant speed per ctxs
+\var sc_hypervisor_policy_config::time_sample
+        todo
+
+\struct sc_hypervisor_wrapper
+\ingroup API_SC_Hypervisor
+This structure is a wrapper of the contexts available in StarPU
+and contains all information about a context obtained by incrementing
+the performance counters.
+\var sc_hypervisor_wrapper::sched_ctx
+        The context wrapped
+\var sc_hypervisor_wrapper::config
+        The corresponding resize configuration
+\var sc_hypervisor_wrapper::current_idle_time
+        The idle time counter of each worker of the context
+\var sc_hypervisor_wrapper::idle_time
+	The time the workers were idle from the last resize
+\var sc_hypervisor_wrapper::idle_start_time
+	The moment when the workers started being idle
+\var sc_hypervisor_wrapper::worker_to_be_removed
+	The list of workers that will leave this context (lazy resizing process)
+\var sc_hypervisor_wrapper::pushed_tasks
+        The number of pushed tasks of each worker of the context
+\var sc_hypervisor_wrapper::poped_tasks
+        The number of poped tasks of each worker of the context
+\var sc_hypervisor_wrapper::total_flops
+        The total number of flops to execute by the context
+\var sc_hypervisor_wrapper::total_elapsed_flops
+        The number of flops executed by each workers of the context
+\var sc_hypervisor_wrapper::elapsed_flops
+        The number of flops executed by each worker of the context from last resize
+\var sc_hypervisor_wrapper::elapsed_data
+	The quantity of data (in bytes) used to execute tasks on each worker in this ctx
+\var sc_hypervisor_wrapper::elapsed_tasks
+	The nr of tasks executed on each worker in this ctx
+\var sc_hypervisor_wrapper::ref_speed
+	The average speed of the workers (type of workers) when they belonged to this context
+	0 - cuda 1 - cpu
+\var sc_hypervisor_wrapper::submitted_flops
+	The number of flops submitted to this ctx
+\var sc_hypervisor_wrapper::remaining_flops
+        The number of flops that still have to be executed by the workers in the context
+\var sc_hypervisor_wrapper::ready_flops
+	The number of flops corresponding to the ready tasks in this context
+\var sc_hypervisor_wrapper::start_time
+        The time when the context started executing
+\var sc_hypervisor_wrapper::real_start_time
+	The first time a task was pushed to this context
+\var sc_hypervisor_wrapper::resize_ack
+        The structure confirming the last resize finished and a new one can be done
+\var sc_hypervisor_wrapper::mutex
+	The mutex needed to synchronize the acknowledgment of the workers into 
+	the receiver context
+\var sc_hypervisor_wrapper::total_flops_available
+	A boolean indicating if the hypervisor can use the flops corresponding to 
+	the entire execution of the context
+\var sc_hypervisor_wrapper::nready_tasks
+	The number of ready tasks in a context
+
+\struct sc_hypervisor_resize_ack
+\ingroup API_SC_Hypervisor
+This structure checks if the workers moved to another context
+are actually taken into account in that context.
+\var sc_hypervisor_resize_ack::receiver_sched_ctx
+        The context receiving the new workers
+\var sc_hypervisor_resize_ack::moved_workers
+        The workers moved to the receiver context
+\var sc_hypervisor_resize_ack::nmoved_workers
+        The number of workers moved
+\var sc_hypervisor_resize_ack::acked_workers
+        If the value corresponding to a worker is 1, this one is taken
+	into account in the new context if 0 not yet
+
+\struct sc_hypervisor_policy_task_pool
+task wrapper linked list
+\ingroup API_SC_Hypervisor
+\var sc_hypervisor_policy_task_pool::cl
+Which codelet has been executed
+\var sc_hypervisor_policy_task_pool::footprint
+Task footprint key
+\var sc_hypervisor_policy_task_pool::sched_ctx_id
+Context the task belongs to
+\var sc_hypervisor_policy_task_pool::n
+Number of tasks of this kind
+\var sc_hypervisor_policy_task_pool::data_size
+The quantity of data(in bytes) needed by the task to execute
+\var sc_hypervisor_policy_task_pool::next
+Other task kinds
+
+\fn void sc_hypervisor_post_resize_request(unsigned sched_ctx, int task_tag)
+\ingroup API_SC_Hypervisor
+Requests resizing the context \p sched_ctx whenever a task tagged with the id \p task_tag
+finishes executing 
+
+\fn unsigned sc_hypervisor_get_size_req(unsigned **sched_ctxs, int* nsched_ctxs, int **workers, int *nworkers)
+\ingroup API_SC_Hypervisor
+Check if there are pending demands of resizing
+
+\fn void sc_hypervisor_save_size_req(unsigned *sched_ctxs, int nsched_ctxs, int *workers, int nworkers)
+\ingroup API_SC_Hypervisor
+Save a demand of resizing
+
+\fn void sc_hypervisor_free_size_req(void)
+\ingroup API_SC_Hypervisor
+Clear the list of pending demands of resizing
+
+\fn unsigned sc_hypervisor_can_resize(unsigned sched_ctx)
+\ingroup API_SC_Hypervisor
+Check whether a context can be resized
+
+\fn struct sc_hypervisor_policy_config *sc_hypervisor_get_config(unsigned sched_ctx)
+\ingroup API_SC_Hypervisor
+Returns the configuration structure of a context
+
+\fn void sc_hypervisor_set_config(unsigned sched_ctx, void *config)
+\ingroup API_SC_Hypervisor
+Set a certain configuration to a context
+
+\fn unsigned *sc_hypervisor_get_sched_ctxs()
+\ingroup API_SC_Hypervisor
+    Gets the contexts managed by the hypervisor
+
+\fn int sc_hypervisor_get_nsched_ctxs()
+\ingroup API_SC_Hypervisor
+    Gets the number of contexts managed by the hypervisor
+
+\fn struct sc_hypervisor_wrapper *sc_hypervisor_get_wrapper(unsigned sched_ctx)
+\ingroup API_SC_Hypervisor
+    Returns the wrapper corresponding to the context \p sched_ctx
+
+\fn double sc_hypervisor_get_elapsed_flops_per_sched_ctx(struct sc_hypervisor_wrapper *sc_w)
+\ingroup API_SC_Hypervisor
+    Returns the flops of a context elapsed from the last resize
+
+
+*/

+ 191 - 0
doc/doxygen/chapters/api/sc_hypervisor/sc_hypervisor_usage.doxy

@@ -0,0 +1,191 @@
+/*
+ * This file is part of the StarPU Handbook.
+ * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012, 2013 Institut National de Recherche en Informatique et Automatique
+ * See the file version.doxy for copying conditions.
+ */
+
+/*! \defgroup API_SC_Hypervisor_usage Scheduling Context Hypervisor - Regular usage
+
+\fn void *sc_hypervisor_init(struct sc_hypervisor_policy *policy)
+\ingroup API_SC_Hypervisor_usage
+There is a single hypervisor that is in charge of resizing contexts
+and the resizing strategy is chosen at the initialization of the
+hypervisor. A single resize can be done at a time.
+
+The Scheduling Context Hypervisor Plugin provides a series of
+performance counters to StarPU. By incrementing them, StarPU can help
+the hypervisor in the resizing decision making process.
+
+This function initializes the hypervisor to use the strategy provided as parameter
+and creates the performance counters (see starpu_sched_ctx_performance_counters).
+These performance counters represent actually some callbacks that will
+be used by the contexts to notify the information needed by the
+hypervisor.
+
+Note: The Hypervisor is actually a worker that takes this role once
+certain conditions trigger the resizing process (there is no
+additional thread assigned to the hypervisor).
+
+\fn void sc_hypervisor_shutdown(void)
+\ingroup API_SC_Hypervisor_usage
+The hypervisor and all information concerning it is cleaned. There is
+no synchronization between this function and starpu_shutdown(). Thus,
+this should be called after starpu_shutdown(), because the performance
+counters will still need allocated callback functions.
+
+\fn void sc_hypervisor_register_ctx(unsigned sched_ctx, double total_flops)
+\ingroup API_SC_Hypervisor_usage
+Scheduling Contexts that have to be resized by the hypervisor must be
+first registered to the hypervisor. 
+This function registers the context to the hypervisor, and indicates the number of
+flops the context will execute (used for Gflops rate based strategy
+or any other custom strategy needing it, for the others we can pass 0.0)
+
+\fn void sc_hypervisor_unregister_ctx(unsigned sched_ctx)
+\ingroup API_SC_Hypervisor_usage
+Whenever we want to exclude
+contexts from the resizing process we have to unregister them from the
+hypervisor.
+
+\fn void sc_hypervisor_resize_ctxs(int *sched_ctxs, int nsched_ctxs , int *workers, int nworkers)
+\ingroup API_SC_Hypervisor_usage
+Requests reconsidering the distribution of resources over the indicated scheduling contexts 
+
+\fn void sc_hypervisor_stop_resize(unsigned sched_ctx)
+\ingroup API_SC_Hypervisor_usage
+The user can totally forbid the resizing of a certain context or can
+then change his mind and allow it (in this case the resizing is
+managed by the hypervisor, that can forbid it or allow it)
+
+\fn void sc_hypervisor_start_resize(unsigned sched_ctx)
+\ingroup API_SC_Hypervisor_usage
+Allow resizing of a context. The user can then provide information to
+the hypervisor concerning the conditions of resizing.
+
+\fn char *sc_hypervisor_get_policy()
+\ingroup API_SC_Hypervisor_usage
+Returns the name of the resizing policy the hypervisor uses
+
+\fn void sc_hypervisor_add_workers_to_sched_ctx(int* workers_to_add, unsigned nworkers_to_add, unsigned sched_ctx)
+\ingroup API_SC_Hypervisor_usage
+Ask the hypervisor to add workers to a sched_ctx 
+
+\fn void sc_hypervisor_remove_workers_from_sched_ctx(int* workers_to_remove, unsigned nworkers_to_remove, unsigned sched_ctx, unsigned now)
+\ingroup API_SC_Hypervisor_usage
+Ask the hypervisor to remove workers from a sched_ctx 
+
+\fn void sc_hypervisor_move_workers(unsigned sender_sched_ctx, unsigned receiver_sched_ctx, int *workers_to_move, unsigned nworkers_to_move, unsigned now)
+\ingroup API_SC_Hypervisor_usage
+Moves workers from one context to another
+   
+\fn void sc_hypervisor_size_ctxs(unsigned *sched_ctxs, int nsched_ctxs, int *workers, int nworkers)
+\ingroup API_SC_Hypervisor_usage
+Ask the hypervisor to choose a distribution of workers in the required contexts
+   
+
+\fn void sc_hypervisor_set_type_of_task(struct starpu_codelet *cl, unsigned sched_ctx, uint32_t footprint, size_t data_size)
+\ingroup API_SC_Hypervisor_usage
+Indicate the types of tasks a context will execute in order to better decide the sizing of ctxs
+
+\fn void sc_hypervisor_update_diff_total_flops(unsigned sched_ctx, double diff_total_flops)
+\ingroup API_SC_Hypervisor_usage
+Change dynamically the total number of flops of a context, move the deadline of the finishing time of the context
+
+\fn void sc_hypervisor_update_diff_elapsed_flops(unsigned sched_ctx, double diff_task_flops)
+\ingroup API_SC_Hypervisor_usage
+Change dynamically the number of the elapsed flops in a context, modify the past in order to better compute the speed 
+
+\fn void sc_hypervisor_ctl(unsigned sched_ctx, ...)
+\ingroup API_SC_Hypervisor_usage
+Inputs conditions to the context sched_ctx with the following
+arguments. The argument list must be zero-terminated.
+
+\def SC_HYPERVISOR_MAX_IDLE
+\ingroup API_SC_Hypervisor_usage
+This macro is used when calling sc_hypervisor_ctl() and must be
+followed by 3 arguments: an array of int for the workerids to apply
+the condition, an int to indicate the size of the array, and a double
+value indicating the maximum idle time allowed for a worker before the
+resizing process should be triggered
+
+\def SC_HYPERVISOR_PRIORITY
+\ingroup API_SC_Hypervisor_usage
+This macro is used when calling sc_hypervisor_ctl() and must be
+followed by 3 arguments: an array of int for the workerids to apply
+the condition, an int to indicate the size of the array, and an int
+value indicating the priority of the workers previously mentioned. The
+workers with the smallest priority are moved the first.
+
+\def SC_HYPERVISOR_MIN_WORKERS
+\ingroup API_SC_Hypervisor_usage
+This macro is used when calling sc_hypervisor_ctl() and must be
+followed by 1 argument(int) indicating the minimum number of workers a
+context should have, underneath this limit the context cannot execute.
+
+\def SC_HYPERVISOR_MAX_WORKERS
+\ingroup API_SC_Hypervisor_usage
+This macro is used when calling sc_hypervisor_ctl() and must be
+followed by 1 argument(int) indicating the maximum number of workers a
+context should have, above this limit the context would not be able to
+scale
+
+\def SC_HYPERVISOR_GRANULARITY
+\ingroup API_SC_Hypervisor_usage
+This macro is used when calling sc_hypervisor_ctl() and must be
+followed by 1 argument(int) indicating the granularity of the resizing
+process (the number of workers should be moved from the context once
+it is resized). This parameter is ignored for the Gflops rate based
+strategy (see \ref ResizingStrategies), the number of workers that have to
+be moved is calculated by the strategy.
+
+\def SC_HYPERVISOR_FIXED_WORKERS
+\ingroup API_SC_Hypervisor_usage
+This macro is used when calling sc_hypervisor_ctl() and must be
+followed by 2 arguments: an array of int for the workerids to apply
+the condition and an int to indicate the size of the array. These
+workers are not allowed to be moved from the context.
+
+\def SC_HYPERVISOR_MIN_TASKS
+\ingroup API_SC_Hypervisor_usage
+This macro is used when calling sc_hypervisor_ctl() and must be
+followed by 1 argument (int) that indicated the minimum number of
+tasks that have to be executed before the context could be resized.
+This parameter is ignored for the Application Driven strategy (see \ref 
+ResizingStrategies) where the user indicates exactly when the resize
+should be done.
+
+\def SC_HYPERVISOR_NEW_WORKERS_MAX_IDLE
+\ingroup API_SC_Hypervisor_usage
+This macro is used when calling sc_hypervisor_ctl() and must be
+followed by 1 argument, a double value indicating the maximum idle
+time allowed for workers that have just been moved from other contexts
+in the current context.
+
+\def SC_HYPERVISOR_TIME_TO_APPLY
+\ingroup API_SC_Hypervisor_usage
+This macro is used when calling sc_hypervisor_ctl() and must be
+followed by 1 argument (int) indicating the tag an executed task
+should have such that this configuration should be taken into account.
+
+
+\def SC_HYPERVISOR_ISPEED_W_SAMPLE
+\ingroup API_SC_Hypervisor_usage
+This macro is used when calling sc_hypervisor_ctl() and must be
+followed by 1 argument, a double, that indicates the number of flops
+needed to be executed before computing the speed of a worker
+
+\def SC_HYPERVISOR_ISPEED_CTX_SAMPLE
+\ingroup API_SC_Hypervisor_usage
+This macro is used when calling sc_hypervisor_ctl() and must be
+followed by 1 argument, a double, that indicates the number of flops
+needed to be executed before computing the speed of a context
+
+
+\def SC_HYPERVISOR_NULL
+\ingroup API_SC_Hypervisor_usage
+This macro is used when calling sc_hypervisor_ctl() and must be
+followed by 1 argument
+
+*/

+ 5 - 0
doc/doxygen/chapters/api/scc_extensions.doxy

@@ -13,6 +13,11 @@
 This macro is defined when StarPU has been installed with SCC support.
 It should be used in your code to detect the availability of SCC.
 
+\def STARPU_MAXSCCDEVS
+\ingroup API_SCC_Extensions
+This macro defines the maximum number of SCC devices that are
+supported by StarPU.
+
 \typedef starpu_scc_func_symbol_t
 \ingroup API_SCC_Extensions
 Type for SCC function symbols

+ 0 - 304
doc/doxygen/chapters/api/scheduling_context_hypervisor.doxy

@@ -1,304 +0,0 @@
-/*
- * This file is part of the StarPU Handbook.
- * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
- * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
- * See the file version.doxy for copying conditions.
- */
-
-/*! \defgroup API_Scheduling_Context_Hypervisor Scheduling Context Hypervisor
-
-\struct sc_hypervisor_policy
-\ingroup API_Scheduling_Context_Hypervisor
-This structure contains all the methods that implement a hypervisor resizing policy.
-\var sc_hypervisor_policy::name
-        Indicates the name of the policy, if there is not a custom policy, the policy corresponding to this name will be used by the hypervisor
-\var sc_hypervisor_policy::custom
-        Indicates whether the policy is custom or not
-\var sc_hypervisor_policy::handle_idle_cycle
-        It is called whenever the indicated worker executes another idle cycle in sched_ctx
-\var sc_hypervisor_policy::handle_pushed_task
-        It is called whenever a task is pushed on the worker’s queue corresponding to the context sched_ctx
-\var sc_hypervisor_policy::handle_poped_task
-        It is called whenever a task is poped from the worker’s queue corresponding to the context sched_ctx
-\var sc_hypervisor_policy::handle_idle_end
-        It is called whenever a task is executed on the indicated worker and context after a long period of idle time
-\var sc_hypervisor_policy::handle_post_exec_hook
-        It is called whenever a tag task has just been executed. The table of resize requests is provided as well as the tag
-
-\struct sc_hypervisor_policy_config
-\ingroup API_Scheduling_Context_Hypervisor
-This structure contains all configuration information of a
-context. It contains configuration information for each context, which
-can be used to construct new resize strategies.
-\var sc_hypervisor_policy_config::min_nworkers
-        Indicates the minimum number of workers needed by the context
-\var sc_hypervisor_policy_config::max_nworkers
-        Indicates the maximum number of workers needed by the context
-\var sc_hypervisor_policy_config::granularity
-        Indicates the workers granularity of the context
-\var sc_hypervisor_policy_config::priority
-        Indicates the priority of each worker in the context
-\var sc_hypervisor_policy_config::max_idle
-        Indicates the maximum idle time accepted before a resize is triggered
-\var sc_hypervisor_policy_config::fixed_workers
-        Indicates which workers can be moved and which ones are fixed
-\var sc_hypervisor_policy_config:: new_workers_max_idle
-        Indicates the maximum idle time accepted before a resize is triggered for the workers that just arrived in the new context
-
-\struct sc_hypervisor_wrapper
-\ingroup API_Scheduling_Context_Hypervisor
-This structure is a wrapper of the contexts available in StarPU
-and contains all information about a context obtained by incrementing
-the performance counters.
-\var sc_hypervisor_wrapper::sched_ctx
-        The context wrapped
-\var sc_hypervisor_wrapper::config
-        The corresponding resize configuration
-\var sc_hypervisor_wrapper::current_idle_time
-        The idle time counter of each worker of the context
-\var sc_hypervisor_wrapper::pushed_tasks
-        The number of pushed tasks of each worker of the context
-\var sc_hypervisor_wrapper::poped_tasks
-        The number of poped tasks of each worker of the context
-\var sc_hypervisor_wrapper::total_flops
-        The total number of flops to execute by the context
-\var sc_hypervisor_wrapper::total_elapsed_flops
-        The number of flops executed by each workers of the context
-\var sc_hypervisor_wrapper::elapsed_flops
-        The number of flops executed by each worker of the context from last resize
-\var sc_hypervisor_wrapper::remaining_flops
-        The number of flops that still have to be executed by the workers in the context
-\var sc_hypervisor_wrapper::start_time
-        The time when he started executed
-\var sc_hypervisor_wrapper::resize_ack
-        The structure confirming the last resize finished and a new one can be done
-
-\struct sc_hypervisor_resize_ack
-\ingroup API_Scheduling_Context_Hypervisor
-This structures checks if the workers moved to another context
-are actually taken into account in that context.
-\var sc_hypervisor_resize_ack::receiver_sched_ctx
-        The context receiving the new workers
-\var sc_hypervisor_resize_ack::moved_workers
-        The workers moved to the receiver context
-\var sc_hypervisor_resize_ack::nmoved_workers
-        The number of workers moved
-\var sc_hypervisor_resize_ack::acked_workers
-        If the value corresponding to a worker is 1, this one is taken
-	into account in the new context if 0 not yet
-
-\struct sc_hypervisor_policy_task_pool
-task wrapper linked list
-\ingroup API_Scheduling_Context_Hypervisor
-\var sc_hypervisor_policy_task_pool::cl
-Which codelet has been executed
-\var sc_hypervisor_policy_task_pool::footprint
-Task footprint key
-\var sc_hypervisor_policy_task_pool::sched_ctx_id
-Context the task belongs to
-\var sc_hypervisor_policy_task_pool::n
-Number of tasks of this kind
-\var sc_hypervisor_policy_task_pool::next
-Other task kinds
-
-@name Managing the hypervisor
-\ingroup API_Scheduling_Context_Hypervisor
-
-There is a single hypervisor that is in charge of resizing contexts
-and the resizing strategy is chosen at the initialization of the
-hypervisor. A single resize can be done at a time.
-
-The Scheduling Context Hypervisor Plugin provides a series of
-performance counters to StarPU. By incrementing them, StarPU can help
-the hypervisor in the resizing decision making process. TODO maybe
-they should be hidden to the user
-
-\fn struct starpu_sched_ctx_performance_counters *sc_hypervisor_init(struct sc_hypervisor_policy *policy)
-\ingroup API_Scheduling_Context_Hypervisor
-Initializes the hypervisor to use the strategy provided as parameter
-and creates the performance counters (see starpu_sched_ctx_performance_counters).
-These performance counters represent actually some callbacks that will
-be used by the contexts to notify the information needed by the
-hypervisor.
-
-Note: The Hypervisor is actually a worker that takes this role once
-certain conditions trigger the resizing process (there is no
-additional thread assigned to the hypervisor).
-
-\fn void sc_hypervisor_shutdown(void)
-\ingroup API_Scheduling_Context_Hypervisor
-The hypervisor and all information concerning it is cleaned. There is
-no synchronization between this function and starpu_shutdown(). Thus,
-this should be called after starpu_shutdown(), because the performance
-counters will still need allocated callback functions.
-
-@name Registering Scheduling Contexts to the hypervisor
-\ingroup API_Scheduling_Context_Hypervisor
-
-Scheduling Contexts that have to be resized by the hypervisor must be
-first registered to the hypervisor. Whenever we want to exclude
-contexts from the resizing process we have to unregister them from the
-hypervisor.
-
-\fn void sc_hypervisor_register_ctx(unsigned sched_ctx, double total_flops)
-\ingroup API_Scheduling_Context_Hypervisor
-Register the context to the hypervisor, and indicate the number of
-flops the context will execute (needed for Gflops rate based strategy
-see \ref ResizingStrategies or any other custom strategy needing it, for
-the others we can pass 0.0)
-
-\fn void sc_hypervisor_unregister_ctx(unsigned sched_ctx)
-\ingroup API_Scheduling_Context_Hypervisor
-Unregister the context from the hypervisor.
-
-@name Users’ Input In The Resizing Process
-\anchor UsersInputInTheResizingProcess
-\ingroup API_Scheduling_Context_Hypervisor
-
-The user can totally forbid the resizing of a certain context or can
-then change his mind and allow it (in this case the resizing is
-managed by the hypervisor, that can forbid it or allow it)
-
-\fn void sc_hypervisor_stop_resize(unsigned sched_ctx)
-\ingroup API_Scheduling_Context_Hypervisor
-Forbid resizing of a context
-
-\fn void sc_hypervisor_start_resize(unsigned sched_ctx)
-\ingroup API_Scheduling_Context_Hypervisor
-Allow resizing of a context. The user can then provide information to
-the hypervisor concerning the conditions of resizing.
-
-\fn void sc_hypervisor_post_resize_request(unsigned sched_ctx, int task_tag)
-\ingroup API_Scheduling_Context_Hypervisor
-Requires resizing the context \p sched_ctx whenever a task tagged with the id \p task_tag
-finished executing 
-
-\fn void sc_hypervisor_resize_ctxs(int *sched_ctxs, int nsched_ctxs , int *workers, int nworkers)
-\ingroup API_Scheduling_Context_Hypervisor
-Requires reconsidering the distribution of ressources over the indicated scheduling contexts 
-
-\fn void sc_hypervisor_ioctl(unsigned sched_ctx, ...)
-\ingroup API_Scheduling_Context_Hypervisor
-Inputs conditions to the context sched_ctx with the following
-arguments. The argument list must be zero-terminated.
-
-\def HYPERVISOR_MAX_IDLE
-\ingroup API_Scheduling_Context_Hypervisor
-This macro is used when calling sc_hypervisor_ioctl() and must be
-followed by 3 arguments: an array of int for the workerids to apply
-the condition, an int to indicate the size of the array, and a double
-value indicating the maximum idle time allowed for a worker before the
-resizing process should be triggered
-
-\def HYPERVISOR_PRIORITY
-\ingroup API_Scheduling_Context_Hypervisor
-This macro is used when calling sc_hypervisor_ioctl() and must be
-followed by 3 arguments: an array of int for the workerids to apply
-the condition, an int to indicate the size of the array, and an int
-value indicating the priority of the workers previously mentioned. The
-workers with the smallest priority are moved the first.
-
-\def HYPERVISOR_MIN_WORKERS
-\ingroup API_Scheduling_Context_Hypervisor
-This macro is used when calling sc_hypervisor_ioctl() and must be
-followed by 1 argument(int) indicating the minimum number of workers a
-context should have, underneath this limit the context cannot execute.
-
-\def HYPERVISOR_MAX_WORKERS
-\ingroup API_Scheduling_Context_Hypervisor
-This macro is used when calling sc_hypervisor_ioctl() and must be
-followed by 1 argument(int) indicating the maximum number of workers a
-context should have, above this limit the context would not be able to
-scale
-
-\def HYPERVISOR_GRANULARITY
-\ingroup API_Scheduling_Context_Hypervisor
-This macro is used when calling sc_hypervisor_ioctl() and must be
-followed by 1 argument(int) indicating the granularity of the resizing
-process (the number of workers should be moved from the context once
-it is resized) This parameter is ignore for the Gflops rate based
-strategy (see \ref ResizingStrategies), the number of workers that have to
-be moved is calculated by the strategy.
-
-\def HYPERVISOR_FIXED_WORKERS
-\ingroup API_Scheduling_Context_Hypervisor
-This macro is used when calling sc_hypervisor_ioctl() and must be
-followed by 2 arguments: an array of int for the workerids to apply
-the condition and an int to indicate the size of the array. These
-workers are not allowed to be moved from the context.
-
-\def HYPERVISOR_MIN_TASKS
-\ingroup API_Scheduling_Context_Hypervisor
-This macro is used when calling sc_hypervisor_ioctl() and must be
-followed by 1 argument (int) that indicated the minimum number of
-tasks that have to be executed before the context could be resized.
-This parameter is ignored for the Application Driven strategy (see \ref 
-ResizingStrategies) where the user indicates exactly when the resize
-should be done.
-
-\def HYPERVISOR_NEW_WORKERS_MAX_IDLE
-\ingroup API_Scheduling_Context_Hypervisor
-This macro is used when calling sc_hypervisor_ioctl() and must be
-followed by 1 argument, a double value indicating the maximum idle
-time allowed for workers that have just been moved from other contexts
-in the current context.
-
-\def HYPERVISOR_TIME_TO_APPLY
-\ingroup API_Scheduling_Context_Hypervisor
-This macro is used when calling sc_hypervisor_ioctl() and must be
-followed by 1 argument (int) indicating the tag an executed task
-should have such that this configuration should be taken into account.
-
-@name Defining a new hypervisor policy
-\ingroup API_Scheduling_Context_Hypervisor
-
-While Scheduling Context Hypervisor Plugin comes with a variety of
-resizing policies (see \ref ResizingStrategies), it may sometimes be
-desirable to implement custom policies to address specific problems.
-The API described below allows users to write their own resizing policy.
-
-Here an example of how to define a new policy
-
-\code{.c}
-struct sc_hypervisor_policy dummy_policy =
-{
-       .handle_poped_task = dummy_handle_poped_task,
-       .handle_pushed_task = dummy_handle_pushed_task,
-       .handle_idle_cycle = dummy_handle_idle_cycle,
-       .handle_idle_end = dummy_handle_idle_end,
-       .handle_post_exec_hook = dummy_handle_post_exec_hook,
-       .custom = 1,
-       .name = "dummy"
-};
-\endcode
-
-\fn void sc_hypervisor_move_workers(unsigned sender_sched_ctx, unsigned receiver_sched_ctx, int *workers_to_move, unsigned nworkers_to_move, unsigned now);
-\ingroup API_Scheduling_Context_Hypervisor
-    Moves workers from one context to another
-
-\fn struct sc_hypervisor_policy_config *sc_hypervisor_get_config(unsigned sched_ctx);
-\ingroup API_Scheduling_Context_Hypervisor
-    Returns the configuration structure of a context
-
-\fn int *sc_hypervisor_get_sched_ctxs();
-\ingroup API_Scheduling_Context_Hypervisor
-    Gets the contexts managed by the hypervisor
-
-\fn int sc_hypervisor_get_nsched_ctxs();
-\ingroup API_Scheduling_Context_Hypervisor
-    Gets the number of contexts managed by the hypervisor
-
-\fn struct sc_hypervisor_wrapper *sc_hypervisor_get_wrapper(unsigned sched_ctx);
-\ingroup API_Scheduling_Context_Hypervisor
-    Returns the wrapper corresponding the context \p sched_ctx
-
-\fn double sc_hypervisor_get_elapsed_flops_per_sched_ctx(struct sc_hypervisor_wrapper *sc_w);
-\ingroup API_Scheduling_Context_Hypervisor
-    Returns the flops of a context elapsed from the last resize
-
-\fn char *sc_hypervisor_get_policy();
-\ingroup API_Scheduling_Context_Hypervisor
-    Returns the name of the resizing policy the hypervisor uses
-
-*/

+ 90 - 89
doc/doxygen/chapters/api/scheduling_contexts.doxy

@@ -1,4 +1,4 @@
-*
+/*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
@@ -16,47 +16,6 @@ starpu tasks to them and we schedule them with the policy assigned to
 the context. Scheduling contexts can be created, deleted and modified
 dynamically.
 
-\enum starpu_worker_collection_type
-\ingroup API_Scheduling_Contexts
-types of structures the worker collection can implement
-\var starpu_worker_collection_type::STARPU_WORKER_LIST
-\ingroup API_Scheduling_Contexts
-List of workers
-
-\struct starpu_sched_ctx_iterator
-\ingroup API_Scheduling_Contexts
-todo
-\var starpu_sched_ctx_iterator::cursor
-todo
-
-\struct starpu_worker_collection
-\ingroup API_Scheduling_Contexts
-A scheduling context manages a collection of workers that can
-be memorized using different data structures. Thus, a generic
-structure is available in order to simplify the choice of its type.
-Only the list data structure is available but further data
-structures(like tree) implementations are foreseen.
-\var starpu_worker_collection::workerids
-        The workerids managed by the collection
-\var starpu_worker_collection::nworkers
-        The number of workers in the collection
-\var starpu_worker_collection::type
-        The type of structure (currently ::STARPU_WORKER_LIST is the only one available)
-\var starpu_worker_collection::has_next
-        Checks if there is another element in collection
-\var starpu_worker_collection::get_next
-        return the next element in the collection
-\var starpu_worker_collection::add
-        add a new element in the collection
-\var starpu_worker_collection::remove
-        remove an element from the collection
-\var starpu_worker_collection::init
-        Initialize the collection
-\var starpu_worker_collection::deinit
-        Deinitialize the colection
-\var starpu_worker_collection::init_iterator
-        Initialize the cursor if there is one
-
 \struct starpu_sched_ctx_performance_counters
 Performance counters used by the starpu to indicate the
 hypervisor how the application and the resources are executing.
@@ -66,43 +25,76 @@ hypervisor how the application and the resources are executing.
 \var starpu_sched_ctx_performance_counters::notify_idle_end
        Informs the hypervisor that after a period of idle, the worker has just executed a task in the specified context. The idle counter is thus reset.
 \var starpu_sched_ctx_performance_counters::notify_pushed_task
-        Notifies the hypervisor a task has been scheduled on the queue of the worker corresponding to the specified context
+        Notifies the hypervisor that a task has been scheduled on the queue of the worker corresponding to the specified context
 \var starpu_sched_ctx_performance_counters::notify_poped_task
-        Informs the hypervisor a task executing a specified number of instructions has been poped from the worker
+        Informs the hypervisor that a task executing a specified number of instructions has been poped from the worker
 \var starpu_sched_ctx_performance_counters::notify_post_exec_hook
-        Notifies the hypervisor a task has just been executed
+        Notifies the hypervisor that a task has just been executed
+\var starpu_sched_ctx_performance_counters::notify_submitted_job
+        Notifies the hypervisor that a task has just been submitted
+\var starpu_sched_ctx_performance_counters::notify_delete_context
+        Notifies the hypervisor that the context was deleted
+
 
 @name Scheduling Contexts Basic API
 \ingroup API_Scheduling_Contexts
 
-\fn unsigned starpu_sched_ctx_create(const char *policy_name, int *workerids_ctx, int nworkers_ctx, const char *sched_ctx_name)
+\fn unsigned starpu_sched_ctx_create(int *workerids_ctx, int nworkers_ctx, const char *sched_ctx_name, ...)
 \ingroup API_Scheduling_Contexts
-This function creates a scheduling context which uses the scheduling
-policy \p policy_name and assigns the workers in \p workerids_ctx to
-execute the tasks submitted to it.
-The return value represents the identifier of the context that has
-just been created. It will be further used to indicate the context the
-tasks will be submitted to. The return value should be at most
-\ref STARPU_NMAX_SCHED_CTXS.
+This function creates a scheduling context with the given parameters
+(see below) and assigns the workers in \p workerids_ctx to execute the
+tasks submitted to it. The return value represents the identifier of
+the context that has just been created. It will be further used to
+indicate the context the tasks will be submitted to. The return value
+should be at most \ref STARPU_NMAX_SCHED_CTXS.
 
-\fn unsigned starpu_sched_ctx_create_with_custom_policy(struct starpu_sched_policy *policy, int *workerids_ctx, int nworkers_ctx, const char *sched_ctx_name)
+The arguments following the name of the scheduling context can be of
+the following types:
+<ul>
+<li> ::STARPU_SCHED_CTX_POLICY_NAME, followed by the name of a
+predefined scheduling policy
+</li>
+<li> ::STARPU_SCHED_CTX_POLICY_STRUCT, followed by a pointer to a
+custom scheduling policy (struct starpu_sched_policy *)
+</li>
+<li> ::STARPU_SCHED_CTX_POLICY_MIN_PRIO, followed by an integer
+representing the minimum priority value to be defined for the
+scheduling policy.
+</li>
+<li> ::STARPU_SCHED_CTX_POLICY_MAX_PRIO, followed by an integer
+representing the maximum priority value to be defined for the
+scheduling policy.
+</li>
+</ul>
+
+\def STARPU_SCHED_CTX_POLICY_NAME
 \ingroup API_Scheduling_Contexts
-This function creates a scheduling context which uses the scheduling
-policy \p policy (the pointer to the custom scheduling policy) and assigns the workers in \p workerids_ctx to
-execute the tasks submitted to it.
-The return value represents the identifier of the context that has
-just been created. It will be further used to indicate the context the
-tasks will be submitted to. The return value should be at most
-\ref STARPU_NMAX_SCHED_CTXS.
+This macro is used when calling starpu_sched_ctx_create() to specify a
+name for a scheduling policy
+
+\def STARPU_SCHED_CTX_POLICY_STRUCT
+\ingroup API_Scheduling_Contexts
+This macro is used when calling starpu_sched_ctx_create() to specify a
+pointer to a scheduling policy
+
+\def STARPU_SCHED_CTX_POLICY_MIN_PRIO
+\ingroup API_Scheduling_Contexts
+This macro is used when calling starpu_sched_ctx_create() to specify a
+minimum scheduler priority value.
 
-\fn unsigned starpu_sched_ctx_create_inside_interval(const char *policy_name, const char *sched_name, int min_ncpus, int max_ncpus, int min_ngpus, int max_ngpus, unsigned allow_overlap)
+\def STARPU_SCHED_CTX_POLICY_MAX_PRIO
+\ingroup API_Scheduling_Contexts
+This macro is used when calling starpu_sched_ctx_create() to specify a
+maximum scheduler priority value.
+
+\fn unsigned starpu_sched_ctx_create_inside_interval(const char *policy_name, const char *sched_ctx_name, int min_ncpus, int max_ncpus, int min_ngpus, int max_ngpus, unsigned allow_overlap)
 \ingroup API_Scheduling_Contexts
 Create a context indicating an approximate interval of resources
 
-\fn void starpu_sched_ctx_delete(unsigned sched_ctx_id)
+\fn void starpu_sched_ctx_register_close_callback(unsigned sched_ctx_id, void (*close_callback)(unsigned sched_ctx_id, void* args), void *args)
 \ingroup API_Scheduling_Contexts
-Delete scheduling context \p sched_ctx_id and transfer remaining
-workers to the inheritor scheduling context.
+Execute the callback whenever the last task of the context finishes executing; it is called with the parameters: sched_ctx and any other parameter needed
+by the application (packed in a void*)
 
 \fn void starpu_sched_ctx_add_workers(int *workerids_ctx, int nworkers_ctx, unsigned sched_ctx_id)
 \ingroup API_Scheduling_Contexts
@@ -116,6 +108,11 @@ This function removes the workers in \p workerids_ctx from the context
 \p sched_ctx_id. The last argument cannot be greater than
 STARPU_NMAX_SCHED_CTXS.
 
+\fn void starpu_sched_ctx_delete(unsigned sched_ctx_id)
+\ingroup API_Scheduling_Contexts
+Delete scheduling context \p sched_ctx_id and transfer remaining
+workers to the inheritor scheduling context.
+
 \fn void starpu_sched_ctx_set_inheritor(unsigned sched_ctx_id, unsigned inheritor)
 \ingroup API_Scheduling_Contexts
Indicate which context will inherit the resources of this context
@@ -134,12 +131,18 @@ Return the scheduling context the tasks are currently submitted to
 Stop submitting tasks from the empty context list until the next time
 the context has time to check the empty context list
 
-\fn void starpu_sched_ctx_finished_submit(unsigned sched_ctx_id);
+\fn void starpu_sched_ctx_finished_submit(unsigned sched_ctx_id)
 \ingroup API_Scheduling_Contexts
 Indicate starpu that the application finished submitting to this
 context in order to move the workers to the inheritor as soon as
 possible.
 
+\fn unsigned starpu_sched_ctx_get_workers_list(unsigned sched_ctx_id, int **workerids)
+\ingroup API_Scheduling_Contexts
+Returns the list of workers in the array \p workerids, the returned value is the 
+number of workers. The user should free the \p workerids table after finishing
+using it (it is allocated inside the function with the proper size)
+
 \fn unsigned starpu_sched_ctx_get_nworkers(unsigned sched_ctx_id)
 \ingroup API_Scheduling_Contexts
 Return the number of workers managed by the specified contexts
@@ -154,25 +157,15 @@ blocked)
 \ingroup API_Scheduling_Contexts
 Return 1 if the worker belongs to the context and 0 otherwise
 
-\fn unsigned starpu_sched_ctx_overlapping_ctxs_on_worker(int workerid)
-\ingroup API_Scheduling_Contexts
-Check if a worker is shared between several contexts
-
-\fn unsigned starpu_sched_ctx_is_ctxs_turn(int workerid, unsigned sched_ctx_id)
+\fn unsigned starpu_sched_ctx_worker_get_id(unsigned sched_ctx_id)
 \ingroup API_Scheduling_Contexts
-Manage sharing of resources between contexts: checkOB which ctx has
-its turn to pop.
+Return the workerid if the worker belongs to the context and -1 otherwise.
+If the thread calling this function is not a worker the function returns -1
+as it calls the function \ref starpu_worker_get_id()
 
-\fn void starpu_sched_ctx_set_turn_to_other_ctx(int workerid, unsigned sched_ctx_id)
-\ingroup API_Scheduling_Contexts
-Manage sharing of resources between contexts: by default a round_robin
-strategy is executed but the user can interfere to tell which ctx has
-its turn to pop.
-
-\fn double starpu_sched_ctx_get_max_time_worker_on_ctx(void)
+\fn unsigned starpu_sched_ctx_overlapping_ctxs_on_worker(int workerid)
 \ingroup API_Scheduling_Contexts
-Time sharing a resources, indicate how long a worker has been active
-in the current sched_ctx.
+Check if a worker is shared between several contexts
 
 @name Scheduling Context Priorities
 \ingroup API_Scheduling_Contexts
@@ -235,16 +228,10 @@ Delete the worker collection of the specified scheduling context
 \ingroup API_Scheduling_Contexts
 Return the worker collection managed by the indicated context
 
-\fn unsigned starpu_sched_ctx_get_workers_list(unsigned sched_ctx_id, int **workerids)
-\ingroup API_Scheduling_Contexts
-Returns the list of workers in the array \p workerids, the returned value is the 
-number of workers. The user should free the \p workerids table after finishing
-using it (it is allocated inside the function with the proper size)
-
 @name Scheduling Context Link with Hypervisor
 \ingroup API_Scheduling_Contexts
 
-\fn void starpu_sched_ctx_set_perf_counters(unsigned sched_ctx_id, struct starpu_sched_ctx_performance_counters *perf_counters)
+\fn void starpu_sched_ctx_set_perf_counters(unsigned sched_ctx_id, void *perf_counters)
 \ingroup API_Scheduling_Contexts
 Indicates to starpu the pointer to the performance counter
 
@@ -261,4 +248,18 @@ Allow the hypervisor to let starpu know he's initialised
 \ingroup API_Scheduling_Contexts
 Ask starpu if he is informed if the hypervisor is initialised
 
+\fn void starpu_sched_ctx_set_policy_data(unsigned sched_ctx_id, void *policy_data)
+\ingroup API_Scheduling_Contexts
+Allocate the scheduling policy data (private information of the scheduler like queues, variables,
+additional condition variables) for the context
+
+\fn void *starpu_sched_ctx_get_policy_data(unsigned sched_ctx_id)
+\ingroup API_Scheduling_Contexts
+Return the scheduling policy data (private information of the scheduler) previously
+assigned to the context.
+
+\fn void *starpu_sched_ctx_exec_parallel_code(void* (*func)(void*), void *param, unsigned sched_ctx_id)
+\ingroup API_Scheduling_Contexts
+execute any parallel code on the workers of the sched_ctx (workers are blocked)
+
 */

+ 11 - 15
doc/doxygen/chapters/api/scheduling_policy.doxy

@@ -27,7 +27,7 @@ starpu_init().
         Insert a task into the scheduler.
 \var starpu_sched_policy::push_task_notify
         Notify the scheduler that a task was pushed on a given worker.
-	This method is called when a task that was explicitely
+	This method is called when a task that was explicitly
 	assigned to a worker becomes ready and is about to be executed
 	by the worker. This method therefore permits to keep the state
 	of the scheduler coherent even when StarPU bypasses the
@@ -73,15 +73,6 @@ condition variable. For instance, in the case of a scheduling strategy
 with a single task queue, the same condition variable would be used to
 block and wake up all workers.
 
-\fn void starpu_sched_ctx_set_policy_data(unsigned sched_ctx_id, void *policy_data)
-\ingroup API_Scheduling_Policy
-Each scheduling policy uses some specific data (queues, variables,
-additional condition variables). It is memorize through a local
-structure. This function assigns it to a scheduling context.
-
-\fn void *starpu_sched_ctx_get_policy_data(unsigned sched_ctx_id)
-\ingroup API_Scheduling_Policy
-Returns the policy data previously assigned to a context
 
 \fn int starpu_sched_set_min_priority(int min_prio)
 \ingroup API_Scheduling_Policy
@@ -135,15 +126,15 @@ otherwise the task may fail to execute.
 \ingroup API_Scheduling_Policy
 Return the current date in micro-seconds.
 
-\fn uint32_t starpu_task_footprint(struct starpu_perfmodel *model, struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+\fn uint32_t starpu_task_footprint(struct starpu_perfmodel *model, struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl)
 \ingroup API_Scheduling_Policy
 Returns the footprint for a given task
 
-\fn double starpu_task_expected_length(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+\fn double starpu_task_expected_length(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl)
 \ingroup API_Scheduling_Policy
 Returns expected task duration in micro-seconds.
 
-\fn double starpu_worker_get_relative_speedup(enum starpu_perfmodel_archtype perf_archtype)
+\fn double starpu_worker_get_relative_speedup(struct starpu_perfmodel_arch *perf_arch)
 \ingroup API_Scheduling_Policy
 Returns an estimated speedup factor relative to CPU speed
 
@@ -155,11 +146,11 @@ Returns expected data transfer time in micro-seconds.
 \ingroup API_Scheduling_Policy
 Predict the transfer time (in micro-seconds) to move \p handle to a memory node
 
-\fn double starpu_task_expected_power(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+\fn double starpu_task_expected_power(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl)
 \ingroup API_Scheduling_Policy
 Returns expected power consumption in J
 
-\fn double starpu_task_expected_conversion_time(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+\fn double starpu_task_expected_conversion_time(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl)
 \ingroup API_Scheduling_Policy
 Returns expected conversion time in ms (multiformat interface only)
 
@@ -171,4 +162,9 @@ Whether \ref STARPU_PREFETCH was set
 \ingroup API_Scheduling_Policy
 Prefetch data for a given task on a given node
 
+\fn void starpu_sched_ctx_worker_shares_tasks_lists(int workerid, int sched_ctx_id)
+\ingroup API_Scheduling_Policy
+The scheduling policy indicates if the worker may pop tasks from the lists of other workers
+or if there is a central list with tasks for all the workers
+
 */

+ 10 - 0
doc/doxygen/chapters/api/standard_memory_library.doxy

@@ -8,6 +8,16 @@
 
 /*! \defgroup API_Standard_Memory_Library Standard Memory Library
 
+\def starpu_data_malloc_pinned_if_possible
+\ingroup API_Standard_Memory_Library
+\deprecated
+Equivalent to starpu_malloc(). This macro is provided to avoid breaking old codes.
+
+\def starpu_data_free_pinned_if_possible
+\ingroup API_Standard_Memory_Library
+\deprecated
+Equivalent to starpu_free(). This macro is provided to avoid breaking old codes.
+
 \def STARPU_MALLOC_PINNED
 \ingroup API_Standard_Memory_Library
 Value passed to the function starpu_malloc_flags() to indicate the memory allocation should be pinned.

+ 2 - 2
doc/doxygen/chapters/api/task_bundles.doxy

@@ -44,11 +44,11 @@ Inform the runtime that the user will not modify \p bundle anymore, it
 means no more inserting or removing task. Thus the runtime can destroy
 it when possible.
 
-\fn double starpu_task_bundle_expected_length(starpu_task_bundle_t bundle, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+\fn double starpu_task_bundle_expected_length(starpu_task_bundle_t bundle, struct starpu_perfmodel_arch *arch, unsigned nimpl)
 \ingroup API_Task_Bundles
 Return the expected duration of \p bundle in micro-seconds.
 
-\fn double starpu_task_bundle_expected_power(starpu_task_bundle_t bundle, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+\fn double starpu_task_bundle_expected_power(starpu_task_bundle_t bundle, struct starpu_perfmodel_arch *arch, unsigned nimpl)
 \ingroup API_Task_Bundles
 Return the expected power consumption of \p bundle in J.
 

+ 306 - 0
doc/doxygen/chapters/api/threads.doxy

@@ -0,0 +1,306 @@
+/*
+ * This file is part of the StarPU Handbook.
+ * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
+ * See the file version.doxy for copying conditions.
+ */
+
+/*! \defgroup API_Threads Threads
+
+\brief This section describes the thread facilities provided
+by StarPU. The thread function are either implemented on top of the
+pthread library or the Simgrid library when the simulated performance
+mode is enabled (\ref SimulatedPerformance).
+
+\def STARPU_PTHREAD_CREATE_ON
+\ingroup API_Threads
+This macro calls the function starpu_pthread_create_on() and aborts on error.
+
+\def STARPU_PTHREAD_CREATE
+\ingroup API_Threads
+This macro calls the function starpu_pthread_create() and aborts on error.
+
+\def STARPU_PTHREAD_MUTEX_INIT
+\ingroup API_Threads
+This macro calls the function starpu_pthread_mutex_init() and aborts
+on error.
+
+\def STARPU_PTHREAD_MUTEX_DESTROY
+\ingroup API_Threads
+This macro calls the function starpu_pthread_mutex_destroy() and
+aborts on error.
+
+\def STARPU_PTHREAD_MUTEX_LOCK
+\ingroup API_Threads
+This macro calls the function starpu_pthread_mutex_lock() and aborts
+on error.
+
+\def STARPU_PTHREAD_MUTEX_UNLOCK
+\ingroup API_Threads
+This macro calls the function starpu_pthread_mutex_unlock() and aborts
+on error.
+
+\def STARPU_PTHREAD_KEY_CREATE
+\ingroup API_Threads
+This macro calls the function starpu_pthread_key_create() and aborts
+on error.
+
+\def STARPU_PTHREAD_KEY_DELETE
+\ingroup API_Threads
+This macro calls the function starpu_pthread_key_delete() and aborts
+on error.
+
+\def STARPU_PTHREAD_SETSPECIFIC
+\ingroup API_Threads
+This macro calls the function starpu_pthread_setspecific() and aborts
+on error.
+
+\def STARPU_PTHREAD_GETSPECIFIC
+\ingroup API_Threads
+This macro calls the function starpu_pthread_getspecific() and aborts
+on error.
+
+\def STARPU_PTHREAD_RWLOCK_INIT
+\ingroup API_Threads
+This macro calls the function starpu_pthread_rwlock_init() and aborts
+on error.
+
+\def STARPU_PTHREAD_RWLOCK_RDLOCK
+\ingroup API_Threads
+This macro calls the function starpu_pthread_rwlock_rdlock() and
+aborts on error.
+
+\def STARPU_PTHREAD_RWLOCK_WRLOCK
+\ingroup API_Threads
+This macro calls the function starpu_pthread_rwlock_wrlock() and
+aborts on error.
+
+\def STARPU_PTHREAD_RWLOCK_UNLOCK
+\ingroup API_Threads
+This macro calls the function starpu_pthread_rwlock_unlock() and
+aborts on error.
+
+\def STARPU_PTHREAD_RWLOCK_DESTROY
+\ingroup API_Threads
+This macro calls the function starpu_pthread_rwlock_destroy() and
+aborts on error.
+
+\def STARPU_PTHREAD_COND_INIT
+\ingroup API_Threads
+This macro calls the function starpu_pthread_cond_init() and aborts on error.
+
+\def STARPU_PTHREAD_COND_DESTROY
+\ingroup API_Threads
+This macro calls the function starpu_pthread_cond_destroy() and aborts
+on error.
+
+\def STARPU_PTHREAD_COND_SIGNAL
+\ingroup API_Threads
+This macro calls the function starpu_pthread_cond_signal() and aborts
+on error.
+
+\def STARPU_PTHREAD_COND_BROADCAST
+\ingroup API_Threads
+This macro calls the function starpu_pthread_cond_broadcast() and
+aborts on error.
+
+\def STARPU_PTHREAD_COND_WAIT
+\ingroup API_Threads
+This macro calls the function starpu_pthread_cond_wait() and aborts on error.
+
+\def STARPU_PTHREAD_BARRIER_INIT
+\ingroup API_Threads
+This macro calls the function starpu_pthread_barrier_init() and aborts
+on error.
+
+\def STARPU_PTHREAD_BARRIER_DESTROY
+\ingroup API_Threads
+This macro calls the function starpu_pthread_barrier_destroy() and
+aborts on error.
+
+\def STARPU_PTHREAD_BARRIER_WAIT
+\ingroup API_Threads
+This macro calls the function starpu_pthread_barrier_wait() and aborts
+on error.
+
+\fn int starpu_pthread_create_on(char *name, starpu_pthread_t *thread, const starpu_pthread_attr_t *attr, void *(*start_routine) (void *), void *arg, int where)
+\ingroup API_Threads
+
+\fn int starpu_pthread_create(starpu_pthread_t *thread, const starpu_pthread_attr_t *attr, void *(*start_routine) (void *), void *arg)
+\ingroup API_Threads
+This function starts a new thread in the calling process.  The new
+thread starts execution by invoking \p start_routine; \p arg is passed
+as the sole argument of \p start_routine.
+
+\fn int starpu_pthread_join(starpu_pthread_t thread, void **retval)
+\ingroup API_Threads
+This function waits for the thread specified by \p thread to
+terminate.  If that thread has already terminated, then the function
+returns immediately. The thread specified by \p thread must be
+joinable.
+
+\fn int starpu_pthread_attr_init(starpu_pthread_attr_t *attr)
+\ingroup API_Threads
+This function initializes the thread attributes object pointed to by
+\p attr with default attribute values.
+
+It does not do anything when the simulated performance mode is enabled
+(\ref SimulatedPerformance).
+
+\fn int starpu_pthread_attr_destroy(starpu_pthread_attr_t *attr)
+\ingroup API_Threads
+This function destroys a thread attributes object which is no longer
+required. Destroying a thread attributes object has no effect on
+threads that were created using that object.
+
+It does not do anything when the simulated performance mode is enabled
+(\ref SimulatedPerformance).
+
+\fn int starpu_pthread_attr_setdetachstate(starpu_pthread_attr_t *attr, int detachstate)
+\ingroup API_Threads
+This function sets the detach state attribute of the thread attributes
+object referred to by \p attr to the value specified in \p
+detachstate.  The detach state attribute determines whether a thread
+created using the thread attributes object \p attr will be created in
+a joinable or a detached state.
+
+It does not do anything when the simulated performance mode is enabled
+(\ref SimulatedPerformance).
+
+\fn int starpu_pthread_mutex_init(starpu_pthread_mutex_t *mutex, const starpu_pthread_mutexattr_t *mutexattr)
+\ingroup API_Threads
+This function initializes the mutex object pointed to by \p mutex
+according to the mutex attributes specified in \p mutexattr.  If \p
+mutexattr is NULL, default attributes are used instead.
+
+\fn int starpu_pthread_mutex_destroy(starpu_pthread_mutex_t *mutex)
+\ingroup API_Threads
+This function destroys a mutex object, freeing the resources it might
+hold. The mutex must be unlocked on entrance.
+
+\fn int starpu_pthread_mutex_lock(starpu_pthread_mutex_t *mutex)
+\ingroup API_Threads
+This function locks the given mutex. If the mutex is currently
+unlocked, it becomes locked and owned by the calling thread, and the
+function returns immediately. If the mutex is already locked by
+another thread, the function suspends the calling thread until the
+mutex is unlocked.
+
+This function also produces trace when the configure option \ref
+enable-fxt-lock "--enable-fxt-lock" is enabled.
+
+\fn int starpu_pthread_mutex_unlock(starpu_pthread_mutex_t *mutex)
+\ingroup API_Threads
+This function unlocks the given mutex. The mutex is assumed to be
+locked and owned by the calling thread on entrance to
+starpu_pthread_mutex_unlock().
+
+This function also produces trace when the configure option \ref
+enable-fxt-lock "--enable-fxt-lock" is enabled.
+
+\fn int starpu_pthread_mutex_trylock(starpu_pthread_mutex_t *mutex)
+\ingroup API_Threads
+This function behaves identically to starpu_pthread_mutex_lock(),
+except that it does not block the calling thread if the mutex is
+already locked by another thread (or by the calling thread in the case
+of a ``fast''  mutex). Instead, the function returns immediately with
+the error code EBUSY.
+
+This function also produces trace when the configure option \ref
+enable-fxt-lock "--enable-fxt-lock" is enabled.
+
+\def STARPU_PTHREAD_MUTEX_INITIALIZER
+\ingroup API_Threads
+This macro initializes the mutex given in parameter.
+
+\fn int starpu_pthread_key_create(starpu_pthread_key_t *key, void (*destr_function) (void *))
+\ingroup API_Threads
+This function allocates a new TSD key. The key is stored in the
+location pointed to by \p key.
+
+\fn int starpu_pthread_key_delete(starpu_pthread_key_t key)
+\ingroup API_Threads
+This function deallocates a TSD key. It does not check whether
+non-NULL values are associated with that key in the currently
+executing threads, nor call the destructor function associated with
+the key.
+
+\fn int starpu_pthread_setspecific(starpu_pthread_key_t key, const void *pointer)
+\ingroup API_Threads
+This function changes the value associated with \p key in the calling
+thread, storing the given \p pointer instead.
+
+\fn void *starpu_pthread_getspecific(starpu_pthread_key_t key)
+\ingroup API_Threads
+This function returns the value associated with \p key on success, and
+NULL on error.
+
+\def STARPU_PTHREAD_COND_INITIALIZER
+\ingroup API_Threads
+This macro initializes the condition variable given in parameter.
+
+\fn starpu_pthread_cond_init(starpu_pthread_cond_t *cond, starpu_pthread_condattr_t *cond_attr)
+\ingroup API_Threads
+This function initializes the condition variable \p cond, using the
+condition attributes specified in \p cond_attr, or default attributes
+if \p cond_attr is NULL.
+
+\fn starpu_pthread_cond_signal(starpu_pthread_cond_t *cond)
+\ingroup API_Threads
+This function restarts one of the threads that are waiting on the
+condition variable \p cond. If no threads are waiting on \p cond,
+nothing happens. If several threads are waiting on \p cond, exactly
+one is restarted, but it is not specified which.
+
+\fn starpu_pthread_cond_broadcast(starpu_pthread_cond_t *cond)
+\ingroup API_Threads
+This function restarts all the threads that are waiting on the
+condition variable \p cond. Nothing happens if no threads are waiting on cond.
+
+\fn starpu_pthread_cond_wait(starpu_pthread_cond_t *cond, starpu_pthread_mutex_t *mutex)
+\ingroup API_Threads
+This function atomically unlocks the mutex (as per
+starpu_pthread_mutex_unlock()) and waits for the condition variable \p cond
+to be signaled. The thread execution is suspended and does not consume
+any CPU time until the condition variable is signaled. The mutex must
+be locked by the calling thread on entrance to
+starpu_pthread_cond_wait(). Before returning to the calling thread, the
+function re-acquires mutex (as per starpu_pthread_mutex_lock()).
+
+This function also produces trace when the configure option \ref
+enable-fxt-lock "--enable-fxt-lock" is enabled.
+
+\fn starpu_pthread_cond_timedwait(starpu_pthread_cond_t *cond, starpu_pthread_mutex_t *mutex, const struct timespec *abstime)
+\ingroup API_Threads
+This function atomically unlocks \p mutex and waits on \p cond, as
+starpu_pthread_cond_wait() does, but it also bounds the duration of
+the wait.
+
+\fn starpu_pthread_cond_destroy(starpu_pthread_cond_t *cond)
+\ingroup API_Threads
+This function destroys a condition variable, freeing the resources it
+might hold. No threads must be waiting on the condition variable on
+entrance to the function.
+
+\fn starpu_pthread_rwlock_init(starpu_pthread_rwlock_t *rwlock, const starpu_pthread_rwlockattr_t *attr)
+\ingroup API_Threads
+This function is the same as starpu_pthread_mutex_init().
+
+\fn starpu_pthread_rwlock_destroy(starpu_pthread_rwlock_t *rwlock)
+\ingroup API_Threads
+This function is the same as starpu_pthread_mutex_destroy().
+
+\fn starpu_pthread_rwlock_rdlock(starpu_pthread_rwlock_t *rwlock)
+\ingroup API_Threads
+This function is the same as starpu_pthread_mutex_lock().
+
+\fn starpu_pthread_rwlock_wrlock(starpu_pthread_rwlock_t *rwlock)
+\ingroup API_Threads
+This function is the same as starpu_pthread_mutex_lock().
+
+\fn starpu_pthread_rwlock_unlock(starpu_pthread_rwlock_t *rwlock)
+\ingroup API_Threads
+This function is the same as starpu_pthread_mutex_unlock().
+
+*/

+ 111 - 0
doc/doxygen/chapters/api/toolbox.doxy

@@ -0,0 +1,111 @@
+/*
+ * This file is part of the StarPU Handbook.
+ * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
+ * See the file version.doxy for copying conditions.
+ */
+
+/*! \defgroup API_Toolbox Toolbox
+
+\brief The following macros allow to make GCC extensions portable, and
+to have a code which can be compiled with any C compiler.
+
+\def STARPU_GNUC_PREREQ
+\ingroup API_Toolbox
+Return true (non-zero) if GCC version MAJ.MIN or later is being used (macro taken from glibc.)
+
+\def STARPU_UNLIKELY
+\ingroup API_Toolbox
+When building with a GNU C Compiler, this macro allows programmers to mark an expression as unlikely.
+
+\def STARPU_LIKELY
+\ingroup API_Toolbox
+When building with a GNU C Compiler, this macro allows programmers to mark an expression as likely.
+
+\def STARPU_ATTRIBUTE_UNUSED
+\ingroup API_Toolbox
+When building with a GNU C Compiler, this macro is defined to __attribute__((unused))
+
+\def STARPU_ATTRIBUTE_INTERNAL
+\ingroup API_Toolbox
+When building with a GNU C Compiler, this macro is defined to __attribute__((visibility ("internal")))
+
+\def STARPU_ATTRIBUTE_MALLOC
+\ingroup API_Toolbox
+When building with a GNU C Compiler, this macro is defined to __attribute__((malloc))
+
+\def STARPU_ATTRIBUTE_WARN_UNUSED_RESULT
+\ingroup API_Toolbox
+When building with a GNU C Compiler, this macro is defined to __attribute__((warn_unused_result))
+
+\def STARPU_ATTRIBUTE_PURE
+\ingroup API_Toolbox
+When building with a GNU C Compiler, this macro is defined to __attribute__((pure))
+
+\def STARPU_ATTRIBUTE_ALIGNED
+\ingroup API_Toolbox
+When building with a GNU C Compiler, this macro is defined to __attribute__((aligned(size)))
+
+\def STARPU_WARN_UNUSED_RESULT
+\ingroup API_Toolbox
+When building with a GNU C Compiler, this macro is defined to __attribute__((__warn_unused_result__))
+
+\def STARPU_POISON_PTR
+\ingroup API_Toolbox
+This macro defines a value which can be used to mark pointers as
+invalid values.
+
+\def STARPU_MIN
+\ingroup API_Toolbox
+This macro returns the min of the two parameters.
+
+\def STARPU_MAX
+\ingroup API_Toolbox
+This macro returns the max of the two parameters.
+
+\def STARPU_ASSERT
+\ingroup API_Toolbox
+Unless StarPU has been configured with the option \ref enable-fast
+"--enable-fast", this macro will abort if the expression is false.
+
+\def STARPU_ASSERT_MSG
+\ingroup API_Toolbox
+Unless StarPU has been configured with the option \ref enable-fast
+"--enable-fast", this macro will abort if the expression is false. The
+given message will be displayed.
+
+\def STARPU_ABORT
+\ingroup API_Toolbox
+This macro aborts the program.
+
+\def STARPU_ABORT_MSG
+\ingroup API_Toolbox
+This macro aborts the program, and displays the given message.
+
+\def STARPU_CHECK_RETURN_VALUE
+\ingroup API_Toolbox
+If \p err has a value which is not 0, the given message is displayed
+before aborting.
+
+\def STARPU_CHECK_RETURN_VALUE_IS
+\ingroup API_Toolbox
+If \p err has a value which is not \p value, the given message is displayed
+before aborting.
+
+\def STARPU_RMB
+\ingroup API_Toolbox
+This macro can be used to do a synchronization.
+
+\def STARPU_WMB
+\ingroup API_Toolbox
+This macro can be used to do a synchronization.
+
+\fn int starpu_get_env_number(const char *str)
+\ingroup API_Toolbox
+If \p str is the name of an existing environment variable which is
+defined to an integer, the function returns the value of the integer.
+It returns 0 otherwise.
+
+*/
+

+ 47 - 0
doc/doxygen/chapters/api/workers.doxy

@@ -8,6 +8,10 @@
 
 /*! \defgroup API_Workers_Properties Workers’ Properties
 
+\def STARPU_NMAXWORKERS
+\ingroup API_Workers_Properties
+Define the maximum number of workers managed by StarPU.
+
 \enum starpu_node_kind
 \ingroup API_Workers_Properties
 TODO
@@ -58,6 +62,49 @@ Intel MIC device
 Intel SCC device
 
 
+\struct starpu_worker_collection
+\ingroup API_Workers_Properties
+A scheduling context manages a collection of workers that can
+be memorized using different data structures. Thus, a generic
+structure is available in order to simplify the choice of its type.
+Only the list data structure is available, but further data
+structure implementations (such as trees) are foreseen.
+\var starpu_worker_collection::workerids
+        The workerids managed by the collection
+\var starpu_worker_collection::nworkers
+        The number of workers in the collection
+\var starpu_worker_collection::type
+        The type of structure (currently ::STARPU_WORKER_LIST is the only one available)
+\var starpu_worker_collection::has_next
+        Checks if there is another element in collection
+\var starpu_worker_collection::get_next
+        return the next element in the collection
+\var starpu_worker_collection::add
+        add a new element in the collection
+\var starpu_worker_collection::remove
+        remove an element from the collection
+\var starpu_worker_collection::init
+        Initialize the collection
+\var starpu_worker_collection::deinit
+        Deinitialize the collection
+\var starpu_worker_collection::init_iterator
+        Initialize the cursor if there is one
+
+\enum starpu_worker_collection_type
+\ingroup API_Workers_Properties
+Types of structures the worker collection can implement
+\var starpu_worker_collection_type::STARPU_WORKER_LIST
+\ingroup API_Workers_Properties
+The collection is an array
+
+\struct starpu_sched_ctx_iterator
+\ingroup API_Workers_Properties
+Structure needed to iterate on the collection
+\var starpu_sched_ctx_iterator::cursor
+The index of the current worker in the collection, needed when iterating on
+the collection.
+
+
 \fn unsigned starpu_worker_get_count(void)
 \ingroup API_Workers_Properties
 This function returns the number of workers (i.e. processing

+ 1 - 1
doc/doxygen/chapters/code/disk_copy.c

@@ -35,7 +35,7 @@ int main(int argc, char **argv)
 	double * A,*B,*C,*D,*E,*F;
 
 	/* limit main ram to force to push in disk */
-	putenv("STARPU_LIMIT_CPU_MEM=160");
+	setenv("STARPU_LIMIT_CPU_MEM", "160", 1);
 
 	/* Initialize StarPU with default configuration */
 	int ret = starpu_init(NULL);

+ 7 - 0
doc/doxygen/chapters/configure_options.doxy

@@ -22,6 +22,13 @@ the following configure options.
 Enable debugging messages.
 </dd>
 
+<dt>--enable-spinlock-check</dt>
+<dd>
+\anchor enable-spinlock-check
+\addindex __configure__--enable-spinlock-check
+Enable checking that spinlocks are taken and released properly.
+</dd>
+
 <dt>--enable-fast</dt>
 <dd>
 \anchor enable-fast

Rozdielové dáta súboru neboli zobrazené, pretože súbor je príliš veľký
+ 94199 - 0
doc/doxygen/chapters/data_trace.eps


BIN
doc/doxygen/chapters/data_trace.pdf


BIN
doc/doxygen/chapters/data_trace.png


+ 106 - 0
doc/doxygen/chapters/environment_variables.doxy

@@ -70,6 +70,20 @@ STARPU_OPENCL_ONLY_ON_CPUS to 1, the OpenCL driver will ONLY enable
 CPU devices.
 </dd>
 
+<dt>STARPU_NMIC</dt>
+<dd>
+\anchor STARPU_NMIC
+\addindex __env__STARPU_NMIC
+MIC equivalent of the environment variable \ref STARPU_NCUDA.
+</dd>
+
+<dt>STARPU_NSCC</dt>
+<dd>
+\anchor STARPU_NSCC
+\addindex __env__STARPU_NSCC
+SCC equivalent of the environment variable \ref STARPU_NCUDA.
+</dd>
+
 <dt>STARPU_WORKERS_NOBIND</dt>
 <dd>
 \anchor STARPU_WORKERS_NOBIND
@@ -136,6 +150,28 @@ starpu_conf::use_explicit_workers_opencl_gpuid passed to starpu_init()
 is set.
 </dd>
 
+<dt>STARPU_WORKERS_MICID</dt>
+<dd>
+\anchor STARPU_WORKERS_MICID
+\addindex __env__STARPU_WORKERS_MICID
+MIC equivalent of the \ref STARPU_WORKERS_CUDAID environment variable.
+
+This variable is ignored if the field
+starpu_conf::use_explicit_workers_mic_deviceid passed to starpu_init()
+is set.
+</dd>
+
+<dt>STARPU_WORKERS_SCCID</dt>
+<dd>
+\anchor STARPU_WORKERS_SCCID
+\addindex __env__STARPU_WORKERS_SCCID
+SCC equivalent of the \ref STARPU_WORKERS_CUDAID environment variable.
+
+This variable is ignored if the field
+starpu_conf::use_explicit_workers_scc_deviceid passed to starpu_init()
+is set.
+</dd>
+
 <dt>STARPU_SINGLE_COMBINED_WORKER</dt>
 <dd>
 \anchor STARPU_SINGLE_COMBINED_WORKER
@@ -514,4 +550,74 @@ end of the execution of an application (\ref DataStatistics).
 
 </dl>
 
+\section ConfiguringTheHypervisor Configuring The Hypervisor
+
+<dl>
+
+<dt>SC_HYPERVISOR_POLICY</dt>
+<dd>
+\anchor SC_HYPERVISOR_POLICY
+\addindex __env__SC_HYPERVISOR_POLICY
+Choose between the different resizing policies proposed by StarPU for the hypervisor: 
+idle, app_driven, feft_lp, teft_lp, ispeed_lp, throughput_lp etc.
+
+Use <c>SC_HYPERVISOR_POLICY=help</c> to get the list of available policies for the hypervisor
+</dd>
+
+<dt>SC_HYPERVISOR_TRIGGER_RESIZE</dt>
+<dd>
+\anchor SC_HYPERVISOR_TRIGGER_RESIZE
+\addindex __env__SC_HYPERVISOR_TRIGGER_RESIZE
+Choose how the hypervisor should be triggered: <c>speed</c> if the resizing algorithm should
+be called whenever the speed of the context does not correspond to an optimal precomputed value,
+<c>idle</c> if the resizing algorithm should be called whenever the workers are idle for a period
+longer than the value indicated when configuring the hypervisor.
+</dd>
+
+<dt>SC_HYPERVISOR_START_RESIZE</dt>
+<dd>
+\anchor SC_HYPERVISOR_START_RESIZE
+\addindex __env__SC_HYPERVISOR_START_RESIZE
+Indicate the moment when the resizing should be available. The value corresponds to the percentage
+of the total time of execution of the application. The default value is the resizing frame.
+</dd>
+
+<dt>SC_HYPERVISOR_MAX_SPEED_GAP</dt>
+<dd>
+\anchor SC_HYPERVISOR_MAX_SPEED_GAP
+\addindex __env__SC_HYPERVISOR_MAX_SPEED_GAP
+Indicate the ratio of speed difference between contexts that should trigger the hypervisor.
+This situation may occur only when a theoretical speed could not be computed and the hypervisor
+has no value to compare the speed to. Otherwise the resizing of a context is not influenced by
+the speed of the other contexts, but only by the value that a context should have.
+</dd>
+
+<dt>SC_HYPERVISOR_STOP_PRINT</dt>
+<dd>
+\anchor SC_HYPERVISOR_STOP_PRINT
+\addindex __env__SC_HYPERVISOR_STOP_PRINT
+By default the values of the speed of the workers are printed during the execution
+of the application. If the value 1 is given to this environment variable this printing
+is not done.
+</dd>
+
+<dt>SC_HYPERVISOR_LAZY_RESIZE</dt>
+<dd>
+\anchor SC_HYPERVISOR_LAZY_RESIZE
+\addindex __env__SC_HYPERVISOR_LAZY_RESIZE
+By default the hypervisor resizes the contexts in a lazy way, that is workers are firstly added to a new context
+before removing them from the previous one. Once these workers are clearly taken into account
+in the new context (a task was popped there) we remove them from the previous one. However if the application
+would like the change in the distribution of workers to take effect right away, this variable should be set to 0.
+</dd>
+
+<dt>SC_HYPERVISOR_SAMPLE_CRITERIA</dt>
+<dd>
+\anchor SC_HYPERVISOR_SAMPLE_CRITERIA
+\addindex __env__SC_HYPERVISOR_SAMPLE_CRITERIA
+By default the hypervisor uses a sample of flops when computing the speed of the contexts and of the workers.
+If this variable is set to <c>time</c> the hypervisor uses a sample of time (10% of an approximation of the total
+execution time of the application)
+</dd>
+
+</dl>
+
 */

+ 1 - 0
doc/doxygen/chapters/files.doxy

@@ -33,6 +33,7 @@
 \file starpu_scheduler.h
 \file starpu_sched_node.h
 \file starpu_sched_ctx.h
+\file starpu_sched_ctx_hypervisor.h
 \file starpu_top.h
 \file starpu_hash.h
 \file starpu_rand.h

+ 22 - 4
doc/doxygen/chapters/mpi_support.doxy

@@ -232,7 +232,7 @@ task, and trigger the required MPI transfers.
 
 The list of functions is described in \ref MPIInsertTask "MPI Insert Task".
 
-Here an stencil example showing how to use starpu_mpi_insert_task(). One
+Here is a stencil example showing how to use starpu_mpi_task_insert(). One
 first needs to define a distribution function which specifies the
 locality of the data. Note that that distribution information needs to
 be given to StarPU by calling starpu_data_set_rank(). A MPI tag
@@ -291,14 +291,14 @@ data which will be needed by the tasks that we will execute.
     }
 \endcode
 
-Now starpu_mpi_insert_task() can be called for the different
+Now starpu_mpi_task_insert() can be called for the different
 steps of the application.
 
 \code{.c}
     for(loop=0 ; loop<niter; loop++)
         for (x = 1; x < X-1; x++)
             for (y = 1; y < Y-1; y++)
-                starpu_mpi_insert_task(MPI_COMM_WORLD, &stencil5_cl,
+                starpu_mpi_task_insert(MPI_COMM_WORLD, &stencil5_cl,
                                        STARPU_RW, data_handles[x][y],
                                        STARPU_R, data_handles[x-1][y],
                                        STARPU_R, data_handles[x+1][y],
@@ -365,7 +365,7 @@ for(x = 0; x < nblocks ;  x++) {
     if (data_handles[x]) {
         int owner = starpu_data_get_rank(data_handles[x]);
         if (owner == rank) {
-            starpu_insert_task(&cl, STARPU_RW, data_handles[x], 0);
+            starpu_task_insert(&cl, STARPU_RW, data_handles[x], 0);
         }
     }
 }
@@ -375,3 +375,21 @@ starpu_mpi_gather_detached(data_handles, nblocks, 0, MPI_COMM_WORLD);
 \endcode
 
-*/
+
+\section MPIExamples More MPI examples
+
+MPI examples are available in the StarPU source code in mpi/examples:
+
+<ul>
+<li><c>complex</c> is a simple example using a user-defined data interface over
+MPI (complex numbers),
+<li><c>stencil5</c> is a simple stencil example using starpu_mpi_task_insert(),
+<li><c>matrix_decomposition</c> is a Cholesky decomposition example using
+starpu_mpi_task_insert(). The non-distributed version can check for
+algorithm correctness in 1-node configuration, the distributed version uses
+exactly the same source code, to be used over MPI,
+<li><c>mpi_lu</c> is an LU decomposition example, provided in three versions:
+<c>plu_example</c> uses explicit MPI data transfers, <c>plu_implicit_example</c>
+uses implicit MPI data transfers, <c>plu_outofcore_example</c> uses implicit MPI
+data transfers and supports data matrices which do not fit in memory (out-of-core).
+</ul>
+
+*/

+ 13 - 4
doc/doxygen/chapters/optimize_performance.doxy

@@ -171,10 +171,13 @@ The <b>heft</b> (heterogeneous earliest finish time) scheduler is deprecated. It
 is now just an alias for <b>dmda</b>.
 
 The <b>pheft</b> (parallel HEFT) scheduler is similar to heft, it also supports
-parallel tasks (still experimental).
+parallel tasks (still experimental). Should not be used when several contexts using
+it are being executed simultaneously.
 
 The <b>peager</b> (parallel eager) scheduler is similar to eager, it also
-supports parallel tasks (still experimental).
+supports parallel tasks (still experimental). Should not be used when several 
+contexts using it are being executed simultaneously.
+
 
 \section PerformanceModelCalibration Performance Model Calibration
 
@@ -298,7 +301,7 @@ already gives the good results that a precise estimation would give.
 The scheduling policies <c>heft</c>, <c>dmda</c> and <c>pheft</c>
 perform data prefetch (see \ref STARPU_PREFETCH):
 as soon as a scheduling decision is taken for a task, requests are issued to
-transfer its required data to the target processing unit, if needeed, so that
+transfer its required data to the target processing unit, if needed, so that
 when the processing unit actually starts the task, its data will hopefully be
 already available and it will not have to wait for the transfer to finish.
 
@@ -357,6 +360,12 @@ task->execute_on_a_specific_worker = 1;
 task->worker = starpu_worker_get_by_type(STARPU_CUDA_WORKER, 0);
 \endcode
 
+Note however that using scheduling contexts while statically scheduling tasks on workers
+could be tricky. Be careful to schedule the tasks exactly on the workers of the corresponding
+contexts, otherwise the workers' corresponding scheduling structures may not be allocated or
+the execution of the application may deadlock. Moreover, the hypervisor should not be used when
+statically scheduling tasks.
+
 \section Profiling Profiling
 
 A quick view of how many tasks each worker has executed can be obtained by setting
@@ -516,7 +525,7 @@ on the desktop machine.
 
 If the desktop machine does not have CUDA or OpenCL, StarPU is still able to
 use simgrid to simulate execution with CUDA/OpenCL devices, but the application
-source code will probably disable the CUDA and OpenCL codelets in that
+source code will probably disable the CUDA and OpenCL codelets in that
 case. Since during simgrid execution, the functions of the codelet are actually
 not called, one can use dummy functions such as the following to still permit
 CUDA or OpenCL execution:

+ 20 - 2
doc/doxygen/chapters/performance_feedback.doxy

@@ -10,7 +10,7 @@
 
 \section UsingTheTemanejoTaskDebugger Using The Temanejo Task Debugger
 
-StarPU can connect to Temanejo (see
+StarPU can connect to Temanejo >= 1.0rc2 (see
 http://www.hlrs.de/temanejo), to permit
 nice visual task debugging. To do so, build Temanejo's <c>libayudame.so</c>,
 install <c>Ayudame.h</c> to e.g. <c>/usr/local/include</c>, apply the
@@ -35,7 +35,10 @@ call starpu_profiling_status_set() with the parameter
 is already enabled or not by calling starpu_profiling_status_get().
 Enabling monitoring also reinitialize all previously collected
 feedback. The environment variable \ref STARPU_PROFILING can also be
-set to <c>1</c> to achieve the same effect.
+set to <c>1</c> to achieve the same effect. The function
+starpu_profiling_init() can also be called during the execution to
+reinitialize performance counters and to start the profiling if the
+environment variable \ref STARPU_PROFILING is set to <c>1</c>.
 
 Likewise, performance monitoring is stopped by calling
 starpu_profiling_status_set() with the parameter
@@ -579,6 +582,21 @@ Computation took (in ms)
 Synthetic GFlops : 44.21
 \endverbatim
 
+\section DataTrace Data trace and tasks length
+It is possible to get statistics about task lengths and data sizes by using:
+\verbatim
+$ starpu_fxt_data_trace filename [codelet1 codelet2 ... codeletn]
+\endverbatim
+Where filename is the FxT trace file and codeletX the names of the codelets you 
+want to profile (if no names are specified, starpu_fxt_data_trace will use them all). 
+This will create a file, <c>data_trace.gp</c> which
+can be plotted to get a .eps image of these results. On the image, each point represents a 
+task, and each color corresponds to a codelet.
+
+\image html data_trace.png
+\image latex data_trace.eps "" width=\textwidth
+
+
 \internal
 TODO: data transfer stats are similar to the ones displayed when
 setting STARPU_BUS_STATS

+ 98 - 25
doc/doxygen/chapters/scheduling_context_hypervisor.doxy

@@ -10,8 +10,8 @@
 
 \section WhatIsTheHypervisor What Is The Hypervisor
 
-StarPU proposes a platform for constructing Scheduling Contexts, for
-deleting and modifying them dynamically. A parallel kernel, can thus
+StarPU proposes a platform to construct Scheduling Contexts, to
+delete and modify them dynamically. A parallel kernel can thus
 be isolated into a scheduling context and interferences between
 several parallel kernels are avoided. If the user knows exactly how
 many workers each scheduling context needs, he can assign them to the
@@ -31,11 +31,11 @@ platform for implementing additional custom ones is available.
 
 \section StartTheHypervisor Start the Hypervisor
 
-The Hypervisor must be initialised once at the beging of the
+The Hypervisor must be initialized once at the beginning of the
 application. At this point a resizing policy should be indicated. This
 strategy depends on the information the application is able to provide
 to the hypervisor as well as on the accuracy needed for the resizing
-procedure. For exemple, the application may be able to provide an
+procedure. For example, the application may be able to provide an
 estimation of the workload of the contexts. In this situation the
 hypervisor may decide what resources the contexts need. However, if no
 information is provided the hypervisor evaluates the behavior of the
@@ -46,17 +46,25 @@ The hypervisor resizes only the registered contexts.
 
 The runtime provides the hypervisor with information concerning the
 behavior of the resources and the application. This is done by using
-the performance_counters, some callbacks indicating when the resources
-are idle or not efficient, when the application submits tasks or when
-it becames to slow.
+the <c>performance_counters</c> which represent callbacks indicating 
+when the resources are idle or not efficient, when the application 
+submits tasks or when it becomes too slow.
 
 \section TriggerTheHypervisor Trigger the Hypervisor
 
-The resizing is triggered either when the application requires it or
+The resizing is triggered either when the application requires it 
+(<c> sc_hypervisor_resize_ctxs </c>) or
 when the initials distribution of resources alters the performance of
-the application( the application is to slow or the resource are idle
-for too long time, threashold indicated by the user). When this
-happens different resizing strategy are applied that target minimising
+the application (the application is too slow or the resources are idle
+for too long time). If the environment 
+variable <c>SC_HYPERVISOR_TRIGGER_RESIZE</c> is set to <c>speed</c> 
+the monitored speed of the contexts is compared to a theoretical value
+computed with a linear program, and the resizing is triggered
+whenever the two values do not correspond. Otherwise, if the environment 
+variable is set to <c>idle</c> the hypervisor triggers the resizing algorithm
+whenever the workers are idle for a period longer than the threshold 
+indicated by the programmer. When this
+happens different resizing strategies are applied that target minimizing
 the total execution of the application, the instant speed or the idle
 time of the resources.
 
@@ -68,7 +76,7 @@ The <b>Application driven</b> strategy uses the user's input concerning the mome
 Thus, the users tags the task that should trigger the resizing
 process. We can set directly the field starpu_task::hypervisor_tag or
 use the macro ::STARPU_HYPERVISOR_TAG in the function
-starpu_insert_task().
+starpu_task_insert().
 
 \code{.c}
 task.hypervisor_tag = 2;
@@ -77,7 +85,7 @@ task.hypervisor_tag = 2;
 or
 
 \code{.c}
-starpu_insert_task(&codelet,
+starpu_task_insert(&codelet,
 		    ...,
 		    STARPU_HYPERVISOR_TAG, 2,
                     0);
@@ -92,29 +100,28 @@ sc_hypervisor_resize(sched_ctx, 2);
 The user can use the same tag to change the resizing configuration of the contexts if he considers it necessary.
 
 \code{.c}
-sc_hypervisor_ioctl(sched_ctx,
-                    HYPERVISOR_MIN_WORKERS, 6,
-                    HYPERVISOR_MAX_WORKERS, 12,
-                    HYPERVISOR_TIME_TO_APPLY, 2,
+sc_hypervisor_ctl(sched_ctx,
+                    SC_HYPERVISOR_MIN_WORKERS, 6,
+                    SC_HYPERVISOR_MAX_WORKERS, 12,
+                    SC_HYPERVISOR_TIME_TO_APPLY, 2,
                     NULL);
 \endcode
 
 
-The <b>Idleness</b> based strategy resizes the scheduling contexts every time one of their workers stays idle
-for a period longer than the one imposed by the user
+The <b>Idleness</b> based strategy moves workers unused in a certain context to another one needing them.
 (see \ref UsersInputInTheResizingProcess "Users’ Input In The Resizing Process")
 
 \code{.c}
 int workerids[3] = {1, 3, 10};
 int workerids2[9] = {0, 2, 4, 5, 6, 7, 8, 9, 11};
-sc_hypervisor_ioctl(sched_ctx_id,
-            HYPERVISOR_MAX_IDLE, workerids, 3, 10000.0,
-            HYPERVISOR_MAX_IDLE, workerids2, 9, 50000.0,
+sc_hypervisor_ctl(sched_ctx_id,
+            SC_HYPERVISOR_MAX_IDLE, workerids, 3, 10000.0,
+            SC_HYPERVISOR_MAX_IDLE, workerids2, 9, 50000.0,
             NULL);
 \endcode
 
 The <b>Gflops rate</b> based strategy resizes the scheduling contexts such that they all finish at the same time.
-The speed of each of them is considered and once one of them is significantly slower the resizing process is triggered.
+The speed of each of them is computed and once one of them is significantly slower the resizing process is triggered.
 In order to do these computations the user has to input the total number of instructions needed to be executed by the
 parallel kernels and the number of instruction to be executed by each
 task.
@@ -124,7 +131,7 @@ The number of flops to be executed by a context are passed as
  (<c>sc_hypervisor_register_ctx(sched_ctx_id, flops)</c>) and the one
  to be executed by each task are passed when the task is submitted.
  The corresponding field is starpu_task::flops and the corresponding
- macro in the function starpu_insert_task() is ::STARPU_FLOPS
+ macro in the function starpu_task_insert() is ::STARPU_FLOPS
  (<b>Caution</b>: but take care of passing a double, not an integer,
  otherwise parameter passing will be bogus). When the task is executed
  the resizing process is triggered.
@@ -136,10 +143,76 @@ task.flops = 100;
 or
 
 \code{.c}
-starpu_insert_task(&codelet,
+starpu_task_insert(&codelet,
                     ...,
                     STARPU_FLOPS, (double) 100,
                     0);
 \endcode
 
+The <b>Feft</b> strategy uses a linear program to predict the best distribution of resources
+such that the application finishes in a minimum amount of time. As for the <b>Gflops rate </b>
+strategy the programmer has to indicate the total number of flops to be executed
+when registering the context. This number of flops may be updated dynamically during the execution
+of the application whenever this information is not very accurate from the beginning.
+The function <c>sc_hypervisor_update_diff_total_flop </c> is called in order to add or remove
+a difference to the flops left to be executed.
+Tasks are provided also the number of flops corresponding to each one of them. During the 
+execution of the application the hypervisor monitors the consumed flops and recomputes
+the time left and the number of resources to use. The speed of each type of resource
+is (re)evaluated and inserted into the linear program in order to better adapt to the
+needs of the application.
+
+The <b>Teft</b> strategy uses a linear program too, that considers all the types of tasks
+and the number of each of them and it tries to allocate resources such that the application
+finishes in a minimum amount of time. A previous calibration of StarPU would be useful
+in order to have good predictions of the execution time of each type of task.
+
+The types of tasks may be determined directly by the hypervisor when they are submitted.
+However there are applications that do not expose all the graph of tasks from the beginning.
+In this case in order to let the hypervisor know about all the tasks the function
+<c> sc_hypervisor_set_type_of_task </c> will just inform the hypervisor about future tasks
+without submitting them right away.
+
+The <b>Ispeed </b> strategy divides the execution of the application in several frames.
+For each frame the hypervisor computes the speed of the contexts and tries making them
+run at the same speed. The strategy requires less contribution from the user as
+the hypervisor requires only the size of the frame in terms of flops.
+
+\code{.c}
+int workerids[3] = {1, 3, 10};
+int workerids2[9] = {0, 2, 4, 5, 6, 7, 8, 9, 11};
+sc_hypervisor_ctl(sched_ctx_id,
+                  SC_HYPERVISOR_ISPEED_W_SAMPLE, workerids, 3, 2000000000.0,
+                  SC_HYPERVISOR_ISPEED_W_SAMPLE, workerids2, 9, 200000000000.0,
+                  SC_HYPERVISOR_ISPEED_CTX_SAMPLE, 60000000000.0,
+            NULL);
+\endcode
+
+The <b>Throughput </b> strategy focuses on maximizing the throughput of the resources
+and resizes the contexts such that the machine is running at its maximum efficiency
+(maximum instant speed of the workers).
+
+\section DefiningANewHypervisorPolicy Defining A New Hypervisor Policy
+
+While the Scheduling Context Hypervisor plugin comes with a variety of
+resizing policies (see \ref ResizingStrategies), it may sometimes be
+desirable to implement custom policies to address specific problems.
+The API described below allows users to write their own resizing policy.
+
+Here is an example of how to define a new policy
+
+\code{.c}
+struct sc_hypervisor_policy dummy_policy =
+{
+       .handle_poped_task = dummy_handle_poped_task,
+       .handle_pushed_task = dummy_handle_pushed_task,
+       .handle_idle_cycle = dummy_handle_idle_cycle,
+       .handle_idle_end = dummy_handle_idle_end,
+       .handle_post_exec_hook = dummy_handle_post_exec_hook,
+       .custom = 1,
+       .name = "dummy"
+};
+\endcode
+
+
 */

+ 49 - 25
doc/doxygen/chapters/scheduling_contexts.doxy

@@ -1,6 +1,6 @@
 /*
  * This file is part of the StarPU Handbook.
- * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+ * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
  * See the file version.doxy for copying conditions.
@@ -19,34 +19,44 @@ to minimize interferences between the execution of multiple parallel
 kernels, by partitioning the underlying pool of workers using
 contexts.
 
+
 \section CreatingAContext Creating A Context
 
 By default, the application submits tasks to an initial context, which
-disposes of all the computation ressources available to StarPU (all
+disposes of all the computation resources available to StarPU (all
 the workers). If the application programmer plans to launch several
-parallel kernels simultaneusly, by default these kernels will be
+parallel kernels simultaneously, by default these kernels will be
 executed within this initial context, using a single scheduler
 policy(see \ref TaskSchedulingPolicy). Meanwhile, if the application
 programmer is aware of the demands of these kernels and of the
 specificity of the machine used to execute them, the workers can be
 divided between several contexts. These scheduling contexts will
 isolate the execution of each kernel and they will permit the use of a
-scheduling policy proper to each one of them. In order to create the
-contexts, you have to know the indentifiers of the workers running
-within StarPU. By passing a set of workers together with the
-scheduling policy to the function starpu_sched_ctx_create(), you will
-get an identifier of the context created which you will use to
+scheduling policy proper to each one of them. 
+
+Scheduling Contexts may be created in two ways: either the programmer indicates
+the set of workers corresponding to each context (provided he knows the
+identifiers of the workers running within StarPU), or the programmer
+does not provide any worker list and lets the Hypervisor assign
+workers to each context according to their needs (\ref SchedulingContextHypervisor)
+
+Both cases require a call to the function <c>starpu_sched_ctx_create</c>, which 
+requires as input the worker list (the exact list or a NULL pointer) and the scheduling
+policy. The latter one can be a character string corresponding to the name of a StarPU
+predefined policy or the pointer to a custom policy. The function returns 
+an identifier of the context created which you will use to
 indicate the context you want to submit the tasks to.
 
+
 \code{.c}
-/* the list of ressources the context will manage */
+/* the list of resources the context will manage */
 int workerids[3] = {1, 3, 10};
 
 /* indicate the scheduling policy to be used within the context, the list of
    workers assigned to it, the number of workers, the name of the context */
 int id_ctx = starpu_sched_ctx_create("dmda", workerids, 3, "my_ctx");
 
-/* let StarPU know that the folowing tasks will be submitted to this context */
+/* let StarPU know that the following tasks will be submitted to this context */
 starpu_sched_ctx_set_task_context(id);
 
 /* submit the task to StarPU */
@@ -77,19 +87,32 @@ starpu_sched_ctx_add_workers(workerids, 3, sched_ctx2);
 starpu_sched_ctx_remove_workers(workerids, 3, sched_ctx1);
 \endcode
 
+\section SubmittingTasksToAContext Submitting Tasks To A Context
+The application may submit tasks to several contexts either
+simultaneously or sequentially. If several threads of submission
+are used the function <c>starpu_sched_ctx_set_context</c> may be called just
+before <c>starpu_task_submit</c>. Thus StarPU considers that
+the current thread will submit tasks to the corresponding context.
+ 
+When the application may not assign a thread of submission to each
+context, the id of the context must be indicated by using the
+function <c>starpu_task_submit_to_ctx</c> or the field <c>STARPU_SCHED_CTX</c> 
+for starpu_task_insert().
+
 \section DeletingAContext Deleting A Context
 
 When a context is no longer needed it must be deleted. The application
 can indicate which context should keep the resources of a deleted one.
-All the tasks of the context should be executed before doing this. If
-the application need to avoid a barrier before moving the resources
-from the deleted context to the inheritor one, the application can
-just indicate when the last task was submitted. Thus, when this last
-task was submitted the resources will be move, but the context should
-still be deleted at some point of the application.
+All the tasks of the context should be executed before doing this. 
+Thus, the programmer may either use a barrier and then delete the context
+directly, or just indicate
+that other tasks will not be submitted later on to the context (such that when 
+the last task is executed its workers will be moved to the inheritor)
+and delete the context at the end of the execution (when a barrier will
+be used eventually).
 
 \code{.c}
-/* when the context 2 will be deleted context 1 will be keep its resources */
+/* when the context 2 is deleted context 1 inherits its resources */
 starpu_sched_ctx_set_inheritor(sched_ctx2, sched_ctx1);
 
 /* submit tasks to context 2 */
@@ -98,7 +121,7 @@ for (i = 0; i < ntasks; i++)
 
 /* indicate that context 2 finished submitting and that */
 /* as soon as the last task of context 2 finished executing */
-/* its workers can be mobed to the inheritor context */
+/* its workers can be moved to the inheritor context */
 starpu_sched_ctx_finished_submit(sched_ctx1);
 
 /* wait for the tasks of both contexts to finish */
@@ -113,14 +136,15 @@ starpu_sched_ctx_delete(sched_ctx1);
 
 \section EmptyingAContext Emptying A Context
 
-A context may not have any resources at the begining or at a certain
+A context may have no resources at the beginning or at a certain
 moment of the execution. Task can still be submitted to these contexts
-and they will execute them as soon as they will have resources. A list
+and they will be executed as soon as the contexts will have resources. A list
 of tasks pending to be executed is kept and when workers are added to
-the contexts the tasks are submitted. However, if no resources are
-allocated the program will not terminate. If these tasks have not much
-priority the programmer can forbid the application to submitted them
-by calling the function starpu_sched_ctx_stop_task_submission().
+the contexts these tasks start being submitted. However, if resources 
+are never allocated to the context the program will not terminate. 
+If these tasks have low
+priority the programmer can forbid the application from submitting them
+by calling the function <c>starpu_sched_ctx_stop_task_submission()</c>.
 
 \section ContextsSharingWorkers Contexts Sharing Workers
 
@@ -129,7 +153,7 @@ efficiently enough alone on these workers or when the application
 decides to express a hierarchy of contexts. The workers apply an
 alogrithm of ``Round-Robin'' to chose the context on which they will
 ``pop'' next. By using the function
-starpu_sched_ctx_set_turn_to_other_ctx(), the programmer can impose
+<c>starpu_sched_ctx_set_turn_to_other_ctx</c>, the programmer can impose
 the <c>workerid</c> to ``pop'' in the context <c>sched_ctx_id</c>
 next.
 

+ 2 - 0
doc/doxygen/doxygen-config.cfg.in

@@ -18,6 +18,7 @@
 
 INPUT                  = @top_srcdir@/doc/doxygen/chapters \
 		       	 @top_srcdir@/doc/doxygen/chapters/api \
+		       	 @top_srcdir@/doc/doxygen/chapters/api/sc_hypervisor \
                          @top_builddir@/doc/doxygen/starpu_config.h \
 	 		 @top_srcdir@/include/starpu_bound.h \
 			 @top_srcdir@/include/starpu_cublas.h \
@@ -39,6 +40,7 @@ INPUT                  = @top_srcdir@/doc/doxygen/chapters \
 			 @top_srcdir@/include/starpu_rand.h \
 			 @top_srcdir@/include/starpu_scc.h \
 			 @top_srcdir@/include/starpu_sched_ctx.h \
+			 @top_srcdir@/include/starpu_sched_ctx_hypervisor.h \
 			 @top_srcdir@/include/starpu_scheduler.h \
 			 @top_srcdir@/include/starpu_sink.h \
 			 @top_srcdir@/include/starpu_stdlib.h \

+ 2 - 1
doc/doxygen/doxygen.cfg

@@ -639,7 +639,7 @@ CITE_BIB_FILES         =
 # The QUIET tag can be used to turn on/off the messages that are generated
 # by doxygen. Possible values are YES and NO. If left blank NO is used.
 
-QUIET                  = NO
+QUIET                  = YES
 
 # The WARNINGS tag can be used to turn on/off the warning messages that are
 # generated by doxygen. Possible values are YES and NO. If left blank
@@ -1622,6 +1622,7 @@ PREDEFINED             = STARPU_USE_OPENCL=1 \
 			 STARPU_USE_MPI=1 \
 			 STARPU_HAVE_HWLOC=1 \
 			 STARPU_USE_SC_HYPERVISOR=1 \
+			 STARPU_SIMGRID=1 \
                          __GCC__
 
 # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then

+ 5 - 4
doc/doxygen/refman.tex

@@ -40,7 +40,6 @@
 \lstset{language=C++,inputencoding=utf8,basicstyle=\footnotesize,breaklines=true,breakatwhitespace=true,tabsize=8,numbers=left }
 \makeindex
 \setcounter{tocdepth}{3}
-\renewcommand{\footrulewidth}{0.4pt}
 \renewcommand{\familydefault}{\sfdefault}
 \hfuzz=15pt
 \setlength{\emergencystretch}{15pt}
@@ -85,10 +84,8 @@ Documentation License”.
 \end{quote}
 \end{figure}
 
-\clearemptydoublepage
 \pagenumbering{roman}
 \tableofcontents
-\clearemptydoublepage
 \pagenumbering{arabic}
 \hypersetup{pageanchor=true,citecolor=blue}
 
@@ -198,6 +195,8 @@ Documentation License”.
 \input{group__API__Versioning}
 \input{group__API__Initialization__and__Termination}
 \input{group__API__Standard__Memory__Library}
+\input{group__API__Toolbox}
+\input{group__API__Threads}
 \input{group__API__Workers__Properties}
 \input{group__API__Data__Management}
 \input{group__API__Data__Interfaces}
@@ -227,7 +226,8 @@ Documentation License”.
 \input{group__API__StarPUTop__Interface}
 \input{group__API__Scheduling__Contexts}
 \input{group__API__Scheduling__Policy}
-\input{group__API__Scheduling__Context__Hypervisor}
+\input{group__API__SC__Hypervisor__usage}
+\input{group__API__SC__Hypervisor}
 \input{group__API__Modularized__Scheduler}
 
 \chapter{File Index}
@@ -258,6 +258,7 @@ Documentation License”.
 \input{starpu__rand_8h}
 \input{starpu__scc_8h}
 \input{starpu__sched__ctx_8h}
+\input{starpu__sched__ctx__hypervisor_8h}
 \input{starpu__scheduler_8h}
 \input{starpu__sink_8h}
 \input{starpu__stdlib_8h}

+ 77 - 0
doc/tutorial/hello_world_mvsc.c

@@ -0,0 +1,77 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2011, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+
+struct params
+{
+	int i;
+	float f;
+};
+
+void cpu_func(void *buffers[], void *cl_arg)
+{
+	struct params *params = cl_arg;
+
+	printf("Hello world (params = {%i, %f} )\n", params->i, params->f);
+}
+
+void callback_func(void *callback_arg)
+{
+	printf("Callback function (arg %x)\n", callback_arg);
+}
+
+int main(int argc, char **argv)
+{
+	int ret;
+	struct starpu_codelet cl;
+	struct starpu_task *task;
+	struct params params;
+
+	starpu_codelet_init(&cl);
+	cl.cpu_funcs[0] = cpu_func;
+	cl.cpu_funcs[1] = NULL;
+	cl.nbuffers = 0;
+
+	/* initialize StarPU */
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	task = starpu_task_create();
+
+	task->cl = &cl; /* Pointer to the codelet defined above */
+
+	params.i = 1;
+	params.f = 2.0f;
+	task->cl_arg = &params;
+	task->cl_arg_size = sizeof(params);
+
+	task->callback_func = callback_func;
+	task->callback_arg = (void*) (uintptr_t) 0x42;
+
+	/* starpu_task_submit will be a blocking call */
+	task->synchronous = 1;
+
+	/* submit the task to StarPU */
+	ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+	/* terminate StarPU */
+	starpu_shutdown();
+
+	return 0;
+}

+ 10 - 6
examples/Makefile.am

@@ -20,7 +20,7 @@ AM_CFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STAR
 AM_CXXFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ $(MAGMA_LIBS) $(HWLOC_LIBS) @LIBS@
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include
-AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(STARPU_COI_LDFLAGS)
+AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)
 
 SUBDIRS = stencil
 
@@ -46,10 +46,6 @@ EXTRA_DIST = 					\
 	lu/xlu_implicit_pivot.c			\
 	lu/xlu_kernels.c			\
 	lu/lu_example.c				\
-	sched_ctx_utils/sched_ctx_utils.c			\
-	sched_ctx/sched_ctx.c					\
-	sched_ctx/parallel_code.c				\
-	sched_ctx/dummy_sched_with_ctx.c			\
 	incrementer/incrementer_kernels_opencl_kernel.cl 	\
 	basic_examples/variable_kernels_opencl_kernel.cl	\
 	matvecmult/matvecmult_kernel.cl				\
@@ -182,6 +178,7 @@ examplebin_PROGRAMS +=				\
 	spmd/vector_scal_spmd			\
 	spmv/spmv				\
 	callback/callback			\
+	callback/prologue			\
 	incrementer/incrementer			\
 	binary/binary				\
 	interface/complex			\
@@ -190,6 +187,7 @@ examplebin_PROGRAMS +=				\
 	sched_ctx/sched_ctx			\
 	sched_ctx/parallel_code			\
 	sched_ctx/dummy_sched_with_ctx		\
+	sched_ctx/prio				\
 	reductions/dot_product			\
 	reductions/minmax_reduction		\
 	mandelbrot/mandelbrot			\
@@ -256,6 +254,7 @@ STARPU_EXAMPLES +=				\
 	spmd/vector_scal_spmd			\
 	spmv/spmv				\
 	callback/callback			\
+	callback/prologue			\
 	incrementer/incrementer			\
 	binary/binary				\
 	interface/complex			\
@@ -263,11 +262,16 @@ STARPU_EXAMPLES +=				\
 	profiling/profiling			\
 	scheduler/dummy_sched			\
 	sched_ctx/sched_ctx			\
-	sched_ctx/parallel_code			\
+	sched_ctx/prio				\
 	sched_ctx/dummy_sched_with_ctx		\
 	reductions/dot_product			\
 	reductions/minmax_reduction
 
+if STARPU_LONG_CHECK
+STARPU_EXAMPLES +=				\
+	sched_ctx/parallel_code
+endif
+
 if STARPU_HAVE_F77_H
 STARPU_EXAMPLES +=				\
 	basic_examples/vector_scal_fortran

+ 4 - 4
examples/basic_examples/dynamic_handles.c

@@ -112,12 +112,12 @@ int main(int argc, char **argv)
 	if (ret == -ENODEV) goto enodev;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 
-	ret = starpu_insert_task(&dummy_small_cl,
+	ret = starpu_task_insert(&dummy_small_cl,
 				 STARPU_VALUE, &dummy_small_cl.nbuffers, sizeof(dummy_small_cl.nbuffers),
 				 STARPU_RW, handle,
 				 0);
 	if (ret == -ENODEV) goto enodev;
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
         ret = starpu_task_wait_for_all();
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
 
@@ -126,12 +126,12 @@ int main(int argc, char **argv)
 	{
 		handles[i] = handle;
 	}
-	ret = starpu_insert_task(&dummy_big_cl,
+	ret = starpu_task_insert(&dummy_big_cl,
 				 STARPU_VALUE, &dummy_big_cl.nbuffers, sizeof(dummy_big_cl.nbuffers),
 				 STARPU_DATA_ARRAY, handles, dummy_big_cl.nbuffers,
 				 0);
 	if (ret == -ENODEV) goto enodev;
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
         ret = starpu_task_wait_for_all();
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
 	free(handles);

+ 1 - 1
examples/binary/binary.c

@@ -67,7 +67,7 @@ int compute(char *file_name, int load_as_file)
 
 	for (i = 0; i < niter; i++)
 	{
-		ret = starpu_insert_task(&cl, STARPU_RW, float_array_handle, 0);
+		ret = starpu_task_insert(&cl, STARPU_RW, float_array_handle, 0);
 		if (STARPU_UNLIKELY(ret == -ENODEV))
 		{
 			FPRINTF(stderr, "No worker may execute this task\n");

+ 104 - 0
examples/callback/prologue.c

@@ -0,0 +1,104 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010, 2013  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <sys/time.h>
+
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
+starpu_data_handle_t handle;
+
+void cpu_codelet(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+{
+	int *val = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
+
+	*val += 1;
+}
+
+struct starpu_codelet cl =
+{
+	.modes = { STARPU_RW },
+	.cpu_funcs = {cpu_codelet, NULL},
+	.nbuffers = 1,
+	.name = "callback"
+};
+
+void callback_func(void *callback_arg)
+{
+	int ret;
+
+	struct starpu_task *task = starpu_task_create();
+	task->cl = &cl;
+	task->handles[0] = handle;
+
+	ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+}
+
+void prologue_callback_func(void *callback_arg)
+{
+	double *x = (double*)callback_arg;
+	printf("x = %lf\n", *x);
+}
+
+
+int main(int argc, char **argv)
+{
+	int v=40;
+	int ret;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	starpu_variable_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)&v, sizeof(int));
+
+	struct starpu_task *task = starpu_task_create();
+	task->cl = &cl;
+	task->prologue_callback_func = callback_func;
+	task->prologue_callback_arg = NULL;
+	task->handles[0] = handle;
+
+	ret = starpu_task_submit(task);
+	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+	double *x = (double*)malloc(sizeof(double));
+	*x = -999.0;
+	int ret2 = starpu_task_insert(&cl,
+				      STARPU_RW, handle,
+				      STARPU_PROLOGUE_CALLBACK, prologue_callback_func,
+				      STARPU_PROLOGUE_CALLBACK_ARG, x,
+				      0);
+
+
+	starpu_task_wait_for_all();
+	starpu_data_unregister(handle);
+
+	FPRINTF(stderr, "v -> %d\n", v);
+
+	free(x);
+
+	starpu_shutdown();
+
+	return 0;
+
+enodev:
+	starpu_shutdown();
+	return 77;
+}

+ 12 - 12
examples/cg/cg_kernels.c

@@ -288,20 +288,20 @@ int dot_kernel(starpu_data_handle_t v1,
 	if (use_reduction)
 		starpu_data_invalidate_submit(s);
 	else {
-		ret = starpu_insert_task(&bzero_variable_cl, STARPU_W, s, 0);
+		ret = starpu_task_insert(&bzero_variable_cl, STARPU_W, s, 0);
 		if (ret == -ENODEV) return ret;
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 	}
 
 	unsigned b;
 	for (b = 0; b < nblocks; b++)
 	{
-		ret = starpu_insert_task(&dot_kernel_cl,
+		ret = starpu_task_insert(&dot_kernel_cl,
 					 use_reduction?STARPU_REDUX:STARPU_RW, s,
 					 STARPU_R, starpu_data_get_sub_data(v1, 1, b),
 					 STARPU_R, starpu_data_get_sub_data(v2, 1, b),
 					 0);
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 	}
 	return 0;
 }
@@ -442,12 +442,12 @@ int gemv_kernel(starpu_data_handle_t v1,
 
 	for (b2 = 0; b2 < nblocks; b2++)
 	{
-		ret = starpu_insert_task(&scal_kernel_cl,
+		ret = starpu_task_insert(&scal_kernel_cl,
 					 STARPU_RW, starpu_data_get_sub_data(v1, 1, b2),
 					 STARPU_VALUE, &p1, sizeof(p1),
 					 0);
 		if (ret == -ENODEV) return ret;
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 	}
 
 	for (b2 = 0; b2 < nblocks; b2++)
@@ -455,14 +455,14 @@ int gemv_kernel(starpu_data_handle_t v1,
 		for (b1 = 0; b1 < nblocks; b1++)
 		{
 			TYPE one = 1.0;
-			ret = starpu_insert_task(&gemv_kernel_cl,
+			ret = starpu_task_insert(&gemv_kernel_cl,
 						 use_reduction?STARPU_REDUX:STARPU_RW,	starpu_data_get_sub_data(v1, 1, b2),
 						 STARPU_R,	starpu_data_get_sub_data(matrix, 2, b2, b1),
 						 STARPU_R,	starpu_data_get_sub_data(v2, 1, b1),
 						 STARPU_VALUE,	&one,	sizeof(one),
 						 STARPU_VALUE,	&p2,	sizeof(p2),
 						 0);
-			STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 		}
 	}
 	return 0;
@@ -535,14 +535,14 @@ int scal_axpy_kernel(starpu_data_handle_t v1, TYPE p1,
 	unsigned b;
 	for (b = 0; b < nblocks; b++)
 	{
-		ret = starpu_insert_task(&scal_axpy_kernel_cl,
+		ret = starpu_task_insert(&scal_axpy_kernel_cl,
 					 STARPU_RW, starpu_data_get_sub_data(v1, 1, b),
 					 STARPU_R,  starpu_data_get_sub_data(v2, 1, b),
 					 STARPU_VALUE, &p1, sizeof(p1),
 					 STARPU_VALUE, &p2, sizeof(p2),
 					 0);
 		if (ret == -ENODEV) return ret;
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 	}
 	return 0;
 }
@@ -609,13 +609,13 @@ int axpy_kernel(starpu_data_handle_t v1,
 	unsigned b;
 	for (b = 0; b < nblocks; b++)
 	{
-		ret = starpu_insert_task(&axpy_kernel_cl,
+		ret = starpu_task_insert(&axpy_kernel_cl,
 					 STARPU_RW, starpu_data_get_sub_data(v1, 1, b),
 					 STARPU_R,  starpu_data_get_sub_data(v2, 1, b),
 					 STARPU_VALUE, &p1, sizeof(p1),
 					 0);
 		if (ret == -ENODEV) return ret;
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 	}
 	return 0;
 }

+ 11 - 3
examples/cholesky/cholesky.h

@@ -132,15 +132,23 @@ void chol_cpu_codelet_update_u11(void **, void *);
 void chol_cpu_codelet_update_u21(void **, void *);
 void chol_cpu_codelet_update_u22(void **, void *);
 
+double cpu_chol_task_11_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
+double cpu_chol_task_21_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
+double cpu_chol_task_22_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
+
 #ifdef STARPU_USE_CUDA
 void chol_cublas_codelet_update_u11(void *descr[], void *_args);
 void chol_cublas_codelet_update_u21(void *descr[], void *_args);
 void chol_cublas_codelet_update_u22(void *descr[], void *_args);
+
+double cuda_chol_task_11_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
+double cuda_chol_task_21_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
+double cuda_chol_task_22_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
 #endif
 
-extern struct starpu_perfmodel chol_model_11;
-extern struct starpu_perfmodel chol_model_21;
-extern struct starpu_perfmodel chol_model_22;
+void initialize_chol_model(struct starpu_perfmodel* model, char* symbol, 
+		double (*cpu_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned), 
+		double (*cuda_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned));
 
 static void STARPU_ATTRIBUTE_UNUSED parse_args(int argc, char **argv)
 {

+ 14 - 0
examples/cholesky/cholesky_grain_tag.c

@@ -18,6 +18,10 @@
 
 #include "cholesky.h"
 
+struct starpu_perfmodel chol_model_11;
+struct starpu_perfmodel chol_model_21;
+struct starpu_perfmodel chol_model_22;
+
 /*
  *	Some useful functions
  */
@@ -292,6 +296,16 @@ static void initialize_system(float **A, unsigned dim, unsigned pinned)
 		exit(77);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
+#ifdef STARPU_USE_CUDA
+	initialize_chol_model(&chol_model_11,"chol_model_11",cpu_chol_task_11_cost,cuda_chol_task_11_cost);
+	initialize_chol_model(&chol_model_21,"chol_model_21",cpu_chol_task_21_cost,cuda_chol_task_21_cost);
+	initialize_chol_model(&chol_model_22,"chol_model_22",cpu_chol_task_22_cost,cuda_chol_task_22_cost);
+#else
+	initialize_chol_model(&chol_model_11,"chol_model_11",cpu_chol_task_11_cost,NULL);
+	initialize_chol_model(&chol_model_21,"chol_model_21",cpu_chol_task_21_cost,NULL);
+	initialize_chol_model(&chol_model_22,"chol_model_22",cpu_chol_task_22_cost,NULL);
+#endif
+
 	starpu_cublas_init();
 
 #ifndef STARPU_SIMGRID

+ 20 - 7
examples/cholesky/cholesky_implicit.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2009-2013  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -21,6 +21,9 @@
 /*
  *	Create the codelets
  */
+struct starpu_perfmodel chol_model_11;
+struct starpu_perfmodel chol_model_21;
+struct starpu_perfmodel chol_model_22;
 
 static struct starpu_codelet cl11 =
 {
@@ -96,27 +99,27 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 	{
                 starpu_data_handle_t sdatakk = starpu_data_get_sub_data(dataA, 2, k, k);
 
-                ret = starpu_insert_task(&cl11,
+                ret = starpu_task_insert(&cl11,
 					 STARPU_PRIORITY, prio_level,
 					 STARPU_RW, sdatakk,
 					 STARPU_CALLBACK, (k == 3*nblocks/4)?callback_turn_spmd_on:NULL,
 					 STARPU_FLOPS, (double) FLOPS_SPOTRF(nn),
 					 0);
 		if (ret == -ENODEV) return 77;
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 
 		for (j = k+1; j<nblocks; j++)
 		{
                         starpu_data_handle_t sdatakj = starpu_data_get_sub_data(dataA, 2, k, j);
 
-                        ret = starpu_insert_task(&cl21,
+                        ret = starpu_task_insert(&cl21,
 						 STARPU_PRIORITY, (j == k+1)?prio_level:STARPU_DEFAULT_PRIO,
 						 STARPU_R, sdatakk,
 						 STARPU_RW, sdatakj,
 						 STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
 						 0);
 			if (ret == -ENODEV) return 77;
-			STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 
 			for (i = k+1; i<nblocks; i++)
 			{
@@ -125,7 +128,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 					starpu_data_handle_t sdataki = starpu_data_get_sub_data(dataA, 2, k, i);
 					starpu_data_handle_t sdataij = starpu_data_get_sub_data(dataA, 2, i, j);
 
-					ret = starpu_insert_task(&cl22,
+					ret = starpu_task_insert(&cl22,
 								 STARPU_PRIORITY, ((i == k+1) && (j == k+1))?prio_level:STARPU_DEFAULT_PRIO,
 								 STARPU_R, sdataki,
 								 STARPU_R, sdatakj,
@@ -133,7 +136,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 								 STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
 								 0);
 					if (ret == -ENODEV) return 77;
-					STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+					STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
                                 }
 			}
 		}
@@ -346,6 +349,16 @@ int main(int argc, char **argv)
                 return 77;
         STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
+#ifdef STARPU_USE_CUDA
+	initialize_chol_model(&chol_model_11,"chol_model_11",cpu_chol_task_11_cost,cuda_chol_task_11_cost);
+	initialize_chol_model(&chol_model_21,"chol_model_21",cpu_chol_task_21_cost,cuda_chol_task_21_cost);
+	initialize_chol_model(&chol_model_22,"chol_model_22",cpu_chol_task_22_cost,cuda_chol_task_22_cost);
+#else
+	initialize_chol_model(&chol_model_11,"chol_model_11",cpu_chol_task_11_cost,NULL);
+	initialize_chol_model(&chol_model_21,"chol_model_21",cpu_chol_task_21_cost,NULL);
+	initialize_chol_model(&chol_model_22,"chol_model_22",cpu_chol_task_22_cost,NULL);
+#endif
+
 	starpu_cublas_init();
 
 	if(with_ctxs)

+ 19 - 39
examples/cholesky/cholesky_models.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -22,10 +22,11 @@
  */
 
 /*
- *	Number of flops of Gemm 
+ *	Number of flops of Gemm
  */
 
 #include <starpu.h>
+#include <starpu_perfmodel.h>
 #include "cholesky.h"
 
 /* #define USE_PERTURBATION	1 */
@@ -36,7 +37,7 @@
 #define PERTURBATE(a)	(a)
 #endif
 
-static double cpu_chol_task_11_cost(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+double cpu_chol_task_11_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 	uint32_t n;
 
@@ -51,7 +52,7 @@ static double cpu_chol_task_11_cost(struct starpu_task *task, enum starpu_perfmo
 	return PERTURBATE(cost);
 }
 
-static double cuda_chol_task_11_cost(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+double cuda_chol_task_11_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 	uint32_t n;
 
@@ -66,7 +67,7 @@ static double cuda_chol_task_11_cost(struct starpu_task *task, enum starpu_perfm
 	return PERTURBATE(cost);
 }
 
-static double cpu_chol_task_21_cost(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+double cpu_chol_task_21_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 	uint32_t n;
 
@@ -81,7 +82,7 @@ static double cpu_chol_task_21_cost(struct starpu_task *task, enum starpu_perfmo
 	return PERTURBATE(cost);
 }
 
-static double cuda_chol_task_21_cost(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+double cuda_chol_task_21_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 	uint32_t n;
 
@@ -96,7 +97,7 @@ static double cuda_chol_task_21_cost(struct starpu_task *task, enum starpu_perfm
 	return PERTURBATE(cost);
 }
 
-static double cpu_chol_task_22_cost(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+double cpu_chol_task_22_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 	uint32_t n;
 
@@ -111,7 +112,7 @@ static double cpu_chol_task_22_cost(struct starpu_task *task, enum starpu_perfmo
 	return PERTURBATE(cost);
 }
 
-static double cuda_chol_task_22_cost(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+double cuda_chol_task_22_cost(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 	uint32_t n;
 
@@ -126,35 +127,14 @@ static double cuda_chol_task_22_cost(struct starpu_task *task, enum starpu_perfm
 	return PERTURBATE(cost);
 }
 
-struct starpu_perfmodel chol_model_11 =
+void initialize_chol_model(struct starpu_perfmodel* model, char * symbol,
+		double (*cpu_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned),
+		double (*cuda_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned))
 {
-	.per_arch =
-	{
-		[STARPU_CPU_DEFAULT][0] = { .cost_function = cpu_chol_task_11_cost },
-		[STARPU_CUDA_DEFAULT][0] = { .cost_function = cuda_chol_task_11_cost }
-	},
-	.type = STARPU_HISTORY_BASED,
-	.symbol = "chol_model_11"
-};
-
-struct starpu_perfmodel chol_model_21 =
-{
-	.per_arch =
-	{
-		[STARPU_CPU_DEFAULT][0] = { .cost_function = cpu_chol_task_21_cost },
-		[STARPU_CUDA_DEFAULT][0] = { .cost_function = cuda_chol_task_21_cost }
-	},
-	.type = STARPU_HISTORY_BASED,
-	.symbol = "chol_model_21"
-};
-
-struct starpu_perfmodel chol_model_22 =
-{
-	.per_arch =
-	{
-		[STARPU_CPU_DEFAULT][0] = { .cost_function = cpu_chol_task_22_cost },
-		[STARPU_CUDA_DEFAULT][0] = { .cost_function = cuda_chol_task_22_cost }
-	},
-	.type = STARPU_HISTORY_BASED,
-	.symbol = "chol_model_22"
-};
+	model->symbol = symbol;
+	model->type = STARPU_HISTORY_BASED;
+	starpu_perfmodel_init(model);
+	model->per_arch[STARPU_CPU_WORKER][0][0][0].cost_function = cpu_cost_function;
+	if(starpu_worker_get_count_by_type(STARPU_CUDA_WORKER) != 0)
+		model->per_arch[STARPU_CUDA_WORKER][0][0][0].cost_function = cuda_cost_function;
+}

+ 15 - 0
examples/cholesky/cholesky_tag.c

@@ -17,6 +17,11 @@
  */
 
 #include "cholesky.h"
+#include <starpu_perfmodel.h>
+
+struct starpu_perfmodel chol_model_11;
+struct starpu_perfmodel chol_model_21;
+struct starpu_perfmodel chol_model_22;
 
 /*
  *	Some useful functions
@@ -258,6 +263,16 @@ static int initialize_system(float **A, unsigned dim, unsigned pinned)
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
+#ifdef STARPU_USE_CUDA
+	initialize_chol_model(&chol_model_11,"chol_model_11",cpu_chol_task_11_cost,cuda_chol_task_11_cost);
+	initialize_chol_model(&chol_model_21,"chol_model_21",cpu_chol_task_21_cost,cuda_chol_task_21_cost);
+	initialize_chol_model(&chol_model_22,"chol_model_22",cpu_chol_task_22_cost,cuda_chol_task_22_cost);
+#else
+	initialize_chol_model(&chol_model_11,"chol_model_11",cpu_chol_task_11_cost,NULL);
+	initialize_chol_model(&chol_model_21,"chol_model_21",cpu_chol_task_21_cost,NULL);
+	initialize_chol_model(&chol_model_22,"chol_model_22",cpu_chol_task_22_cost,NULL);
+#endif
+
 	starpu_cublas_init();
 
 #ifndef STARPU_SIMGRID

+ 14 - 0
examples/cholesky/cholesky_tile_tag.c

@@ -17,6 +17,10 @@
 
 #include "cholesky.h"
 
+struct starpu_perfmodel chol_model_11;
+struct starpu_perfmodel chol_model_21;
+struct starpu_perfmodel chol_model_22;
+
 /* A [ y ] [ x ] */
 float *A[NMAXBLOCKS][NMAXBLOCKS];
 starpu_data_handle_t A_state[NMAXBLOCKS][NMAXBLOCKS];
@@ -254,6 +258,16 @@ int main(int argc, char **argv)
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
+#ifdef STARPU_USE_CUDA
+	initialize_chol_model(&chol_model_11,"chol_model_11",cpu_chol_task_11_cost,cuda_chol_task_11_cost);
+	initialize_chol_model(&chol_model_21,"chol_model_21",cpu_chol_task_21_cost,cuda_chol_task_21_cost);
+	initialize_chol_model(&chol_model_22,"chol_model_22",cpu_chol_task_22_cost,cuda_chol_task_22_cost);
+#else
+	initialize_chol_model(&chol_model_11,"chol_model_11",cpu_chol_task_11_cost,NULL);
+	initialize_chol_model(&chol_model_21,"chol_model_21",cpu_chol_task_21_cost,NULL);
+	initialize_chol_model(&chol_model_22,"chol_model_22",cpu_chol_task_22_cost,NULL);
+#endif
+
 	/* Disable sequential consistency */
 	starpu_data_set_default_sequential_consistency_flag(0);
 

+ 1 - 1
examples/cpp/incrementer_cpp.cpp

@@ -71,7 +71,7 @@ int main(int argc, char **argv)
 
 	for (i = 0; i < niter; i++)
 	{
-		ret = starpu_insert_task(&cl,
+		ret = starpu_task_insert(&cl,
 					 STARPU_RW, float_array_handle,
 					 0);
                 if (STARPU_UNLIKELY(ret == -ENODEV))

+ 26 - 0
examples/heat/dw_factolu.c

@@ -25,6 +25,11 @@
 #define debug(fmt, ...)
 #endif
 
+struct starpu_perfmodel model_11;
+struct starpu_perfmodel model_12;
+struct starpu_perfmodel model_21;
+struct starpu_perfmodel model_22;
+
 unsigned *advance_11; /* size nblocks, whether the 11 task is done */
 unsigned *advance_12_21; /* size nblocks*nblocks */
 unsigned *advance_22; /* array of nblocks *nblocks*nblocks */
@@ -701,6 +706,27 @@ void initialize_system(float **A, float **B, unsigned dim, unsigned pinned)
 		exit(77);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
+#ifdef STARPU_ATLAS
+	char * symbol_11 = "lu_model_11_atlas";
+	char * symbol_12 = "lu_model_12_atlas";
+	char * symbol_21 = "lu_model_21_atlas";
+	char * symbol_22 = "lu_model_22_atlas";
+#elif defined(STARPU_GOTO)
+	char * symbol_11 = "lu_model_11_goto";
+	char * symbol_12 = "lu_model_12_goto";
+	char * symbol_21 = "lu_model_21_goto";
+	char * symbol_22 = "lu_model_22_goto";
+#else
+	char * symbol_11 = "lu_model_11";
+	char * symbol_12 = "lu_model_12";
+	char * symbol_21 = "lu_model_21";
+	char * symbol_22 = "lu_model_22";
+#endif
+	initialize_lu_kernels_model(&model_11,symbol_11,task_11_cost,task_11_cost_cpu,task_11_cost_cuda);
+	initialize_lu_kernels_model(&model_12,symbol_12,task_12_cost,task_12_cost_cpu,task_12_cost_cuda);
+	initialize_lu_kernels_model(&model_21,symbol_21,task_21_cost,task_21_cost_cpu,task_21_cost_cuda);
+	initialize_lu_kernels_model(&model_22,symbol_22,task_22_cost,task_22_cost_cpu,task_22_cost_cuda);
+
 	starpu_cublas_init();
 
 	if (pinned)

+ 0 - 5
examples/heat/dw_factolu.h

@@ -216,9 +216,4 @@ void dw_callback_v2_codelet_update_u12(void *);
 void dw_callback_v2_codelet_update_u21(void *);
 void dw_callback_v2_codelet_update_u22(void *);
 
-extern struct starpu_perfmodel model_11;
-extern struct starpu_perfmodel model_12;
-extern struct starpu_perfmodel model_21;
-extern struct starpu_perfmodel model_22;
-
 #endif /* __DW_FACTO_LU_H__ */

+ 5 - 0
examples/heat/dw_factolu_grain.c

@@ -27,6 +27,11 @@
 					| ((unsigned long long)(i)<<16)	\
 					| (unsigned long long)(j))))
 
+struct starpu_perfmodel model_11;
+struct starpu_perfmodel model_12;
+struct starpu_perfmodel model_21;
+struct starpu_perfmodel model_22;
+
 /*
  *	Construct the DAG
  */

+ 5 - 0
examples/heat/dw_factolu_tag.c

@@ -27,6 +27,11 @@
 					| ((unsigned long long)(i)<<16)	\
 					| (unsigned long long)(j))))
 
+struct starpu_perfmodel model_11;
+struct starpu_perfmodel model_12;
+struct starpu_perfmodel model_21;
+struct starpu_perfmodel model_22;
+
 static unsigned no_prio = 0;
 
 /*

+ 26 - 84
examples/heat/lu_kernels_model.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -24,7 +24,7 @@
  */
 
 /*
- *	Number of flops of Gemm 
+ *	Number of flops of Gemm
  */
 
 /* #define USE_PERTURBATION	1 */
@@ -36,7 +36,7 @@
 #define PERTURBATE(a)	(a)
 #endif
 
-/* 
+/*
  *
  *	Generic models
  *
@@ -95,14 +95,14 @@ double task_22_cost(struct starpu_task *task, unsigned nimpl)
 	return PERTURBATE(cost);
 }
 
-/* 
+/*
  *
  *	Models for CUDA
  *
  */
 
 
-double task_11_cost_cuda(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+double task_11_cost_cuda(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 	uint32_t n;
 
@@ -114,7 +114,7 @@ double task_11_cost_cuda(struct starpu_task *task, enum starpu_perfmodel_archtyp
 	return PERTURBATE(cost);
 }
 
-double task_12_cost_cuda(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+double task_12_cost_cuda(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 	uint32_t n;
 
@@ -127,7 +127,7 @@ double task_12_cost_cuda(struct starpu_task *task, enum starpu_perfmodel_archtyp
 }
 
 
-double task_21_cost_cuda(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+double task_21_cost_cuda(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 	uint32_t n;
 
@@ -141,7 +141,7 @@ double task_21_cost_cuda(struct starpu_task *task, enum starpu_perfmodel_archtyp
 
 
 
-double task_22_cost_cuda(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+double task_22_cost_cuda(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 	uint32_t nx, ny, nz;
 
@@ -155,13 +155,13 @@ double task_22_cost_cuda(struct starpu_task *task, enum starpu_perfmodel_archtyp
 	return PERTURBATE(cost);
 }
 
-/* 
+/*
  *
  *	Models for CPUs
  *
  */
 
-double task_11_cost_cpu(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+double task_11_cost_cpu(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 	uint32_t n;
 
@@ -173,7 +173,7 @@ double task_11_cost_cpu(struct starpu_task *task, enum starpu_perfmodel_archtype
 	return PERTURBATE(cost);
 }
 
-double task_12_cost_cpu(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+double task_12_cost_cpu(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 	uint32_t n;
 
@@ -186,7 +186,7 @@ double task_12_cost_cpu(struct starpu_task *task, enum starpu_perfmodel_archtype
 }
 
 
-double task_21_cost_cpu(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+double task_21_cost_cpu(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 	uint32_t n;
 
@@ -200,7 +200,7 @@ double task_21_cost_cpu(struct starpu_task *task, enum starpu_perfmodel_archtype
 
 
 
-double task_22_cost_cpu(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+double task_22_cost_cpu(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 	uint32_t nx, ny, nz;
 
@@ -214,74 +214,16 @@ double task_22_cost_cpu(struct starpu_task *task, enum starpu_perfmodel_archtype
 	return PERTURBATE(cost);
 }
 
-struct starpu_perfmodel model_11 =
-{
-	.cost_function = task_11_cost,
-	.per_arch =
-	{
-		[STARPU_CPU_DEFAULT][0] = { .cost_function = task_11_cost_cpu },
-		[STARPU_CUDA_DEFAULT][0] = { .cost_function = task_11_cost_cuda }
-	},
-	.type = STARPU_HISTORY_BASED,
-#ifdef STARPU_ATLAS
-	.symbol = "lu_model_11_atlas"
-#elif defined(STARPU_GOTO)
-	.symbol = "lu_model_11_goto"
-#else
-	.symbol = "lu_model_11"
-#endif
-};
-
-struct starpu_perfmodel model_12 =
-{
-	.cost_function = task_12_cost,
-	.per_arch =
-	{
-		[STARPU_CPU_DEFAULT][0] = { .cost_function = task_12_cost_cpu },
-		[STARPU_CUDA_DEFAULT][0] = { .cost_function = task_12_cost_cuda }
-	},
-	.type = STARPU_HISTORY_BASED,
-#ifdef STARPU_ATLAS
-	.symbol = "lu_model_12_atlas"
-#elif defined(STARPU_GOTO)
-	.symbol = "lu_model_12_goto"
-#else
-	.symbol = "lu_model_12"
-#endif
-};
-
-struct starpu_perfmodel model_21 =
-{
-	.cost_function = task_21_cost,
-	.per_arch =
-	{
-		[STARPU_CPU_DEFAULT][0] = { .cost_function = task_21_cost_cpu },
-		[STARPU_CUDA_DEFAULT][0] = { .cost_function = task_21_cost_cuda }
-	},
-	.type = STARPU_HISTORY_BASED,
-#ifdef STARPU_ATLAS
-	.symbol = "lu_model_21_atlas"
-#elif defined(STARPU_GOTO)
-	.symbol = "lu_model_21_goto"
-#else
-	.symbol = "lu_model_21"
-#endif
-};
-
-struct starpu_perfmodel model_22 =
-{
-	.cost_function = task_22_cost,
-	.per_arch =
-	{
-		[STARPU_CPU_DEFAULT][0] = { .cost_function = task_22_cost_cpu },
-		[STARPU_CUDA_DEFAULT][0] = { .cost_function = task_22_cost_cuda }
-	},
-	.type = STARPU_HISTORY_BASED,
-#ifdef STARPU_ATLAS
-	.symbol = "lu_model_22_atlas"
-#elif defined(STARPU_GOTO)
-	.symbol = "lu_model_22_goto"
-#else
-	.symbol = "lu_model_22"
-#endif
-};
+void initialize_lu_kernels_model(struct starpu_perfmodel* model, char * symbol,
+		double (*cost_function)(struct starpu_task *, unsigned),
+		double (*cpu_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned),
+		double (*cuda_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned))
+{
+	model->symbol = symbol;
+	model->type = STARPU_HISTORY_BASED;
+	starpu_perfmodel_init(model);
+	model->cost_function = cost_function;
+	model->per_arch[STARPU_CPU_WORKER][0][0][0].cost_function = cpu_cost_function;
+	if(starpu_worker_get_count_by_type(STARPU_CUDA_WORKER) != 0)
+		model->per_arch[STARPU_CUDA_WORKER][0][0][0].cost_function = cuda_cost_function;
+}

+ 20 - 0
examples/heat/lu_kernels_model.h

@@ -20,4 +20,24 @@
 
 #include <starpu.h>
 
+double task_11_cost(struct starpu_task *task, unsigned nimpl);
+double task_12_cost(struct starpu_task *task, unsigned nimpl);
+double task_21_cost(struct starpu_task *task, unsigned nimpl);
+double task_22_cost(struct starpu_task *task, unsigned nimpl);
+
+double task_11_cost_cuda(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
+double task_12_cost_cuda(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
+double task_21_cost_cuda(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
+double task_22_cost_cuda(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
+
+double task_11_cost_cpu(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
+double task_12_cost_cpu(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
+double task_21_cost_cpu(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
+double task_22_cost_cpu(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
+
+void initialize_lu_kernels_model(struct starpu_perfmodel* model, char * symbol, 
+		double (*cost_function)(struct starpu_task *, unsigned), 
+		double (*cpu_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned), 
+		double (*cuda_cost_function)(struct starpu_task *, struct starpu_perfmodel_arch*, unsigned));
+
 #endif /* __LU_KERNELS_MODEL_H__ */

+ 14 - 16
examples/interface/complex.c

@@ -18,8 +18,6 @@
 #include "complex_interface.h"
 #include "complex_codelet.h"
 
-#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
-
 static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
 {
        if (starpu_worker_get_type(workerid) == STARPU_OPENCL_WORKER)
@@ -95,21 +93,21 @@ int main(int argc, char **argv)
 	starpu_complex_data_register(&handle1, STARPU_MAIN_RAM, &real, &imaginary, 1);
 	starpu_complex_data_register(&handle2, STARPU_MAIN_RAM, &copy_real, &copy_imaginary, 1);
 
-	ret = starpu_insert_task(&cl_display, STARPU_VALUE, "handle1", strlen("handle1"), STARPU_R, handle1, 0);
+	ret = starpu_task_insert(&cl_display, STARPU_VALUE, "handle1", strlen("handle1"), STARPU_R, handle1, 0);
 	if (ret == -ENODEV) goto end;
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 
-	ret = starpu_insert_task(&cl_display, STARPU_VALUE, "handle2", strlen("handle2"), STARPU_R, handle2, 0);
+	ret = starpu_task_insert(&cl_display, STARPU_VALUE, "handle2", strlen("handle2"), STARPU_R, handle2, 0);
 	if (ret == -ENODEV) goto end;
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 
-	ret = starpu_insert_task(&cl_compare,
+	ret = starpu_task_insert(&cl_compare,
 				 STARPU_R, handle1,
 				 STARPU_R, handle2,
 				 STARPU_VALUE, &compare_ptr, sizeof(compare_ptr),
 				 0);
 	if (ret == -ENODEV) goto end;
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 	starpu_task_wait_for_all();
 	if (compare != 0)
 	{
@@ -117,28 +115,28 @@ int main(int argc, char **argv)
 	     goto end;
 	}
 
-	ret = starpu_insert_task(&cl_copy,
+	ret = starpu_task_insert(&cl_copy,
 				 STARPU_R, handle1,
 				 STARPU_W, handle2,
 				 0);
 	if (ret == -ENODEV) goto end;
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 
-	ret = starpu_insert_task(&cl_display, STARPU_VALUE, "handle1", strlen("handle1"), STARPU_R, handle1, 0);
+	ret = starpu_task_insert(&cl_display, STARPU_VALUE, "handle1", strlen("handle1"), STARPU_R, handle1, 0);
 	if (ret == -ENODEV) goto end;
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 
-	ret = starpu_insert_task(&cl_display, STARPU_VALUE, "handle2", strlen("handle2"), STARPU_R, handle2, 0);
+	ret = starpu_task_insert(&cl_display, STARPU_VALUE, "handle2", strlen("handle2"), STARPU_R, handle2, 0);
 	if (ret == -ENODEV) goto end;
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 
-	ret = starpu_insert_task(&cl_compare,
+	ret = starpu_task_insert(&cl_compare,
 				 STARPU_R, handle1,
 				 STARPU_R, handle2,
 				 STARPU_VALUE, &compare_ptr, sizeof(compare_ptr),
 				 0);
 	if (ret == -ENODEV) goto end;
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 
 	starpu_task_wait_for_all();
 

+ 3 - 1
examples/interface/complex_codelet.h

@@ -20,6 +20,8 @@
 #ifndef __COMPLEX_CODELET_H
 #define __COMPLEX_CODELET_H
 
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
 void compare_complex_codelet(void *descr[], void *_args)
 {
 	int nx1 = STARPU_COMPLEX_GET_NX(descr[0]);
@@ -70,7 +72,7 @@ void display_complex_codelet(void *descr[], void *_args)
 
 	for(i=0 ; i<nx ; i++)
 	{
-		fprintf(stderr, "[%s] Complex[%d] = %3.2f + %3.2f i\n", _args?msg:NULL, i, real[i], imaginary[i]);
+		FPRINTF(stderr, "[%s] Complex[%d] = %3.2f + %3.2f i\n", _args?msg:NULL, i, real[i], imaginary[i]);
 	}
 }
 

+ 4 - 2
examples/lu/lu_example.c

@@ -182,6 +182,8 @@ static void init_matrix(void)
 			/* also randomize the imaginary component for complex number cases */
 			A[i + j*size] += (TYPE)(I*starpu_drand48());
 #endif
+			if (i == j)
+				A[i + j*size] *= 100;
 		}
 	}
 
@@ -299,13 +301,13 @@ int main(int argc, char **argv)
 {
 	int ret;
 
-	parse_args(argc, argv);
-
 #ifdef STARPU_QUICK_CHECK
 	size /= 4;
 	nblocks /= 4;
 #endif
 
+	parse_args(argc, argv);
+
 	ret = starpu_init(NULL);
 	if (ret == -ENODEV)
 		return 77;

+ 3 - 3
examples/mandelbrot/mandelbrot.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -529,7 +529,7 @@ int main(int argc, char **argv)
 			per_block_cnt[iby] = 0;
 			int *pcnt = &per_block_cnt[iby];
 
-			ret = starpu_insert_task(use_spmd?&spmd_mandelbrot_cl:&mandelbrot_cl,
+			ret = starpu_task_insert(use_spmd?&spmd_mandelbrot_cl:&mandelbrot_cl,
 						 STARPU_VALUE, &iby, sizeof(iby),
 						 STARPU_VALUE, &block_size, sizeof(block_size),
 						 STARPU_VALUE, &stepX, sizeof(stepX),
@@ -537,7 +537,7 @@ int main(int argc, char **argv)
 						 STARPU_W, block_handles[iby],
 						 STARPU_VALUE, &pcnt, sizeof(int *),
 						 0);
-			STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 		}
 
 		for (iby = 0; iby < nblocks; iby++)

+ 8 - 8
examples/pipeline/pipeline.c

@@ -200,33 +200,33 @@ int main(void)
 			sem_wait(&sems[l%C]);
 
 		/* Now submit the next stage */
-		ret = starpu_insert_task(&pipeline_codelet_x,
+		ret = starpu_task_insert(&pipeline_codelet_x,
 				STARPU_W, buffersX[l%K],
 				STARPU_VALUE, &x, sizeof(x),
 				0);
 		if (ret == -ENODEV) goto enodev;
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task x");
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert x");
 
-		ret = starpu_insert_task(&pipeline_codelet_x,
+		ret = starpu_task_insert(&pipeline_codelet_x,
 				STARPU_W, buffersY[l%K],
 				STARPU_VALUE, &y, sizeof(y),
 				0);
 		if (ret == -ENODEV) goto enodev;
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task y");
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert y");
 
-		ret = starpu_insert_task(&pipeline_codelet_axpy,
+		ret = starpu_task_insert(&pipeline_codelet_axpy,
 				STARPU_R, buffersX[l%K],
 				STARPU_RW, buffersY[l%K],
 				0);
 		if (ret == -ENODEV) goto enodev;
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task axpy");
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert axpy");
 
-		ret = starpu_insert_task(&pipeline_codelet_sum,
+		ret = starpu_task_insert(&pipeline_codelet_sum,
 				STARPU_R, buffersY[l%K],
 				STARPU_CALLBACK_WITH_ARG, (void (*)(void*))sem_post, &sems[l%C],
 				0);
 		if (ret == -ENODEV) goto enodev;
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task sum");
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert sum");
 	}
 	starpu_task_wait_for_all();
 

+ 1 - 1
examples/sched_ctx/dummy_sched_with_ctx.c

@@ -157,7 +157,7 @@ int main(int argc, char **argv)
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-	unsigned sched_ctx = starpu_sched_ctx_create_with_custom_policy(&dummy_sched_policy, NULL, -1, "dummy");
+	unsigned sched_ctx = starpu_sched_ctx_create(NULL, -1, "dummy", STARPU_SCHED_CTX_POLICY_STRUCT, &dummy_sched_policy, 0);
 #ifdef STARPU_QUICK_CHECK
 	ntasks /= 100;
 #endif

+ 2 - 2
examples/sched_ctx/parallel_code.c

@@ -83,8 +83,8 @@ int main(int argc, char **argv)
 #endif
 
 	/*create contexts however you want*/
-	unsigned sched_ctx1 = starpu_sched_ctx_create("dmda", procs1, nprocs1, "ctx1");
-	unsigned sched_ctx2 = starpu_sched_ctx_create("dmda", procs2, nprocs2, "ctx2");
+	unsigned sched_ctx1 = starpu_sched_ctx_create(procs1, nprocs1, "ctx1", STARPU_SCHED_CTX_POLICY_NAME, "dmda", 0);
+	unsigned sched_ctx2 = starpu_sched_ctx_create(procs2, nprocs2, "ctx2", STARPU_SCHED_CTX_POLICY_NAME, "dmda", 0);
 
 	/*indicate what to do with the resources when context 2 finishes (it depends on your application)*/
 	starpu_sched_ctx_set_inheritor(sched_ctx2, sched_ctx1);

+ 60 - 0
examples/sched_ctx/prio.c

@@ -0,0 +1,60 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
+int main(int argc, char **argv)
+{
+	int ret;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	unsigned sched_ctx1 = starpu_sched_ctx_create(NULL, -1, "ctx1", STARPU_SCHED_CTX_POLICY_NAME, "prio", 0);
+
+	FPRINTF(stderr, "min prio %d\n", starpu_sched_ctx_get_min_priority(sched_ctx1));
+	FPRINTF(stderr, "max prio %d\n", starpu_sched_ctx_get_max_priority(sched_ctx1));
+
+	unsigned sched_ctx2 = starpu_sched_ctx_create(NULL, -1, "ctx2",
+						      STARPU_SCHED_CTX_POLICY_NAME, "prio",
+						      STARPU_SCHED_CTX_POLICY_MIN_PRIO, -12,
+						      STARPU_SCHED_CTX_POLICY_MAX_PRIO, 32,
+						      0);
+
+	FPRINTF(stderr, "min prio %d\n", starpu_sched_ctx_get_min_priority(sched_ctx2));
+	FPRINTF(stderr, "max prio %d\n", starpu_sched_ctx_get_max_priority(sched_ctx2));
+
+	if (starpu_sched_ctx_get_min_priority(sched_ctx2) != -12)
+	{
+		FPRINTF(stderr, "Error with min priority: %d != %d\n", starpu_sched_ctx_get_min_priority(sched_ctx2), -12);
+		ret = 1;
+	}
+	if (starpu_sched_ctx_get_max_priority(sched_ctx2) != 32)
+	{
+		FPRINTF(stderr, "Error with max priority: %d != %d\n", starpu_sched_ctx_get_max_priority(sched_ctx2), 32);
+		ret = 1;
+	}
+
+	starpu_sched_ctx_delete(sched_ctx1);
+	starpu_sched_ctx_delete(sched_ctx2);
+
+	starpu_shutdown();
+
+	return ret;
+}

+ 3 - 3
examples/sched_ctx/sched_ctx.c

@@ -57,7 +57,7 @@ int main(int argc, char **argv)
 	starpu_pthread_mutex_init(&mut, NULL);
 	int nprocs1 = 1;
 	int nprocs2 = 1;
-	int procs1[20], procs2[20];
+	int procs1[STARPU_NMAXWORKERS], procs2[STARPU_NMAXWORKERS];
 	procs1[0] = 0;
 	procs2[0] = 0;
 
@@ -76,8 +76,8 @@ int main(int argc, char **argv)
 #endif
 
 	/*create contexts however you want*/
-	unsigned sched_ctx1 = starpu_sched_ctx_create("dmda", procs1, nprocs1, "ctx1");
-	unsigned sched_ctx2 = starpu_sched_ctx_create("dmda", procs2, nprocs2, "ctx2");
+	unsigned sched_ctx1 = starpu_sched_ctx_create(procs1, nprocs1, "ctx1", STARPU_SCHED_CTX_POLICY_NAME, "eager", 0);
+	unsigned sched_ctx2 = starpu_sched_ctx_create(procs2, nprocs2, "ctx2", STARPU_SCHED_CTX_POLICY_NAME, "eager",  0);
 
 	/*indicate what to do with the resources when context 2 finishes (it depends on your application)*/
 	starpu_sched_ctx_set_inheritor(sched_ctx2, sched_ctx1);

+ 2 - 2
examples/sched_ctx_utils/sched_ctx_utils.c

@@ -233,7 +233,7 @@ void construct_contexts(void (*bench)(unsigned, unsigned))
 	}
 	printf("\n ");
 
-	p1.ctx = starpu_sched_ctx_create("heft", procs, nprocs1, "sched_ctx1");
+	p1.ctx = starpu_sched_ctx_create(procs, nprocs1, "sched_ctx1", STARPU_SCHED_CTX_POLICY_NAME, "heft", 0);
 	p2.the_other_ctx = (int)p1.ctx;
 	p1.procs = procs;
 	p1.nprocs = nprocs1;
@@ -260,7 +260,7 @@ void construct_contexts(void (*bench)(unsigned, unsigned))
 	}
 	printf("\n");
 
-	p2.ctx = starpu_sched_ctx_create("heft", procs2, nprocs2, "sched_ctx2");
+	p2.ctx = starpu_sched_ctx_create(procs2, nprocs2, "sched_ctx2", STARPU_SCHED_CTX_POLICY_NAME, "heft", 0);
 	p1.the_other_ctx = (int)p2.ctx;
 	p2.procs = procs2;
 	starpu_sched_ctx_set_inheritor(p1.ctx, p2.ctx);

+ 4 - 2
examples/stencil/stencil-kernels.c

@@ -18,6 +18,8 @@
 #include "stencil.h"
 #include <sys/time.h>
 
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
 #ifndef timersub
 #define	timersub(x, y, res) \
 	do \
@@ -382,9 +384,9 @@ void update_func_cpu(void *descr[], void *arg)
 	int workerid = starpu_worker_get_id();
 	DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
 	if (block->bz == 0)
-fprintf(stderr,"!!! DO update_func_cpu z %d CPU%d !!!\n", block->bz, workerid);
+		FPRINTF(stderr,"!!! DO update_func_cpu z %d CPU%d !!!\n", block->bz, workerid);
 	else
-	DEBUG( "!!! DO update_func_cpu z %d CPU%d !!!\n", block->bz, workerid);
+		DEBUG( "!!! DO update_func_cpu z %d CPU%d !!!\n", block->bz, workerid);
 #ifdef STARPU_USE_MPI
 	int rank = 0;
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);

+ 7 - 1
examples/stencil/stencil-tasks.c

@@ -221,7 +221,8 @@ static struct starpu_codelet null =
 	.cpu_funcs_name = {"null_func", NULL},
 	.cuda_funcs = {null_func, NULL},
 	.opencl_funcs = {null_func, NULL},
-	.nbuffers = 2
+	.nbuffers = 2,
+	.name = "start"
 };
 
 void create_start_task(int z, int dir)
@@ -267,11 +268,15 @@ void create_tasks(int rank)
 	}
 
 	for (iter = 0; iter <= niter; iter++)
+	{
 	for (bz = 0; bz < nbz; bz++)
 	{
 		if ((iter > 0) && (get_block_mpi_node(bz) == rank))
 			create_task_update(iter, bz, rank);
 
+	}
+	for (bz = 0; bz < nbz; bz++)
+	{
 		if (iter != niter)
 		{
 			if ((get_block_mpi_node(bz) == rank) || (get_block_mpi_node(bz+1) == rank))
@@ -281,6 +286,7 @@ void create_tasks(int rank)
 				create_task_save(iter, bz, -1, rank);
 		}
 	}
+	}
 }
 
 /*

+ 6 - 6
gcc-plugin/src/tasks.c

@@ -525,7 +525,7 @@ declare_codelet (tree task_decl)
   return cl_decl;
 }
 
-/* Build the body of TASK_DECL, which will call `starpu_insert_task'.  */
+/* Build the body of TASK_DECL, which will call `starpu_task_insert'.  */
 
 void
 define_task (tree task_decl)
@@ -583,22 +583,22 @@ define_task (tree task_decl)
   /* Introduce a local variable to hold the error code.  */
 
   tree error_var = build_decl (loc, VAR_DECL,
-  			       create_tmp_var_name (".insert_task_error"),
+  			       create_tmp_var_name (".task_insert_error"),
   			       integer_type_node);
   DECL_CONTEXT (error_var) = task_decl;
   DECL_ARTIFICIAL (error_var) = true;
 
   /* Build this:
 
-       err = starpu_insert_task (...);
+       err = starpu_task_insert (...);
        if (err != 0)
          { printf ...; abort (); }
    */
 
-  static tree insert_task_fn;
-  LOOKUP_STARPU_FUNCTION (insert_task_fn, "starpu_insert_task");
+  static tree task_insert_fn;
+  LOOKUP_STARPU_FUNCTION (task_insert_fn, "starpu_task_insert");
 
-  tree call = build_call_expr_loc_vec (loc, insert_task_fn, args);
+  tree call = build_call_expr_loc_vec (loc, task_insert_fn, args);
 
   tree assignment = build2 (INIT_EXPR, TREE_TYPE (error_var),
   			    error_var, call);

+ 4 - 4
gcc-plugin/tests/base.c

@@ -106,7 +106,7 @@ main (int argc, char *argv[])
   unsigned char y = 77;
   long y_as_long_int = 77;
 
-  struct insert_task_argument expected[] =
+  struct task_insert_argument expected[] =
     {
       { STARPU_VALUE, &x, sizeof x },
       { STARPU_VALUE, &y, sizeof y },
@@ -114,7 +114,7 @@ main (int argc, char *argv[])
       { 0, 0, 0 }
     };
 
-  expected_insert_task_arguments = expected;
+  expected_task_insert_arguments = expected;
 
   /* Invoke the task, which should make sure it gets called with
      EXPECTED.  */
@@ -135,14 +135,14 @@ main (int argc, char *argv[])
 
   assert (tasks_submitted == 9);
 
-  struct insert_task_argument expected2[] =
+  struct task_insert_argument expected2[] =
     {
       { STARPU_VALUE, &x, sizeof x },
       { 0, 0, 0 }
     };
 
   tasks_submitted = 0;
-  expected_insert_task_arguments = expected2;
+  expected_task_insert_arguments = expected2;
 
   my_other_task (42);
   assert (tasks_submitted == 1);

+ 2 - 2
gcc-plugin/tests/lib-user.c

@@ -38,7 +38,7 @@ main (int argc, char *argv[])
   static const char forty_two = 42;
   static const int  sizeof_x = sizeof x;
 
-  struct insert_task_argument expected_pointer_task[] =
+  struct task_insert_argument expected_pointer_task[] =
     {
       { STARPU_VALUE, &forty_two, sizeof forty_two },
       { STARPU_R,  x },
@@ -47,7 +47,7 @@ main (int argc, char *argv[])
       { 0, 0, 0 }
     };
 
-  expected_insert_task_arguments = expected_pointer_task;
+  expected_task_insert_arguments = expected_pointer_task;
 
   expected_register_arguments.pointer = (void *) x;
   expected_register_arguments.elements = sizeof x / sizeof x[0];

+ 10 - 10
gcc-plugin/tests/mocks.h

@@ -62,7 +62,7 @@ typedef double         cl_double;
 /* Number of tasks submitted.  */
 static unsigned int tasks_submitted;
 
-struct insert_task_argument
+struct task_insert_argument
 {
   /* `STARPU_VALUE', etc. */
   int type;
@@ -75,18 +75,18 @@ struct insert_task_argument
 };
 
 /* Pointer to a zero-terminated array listing the expected
-   `starpu_insert_task' arguments.  */
-const struct insert_task_argument *expected_insert_task_arguments;
+   `starpu_task_insert' arguments.  */
+const struct task_insert_argument *expected_task_insert_arguments;
 
 /* Expected targets of the codelets submitted.  */
-static int expected_insert_task_targets = STARPU_CPU | STARPU_OPENCL;
+static int expected_task_insert_targets = STARPU_CPU | STARPU_OPENCL;
 
 
 int
-starpu_insert_task (struct starpu_codelet *cl, ...)
+starpu_task_insert (struct starpu_codelet *cl, ...)
 {
   assert (cl->name != NULL && strlen (cl->name) > 0);
-  assert (cl->where == expected_insert_task_targets);
+  assert (cl->where == expected_task_insert_targets);
 
   assert ((cl->where & STARPU_CPU) == 0
 	  ? cl->cpu_funcs[0] == NULL
@@ -106,8 +106,8 @@ starpu_insert_task (struct starpu_codelet *cl, ...)
 
   va_start (args, cl);
 
-  const struct insert_task_argument *expected;
-  for (expected = expected_insert_task_arguments,
+  const struct task_insert_argument *expected;
+  for (expected = expected_task_insert_arguments,
 	 cl_args_offset = 1, scalars = 0, pointers = 0;
        expected->type != 0;
        expected++)
@@ -528,9 +528,9 @@ clSetKernelArg (cl_kernel kernel, cl_uint index, size_t size,
 		const void *value)
 {
   size_t n;
-  const struct insert_task_argument *arg;
+  const struct task_insert_argument *arg;
 
-  for (n = 0, arg = expected_insert_task_arguments;
+  for (n = 0, arg = expected_task_insert_arguments;
        n < index;
        n++, arg++)
     assert (arg->pointer != NULL);

+ 3 - 3
gcc-plugin/tests/opencl.c

@@ -46,15 +46,15 @@ main ()
 #pragma starpu register a
 
   static int x = 123;
-  struct insert_task_argument expected[] =
+  struct task_insert_argument expected[] =
     {
       { STARPU_VALUE, &x, sizeof x },
       { STARPU_RW, a },
       { 0, 0, 0 }
     };
 
-  expected_insert_task_arguments = expected;
-  expected_insert_task_targets = STARPU_OPENCL;
+  expected_task_insert_arguments = expected;
+  expected_task_insert_targets = STARPU_OPENCL;
   size_t y = 8; expected_cl_enqueue_kernel_arguments.global_work_size = &y;
 
   my_task (123, a);

+ 2 - 2
gcc-plugin/tests/output-pointer.c

@@ -84,14 +84,14 @@ main (int argc, char *argv[])
   expected_register_arguments.element_size = sizeof x[0];
   starpu_vector_data_register (&handle, STARPU_MAIN_RAM, (uintptr_t) x, 42, sizeof x[0]);
 
-  struct insert_task_argument expected[] =
+  struct task_insert_argument expected[] =
     {
       { STARPU_VALUE, &size, sizeof size },
       { STARPU_W, x },
       { 0, 0, 0 }
     };
 
-  expected_insert_task_arguments = expected;
+  expected_task_insert_arguments = expected;
 
   /* Invoke the task, which makes sure it gets called with EXPECTED.  */
   my_pointer_task (size, x);

+ 4 - 4
gcc-plugin/tests/pointers.c

@@ -92,14 +92,14 @@ main (int argc, char *argv[])
   expected_register_arguments.element_size = sizeof *y;
   starpu_vector_data_register (&handle, STARPU_MAIN_RAM, (uintptr_t) y, 1, sizeof *y);
 
-  struct insert_task_argument expected_pointer_task[] =
+  struct task_insert_argument expected_pointer_task[] =
     {
       { STARPU_R,  x },
       { STARPU_RW, y },
       { 0, 0, 0 }
     };
 
-  expected_insert_task_arguments = expected_pointer_task;
+  expected_task_insert_arguments = expected_pointer_task;
 
   /* Invoke the task, which should make sure it gets called with
      EXPECTED.  */
@@ -110,7 +110,7 @@ main (int argc, char *argv[])
 
   /* Likewise with `my_mixed_task'.  */
 
-  struct insert_task_argument expected_mixed_task[] =
+  struct task_insert_argument expected_mixed_task[] =
     {
       { STARPU_RW, x },
       { STARPU_VALUE, &z, sizeof z },
@@ -118,7 +118,7 @@ main (int argc, char *argv[])
       { 0, 0, 0 }
     };
 
-  expected_insert_task_arguments = expected_mixed_task;
+  expected_task_insert_arguments = expected_mixed_task;
 
   my_mixed_task (x, 0x77, y);
 

+ 1 - 3
include/starpu.h

@@ -140,14 +140,12 @@ void starpu_topology_print(FILE *f);
 int starpu_asynchronous_copy_disabled(void);
 int starpu_asynchronous_cuda_copy_disabled(void);
 int starpu_asynchronous_opencl_copy_disabled(void);
+int starpu_asynchronous_mic_copy_disabled(void);
 
-void starpu_profiling_init();
 void starpu_display_stats();
 
 void starpu_get_version(int *major, int *minor, int *release);
 
-int starpu_worker_get_mp_nodeid(int id);
-
 #ifdef __cplusplus
 }
 #endif

+ 0 - 2
include/starpu_data.h

@@ -81,8 +81,6 @@ void starpu_data_release_on_node(starpu_data_handle_t handle, unsigned node);
 
 void starpu_data_display_memory_stats();
 
-/* XXX These macros are provided to avoid breaking old codes. But consider
- * these function names as deprecated. */
 #define starpu_data_malloc_pinned_if_possible	starpu_malloc
 #define starpu_data_free_pinned_if_possible	starpu_free
 

+ 6 - 1
include/starpu_data_interfaces.h

@@ -96,7 +96,7 @@ enum starpu_data_interface_id
 	STARPU_VOID_INTERFACE_ID=6,
 	STARPU_MULTIFORMAT_INTERFACE_ID=7,
 	STARPU_COO_INTERFACE_ID=8,
-	STARPU_MAX_INTERFACE_ID=9 /* maximum number of data interfaces */
+	STARPU_MAX_INTERFACE_ID=9
 };
 
 struct starpu_data_interface_ops
@@ -400,6 +400,11 @@ size_t starpu_data_get_size(starpu_data_handle_t handle);
 
 starpu_data_handle_t starpu_data_lookup(const void *ptr);
 
+struct starpu_disk_interface
+{
+	uintptr_t dev_handle;
+};
+
 #ifdef __cplusplus
 }
 #endif

+ 1 - 1
include/starpu_deprecated_api.h

@@ -24,7 +24,7 @@ extern "C"
 #endif
 
 #if defined(STARPU_USE_DEPRECATED_API) || defined(STARPU_USE_DEPRECATED_ONE_ZERO_API)
-#warning Your application is using former types. You may want to update to use the latest API, by using tools/dev/rename.sh.
+#warning Your application is using deprecated types. You may want to update to use the latest API, by using tools/dev/rename.sh.
 #endif /* defined(STARPU_USE_DEPRECATED_API) || defined(STARPU_USE_DEPRECATED_ONE_ZERO_API) */
 
 #ifdef STARPU_USE_DEPRECATED_ONE_ZERO_API

+ 15 - 9
include/starpu_disk.h

@@ -18,33 +18,39 @@
 #ifndef __STARPU_DISK_H__
 #define __STARPU_DISK_H__
 
+#include <sys/types.h>
+
 /* list of functions to use on disk */
 struct starpu_disk_ops {
  	 void *  (*alloc)  (void *base, size_t size);
 	 void    (*free)   (void *base, void *obj, size_t size);
 	 void *  (*open)   (void *base, void *pos, size_t size);     /* open an existing file */
 	 void    (*close)  (void *base, void *obj, size_t size);
-	ssize_t  (*read)   (void *base, void *obj, void *buf, off_t offset, size_t size);        /* ~= pread */
-	ssize_t  (*write)  (void *base, void *obj, const void *buf, off_t offset, size_t size); 
+	 int     (*read)   (void *base, void *obj, void *buf, off_t offset, size_t size, void * async); 
+	 int     (*write)  (void *base, void *obj, const void *buf, off_t offset, size_t size, void * async); 
+	 int     (*async_write)  (void *base, void *obj, void *buf, off_t offset, size_t size, void * async); 
+	 int     (*async_read)   (void *base, void *obj, void *buf, off_t offset, size_t size, void * async); 
 	/* readv, writev, read2d, write2d, etc. */
 	 void *  (*plug)   (void *parameter);
 	 void    (*unplug) (void *base);
-	  int    (*copy)   (void *base_src, void* obj_src, off_t offset_src,  void *base_dst, void* obj_dst, off_t offset_dst, size_t size);
-	  int    (*bandwidth) (unsigned node);
+	 int    (*copy)   (void *base_src, void* obj_src, off_t offset_src,  void *base_dst, void* obj_dst, off_t offset_dst, size_t size, void * async_channel);
+	 int    (*bandwidth)    (unsigned node);
+	 void   (*wait_request) (void * async_channel);
+	 int    (*test_request) (void * async_channel);
+	 int	(*full_read)    (unsigned node, void * base, void * obj, void ** ptr, size_t * size);
+	 int 	(*full_write)   (unsigned node, void * base, void * obj, void * ptr, size_t size);
 };
 
-
 /* Posix functions to use disk memory */
 extern struct starpu_disk_ops starpu_disk_stdio_ops;
 extern struct starpu_disk_ops starpu_disk_unistd_ops;
 extern struct starpu_disk_ops starpu_disk_unistd_o_direct_ops;
+extern struct starpu_disk_ops starpu_disk_leveldb_ops;
 
-/*functions to add an existing memory */
 void starpu_disk_close(unsigned node, void *obj, size_t size);
 
-void * starpu_disk_open(unsigned node, void *pos, size_t size);
+void *starpu_disk_open(unsigned node, void *pos, size_t size);
 
-/* interface to create and to free a memory disk */
-int starpu_disk_register(struct starpu_disk_ops * func, void *parameter, size_t size);
+int starpu_disk_register(struct starpu_disk_ops *func, void *parameter, size_t size);
 
 #endif /* __STARPU_DISK_H__ */

+ 3 - 2
include/starpu_fxt.h

@@ -31,7 +31,7 @@ struct starpu_fxt_codelet_event
 {
 	char symbol[256];
 	int workerid;
-	enum starpu_perfmodel_archtype archtype;
+	struct starpu_perfmodel_arch arch;
 	uint32_t hash;
 	size_t size;
 	float time;
@@ -54,7 +54,7 @@ struct starpu_fxt_options
 	int file_rank;
 
 	char worker_names[STARPU_NMAXWORKERS][256];
-	enum starpu_perfmodel_archtype worker_archtypes[STARPU_NMAXWORKERS];
+	struct starpu_perfmodel_arch worker_archtypes[STARPU_NMAXWORKERS];
 	int nworkers;
 
 	struct starpu_fxt_codelet_event **dumped_codelets;
@@ -65,6 +65,7 @@ void starpu_fxt_options_init(struct starpu_fxt_options *options);
 void starpu_fxt_generate_trace(struct starpu_fxt_options *options);
 void starpu_fxt_start_profiling(void);
 void starpu_fxt_stop_profiling(void);
+void starpu_fxt_write_data_trace(char *filename_in);
 
 #ifdef __cplusplus
 }

+ 25 - 41
include/starpu_perfmodel.h

@@ -23,6 +23,7 @@
 #include <stdio.h>
 
 #include <starpu_util.h>
+#include <starpu_worker.h>
 
 #ifdef __cplusplus
 extern "C"
@@ -32,36 +33,15 @@ extern "C"
 struct starpu_task;
 struct starpu_data_descr;
 
-enum starpu_perfmodel_archtype
+#define STARPU_NARCH STARPU_ANY_WORKER
+
+struct starpu_perfmodel_arch
 {
-	STARPU_CPU_DEFAULT = 0,
-	STARPU_CUDA_DEFAULT = STARPU_MAXCPUS,
-	STARPU_OPENCL_DEFAULT = STARPU_CUDA_DEFAULT + STARPU_MAXCUDADEVS,
-	STARPU_MIC_DEFAULT = STARPU_OPENCL_DEFAULT + STARPU_MAXOPENCLDEVS,
-	STARPU_SCC_DEFAULT = STARPU_MIC_DEFAULT + STARPU_MAXMICDEVS
+	enum starpu_worker_archtype type;
+	int devid;
+	int ncore;
 };
 
-#ifdef __STDC_VERSION__
-#  if __STDC_VERSION__ > 199901L || STARPU_GNUC_PREREQ(4, 6)
-
-/* Make sure the following assertions hold, since StarPU relies on it.  */
-
-_Static_assert(STARPU_CPU_DEFAULT == 0,
-	       "invalid STARPU_CPU_DEFAULT value");
-_Static_assert(STARPU_CPU_DEFAULT < STARPU_CUDA_DEFAULT,
-	       "invalid STARPU_{CPU,CUDA}_DEFAULT values");
-_Static_assert(STARPU_CUDA_DEFAULT < STARPU_OPENCL_DEFAULT,
-	       "invalid STARPU_{CUDA,OPENCL}_DEFAULT values");
-_Static_assert(STARPU_OPENCL_DEFAULT < STARPU_MIC_DEFAULT,
-	       "invalid STARPU_{OPENCL,MIC}_DEFAULT values");
-_Static_assert(STARPU_MIC_DEFAULT < STARPU_SCC_DEFAULT,
-	       "invalid STARPU_{MIC,SCC}_DEFAULT values");
-
-#  endif
-#endif
-
-#define STARPU_NARCH_VARIATIONS	(STARPU_MIC_DEFAULT + STARPU_MAXMICDEVS)
-
 struct starpu_perfmodel_history_entry
 {
 	double mean;
@@ -113,8 +93,8 @@ struct starpu_perfmodel_history_table;
 struct starpu_perfmodel_per_arch
 {
 	double (*cost_model)(struct starpu_data_descr *t) STARPU_DEPRECATED;
-	double (*cost_function)(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl);
-	size_t (*size_base)(struct starpu_task *, enum starpu_perfmodel_archtype arch, unsigned nimpl);
+	double (*cost_function)(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
+	size_t (*size_base)(struct starpu_task *, struct starpu_perfmodel_arch* arch, unsigned nimpl);
 
 	struct starpu_perfmodel_history_table *history;
 	struct starpu_perfmodel_history_list *list;
@@ -142,38 +122,42 @@ struct starpu_perfmodel
 
 	size_t (*size_base)(struct starpu_task *, unsigned nimpl);
 
-	struct starpu_perfmodel_per_arch per_arch[STARPU_NARCH_VARIATIONS][STARPU_MAXIMPLEMENTATIONS];
+	struct starpu_perfmodel_per_arch**** per_arch; /*STARPU_MAXIMPLEMENTATIONS*/
 
 	const char *symbol;
 
+	unsigned is_init;
 	unsigned is_loaded;
 	unsigned benchmarking;
 	starpu_pthread_rwlock_t model_rwlock;
 };
 
-enum starpu_perfmodel_archtype starpu_worker_get_perf_archtype(int workerid);
+void starpu_perfmodel_init(struct starpu_perfmodel *model);
+void starpu_perfmodel_init_with_file(FILE*f, struct starpu_perfmodel *model);
+
+struct starpu_perfmodel_arch *starpu_worker_get_perf_archtype(int workerid);
 
 int starpu_perfmodel_load_symbol(const char *symbol, struct starpu_perfmodel *model);
 int starpu_perfmodel_unload_model(struct starpu_perfmodel *model);
 
-void starpu_perfmodel_debugfilepath(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, char *path, size_t maxlen, unsigned nimpl);
-void starpu_perfmodel_get_arch_name(enum starpu_perfmodel_archtype arch, char *archname, size_t maxlen, unsigned nimpl);
+void starpu_perfmodel_debugfilepath(struct starpu_perfmodel *model, struct starpu_perfmodel_arch *arch, char *path, size_t maxlen, unsigned nimpl);
+char* starpu_perfmodel_get_archtype_name(enum starpu_worker_archtype archtype);
+void starpu_perfmodel_get_arch_name(struct starpu_perfmodel_arch *arch, char *archname, size_t maxlen, unsigned nimpl);
 
-double starpu_permodel_history_based_expected_perf(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, uint32_t footprint);
+double starpu_permodel_history_based_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, uint32_t footprint);
 int starpu_perfmodel_list(FILE *output);
-void starpu_perfmodel_print(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, unsigned nimpl, char *parameter, uint32_t *footprint, FILE *output);
+void starpu_perfmodel_print(struct starpu_perfmodel *model, struct starpu_perfmodel_arch *arch, unsigned nimpl, char *parameter, uint32_t *footprint, FILE *output);
 int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char *parameter, uint32_t *footprint, FILE *output);
 
-void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned cpuid, unsigned nimpl, double measured);
+void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned cpuid, unsigned nimpl, double measured);
+void starpu_perfmodel_directory(FILE *output);
 
 void starpu_bus_print_bandwidth(FILE *f);
 void starpu_bus_print_affinity(FILE *f);
 
-double starpu_get_bandwidth_RAM_CUDA(unsigned cudadev);
-double starpu_get_latency_RAM_CUDA(unsigned cudadev);
-double starpu_get_bandwidth_CUDA_RAM(unsigned cudadev);
-double starpu_get_latency_CUDA_RAM(unsigned cudadev);
-
+double starpu_transfer_bandwidth(unsigned src_node, unsigned dst_node);
+double starpu_transfer_latency(unsigned src_node, unsigned dst_node);
+double starpu_transfer_predict(unsigned src_node, unsigned dst_node, size_t size);
 
 #ifdef __cplusplus
 }

+ 1 - 0
include/starpu_profiling.h

@@ -80,6 +80,7 @@ struct starpu_profiling_bus_info
 	int transfer_count;
 };
 
+void starpu_profiling_init(void);
 void starpu_profiling_set_id(int new_id);
 int starpu_profiling_status_set(int status);
 int starpu_profiling_status_get(void);

+ 27 - 42
include/starpu_sched_ctx.h

@@ -24,14 +24,17 @@ extern "C"
 {
 #endif
 
+#define STARPU_SCHED_CTX_POLICY_NAME		 (1<<16)
+#define STARPU_SCHED_CTX_POLICY_STRUCT		 (2<<16)
+#define STARPU_SCHED_CTX_POLICY_MIN_PRIO	 (3<<16)
+#define STARPU_SCHED_CTX_POLICY_MAX_PRIO	 (4<<16)
 
-unsigned starpu_sched_ctx_create(const char *policy_name, int *workerids_ctx, int nworkers_ctx, const char *sched_ctx_name);
-
-struct starpu_sched_policy;
-unsigned starpu_sched_ctx_create_with_custom_policy(struct starpu_sched_policy *policy, int *workerids, int nworkers, const char *sched_name);
+unsigned starpu_sched_ctx_create(int *workerids_ctx, int nworkers_ctx, const char *sched_ctx_name, ...);
 
 unsigned starpu_sched_ctx_create_inside_interval(const char *policy_name, const char *sched_name, int min_ncpus, int max_ncpus, int min_ngpus, int max_ngpus, unsigned allow_overlap);
 
+void starpu_sched_ctx_register_close_callback(unsigned sched_ctx_id, void (*close_callback)(unsigned sched_ctx_id, void* args), void *args);
+
 void starpu_sched_ctx_add_workers(int *workerids_ctx, int nworkers_ctx, unsigned sched_ctx_id);
 
 void starpu_sched_ctx_remove_workers(int *workerids_ctx, int nworkers_ctx, unsigned sched_ctx_id);
@@ -50,37 +53,6 @@ void starpu_sched_ctx_finished_submit(unsigned sched_ctx_id);
 
 unsigned starpu_sched_ctx_get_workers_list(unsigned sched_ctx_id, int **workerids);
 
-struct starpu_sched_ctx_performance_counters
-{
-	void (*notify_idle_cycle)(unsigned sched_ctx_id, int worker, double idle_time);
-	void (*notify_idle_end)(unsigned sched_ctx_id, int worker);
-	void (*notify_pushed_task)(unsigned sched_ctx_id, int worker);
-	void (*notify_poped_task)(unsigned sched_ctx_id, int worker, struct starpu_task *task, size_t data_size, uint32_t footprint);
-	void (*notify_post_exec_hook)(unsigned sched_ctx_id, int taskid);
-	void (*notify_submitted_job)(struct starpu_task *task, uint32_t footprint, size_t data_size);
-	void (*notify_delete_context)(unsigned sched_ctx);
-};
-
-#ifdef STARPU_USE_SC_HYPERVISOR
-void starpu_sched_ctx_set_perf_counters(unsigned sched_ctx_id, struct starpu_sched_ctx_performance_counters *perf_counters);
-void starpu_sched_ctx_call_pushed_task_cb(int workerid, unsigned sched_ctx_id);
-#endif //STARPU_USE_SC_HYPERVISOR
-
-void starpu_sched_ctx_notify_hypervisor_exists(void);
-
-unsigned starpu_sched_ctx_check_if_hypervisor_exists(void);
-
-void starpu_sched_ctx_set_policy_data(unsigned sched_ctx_id, void *policy_data);
-
-void *starpu_sched_ctx_get_policy_data(unsigned sched_ctx_id);
-
-
-struct starpu_worker_collection *starpu_sched_ctx_create_worker_collection(unsigned sched_ctx_id, enum starpu_worker_collection_type type);
-
-void starpu_sched_ctx_delete_worker_collection(unsigned sched_ctx_id);
-
-struct starpu_worker_collection *starpu_sched_ctx_get_worker_collection(unsigned sched_ctx_id);
-
 unsigned starpu_sched_ctx_get_nworkers(unsigned sched_ctx_id);
 
 unsigned starpu_sched_ctx_get_nshared_workers(unsigned sched_ctx_id, unsigned sched_ctx_id2);
@@ -89,13 +61,9 @@ unsigned starpu_sched_ctx_contains_worker(int workerid, unsigned sched_ctx_id);
 
 unsigned starpu_sched_ctx_contains_type_of_worker(enum starpu_worker_archtype arch, unsigned sched_ctx_id);
 
-unsigned starpu_sched_ctx_overlapping_ctxs_on_worker(int workerid);
-
-unsigned starpu_sched_ctx_is_ctxs_turn(int workerid, unsigned sched_ctx_id);
-
-void starpu_sched_ctx_set_turn_to_other_ctx(int workerid, unsigned sched_ctx_id);
+unsigned starpu_sched_ctx_worker_get_id(unsigned sched_ctx_id);
 
-double starpu_sched_ctx_get_max_time_worker_on_ctx(void);
+unsigned starpu_sched_ctx_overlapping_ctxs_on_worker(int workerid);
 
 int starpu_sched_get_min_priority(void);
 
@@ -113,14 +81,31 @@ int starpu_sched_ctx_set_min_priority(unsigned sched_ctx_id, int min_prio);
 
 int starpu_sched_ctx_set_max_priority(unsigned sched_ctx_id, int max_prio);
 
+int starpu_sched_ctx_min_priority_is_set(unsigned sched_ctx_id);
+
+int starpu_sched_ctx_max_priority_is_set(unsigned sched_ctx_id);
+
 #define STARPU_MIN_PRIO		(starpu_sched_get_min_priority())
 #define STARPU_MAX_PRIO		(starpu_sched_get_max_priority())
 
 #define STARPU_DEFAULT_PRIO	0
 
-/* execute any parallel code on the workers of the sched_ctx (workers are blocked) */
+struct starpu_worker_collection *starpu_sched_ctx_create_worker_collection(unsigned sched_ctx_id, enum starpu_worker_collection_type type);
+
+void starpu_sched_ctx_delete_worker_collection(unsigned sched_ctx_id);
+
+struct starpu_worker_collection *starpu_sched_ctx_get_worker_collection(unsigned sched_ctx_id);
+
+void starpu_sched_ctx_set_policy_data(unsigned sched_ctx_id, void *policy_data);
+
+void *starpu_sched_ctx_get_policy_data(unsigned sched_ctx_id);
+
 void *starpu_sched_ctx_exec_parallel_code(void* (*func)(void*), void *param, unsigned sched_ctx_id);
 
+#ifdef STARPU_USE_SC_HYPERVISOR
+void starpu_sched_ctx_call_pushed_task_cb(int workerid, unsigned sched_ctx_id);
+#endif //STARPU_USE_SC_HYPERVISOR
+
 #ifdef __cplusplus
 }
 #endif

+ 51 - 0
include/starpu_sched_ctx_hypervisor.h

@@ -0,0 +1,51 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010 - 2012  INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_SCHED_CTX_HYPERVISOR_H__
+#define __STARPU_SCHED_CTX_HYPERVISOR_H__
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+
+
+struct starpu_sched_ctx_performance_counters
+{
+	void (*notify_idle_cycle)(unsigned sched_ctx_id, int worker, double idle_time);
+	void (*notify_poped_task)(unsigned sched_ctx_id, int worker);
+	void (*notify_pushed_task)(unsigned sched_ctx_id, int worker);
+	void (*notify_post_exec_task)(struct starpu_task *task, size_t data_size, uint32_t footprint, int hypervisor_tag);
+	void (*notify_submitted_job)(struct starpu_task *task, uint32_t footprint, size_t data_size);
+	void (*notify_ready_task)(unsigned sched_ctx_id, struct starpu_task *task);
+	void (*notify_empty_ctx)(unsigned sched_ctx_id, struct starpu_task *task);
+	void (*notify_delete_context)(unsigned sched_ctx);
+};
+
+#ifdef STARPU_USE_SC_HYPERVISOR
+void starpu_sched_ctx_set_perf_counters(unsigned sched_ctx_id, void *perf_counters);
+#endif //STARPU_USE_SC_HYPERVISOR
+
+void starpu_sched_ctx_notify_hypervisor_exists(void);
+
+unsigned starpu_sched_ctx_check_if_hypervisor_exists(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __STARPU_SCHED_CTX_HYPERVISOR_H__ */

+ 12 - 7
include/starpu_scheduler.h

@@ -51,6 +51,10 @@ struct starpu_sched_policy **starpu_sched_get_predefined_policies();
 
 void starpu_worker_get_sched_condition(int workerid, starpu_pthread_mutex_t **sched_mutex, starpu_pthread_cond_t **sched_cond);
 
+/* This function must be called to wake up a worker that is sleeping on the cond. 
+ * It returns 0 whenever the worker is not in a sleeping state */
+int starpu_wakeup_worker(int workerid, starpu_pthread_cond_t *cond, starpu_pthread_mutex_t *mutex);
+
 int starpu_worker_can_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl);
 
 int starpu_push_local_task(int workerid, struct starpu_task *task, int back);
@@ -64,18 +68,19 @@ int starpu_combined_worker_can_execute_task(unsigned workerid, struct starpu_tas
 int starpu_get_prefetch_flag(void);
 int starpu_prefetch_task_input_on_node(struct starpu_task *task, unsigned node);
 
-uint32_t starpu_task_footprint(struct starpu_perfmodel *model, struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl);
-double starpu_task_expected_length(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl);
-double starpu_worker_get_relative_speedup(enum starpu_perfmodel_archtype perf_archtype);
+uint32_t starpu_task_footprint(struct starpu_perfmodel *model, struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl);
+double starpu_task_expected_length(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl);
+double starpu_worker_get_relative_speedup(struct starpu_perfmodel_arch *perf_arch);
 double starpu_task_expected_data_transfer_time(unsigned memory_node, struct starpu_task *task);
 double starpu_data_expected_transfer_time(starpu_data_handle_t handle, unsigned memory_node, enum starpu_data_access_mode mode);
-double starpu_task_expected_power(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl);
-double starpu_task_expected_conversion_time(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl);
+double starpu_task_expected_power(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl);
+double starpu_task_expected_conversion_time(struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned nimpl);
 
-double starpu_task_bundle_expected_length(starpu_task_bundle_t bundle, enum starpu_perfmodel_archtype arch, unsigned nimpl);
+double starpu_task_bundle_expected_length(starpu_task_bundle_t bundle, struct starpu_perfmodel_arch *arch, unsigned nimpl);
 double starpu_task_bundle_expected_data_transfer_time(starpu_task_bundle_t bundle, unsigned memory_node);
-double starpu_task_bundle_expected_power(starpu_task_bundle_t bundle, enum starpu_perfmodel_archtype arch, unsigned nimpl);
+double starpu_task_bundle_expected_power(starpu_task_bundle_t bundle, struct starpu_perfmodel_arch *arch, unsigned nimpl);
 
+void starpu_sched_ctx_worker_shares_tasks_lists(int workerid, int sched_ctx_id);
 #ifdef __cplusplus
 }
 #endif

+ 7 - 0
include/starpu_task.h

@@ -125,6 +125,13 @@ struct starpu_task
 
 	void (*callback_func)(void *);
 	void *callback_arg;
+	/* must StarPU release callback_arg ? - 0 by default */
+	unsigned callback_arg_free;
+
+	void (*prologue_callback_func)(void *);
+	void *prologue_callback_arg;
+	/* must StarPU release prologue_callback_arg ? - 0 by default */
+	unsigned prologue_callback_arg_free;
 
 	unsigned use_tag;
 	starpu_tag_t tag_id;

+ 18 - 13
include/starpu_task_util.h

@@ -31,19 +31,24 @@ extern "C"
 
 void starpu_create_sync_task(starpu_tag_t sync_tag, unsigned ndeps, starpu_tag_t *deps, void (*callback)(void *), void *callback_arg);
 
-#define STARPU_VALUE		 (1<<19)
-#define STARPU_CALLBACK		 (1<<20)
-#define STARPU_CALLBACK_WITH_ARG (1<<21)
-#define STARPU_CALLBACK_ARG	 (1<<22)
-#define STARPU_PRIORITY		 (1<<23)
-#define STARPU_EXECUTE_ON_NODE	 (1<<24)
-#define STARPU_EXECUTE_ON_DATA	 (1<<25)
-#define STARPU_DATA_ARRAY        (1<<26)
-#define STARPU_TAG               (1<<27)
-#define STARPU_HYPERVISOR_TAG	 (1<<28)
-#define STARPU_FLOPS	         (1<<29)
-#define STARPU_SCHED_CTX	 (1<<30)
-
+#define STARPU_VALUE		 (1<<16)
+#define STARPU_CALLBACK		 (2<<16)
+#define STARPU_CALLBACK_WITH_ARG (3<<16)
+#define STARPU_CALLBACK_ARG	 (4<<16)
+#define STARPU_PRIORITY		 (5<<16)
+#define STARPU_EXECUTE_ON_NODE	 (6<<16)
+#define STARPU_EXECUTE_ON_DATA	 (7<<16)
+#define STARPU_DATA_ARRAY        (8<<16)
+#define STARPU_TAG               (9<<16)
+#define STARPU_HYPERVISOR_TAG	 (10<<16)
+#define STARPU_FLOPS	         (11<<16)
+#define STARPU_SCHED_CTX	 (12<<16)
+#define STARPU_PROLOGUE_CALLBACK   (13<<16)
+#define STARPU_PROLOGUE_CALLBACK_ARG (14<<16)
+
+struct starpu_task *starpu_task_build(struct starpu_codelet *cl, ...);
+int starpu_task_insert(struct starpu_codelet *cl, ...);
+/* the function starpu_insert_task has the same semantics as starpu_task_insert, it is kept to avoid breaking old codes */
 int starpu_insert_task(struct starpu_codelet *cl, ...);
 
 void starpu_codelet_unpack_args(void *cl_arg, ...);

+ 5 - 5
include/starpu_thread.h

@@ -18,6 +18,11 @@
 #ifndef __STARPU_THREAD_H__
 #define __STARPU_THREAD_H__
 
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
 #ifdef STARPU_SIMGRID
 #include <xbt/synchro_core.h>
 #include <msg/msg.h>
@@ -25,11 +30,6 @@
 #include <pthread.h>
 #endif
 
-#ifdef __cplusplus
-extern "C"
-{
-#endif
-
 /*
  * Encapsulation of the pthread_create function.
  */

+ 2 - 2
include/starpu_thread_util.h

@@ -250,8 +250,8 @@
 	}                                                                      \
 } while (0)
 
-#define STARPU_PTHREAD_BARRIER_WAIT(barrier) do {                             \
-	int p_ret = pthread_barrier_wait(barrier);                             \
+#define STARPU_PTHREAD_BARRIER_WAIT(barrier) do {                             	\
+	int p_ret = pthread_barrier_wait((barrier));				\
 	if (STARPU_UNLIKELY(!((p_ret == 0) || (p_ret == PTHREAD_BARRIER_SERIAL_THREAD)))) { \
 		fprintf(stderr,                                                \
 			"%s:%d pthread_barrier_wait: %s\n",                    \

+ 0 - 0
include/starpu_util.h


Niektoré súbory nie sú zobrazené, pretože je v týchto rozdielových dátach zmenené mnoho súborov