Corentin Salingue, 12 years ago
Parent
Commit
da301c4c6d
77 changed files with 1075 additions and 12653 deletions
  1. ChangeLog  (+5 -2)
  2. configure.ac  (+15 -1)
  3. doc/doxygen/Makefile.am  (+4 -4)
  4. doc/doxygen/chapters/api/performance_model.doxy  (+10 -2)
  5. doc/doxygen/chapters/api/scheduling_context_hypervisor.doxy  (+9 -0)
  6. doc/doxygen/chapters/api/scheduling_contexts.doxy  (+16 -0)
  7. doc/doxygen/chapters/configure_options.doxy  (+7 -0)
  8. doc/doxygen/chapters/scheduling_context_hypervisor.doxy  (+1 -1)
  9. doc/texinfo/Makefile.am  (+0 -122)
  10. doc/texinfo/chapters/advanced-examples.texi  (+0 -1368)
  11. doc/texinfo/chapters/api.texi  (+0 -4311)
  12. doc/texinfo/chapters/basic-examples.texi  (+0 -960)
  13. doc/texinfo/chapters/c-extensions.texi  (+0 -485)
  14. doc/texinfo/chapters/configuration.texi  (+0 -672)
  15. doc/texinfo/chapters/fdl-1.3.texi  (+0 -507)
  16. doc/texinfo/chapters/fft-support.texi  (+0 -58)
  17. doc/texinfo/chapters/hypervisor_api.texi  (+0 -301)
  18. doc/texinfo/chapters/installing.texi  (+0 -338)
  19. doc/texinfo/chapters/introduction.texi  (+0 -201)
  20. doc/texinfo/chapters/mic-scc-support.texi  (+0 -55)
  21. doc/texinfo/chapters/mpi-support.texi  (+0 -418)
  22. doc/texinfo/chapters/perf-feedback.texi  (+0 -608)
  23. doc/texinfo/chapters/perf-optimization.texi  (+0 -569)
  24. doc/texinfo/chapters/sc_hypervisor.texi  (+0 -130)
  25. doc/texinfo/chapters/scaling-vector-example.texi  (+0 -47)
  26. doc/texinfo/chapters/sched_ctx.texi  (+0 -116)
  27. doc/texinfo/chapters/socl.texi  (+0 -17)
  28. doc/texinfo/chapters/tips-tricks.texi  (+0 -111)
  29. doc/texinfo/chapters/vector_scal_c.texi  (+0 -118)
  30. doc/texinfo/chapters/vector_scal_cpu.texi  (+0 -68)
  31. doc/texinfo/chapters/vector_scal_cuda.texi  (+0 -35)
  32. doc/texinfo/chapters/vector_scal_opencl.texi  (+0 -61)
  33. doc/texinfo/chapters/vector_scal_opencl_codelet.texi  (+0 -16)
  34. doc/texinfo/dev/starpu_check_documented.py  (+0 -40)
  35. doc/texinfo/dev/starpu_check_undocumented.sh  (+0 -78)
  36. doc/texinfo/dev/starpu_funcs.cocci  (+0 -28)
  37. doc/texinfo/starpu.css  (+0 -160)
  38. doc/texinfo/starpu.texi  (+0 -272)
  39. examples/Makefile.am  (+3 -0)
  40. examples/sched_ctx/dummy_sched_with_ctx.c  (+183 -0)
  41. include/starpu_perfmodel.h  (+3 -0)
  42. include/starpu_sched_ctx.h  (+8 -1)
  43. mpi/src/starpu_mpi.c  (+21 -17)
  44. mpi/tests/Makefile.am  (+3 -0)
  45. mpi/tests/datatypes.c  (+2 -0)
  46. mpi/tests/mpi_earlyrecv2.c  (+159 -41)
  47. sc_hypervisor/examples/Makefile.am  (+2 -1)
  48. sc_hypervisor/examples/app_driven_test/app_driven_test.c  (+5 -4)
  49. sc_hypervisor/examples/lp_test/lp_resize_test.c  (+137 -0)
  50. sc_hypervisor/examples/lp_test/lp_test.c  (+4 -2)
  51. sc_hypervisor/include/sc_hypervisor.h  (+11 -5)
  52. sc_hypervisor/include/sc_hypervisor_lp.h  (+2 -2)
  53. sc_hypervisor/include/sc_hypervisor_monitoring.h  (+7 -6)
  54. sc_hypervisor/include/sc_hypervisor_policy.h  (+14 -13)
  55. sc_hypervisor/src/hypervisor_policies/debit_lp_policy.c  (+43 -30)
  56. sc_hypervisor/src/hypervisor_policies/feft_lp_policy.c  (+49 -22)
  57. sc_hypervisor/src/hypervisor_policies/gflops_rate_policy.c  (+2 -2)
  58. sc_hypervisor/src/hypervisor_policies/ispeed_lp_policy.c  (+52 -41)
  59. sc_hypervisor/src/hypervisor_policies/ispeed_policy.c  (+19 -19)
  60. sc_hypervisor/src/hypervisor_policies/teft_lp_policy.c  (+47 -14)
  61. sc_hypervisor/src/policies_utils/lp_programs.c  (+1 -1)
  62. sc_hypervisor/src/policies_utils/lp_tools.c  (+8 -9)
  63. sc_hypervisor/src/policies_utils/policy_tools.c  (+36 -24)
  64. sc_hypervisor/src/policies_utils/speed.c  (+39 -46)
  65. sc_hypervisor/src/policies_utils/task_pool.c  (+2 -1)
  66. sc_hypervisor/src/sc_hypervisor.c  (+23 -14)
  67. sc_hypervisor/src/sc_hypervisor_intern.h  (+4 -4)
  68. src/common/list.h  (+27 -27)
  69. src/core/perfmodel/perfmodel_bus.c  (+12 -2)
  70. src/core/sched_ctx.c  (+50 -12)
  71. src/core/sched_ctx.h  (+1 -1)
  72. src/core/sched_policy.c  (+3 -6)
  73. src/core/sched_policy.h  (+3 -1)
  74. src/core/task.c  (+11 -2)
  75. src/core/workers.c  (+5 -1)
  76. tests/disk/disk_copy.c  (+1 -1)
  77. tools/gdbinit  (+6 -1)

+ 5 - 2
ChangeLog

@@ -170,8 +170,6 @@ Small features:
   * New function starpu_get_version() to return as 3 integers the
     release version of StarPU.
   * Enable by default data allocation cache
-  * Explicitly name the non-sleeping-non-running time "Overhead", and use
-    another color in vite traces.
 
 Changes:
   * Rename all filter functions to follow the pattern
@@ -238,6 +236,11 @@ Small changes:
   * Fix forcing calibration of never-calibrated archs.
   * CUDA applications are no longer compiled with the "-arch sm_13"
     option. It is specifically added to applications which need it.
+  * Explicitly name the non-sleeping-non-running time "Overhead", and use
+    another color in vite traces.
+  * Use C99 variadic macro support, not GNU.
+  * Fix performance regression: dmda queues were inadvertently made
+    LIFOs in r9611.
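[Editorial note: to illustrate the variadic-macro entry above, here is a minimal sketch. It is not part of the commit, and the DBG_* macro names are hypothetical; it only contrasts the GNU-specific form with the portable C99 form the entry refers to.]

#include <stdio.h>

/* GNU-specific form: named variadic parameter plus `##' pasting;
 * only accepted by GNU-compatible compilers. */
#define DBG_GNU(fmt, args...)  fprintf(stderr, fmt, ##args)

/* C99 form: __VA_ARGS__ is standard and portable. */
#define DBG_C99(fmt, ...)      fprintf(stderr, fmt, __VA_ARGS__)

int main(void)
{
	DBG_GNU("gnu form: %d\n", 1);
	DBG_C99("c99 form: %d\n", 2);
	return 0;
}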
 
 StarPU 1.0.3 (svn revision 7379)
 ==============================================

+ 15 - 1
configure.ac

@@ -2175,11 +2175,25 @@ AC_ARG_ENABLE(build-doc, [AS_HELP_STRING([--disable-build-doc],
 			[disable building of documentation])],
 			enable_build_doc=$enableval, enable_build_doc=yes)
 
-# Check whether doxygen is installed
+# Check whether doxygen and pdflatex are installed
 AC_PATH_PROG(doxygencommand, doxygen)
 if test "$doxygencommand" = "" ; then
 	enable_build_doc="no"
+else
+	DOXYGEN_VERSION_MAJOR=`$doxygencommand --version| cut -d '.' -f1`
+	DOXYGEN_VERSION_MINOR=`$doxygencommand --version| cut -d '.' -f2`
+	if test $DOXYGEN_VERSION_MAJOR -gt 1 -o \( $DOXYGEN_VERSION_MAJOR -eq 1 -a $DOXYGEN_VERSION_MINOR -ge 8 \) ; then
+	   	enable_build_doc="yes"
+	else
+	   	enable_build_doc="no"
+	fi
+fi
+AC_PATH_PROG(pdflatex, pdflatex)
+if test "pdflatexcommand" = "" ; then
+	enable_build_doc="no"
 fi
+AC_MSG_CHECKING(whether documentation should be compiled)
+AC_MSG_RESULT($enable_build_doc)
 
 AM_CONDITIONAL(BUILD_DOC, [test x$enable_build_doc != xno])
 

+ 4 - 4
doc/doxygen/Makefile.am

@@ -99,7 +99,7 @@ starpu_config.h: $(top_srcdir)/include/starpu_config.h.in
 	sed 's/#undef \(.*\)/#define \1 1/' $< > $@
 
 chapters/version.sty: $(chapters)
-	@-for f in $(chapters) ; do \
+	@for f in $(chapters) ; do \
                 if test -f $(top_srcdir)/doc/doxygen/$$f ; then stat --format=%Y $(top_srcdir)/doc/doxygen/$$f 2>/dev/null ; fi \
         done | sort -r | head -1 > timestamp
 	@if test -s timestamp ; then \
@@ -112,12 +112,12 @@ chapters/version.sty: $(chapters)
 		echo "\newcommand{\STARPUUPDATED}{unknown date}" > $(top_srcdir)/doc/doxygen/chapters/version.sty;\
 	fi
 	@echo "\newcommand{\STARPUVERSION}{$(VERSION)}" >> $(top_srcdir)/doc/doxygen/chapters/version.sty
-	@-for f in timestamp timestamp_updated timestamp_updated_month ; do \
+	@for f in timestamp timestamp_updated timestamp_updated_month ; do \
 		if test -f $$f ; then $(RM) $$f ; fi ;\
 	done
 
 chapters/version.html: $(chapters)
-	@-for f in $(chapters) ; do \
+	@for f in $(chapters) ; do \
                 if test -f $(top_srcdir)/doc/doxygen/$$f ; then stat --format=%Y $(top_srcdir)/doc/doxygen/$$f 2>/dev/null ; fi \
         done | sort -r | head -1 > timestamp
 	@if test -s timestamp ; then \
@@ -130,7 +130,7 @@ chapters/version.html: $(chapters)
 	else \
 		echo "Its contents was last updated on <em>unknown_date</em>." >> $(top_srcdir)/doc/doxygen/chapters/version.html;\
 	fi
-	@-for f in timestamp timestamp_updated timestamp_updated_month ; do \
+	@for f in timestamp timestamp_updated timestamp_updated_month ; do \
 		if test -f $$f ; then $(RM) $$f ; fi ;\
 	done
 

+ 10 - 2
doc/doxygen/chapters/api/performance_model.doxy

@@ -262,10 +262,18 @@ of use can be seen in \ref PerformanceModelExample.
 
 \fn double starpu_get_bandwidth_RAM_CUDA(unsigned cudadev)
 \ingroup API_Performance_Model
-Used to compute the velocity of resources
+Used to compute the execution time of tasks
 
 \fn double starpu_get_latency_RAM_CUDA(unsigned cudadev)
 \ingroup API_Performance_Model
-Used to compute the velocity of resources
+Used to compute the execution time of tasks
+
+\fn double starpu_get_bandwidth_CUDA_RAM(unsigned cudadev)
+\ingroup API_Performance_Model
+Used to compute the execution time of tasks
+
+\fn double starpu_get_latency_CUDA_RAM(unsigned cudadev)
+\ingroup API_Performance_Model
+Used to compute the execution time of tasks
 
 */
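[Editorial note: a hedged sketch of how these query functions could be combined into a transfer-time estimate. The helper name is hypothetical, and the units (MB/s for bandwidth, microseconds for latency) are assumptions; the documentation above does not state them.]

#include <stddef.h>
#include <starpu.h>

/* Estimate the time to move `size' bytes from RAM to CUDA device
 * `cudadev', using the bandwidth and latency queried above. */
static double estimate_ram_to_cuda_us(unsigned cudadev, size_t size)
{
	double bandwidth = starpu_get_bandwidth_RAM_CUDA(cudadev); /* assumed MB/s */
	double latency   = starpu_get_latency_RAM_CUDA(cudadev);   /* assumed microseconds */
	return latency + ((double) size / (1024.0 * 1024.0)) / bandwidth * 1e6;
}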

+ 9 - 0
doc/doxygen/chapters/api/scheduling_context_hypervisor.doxy

@@ -169,6 +169,15 @@ Forbid resizing of a context
 Allow resizing of a context. The user can then provide information to
 the hypervisor concerning the conditions of resizing.
 
+\fn void sc_hypervisor_post_resize_request(unsigned sched_ctx, int task_tag)
+\ingroup API_Scheduling_Context_Hypervisor
+Requests resizing of the context \p sched_ctx whenever a task tagged with the id \p task_tag
+finishes executing
+
+\fn void sc_hypervisor_resize_ctxs(int *sched_ctxs, int nsched_ctxs, int *workers, int nworkers)
+\ingroup API_Scheduling_Context_Hypervisor
+Requests reconsidering the distribution of resources over the indicated scheduling contexts
+
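[Editorial note: a minimal usage sketch for the two calls documented above; the tag value and worker ids are illustrative, not part of the commit.]

#include <sc_hypervisor.h>

void trigger_resizing(unsigned sched_ctx, int *sched_ctxs, int nsched_ctxs)
{
	int workers[] = { 0, 1, 2, 3 };   /* illustrative worker ids */

	/* Ask the hypervisor to resize `sched_ctx' once the task tagged
	 * with id 42 finishes executing. */
	sc_hypervisor_post_resize_request(sched_ctx, 42);

	/* Ask it to reconsider the distribution of the listed workers
	 * over the given contexts. */
	sc_hypervisor_resize_ctxs(sched_ctxs, nsched_ctxs, workers, 4);
}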
 \fn void sc_hypervisor_ioctl(unsigned sched_ctx, ...)
 \ingroup API_Scheduling_Context_Hypervisor
 Inputs conditions to the context sched_ctx with the following

+ 16 - 0
doc/doxygen/chapters/api/scheduling_contexts.doxy

@@ -85,6 +85,16 @@ just been created. It will be further used to indicate the context the
 tasks will be submitted to. The return value should be at most
 \ref STARPU_NMAX_SCHED_CTXS.
 
+\fn unsigned starpu_sched_ctx_create_with_custom_policy(struct starpu_sched_policy *policy, int *workerids_ctx, int nworkers_ctx, const char *sched_ctx_name)
+\ingroup API_Scheduling_Contexts
+This function creates a scheduling context which uses the custom
+scheduling policy pointed to by \p policy and assigns the workers in
+\p workerids_ctx to execute the tasks submitted to it.
+The return value represents the identifier of the context that has
+just been created. It will be further used to indicate the context the
+tasks will be submitted to. The return value should be at most
+\ref STARPU_NMAX_SCHED_CTXS.
+
 \fn unsigned starpu_sched_ctx_create_inside_interval(const char *policy_name, const char *sched_name, int min_ncpus, int max_ncpus, int min_ngpus, int max_ngpus, unsigned allow_overlap)
 \ingroup API_Scheduling_Contexts
 Create a context indicating an approximate interval of resources
@@ -225,6 +235,12 @@ Delete the worker collection of the specified scheduling context
 \ingroup API_Scheduling_Contexts
 Return the worker collection managed by the indicated context
 
+\fn unsigned starpu_sched_ctx_get_workers_list(unsigned sched_ctx_id, int **workerids)
+\ingroup API_Scheduling_Contexts
+Returns the list of workers in the array \p workerids; the return value is
+the number of workers. The user should free the \p workerids table after
+use (it is allocated inside the function with the proper size).
+
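[Editorial note: a hedged sketch combining the two functions added above; my_policy stands in for a user-defined struct starpu_sched_policy, and the worker ids are illustrative.]

#include <stdio.h>
#include <stdlib.h>
#include <starpu.h>

extern struct starpu_sched_policy my_policy;  /* hypothetical custom policy */

void list_context_workers(void)
{
	int workerids_ctx[] = { 0, 1 };   /* illustrative worker ids */
	unsigned ctx = starpu_sched_ctx_create_with_custom_policy(
		&my_policy, workerids_ctx, 2, "my_ctx");

	int *workerids;
	unsigned i, n = starpu_sched_ctx_get_workers_list(ctx, &workerids);
	for (i = 0; i < n; i++)
		printf("worker %d runs in context %u\n", workerids[i], ctx);
	free(workerids);   /* allocated by the call, freed by the caller */
}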
 @name Scheduling Context Link with Hypervisor
 \ingroup API_Scheduling_Contexts
 

+ 7 - 0
doc/doxygen/chapters/configure_options.doxy

@@ -270,6 +270,13 @@ fail when copying data asynchronously. When using this implementation,
 it is therefore necessary to disable asynchronous data transfers.
 </dd>
 
+<dt>--enable-maxmicthreads</dt>
+<dd>
+\anchor enable-maxmicthreads
+\addindex __configure__--enable-maxmicthreads
+Specify the maximum number of MIC threads
+</dd>
+
 <dt>--disable-asynchronous-mic-copy</dt>
 <dd>
 \anchor disable-asynchronous-mic-copy

+ 1 - 1
doc/doxygen/chapters/scheduling_context_hypervisor.doxy

@@ -114,7 +114,7 @@ sc_hypervisor_ioctl(sched_ctx_id,
 \endcode
 
 The <b>Gflops rate</b> based strategy resizes the scheduling contexts such that they all finish at the same time.
-The velocity of each of them is considered and once one of them is significantly slower the resizing process is triggered.
+The speed of each of them is considered, and once one of them is significantly slower, the resizing process is triggered.
 In order to do these computations the user has to input the total number of instructions needed to be executed by the
 parallel kernels and the number of instructions to be executed by each
 task.

+ 0 - 122
doc/texinfo/Makefile.am

@@ -1,122 +0,0 @@
-# StarPU --- Runtime system for heterogeneous multicore architectures.
-#
-# Copyright (C) 2009, 2011  Université de Bordeaux 1
-# Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
-#
-# Permission is granted to copy, distribute and/or modify this document
-# under the terms of the GNU Free Documentation License, Version 1.3
-# or any later version published by the Free Software Foundation;
-# with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts.
-#
-# See the GNU Free Documentation License in COPYING.GFDL for more details.
-
-info_TEXINFOS = starpu.texi
-
-chapters =	\
-	chapters/advanced-examples.texi \
-	chapters/api.texi \
-	chapters/basic-examples.texi \
-	chapters/c-extensions.texi \
-	chapters/configuration.texi \
-	chapters/fdl-1.3.texi \
-	chapters/fft-support.texi \
-	chapters/hypervisor_api.texi \
-	chapters/installing.texi \
-	chapters/introduction.texi \
-	chapters/mpi-support.texi \
-	chapters/perf-feedback.texi \
-	chapters/perf-optimization.texi \
-	chapters/scaling-vector-example.texi \
-	chapters/sc_hypervisor.texi \
-	chapters/sched_ctx.texi \
-	chapters/socl.texi \
-	chapters/tips-tricks.texi \
-	chapters/vector_scal_cpu.texi \
-	chapters/vector_scal_c.texi \
-	chapters/vector_scal_cuda.texi \
-	chapters/vector_scal_opencl_codelet.texi \
-	chapters/vector_scal_opencl.texi
-
-starpu_TEXINFOS = 		\
-	chapters/version.texi 	\
-	$(chapters)
-
-MAINTAINERCLEANFILES = starpu.pdf starpu.html
-
-EXTRA_DIST = starpu.css				\
-	tutorial/README				\
-	tutorial/Makefile			\
-	tutorial/hello_world.c			\
-	tutorial/hello_world_plugin.c		\
-	tutorial/vector_scal.c			\
-	tutorial/vector_scal_cpu.c		\
-	tutorial/vector_scal_cuda.cu		\
-	tutorial/vector_scal_opencl.c		\
-	tutorial/vector_scal_opencl_kernel.cl	\
-	tutorial/vector_scal_plugin.c		\
-	tutorial/vector_scal_plugin_cuda.cu
-
-starpu_tutorial_dir	=	$(docdir)/tutorial
-starpu_tutorial__DATA	=			\
-	tutorial/README				\
-	tutorial/Makefile			\
-	tutorial/hello_world.c			\
-	tutorial/hello_world_plugin.c		\
-	tutorial/vector_scal.c			\
-	tutorial/vector_scal_cpu.c		\
-	tutorial/vector_scal_cuda.cu		\
-	tutorial/vector_scal_opencl.c		\
-	tutorial/vector_scal_opencl_kernel.cl	\
-	tutorial/vector_scal_plugin.c		\
-	tutorial/vector_scal_plugin_cuda.cu
-
-dist_pdf_DATA = starpu.pdf
-dist_html_DATA = starpu.html
-
-AM_MAKEINFOHTMLFLAGS = --css-include=$(top_srcdir)/doc/starpu.css --no-headers --no-split
-
-uninstall-local:
-	$(RM) $(DESTDIR)$(infodir)/dir
-
-chapters/version.texi: $(chapters)
-	@-for f in $(starpu_TEXINFOS) ; do \
-                if test -f $(top_srcdir)/doc/$$f ; then stat --format=%Y $(top_srcdir)/doc/$$f 2>/dev/null ; fi \
-        done | sort -r | head -1 > timestamp
-	@if test -s timestamp ; then \
-		LC_ALL=C date --date=@`cat timestamp` +"%d %B %Y" > timestamp_updated 2>/dev/null;\
-		LC_ALL=C date --date=@`cat timestamp` +"%B %Y" > timestamp_updated_month 2>/dev/null;\
-	fi
-	@if test -s timestamp_updated ; then \
-		echo "@set UPDATED " `cat timestamp_updated` > $(top_srcdir)/doc/chapters/version.texi;\
-		echo "@set UPDATED-MONTH" `cat timestamp_updated_month` >> $(top_srcdir)/doc/chapters/version.texi;\
-	else \
-		echo "@set UPDATED unknown_date" > $(top_srcdir)/doc/chapters/version.texi ;\
-		echo "@set UPDATED-MONTH unknown_date" >> $(top_srcdir)/doc/chapters/version.texi; \
-	fi
-	@echo "@set EDITION $(VERSION)" >> $(top_srcdir)/doc/chapters/version.texi
-	@echo "@set VERSION $(VERSION)" >> $(top_srcdir)/doc/chapters/version.texi
-	@-for f in timestamp timestamp_updated timestamp_updated_month ; do \
-		if test -f $$f ; then $(RM) $$f ; fi ;\
-	done
-
-#$(top_srcdir)/doc/starpu.texi: vector_scal_c.texi vector_scal_cuda.texi vector_scal_opencl.texi vector_scal_opencl_codelet.texi
-#vector_scal_c.texi: $(top_srcdir)/examples/basic_examples/vector_scal.c
-#	cat $< | sed 's/{/@{/g' | sed 's/}/@}/g' | sed 's/\t/    /g' > $@
-#vector_scal_cuda.texi: $(top_srcdir)/examples/basic_examples/vector_scal_cuda.cu
-#	cat $< | sed 's/{/@{/g' | sed 's/}/@}/g' | sed 's/\t/    /g' > $@
-#vector_scal_opencl.texi: $(top_srcdir)/examples/basic_examples/vector_scal_opencl.c
-#	cat $< | sed 's/{/@{/g' | sed 's/}/@}/g' | sed 's/\t/    /g' > $@
-#vector_scal_opencl_codelet.texi: $(top_srcdir)/examples/basic_examples/vector_scal_opencl_codelet.cl
-#	cat $< | sed 's/{/@{/g' | sed 's/}/@}/g' | sed 's/\t/    /g' > $@
-#
-#CLEANFILES= \
-#	vector_scal_c.texi vector_scal_cuda.texi vector_scal_opencl.texi vector_scal_opencl_codelet.texi
-
-# Rule to update documentation on web server. Should only be used locally.
-PUBLISHHOST	?= sync
-update-web: starpu.html
-	sed -i 's/gcc\.html#Attribute-Syntax/http:\/\/gcc.gnu.org\/onlinedocs\/gcc\/Attribute-Syntax.html#Attribute-Syntax/' starpu.html
-	scp starpu.pdf starpu.html $(PUBLISHHOST):/web/runtime/html/StarPU
-
-showcheck:
-	-cat /dev/null

File diff suppressed because it is too large
+ 0 - 1368
doc/texinfo/chapters/advanced-examples.texi


File diff suppressed because it is too large
+ 0 - 4311
doc/texinfo/chapters/api.texi


+ 0 - 960
doc/texinfo/chapters/basic-examples.texi

@@ -1,960 +0,0 @@
-@c -*-texinfo-*-
-
-@c This file is part of the StarPU Handbook.
-@c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
-@c Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
-@c Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
-@c See the file starpu.texi for copying conditions.
-
-@menu
-* Hello World using the C Extension::
-* Hello World using StarPU's API::
-* Vector Scaling Using the C Extension::
-* Vector Scaling Using StarPU's API::
-* Vector Scaling on an Hybrid CPU/GPU Machine::  Handling Heterogeneous Architectures
-@end menu
-
-@node Hello World using the C Extension
-@section Hello World using the C Extension
-
-This section shows how to implement a simple program that submits a task
-to StarPU using the StarPU C extension (@pxref{C
-Extensions})@footnote{The complete example, and additional examples,
-is available in the @file{gcc-plugin/examples} directory of the StarPU
-distribution.}. A similar example showing how to directly use the StarPU's API is shown
-in @ref{Hello World using StarPU's API}.
-
-GCC from version 4.5 permit to use the StarPU GCC plug-in (@pxref{C
-Extensions}). This makes writing a task both simpler and less error-prone.
-In a nutshell, all it takes is to declare a task, declare and define its
-implementations (for CPU, OpenCL, and/or CUDA), and invoke the task like
-a regular C function.  The example below defines @code{my_task}, which
-has a single implementation for CPU:
-
-@cartouche
-@smallexample
-#include <stdio.h>
-
-/* @b{Task declaration.}  */
-static void my_task (int x) __attribute__ ((task));
-
-/* @b{Definition of the CPU implementation of `my_task'.}  */
-static void my_task (int x)
-@{
-  printf ("Hello, world!  With x = %d\n", x);
-@}
-
-int main ()
-@{
-  /* @b{Initialize StarPU.}  */
-#pragma starpu initialize
-
-  /* @b{Do an asynchronous call to `my_task'.}  */
-  my_task (42);
-
-  /* @b{Wait for the call to complete.}  */
-#pragma starpu wait
-
-  /* @b{Terminate.}  */
-#pragma starpu shutdown
-
-  return 0;
-@}
-@end smallexample
-@end cartouche
-
-@noindent
-The code can then be compiled and linked with GCC and the
-@code{-fplugin} flag:
-
-@example
-$ gcc `pkg-config starpu-1.1 --cflags` hello-starpu.c \
-    -fplugin=`pkg-config starpu-1.1 --variable=gccplugin` \
-    `pkg-config starpu-1.1 --libs`
-@end example
-
-The code can also be compiled without the StarPU C extension and will
-behave as a normal sequential code.
-
-@example
-$ gcc hello-starpu.c
-hello-starpu.c:33:1: warning: ‘task’ attribute directive ignored [-Wattributes]
-$ ./a.out
-Hello, world! With x = 42
-@end example
-
-As can be seen above, the C extensions allows programmers to
-use StarPU tasks by essentially annotating ``regular'' C code.
-
-@node Hello World using StarPU's API
-@section Hello World using StarPU's API
-
-This section shows how to achieve the same result as in the previous
-section using StarPU's standard C API.
-
-@menu
-* Required Headers::
-* Defining a Codelet::
-* Submitting a Task::
-* Execution of Hello World::
-@end menu
-
-@node Required Headers
-@subsection Required Headers
-
-The @code{starpu.h} header should be included in any code using StarPU.
-
-@cartouche
-@smallexample
-#include <starpu.h>
-@end smallexample
-@end cartouche
-
-
-@node Defining a Codelet
-@subsection Defining a Codelet
-
-@cartouche
-@smallexample
-struct params
-@{
-    int i;
-    float f;
-@};
-void cpu_func(void *buffers[], void *cl_arg)
-@{
-    struct params *params = cl_arg;
-
-    printf("Hello world (params = @{%i, %f@} )\n", params->i, params->f);
-@}
-
-struct starpu_codelet cl =
-@{
-    .where = STARPU_CPU,
-    .cpu_funcs = @{ cpu_func, NULL @},
-    .cpu_funcs_name = @{ "cpu_func", NULL @},
-    .nbuffers = 0
-@};
-@end smallexample
-@end cartouche
-
-A codelet is a structure that represents a computational kernel. Such a codelet
-may contain an implementation of the same kernel on different architectures
-(e.g. CUDA, x86, ...). For compatibility, make sure that the whole
-structure is properly initialized to zero, either by using the
-function starpu_codelet_init (@pxref{starpu_codelet_init}), or by letting the
-compiler implicitly do it as examplified above.
-
-The @code{nbuffers} field specifies the number of data buffers that are
-manipulated by the codelet: here the codelet does not access or modify any data
-that is controlled by our data management library. Note that the argument
-passed to the codelet (the @code{cl_arg} field of the @code{starpu_task}
-structure) does not count as a buffer since it is not managed by our data
-management library, but just contain trivial parameters.
-
-@c TODO need a crossref to the proper description of "where" see bla for more ...
-We create a codelet which may only be executed on the CPUs. The @code{where}
-field is a bitmask that defines where the codelet may be executed. Here, the
-@code{STARPU_CPU} value means that only CPUs can execute this codelet
-(@pxref{Codelets and Tasks} for more details on this field). Note that
-the @code{where} field is optional, when unset its value is
-automatically set based on the availability of the different
-@code{XXX_funcs} fields.
-When a CPU core executes a codelet, it calls the @code{cpu_func} function,
-which @emph{must} have the following prototype:
-
-@code{void (*cpu_func)(void *buffers[], void *cl_arg);}
-
-In this example, we can ignore the first argument of this function which gives a
-description of the input and output buffers (e.g. the size and the location of
-the matrices) since there is none.
-The second argument is a pointer to a buffer passed as an
-argument to the codelet by the means of the @code{cl_arg} field of the
-@code{starpu_task} structure.
-
-@c TODO rewrite so that it is a little clearer ?
-Be aware that this may be a pointer to a
-@emph{copy} of the actual buffer, and not the pointer given by the programmer:
-if the codelet modifies this buffer, there is no guarantee that the initial
-buffer will be modified as well: this for instance implies that the buffer
-cannot be used as a synchronization medium. If synchronization is needed, data
-has to be registered to StarPU, see @ref{Vector Scaling Using StarPU's API}.
-
-@node Submitting a Task
-@subsection Submitting a Task
-
-@cartouche
-@smallexample
-void callback_func(void *callback_arg)
-@{
-    printf("Callback function (arg %x)\n", callback_arg);
-@}
-
-int main(int argc, char **argv)
-@{
-    /* @b{initialize StarPU} */
-    starpu_init(NULL);
-
-    struct starpu_task *task = starpu_task_create();
-
-    task->cl = &cl; /* @b{Pointer to the codelet defined above} */
-
-    struct params params = @{ 1, 2.0f @};
-    task->cl_arg = &params;
-    task->cl_arg_size = sizeof(params);
-
-    task->callback_func = callback_func;
-    task->callback_arg = 0x42;
-
-    /* @b{starpu_task_submit will be a blocking call} */
-    task->synchronous = 1;
-
-    /* @b{submit the task to StarPU} */
-    starpu_task_submit(task);
-
-    /* @b{terminate StarPU} */
-    starpu_shutdown();
-
-    return 0;
-@}
-@end smallexample
-@end cartouche
-
-Before submitting any tasks to StarPU, @code{starpu_init} must be called. The
-@code{NULL} argument specifies that we use default configuration. Tasks cannot
-be submitted after the termination of StarPU by a call to
-@code{starpu_shutdown}.
-
-In the example above, a task structure is allocated by a call to
-@code{starpu_task_create}. This function only allocates and fills the
-corresponding structure with the default settings (@pxref{Codelets and
-Tasks, starpu_task_create}), but it does not submit the task to StarPU.
-
-@c not really clear ;)
-The @code{cl} field is a pointer to the codelet which the task will
-execute: in other words, the codelet structure describes which computational
-kernel should be offloaded on the different architectures, and the task
-structure is a wrapper containing a codelet and the piece of data on which the
-codelet should operate.
-
-The optional @code{cl_arg} field is a pointer to a buffer (of size
-@code{cl_arg_size}) with some parameters for the kernel
-described by the codelet. For instance, if a codelet implements a computational
-kernel that multiplies its input vector by a constant, the constant could be
-specified by the means of this buffer, instead of registering it as a StarPU
-data. It must however be noted that StarPU avoids making copy whenever possible
-and rather passes the pointer as such, so the buffer which is pointed at must
-kept allocated until the task terminates, and if several tasks are submitted
-with various parameters, each of them must be given a pointer to their own
-buffer.
-
-Once a task has been executed, an optional callback function is be called.
-While the computational kernel could be offloaded on various architectures, the
-callback function is always executed on a CPU. The @code{callback_arg}
-pointer is passed as an argument of the callback. The prototype of a callback
-function must be:
-
-@cartouche
-@example
-void (*callback_function)(void *);
-@end example
-@end cartouche
-
-If the @code{synchronous} field is non-zero, task submission will be
-synchronous: the @code{starpu_task_submit} function will not return until the
-task was executed. Note that the @code{starpu_shutdown} method does not
-guarantee that asynchronous tasks have been executed before it returns,
-@code{starpu_task_wait_for_all} can be used to that effect, or data can be
-unregistered (@code{starpu_data_unregister(vector_handle);}), which will
-implicitly wait for all the tasks scheduled to work on it, unless explicitly
-disabled thanks to @code{starpu_data_set_default_sequential_consistency_flag} or
-@code{starpu_data_set_sequential_consistency_flag}.
-
-@node Execution of Hello World
-@subsection Execution of Hello World
-
-@smallexample
-$ make hello_world
-cc $(pkg-config --cflags starpu-1.1)  $(pkg-config --libs starpu-1.1) hello_world.c -o hello_world
-$ ./hello_world
-Hello world (params = @{1, 2.000000@} )
-Callback function (arg 42)
-@end smallexample
-
-@node Vector Scaling Using the C Extension
-@section Vector Scaling Using the C Extension
-
-@menu
-* Adding an OpenCL Task Implementation::
-* Adding a CUDA Task Implementation::
-@end menu
-
-The previous example has shown how to submit tasks. In this section,
-we show how StarPU tasks can manipulate data.
-
-We will first show how to use the C language extensions provided by
-the GCC plug-in (@pxref{C Extensions})@footnote{The complete example, and
-additional examples, is available in the @file{gcc-plugin/examples}
-directory of the StarPU distribution.}. These extensions map directly
-to StarPU's main concepts: tasks, task implementations for CPU,
-OpenCL, or CUDA, and registered data buffers. The standard C version
-that uses StarPU's standard C programming interface is given in the
-next section (@pxref{Vector Scaling Using StarPU's API, standard C
-version of the example}).
-
-First of all, the vector-scaling task and its simple CPU implementation
-has to be defined:
-
-@cartouche
-@smallexample
-/* @b{Declare the `vector_scal' task.}  */
-static void vector_scal (unsigned size, float vector[size],
-                         float factor)
-  __attribute__ ((task));
-
-/* @b{Define the standard CPU implementation.}  */
-static void
-vector_scal (unsigned size, float vector[size], float factor)
-@{
-  unsigned i;
-  for (i = 0; i < size; i++)
-    vector[i] *= factor;
-@}
-@end smallexample
-@end cartouche
-
-Next, the body of the program, which uses the task defined above, can be
-implemented:
-
-@cartouche
-@smallexample
-int
-main (void)
-@{
-#pragma starpu initialize
-
-#define NX     0x100000
-#define FACTOR 3.14
-
-  @{
-    float vector[NX]
-       __attribute__ ((heap_allocated, registered));
-
-    size_t i;
-    for (i = 0; i < NX; i++)
-      vector[i] = (float) i;
-
-    vector_scal (NX, vector, FACTOR);
-
-#pragma starpu wait
-  @} /* @b{VECTOR is automatically freed here.}  */
-
-#pragma starpu shutdown
-
-  return valid ? EXIT_SUCCESS : EXIT_FAILURE;
-@}
-@end smallexample
-@end cartouche
-
-@noindent
-The @code{main} function above does several things:
-
-@itemize
-@item
-It initializes StarPU.
-
-@item
-It allocates @var{vector} in the heap; it will automatically be freed
-when its scope is left.  Alternatively, good old @code{malloc} and
-@code{free} could have been used, but they are more error-prone and
-require more typing.
-
-@item
-It @dfn{registers} the memory pointed to by @var{vector}.  Eventually,
-when OpenCL or CUDA task implementations are added, this will allow
-StarPU to transfer that memory region between GPUs and the main memory.
-Removing this @code{pragma} is an error.
-
-@item
-It invokes the @code{vector_scal} task.  The invocation looks the same
-as a standard C function call.  However, it is an @dfn{asynchronous
-invocation}, meaning that the actual call is performed in parallel with
-the caller's continuation.
-
-@item
-It @dfn{waits} for the termination of the @code{vector_scal}
-asynchronous call.
-
-@item
-Finally, StarPU is shut down.
-
-@end itemize
-
-The program can be compiled and linked with GCC and the @code{-fplugin}
-flag:
-
-@example
-$ gcc `pkg-config starpu-1.1 --cflags` vector_scal.c \
-    -fplugin=`pkg-config starpu-1.1 --variable=gccplugin` \
-    `pkg-config starpu-1.1 --libs`
-@end example
-
-And voil@`a!
-
-@node Adding an OpenCL Task Implementation
-@subsection Adding an OpenCL Task Implementation
-
-Now, this is all fine and great, but you certainly want to take
-advantage of these newfangled GPUs that your lab just bought, don't you?
-
-So, let's add an OpenCL implementation of the @code{vector_scal} task.
-We assume that the OpenCL kernel is available in a file,
-@file{vector_scal_opencl_kernel.cl}, not shown here.  The OpenCL task
-implementation is similar to that used with the standard C API
-(@pxref{Definition of the OpenCL Kernel}).  It is declared and defined
-in our C file like this:
-
-@cartouche
-@smallexample
-/* @b{The OpenCL programs, loaded from 'main' (see below).}  */
-static struct starpu_opencl_program cl_programs;
-
-static void vector_scal_opencl (unsigned size, float vector[size],
-                                float factor)
-  __attribute__ ((task_implementation ("opencl", vector_scal)));
-
-static void
-vector_scal_opencl (unsigned size, float vector[size], float factor)
-@{
-  int id, devid, err;
-  cl_kernel kernel;
-  cl_command_queue queue;
-  cl_event event;
-
-  /* @b{VECTOR is GPU memory pointer, not a main memory pointer.}  */
-  cl_mem val = (cl_mem) vector;
-
-  id = starpu_worker_get_id ();
-  devid = starpu_worker_get_devid (id);
-
-  /* @b{Prepare to invoke the kernel.  In the future, this will be largely
-     automated.}  */
-  err = starpu_opencl_load_kernel (&kernel, &queue, &cl_programs,
-                                   "vector_mult_opencl", devid);
-  if (err != CL_SUCCESS)
-    STARPU_OPENCL_REPORT_ERROR (err);
-
-  err = clSetKernelArg (kernel, 0, sizeof (size), &size);
-  err |= clSetKernelArg (kernel, 1, sizeof (val), &val);
-  err |= clSetKernelArg (kernel, 2, sizeof (factor), &factor);
-  if (err)
-    STARPU_OPENCL_REPORT_ERROR (err);
-
-  size_t global = 1, local = 1;
-  err = clEnqueueNDRangeKernel (queue, kernel, 1, NULL, &global,
-                                &local, 0, NULL, &event);
-  if (err != CL_SUCCESS)
-    STARPU_OPENCL_REPORT_ERROR (err);
-
-  clFinish (queue);
-  starpu_opencl_collect_stats (event);
-  clReleaseEvent (event);
-
-  /* @b{Done with KERNEL.}  */
-  starpu_opencl_release_kernel (kernel);
-@}
-@end smallexample
-@end cartouche
-
-@noindent
-The OpenCL kernel itself must be loaded from @code{main}, sometime after
-the @code{initialize} pragma:
-
-@cartouche
-@smallexample
-  starpu_opencl_load_opencl_from_file ("vector_scal_opencl_kernel.cl",
-                                       &cl_programs, "");
-@end smallexample
-@end cartouche
-
-@noindent
-And that's it.  The @code{vector_scal} task now has an additional
-implementation, for OpenCL, which StarPU's scheduler may choose to use
-at run-time.  Unfortunately, the @code{vector_scal_opencl} above still
-has to go through the common OpenCL boilerplate; in the future,
-additional extensions will automate most of it.
-
-@node Adding a CUDA Task Implementation
-@subsection Adding a CUDA Task Implementation
-
-Adding a CUDA implementation of the task is very similar, except that
-the implementation itself is typically written in CUDA, and compiled
-with @code{nvcc}.  Thus, the C file only needs to contain an external
-declaration for the task implementation:
-
-@cartouche
-@smallexample
-extern void vector_scal_cuda (unsigned size, float vector[size],
-                              float factor)
-  __attribute__ ((task_implementation ("cuda", vector_scal)));
-@end smallexample
-@end cartouche
-
-The actual implementation of the CUDA task goes into a separate
-compilation unit, in a @file{.cu} file.  It is very close to the
-implementation when using StarPU's standard C API (@pxref{Definition of
-the CUDA Kernel}).
-
-@cartouche
-@smallexample
-/* @b{CUDA implementation of the `vector_scal' task, to be compiled
-   with `nvcc'.}  */
-
-#include <starpu.h>
-#include <stdlib.h>
-
-static __global__ void
-vector_mult_cuda (unsigned n, float *val, float factor)
-@{
-  unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
-
-  if (i < n)
-    val[i] *= factor;
-@}
-
-/* @b{Definition of the task implementation declared in the C file.}   */
-extern "C" void
-vector_scal_cuda (size_t size, float vector[], float factor)
-@{
-  unsigned threads_per_block = 64;
-  unsigned nblocks = (size + threads_per_block - 1) / threads_per_block;
-
-  vector_mult_cuda <<< nblocks, threads_per_block, 0,
-    starpu_cuda_get_local_stream () >>> (size, vector, factor);
-
-  cudaStreamSynchronize (starpu_cuda_get_local_stream ());
-@}
-@end smallexample
-@end cartouche
-
-The complete source code, in the @file{gcc-plugin/examples/vector_scal}
-directory of the StarPU distribution, also shows how an SSE-specialized
-CPU task implementation can be added.
-
-For more details on the C extensions provided by StarPU's GCC plug-in,
-@xref{C Extensions}.
-
-@node Vector Scaling Using StarPU's API
-@section Vector Scaling Using StarPU's API
-
-This section shows how to achieve the same result as explained in the
-previous section using StarPU's standard C API.
-
-The full source code for
-this example is given in @ref{Full source code for the 'Scaling a
-Vector' example}.
-
-@menu
-* Source Code of Vector Scaling::
-* Execution of Vector Scaling::  Running the program
-@end menu
-
-@node Source Code of Vector Scaling
-@subsection Source Code of Vector Scaling
-
-Programmers can describe the data layout of their application so that StarPU is
-responsible for enforcing data coherency and availability across the machine.
-Instead of handling complex (and non-portable) mechanisms to perform data
-movements, programmers only declare which piece of data is accessed and/or
-modified by a task, and StarPU makes sure that when a computational kernel
-starts somewhere (e.g. on a GPU), its data are available locally.
-
-Before submitting those tasks, the programmer first needs to declare the
-different pieces of data to StarPU using the @code{starpu_*_data_register}
-functions. To ease the development of applications for StarPU, it is possible
-to describe multiple types of data layout. A type of data layout is called an
-@b{interface}. There are different predefined interfaces available in StarPU:
-here we will consider the @b{vector interface}.
-
-The following lines show how to declare an array of @code{NX} elements of type
-@code{float} using the vector interface:
-
-@cartouche
-@smallexample
-float vector[NX];
-
-starpu_data_handle_t vector_handle;
-starpu_vector_data_register(&vector_handle, STARPU_MAIN_RAM, (uintptr_t)vector, NX,
-                            sizeof(vector[0]));
-@end smallexample
-@end cartouche
-
-The first argument, called the @b{data handle}, is an opaque pointer which
-designates the array in StarPU. This is also the structure which is used to
-describe which data is used by a task. The second argument is the node number
-where the data originally resides. Here it is 0 since the @code{vector} array is in
-the main memory. Then comes the pointer @code{vector} where the data can be found in main memory,
-the number of elements in the vector and the size of each element.
-The following shows how to construct a StarPU task that will manipulate the
-vector and a constant factor.
-
-@cartouche
-@smallexample
-float factor = 3.14;
-struct starpu_task *task = starpu_task_create();
-
-task->cl = &cl;                      /* @b{Pointer to the codelet defined below} */
-task->handles[0] = vector_handle;    /* @b{First parameter of the codelet} */
-task->cl_arg = &factor;
-task->cl_arg_size = sizeof(factor);
-task->synchronous = 1;
-
-starpu_task_submit(task);
-@end smallexample
-@end cartouche
-
-Since the factor is a mere constant float value parameter,
-it does not need a preliminary registration, and
-can just be passed through the @code{cl_arg} pointer like in the previous
-example.  The vector parameter is described by its handle.
-There are two fields in each element of the @code{buffers} array.
-@code{handle} is the handle of the data, and @code{mode} specifies how the
-kernel will access the data (@code{STARPU_R} for read-only, @code{STARPU_W} for
-write-only and @code{STARPU_RW} for read and write access).
-
-The definition of the codelet can be written as follows:
-
-@cartouche
-@smallexample
-void scal_cpu_func(void *buffers[], void *cl_arg)
-@{
-    unsigned i;
-    float *factor = cl_arg;
-
-    /* @b{length of the vector} */
-    unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
-    /* @b{CPU copy of the vector pointer} */
-    float *val = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);
-
-    for (i = 0; i < n; i++)
-        val[i] *= *factor;
-@}
-
-struct starpu_codelet cl =
-@{
-    .cpu_funcs = @{ scal_cpu_func, NULL @},
-    .cpu_funcs_name = @{ "scal_cpu_func", NULL @},
-    .nbuffers = 1,
-    .modes = @{ STARPU_RW @}
-@};
-@end smallexample
-@end cartouche
-
-The first argument is an array that gives
-a description of all the buffers passed in the @code{task->handles}@ array. The
-size of this array is given by the @code{nbuffers} field of the codelet
-structure. For the sake of genericity, this array contains pointers to the
-different interfaces describing each buffer.  In the case of the @b{vector
-interface}, the location of the vector (resp. its length) is accessible in the
-@code{ptr} (resp. @code{nx}) of this array. Since the vector is accessed in a
-read-write fashion, any modification will automatically affect future accesses
-to this vector made by other tasks.
-
-The second argument of the @code{scal_cpu_func} function contains a pointer to the
-parameters of the codelet (given in @code{task->cl_arg}), so that we read the
-constant factor from this pointer.
-
-@node Execution of Vector Scaling
-@subsection Execution of Vector Scaling
-
-@smallexample
-$ make vector_scal
-cc $(pkg-config --cflags starpu-1.1)  $(pkg-config --libs starpu-1.1)  vector_scal.c   -o vector_scal
-$ ./vector_scal
-0.000000 3.000000 6.000000 9.000000 12.000000
-@end smallexample
-
-@node Vector Scaling on an Hybrid CPU/GPU Machine
-@section Vector Scaling on an Hybrid CPU/GPU Machine
-
-Contrary to the previous examples, the task submitted in this example may not
-only be executed by the CPUs, but also by a CUDA device.
-
-@menu
-* Definition of the CUDA Kernel::
-* Definition of the OpenCL Kernel::
-* Definition of the Main Code::
-* Execution of Hybrid Vector Scaling::
-@end menu
-
-@node Definition of the CUDA Kernel
-@subsection Definition of the CUDA Kernel
-
-The CUDA implementation can be written as follows. It needs to be compiled with
-a CUDA compiler such as nvcc, the NVIDIA CUDA compiler driver. It must be noted
-that the vector pointer returned by STARPU_VECTOR_GET_PTR is here a pointer in GPU
-memory, so that it can be passed as such to the @code{vector_mult_cuda} kernel
-call.
-
-@cartouche
-@smallexample
-#include <starpu.h>
-
-static __global__ void vector_mult_cuda(unsigned n, float *val,
-                                        float factor)
-@{
-    unsigned i =  blockIdx.x*blockDim.x + threadIdx.x;
-    if (i < n)
-        val[i] *= factor;
-@}
-
-extern "C" void scal_cuda_func(void *buffers[], void *_args)
-@{
-    float *factor = (float *)_args;
-
-    /* @b{length of the vector} */
-    unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
-    /* @b{CUDA copy of the vector pointer} */
-    float *val = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);
-    unsigned threads_per_block = 64;
-    unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
-
-@i{    vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>}
-@i{                    (n, val, *factor);}
-
-@i{    cudaStreamSynchronize(starpu_cuda_get_local_stream());}
-@}
-@end smallexample
-@end cartouche
-
-@node Definition of the OpenCL Kernel
-@subsection Definition of the OpenCL Kernel
-
-The OpenCL implementation can be written as follows. StarPU provides
-tools to compile a OpenCL kernel stored in a file.
-
-@cartouche
-@smallexample
-__kernel void vector_mult_opencl(int nx, __global float* val, float factor)
-@{
-        const int i = get_global_id(0);
-        if (i < nx) @{
-                val[i] *= factor;
-        @}
-@}
-@end smallexample
-@end cartouche
-
-Contrary to CUDA and CPU, @code{STARPU_VECTOR_GET_DEV_HANDLE} has to be used,
-which returns a @code{cl_mem} (which is not a device pointer, but an OpenCL
-handle), which can be passed as such to the OpenCL kernel. The difference is
-important when using partitioning, see @ref{Partitioning Data}.
-
-@cartouche
-@smallexample
-#include <starpu.h>
-
-@i{extern struct starpu_opencl_program programs;}
-
-void scal_opencl_func(void *buffers[], void *_args)
-@{
-    float *factor = _args;
-@i{    int id, devid, err;}
-@i{    cl_kernel kernel;}
-@i{    cl_command_queue queue;}
-@i{    cl_event event;}
-
-    /* @b{length of the vector} */
-    unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
-    /* @b{OpenCL copy of the vector pointer} */
-    cl_mem val = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
-
-@i{    id = starpu_worker_get_id();}
-@i{    devid = starpu_worker_get_devid(id);}
-
-@i{    err = starpu_opencl_load_kernel(&kernel, &queue, &programs,}
-@i{                    "vector_mult_opencl", devid);   /* @b{Name of the codelet defined above} */}
-@i{    if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);}
-
-@i{    err = clSetKernelArg(kernel, 0, sizeof(n), &n);}
-@i{    err |= clSetKernelArg(kernel, 1, sizeof(val), &val);}
-@i{    err |= clSetKernelArg(kernel, 2, sizeof(*factor), factor);}
-@i{    if (err) STARPU_OPENCL_REPORT_ERROR(err);}
-
-@i{    @{}
-@i{        size_t global=n;}
-@i{        size_t local=1;}
-@i{        err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL,}
-@i{                                     &global, &local, 0, NULL, &event);}
-@i{        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);}
-@i{    @}}
-
-@i{    clFinish(queue);}
-@i{    starpu_opencl_collect_stats(event);}
-@i{    clReleaseEvent(event);}
-
-@i{    starpu_opencl_release_kernel(kernel);}
-@}
-@end smallexample
-@end cartouche
-
-
-@node Definition of the Main Code
-@subsection Definition of the Main Code
-
-The CPU implementation is the same as in the previous section.
-
-Here is the source of the main application. You can notice that the fields
-@code{cuda_funcs} and @code{opencl_funcs} of the codelet are set to
-define the pointers to the CUDA and OpenCL implementations of the
-task.
-
-@cartouche
-@smallexample
-#include <starpu.h>
-
-#define NX 2048
-
-extern void scal_cuda_func(void *buffers[], void *_args);
-extern void scal_cpu_func(void *buffers[], void *_args);
-extern void scal_opencl_func(void *buffers[], void *_args);
-
-/* @b{Definition of the codelet} */
-static struct starpu_codelet cl =
-@{
-    .cuda_funcs = @{ scal_cuda_func, NULL @},
-    .cpu_funcs = @{ scal_cpu_func, NULL @},
-    .cpu_funcs_name = @{ "scal_cpu_func", NULL @},
-    .opencl_funcs = @{ scal_opencl_func, NULL @},
-    .nbuffers = 1,
-    .modes = @{ STARPU_RW @}
-@}
-
-#ifdef STARPU_USE_OPENCL
-/* @b{The compiled version of the OpenCL program} */
-struct starpu_opencl_program programs;
-#endif
-
-int main(int argc, char **argv)
-@{
-    float *vector;
-    int i, ret;
-    float factor=3.0;
-    struct starpu_task *task;
-    starpu_data_handle_t vector_handle;
-
-    starpu_init(NULL);                            /* @b{Initialising StarPU} */
-
-#ifdef STARPU_USE_OPENCL
-    starpu_opencl_load_opencl_from_file(
-            "examples/basic_examples/vector_scal_opencl_codelet.cl",
-            &programs, NULL);
-#endif
-
-    vector = malloc(NX*sizeof(vector[0]));
-    assert(vector);
-    for(i=0 ; i<NX ; i++) vector[i] = i;
-@end smallexample
-@end cartouche
-
-@cartouche
-@smallexample
-    /* @b{Registering data within StarPU} */
-    starpu_vector_data_register(&vector_handle, STARPU_MAIN_RAM, (uintptr_t)vector,
-                                NX, sizeof(vector[0]));
-
-    /* @b{Definition of the task} */
-    task = starpu_task_create();
-    task->cl = &cl;
-    task->handles[0] = vector_handle;
-    task->cl_arg = &factor;
-    task->cl_arg_size = sizeof(factor);
-@end smallexample
-@end cartouche
-
-@cartouche
-@smallexample
-    /* @b{Submitting the task} */
-    ret = starpu_task_submit(task);
-    if (ret == -ENODEV) @{
-            fprintf(stderr, "No worker may execute this task\n");
-            return 1;
-    @}
-
-@c TODO: Mmm, should rather be an unregistration with an implicit dependency, no?
-    /* @b{Waiting for its termination} */
-    starpu_task_wait_for_all();
-
-    /* @b{Update the vector in RAM} */
-    starpu_data_acquire(vector_handle, STARPU_R);
-@end smallexample
-@end cartouche
-
-@cartouche
-@smallexample
-    /* @b{Access the data} */
-    for(i=0 ; i<NX; i++) @{
-      fprintf(stderr, "%f ", vector[i]);
-    @}
-    fprintf(stderr, "\n");
-
-    /* @b{Release the RAM view of the data before unregistering it and shutting down StarPU} */
-    starpu_data_release(vector_handle);
-    starpu_data_unregister(vector_handle);
-    starpu_shutdown();
-
-    return 0;
-@}
-@end smallexample
-@end cartouche
-
-@node Execution of Hybrid Vector Scaling
-@subsection Execution of Hybrid Vector Scaling
-
-The Makefile given at the beginning of the section must be extended to
-give the rules to compile the CUDA source code. Note that the source
-file of the OpenCL kernel does not need to be compiled now, it will
-be compiled at run-time when calling the function
-@code{starpu_opencl_load_opencl_from_file()} (@pxref{starpu_opencl_load_opencl_from_file}).
-
-@cartouche
-@smallexample
-CFLAGS  += $(shell pkg-config --cflags starpu-1.1)
-LDFLAGS += $(shell pkg-config --libs starpu-1.1)
-CC       = gcc
-
-vector_scal: vector_scal.o vector_scal_cpu.o vector_scal_cuda.o vector_scal_opencl.o
-
-%.o: %.cu
-       nvcc $(CFLAGS) $< -c $@
-
-clean:
-       rm -f vector_scal *.o
-@end smallexample
-@end cartouche
-
-@smallexample
-$ make
-@end smallexample
-
-and to execute it, with the default configuration:
-
-@smallexample
-$ ./vector_scal
-0.000000 3.000000 6.000000 9.000000 12.000000
-@end smallexample
-
-or for example, by disabling CPU devices:
-
-@smallexample
-$ STARPU_NCPU=0 ./vector_scal
-0.000000 3.000000 6.000000 9.000000 12.000000
-@end smallexample
-
-or by disabling CUDA devices (which may permit to enable the use of OpenCL,
-see @ref{Enabling OpenCL}):
-
-@smallexample
-$ STARPU_NCUDA=0 ./vector_scal
-0.000000 3.000000 6.000000 9.000000 12.000000
-@end smallexample

+ 0 - 485
doc/texinfo/chapters/c-extensions.texi

@@ -1,485 +0,0 @@
-@c -*-texinfo-*-
-
-@c This file is part of the StarPU Handbook.
-@c Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
-@c See the file starpu.texi for copying conditions.
-
-@cindex C extensions
-@cindex GCC plug-in
-
-When GCC plug-in support is available, StarPU builds a plug-in for the
-GNU Compiler Collection (GCC), which defines extensions to languages of
-the C family (C, C++, Objective-C) that make it easier to write StarPU
-code@footnote{This feature is only available for GCC 4.5 and later; it
-is known to work with GCC 4.5, 4.6, and 4.7.  You
-may need to install a specific @code{-dev} package of your distro, such
-as @code{gcc-4.6-plugin-dev} on Debian and derivatives.  In addition,
-the plug-in's test suite is only run when
-@url{http://www.gnu.org/software/guile/, GNU@tie{}Guile} is found at
-@code{configure}-time.  Building the GCC plug-in
-can be disabled by configuring with @code{--disable-gcc-extensions}.}.
-
-Those extensions include syntactic sugar for defining
-tasks and their implementations, invoking a task, and manipulating data
-buffers.  Use of these extensions can be made conditional on the
-availability of the plug-in, leading to valid C sequential code when the
-plug-in is not used (@pxref{Conditional Extensions}).
-
-When StarPU has been installed with its GCC plug-in, programs that use
-these extensions can be compiled this way:
-
-@example
-$ gcc -c -fplugin=`pkg-config starpu-1.1 --variable=gccplugin` foo.c
-@end example
-
-@noindent
-When the plug-in is not available, the above @command{pkg-config}
-command returns the empty string.
-
-In addition, the @code{-fplugin-arg-starpu-verbose} flag can be used to
-obtain feedback from the compiler as it analyzes the C extensions used
-in source files.
-
-This section describes the C extensions implemented by StarPU's GCC
-plug-in.  It does not require detailed knowledge of the StarPU library.
-
-Note: as of StarPU @value{VERSION}, this is still an area under
-development and subject to change.
-
-@menu
-* Defining Tasks::              Defining StarPU tasks
-* Synchronization and Other Pragmas:: Synchronization, and more.
-* Registered Data Buffers::     Manipulating data buffers
-* Conditional Extensions::      Using C extensions only when available
-@end menu
-
-@node Defining Tasks
-@section Defining Tasks
-
-@cindex task
-@cindex task implementation
-
-The StarPU GCC plug-in views @dfn{tasks} as ``extended'' C functions:
-
-@enumerate
-@item
-tasks may have several implementations---e.g., one for CPUs, one written
-in OpenCL, one written in CUDA;
-@item
-tasks may have several implementations of the same target---e.g.,
-several CPU implementations;
-@item
-when a task is invoked, it may run in parallel, and StarPU is free to
-choose any of its implementations.
-@end enumerate
-
-Tasks and their implementations must be @emph{declared}.  These
-declarations are annotated with @dfn{attributes} (@pxref{Attribute
-Syntax, attributes in GNU C,, gcc, Using the GNU Compiler Collection
-(GCC)}): the declaration of a task is a regular C function declaration
-with an additional @code{task} attribute, and task implementations are
-declared with a @code{task_implementation} attribute.
-
-The following function attributes are provided:
-
-@table @code
-
-@item task
-@cindex @code{task} attribute
-Declare the given function as a StarPU task.  Its return type must be
-@code{void}.  When a function declared as @code{task} has a user-defined
-body, that body is interpreted as the @dfn{implicit definition of the
-task's CPU implementation} (see example below).  In all cases, the
-actual definition of a task's body is automatically generated by the
-compiler.
-
-Under the hood, declaring a task leads to the declaration of the
-corresponding @code{codelet} (@pxref{Codelet and Tasks}).  If one or
-more task implementations are declared in the same compilation unit,
-then the codelet and the function itself are also defined; they inherit
-the scope of the task.
-
-Scalar arguments to the task are passed by value and copied to the
-target device if need be---technically, they are passed as the
-@code{cl_arg} buffer (@pxref{Codelets and Tasks, @code{cl_arg}}).
-
-@cindex @code{output} type attribute
-Pointer arguments are assumed to be registered data buffers---the
-@code{buffers} argument of a task (@pxref{Codelets and Tasks,
-@code{buffers}}); @code{const}-qualified pointer arguments are viewed as
-read-only buffers (@code{STARPU_R}), and non-@code{const}-qualified
-buffers are assumed to be used read-write (@code{STARPU_RW}).  In
-addition, the @code{output} type attribute can be as a type qualifier
-for output pointer or array parameters (@code{STARPU_W}).
-
-@item task_implementation (@var{target}, @var{task})
-@cindex @code{task_implementation} attribute
-Declare the given function as an implementation of @var{task} to run on
-@var{target}.  @var{target} must be a string, currently one of
-@code{"cpu"}, @code{"opencl"}, or @code{"cuda"}.
-@c FIXME: Update when OpenCL support is ready.
-
-@end table
-
-Here is an example:
-
-@cartouche
-@smallexample
-#define __output  __attribute__ ((output))
-
-static void matmul (const float *A, const float *B,
-                    __output float *C,
-                    unsigned nx, unsigned ny, unsigned nz)
-  __attribute__ ((task));
-
-static void matmul_cpu (const float *A, const float *B,
-                        __output float *C,
-                        unsigned nx, unsigned ny, unsigned nz)
-  __attribute__ ((task_implementation ("cpu", matmul)));
-
-
-static void
-matmul_cpu (const float *A, const float *B, __output float *C,
-            unsigned nx, unsigned ny, unsigned nz)
-@{
-  unsigned i, j, k;
-
-  for (j = 0; j < ny; j++)
-    for (i = 0; i < nx; i++)
-      @{
-        for (k = 0; k < nz; k++)
-          C[j * nx + i] += A[j * nz + k] * B[k * nx + i];
-      @}
-@}
-@end smallexample
-@end cartouche
-
-@noindent
-A @code{matmult} task is defined; it has only one implementation,
-@code{matmult_cpu}, which runs on the CPU.  Variables @var{A} and
-@var{B} are input buffers, whereas @var{C} is considered an input/output
-buffer.
-
-@cindex implicit task CPU implementation
-For convenience, when a function declared with the @code{task} attribute
-has a user-defined body, that body is assumed to be that of the CPU
-implementation of a task, which we call an @dfn{implicit task CPU
-implementation}.  Thus, the above snippet can be simplified like this:
-
-@cartouche
-@smallexample
-#define __output  __attribute__ ((output))
-
-static void matmul (const float *A, const float *B,
-                    __output float *C,
-                    unsigned nx, unsigned ny, unsigned nz)
-  __attribute__ ((task));
-
-/* Implicit definition of the CPU implementation of the
-   `matmul' task.  */
-static void
-matmul (const float *A, const float *B, __output float *C,
-        unsigned nx, unsigned ny, unsigned nz)
-@{
-  unsigned i, j, k;
-
-  for (j = 0; j < ny; j++)
-    for (i = 0; i < nx; i++)
-      @{
-        for (k = 0; k < nz; k++)
-          C[j * nx + i] += A[j * nz + k] * B[k * nx + i];
-      @}
-@}
-@end smallexample
-@end cartouche
-
-@noindent
-Use of implicit CPU task implementations as above has the advantage that
-the code is valid sequential code when StarPU's GCC plug-in is not used
-(@pxref{Conditional Extensions}).
-
-CUDA and OpenCL implementations can be declared in a similar way:
-
-@cartouche
-@smallexample
-static void matmul_cuda (const float *A, const float *B, float *C,
-                         unsigned nx, unsigned ny, unsigned nz)
-  __attribute__ ((task_implementation ("cuda", matmul)));
-
-static void matmul_opencl (const float *A, const float *B, float *C,
-                           unsigned nx, unsigned ny, unsigned nz)
-  __attribute__ ((task_implementation ("opencl", matmul)));
-@end smallexample
-@end cartouche
-
-@noindent
-The CUDA and OpenCL implementations typically either invoke a kernel
-written in CUDA or OpenCL (for similar code, @pxref{CUDA Kernel}, and
-@pxref{OpenCL Kernel}), or call a library function that uses CUDA or
-OpenCL under the hood, such as CUBLAS functions:
-
-@cartouche
-@smallexample
-static void
-matmul_cuda (const float *A, const float *B, float *C,
-             unsigned nx, unsigned ny, unsigned nz)
-@{
-  /* Matrices are stored in row-major order, whereas CUBLAS
-     expects column-major: swap the operands and use the row
-     lengths as leading dimensions.  */
-  cublasSgemm ('n', 'n', nx, ny, nz,
-               1.0f, B, nx, A, nz,
-               0.0f, C, nx);
-  cudaStreamSynchronize (starpu_cuda_get_local_stream ());
-@}
-@end smallexample
-@end cartouche
-
-A task can be invoked like a regular C function:
-
-@cartouche
-@smallexample
-matmul (&A[i * zdim * bydim + k * bzdim * bydim],
-        &B[k * xdim * bzdim + j * bxdim * bzdim],
-        &C[i * xdim * bydim + j * bxdim * bydim],
-        bxdim, bydim, bzdim);
-@end smallexample
-@end cartouche
-
-@noindent
-This leads to an @dfn{asynchronous invocation}, whereby @code{matmul}'s
-implementation may run in parallel with the continuation of the caller.
-
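-Since the invocation is asynchronous, the caller must synchronize with
-the task before reading @var{C}.  The sketch below is hypothetical
-code, using the @code{wait} pragma described in the next section:
-
-@cartouche
-@smallexample
-matmul ((float *) A, (float *) B, (float *) C, nx, ny, nz);
-
-/* Wait for all task invocations to complete
-   before reading C.  */
-#pragma starpu wait
-@end smallexample
-@end cartouche
-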
-The next section describes how memory buffers must be handled in
-StarPU-GCC code.  For a complete example, see the
-@code{gcc-plugin/examples} directory of the source distribution, and
-@ref{Vector Scaling Using the C Extension, the vector-scaling
-example}.
-
-
-@node Synchronization and Other Pragmas
-@section Initialization, Termination, and Synchronization
-
-The following pragmas allow user code to control StarPU's lifetime and
-to synchronize with tasks.
-
-@table @code
-
-@item #pragma starpu initialize
-Initialize StarPU.  This call is compulsory and is @emph{never} added
-implicitly.  One of the reasons this has to be done explicitly is that
-it provides greater control to user code over its resource usage.
-
-@item #pragma starpu shutdown
-Shut down StarPU, giving it an opportunity to write profiling info to a
-file on disk, for instance (@pxref{Off-line, off-line performance
-feedback}).
-
-@item #pragma starpu wait
-Wait for all task invocations to complete, as with
-@code{starpu_task_wait_for_all} (@pxref{Codelets and Tasks,
-starpu_task_wait_for_all}).
-
-@end table
-
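-For instance, a minimal program skeleton using these pragmas could look
-as follows (@code{do_work} is a hypothetical placeholder for any
-sequence of task invocations):
-
-@cartouche
-@smallexample
-int
-main (int argc, char *argv[])
-@{
-#pragma starpu initialize
-
-  do_work ();   /* Submit tasks asynchronously.  */
-
-#pragma starpu wait
-#pragma starpu shutdown
-
-  return 0;
-@}
-@end smallexample
-@end cartouche
-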
-@node Registered Data Buffers
-@section Registered Data Buffers
-
-Data buffers such as matrices and vectors that are to be passed to tasks
-must be @dfn{registered}.  Registration allows StarPU to handle data
-transfers among devices---e.g., transferring an input buffer from the
-CPU's main memory to a task scheduled to run on a GPU (@pxref{StarPU Data
-Management Library}).
-
-The following pragmas are provided:
-
-@table @code
-
-@item #pragma starpu register @var{ptr} [@var{size}]
-Register @var{ptr} as a @var{size}-element buffer.  When @var{ptr} has
-an array type whose size is known, @var{size} may be omitted.
-Alternatively, the @code{registered} attribute can be used (see below).
-
-@item #pragma starpu unregister @var{ptr}
-Unregister the previously-registered memory area pointed to by
-@var{ptr}.  As a side-effect, @var{ptr} points to a valid copy in main
-memory.
-
-@item #pragma starpu acquire @var{ptr}
-Acquire in main memory an up-to-date copy of the previously-registered
-memory area pointed to by @var{ptr}, for read-write access.
-
-@item #pragma starpu release @var{ptr}
-Release the previously-registered memory area pointed to by @var{ptr},
-making it available to the tasks.
-
-@end table
-
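-As an illustration, the hypothetical fragment below registers a vector,
-runs tasks on it, inspects it from the main program, and finally
-unregisters it:
-
-@cartouche
-@smallexample
-float vector[NX];   /* NX is a hypothetical constant.  */
-
-#pragma starpu register vector
-
-/* ... submit tasks operating on VECTOR ... */
-#pragma starpu wait
-
-/* Get an up-to-date copy in main memory for read-write
-   access, then hand the data back to the tasks.  */
-#pragma starpu acquire vector
-/* ... read or modify VECTOR ... */
-#pragma starpu release vector
-
-/* VECTOR points to a valid copy in main memory from
-   here on.  */
-#pragma starpu unregister vector
-@end smallexample
-@end cartouche
-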
-Additionally, the following attributes offer a simple way to allocate
-and register storage for arrays:
-
-@table @code
-
-@item registered
-@cindex @code{registered} attribute
-This attribute applies to local variables with an array type.  Its
-effect is to automatically register the array's storage, as per
-@code{#pragma starpu register}.  The array is automatically unregistered
-when the variable's scope is left.  This attribute is typically used in
-conjunction with the @code{heap_allocated} attribute, described below.
-
-@item heap_allocated
-@cindex @code{heap_allocated} attribute
-This attribute applies to local variables with an array type.  Its
-effect is to automatically allocate the array's storage on
-the heap, using @code{starpu_malloc} under the hood (@pxref{Basic Data
-Management API, starpu_malloc}).  The heap-allocated array is automatically
-freed when the variable's scope is left, as with
-automatic variables.
-
-@end table
-
-@noindent
-The following example illustrates use of the @code{heap_allocated}
-attribute:
-
-@example
-extern void cholesky (unsigned nblocks, unsigned size,
-                      float mat[nblocks][nblocks][size])
-  __attribute__ ((task));
-
-int
-main (int argc, char *argv[])
-@{
-#pragma starpu initialize
-
-  /* ... */
-
-  int nblocks, size;
-  parse_args (&nblocks, &size);
-
-  /* Allocate an array of the required size on the heap,
-     and register it.  */
-
-  @{
-    float matrix[nblocks][nblocks][size]
-      __attribute__ ((heap_allocated, registered));
-
-    cholesky (nblocks, size, matrix);
-
-#pragma starpu wait
-
-  @}   /* MATRIX is automatically unregistered & freed here.  */
-
-#pragma starpu shutdown
-
-  return EXIT_SUCCESS;
-@}
-@end example
-
-@node Conditional Extensions
-@section Using C Extensions Conditionally
-
-The C extensions described in this chapter are only available when GCC
-and its StarPU plug-in are in use.  Yet, it is possible to make use of
-these extensions when they are available---leading to hybrid CPU/GPU
-code---and discard them when they are not available---leading to valid
-sequential code.
-
-To that end, the GCC plug-in defines a C preprocessor macro when it is
-being used:
-
-@defmac STARPU_GCC_PLUGIN
-Defined for code being compiled with the StarPU GCC plug-in.  When
-defined, this macro expands to an integer denoting the version of the
-supported C extensions.
-@end defmac
-
-The code below illustrates how to define a task and its implementations
-in a way that allows it to be compiled without the GCC plug-in:
-
-@smallexample
-/* This program is valid, whether or not StarPU's GCC plug-in
-   is being used.  */
-
-#include <stdlib.h>
-
-/* The attribute below is ignored when GCC is not used.  */
-static void matmul (const float *A, const float *B, float *C,
-                    unsigned nx, unsigned ny, unsigned nz)
-  __attribute__ ((task));
-
-static void
-matmul (const float *A, const float *B, float *C,
-        unsigned nx, unsigned ny, unsigned nz)
-@{
-  /* Code of the CPU kernel here...  */
-@}
-
-#ifdef STARPU_GCC_PLUGIN
-/* Optional OpenCL task implementation.  */
-
-static void matmul_opencl (const float *A, const float *B, float *C,
-                           unsigned nx, unsigned ny, unsigned nz)
-  __attribute__ ((task_implementation ("opencl", matmul)));
-
-static void
-matmul_opencl (const float *A, const float *B, float *C,
-               unsigned nx, unsigned ny, unsigned nz)
-@{
-  /* Code that invokes the OpenCL kernel here...  */
-@}
-#endif
-
-int
-main (int argc, char *argv[])
-@{
-  /* The pragmas below are simply ignored when StarPU-GCC
-     is not used.  */
-#pragma starpu initialize
-
-  float A[123][42][7], B[123][42][7], C[123][42][7];
-
-#pragma starpu register A
-#pragma starpu register B
-#pragma starpu register C
-
-  /* When StarPU-GCC is used, the call below is asynchronous;
-     otherwise, it is synchronous.  */
-  matmul ((float *) A, (float *) B, (float *) C, 123, 42, 7);
-
-#pragma starpu wait
-#pragma starpu shutdown
-
-  return EXIT_SUCCESS;
-@}
-@end smallexample
-
-@noindent
-The above program is a valid StarPU program when StarPU's GCC plug-in is
-used; it is also a valid sequential program when the plug-in is not
-used.
-
-Note that attributes such as @code{task} as well as @code{starpu}
-pragmas are simply ignored by GCC when the StarPU plug-in is not loaded.
-However, @command{gcc -Wall} emits a warning for unknown attributes and
-pragmas, which can be inconvenient.  In addition, other compilers may be
-unable to parse the attribute syntax@footnote{In practice, Clang and
-several proprietary compilers implement attributes.}, so you may want to
-wrap attributes in macros like this:
-
-@smallexample
-/* Use the `task' attribute only when StarPU's GCC plug-in
-   is available.   */
-#ifdef STARPU_GCC_PLUGIN
-# define __task  __attribute__ ((task))
-#else
-# define __task
-#endif
-
-static void matmul (const float *A, const float *B, float *C,
-                    unsigned nx, unsigned ny, unsigned nz) __task;
-@end smallexample
-
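-The same trick does not directly apply to pragmas, which cannot appear
-inside a macro body; the standard C99 @code{_Pragma} operator can be
-used instead.  The wrapper below is a sketch of one possible approach,
-not an interface provided by StarPU:
-
-@smallexample
-/* Expand to a StarPU pragma only when the GCC plug-in
-   is available.  */
-#ifdef STARPU_GCC_PLUGIN
-# define STARPU_PRAGMA(args)  _Pragma (#args)
-#else
-# define STARPU_PRAGMA(args)
-#endif
-
-/* Equivalent to `#pragma starpu initialize'.  */
-STARPU_PRAGMA (starpu initialize)
-@end smallexample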
-
-@c Local Variables:
-@c TeX-master: "../starpu.texi"
-@c ispell-local-dictionary: "american"
-@c End:

+ 0 - 672
doc/texinfo/chapters/configuration.texi

@@ -1,672 +0,0 @@
-@c -*-texinfo-*-
-
-@c This file is part of the StarPU Handbook.
-@c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
-@c Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
-@c Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
-@c See the file starpu.texi for copying conditions.
-
-@menu
-* Compilation configuration::   
-* Execution configuration through environment variables::  
-@end menu
-
-@node Compilation configuration
-@section Compilation configuration
-
-The following arguments can be given to the @code{configure} script.
-
-@menu
-* Common configuration::
-* Configuring workers::
-* Extension configuration::
-* Advanced configuration::
-@end menu
-
-@node Common configuration
-@subsection Common configuration
-
-@defvr {Configure option} --enable-debug
-Enable debugging messages.
-@end defvr
-
-@defvr {Configure option} --enable-fast
-Disable assertion checks, which saves computation time.
-@end defvr
-
-@defvr {Configure option} --enable-verbose
-Increase the verbosity of the debugging messages.  This can be disabled
-at runtime by setting the environment variable @code{STARPU_SILENT} to
-any value.
-
-@smallexample
-$ STARPU_SILENT=1 ./vector_scal
-@end smallexample
-@end defvr
-
-@defvr {Configure option} --enable-coverage
-Enable flags for the @code{gcov} coverage tool.
-@end defvr
-
-@defvr {Configure option} --enable-quick-check
-Specify that tests and examples should be run on a smaller data set,
-i.e. allowing a faster execution time.
-@end defvr
-
-@defvr {Configure option} --enable-long-check
-Enable some exhaustive checks which take a really long time.
-@end defvr
-
-@defvr {Configure option} --with-hwloc
-Specify that hwloc should be used by StarPU. hwloc should be found by
-means of the tool @code{pkg-config}.
-@end defvr
-
-@defvr {Configure option} --with-hwloc=@var{prefix}
-Specify that hwloc should be used by StarPU. hwloc should be found in
-the directory specified by @var{prefix}.
-@end defvr
-
-@defvr {Configure option} --without-hwloc
-Specify that hwloc should not be used by StarPU.
-@end defvr
-
-@defvr {Configure option} --disable-build-doc
-Disable the creation of the documentation. This should be done on a
-machine which does not have the tools @code{makeinfo} and @code{tex}.
-@end defvr
-
-Additionally, the @command{configure} script recognizes many variables, which
-can be listed by typing @code{./configure --help}. For example,
-@code{./configure NVCCFLAGS="-arch sm_13"} adds a flag for the compilation of
-CUDA kernels.
-
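-For instance, a build configured for a machine with CUDA installed in a
-non-standard location might combine several of the options described in
-this chapter (the paths and values are only an example):
-
-@smallexample
-$ ./configure --with-cuda-dir=/usr/local/cuda \
-              --with-hwloc --enable-maxcpus=12
-@end smallexample
-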
-@node Configuring workers
-@subsection Configuring workers
-
-@defvr {Configure option} --enable-maxcpus=@var{count}
-Use at most @var{count} CPU cores.  This information is then
-available as the @code{STARPU_MAXCPUS} macro.
-@end defvr
-
-@defvr {Configure option} --disable-cpu
-Disable the use of CPUs of the machine. Only GPUs etc. will be used.
-@end defvr
-
-@defvr {Configure option} --enable-maxcudadev=@var{count}
-Use at most @var{count} CUDA devices.  This information is then
-available as the @code{STARPU_MAXCUDADEVS} macro.
-@end defvr
-
-@defvr {Configure option} --disable-cuda
-Disable the use of CUDA, even if a valid CUDA installation was detected.
-@end defvr
-
-@defvr {Configure option} --with-cuda-dir=@var{prefix}
-Search for CUDA under @var{prefix}, which should notably contain
-@file{include/cuda.h}.
-@end defvr
-
-@defvr {Configure option} --with-cuda-include-dir=@var{dir}
-Search for CUDA headers under @var{dir}, which should
-notably contain @code{cuda.h}. This defaults to @code{/include} appended to the
-value given to @code{--with-cuda-dir}.
-@end defvr
-
-@defvr {Configure option} --with-cuda-lib-dir=@var{dir}
-Search for CUDA libraries under @var{dir}, which should notably contain
-the CUDA shared libraries---e.g., @file{libcuda.so}.  This defaults to
-@code{/lib} appended to the value given to @code{--with-cuda-dir}.
-@end defvr
-
-@defvr {Configure option} --disable-cuda-memcpy-peer
-Explicitly disable peer transfers when using CUDA 4.0.
-@end defvr
-
-@defvr {Configure option} --enable-maxopencldev=@var{count}
-Use at most @var{count} OpenCL devices.  This information is then
-available as the @code{STARPU_MAXOPENCLDEVS} macro.
-@end defvr
-
-@defvr {Configure option} --disable-opencl
-Disable the use of OpenCL, even if the SDK is detected.
-@end defvr
-
-@defvr {Configure option} --with-opencl-dir=@var{prefix}
-Search for an OpenCL implementation under @var{prefix}, which should
-notably contain @file{include/CL/cl.h} (or @file{include/OpenCL/cl.h} on
-Mac OS).
-@end defvr
-
-@defvr {Configure option} --with-opencl-include-dir=@var{dir}
-Search for OpenCL headers under @var{dir}, which should notably contain
-@file{CL/cl.h} (or @file{OpenCL/cl.h} on Mac OS).  This defaults to
-@code{/include} appended to the value given to @code{--with-opencl-dir}.
-@end defvr
-
-@defvr {Configure option} --with-opencl-lib-dir=@var{dir}
-Search for an OpenCL library under @var{dir}, which should notably
-contain the OpenCL shared libraries---e.g. @file{libOpenCL.so}. This defaults to
-@code{/lib} appended to the value given to @code{--with-opencl-dir}.
-@end defvr
-
-@defvr {Configure option} --enable-opencl-simulator
-Enable considering the provided OpenCL implementation as a simulator, i.e. use
-the kernel duration returned by OpenCL profiling information as wallclock time
-instead of the actual measured real time. This requires simgrid support.
-@end defvr
-
-@defvr {Configure option} --enable-maximplementations=@var{count}
-Allow for at most @var{count} codelet implementations for the same
-target device.  This information is then available as the
-@code{STARPU_MAXIMPLEMENTATIONS} macro.
-@end defvr
-
-@defvr {Configure option} --enable-max-sched-ctxs=@var{count}
-Allow for at most @var{count} scheduling contexts.
-This information is then available as the
-@code{STARPU_NMAX_SCHED_CTXS} macro.
-@end defvr
-
-@defvr {Configure option} --disable-asynchronous-copy
-Disable asynchronous copies between CPU and GPU devices.
-The AMD implementation of OpenCL is known to
-fail when copying data asynchronously. When using this implementation,
-it is therefore necessary to disable asynchronous data transfers.
-@end defvr
-
-@defvr {Configure option} --disable-asynchronous-cuda-copy
-Disable asynchronous copies between CPU and CUDA devices.
-@end defvr
-
-@defvr {Configure option} --disable-asynchronous-opencl-copy
-Disable asynchronous copies between CPU and OpenCL devices.
-The AMD implementation of OpenCL is known to
-fail when copying data asynchronously. When using this implementation,
-it is therefore necessary to disable asynchronous data transfers.
-@end defvr
-
-@node Extension configuration
-@subsection Extension configuration
-
-@defvr {Configure option} --disable-socl
-Disable the SOCL extension (@pxref{SOCL OpenCL Extensions}).  By
-default, it is enabled when an OpenCL implementation is found.
-@end defvr
-
-@defvr {Configure option} --disable-starpu-top
-Disable the StarPU-Top interface (@pxref{StarPU-Top}).  By default, it
-is enabled when the required dependencies are found.
-@end defvr
-
-@defvr {Configure option} --disable-gcc-extensions
-Disable the GCC plug-in (@pxref{C Extensions}).  By default, it is
-enabled when the GCC compiler provides a plug-in support.
-@end defvr
-
-@defvr {Configure option} --with-mpicc=@var{path}
-Use the @command{mpicc} compiler at @var{path}, for StarPU-MPI.
-(@pxref{StarPU MPI support}).
-@end defvr
-
-@defvr {Configure option} --enable-mpi-progression-hook
-Enable the activity polling method for StarPU-MPI.
-@end defvr
-
-@node Advanced configuration
-@subsection Advanced configuration
-
-@defvr {Configure option} --enable-perf-debug
-Enable performance debugging through gprof.
-@end defvr
-
-@defvr {Configure option} --enable-model-debug
-Enable performance model debugging.
-@end defvr
-
-@defvr {Configure option} --enable-stats
-@c see ../../src/datawizard/datastats.c
-Enable gathering of various data statistics (@pxref{Data statistics}).
-@end defvr
-
-@defvr {Configure option} --enable-maxbuffers=@var{count}
-@anchor{--enable-maxbuffers}
-Define the maximum number of buffers that tasks will be able to take
-as parameters, then available as the @code{STARPU_NMAXBUFS} macro.
-@end defvr
-
-@defvr {Configure option} --enable-allocation-cache
-Enable the use of a data allocation cache, which avoids the cost of
-repeated memory allocations, notably with CUDA. Still experimental.
-@end defvr
-
-@defvr {Configure option} --enable-opengl-render
-Enable the use of OpenGL for the rendering of some examples.
-@c TODO: rather default to enabled when detected
-@end defvr
-
-@defvr {Configure option} --enable-blas-lib
-Specify the BLAS library to be used by some of the examples. The
-library must be @code{atlas} or @code{goto}.
-@end defvr
-
-@defvr {Configure option} --disable-starpufft
-Disable the build of libstarpufft, even if fftw or cuFFT is available.
-@end defvr
-
-@defvr {Configure option} --with-magma=@var{prefix}
-Search for MAGMA under @var{prefix}.  @var{prefix} should notably
-contain @file{include/magmablas.h}.
-@end defvr
-
-@defvr {Configure option} --with-fxt=@var{prefix}
-Search for FxT under @var{prefix}.
-@url{http://savannah.nongnu.org/projects/fkt, FxT} is used to generate
-traces of scheduling events, which can then be rendered using ViTE
-(@pxref{Off-line, off-line performance feedback}).  @var{prefix} should
-notably contain @code{include/fxt/fxt.h}.
-@end defvr
-
-@defvr {Configure option} --with-perf-model-dir=@var{dir}
-Store performance models under @var{dir}, instead of the current user's
-home.
-@end defvr
-
-@defvr {Configure option} --with-goto-dir=@var{prefix}
-Search for GotoBLAS under @var{prefix}, which should notably contain @file{libgoto.so} or @file{libgoto2.so}.
-@end defvr
-
-@defvr {Configure option} --with-atlas-dir=@var{prefix}
-Search for ATLAS under @var{prefix}, which should notably contain
-@file{include/cblas.h}.
-@end defvr
-
-@defvr {Configure option} --with-mkl-cflags=@var{cflags}
-Use @var{cflags} to compile code that uses the MKL library.
-@end defvr
-
-@defvr {Configure option} --with-mkl-ldflags=@var{ldflags}
-Use @var{ldflags} when linking code that uses the MKL library.  Note
-that the
-@url{http://software.intel.com/en-us/articles/intel-mkl-link-line-advisor/,
-MKL website} provides a script to determine the linking flags.
-@end defvr
-
-@defvr {Configure option} --disable-build-examples
-Disable the build of examples.
-@end defvr
-
-
-@defvr {Configure option} --enable-sc-hypervisor
-Enable the Scheduling Context Hypervisor plugin (@pxref{Scheduling Context Hypervisor}).
-By default, it is disabled.
-@end defvr
-
-@defvr {Configure option} --enable-memory-stats
-Enable memory statistics (@pxref{Memory feedback}).
-@end defvr
-
-@defvr {Configure option} --enable-simgrid
-Enable simulation of execution in simgrid, to allow easy experimentation with
-various numbers of cores and GPUs, or amount of memory, etc. Experimental.
-
-The path to simgrid can be specified through the @code{SIMGRID_CFLAGS} and
-@code{SIMGRID_LIBS} environment variables, for instance:
-@example
-export SIMGRID_CFLAGS="-I/usr/local/simgrid/include"
-export SIMGRID_LIBS="-L/usr/local/simgrid/lib -lsimgrid"
-@end example
-@end defvr
-
-@node Execution configuration through environment variables
-@section Execution configuration through environment variables
-
-@menu
-* Workers::                     Configuring workers
-* Scheduling::                  Configuring the Scheduling engine
-* Extensions::
-* Misc::                        Miscellaneous and debug
-@end menu
-
-@node Workers
-@subsection Configuring workers
-
-@defvr {Environment variable} STARPU_NCPU
-Specify the number of CPU workers (thus not including workers dedicated to control accelerators). Note that by default, StarPU will not allocate
-more CPU workers than there are physical CPUs, and that some CPUs are used to control
-the accelerators.
-@end defvr
-
-@defvr {Environment variable} STARPU_NCPUS
-This variable is deprecated. You should use @code{STARPU_NCPU}.
-@end defvr
-
-@defvr {Environment variable} STARPU_NCUDA
-Specify the number of CUDA devices that StarPU can use. If
-@code{STARPU_NCUDA} is lower than the number of physical devices, it is
-possible to select which CUDA devices should be used by the means of the
-@code{STARPU_WORKERS_CUDAID} environment variable. By default, StarPU will
-create as many CUDA workers as there are CUDA devices.
-@end defvr
-
-@defvr {Environment variable} STARPU_NOPENCL
-OpenCL equivalent of the @code{STARPU_NCUDA} environment variable.
-@end defvr
-
-@defvr {Environment variable} STARPU_OPENCL_ON_CPUS
-By default, the OpenCL driver only enables GPU and accelerator
-devices. By setting the environment variable
-@code{STARPU_OPENCL_ON_CPUS} to 1, the OpenCL driver will also enable
-CPU devices.
-@end defvr
-
-@defvr {Environment variable} STARPU_OPENCL_ONLY_ON_CPUS
-By default, the OpenCL driver enables GPU and accelerator
-devices. By setting the environment variable
-@code{STARPU_OPENCL_ONLY_ON_CPUS} to 1, the OpenCL driver will ONLY enable
-CPU devices.
-@end defvr
-
-@defvr {Environment variable} STARPU_WORKERS_NOBIND
-Setting it to non-zero will prevent StarPU from binding its threads to
-CPUs. This is for instance useful when running the testsuite in parallel.
-@end defvr
-
-@defvr {Environment variable} STARPU_WORKERS_CPUID
-Passing an array of integers (starting from 0) in @code{STARPU_WORKERS_CPUID}
-specifies on which logical CPU the different workers should be
-bound. For instance, if @code{STARPU_WORKERS_CPUID = "0 1 4 5"}, the first
-worker will be bound to logical CPU #0, the second CPU worker will be bound to
-logical CPU #1 and so on.  Note that the logical ordering of the CPUs is either
-determined by the OS, or provided by the @code{hwloc} library in case it is
-available.
-
-Note that the first workers correspond to the CUDA workers, then come the
-OpenCL workers, and finally the CPU workers. For example if
-we have @code{STARPU_NCUDA=1}, @code{STARPU_NOPENCL=1}, @code{STARPU_NCPU=2}
-and @code{STARPU_WORKERS_CPUID = "0 2 1 3"}, the CUDA device will be controlled
-by logical CPU #0, the OpenCL device will be controlled by logical CPU #2, and
-the logical CPUs #1 and #3 will be used by the CPU workers.
-
-If the number of workers is larger than the array given in
-@code{STARPU_WORKERS_CPUID}, the workers are bound to the logical CPUs in a
-round-robin fashion: if @code{STARPU_WORKERS_CPUID = "0 1"}, the first and the
-third (resp. second and fourth) workers will be put on CPU #0 (resp. CPU #1).
-
-This variable is ignored if the @code{use_explicit_workers_bindid} flag of the
-@code{starpu_conf} structure passed to @code{starpu_init} is set.
-@end defvr
-
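-For example, the following invocation (with a hypothetical application
-binary) creates three CPU workers bound to logical CPUs #0, #2 and #4:
-
-@smallexample
-$ STARPU_NCPU=3 STARPU_WORKERS_CPUID="0 2 4" ./my_app
-@end smallexample
-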
-@defvr {Environment variable} STARPU_WORKERS_CUDAID
-Similarly to the @code{STARPU_WORKERS_CPUID} environment variable, it is
-possible to select which CUDA devices should be used by StarPU. On a machine
-equipped with 4 GPUs, setting @code{STARPU_WORKERS_CUDAID = "1 3"} and
-@code{STARPU_NCUDA=2} specifies that 2 CUDA workers should be created, and that
-they should use CUDA devices #1 and #3 (the logical ordering of the devices is
-the one reported by CUDA).
-
-This variable is ignored if the @code{use_explicit_workers_cuda_gpuid} flag of
-the @code{starpu_conf} structure passed to @code{starpu_init} is set.
-@end defvr
-
-@defvr {Environment variable} STARPU_WORKERS_OPENCLID
-OpenCL equivalent of the @code{STARPU_WORKERS_CUDAID} environment variable.
-
-This variable is ignored if the @code{use_explicit_workers_opencl_gpuid} flag of
-the @code{starpu_conf} structure passed to @code{starpu_init} is set.
-@end defvr
-
-@defvr {Environment variable} STARPU_SINGLE_COMBINED_WORKER
-If set, StarPU will create several workers which won't be able to work
-concurrently. It will by default create combined workers whose size goes from 1
-to the total number of CPU workers in the system. @code{STARPU_MIN_WORKERSIZE}
-and @code{STARPU_MAX_WORKERSIZE} can be used to change this default.
-@end defvr
-
-@defvr {Environment variable} STARPU_MIN_WORKERSIZE
-When @code{STARPU_SINGLE_COMBINED_WORKER} is set, @code{STARPU_MIN_WORKERSIZE}
-makes it possible to specify the minimum size of the combined workers
-(instead of the default 1).
-@end defvr
-
-@defvr {Environment variable} STARPU_MAX_WORKERSIZE
-When @code{STARPU_SINGLE_COMBINED_WORKER} is set, @code{STARPU_MAX_WORKERSIZE}
-makes it possible to specify the maximum size of the combined workers
-(instead of the total number of CPU workers in the system).
-@end defvr
-
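-For instance, to restrict the synthesized combined workers to sizes
-between 2 and 4 (the application binary is hypothetical):
-
-@smallexample
-$ STARPU_SINGLE_COMBINED_WORKER=1 STARPU_MIN_WORKERSIZE=2 \
-  STARPU_MAX_WORKERSIZE=4 ./my_app
-@end smallexample
-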
-@defvr {Environment variable} STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER
-Let the user decide how many elements are allowed between combined workers
-created from hwloc information. For instance, in the case of sockets with 6
-cores without shared L2 caches, if @code{SYNTHESIZE_ARITY_COMBINED_WORKER} is
-set to 6, no combined worker will be synthesized beyond one for the socket
-and one per core. If it is set to 3, 3 intermediate combined workers will be
-synthesized, to divide the socket cores into 3 chunks of 2 cores. If it is
-set to 2, 2 intermediate combined workers will be synthesized, to divide the
-socket cores into 2 chunks of 3 cores, and then 3 additional combined workers
-will be synthesized, to divide the former synthesized workers into a group of
-2 cores plus the remaining core (for which no combined worker is synthesized,
-since there is already a normal worker for it).
-
-The default, 2, thus makes StarPU tend to build binary trees of combined
-workers.
-@end defvr
-
-@defvr {Environment variable} STARPU_DISABLE_ASYNCHRONOUS_COPY
-Disable asynchronous copies between CPU and GPU devices.
-The AMD implementation of OpenCL is known to
-fail when copying data asynchronously. When using this implementation,
-it is therefore necessary to disable asynchronous data transfers.
-@end defvr
-
-@defvr {Environment variable} STARPU_DISABLE_ASYNCHRONOUS_CUDA_COPY
-Disable asynchronous copies between CPU and CUDA devices.
-@end defvr
-
-@defvr {Environment variable} STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY
-Disable asynchronous copies between CPU and OpenCL devices.
-The AMD implementation of OpenCL is known to
-fail when copying data asynchronously. When using this implementation,
-it is therefore necessary to disable asynchronous data transfers.
-@end defvr
-
-@defvr {Environment variable} STARPU_DISABLE_ASYNCHRONOUS_MIC_COPY
-Disable asynchronous copies between CPU and MIC devices.
-@end defvr
-
-@defvr {Environment variable} STARPU_ENABLE_CUDA_GPU_GPU_DIRECT
-Enable direct CUDA transfers from GPU to GPU, without copying through RAM.
-This permits to test the performance effect of GPU-Direct.
-@end defvr
-
-@node Scheduling
-@subsection Configuring the Scheduling engine
-
-@defvr {Environment variable} STARPU_SCHED
-Choose between the different scheduling policies proposed by StarPU: work
-stealing, random, greedy, with performance models, etc.
-
-Use @code{STARPU_SCHED=help} to get the list of available schedulers.
-@end defvr
-
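-For example, to run an application under the @code{dmda} policy, which
-uses performance models (the binary name is hypothetical):
-
-@smallexample
-$ STARPU_SCHED=dmda ./my_app
-$ STARPU_SCHED=help ./my_app    # list the available policies
-@end smallexample
-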
-@defvr {Environment variable} STARPU_CALIBRATE
-If this variable is set to 1, the performance models are calibrated during
-the execution. If it is set to 2, the previous values are dropped to restart
-calibration from scratch. Setting this variable to 0 disables calibration,
-which is the default behaviour.
-
-Note: this currently only applies to @code{dm} and @code{dmda} scheduling policies.
-@end defvr
-
-@defvr {Environment variable} STARPU_BUS_CALIBRATE
-If this variable is set to 1, the bus is recalibrated during initialization.
-@end defvr
-
-@defvr {Environment variable} STARPU_PREFETCH
-@anchor{STARPU_PREFETCH}
-This variable indicates whether data prefetching should be enabled (0 means
-that it is disabled). If prefetching is enabled, when a task is scheduled to be
-executed e.g. on a GPU, StarPU will request an asynchronous transfer in
-advance, so that data is already present on the GPU when the task starts. As a
-result, computation and data transfers are overlapped.
-Note that prefetching is enabled by default in StarPU.
-@end defvr
-
-@defvr {Environment variable} STARPU_SCHED_ALPHA
-To estimate the cost of a task StarPU takes into account the estimated
-computation time (obtained thanks to performance models). The alpha factor is
-the coefficient to be applied to it before adding it to the communication part.
-@end defvr
-
-@defvr {Environment variable} STARPU_SCHED_BETA
-To estimate the cost of a task StarPU takes into account the estimated
-data transfer time (obtained thanks to performance models). The beta factor is
-the coefficient to be applied to it before adding it to the computation part.
-@end defvr
-
-@defvr {Environment variable} STARPU_SCHED_GAMMA
-Define the execution time penalty of a joule (@pxref{Power-based scheduling}).
-@end defvr
-
-@defvr {Environment variable} STARPU_IDLE_POWER
-Define the idle power of the machine (@pxref{Power-based scheduling}).
-@end defvr
-
-@defvr {Environment variable} STARPU_PROFILING
-Enable on-line performance monitoring (@pxref{Enabling on-line performance monitoring}).
-@end defvr
-
-@node Extensions
-@subsection Extensions
-
-@defvr {Environment variable} SOCL_OCL_LIB_OPENCL
-The SOCL test suite is only run when the environment variable
-@code{SOCL_OCL_LIB_OPENCL} is defined. It should contain the location
-of the @file{libOpenCL.so} file of the OCL ICD implementation.
-@end defvr
-
-@defvr {Environment variable} STARPU_COMM_STATS
-@anchor{STARPU_COMM_STATS}
-Communication statistics for starpumpi (@pxref{StarPU MPI support})
-will be enabled when the environment variable @code{STARPU_COMM_STATS}
-is defined to a value other than 0.
-@end defvr
-
-@defvr {Environment variable} STARPU_MPI_CACHE
-@anchor{STARPU_MPI_CACHE}
-Communication cache for starpumpi (@pxref{StarPU MPI support}) will be
-disabled when the environment variable @code{STARPU_MPI_CACHE} is set
-to 0. It is enabled by default or for any other values of the variable
-@code{STARPU_MPI_CACHE}.
-@end defvr
-
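-For instance, with Open MPI's @command{mpirun}, whose @code{-x} option
-exports an environment variable to all ranks (the binary name is
-hypothetical), the cache can be disabled as follows:
-
-@smallexample
-$ mpirun -np 4 -x STARPU_MPI_CACHE=0 ./my_mpi_app
-@end smallexample
-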
-@node Misc
-@subsection Miscellaneous and debug
-
-@defvr {Environment variable} STARPU_HOME
-@anchor{STARPU_HOME}
-This specifies the main directory in which StarPU stores its
-configuration files. The default is @code{$HOME} on Unix environments,
-and @code{$USERPROFILE} on Windows environments.
-@end defvr
-
-@defvr {Environment variable} STARPU_HOSTNAME
-When set, force the hostname to be used when dealing with performance model
-files. Models are indexed by machine name. When running for example on
-a homogeneous cluster, it is possible to share the models between
-machines by setting @code{export STARPU_HOSTNAME=some_global_name}.
-@end defvr
-
-@defvr {Environment variable} STARPU_OPENCL_PROGRAM_DIR
-@anchor{STARPU_OPENCL_PROGRAM_DIR}
-This specifies the directory where the OpenCL codelet source files are
-located. The function @ref{starpu_opencl_load_program_source} looks
-for the codelet in the current directory, in the directory specified
-by the environment variable @code{STARPU_OPENCL_PROGRAM_DIR}, in the
-directory @code{share/starpu/opencl} of the installation directory of
-StarPU, and finally in the source directory of StarPU.
-@end defvr
-
-@defvr {Environment variable} STARPU_SILENT
-This variable allows one to disable verbose mode at runtime when StarPU
-has been configured with the option @code{--enable-verbose}. It also
-disables the display of StarPU information and warning messages.
-@end defvr
-
-@defvr {Environment variable} STARPU_LOGFILENAME
-This variable specifies the file to which the debugging output should be saved.
-@end defvr
-
-@defvr {Environment variable} STARPU_FXT_PREFIX
-This variable specifies the directory in which to save the trace generated when FxT is enabled. It needs to have a trailing '/' character.
-@end defvr
-
-@defvr {Environment variable} STARPU_LIMIT_CUDA_devid_MEM
-This variable specifies the maximum number of megabytes that should be
-available to the application on the CUDA device with the identifier
-@code{devid}. This variable is intended to be used for experimental
-purposes as it emulates devices that have a limited amount of memory.
-When defined, the variable overwrites the value of the variable
-@code{STARPU_LIMIT_CUDA_MEM}.
-@end defvr
-
-@defvr {Environment variable} STARPU_LIMIT_CUDA_MEM
-This variable specifies the maximum number of megabytes that should be
-available to the application on each CUDA device. This variable is
-intended to be used for experimental purposes as it emulates devices
-that have a limited amount of memory.
-@end defvr
-
-@defvr {Environment variable} STARPU_LIMIT_OPENCL_devid_MEM
-This variable specifies the maximum number of megabytes that should be
-available to the application on the OpenCL device with the identifier
-@code{devid}. This variable is intended to be used for experimental
-purposes as it emulates devices that have a limited amount of memory.
-When defined, the variable overwrites the value of the variable
-@code{STARPU_LIMIT_OPENCL_MEM}.
-@end defvr
-
-@defvr {Environment variable} STARPU_LIMIT_OPENCL_MEM
-This variable specifies the maximum number of megabytes that should be
-available to the application on each OpenCL device. This variable is
-intended to be used for experimental purposes as it emulates devices
-that have a limited amount of memory.
-@end defvr
-
-@defvr {Environment variable} STARPU_LIMIT_CPU_MEM
-This variable specifies the maximum number of megabytes that should be
-available to the application on each CPU device. This variable is
-intended to be used for experimental purposes as it emulates devices
-that have a limited amount of memory.
-@end defvr
-
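-For instance, to restrict CUDA device #0 to 1024 MB and every OpenCL
-device to 512 MB (the binary name is hypothetical):
-
-@smallexample
-$ STARPU_LIMIT_CUDA_0_MEM=1024 STARPU_LIMIT_OPENCL_MEM=512 ./my_app
-@end smallexample
-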
-@defvr {Environment variable} STARPU_GENERATE_TRACE
-When set to @code{1}, this variable indicates that StarPU should automatically
-generate a Paje trace when @code{starpu_shutdown()} is called.
-@end defvr
-
-@defvr {Environment variable} STARPU_MEMORY_STATS
-When set to 0, disable the display of memory statistics on data which
-have not been unregistered at the end of the execution (@pxref{Memory
-feedback}).
-@end defvr
-
-@defvr {Environment variable} STARPU_BUS_STATS
-When defined, statistics about data transfers will be displayed when calling
-@code{starpu_shutdown()} (@pxref{Profiling}).
-@end defvr
-
-@defvr {Environment variable} STARPU_WORKER_STATS
-When defined, statistics about the workers will be displayed when calling
-@code{starpu_shutdown()} (@pxref{Profiling}). When combined with the
-environment variable @code{STARPU_PROFILING}, it displays the power
-consumption (@pxref{Power-based scheduling}).
-@end defvr
-
-@defvr {Environment variable} STARPU_STATS
-When set to 0, data statistics will not be displayed at the
-end of the execution of an application (@pxref{Data statistics}).
-@end defvr

+ 0 - 507
doc/texinfo/chapters/fdl-1.3.texi

@@ -1,507 +0,0 @@
-@c -*-texinfo-*-
-
-@c The GNU Free Documentation License.
-@center Version 1.3, 3 November 2008
-
-@c This file is intended to be included within another document,
-@c hence no sectioning command or @node.
-
-@display
-Copyright @copyright{} 2000, 2001, 2002, 2007, 2008 Free Software Foundation, Inc.
-@uref{http://fsf.org/}
-
-Everyone is permitted to copy and distribute verbatim copies
-of this license document, but changing it is not allowed.
-@end display
-
-@enumerate 0
-@item
-PREAMBLE
-
-The purpose of this License is to make a manual, textbook, or other
-functional and useful document @dfn{free} in the sense of freedom: to
-assure everyone the effective freedom to copy and redistribute it,
-with or without modifying it, either commercially or noncommercially.
-Secondarily, this License preserves for the author and publisher a way
-to get credit for their work, while not being considered responsible
-for modifications made by others.
-
-This License is a kind of ``copyleft'', which means that derivative
-works of the document must themselves be free in the same sense.  It
-complements the GNU General Public License, which is a copyleft
-license designed for free software.
-
-We have designed this License in order to use it for manuals for free
-software, because free software needs free documentation: a free
-program should come with manuals providing the same freedoms that the
-software does.  But this License is not limited to software manuals;
-it can be used for any textual work, regardless of subject matter or
-whether it is published as a printed book.  We recommend this License
-principally for works whose purpose is instruction or reference.
-
-@item
-APPLICABILITY AND DEFINITIONS
-
-This License applies to any manual or other work, in any medium, that
-contains a notice placed by the copyright holder saying it can be
-distributed under the terms of this License.  Such a notice grants a
-world-wide, royalty-free license, unlimited in duration, to use that
-work under the conditions stated herein.  The ``Document'', below,
-refers to any such manual or work.  Any member of the public is a
-licensee, and is addressed as ``you''.  You accept the license if you
-copy, modify or distribute the work in a way requiring permission
-under copyright law.
-
-A ``Modified Version'' of the Document means any work containing the
-Document or a portion of it, either copied verbatim, or with
-modifications and/or translated into another language.
-
-A ``Secondary Section'' is a named appendix or a front-matter section
-of the Document that deals exclusively with the relationship of the
-publishers or authors of the Document to the Document's overall
-subject (or to related matters) and contains nothing that could fall
-directly within that overall subject.  (Thus, if the Document is in
-part a textbook of mathematics, a Secondary Section may not explain
-any mathematics.)  The relationship could be a matter of historical
-connection with the subject or with related matters, or of legal,
-commercial, philosophical, ethical or political position regarding
-them.
-
-The ``Invariant Sections'' are certain Secondary Sections whose titles
-are designated, as being those of Invariant Sections, in the notice
-that says that the Document is released under this License.  If a
-section does not fit the above definition of Secondary then it is not
-allowed to be designated as Invariant.  The Document may contain zero
-Invariant Sections.  If the Document does not identify any Invariant
-Sections then there are none.
-
-The ``Cover Texts'' are certain short passages of text that are listed,
-as Front-Cover Texts or Back-Cover Texts, in the notice that says that
-the Document is released under this License.  A Front-Cover Text may
-be at most 5 words, and a Back-Cover Text may be at most 25 words.
-
-A ``Transparent'' copy of the Document means a machine-readable copy,
-represented in a format whose specification is available to the
-general public, that is suitable for revising the document
-straightforwardly with generic text editors or (for images composed of
-pixels) generic paint programs or (for drawings) some widely available
-drawing editor, and that is suitable for input to text formatters or
-for automatic translation to a variety of formats suitable for input
-to text formatters.  A copy made in an otherwise Transparent file
-format whose markup, or absence of markup, has been arranged to thwart
-or discourage subsequent modification by readers is not Transparent.
-An image format is not Transparent if used for any substantial amount
-of text.  A copy that is not ``Transparent'' is called ``Opaque''.
-
-Examples of suitable formats for Transparent copies include plain
-ASCII without markup, Texinfo input format, La@TeX{} input
-format, SGML or XML using a publicly available
-DTD, and standard-conforming simple HTML,
-PostScript or PDF designed for human modification.  Examples
-of transparent image formats include PNG, XCF and
-JPG.  Opaque formats include proprietary formats that can be
-read and edited only by proprietary word processors, SGML or
-XML for which the DTD and/or processing tools are
-not generally available, and the machine-generated HTML,
-PostScript or PDF produced by some word processors for
-output purposes only.
-
-The ``Title Page'' means, for a printed book, the title page itself,
-plus such following pages as are needed to hold, legibly, the material
-this License requires to appear in the title page.  For works in
-formats which do not have any title page as such, ``Title Page'' means
-the text near the most prominent appearance of the work's title,
-preceding the beginning of the body of the text.
-
-The ``publisher'' means any person or entity that distributes copies
-of the Document to the public.
-
-A section ``Entitled XYZ'' means a named subunit of the Document whose
-title either is precisely XYZ or contains XYZ in parentheses following
-text that translates XYZ in another language.  (Here XYZ stands for a
-specific section name mentioned below, such as ``Acknowledgements'',
-``Dedications'', ``Endorsements'', or ``History''.)  To ``Preserve the Title''
-of such a section when you modify the Document means that it remains a
-section ``Entitled XYZ'' according to this definition.
-
-The Document may include Warranty Disclaimers next to the notice which
-states that this License applies to the Document.  These Warranty
-Disclaimers are considered to be included by reference in this
-License, but only as regards disclaiming warranties: any other
-implication that these Warranty Disclaimers may have is void and has
-no effect on the meaning of this License.
-
-@item
-VERBATIM COPYING
-
-You may copy and distribute the Document in any medium, either
-commercially or noncommercially, provided that this License, the
-copyright notices, and the license notice saying this License applies
-to the Document are reproduced in all copies, and that you add no other
-conditions whatsoever to those of this License.  You may not use
-technical measures to obstruct or control the reading or further
-copying of the copies you make or distribute.  However, you may accept
-compensation in exchange for copies.  If you distribute a large enough
-number of copies you must also follow the conditions in section 3.
-
-You may also lend copies, under the same conditions stated above, and
-you may publicly display copies.
-
-@item
-COPYING IN QUANTITY
-
-If you publish printed copies (or copies in media that commonly have
-printed covers) of the Document, numbering more than 100, and the
-Document's license notice requires Cover Texts, you must enclose the
-copies in covers that carry, clearly and legibly, all these Cover
-Texts: Front-Cover Texts on the front cover, and Back-Cover Texts on
-the back cover.  Both covers must also clearly and legibly identify
-you as the publisher of these copies.  The front cover must present
-the full title with all words of the title equally prominent and
-visible.  You may add other material on the covers in addition.
-Copying with changes limited to the covers, as long as they preserve
-the title of the Document and satisfy these conditions, can be treated
-as verbatim copying in other respects.
-
-If the required texts for either cover are too voluminous to fit
-legibly, you should put the first ones listed (as many as fit
-reasonably) on the actual cover, and continue the rest onto adjacent
-pages.
-
-If you publish or distribute Opaque copies of the Document numbering
-more than 100, you must either include a machine-readable Transparent
-copy along with each Opaque copy, or state in or with each Opaque copy
-a computer-network location from which the general network-using
-public has access to download using public-standard network protocols
-a complete Transparent copy of the Document, free of added material.
-If you use the latter option, you must take reasonably prudent steps,
-when you begin distribution of Opaque copies in quantity, to ensure
-that this Transparent copy will remain thus accessible at the stated
-location until at least one year after the last time you distribute an
-Opaque copy (directly or through your agents or retailers) of that
-edition to the public.
-
-It is requested, but not required, that you contact the authors of the
-Document well before redistributing any large number of copies, to give
-them a chance to provide you with an updated version of the Document.
-
-@item
-MODIFICATIONS
-
-You may copy and distribute a Modified Version of the Document under
-the conditions of sections 2 and 3 above, provided that you release
-the Modified Version under precisely this License, with the Modified
-Version filling the role of the Document, thus licensing distribution
-and modification of the Modified Version to whoever possesses a copy
-of it.  In addition, you must do these things in the Modified Version:
-
-@enumerate A
-@item
-Use in the Title Page (and on the covers, if any) a title distinct
-from that of the Document, and from those of previous versions
-(which should, if there were any, be listed in the History section
-of the Document).  You may use the same title as a previous version
-if the original publisher of that version gives permission.
-
-@item
-List on the Title Page, as authors, one or more persons or entities
-responsible for authorship of the modifications in the Modified
-Version, together with at least five of the principal authors of the
-Document (all of its principal authors, if it has fewer than five),
-unless they release you from this requirement.
-
-@item
-State on the Title page the name of the publisher of the
-Modified Version, as the publisher.
-
-@item
-Preserve all the copyright notices of the Document.
-
-@item
-Add an appropriate copyright notice for your modifications
-adjacent to the other copyright notices.
-
-@item
-Include, immediately after the copyright notices, a license notice
-giving the public permission to use the Modified Version under the
-terms of this License, in the form shown in the Addendum below.
-
-@item
-Preserve in that license notice the full lists of Invariant Sections
-and required Cover Texts given in the Document's license notice.
-
-@item
-Include an unaltered copy of this License.
-
-@item
-Preserve the section Entitled ``History'', Preserve its Title, and add
-to it an item stating at least the title, year, new authors, and
-publisher of the Modified Version as given on the Title Page.  If
-there is no section Entitled ``History'' in the Document, create one
-stating the title, year, authors, and publisher of the Document as
-given on its Title Page, then add an item describing the Modified
-Version as stated in the previous sentence.
-
-@item
-Preserve the network location, if any, given in the Document for
-public access to a Transparent copy of the Document, and likewise
-the network locations given in the Document for previous versions
-it was based on.  These may be placed in the ``History'' section.
-You may omit a network location for a work that was published at
-least four years before the Document itself, or if the original
-publisher of the version it refers to gives permission.
-
-@item
-For any section Entitled ``Acknowledgements'' or ``Dedications'', Preserve
-the Title of the section, and preserve in the section all the
-substance and tone of each of the contributor acknowledgements and/or
-dedications given therein.
-
-@item
-Preserve all the Invariant Sections of the Document,
-unaltered in their text and in their titles.  Section numbers
-or the equivalent are not considered part of the section titles.
-
-@item
-Delete any section Entitled ``Endorsements''.  Such a section
-may not be included in the Modified Version.
-
-@item
-Do not retitle any existing section to be Entitled ``Endorsements'' or
-to conflict in title with any Invariant Section.
-
-@item
-Preserve any Warranty Disclaimers.
-@end enumerate
-
-If the Modified Version includes new front-matter sections or
-appendices that qualify as Secondary Sections and contain no material
-copied from the Document, you may at your option designate some or all
-of these sections as invariant.  To do this, add their titles to the
-list of Invariant Sections in the Modified Version's license notice.
-These titles must be distinct from any other section titles.
-
-You may add a section Entitled ``Endorsements'', provided it contains
-nothing but endorsements of your Modified Version by various
-parties---for example, statements of peer review or that the text has
-been approved by an organization as the authoritative definition of a
-standard.
-
-You may add a passage of up to five words as a Front-Cover Text, and a
-passage of up to 25 words as a Back-Cover Text, to the end of the list
-of Cover Texts in the Modified Version.  Only one passage of
-Front-Cover Text and one of Back-Cover Text may be added by (or
-through arrangements made by) any one entity.  If the Document already
-includes a cover text for the same cover, previously added by you or
-by arrangement made by the same entity you are acting on behalf of,
-you may not add another; but you may replace the old one, on explicit
-permission from the previous publisher that added the old one.
-
-The author(s) and publisher(s) of the Document do not by this License
-give permission to use their names for publicity for or to assert or
-imply endorsement of any Modified Version.
-
-@item
-COMBINING DOCUMENTS
-
-You may combine the Document with other documents released under this
-License, under the terms defined in section 4 above for modified
-versions, provided that you include in the combination all of the
-Invariant Sections of all of the original documents, unmodified, and
-list them all as Invariant Sections of your combined work in its
-license notice, and that you preserve all their Warranty Disclaimers.
-
-The combined work need only contain one copy of this License, and
-multiple identical Invariant Sections may be replaced with a single
-copy.  If there are multiple Invariant Sections with the same name but
-different contents, make the title of each such section unique by
-adding at the end of it, in parentheses, the name of the original
-author or publisher of that section if known, or else a unique number.
-Make the same adjustment to the section titles in the list of
-Invariant Sections in the license notice of the combined work.
-
-In the combination, you must combine any sections Entitled ``History''
-in the various original documents, forming one section Entitled
-``History''; likewise combine any sections Entitled ``Acknowledgements'',
-and any sections Entitled ``Dedications''.  You must delete all
-sections Entitled ``Endorsements.''
-
-@item
-COLLECTIONS OF DOCUMENTS
-
-You may make a collection consisting of the Document and other documents
-released under this License, and replace the individual copies of this
-License in the various documents with a single copy that is included in
-the collection, provided that you follow the rules of this License for
-verbatim copying of each of the documents in all other respects.
-
-You may extract a single document from such a collection, and distribute
-it individually under this License, provided you insert a copy of this
-License into the extracted document, and follow this License in all
-other respects regarding verbatim copying of that document.
-
-@item
-AGGREGATION WITH INDEPENDENT WORKS
-
-A compilation of the Document or its derivatives with other separate
-and independent documents or works, in or on a volume of a storage or
-distribution medium, is called an ``aggregate'' if the copyright
-resulting from the compilation is not used to limit the legal rights
-of the compilation's users beyond what the individual works permit.
-When the Document is included in an aggregate, this License does not
-apply to the other works in the aggregate which are not themselves
-derivative works of the Document.
-
-If the Cover Text requirement of section 3 is applicable to these
-copies of the Document, then if the Document is less than one half of
-the entire aggregate, the Document's Cover Texts may be placed on
-covers that bracket the Document within the aggregate, or the
-electronic equivalent of covers if the Document is in electronic form.
-Otherwise they must appear on printed covers that bracket the whole
-aggregate.
-
-@item
-TRANSLATION
-
-Translation is considered a kind of modification, so you may
-distribute translations of the Document under the terms of section 4.
-Replacing Invariant Sections with translations requires special
-permission from their copyright holders, but you may include
-translations of some or all Invariant Sections in addition to the
-original versions of these Invariant Sections.  You may include a
-translation of this License, and all the license notices in the
-Document, and any Warranty Disclaimers, provided that you also include
-the original English version of this License and the original versions
-of those notices and disclaimers.  In case of a disagreement between
-the translation and the original version of this License or a notice
-or disclaimer, the original version will prevail.
-
-If a section in the Document is Entitled ``Acknowledgements'',
-``Dedications'', or ``History'', the requirement (section 4) to Preserve
-its Title (section 1) will typically require changing the actual
-title.
-
-@item
-TERMINATION
-
-You may not copy, modify, sublicense, or distribute the Document
-except as expressly provided under this License.  Any attempt
-otherwise to copy, modify, sublicense, or distribute it is void, and
-will automatically terminate your rights under this License.
-
-However, if you cease all violation of this License, then your license
-from a particular copyright holder is reinstated (a) provisionally,
-unless and until the copyright holder explicitly and finally
-terminates your license, and (b) permanently, if the copyright holder
-fails to notify you of the violation by some reasonable means prior to
-60 days after the cessation.
-
-Moreover, your license from a particular copyright holder is
-reinstated permanently if the copyright holder notifies you of the
-violation by some reasonable means, this is the first time you have
-received notice of violation of this License (for any work) from that
-copyright holder, and you cure the violation prior to 30 days after
-your receipt of the notice.
-
-Termination of your rights under this section does not terminate the
-licenses of parties who have received copies or rights from you under
-this License.  If your rights have been terminated and not permanently
-reinstated, receipt of a copy of some or all of the same material does
-not give you any rights to use it.
-
-@item
-FUTURE REVISIONS OF THIS LICENSE
-
-The Free Software Foundation may publish new, revised versions
-of the GNU Free Documentation License from time to time.  Such new
-versions will be similar in spirit to the present version, but may
-differ in detail to address new problems or concerns.  See
-@uref{http://www.gnu.org/copyleft/}.
-
-Each version of the License is given a distinguishing version number.
-If the Document specifies that a particular numbered version of this
-License ``or any later version'' applies to it, you have the option of
-following the terms and conditions either of that specified version or
-of any later version that has been published (not as a draft) by the
-Free Software Foundation.  If the Document does not specify a version
-number of this License, you may choose any version ever published (not
-as a draft) by the Free Software Foundation.  If the Document
-specifies that a proxy can decide which future versions of this
-License can be used, that proxy's public statement of acceptance of a
-version permanently authorizes you to choose that version for the
-Document.
-
-@item
-RELICENSING
-
-``Massive Multiauthor Collaboration Site'' (or ``MMC Site'') means any
-World Wide Web server that publishes copyrightable works and also
-provides prominent facilities for anybody to edit those works.  A
-public wiki that anybody can edit is an example of such a server.  A
-``Massive Multiauthor Collaboration'' (or ``MMC'') contained in the
-site means any set of copyrightable works thus published on the MMC
-site.
-
-``CC-BY-SA'' means the Creative Commons Attribution-Share Alike 3.0
-license published by Creative Commons Corporation, a not-for-profit
-corporation with a principal place of business in San Francisco,
-California, as well as future copyleft versions of that license
-published by that same organization.
-
-``Incorporate'' means to publish or republish a Document, in whole or
-in part, as part of another Document.
-
-An MMC is ``eligible for relicensing'' if it is licensed under this
-License, and if all works that were first published under this License
-somewhere other than this MMC, and subsequently incorporated in whole
-or in part into the MMC, (1) had no cover texts or invariant sections,
-and (2) were thus incorporated prior to November 1, 2008.
-
-The operator of an MMC Site may republish an MMC contained in the site
-under CC-BY-SA on the same site at any time before August 1, 2009,
-provided the MMC is eligible for relicensing.
-
-@end enumerate
-
-@page
-@heading ADDENDUM: How to use this License for your documents
-
-To use this License in a document you have written, include a copy of
-the License in the document and put the following copyright and
-license notices just after the title page:
-
-@smallexample
-@group
-  Copyright (C)  @var{year}  @var{your name}.
-  Permission is granted to copy, distribute and/or modify this document
-  under the terms of the GNU Free Documentation License, Version 1.3
-  or any later version published by the Free Software Foundation;
-  with no Invariant Sections, no Front-Cover Texts, and no Back-Cover
-  Texts.  A copy of the license is included in the section entitled ``GNU
-  Free Documentation License''.
-@end group
-@end smallexample
-
-If you have Invariant Sections, Front-Cover Texts and Back-Cover Texts,
-replace the ``with@dots{}Texts.'' line with this:
-
-@smallexample
-@group
-    with the Invariant Sections being @var{list their titles}, with
-    the Front-Cover Texts being @var{list}, and with the Back-Cover Texts
-    being @var{list}.
-@end group
-@end smallexample
-
-If you have Invariant Sections without Cover Texts, or some other
-combination of the three, merge those two alternatives to suit the
-situation.
-
-If your document contains nontrivial examples of program code, we
-recommend releasing these examples in parallel under your choice of
-free software license, such as the GNU General Public License,
-to permit their use in free software.
-
-@c Local Variables:
-@c ispell-local-pdict: "ispell-dict"
-@c End:

+ 0 - 58
doc/texinfo/chapters/fft-support.texi

@@ -1,58 +0,0 @@
-@c -*-texinfo-*-
-
-@c This file is part of the StarPU Handbook.
-@c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
-@c Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
-@c Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
-@c See the file starpu.texi for copying conditions.
-
-StarPU provides @code{libstarpufft}, a library whose design is very similar to
-both fftw and cufft, the difference being that it benefits from both CPUs
-and GPUs. It should however be noted that GPUs do not have the same precision as
-CPUs, so the results may differ by a negligible amount.
-
-Different precisions are available, namely float, double and long
-double precisions, with the following fftw naming conventions:
-
-@enumerate
-@item double precision structures and functions are named e.g. @code{starpufft_execute}
-@item float precision structures and functions are named e.g. @code{starpufftf_execute}
-@item long double precision structures and functions are named e.g. @code{starpufftl_execute}
-@end enumerate
-
-The documentation below is given with names for double precision; replace
-@code{starpufft_} with @code{starpufftf_} or @code{starpufftl_} as appropriate.
-
-Only complex numbers are supported at the moment.
-
-The application has to call @code{starpu_init} before calling starpufft functions.
-
-Either main memory pointers or data handles can be provided.
-
-@enumerate
-@item To provide main memory pointers, use @code{starpufft_start} or
-@code{starpufft_execute}. Only one FFT can be performed at a time, because
-StarPU will have to register the data on the fly. In the @code{starpufft_start}
-case, @code{starpufft_cleanup} needs to be called to unregister the data.
-@item To provide data handles (which is preferable), use
-@code{starpufft_start_handle} or
-@code{starpufft_execute_handle}. Several FFT tasks can be submitted
-for a given plan, which makes it possible e.g. to start a series of FFTs with
-just one plan. @code{starpufft_start_handle} is preferable since it does not
-wait for task completion, and thus makes it possible to enqueue a series of tasks.
-@end enumerate
-
-All functions are defined in @ref{FFT Support}.
-
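-As an illustration, here is a minimal sketch of a one-dimensional
-transform using main memory pointers. It assumes fftw-style entry
-points (@code{starpufft_plan_dft_1d}, @code{starpufft_malloc},
-@code{starpufft_free}) following the naming conventions above, and is
-not a complete program:
-
-@cartouche
-@smallexample
-#include <starpu.h>
-#include <starpufft.h>
-
-#define N 1024
-
-int main(void)
-@{
-    starpufft_complex *in, *out;
-    starpufft_plan plan;
-
-    starpu_init(NULL);
-    in  = starpufft_malloc(N * sizeof(*in));
-    out = starpufft_malloc(N * sizeof(*out));
-    /* ... fill in[] ... */
-
-    plan = starpufft_plan_dft_1d(N, STARPUFFT_FORWARD, 0);
-    /* starpufft_execute blocks until the transform completes. */
-    starpufft_execute(plan, in, out);
-
-    starpufft_destroy_plan(plan);
-    starpufft_free(in);
-    starpufft_free(out);
-    starpu_shutdown();
-    return 0;
-@}
-@end smallexample
-@end cartouche
-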
-@section Compilation
-
-The flags required to compile or link against the FFT library are accessible
-with the following commands:
-
-@example
-$ pkg-config --cflags starpufft-1.0  # options for the compiler
-$ pkg-config --libs starpufft-1.0    # options for the linker
-@end example
-
-Also pass the @code{--static} option if the application is to be linked statically.
-

+ 0 - 301
doc/texinfo/chapters/hypervisor_api.texi

@@ -1,301 +0,0 @@
-@c -*-texinfo-*-
-
-@c This file is part of the StarPU Handbook.
-@c Copyright (C) 2011--2013 Institut National de Recherche en Informatique et Automatique
-@c See the file starpu.texi for copying conditions.
-
-
-@cindex Scheduling Context Hypervisor's API
-
-@menu
-* Managing the hypervisor::				Initialize the hypervisor
-* Registering Scheduling Contexts to the hypervisor:: 	Contexts have to register to the hypervisor
-* The user's input in the resizing process:: 		The user can help the hypervisor decide how to resize
-* Performance Counters::              			StarPU provides information to the Hypervisor through performance counters
-* Defining a new hypervisor policy::      		New Policies can be implemented
-@end menu
-
-@node Managing the hypervisor
-@section Managing the hypervisor
-There is a single hypervisor, which is in charge of resizing contexts; the resizing strategy is chosen when the hypervisor is initialized. Only one resize can be done at a time.
-
-@deftypefun {struct starpu_sched_ctx_performance_counters *} sc_hypervisor_init ({struct sc_hypervisor_policy *} @var{policy})
-Initializes the hypervisor to use the strategy provided as parameter and creates the performance counters (@pxref{Performance Counters}).
-These performance counters are actually callbacks that the contexts use to pass the hypervisor the information it needs.
-@end deftypefun
-
-Note: The Hypervisor is actually a worker that takes this role once certain conditions trigger the resizing process (there is no additional thread assigned to the hypervisor).
-
-@deftypefun void sc_hypervisor_shutdown (void)
-Cleans up the hypervisor and all information concerning it. There is no synchronization between this function and starpu_shutdown. It should be called after starpu_shutdown(),
-because until then the performance counters still need their callback functions to be allocated.
-@end deftypefun
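-
-A minimal sketch of the documented initialization and shutdown ordering
-(@code{my_policy} is a hypothetical policy structure):
-
-@cartouche
-@smallexample
-struct starpu_sched_ctx_performance_counters *perf_counters =
-    sc_hypervisor_init(&my_policy);
-/* ... register contexts, submit tasks, wait for them ... */
-starpu_shutdown();
-sc_hypervisor_shutdown();
-@end smallexample
-@end cartouche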
-
-@node Registering Scheduling Contexts to the hypervisor
-@section Registering Scheduling Contexts to the hypervisor
-Scheduling contexts that have to be resized by the hypervisor must first be registered with the hypervisor. To exclude a context from the resizing process, unregister it from the hypervisor.
-
-@deftypefun void sc_hypervisor_register_ctx (unsigned @var{sched_ctx}, double @var{total_flops})
-Registers the context with the hypervisor, and indicates the number of flops the context will execute (needed for the Gflops rate based strategy, @pxref{Resizing strategies}, or any other custom strategy needing it; for the others, 0.0 can be passed).
-@end deftypefun
-
-@deftypefun void sc_hypervisor_unregister_ctx (unsigned @var{sched_ctx})
-Unregisters the context from the hypervisor.
-@end deftypefun
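-
-For instance (a sketch; @code{sched_ctx} is a previously created
-context, assumed to execute 1e9 flops in total):
-
-@cartouche
-@smallexample
-sc_hypervisor_register_ctx(sched_ctx, 1e9);
-/* ... submit the tasks of the context ... */
-sc_hypervisor_unregister_ctx(sched_ctx);
-@end smallexample
-@end cartouche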
-
-@node The user's input in the resizing process
-@section The user's input in the resizing process
-The user can totally forbid the resizing of a certain context, or change his mind later and allow it again (in that case the resizing is managed by the hypervisor, which can forbid or allow it).
-
-@deftypefun void sc_hypervisor_stop_resize (unsigned @var{sched_ctx})
-Forbid resizing of a context
-@end deftypefun
-
-@deftypefun void sc_hypervisor_start_resize (unsigned @var{sched_ctx})
-Allow resizing of a context
-@end deftypefun
-
-The user can then provide information to the hypervisor concerning the conditions of resizing.
-
-@deftypefun void sc_hypervisor_ioctl (unsigned @var{sched_ctx}, ...)
-Inputs conditions to the context @code{sched_ctx} with the following arguments.  The argument list must be zero-terminated.
-
-@defmac HYPERVISOR_MAX_IDLE
-This macro is used when calling sc_hypervisor_ioctl and must be followed by 3 arguments:
-an array of int containing the ids of the workers the condition applies to, an int indicating the size of the array, and a double value indicating
-the maximum idle time allowed for a worker before the resizing process is triggered.
-@end defmac
-
-@defmac HYPERVISOR_PRIORITY
-This macro is used when calling sc_hypervisor_ioctl and must be followed by 3 arguments:
-an array of int containing the ids of the workers the condition applies to, an int indicating the size of the array, and an int value indicating
-the priority of these workers.
-The workers with the smallest priority are moved first.
-@end defmac
-
-@defmac HYPERVISOR_MIN_WORKERS
-This macro is used when calling sc_hypervisor_ioctl and must be followed by 1 argument (int) indicating
-the minimum number of workers a context should have; below this limit the context cannot execute.
-@end defmac
-
-@defmac HYPERVISOR_MAX_WORKERS
-This macro is used when calling sc_hypervisor_ioctl and must be followed by 1 argument (int) indicating
-the maximum number of workers a context should have; above this limit the context would not be able to scale.
-@end defmac
-
-@defmac HYPERVISOR_GRANULARITY
-This macro is used when calling sc_hypervisor_ioctl and must be followed by 1 argument (int) indicating
-the granularity of the resizing process (the number of workers to be moved from the context when it is resized).
-This parameter is ignored for the Gflops rate based strategy (@pxref{Resizing strategies}), where the number of workers to move is computed by the strategy.
-@end defmac
-
-@defmac HYPERVISOR_FIXED_WORKERS
-This macro is used when calling sc_hypervisor_ioctl and must be followed by 2 arguments:
-an array of int containing the ids of the workers the condition applies to and an int indicating the size of the array.
-These workers are not allowed to be moved from the context.
-@end defmac
-
-@defmac HYPERVISOR_MIN_TASKS
-This macro is used when calling sc_hypervisor_ioctl and must be followed by 1 argument (int)
-indicating the minimum number of tasks that have to be executed before the context can be resized.
-This parameter is ignored for the Application Driven strategy (@pxref{Resizing strategies}), where the user indicates exactly when the resize should be done.
-@end defmac
-
-@defmac HYPERVISOR_NEW_WORKERS_MAX_IDLE
-This macro is used when calling sc_hypervisor_ioctl and must be followed by 1 argument, a double value indicating
-the maximum idle time allowed for workers that have just been moved into the current context from other contexts.
-@end defmac
-
-@defmac HYPERVISOR_TIME_TO_APPLY
-This macro is used when calling sc_hypervisor_ioctl and must be followed by 1 argument (int) indicating the tag
-that an executed task must have for this configuration to be taken into account.
-@end defmac
-@end deftypefun
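-
-For instance, the following sketch (with hypothetical context and
-worker identifiers) keeps the context between 2 and 8 workers, moves
-them 2 at a time, and triggers resizing when workers 0 or 1 stay idle
-longer than the given time:
-
-@cartouche
-@smallexample
-int workers[2] = @{0, 1@};
-sc_hypervisor_ioctl(sched_ctx,
-                    HYPERVISOR_MAX_IDLE, workers, 2, 5000.0,
-                    HYPERVISOR_MIN_WORKERS, 2,
-                    HYPERVISOR_MAX_WORKERS, 8,
-                    HYPERVISOR_GRANULARITY, 2,
-                    NULL);
-@end smallexample
-@end cartouche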
-
-@node Performance Counters
-@section Performance Counters
-
-The Scheduling Context Hypervisor Plugin provides a series of performance counters to StarPU. By incrementing them, StarPU can help the hypervisor in the resizing decision-making process.
-
-@deftp {Data Type} {struct starpu_sched_ctx_performance_counters}
-@anchor{struct starpu_sched_ctx_performance_counters}
-
-@table @asis
-@item @code{void (*notify_idle_cycle)(unsigned sched_ctx_id, int worker, double idle_time)}
-Informs the hypervisor of how long a worker has been idle in the specified context
-@item @code{void (*notify_idle_end)(unsigned sched_ctx_id, int worker)}
-Informs the hypervisor that, after a period of idleness, the worker has just executed a task in the specified context.
-The idle counter is thus reset.
-@item @code{void (*notify_pushed_task)(unsigned sched_ctx_id, int worker)}
-Notifies the hypervisor a task has been scheduled on the queue of the worker corresponding to the specified context
-@item @code{void (*notify_poped_task)(unsigned sched_ctx_id, int worker, double flops)}
-Informs the hypervisor that a task accounting for the specified number of flops has been popped from the worker
-@item @code{void (*notify_post_exec_hook)(unsigned sched_ctx_id, int taskid)}
-Notifies the hypervisor a task has just been executed
-
-@end table
-@end deftp
-
-TODO: maybe they should be hidden from the user
-
-@node Defining a new hypervisor policy
-@section Defining a new hypervisor policy
-
-@menu
-* Hypervisor Policy API:: Hypervisor Policy API
-* Hypervisor example::
-@end menu
-
-@node Hypervisor Policy API
-@subsection Hypervisor Policy API
-
-While the Scheduling Context Hypervisor Plugin comes with a variety of resizing policies (@pxref{Resizing strategies}),
-it may sometimes be desirable to implement custom
-policies to address specific problems.  The API described below allows
-users to write their own resizing policy.
-
-@deftp {Data Type} {struct sc_hypervisor_policy}
-This structure contains all the methods that implement a hypervisor resizing policy.
-
-@table @asis
-@item @code{const char* name}
-Indicates the name of the policy; if the policy is not a custom one, the predefined policy corresponding to this name will be used by the hypervisor
-@item @code{unsigned custom}
-Indicates whether the policy is custom or not
-@item @code{void (*handle_idle_cycle)(unsigned sched_ctx_id, int worker)}
-It is called whenever the indicated worker executes another idle cycle in @code{sched_ctx}
-@item @code{void (*handle_pushed_task)(unsigned sched_ctx_id, int worker)}
-It is called whenever a task is pushed on the worker's queue corresponding to the context @code{sched_ctx}
-@item @code{void (*handle_poped_task)(unsigned sched_ctx_id, int worker)}
-It is called whenever a task is popped from the worker's queue corresponding to the context @code{sched_ctx}
-@item @code{void (*handle_idle_end)(unsigned sched_ctx_id, int worker)}
-It is called whenever a task is executed on the indicated worker and context after a long period of idle time
-@item @code{void (*handle_post_exec_hook)(unsigned sched_ctx_id, struct starpu_htbl32_node* resize_requests, int task_tag)}
-It is called whenever a tag task has just been executed. The table of resize requests is provided as well as the tag
-@end table
-@end deftp
-
-The hypervisor also provides a structure with the configuration information of each context, which can be used to construct new resize strategies.
-
-@deftp {Data Type} {struct sc_hypervisor_policy_config }
-This structure contains all configuration information of a context
-
-@table @asis
-@item @code{int min_nworkers}
-Indicates the minimum number of workers needed by the context
-@item @code{int max_nworkers}
-Indicates the maximum number of workers needed by the context
-@item @code{int granularity}
-Indicates the worker granularity of the context
-@item @code{int priority[STARPU_NMAXWORKERS]}
-Indicates the priority of each worker in the context
-@item @code{double max_idle[STARPU_NMAXWORKERS]}
-Indicates the maximum idle time accepted before a resize is triggered
-@item @code{int fixed_workers[STARPU_NMAXWORKERS]}
-Indicates which workers can be moved and which ones are fixed
-@item @code{double new_workers_max_idle}
-Indicates the maximum idle time accepted before a resize is triggered for the workers that just arrived in the new context
-@end table
-@end deftp
-
-Additionally, the hypervisor provides a structure with information obtained from StarPU by means of the performance counters.
-
-
-@deftp {Data Type} {struct sc_hypervisor_wrapper}
-This structure is a wrapper of the contexts available in StarPU
-and contains all information about a context obtained by incrementing the performance counters
-
-@table @asis
-@item @code{unsigned sched_ctx}
-The context wrapped
-@item @code{struct sc_hypervisor_policy_config *config}
-The corresponding resize configuration
-@item @code{double current_idle_time[STARPU_NMAXWORKERS]}
-The idle time counter of each worker of the context
-@item @code{int pushed_tasks[STARPU_NMAXWORKERS]}
-The number of pushed tasks of each worker of the context
-@item @code{int poped_tasks[STARPU_NMAXWORKERS]}
-The number of poped tasks of each worker of the context
-@item @code{double total_flops}
-The total number of flops to be executed by the context
-@item @code{double total_elapsed_flops[STARPU_NMAXWORKERS]}
-The total number of flops executed by each worker of the context
-@item @code{double elapsed_flops[STARPU_NMAXWORKERS]}
-The number of flops executed by each worker of the context since the last resize
-@item @code{double remaining_flops}
-The number of flops that still have to be executed by the workers in the context
-@item @code{double start_time}
-The time when the context started executing
-@item @code{struct sc_hypervisor_resize_ack resize_ack}
-The structure confirming that the last resize finished and a new one can be done
-@end table
-@end deftp
-
-@deftp {Data Type} {struct sc_hypervisor_resize_ack}
-This structure checks whether the workers moved to another context are actually taken into account in that context
-@table @asis
-@item @code{int receiver_sched_ctx}
-The context receiving the new workers
-@item @code{int *moved_workers}
-The workers moved to the receiver context
-@item @code{int nmoved_workers}
-The number of workers moved
-@item @code{int *acked_workers}
-If the value corresponding to a worker is 1, the worker is taken into account in the new context; if it is 0, not yet
-@end table
-@end deftp
-
-The following functions can be used in the resizing strategies.
-
-@deftypefun void sc_hypervisor_move_workers (unsigned @var{sender_sched_ctx}, unsigned @var{receiver_sched_ctx}, {int *}@var{workers_to_move}, unsigned @var{nworkers_to_move}, unsigned @var{now});
-Moves workers from one context to another
-@end deftypefun
-
-@deftypefun {struct sc_hypervisor_policy_config *} sc_hypervisor_get_config (unsigned @var{sched_ctx});
-Returns the configuration structure of a context
-@end deftypefun
-
-@deftypefun {int *} sc_hypervisor_get_sched_ctxs ();
-Gets the contexts managed by the hypervisor
-@end deftypefun
-
-@deftypefun int sc_hypervisor_get_nsched_ctxs ();
-Gets the number of contexts managed by the hypervisor
-@end deftypefun
-
-@deftypefun {struct sc_hypervisor_wrapper *} sc_hypervisor_get_wrapper (unsigned @var{sched_ctx});
-Returns the wrapper corresponding to the context @code{sched_ctx}
-@end deftypefun
-
-@deftypefun double sc_hypervisor_get_elapsed_flops_per_sched_ctx ({struct sc_hypervisor_wrapper *} @var{sc_w});
-Returns the number of flops executed by a context since the last resize
-@end deftypefun
-
-@deftypefun {char *} sc_hypervisor_get_policy ();
-Returns the name of the resizing policy the hypervisor uses
-@end deftypefun
-
-@node Hypervisor example
-@subsection Hypervisor example
-
-@cartouche
-@smallexample
-
-struct sc_hypervisor_policy dummy_policy =
-@{
-       .handle_poped_task = dummy_handle_poped_task,
-       .handle_pushed_task = dummy_handle_pushed_task,
-       .handle_idle_cycle = dummy_handle_idle_cycle,
-       .handle_idle_end = dummy_handle_idle_end,
-       .handle_post_exec_hook = dummy_handle_post_exec_hook,
-       .custom = 1,
-       .name = "dummy"
-@};
-
-@end smallexample
-@end cartouche
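-
-One of the referenced callbacks could, as a sketch, look as follows
-(@code{receiver_ctx} is a hypothetical second registered context; the
-helper functions used are documented above):
-
-@cartouche
-@smallexample
-static void dummy_handle_idle_cycle(unsigned sched_ctx_id, int worker)
-@{
-    struct sc_hypervisor_wrapper *sc_w = sc_hypervisor_get_wrapper(sched_ctx_id);
-    struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(sched_ctx_id);
-
-    /* Move the worker away once it exceeds its allowed idle time. */
-    if (sc_w->current_idle_time[worker] > config->max_idle[worker])
-    @{
-        int workers_to_move[] = @{worker@};
-        sc_hypervisor_move_workers(sched_ctx_id, receiver_ctx,
-                                   workers_to_move, 1, 0);
-    @}
-@}
-@end smallexample
-@end cartouche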
-
-@c Local Variables:
-@c TeX-master: "../starpu.texi"
-@c ispell-local-dictionary: "american"
-@c End:

+ 0 - 338
doc/texinfo/chapters/installing.texi

@@ -1,338 +0,0 @@
-@c -*-texinfo-*-
-
-@c This file is part of the StarPU Handbook.
-@c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
-@c Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
-@c Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
-@c See the file starpu.texi for copying conditions.
-
-@menu
-* Installing a Binary Package::
-* Installing from Source::
-* Setting up Your Own Code::
-* Benchmarking StarPU::
-@end menu
-
-@node Installing a Binary Package
-@section Installing a Binary Package
-
-As one of the StarPU developers is a Debian Developer, the packages
-are well integrated and kept up to date. To see which packages are
-available, simply type:
-
-@example
-$ apt-cache search starpu
-@end example
-
-To install what you need, type:
-
-@example
-$ sudo apt-get install libstarpu-1.0 libstarpu-dev
-@end example
-
-@node Installing from Source
-@section Installing from Source
-
-StarPU can be built and installed by the standard means of the GNU
-autotools. The following chapter briefly recalls how these tools
-can be used to install StarPU.
-
-@menu
-* Optional Dependencies::
-* Getting Sources::
-* Configuring StarPU::
-* Building StarPU::
-* Installing StarPU::
-@end menu
-
-@node Optional Dependencies
-@subsection Optional Dependencies
-
-The @url{http://www.open-mpi.org/software/hwloc, @code{hwloc} topology
-discovery library} is not mandatory to use StarPU but strongly
-recommended.  It allows for topology aware scheduling, which improves
-performance.  @code{hwloc} is available in major free operating system
-distributions, and for most operating systems.
-
-If @code{hwloc} is not available on your system, the option
-@code{--without-hwloc} should be explicitly given when calling the
-@code{configure} script. If @code{hwloc} is installed with a @code{pkg-config} file,
-no option is required as it will be detected automatically; otherwise
-@code{--with-hwloc=prefix} should be used to specify the location
-of @code{hwloc}.
-
-@node Getting Sources
-@subsection Getting Sources
-
-StarPU's sources can be obtained from the
-@url{http://runtime.bordeaux.inria.fr/StarPU/files/,download page} of
-the StarPU website.
-
-All releases and the development tree of StarPU are freely available
-on INRIA's gforge under the LGPL license. Some releases are available
-under the BSD license.
-
-The latest release can be downloaded from the
-@url{http://gforge.inria.fr/frs/?group_id=1570,INRIA's gforge} or
-directly from the @url{http://runtime.bordeaux.inria.fr/StarPU/files/,StarPU download page}.
-
-The latest nightly snapshot can be downloaded from the @url{http://starpu.gforge.inria.fr/testing/,StarPU gforge website}.
-
-@example
-$ wget http://starpu.gforge.inria.fr/testing/starpu-nightly-latest.tar.gz
-@end example
-
-And finally, the current development version is also accessible via svn.
-It should be used only if you need the very latest changes (i.e. less
-than a day old!)@footnote{The client side of the software Subversion can
-be obtained from @url{http://subversion.tigris.org}. If you
-are running on Windows, you will probably prefer to use
-@url{http://tortoisesvn.tigris.org/, TortoiseSVN}.}.
-
-@example
-svn checkout svn://scm.gforge.inria.fr/svn/starpu/trunk StarPU
-@end example
-
-@node Configuring StarPU
-@subsection Configuring StarPU
-
-Running @code{autogen.sh} is not necessary when using the tarball
-releases of StarPU.  If you are using the source code from the svn
-repository, you first need to generate the configure scripts and the
-Makefiles. This requires the availability of @code{autoconf},
-@code{automake} >= 2.60, and @code{makeinfo}.
-
-@example
-$ ./autogen.sh
-@end example
-
-You then need to configure StarPU. Details about options that are
-useful to give to @code{./configure} are given in @ref{Compilation
-configuration}.
-
-@example
-$ ./configure
-@end example
-
-If @code{configure} does not detect some software or produces errors, please
-make sure to post the content of @code{config.log} when reporting the issue.
-
-By default, the files produced during the compilation are placed in
-the source directory. As the compilation generates a lot of files, it
-is advised to put them all in a separate directory. It is then
-easier to clean up, and this makes it possible to compile several configurations
-out of the same source tree. For that, simply enter the directory
-where you want the compilation to produce its files, and invoke the
-@code{configure} script located in the StarPU source directory.
-
-@example
-$ mkdir build
-$ cd build
-$ ../configure
-@end example
-
-@node Building StarPU
-@subsection Building StarPU
-
-@example
-$ make
-@end example
-
-Once everything is built, you may want to test the result. An
-extensive set of regression tests is provided with StarPU. Running the
-tests is done by calling @code{make check}. These tests are run every night
-and the result from the main profile is publicly
-@url{http://starpu.gforge.inria.fr/testing/,available}.
-
-@example
-$ make check
-@end example
-
-@node Installing StarPU
-@subsection Installing StarPU
-
-In order to install StarPU at the location that was specified during
-configuration:
-
-@example
-$ make install
-@end example
-
-Libtool interface versioning information is included in the
-library names (libstarpu-1.0.so, libstarpumpi-1.0.so and
-libstarpufft-1.0.so).
-
-@node Setting up Your Own Code
-@section Setting up Your Own Code
-
-@menu
-* Setting Flags for Compiling::
-* Running a Basic StarPU Application::
-* Kernel Threads Started by StarPU::
-* Enabling OpenCL::
-@end menu
-
-@node Setting Flags for Compiling
-@subsection Setting Flags for Compiling, Linking and Running Applications
-
-Compiling and linking an application against StarPU may require
-specific flags or libraries (for instance @code{CUDA} or @code{libspe2}).
-To this end, StarPU provides @code{pkg-config} files from which the
-relevant compiler and linker flags can be obtained with the
-@code{pkg-config} tool.
-
-If StarPU was not installed at some standard location, the path of StarPU's
-library must be specified in the @code{PKG_CONFIG_PATH} environment variable so
-that @code{pkg-config} can find it. For example if StarPU was installed in
-@code{$prefix_dir}:
-
-@example
-$ PKG_CONFIG_PATH=$PKG_CONFIG_PATH:$prefix_dir/lib/pkgconfig
-@end example
-
-The flags required to compile or link against StarPU are then
-accessible with the following commands@footnote{It is still possible to use the API
-provided in version 0.9 of StarPU by calling @code{pkg-config}
-with the @code{libstarpu} package. Similar packages are provided for
-@code{libstarpumpi} and @code{libstarpufft}.}:
-
-@example
-$ pkg-config --cflags starpu-1.1  # options for the compiler
-$ pkg-config --libs starpu-1.1    # options for the linker
-@end example
-
-Make sure that @code{pkg-config --libs starpu-1.1} actually produces some output
-before going further: @code{PKG_CONFIG_PATH} has to point to the place where
-@code{starpu-1.1.pc} was installed during @code{make install}.
-
-Also pass the @code{--static} option if the application is to be
-linked statically.
-
-It is also necessary to set the variable @code{LD_LIBRARY_PATH} to
-locate dynamic libraries at runtime.
-
-@example
-$ LD_LIBRARY_PATH=$prefix_dir/lib:$LD_LIBRARY_PATH
-@end example
-
-When using a Makefile, the following lines can be added to set the
-options for the compiler and the linker:
-
-@cartouche
-@example
-CFLAGS          +=      $$(pkg-config --cflags starpu-1.1)
-LDFLAGS         +=      $$(pkg-config --libs starpu-1.1)
-@end example
-@end cartouche
-
-@node Running a Basic StarPU Application
-@subsection Running a Basic StarPU Application
-
-Basic examples using StarPU are built in the directory
-@code{examples/basic_examples/} (and installed in
-@code{$prefix_dir/lib/starpu/examples/}). You can for example run the example
-@code{vector_scal}.
-
-@example
-$ ./examples/basic_examples/vector_scal
-BEFORE: First element was 1.000000
-AFTER: First element is 3.140000
-@end example
-
-When StarPU is used for the first time, the directory
-@code{$STARPU_HOME/.starpu/} is created; performance models will be stored in
-that directory (@pxref{STARPU_HOME}).
-
-Please note that buses are benchmarked when StarPU is launched for the
-first time. This may take a few minutes, or less if @code{hwloc} is
-installed. This step is done only once per user and per machine.
-
-@node Kernel Threads Started by StarPU
-@subsection Kernel Threads Started by StarPU
-
-StarPU automatically binds one thread per CPU core. It does not use
-SMT/hyperthreading because kernels are usually already optimized for using a
-full core, and using hyperthreading would make kernel calibration rather random.
-
-Since driving GPUs is a CPU-consuming task, StarPU dedicates one core per GPU.
-
-While StarPU tasks are executing, the application is not supposed to do
-computations in the threads it starts itself; tasks should be used instead.
-
-TODO: add a StarPU function to bind an application thread (e.g. the main thread)
-to a dedicated core (and thus disable the corresponding StarPU CPU worker).
-
-@node Enabling OpenCL
-@subsection Enabling OpenCL
-
-When both CUDA and OpenCL drivers are enabled, StarPU will launch an
-OpenCL worker for NVIDIA GPUs only if CUDA is not already running on them.
-This design choice was necessary as OpenCL and CUDA cannot run at the
-same time on the same NVIDIA GPU, as there is currently no interoperability
-between them.
-
-To enable OpenCL, you need either to disable CUDA when configuring StarPU:
-
-@example
-$ ./configure --disable-cuda
-@end example
-
-or when running applications:
-
-@example
-$ STARPU_NCUDA=0 ./application
-@end example
-
-OpenCL will automatically be started on any device not yet used by
-CUDA. So, on a machine with 4 GPUs, it is possible to
-enable CUDA on 2 devices, and OpenCL on the 2 other devices, by doing
-so:
-
-@example
-$ STARPU_NCUDA=2 ./application
-@end example
-
-@node Benchmarking StarPU
-@section Benchmarking StarPU
-
-Some interesting benchmarks are installed among the examples in
-@code{$prefix_dir/lib/starpu/examples/}. Make sure to try various
-schedulers, for instance @code{STARPU_SCHED=dmda}.
-
-@menu
-* Task size overhead::
-* Data transfer latency::
-* Gemm::
-* Cholesky::
-* LU::
-@end menu
-
-@node Task size overhead
-@subsection Task size overhead
-
-This benchmark gives a glimpse into how big a task size should be for StarPU overhead
-to be low enough.  Run @code{tasks_size_overhead.sh}; it will generate a plot
-of the speedup of tasks of various sizes, depending on the number of CPUs being
-used.
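-
-For instance (assuming the script is run from the installed examples
-directory, and picking a scheduler as suggested above):
-
-@example
-$ STARPU_SCHED=dmda ./tasks_size_overhead.sh
-@end example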
-
-@node Data transfer latency
-@subsection Data transfer latency
-
-@code{local_pingpong} performs a ping-pong between the first two CUDA nodes, and
-prints the measured latency.
-
-@node Gemm
-@subsection Matrix-matrix multiplication
-
-@code{sgemm} and @code{dgemm} perform a blocked matrix-matrix
-multiplication using BLAS and cuBLAS. They output the obtained GFlops.
-
-@node Cholesky
-@subsection Cholesky factorization
-
-@code{cholesky*} perform a Cholesky factorization (single precision). They use different dependency primitives.
-
-@node LU
-@subsection LU factorization
-
-@code{lu*} perform an LU factorization. They use different dependency primitives.

+ 0 - 201
doc/texinfo/chapters/introduction.texi

@@ -1,201 +0,0 @@
-@c -*-texinfo-*-
-
-@c This file is part of the StarPU Handbook.
-@c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
-@c Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
-@c Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
-@c See the file starpu.texi for copying conditions.
-
-@menu
-* Motivation::                  Why StarPU ?
-* StarPU in a Nutshell::        The Fundamentals of StarPU
-* Application taskification::   How to taskify an application
-* Glossary::
-* Research Papers::
-@end menu
-
-@node Motivation
-@section Motivation
-
-@c complex machines with heterogeneous cores/devices
-The use of specialized hardware such as accelerators or coprocessors offers an
-interesting approach to overcome the physical limits encountered by processor
-architects. As a result, many machines are now equipped with one or several
-accelerators (e.g. a GPU), in addition to the usual processor(s). While a lot of
-effort has been devoted to offloading computation onto such accelerators, very
-little attention has been paid to portability concerns on the one hand, and to the
-possibility of having heterogeneous accelerators and processors interact on the other hand.
-
-StarPU is a runtime system that offers support for heterogeneous multicore
-architectures. It not only offers a unified view of the computational resources
-(i.e. CPUs and accelerators at the same time), but it also takes care of
-efficiently mapping and executing tasks onto a heterogeneous machine while
-transparently handling low-level issues such as data transfers in a portable
-fashion.
-
-@c this leads to a complicated distributed memory design
-@c which is not (easily) manageable by hand
-
-@c added value/benefits of StarPU
-@c   - portability
-@c   - scheduling, perf. portability
-
-@node StarPU in a Nutshell
-@section StarPU in a Nutshell
-
-StarPU is a software tool aiming to allow programmers to exploit the
-computing power of the available CPUs and GPUs, while relieving them
-from the need to specially adapt their programs to the target machine
-and processing units.
-
-At the core of StarPU is its run-time support library, which is
-responsible for scheduling application-provided tasks on heterogeneous
-CPU/GPU machines.  In addition, StarPU comes with programming language
-support, in the form of extensions to languages of the C family
-(@pxref{C Extensions}), as well as an OpenCL front-end (@pxref{SOCL
-OpenCL Extensions}).
-
-@cindex task-based programming model
-StarPU's run-time and programming language extensions support a
-@dfn{task-based programming model}.  Applications submit computational
-tasks, with CPU and/or GPU implementations, and StarPU schedules these
-tasks and associated data transfers on available CPUs and GPUs.  The
-data that a task manipulates are automatically transferred among
-accelerators and the main memory, so that programmers are freed from the
-scheduling issues and technical details associated with these transfers.
-
-StarPU takes particular care of scheduling tasks efficiently, using
-well-known algorithms from the literature (@pxref{Task scheduling
-policy}).  In addition, it allows scheduling experts, such as compiler
-or computational library developers, to implement custom scheduling
-policies in a portable fashion (@pxref{Defining a New Scheduling Policy}).
-
-The remainder of this section describes the main concepts used in StarPU.
-
-@menu
-* Codelet and Tasks::
-* StarPU Data Management Library::
-@end menu
-
-@c explain the notion of codelet and task (i.e. g(A, B)
-@node Codelet and Tasks
-@subsection Codelet and Tasks
-
-@cindex codelet
-One of StarPU's primary data structures is the @b{codelet}. A codelet describes a
-computational kernel that can possibly be implemented on multiple architectures
-such as a CPU, a CUDA device or an OpenCL device.
-
-@c TODO insert illustration f: f_spu, f_cpu, ...
-
-@cindex task
-Another important data structure is the @b{task}. Executing a StarPU task
-consists in applying a codelet on a data set, on one of the architectures on
-which the codelet is implemented. A task thus describes the codelet that it
-uses, but also which data are accessed, and how they are
-accessed during the computation (read and/or write).
-StarPU tasks are asynchronous: submitting a task to StarPU is a non-blocking
-operation. The task structure can also specify a @b{callback} function that is
-called once StarPU has properly executed the task. It also contains optional
-fields that the application may use to give hints to the scheduler (such as
-priority levels).
-
-@cindex tag
-By default, task dependencies are inferred from data dependencies (sequential
-coherency) by StarPU. The application can however disable sequential coherency
-for some data, and dependencies can then be expressed by hand.
-A task may be identified by a unique 64-bit number chosen by the application,
-which we refer to as a @b{tag}.
-Task dependencies can be enforced by hand either by the means of callback functions, by
-submitting other tasks, or by expressing dependencies
-between tags (which can thus correspond to tasks that have not been submitted
-yet).
-
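-As a minimal sketch (the kernel function @code{scal_cpu_func} is
-hypothetical), a codelet with a single CPU implementation accessing one
-data buffer in read-write mode could look like:
-
-@cartouche
-@smallexample
-void scal_cpu_func(void *buffers[], void *cl_arg);
-
-struct starpu_codelet cl =
-@{
-    .cpu_funcs = @{scal_cpu_func, NULL@},
-    .nbuffers = 1,
-    .modes = @{STARPU_RW@}
-@};
-@end smallexample
-@end cartouche
-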
-@c TODO insert illustration f(Ar, Brw, Cr) + ..
-
-@c DSM
-@node StarPU Data Management Library
-@subsection StarPU Data Management Library
-
-Because StarPU schedules tasks at runtime, data transfers have to be
-done automatically and ``just-in-time'' between processing units,
-relieving the application programmer from explicit data transfers.
-Moreover, to avoid unnecessary transfers, StarPU keeps data
-where it was last needed, even if it was modified there, and it
-allows multiple copies of the same data to reside at the same time on
-several processing units as long as it is not modified.
-
-@node Application taskification
-@section Application taskification
-
-TODO
-
-@c TODO: section describing what taskifying an application means: before
-@c porting to StarPU, turn the program into:
-@c "pure" functions, which only access data from their passed parameters
-@c a main function which just calls these pure functions
-@c
-@c and then it's trivial to use StarPU or any other kind of task-based library:
-@c simply replace calling the function with submitting a task.
-
-@node Glossary
-@section Glossary
-
-A @b{codelet} records pointers to various implementations of the same
-theoretical function.
-
-A @b{memory node} can be either the main RAM or GPU-embedded memory.
-
-A @b{bus} is a link between memory nodes.
-
-A @b{data handle} keeps track of replicates of the same data (@b{registered} by the
-application) over various memory nodes. The data management library keeps
-them coherent.
-
-The @b{home} memory node of a data handle is the memory node from which the data
-was registered (usually the main memory node).
-
-A @b{task} represents a scheduled execution of a codelet on some data handles.
-
-A @b{tag} is a rendez-vous point. Tasks typically have their own tag, and can
-depend on other tags. The value is chosen by the application.
-
-A @b{worker} executes tasks. There is typically one per CPU computation core and
-one per accelerator (for which a whole CPU core is dedicated).
-
-A @b{driver} drives a given kind of workers. There are currently CPU, CUDA,
-and OpenCL drivers. They usually start several workers to actually drive
-them.
-
-A @b{performance model} is a (dynamic or static) model of the performance of a
-given codelet. Codelets can have execution time performance model as well as
-power consumption performance models.
-
-A data @b{interface} describes the layout of the data: for a vector, a pointer
-for the start, the number of elements and the size of elements; for a matrix, a
-pointer for the start, the number of elements per row, the offset between rows,
-and the size of each element; etc. To access their data, codelet functions are
-given interfaces for the local memory node replicates of the data handles of the
-scheduled task.
-
-@b{Partitioning} data means dividing the data of a given data handle (called
-@b{father}) into a series of @b{children} data handles which designate various
-portions of the former.
-
-A @b{filter} is the function which computes children data handles from a father
-data handle, and thus describes how the partitioning should be done (horizontal,
-vertical, etc.)
-
-@b{Acquiring} a data handle can be done from the main application, to safely
-access the data of a data handle from its home node, without having to
-unregister it.
-
-
-@node Research Papers
-@section Research Papers
-
-Research papers about StarPU can be found at
-@url{http://runtime.bordeaux.inria.fr/Publis/Keyword/STARPU.html}.
-
-A good overview is available in the research report at
-@url{http://hal.archives-ouvertes.fr/inria-00467677}.

+ 0 - 55
doc/texinfo/chapters/mic-scc-support.texi

@@ -1,55 +0,0 @@
-@c -*-texinfo-*-
-
-@c This file is part of the StarPU Handbook.
-@c Copyright (C) 2013  Universit@'e de Bordeaux 1
-@c See the file starpu.texi for copying conditions.
-
-@section Compilation
-
-SCC support just needs the presence of the RCCE library.
-
-MIC support actually needs two compilations of StarPU, one for the host and one for
-the device. The @code{mic-configure} script can be used to achieve this: it basically
-calls @code{configure} as appropriate from two new directories: @code{build_mic} and
-@code{build_host}. @code{make} and @code{make install} can then be used as usual and will
-recurse into both directories.
-
-@c TODO: move to configuration section ?
-
-It can be parameterized with the following environment variables:
-
-@table @asis
-@item @code{STARPU_MIC_HOST}
-Defines the value of the @code{--host} parameter passed to @code{configure} for the
-cross-compilation. The current default is @code{x86_64-k1om-linux}.
-
-@item @code{STARPU_MIC_CC_PATH}
-Defines the path to the MIC cross-compiler. The current default is @code{/usr/linux-k1om-4.7/bin/}.
-
-@item @code{STARPU_COI_DIR}
-Defines the path to the COI library. The current default is @code{/opt/intel/mic/coi}
-@end table
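-
-For instance, a cross-build could look as follows (the paths shown are
-the defaults listed above):
-
-@example
-$ STARPU_MIC_CC_PATH=/usr/linux-k1om-4.7/bin/ \
-  STARPU_COI_DIR=/opt/intel/mic/coi ./mic-configure
-$ make
-$ make install
-@end example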
-
-@section Porting applications to MIC/SCC
-
-The simplest way to port an application to MIC/SCC is to add the
-@code{cpu_funcs_name} field in the codelet, to provide StarPU with the function
-name of the CPU implementation. StarPU will thus simply use the existing CPU
-implementation (cross-rebuilt in the MIC case). The functions have to be
-globally-visible (i.e. not @code{static}) for StarPU to be able to look them up.
-
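-A minimal sketch (the kernel name @code{scal_cpu_func} is hypothetical):
-
-@cartouche
-@smallexample
-void scal_cpu_func(void *buffers[], void *cl_arg);
-
-struct starpu_codelet cl =
-@{
-    .cpu_funcs = @{scal_cpu_func, NULL@},
-    /* Name used to look the implementation up on the MIC side. */
-    .cpu_funcs_name = @{"scal_cpu_func", NULL@},
-    .nbuffers = 1
-@};
-@end smallexample
-@end cartouche
-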
-For SCC execution, @code{starpu_initialize} also has to be used instead of @code{starpu_init}, so
-as to pass @code{argc} and @code{argv}.
-
-@section Launching programs
-
-SCC programs are started through RCCE.
-
-MIC programs are started from the host. StarPU automatically
-starts the same program on MIC devices. It however needs to get
-the MIC-cross-built binary. It will look for the file given by the
-@code{STARPU_MIC_SINK_PROGRAM_NAME} environment variable or in the directory
-given by the @code{STARPU_MIC_SINK_PROGRAM_PATH} environment variable, or in
-the @code{mic_sink_program_path} field of the @code{starpu_config} structure.
-It will also look in the current directory for the same binary name plus a
-@code{-mic} or @code{_mic} suffix.

+ 0 - 418
doc/texinfo/chapters/mpi-support.texi

@@ -1,418 +0,0 @@
-@c -*-texinfo-*-
-
-@c This file is part of the StarPU Handbook.
-@c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
-@c Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
-@c Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
-@c See the file starpu.texi for copying conditions.
-
-The integration of MPI transfers within task parallelism is done in a
-very natural way by the means of asynchronous interactions between the
-application and StarPU.  This is implemented in a separate libstarpumpi library
-which basically provides "StarPU" equivalents of @code{MPI_*} functions, where
-@code{void *} buffers are replaced with @code{starpu_data_handle_t}s, and all
-GPU-RAM-NIC transfers are handled efficiently by StarPU-MPI.  The user has to
-use the usual @code{mpirun} command of the MPI implementation to start StarPU on
-the different MPI nodes.
-
-An MPI Insert Task function provides an even more seamless transition to a
-distributed application, by automatically issuing all required data transfers
-according to the task graph and an application-provided distribution.
-
-@menu
-* Simple Example::
-* Point to point communication::
-* Exchanging User Defined Data Interface::
-* MPI Insert Task Utility::
-* MPI Collective Operations::
-@end menu
-
-@node Simple Example
-@section Simple Example
-
-The flags required to compile or link against the MPI layer are
-accessible with the following commands:
-
-@example
-$ pkg-config --cflags starpumpi-1.0  # options for the compiler
-$ pkg-config --libs starpumpi-1.0    # options for the linker
-@end example
-
-You also need to pass the @code{--static} option if the application is to
-be linked statically.
-
-@cartouche
-@smallexample
-void increment_token(void)
-@{
-    struct starpu_task *task = starpu_task_create();
-
-    task->cl = &increment_cl;
-    task->handles[0] = token_handle;
-
-    starpu_task_submit(task);
-@}
-@end smallexample
-@end cartouche
-
-@cartouche
-@smallexample
-int main(int argc, char **argv)
-@{
-    int rank, size;
-
-    starpu_init(NULL);
-    starpu_mpi_initialize_extended(&rank, &size);
-
-    starpu_vector_data_register(&token_handle, STARPU_MAIN_RAM, (uintptr_t)&token, 1, sizeof(unsigned));
-
-    unsigned nloops = NITER;
-    unsigned loop;
-
-    unsigned last_loop = nloops - 1;
-    unsigned last_rank = size - 1;
-@end smallexample
-@end cartouche
-
-@cartouche
-@smallexample
-    for (loop = 0; loop < nloops; loop++) @{
-        int tag = loop*size + rank;
-
-        if (loop == 0 && rank == 0)
-        @{
-            token = 0;
-            fprintf(stdout, "Start with token value %d\n", token);
-        @}
-        else
-        @{
-            starpu_mpi_irecv_detached(token_handle, (rank+size-1)%size, tag,
-                    MPI_COMM_WORLD, NULL, NULL);
-        @}
-
-        increment_token();
-
-        if (loop == last_loop && rank == last_rank)
-        @{
-            starpu_data_acquire(token_handle, STARPU_R);
-            fprintf(stdout, "Finished: token value %d\n", token);
-            starpu_data_release(token_handle);
-        @}
-        else
-        @{
-            starpu_mpi_isend_detached(token_handle, (rank+1)%size, tag+1,
-                    MPI_COMM_WORLD, NULL, NULL);
-        @}
-    @}
-
-    starpu_task_wait_for_all();
-@end smallexample
-@end cartouche
-
-@cartouche
-@smallexample
-    starpu_mpi_shutdown();
-    starpu_shutdown();
-
-    if (rank == last_rank)
-    @{
-        fprintf(stderr, "[%d] token = %d == %d * %d ?\n", rank, token, nloops, size);
-        STARPU_ASSERT(token == nloops*size);
-    @}
-@end smallexample
-@end cartouche
-
-@node Point to point communication
-@section Point to point communication
-
-The standard point to point communications of MPI have been
-implemented. The semantics are similar to those of MPI, but adapted to
-the DSM provided by StarPU. An MPI request will only be submitted when
-the data is available in the main memory of the node submitting the
-request.
-
-There are two types of asynchronous communications: the classic
-asynchronous communications and the detached communications. The
-classic asynchronous communications (@code{starpu_mpi_isend} and
-@code{starpu_mpi_irecv}) need to be followed by a call to
-@code{starpu_mpi_wait} or to @code{starpu_mpi_test} to wait for or to
-test the completion of the communication. Waiting for or testing the
-completion of detached communications is not possible: this is done
-internally by StarPU-MPI; on completion, the resources are
-automatically released. This mechanism is similar to the pthread
-detach state attribute, which determines whether a thread will be
-created in a joinable or a detached state.
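-
-For instance, a classic asynchronous send could look as follows (a
-sketch; @code{data_handle}, @code{dest_rank} and @code{tag} are
-hypothetical):
-
-@cartouche
-@smallexample
-starpu_mpi_req req;
-
-starpu_mpi_isend(data_handle, &req, dest_rank, tag, MPI_COMM_WORLD);
-/* ... do other work while the communication progresses ... */
-starpu_mpi_wait(&req, MPI_STATUS_IGNORE);
-@end smallexample
-@end cartouche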
-
-For any communication, the call of the function will result in the
-creation of a StarPU-MPI request; the function
-@code{starpu_data_acquire_cb} is then called to asynchronously request
-StarPU to fetch the data into main memory; when the data is available in
-main memory, a StarPU-MPI function is called to put the new request in
-the list of the ready requests if it is a send request, or in a
-hashmap if it is a receive request.
-
-Internally, all MPI communications submitted by StarPU use a unique
-tag which has a default value, and which can be accessed with the functions
-@ref{starpu_mpi_get_communication_tag} and
-@ref{starpu_mpi_set_communication_tag}.
-
-The matching of tags with corresponding requests is done inside StarPU-MPI.
-To handle this, any communication is a double communication based on an
-envelope + data system. Every piece of data to be sent is preceded by an
-envelope which describes the data (particularly its tag), so
-the receiver can get the matching pending receive request
-from the hashmap, and submit it to receive the data correctly.
-
-To this aim, the StarPU-MPI progression thread has a permanently-submitted
-request destined to receive incoming envelopes from all sources.
-
-The StarPU-MPI progression thread regularly polls this list of ready
-requests. For each new ready request, the appropriate function is
-called to post the corresponding MPI call. For example, calling
-@code{starpu_mpi_isend} will result in posting @code{MPI_Isend}. If
-the request is marked as detached, the request will be put in the list
-of detached requests.
-
-The StarPU-MPI progression thread also polls the list of detached
-requests. For each detached request, it regularly tests the completion
-of the MPI request by calling @code{MPI_Test}. On completion, the data
-handle is released, and if a callback was defined, it is called.
-
-Finally, the StarPU-MPI progression thread checks whether an envelope has
-arrived. If so, it checks whether the corresponding receive has already
-been submitted by the application. If it has, it submits the request
-just as it does for those on the list of ready requests.
-If it has not, it allocates a temporary handle to store the data that
-will arrive just after, so that when the corresponding receive request
-is eventually submitted by the application, the data is copied from this
-temporary handle instead of a new StarPU-MPI request being submitted.
-
-@ref{Communication} gives the list of all the point to point
-communications defined in StarPU-MPI.
-
-@node Exchanging User Defined Data Interface
-@section Exchanging User Defined Data Interface
-
-New data interfaces defined as explained in @ref{Defining a New Data
-Interface} can also be used within StarPU-MPI and exchanged between
-nodes. Two functions needs to be defined through
-the type @code{struct starpu_data_interface_ops} (@pxref{Defining
-Interface}). The pack function takes a handle and returns a
-contiguous memory buffer along with its size where data to be conveyed to another node
-should be copied. The reversed operation is implemented in the unpack
-function which takes a contiguous memory buffer and recreates the data
-handle.
-
-@cartouche
-@smallexample
-static int complex_pack_data(starpu_data_handle_t handle, unsigned node, void **ptr, ssize_t *count)
-@{
-  STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
-
-  struct starpu_complex_interface *complex_interface =
-    (struct starpu_complex_interface *) starpu_data_get_interface_on_node(handle, node);
-
-  *count = complex_get_size(handle);
-  *ptr = malloc(*count);
-  memcpy(*ptr, complex_interface->real, complex_interface->nx*sizeof(double));
-  memcpy(*ptr+complex_interface->nx*sizeof(double), complex_interface->imaginary,
-         complex_interface->nx*sizeof(double));
-
-  return 0;
-@}
-@end smallexample
-@end cartouche
-
-@cartouche
-@smallexample
-static int complex_unpack_data(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count)
-@{
-  STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
-
-  struct starpu_complex_interface *complex_interface =
-    (struct starpu_complex_interface *)	starpu_data_get_interface_on_node(handle, node);
-
-  memcpy(complex_interface->real, ptr, complex_interface->nx*sizeof(double));
-  memcpy(complex_interface->imaginary, ptr+complex_interface->nx*sizeof(double),
-         complex_interface->nx*sizeof(double));
-
-  return 0;
-@}
-@end smallexample
-@end cartouche
-
-@cartouche
-@smallexample
-static struct starpu_data_interface_ops interface_complex_ops =
-@{
-  ...
-  .pack_data = complex_pack_data,
-  .unpack_data = complex_unpack_data
-@};
-@end smallexample
-@end cartouche
-
-@node MPI Insert Task Utility
-@section MPI Insert Task Utility
-
-To save the programmer from having to make all communications explicit, StarPU
-provides an "MPI Insert Task Utility". The principle is that the application
-decides a distribution of the data over the MPI nodes by allocating it and
-notifying StarPU of that decision, i.e. telling StarPU which MPI node "owns"
-which data. It also decides, for each handle, an MPI tag which will be used to
-exchange the content of the handle. All MPI nodes then process the whole task
-graph, and StarPU automatically determines which node actually executes which
-task, and triggers the required MPI transfers.
-
-The list of functions are described in @ref{MPI Insert Task}.
-
-Here is a stencil example showing how to use @code{starpu_mpi_insert_task}. One
-first needs to define a distribution function which specifies the
-locality of the data. Note that the distribution information needs to
-be given to StarPU by calling @code{starpu_data_set_rank}. An MPI tag
-should also be defined for each data handle by calling
-@code{starpu_data_set_tag}.
-
-@cartouche
-@smallexample
-/* Returns the MPI node number where data is */
-int my_distrib(int x, int y, int nb_nodes) @{
-  /* Block distrib */
-  return ((int)(x / sqrt(nb_nodes) + (y / sqrt(nb_nodes)) * sqrt(nb_nodes))) % nb_nodes;
-
-  // /* Other examples useful for other kinds of computations */
-  // /* / distrib */
-  // return (x+y) % nb_nodes;
-
-  // /* Block cyclic distrib */
-  // unsigned side = sqrt(nb_nodes);
-  // return x % side + (y % side) * size;
-@}
-@end smallexample
-@end cartouche
-
-Now the data can be registered within StarPU. Data which are not
-owned but will be needed for computations can be registered through
-the lazy allocation mechanism, i.e. with a @code{home_node} set to -1.
-StarPU will automatically allocate the memory when it is used for the
-first time.
-
-One can note an optimization here (the @code{else if} test): we only register
-data which will be needed by the tasks that we will execute.
-
-@cartouche
-@smallexample
-    unsigned matrix[X][Y];
-    starpu_data_handle_t data_handles[X][Y];
-
-    for(x = 0; x < X; x++) @{
-        for (y = 0; y < Y; y++) @{
-            int mpi_rank = my_distrib(x, y, size);
-             if (mpi_rank == my_rank)
-                /* Owning data */
-                starpu_variable_data_register(&data_handles[x][y], STARPU_MAIN_RAM,
-                                              (uintptr_t)&(matrix[x][y]), sizeof(unsigned));
-            else if (my_rank == my_distrib(x+1, y, size) || my_rank == my_distrib(x-1, y, size)
-                  || my_rank == my_distrib(x, y+1, size) || my_rank == my_distrib(x, y-1, size))
-                /* I don't own that index, but will need it for my computations */
-                starpu_variable_data_register(&data_handles[x][y], -1,
-                                              (uintptr_t)NULL, sizeof(unsigned));
-            else
-                /* I know it's useless to allocate anything for this */
-                data_handles[x][y] = NULL;
-            if (data_handles[x][y]) @{
-                starpu_data_set_rank(data_handles[x][y], mpi_rank);
-                starpu_data_set_tag(data_handles[x][y], x*X+y);
-            @}
-        @}
-    @}
-@end smallexample
-@end cartouche
-
-Now @code{starpu_mpi_insert_task()} can be called for the different
-steps of the application.
-
-@cartouche
-@smallexample
-    for(loop=0 ; loop<niter; loop++)
-        for (x = 1; x < X-1; x++)
-            for (y = 1; y < Y-1; y++)
-                starpu_mpi_insert_task(MPI_COMM_WORLD, &stencil5_cl,
-                                       STARPU_RW, data_handles[x][y],
-                                       STARPU_R, data_handles[x-1][y],
-                                       STARPU_R, data_handles[x+1][y],
-                                       STARPU_R, data_handles[x][y-1],
-                                       STARPU_R, data_handles[x][y+1],
-                                       0);
-    starpu_task_wait_for_all();
-@end smallexample
-@end cartouche
-
-I.e. all MPI nodes process the whole task graph, but as mentioned above, for
-each task, only the MPI node which owns the data being written to (here,
-@code{data_handles[x][y]}) will actually run the task. The other MPI nodes will
-automatically send the required data.
-
-This can become a concern when the number of nodes grows. To avoid it, the
-application can prune the task submission loops according to the data
-distribution, so as to only submit tasks on nodes which have to care about
-them (either to execute them, or to send the required data), as sketched
-below.
-
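-For instance, in the stencil case above, each node could skip the tasks for
-which it owns neither the written handle nor one of the read neighbours. This
-is a sketch reusing the @code{my_distrib} function and the variables of the
-example above:
-
-@cartouche
-@smallexample
-    for(loop=0 ; loop<niter; loop++)
-        for (x = 1; x < X-1; x++)
-            for (y = 1; y < Y-1; y++) @{
-                /* Skip this task if we own neither the written handle
-                 * nor one of the read neighbours */
-                if (my_rank != my_distrib(x, y, size)
-                 && my_rank != my_distrib(x-1, y, size)
-                 && my_rank != my_distrib(x+1, y, size)
-                 && my_rank != my_distrib(x, y-1, size)
-                 && my_rank != my_distrib(x, y+1, size))
-                    continue;
-                starpu_mpi_insert_task(MPI_COMM_WORLD, &stencil5_cl,
-                                       STARPU_RW, data_handles[x][y],
-                                       STARPU_R, data_handles[x-1][y],
-                                       STARPU_R, data_handles[x+1][y],
-                                       STARPU_R, data_handles[x][y-1],
-                                       STARPU_R, data_handles[x][y+1],
-                                       0);
-            @}
-@end smallexample
-@end cartouche
-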
-@node MPI Collective Operations
-@section MPI Collective Operations
-
-The functions are described in @ref{Collective Operations}.
-
-@cartouche
-@smallexample
-if (rank == root)
-@{
-    /* Allocate the vector */
-    vector = malloc(nblocks * sizeof(float *));
-    for(x=0 ; x<nblocks ; x++)
-    @{
-        starpu_malloc((void **)&vector[x], block_size*sizeof(float));
-    @}
-@}
-
-/* Allocate data handles and register data to StarPU */
-data_handles = malloc(nblocks*sizeof(starpu_data_handle_t));
-for(x = 0; x < nblocks ;  x++)
-@{
-    int mpi_rank = my_distrib(x, nodes);
-    if (rank == root) @{
-        starpu_vector_data_register(&data_handles[x], STARPU_MAIN_RAM, (uintptr_t)vector[x],
-                                    block_size, sizeof(float));
-    @}
-    else if ((mpi_rank == rank) || (rank == mpi_rank+1 || rank == mpi_rank-1)) @{
-        /* I own that index, or I will need it for my computations */
-        starpu_vector_data_register(&data_handles[x], -1, (uintptr_t)NULL,
-                                   block_size, sizeof(float));
-    @}
-    else @{
-        /* I know it's useless to allocate anything for this */
-        data_handles[x] = NULL;
-    @}
-    if (data_handles[x]) @{
-        starpu_data_set_rank(data_handles[x], mpi_rank);
-        starpu_data_set_tag(data_handles[x], x);
-    @}
-@}
-
-/* Scatter the matrix among the nodes */
-starpu_mpi_scatter_detached(data_handles, nblocks, root, MPI_COMM_WORLD);
-
-/* Calculation */
-for(x = 0; x < nblocks ;  x++) @{
-    if (data_handles[x]) @{
-        int owner = starpu_data_get_rank(data_handles[x]);
-        if (owner == rank) @{
-            starpu_insert_task(&cl, STARPU_RW, data_handles[x], 0);
-        @}
-    @}
-@}
-
-/* Gather the matrix on main node */
-starpu_mpi_gather_detached(data_handles, nblocks, root, MPI_COMM_WORLD);
-@end smallexample
-@end cartouche

+ 0 - 608
doc/texinfo/chapters/perf-feedback.texi

@@ -1,608 +0,0 @@
-@c -*-texinfo-*-
-
-@c This file is part of the StarPU Handbook.
-@c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
-@c Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
-@c Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
-@c See the file starpu.texi for copying conditions.
-
-@menu
-* Task debugger::               Using the Temanejo task debugger
-* On-line::                     On-line performance feedback
-* Off-line::                    Off-line performance feedback
-* Codelet performance::         Performance of codelets
-* Theoretical lower bound on execution time::
-* Memory feedback::
-* Data statistics::
-@end menu
-
-@node Task debugger
-@section Using the Temanejo task debugger
-
-StarPU can connect to Temanejo (see
-@url{http://www.hlrs.de/temanejo}), to permit
-nice visual task debugging. To do so, build Temanejo's @code{libayudame.so},
-install @code{Ayudame.h} to e.g. @code{/usr/local/include}, and apply the
-@code{tools/patch-ayudame} patch to it to fix the C build. Then re-run
-@code{./configure}, make sure that it found the library, and rebuild StarPU.
-Run the Temanejo GUI, and give it the path to your application, any options
-you want to pass it, and the path to @code{libayudame.so}.
-
-Make sure to specify at least the same number of CPUs in the dialog box as your
-machine has, otherwise an error will happen during execution. Future versions
-of Temanejo should be able to tell StarPU the number of CPUs to use.
-
-Tag numbers have to be below @code{4000000000000000000ULL} to be usable for
-Temanejo (so as to distinguish them from tasks).
-
-@node On-line
-@section On-line performance feedback
-
-@menu
-* Enabling on-line performance monitoring::
-* Task feedback::               Per-task feedback
-* Codelet feedback::            Per-codelet feedback
-* Worker feedback::             Per-worker feedback
-* Bus feedback::                Bus-related feedback
-* StarPU-Top::                  StarPU-Top interface
-@end menu
-
-@node Enabling on-line performance monitoring
-@subsection Enabling on-line performance monitoring
-
-In order to enable online performance monitoring, the application can call
-@code{starpu_profiling_status_set(STARPU_PROFILING_ENABLE)}. It is possible to
-detect whether monitoring is already enabled or not by calling
-@code{starpu_profiling_status_get()}. Enabling monitoring also reinitializes all
-previously collected feedback. The @code{STARPU_PROFILING} environment variable
-can also be set to 1 to achieve the same effect.
-
-Likewise, performance monitoring is stopped by calling
-@code{starpu_profiling_status_set(STARPU_PROFILING_DISABLE)}. Note that this
-does not reset the performance counters so that the application may consult
-them later on.
-
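-As a minimal sketch, a profiled section of the application can thus be
-bracketed as follows:
-
-@cartouche
-@smallexample
-/* Enable monitoring; this also resets any previously collected feedback */
-starpu_profiling_status_set(STARPU_PROFILING_ENABLE);
-
-/* ... submit and execute tasks ... */
-
-/* Stop monitoring; counters are kept and can still be consulted */
-starpu_profiling_status_set(STARPU_PROFILING_DISABLE);
-@end smallexample
-@end cartouche
-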
-More details about the performance monitoring API are available in section
-@ref{Profiling API}.
-
-@node Task feedback
-@subsection Per-task feedback
-
-If profiling is enabled, a pointer to a @code{struct starpu_profiling_task_info}
-is put in the @code{.profiling_info} field of the @code{starpu_task}
-structure when a task terminates.
-This structure is automatically destroyed when the task structure is destroyed,
-either automatically or by calling @code{starpu_task_destroy}.
-
-The @code{struct starpu_profiling_task_info} indicates the date when the
-task was submitted (@code{submit_time}), started (@code{start_time}), and
-terminated (@code{end_time}), relative to the initialization of
-StarPU with @code{starpu_init}. It also specifies the identifier of the worker
-that has executed the task (@code{workerid}).
-These dates are stored as @code{timespec} structures which the user may convert
-into micro-seconds using the @code{starpu_timing_timespec_to_us} helper
-function.
-
-It is worth noting that the application may directly access this structure from
-the callback executed at the end of the task. The @code{starpu_task} structure
-associated to the callback currently being executed is indeed accessible with
-the @code{starpu_task_get_current()} function.
-
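-For instance, a sketch of a termination callback reading these fields could
-look as follows (reporting to @code{stderr} is just for illustration):
-
-@cartouche
-@smallexample
-void terminate_callback(void *arg)
-@{
-    struct starpu_task *task = starpu_task_get_current();
-    struct starpu_profiling_task_info *info = task->profiling_info;
-
-    /* Convert the timespec dates into micro-seconds */
-    double start = starpu_timing_timespec_to_us(&info->start_time);
-    double end = starpu_timing_timespec_to_us(&info->end_time);
-    fprintf(stderr, "task took %f us on worker %d\n",
-            end - start, info->workerid);
-@}
-@end smallexample
-@end cartouche
-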
-@node Codelet feedback
-@subsection Per-codelet feedback
-
-The @code{per_worker_stats} field of the @code{struct starpu_codelet} structure is
-an array of counters. The i-th entry of the array is incremented every time a
-task implementing the codelet is executed on the i-th worker.
-This array is not reinitialized when profiling is enabled or disabled.
-
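-For instance, assuming a codelet @code{cl}, the counters could be dumped as
-follows (a sketch):
-
-@cartouche
-@smallexample
-unsigned worker;
-for (worker = 0; worker < starpu_worker_get_count(); worker++)
-    fprintf(stderr, "codelet ran %lu times on worker %u\n",
-            cl.per_worker_stats[worker], worker);
-@end smallexample
-@end cartouche
-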
-@node Worker feedback
-@subsection Per-worker feedback
-
-The second argument returned by the @code{starpu_profiling_worker_get_info}
-function is a @code{struct starpu_profiling_worker_info} that gives
-statistics about the specified worker. This structure specifies when StarPU
-started collecting profiling information for that worker (@code{start_time}),
-the duration of the profiling measurement interval (@code{total_time}), the
-time spent executing kernels (@code{executing_time}), the time spent sleeping
-because there is no task to execute at all (@code{sleeping_time}), and the
-number of tasks that were executed while profiling was enabled.
-These values give an estimation of the proportion of time spent doing real work,
-and the time spent either sleeping because there are not enough executable
-tasks or simply wasted in pure StarPU overhead.
-
-Calling @code{starpu_profiling_worker_get_info} resets the profiling
-information associated to a worker.
-
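-For example, a sketch estimating how worker 0 spent its time (error handling
-omitted; the call is assumed to return 0 on success):
-
-@cartouche
-@smallexample
-struct starpu_profiling_worker_info info;
-if (starpu_profiling_worker_get_info(0, &info) == 0)
-@{
-    double total = starpu_timing_timespec_to_us(&info.total_time);
-    double executing = starpu_timing_timespec_to_us(&info.executing_time);
-    double sleeping = starpu_timing_timespec_to_us(&info.sleeping_time);
-    fprintf(stderr, "worker 0: %.1f%% executing, %.1f%% sleeping\n",
-            100. * executing / total, 100. * sleeping / total);
-@}
-@end smallexample
-@end cartouche
-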
-When an FxT trace is generated (see @ref{Generating traces}), it is also
-possible to use the @code{starpu_workers_activity} script (described in @ref{starpu-workers-activity}) to
-generate a graphic showing the evolution of these values over time, for
-the different workers.
-
-@node Bus feedback
-@subsection Bus-related feedback
-
-TODO: add STARPU_BUS_STATS
-
-@c how to enable/disable performance monitoring
-
-@c what kind of information do we get ?
-
-The bus speed measured by StarPU can be displayed by using the
-@code{starpu_machine_display} tool, for instance:
-
-@example
-StarPU has found:
-        3 CUDA devices
-                CUDA 0 (Tesla C2050 02:00.0)
-                CUDA 1 (Tesla C2050 03:00.0)
-                CUDA 2 (Tesla C2050 84:00.0)
-from    to RAM          to CUDA 0       to CUDA 1       to CUDA 2
-RAM     0.000000        5176.530428     5176.492994     5191.710722
-CUDA 0  4523.732446     0.000000        2414.074751     2417.379201
-CUDA 1  4523.718152     2414.078822     0.000000        2417.375119
-CUDA 2  4534.229519     2417.069025     2417.060863     0.000000
-@end example
-
-@node StarPU-Top
-@subsection StarPU-Top interface
-
-StarPU-Top is an interface which remotely displays the on-line state of a StarPU
-application and permits the user to change parameters on the fly.
-
-Variables to be monitored can be registered by calling the
-@code{starpu_top_add_data_boolean}, @code{starpu_top_add_data_integer},
-@code{starpu_top_add_data_float} functions, e.g.:
-
-@cartouche
-@smallexample
-starpu_top_data *data = starpu_top_add_data_integer("mynum", 0, 100, 1);
-@end smallexample
-@end cartouche
-
-The application should then call @code{starpu_top_init_and_wait} to give its name
-and wait for StarPU-Top to get a start request from the user. The name is used
-by StarPU-Top to quickly reload a previously-saved layout of parameter display.
-
-@cartouche
-@smallexample
-starpu_top_init_and_wait("the application");
-@end smallexample
-@end cartouche
-
-The new values can then be provided thanks to
-@code{starpu_top_update_data_boolean}, @code{starpu_top_update_data_integer},
-@code{starpu_top_update_data_float}, e.g.:
-
-@cartouche
-@smallexample
-starpu_top_update_data_integer(data, mynum);
-@end smallexample
-@end cartouche
-
-Updateable parameters can be registered thanks to @code{starpu_top_register_parameter_boolean}, @code{starpu_top_register_parameter_integer}, @code{starpu_top_register_parameter_float}, e.g.:
-
-@cartouche
-@smallexample
-float alpha;
-starpu_top_register_parameter_float("alpha", &alpha, 0, 10, modif_hook);
-@end smallexample
-@end cartouche
-
-@code{modif_hook} is a function which will be called when the parameter is modified; it can for instance print the new value:
-
-@cartouche
-@smallexample
-void modif_hook(struct starpu_top_param *d) @{
-    fprintf(stderr,"%s has been modified: %f\n", d->name, alpha);
-@}
-@end smallexample
-@end cartouche
-
-Task schedulers should notify StarPU-Top when they have decided when a task will be
-scheduled, so that it can show it in its Gantt chart, for instance:
-
-@cartouche
-@smallexample
-starpu_top_task_prevision(task, workerid, begin, end);
-@end smallexample
-@end cartouche
-
-Starting StarPU-Top@footnote{StarPU-Top is started via the binary
-@code{starpu_top}.} and the application can be done in two ways:
-
-@itemize
-@item The application is started by hand on some machine (and thus already
-waiting for the start event). In the Preference dialog of StarPU-Top, the SSH
-checkbox should be unchecked, and the hostname and port (default is 2011) on
-which the application is already running should be specified. Clicking on the
-connection button will thus connect to the already-running application.
-@item StarPU-Top is started first, and clicking on the connection button will
-start the application itself (possibly on a remote machine). The SSH checkbox
-should be checked, and a command line provided, e.g.:
-
-@example
-$ ssh myserver STARPU_SCHED=dmda ./application
-@end example
-
-If port 2011 of the remote machine cannot be accessed directly, an SSH port forwarding should be added:
-
-@example
-$ ssh -L 2011:localhost:2011 myserver STARPU_SCHED=dmda ./application
-@end example
-
-and "localhost" should be used as IP Address to connect to.
-@end itemize
-
-@node Off-line
-@section Off-line performance feedback
-
-@menu
-* Generating traces::           Generating traces with FxT
-* Gantt diagram::               Creating a Gantt Diagram
-* DAG::                         Creating a DAG with graphviz
-* starpu-workers-activity::     Monitoring activity
-@end menu
-
-@node Generating traces
-@subsection Generating traces with FxT
-
-StarPU can use the FxT library (see
-@url{https://savannah.nongnu.org/projects/fkt/}) to generate traces
-with a limited runtime overhead.
-
-You can either get a tarball:
-@example
-$ wget http://download.savannah.gnu.org/releases/fkt/fxt-0.2.11.tar.gz
-@end example
-
-or use the FxT library from CVS (autotools are required):
-@example
-$ cvs -d :pserver:anonymous@@cvs.sv.gnu.org:/sources/fkt co FxT
-$ ./bootstrap
-@end example
-
-Compiling and installing the FxT library in the @code{$FXTDIR} path is
-done following the standard procedure:
-@example
-$ ./configure --prefix=$FXTDIR
-$ make
-$ make install
-@end example
-
-In order to have StarPU generate traces, StarPU should be configured with
-the @code{--with-fxt} option:
-@example
-$ ./configure --with-fxt=$FXTDIR
-@end example
-
-Or you can simply point the @code{PKG_CONFIG_PATH} to
-@code{$FXTDIR/lib/pkgconfig} and pass @code{--with-fxt} to @code{./configure}.
-
-When FxT is enabled, a trace is generated when StarPU is terminated by calling
-@code{starpu_shutdown()}. The trace is a binary file whose name has the form
-@code{prof_file_XXX_YYY} where @code{XXX} is the user name, and
-@code{YYY} is the pid of the process that used StarPU. This file is saved in the
-@code{/tmp/} directory by default, or in the directory specified by
-the @code{STARPU_FXT_PREFIX} environment variable.
-
-@node Gantt diagram
-@subsection Creating a Gantt Diagram
-
-When the FxT trace file @code{filename} has been generated, it is possible to
-generate a trace in the Paje format by calling:
-@example
-$ starpu_fxt_tool -i filename
-@end example
-
-Or alternatively, setting the @code{STARPU_GENERATE_TRACE} environment variable
-to @code{1} before application execution will make StarPU do it automatically at
-application shutdown.
-
-This will create a @code{paje.trace} file in the current directory that
-can be inspected with the @url{http://vite.gforge.inria.fr/, ViTE trace
-visualizing open-source tool}.  It is possible to open the
-@code{paje.trace} file with ViTE by using the following command:
-@example
-$ vite paje.trace
-@end example
-
-To get names of tasks instead of "unknown", fill the optional @code{name} field
-of the codelets, or use a performance model for them.
-
-In the MPI execution case, collect the trace files from the MPI nodes, and
-specify them all on the @code{starpu_fxt_tool} command, for instance:
-
-@smallexample
-$ starpu_fxt_tool -i filename1 -i filename2
-@end smallexample
-
-By default, all tasks are displayed using a green color. To display tasks with
-varying colors, pass option @code{-c} to @code{starpu_fxt_tool}.
-
-Traces can also be inspected by hand by using the @code{fxt_print} tool, for instance:
-
-@smallexample
-$ fxt_print -o -f filename
-@end smallexample
-
-Timings are in nanoseconds (while timings as seen in @code{vite} are in milliseconds).
-
-@node DAG
-@subsection Creating a DAG with graphviz
-
-When the FxT trace file @code{filename} has been generated, it is possible to
-generate a task graph in the DOT format by calling:
-@example
-$ starpu_fxt_tool -i filename
-@end example
-
-This will create a @code{dag.dot} file in the current directory. This file is a
-task graph described using the DOT language. It is possible to get a
-graphical output of the graph by using the graphviz library:
-@example
-$ dot -Tpdf dag.dot -o output.pdf
-@end example
-
-@node starpu-workers-activity
-@subsection Monitoring activity
-
-When the FxT trace file @code{filename} has been generated, it is possible to
-generate an activity trace by calling:
-@example
-$ starpu_fxt_tool -i filename
-@end example
-
-This will create an @code{activity.data} file in the current
-directory. A profile of the application showing the activity of StarPU
-during the execution of the program can be generated:
-@example
-$ starpu_workers_activity activity.data
-@end example
-
-This will create a file named @code{activity.eps} in the current directory.
-This picture is composed of two parts.
-The first part shows the activity of the different workers. The green sections
-indicate the proportion of time spent executing kernels on the
-processing unit. The red sections indicate the proportion of time spent in
-StarPU: a large overhead may indicate that the granularity is too
-small, and that bigger tasks may be needed to use the processing unit more
-efficiently. The black sections indicate that the processing unit was blocked
-because there was no task to process: this may indicate a lack of parallelism
-which may be alleviated by creating more tasks when it is possible.
-
-The second part of the @code{activity.eps} picture is a graph showing the
-evolution of the number of tasks available in the system during the execution.
-Ready tasks are shown in black, and tasks that are submitted but not
-schedulable yet are shown in grey.
-
-@node Codelet performance
-@section Performance of codelets
-
-The performance model of codelets (described in @ref{Performance model example}) can be examined by using the
-@code{starpu_perfmodel_display} tool:
-
-@example
-$ starpu_perfmodel_display -l
-file: <malloc_pinned.hannibal>
-file: <starpu_slu_lu_model_21.hannibal>
-file: <starpu_slu_lu_model_11.hannibal>
-file: <starpu_slu_lu_model_22.hannibal>
-file: <starpu_slu_lu_model_12.hannibal>
-@end example
-
-Here, the codelets of the lu example are available. We can examine the
-performance of the 22 kernel (in micro-seconds), which is history-based:
-
-@example
-$ starpu_perfmodel_display -s starpu_slu_lu_model_22
-performance model for cpu
-# hash      size       mean          dev           n
-57618ab0    19660800   2.851069e+05  1.829369e+04  109
-performance model for cuda_0
-# hash      size       mean          dev           n
-57618ab0    19660800   1.164144e+04  1.556094e+01  315
-performance model for cuda_1
-# hash      size       mean          dev           n
-57618ab0    19660800   1.164271e+04  1.330628e+01  360
-performance model for cuda_2
-# hash      size       mean          dev           n
-57618ab0    19660800   1.166730e+04  3.390395e+02  456
-@end example
-
-We can see that for the given size, over a sample of a few hundred
-executions, the GPUs are about 20 times faster than the CPUs (numbers are in
-us). The standard deviation is extremely low for the GPUs, and less than 10% for
-CPUs.
-
-This tool can also be used for regression-based performance models. It will then
-display the regression formula, and in the case of non-linear regression, the
-same performance log as for history-based performance models:
-
-@example
-$ starpu_perfmodel_display -s non_linear_memset_regression_based
-performance model for cpu_impl_0
-	Regression : #sample = 1400
-	Linear: y = alpha size ^ beta
-		alpha = 1.335973e-03
-		beta = 8.024020e-01
-	Non-Linear: y = a size ^b + c
-		a = 5.429195e-04
-		b = 8.654899e-01
-		c = 9.009313e-01
-# hash		size		mean		stddev		n
-a3d3725e	4096           	4.763200e+00   	7.650928e-01   	100
-870a30aa	8192           	1.827970e+00   	2.037181e-01   	100
-48e988e9	16384          	2.652800e+00   	1.876459e-01   	100
-961e65d2	32768          	4.255530e+00   	3.518025e-01   	100
-...
-@end example
-
-The same can also be achieved by using StarPU's library API, see
-@ref{Performance Model API} and notably the @code{starpu_perfmodel_load_symbol}
-function. The source code of the @code{starpu_perfmodel_display} tool can be a
-useful example.
-
-The @code{starpu_perfmodel_plot} tool can be used to draw performance models.
-It writes a @code{.gp} file in the current directory, to be run in the
-@code{gnuplot} tool, which shows the corresponding curve.
-
-When the @code{flops} field of tasks is set, @code{starpu_perfmodel_plot} can
-directly draw a GFlops curve, by simply adding the @code{-f} option:
-
-@example
-$ starpu_perfmodel_plot -f -s chol_model_11
-@end example
-
-This will however disable displaying the regression model, for which we cannot
-compute GFlops.
-
-When the FxT trace file @code{filename} has been generated, it is possible to
-get a profiling of each codelet by calling:
-@example
-$ starpu_fxt_tool -i filename
-$ starpu_codelet_profile distrib.data codelet_name
-@end example
-
-This will create profiling data files, and a @code{.gp} file in the current
-directory, which draws the distribution of codelet time over the application
-execution, according to data input size.
-
-This is also available in the @code{starpu_perfmodel_plot} tool, by passing it
-the fxt trace:
-
-@example
-$ starpu_perfmodel_plot -s non_linear_memset_regression_based -i /tmp/prof_file_foo_0
-@end example
-
-It will produce a @code{.gp} file which contains both the performance model
-curves, and the profiling measurements.
-
-If you have the R statistical tool installed, you can additionally use
-
-@example
-$ starpu_codelet_histo_profile distrib.data
-@end example
-
-which will create one PDF file per codelet and per input size, showing a
-histogram of the codelet execution time distribution.
-
-@node Theoretical lower bound on execution time
-@section Theoretical lower bound on execution time
-
-StarPU can record a trace of what tasks are needed to complete the
-application, and then, by using a linear system, provide a theoretical lower
-bound of the execution time (i.e. with an ideal scheduling).
-
-The computed bound is not really correct when it does not take dependencies
-into account, but for an application which has enough parallelism, it is very
-close to the bound computed with dependencies enabled (which takes much
-more time to compute), and thus provides a good-enough estimation of the ideal
-execution time.
-
-@ref{Theoretical lower bound on execution time} provides an example on how to
-use this.
-
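-As a sketch, recording the trace and emitting the bound could look like the
-following, assuming the @code{starpu_bound_start}, @code{starpu_bound_stop}
-and @code{starpu_bound_print_lp} functions of the bound API:
-
-@cartouche
-@smallexample
-/* Start recording tasks (without dependencies nor priorities here) */
-starpu_bound_start(0, 0);
-/* ... submit the application tasks ... */
-starpu_task_wait_for_all();
-starpu_bound_stop();
-
-/* Emit the corresponding linear programming problem */
-FILE *out = fopen("bound.lp", "w");
-starpu_bound_print_lp(out);
-fclose(out);
-@end smallexample
-@end cartouche
-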
-@node Memory feedback
-@section Memory feedback
-
-It is possible to enable memory statistics. To do so, you need to pass the option
-@code{--enable-memory-stats} when running configure. It is then
-possible to call the function @code{starpu_display_memory_stats()} to
-display statistics about the current data handles registered within StarPU.
-
-Moreover, statistics will be displayed at the end of the execution on
-data handles which have not been cleared out. This can be disabled by
-setting the environment variable @code{STARPU_MEMORY_STATS} to 0.
-
-For example, if you do not unregister data at the end of the complex
-example, you will get something similar to:
-
-@example
-$ STARPU_MEMORY_STATS=0 ./examples/interface/complex
-Complex[0] = 45.00 + 12.00 i
-Complex[0] = 78.00 + 78.00 i
-Complex[0] = 45.00 + 12.00 i
-Complex[0] = 45.00 + 12.00 i
-@end example
-
-@example
-$ STARPU_MEMORY_STATS=1 ./examples/interface/complex
-Complex[0] = 45.00 + 12.00 i
-Complex[0] = 78.00 + 78.00 i
-Complex[0] = 45.00 + 12.00 i
-Complex[0] = 45.00 + 12.00 i
-
-#---------------------
-Memory stats:
-#-------
-Data on Node #3
-#-----
-Data : 0x553ff40
-Size : 16
-
-#--
-Data access stats
-/!\ Work Underway
-Node #0
-	Direct access : 4
-	Loaded (Owner) : 0
-	Loaded (Shared) : 0
-	Invalidated (was Owner) : 0
-
-Node #3
-	Direct access : 0
-	Loaded (Owner) : 0
-	Loaded (Shared) : 1
-	Invalidated (was Owner) : 0
-
-#-----
-Data : 0x5544710
-Size : 16
-
-#--
-Data access stats
-/!\ Work Underway
-Node #0
-	Direct access : 2
-	Loaded (Owner) : 0
-	Loaded (Shared) : 1
-	Invalidated (was Owner) : 1
-
-Node #3
-	Direct access : 0
-	Loaded (Owner) : 1
-	Loaded (Shared) : 0
-	Invalidated (was Owner) : 0
-@end example
-
-@node Data statistics
-@section Data statistics
-
-Different data statistics can be displayed at the end of the execution
-of the application. To enable them, you need to pass the option
-@code{--enable-stats} when calling @code{configure}. When calling
-@code{starpu_shutdown()}, various statistics will be displayed:
-execution statistics, MSI cache statistics, allocation cache statistics, and data
-transfer statistics. The display can be disabled by setting the
-environment variable @code{STARPU_STATS} to 0.
-
-@example
-$ ./examples/cholesky/cholesky_tag
-Computation took (in ms)
-518.16
-Synthetic GFlops : 44.21
-#---------------------
-MSI cache stats :
-TOTAL MSI stats	hit 1622 (66.23 %)	miss 827 (33.77 %)
-...
-@end example
-
-@example
-$ STARPU_STATS=0 ./examples/cholesky/cholesky_tag
-Computation took (in ms)
-518.16
-Synthetic GFlops : 44.21
-@end example
-
-@c TODO: data transfer stats are similar to the ones displayed when
-@c setting STARPU_BUS_STATS

+ 0 - 569
doc/texinfo/chapters/perf-optimization.texi

@@ -1,569 +0,0 @@
-@c -*-texinfo-*-
-
-@c This file is part of the StarPU Handbook.
-@c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
-@c Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
-@c Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
-@c See the file starpu.texi for copying conditions.
-
-TODO: improve!
-
-@menu
-* Data management::
-* Task granularity::
-* Task submission::
-* Task priorities::
-* Task scheduling policy::
-* Performance model calibration::
-* Task distribution vs Data transfer::
-* Data prefetch::
-* Power-based scheduling::
-* Static scheduling::
-* Profiling::
-* CUDA-specific optimizations::
-* Performance debugging::
-* Simulated performance::
-@end menu
-
-Simply encapsulating application kernels into tasks already permits
-seamlessly supporting CPUs and GPUs at the same time. To achieve good performance, a
-few additional changes are needed.
-
-@node Data management
-@section Data management
-
-When the application allocates data, whenever possible it should use the
-@code{starpu_malloc} function, which will ask CUDA or
-OpenCL to make the allocation itself and pin the corresponding allocated
-memory. This is needed to permit asynchronous data transfer, i.e. permit data
-transfer to overlap with computations. Otherwise, the trace will show that the
-@code{DriverCopyAsync} state takes a lot of time, this is because CUDA or OpenCL
-then reverts to synchronous transfers.
-
-By default, StarPU leaves replicates of data wherever they were used, in case they
-will be re-used by other tasks, thus saving the data transfer time. When some
-task modifies some data, all the other replicates are invalidated, and only the
-processing unit which ran that task will have a valid replicate of the data. If the application knows
-that this data will not be re-used by further tasks, it should advise StarPU to
-immediately replicate it to a desired list of memory nodes (given through a
-bitmask). This can be understood like the write-through mode of CPU caches.
-
-@cartouche
-@smallexample
-starpu_data_set_wt_mask(img_handle, 1<<0);
-@end smallexample
-@end cartouche
-
-will for instance request to always automatically transfer a replicate into the
-main memory (node 0), as bit 0 of the write-through bitmask is being set.
-
-@cartouche
-@smallexample
-starpu_data_set_wt_mask(img_handle, ~0U);
-@end smallexample
-@end cartouche
-
-will request to always automatically broadcast the updated data to all memory
-nodes.
-
-Setting the write-through mask to @code{~0U} can also be useful to make sure all
-memory nodes always have a copy of the data, so that it is never evicted when
-memory gets scarce.
-
-Implicit data dependency computation can become expensive if a lot
-of tasks access the same piece of data. If no dependency is required
-on some piece of data (e.g. because it is only accessed in read-only
-mode, or because write accesses are actually commutative), use the
-@code{starpu_data_set_sequential_consistency_flag} function to disable implicit
-dependencies on that data.
-
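-For instance, a minimal sketch:
-
-@cartouche
-@smallexample
-/* This piece of data is only accessed in read-only mode; no implicit
- * dependencies need to be computed for it */
-starpu_data_set_sequential_consistency_flag(handle, 0);
-@end smallexample
-@end cartouche
-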
-In the same vein, accumulation of results in the same data can become a
-bottleneck. The use of the @code{STARPU_REDUX} mode permits to optimize such
-accumulation (@pxref{Data reduction}). To a lesser extent, the use of the
-@code{STARPU_COMMUTE} flag keeps the bottleneck, but at least permits the
-accumulation to happen in any order.
-
-Applications often need a piece of data just for temporary results.  In such a case,
-registration can be made without an initial value, for instance this produces a vector data:
-
-@cartouche
-@smallexample
-starpu_vector_data_register(&handle, -1, 0, n, sizeof(float));
-@end smallexample
-@end cartouche
-
-StarPU will then allocate the actual buffer only when it is actually needed,
-e.g. directly on the GPU without allocating in main memory.
-
-In the same vein, once the temporary results are not useful any more, the
-data should be thrown away. If the handle is not to be reused, it can be
-unregistered:
-
-@cartouche
-@smallexample
-starpu_unregister_submit(handle);
-@end smallexample
-@end cartouche
-
-actual unregistration will be done after all tasks working on the handle
-terminate.
-
-If the handle is to be reused, instead of unregistering it, it can simply be invalidated:
-
-@cartouche
-@smallexample
-starpu_invalidate_submit(handle);
-@end smallexample
-@end cartouche
-
-the buffers containing the current value will then be freed, and reallocated
-only when another task writes some value to the handle.
-
-@node Task granularity
-@section Task granularity
-
-Like any other runtime, StarPU has some overhead to manage tasks. Since
-it does smart scheduling and data management, that overhead is not always
-negligible. The order of magnitude of the overhead is typically a couple of
-microseconds, which is actually much smaller than the CUDA overhead itself. The
-amount of work that a task should do should thus be somewhat
-bigger, to make sure that the overhead becomes negligible. The offline
-performance feedback can provide a measure of task length, which should thus be
-checked if poor performance is observed. To get a grasp of the scalability
-potential according to task size, one can run
-speedup of independent tasks of very small sizes.
-
-The choice of scheduler also has an impact on the overhead: for instance, the
-@code{dmda} scheduler takes time to make a decision, while @code{eager} does
-not. @code{tasks_size_overhead.sh} can again be used to get a grasp of how much
-impact that has on the target machine.
-
-@node Task submission
-@section Task submission
-
-To let StarPU make online optimizations, tasks should be submitted
-asynchronously as much as possible. Ideally, all the tasks should be
-submitted first, with mere calls to @code{starpu_task_wait_for_all} or
-@code{starpu_data_unregister} done afterwards to wait for
-termination. StarPU will then be able to rework the whole schedule, overlap
-computation with communication, manage accelerator local memory usage, etc.
-
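-A minimal sketch of this pattern (assuming a codelet @code{cl} and
-@code{ntasks} independent tasks):
-
-@cartouche
-@smallexample
-unsigned i;
-for (i = 0; i < ntasks; i++) @{
-    struct starpu_task *task = starpu_task_create();
-    task->cl = &cl;
-    /* submission returns immediately */
-    starpu_task_submit(task);
-@}
-/* only wait once everything has been submitted */
-starpu_task_wait_for_all();
-@end smallexample
-@end cartouche
-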
-@node Task priorities
-@section Task priorities
-
-By default, StarPU will consider the tasks in the order they are submitted by
-the application. If the application programmer knows that some tasks should
-be performed in priority (for instance because their output is needed by many
-other tasks and may thus be a bottleneck if not executed early enough), the
-@code{priority} field of the task structure should be set to transmit the
-priority information to StarPU.
-
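-For instance, a minimal sketch (assuming a codelet @code{cl} defined
-elsewhere):
-
-@cartouche
-@smallexample
-struct starpu_task *task = starpu_task_create();
-task->cl = &cl;
-/* ask the scheduler to consider this task as early as possible */
-task->priority = STARPU_MAX_PRIO;
-starpu_task_submit(task);
-@end smallexample
-@end cartouche
-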
-@node Task scheduling policy
-@section Task scheduling policy
-
-By default, StarPU uses the @code{eager} simple greedy scheduler. This is
-because it provides correct load balance even if the application codelets do not
-have performance models. If your application codelets have performance models
-(@pxref{Performance model example} for examples showing how to do it),
-you should change the scheduler thanks to the @code{STARPU_SCHED} environment
-variable. For instance @code{export STARPU_SCHED=dmda} . Use @code{help} to get
-the list of available schedulers.
-
-The @b{eager} scheduler uses a central task queue, from which workers draw tasks
-to work on. This however does not permit prefetching data since the scheduling
-decision is taken late. If a task has a non-0 priority, it is put at the front of the queue.
-
-The @b{prio} scheduler also uses a central task queue, but sorts tasks by
-priority (between -5 and 5).
-
-The @b{random} scheduler distributes tasks randomly according to assumed worker
-overall performance.
-
-The @b{ws} (work stealing) scheduler schedules tasks on the local worker by
-default. When a worker becomes idle, it steals a task from the most loaded
-worker.
-
-The @b{dm} (deque model) scheduler takes task execution performance models into account to
-perform a HEFT-like scheduling strategy: it schedules tasks where their
-termination time will be minimal.
-
-The @b{dmda} (deque model data aware) scheduler is similar to dm, it also takes
-into account data transfer time.
-
-The @b{dmdar} (deque model data aware ready) scheduler is similar to dmda,
-it also sorts tasks on per-worker queues by number of already-available data
-buffers.
-
-The @b{dmdas} (deque model data aware sorted) scheduler is similar to dmda, it
-also supports arbitrary priority values.
-
-The @b{heft} (heterogeneous earliest finish time) scheduler is deprecated. It
-is now just an alias for @b{dmda}.
-
-The @b{pheft} (parallel HEFT) scheduler is similar to heft, it also supports
-parallel tasks (still experimental).
-
-The @b{peager} (parallel eager) scheduler is similar to eager, it also
-supports parallel tasks (still experimental).
-
-@node Performance model calibration
-@section Performance model calibration
-
-Most schedulers are based on an estimation of codelet duration on each kind
-of processing unit. For this to be possible, the application programmer needs
-to configure a performance model for the codelets of the application (see
-@ref{Performance model example} for instance). History-based performance models
-use on-line calibration.  StarPU will automatically calibrate codelets
-which have never been calibrated yet, and save the result in
-@code{$STARPU_HOME/.starpu/sampling/codelets}.
-The models are indexed by machine name. To share the models between machines (e.g. for a homogeneous cluster), use @code{export STARPU_HOSTNAME=some_global_name}. To force continuing calibration, use
-@code{export STARPU_CALIBRATE=1} . This may be necessary if your application
-has not-so-stable performance. StarPU will force calibration (and thus ignore
-the current result) until 10 (_STARPU_CALIBRATION_MINIMUM) measurements have been
-made on each architecture, to avoid badly scheduling tasks just because the
-first measurements were not so good. Details on the current performance model status
-can be obtained from the @code{starpu_perfmodel_display} command: the @code{-l}
-option lists the available performance models, and the @code{-s} option permits
-to choose the performance model to be displayed. The result looks like:
-
-@example
-$ starpu_perfmodel_display -s starpu_dlu_lu_model_22
-performance model for cpu
-# hash    size     mean          dev           n
-880805ba  98304    2.731309e+02  6.010210e+01  1240
-b50b6605  393216   1.469926e+03  1.088828e+02  1240
-5c6c3401  1572864  1.125983e+04  3.265296e+03  1240
-@end example
-
-This shows that for the LU 22 kernel with a 1.5MiB matrix, the average
-execution time on CPUs was about 11ms, with a 3ms standard deviation, over
-1240 samples. It is a good idea to check this before doing actual performance
-measurements.
-
-A graph can be drawn by using the @code{starpu_perfmodel_plot} tool:
-
-@example
-$ starpu_perfmodel_plot -s starpu_dlu_lu_model_22
-98304 393216 1572864
-$ gnuplot starpu_starpu_dlu_lu_model_22.gp
-$ gv starpu_starpu_dlu_lu_model_22.eps
-@end example
-
-If a kernel source code was modified (e.g. performance improvement), the
-calibration information is stale and should be dropped, to re-calibrate from
-scratch. This can be done by using @code{export STARPU_CALIBRATE=2}.
-
-Note: due to CUDA limitations, to be able to measure kernel duration,
-calibration mode needs to disable asynchronous data transfers. Calibration thus
-disables data transfer / computation overlapping, and should thus not be used
-for actual benchmarks. Note 2: history-based performance models get calibrated
-only if a performance-model-based scheduler is chosen.
-
-The history-based performance models can also be explicitly filled by the
-application without execution, if e.g. the application already has a series of
-measurements. This can be done by using @code{starpu_perfmodel_update_history},
-for instance:
-
-@cartouche
-@smallexample
-static struct starpu_perfmodel perf_model = @{
-    .type = STARPU_HISTORY_BASED,
-    .symbol = "my_perfmodel",
-@};
-
-struct starpu_codelet cl = @{
-    .where = STARPU_CUDA,
-    .cuda_funcs = @{ cuda_func1, cuda_func2, NULL @},
-    .nbuffers = 1,
-    .modes = @{STARPU_W@},
-    .model = &perf_model
-@};
-
-void feed(void) @{
-    struct my_measure *measure;
-    struct starpu_task task;
-    starpu_task_init(&task);
-
-    task.cl = &cl;
-
-    for (measure = &measures[0]; measure < &measures[last]; measure++) @{
-        starpu_data_handle_t handle;
-	starpu_vector_data_register(&handle, -1, 0, measure->size, sizeof(float));
-	task.handles[0] = handle;
-	starpu_perfmodel_update_history(&perf_model, &task,
-	                                STARPU_CUDA_DEFAULT + measure->cudadev, 0,
-	                                measure->implementation, measure->time);
-	starpu_task_clean(&task);
-	starpu_data_unregister(handle);
-    @}
-@}
-@end smallexample
-@end cartouche
-
-Measurements have to be provided in milliseconds for the completion time models,
-and in Joules for the energy consumption models.
-
-@node Task distribution vs Data transfer
-@section Task distribution vs Data transfer
-
-Distributing tasks to balance the load induces data transfer penalty. StarPU
-thus needs to find a balance between both. The target function that the
-@code{dmda} scheduler of StarPU
-tries to minimize is @code{alpha * T_execution + beta * T_data_transfer}, where
-@code{T_execution} is the estimated execution time of the codelet (usually
-accurate), and @code{T_data_transfer} is the estimated data transfer time. The
-latter is estimated based on bus calibration before execution start,
-i.e. with an idle machine, thus without contention. You can force bus re-calibration by running
-@code{starpu_calibrate_bus}. The beta parameter defaults to 1, but it can be
-worth trying to tweak it by using @code{export STARPU_SCHED_BETA=2} for instance,
-since during real application execution, contention makes transfer times bigger.
-This is of course imprecise, but in practice, a rough estimation already gives
-results as good as a precise estimation would.
-
-@node Data prefetch
-@section Data prefetch
-
-The @code{heft}, @code{dmda} and @code{pheft} scheduling policies perform data prefetch (see @ref{STARPU_PREFETCH}):
-as soon as a scheduling decision is taken for a task, requests are issued to
-transfer its required data to the target processing unit, if needed, so that
-when the processing unit actually starts the task, its data will hopefully be
-already available and it will not have to wait for the transfer to finish.
-
-The application may want to perform some manual prefetching, for several reasons
-such as excluding initial data transfers from performance measurements, or
-setting up an initial statically-computed data distribution on the machine
-before submitting tasks, which will thus guide StarPU toward an initial task
-distribution (since StarPU will try to avoid further transfers).
-
-This can be achieved by giving the @code{starpu_data_prefetch_on_node} function
-the handle and the desired target memory node.
-
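-For instance, a minimal sketch (node 1 is an arbitrary example, and we assume
-the variant of the function taking an asynchronous flag as third argument):
-
-@cartouche
-@smallexample
-/* Request an asynchronous transfer of the handle to memory node 1 */
-starpu_data_prefetch_on_node(handle, 1, 1);
-@end smallexample
-@end cartouche
-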
-@node Power-based scheduling
-@section Power-based scheduling
-
-If the application can provide some power performance model (through
-the @code{power_model} field of the codelet structure), StarPU will
-take it into account when distributing tasks. The target function that
-the @code{dmda} scheduler minimizes becomes @code{alpha * T_execution +
-beta * T_data_transfer + gamma * Consumption} , where @code{Consumption}
-is the estimated task consumption in Joules. To tune this parameter, use
-@code{export STARPU_SCHED_GAMMA=3000} for instance, to express that each Joule
-(i.e. 1 kW during 1000 us) is worth 3000 us of execution time penalty. Setting
-@code{alpha} and @code{beta} to zero permits taking only power consumption into account.
-
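-For instance, a sketch of a codelet providing both a time model and a power
-model (the symbol names are arbitrary):
-
-@cartouche
-@smallexample
-static struct starpu_perfmodel time_model = @{
-    .type = STARPU_HISTORY_BASED,
-    .symbol = "my_time_model",
-@};
-
-static struct starpu_perfmodel power_model = @{
-    .type = STARPU_HISTORY_BASED,
-    .symbol = "my_power_model",
-@};
-
-struct starpu_codelet cl = @{
-    /* ... */
-    .model = &time_model,
-    /* estimated consumption, in Joules, taken into account by dmda */
-    .power_model = &power_model,
-@};
-@end smallexample
-@end cartouche
-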
-This is however not sufficient to correctly optimize power: the scheduler would
-simply tend to run all computations on the most energy-conservative processing
-unit. To account for the consumption of the whole machine (including idle
-processing units), the idle power of the machine should be given by setting
-@code{export STARPU_IDLE_POWER=200} for 200W, for instance. This value can often
-be obtained from the machine power supplier.
-
-The power actually consumed by the total execution can be displayed by setting
-@code{export STARPU_PROFILING=1 STARPU_WORKER_STATS=1} .
-
-On-line task consumption measurement is currently only supported through the
-@code{CL_PROFILING_POWER_CONSUMED} OpenCL extension, implemented in the MoviSim
-simulator. Applications can however provide explicit measurements by using the
-@code{starpu_perfmodel_update_history} function (examplified in @ref{Performance
-model example} with the @code{power_model} performance model. Fine-grain
-measurement is often not feasible with the feedback provided by the hardware, so
-the user can for instance run a given task a thousand times, measure the global
-consumption for that series of tasks, divide it by a thousand, repeat for
-varying kinds of tasks and task sizes, and eventually feed StarPU
-with these manual measurements through @code{starpu_perfmodel_update_history}.
-
-@node Static scheduling
-@section Static scheduling
-
-In some cases, one may want to force some scheduling, for instance force a given
-set of tasks to GPU0, another set to GPU1, etc. while letting some other tasks
-be scheduled on any other device. This can indeed be useful to guide StarPU into
-some work distribution, while still allowing some degree of dynamism. For
-instance, to force execution of a task on CUDA0:
-
-@cartouche
-@smallexample
-task->execute_on_a_specific_worker = 1;
-task->worker = starpu_worker_get_by_type(STARPU_CUDA_WORKER, 0);
-@end smallexample
-@end cartouche
-
-@node Profiling
-@section Profiling
-
-A quick view of how many tasks each worker has executed can be obtained by setting
-@code{export STARPU_WORKER_STATS=1} . This is a convenient way to check that
-execution did happen on accelerators without penalizing performance with
-the profiling overhead.
-
-A quick view of how many data transfers have been issued can be obtained by setting
-@code{export STARPU_BUS_STATS=1} .
-
-More detailed profiling information can be enabled by using @code{export STARPU_PROFILING=1} or by
-calling @code{starpu_profiling_status_set} from the source code.
-Statistics on the execution can then be obtained by using @code{export
-STARPU_BUS_STATS=1} and @code{export STARPU_WORKER_STATS=1} .
-More details on performance feedback are provided in the next chapter.
-
-@node CUDA-specific optimizations
-@section CUDA-specific optimizations
-
-Due to CUDA limitations, StarPU will have a hard time overlapping its own
-communications and the codelet computations if the application does not use a
-dedicated CUDA stream for its computations instead of the default stream,
-which synchronizes all operations of the GPU. StarPU provides one by the use
-of @code{starpu_cuda_get_local_stream()} which can be used by all CUDA codelet
-operations to avoid this issue. For instance:
-
-@cartouche
-@smallexample
-func <<<grid,block,0,starpu_cuda_get_local_stream()>>> (foo, bar);
-cudaStreamSynchronize(starpu_cuda_get_local_stream());
-@end smallexample
-@end cartouche
-
-StarPU already does appropriate calls for the CUBLAS library.
-
-Unfortunately, some CUDA libraries do not have stream variants of
-kernels. That will lower the potential for overlapping.
-
-@node Performance debugging
-@section Performance debugging
-
-To get an idea of what is happening, a lot of performance feedback is available,
-detailed in the next chapter. The various kinds of information should be checked.
-
-@itemize
-@item What does the Gantt diagram look like? (see @ref{Gantt diagram})
-@itemize
-  @item If it's mostly green (tasks running in the initial context) or a
-  context-specific color prevails, then the machine is properly
-  utilized, and perhaps the codelets are just slow. Check their performance, see
-  @ref{Codelet performance}.
-  @item If it's mostly purple (FetchingInput), tasks keep waiting for data
-  transfers, do you perhaps have far more communication than computation? Did
-  you properly use CUDA streams to make sure communication can be
-  overlapped? Did you use data-locality aware schedulers to avoid transfers as
-  much as possible?
-  @item If it's mostly red (Blocked), tasks keep waiting for dependencies,
-  do you have enough parallelism? It might be a good idea to check what the DAG
-  looks like (see @ref{DAG}).
-  @item If only some workers are completely red (Blocked), for some reason the
-  scheduler didn't assign tasks to them. Perhaps the performance model is bogus,
-  check it (see @ref{Codelet performance}). Do all your codelets have a
-  performance model?  When some of them don't, the scheduler switches to a
-  greedy algorithm which thus performs badly.
-@end itemize
-@end itemize
-
-You can also use the Temanejo task debugger (see @ref{Task debugger}) to
-visualize the task graph more easily.
-
-@node Simulated performance
-@section Simulated performance
-
-StarPU can use Simgrid in order to simulate execution on an arbitrary
-platform.
-
-@subsection Calibration
-
-The idea is to first compile StarPU normally, and run the application,
-so as to automatically benchmark the bus and the codelets.
-
-@smallexample
-$ ./configure && make
-$ STARPU_SCHED=dmda ./examples/matvecmult/matvecmult
-[starpu][_starpu_load_history_based_model] Warning: model matvecmult
-   is not calibrated, forcing calibration for this run. Use the
-   STARPU_CALIBRATE environment variable to control this.
-$ ...
-$ STARPU_SCHED=dmda ./examples/matvecmult/matvecmult
-TEST PASSED
-@end smallexample
-
-Note that we force the use of the dmda scheduler to generate performance
-models for the application. The application may need to be run several
-times before the model is calibrated.
-
-@subsection Simulation
-
-Then, recompile StarPU, passing @code{--enable-simgrid} to @code{./configure}, and re-run the
-application:
-
-@smallexample
-$ ./configure --enable-simgrid && make
-$ STARPU_SCHED=dmda ./examples/matvecmult/matvecmult
-TEST FAILED !!!
-@end smallexample
-
-It is normal that the test fails: since the computations are not actually done
-(that is the whole point of simgrid), the result is wrong, of course.
-
-If the performance model is not calibrated enough, the following error
-message will be displayed:
-
-@smallexample
-$ STARPU_SCHED=dmda ./examples/matvecmult/matvecmult
-[starpu][_starpu_load_history_based_model] Warning: model matvecmult
-    is not calibrated, forcing calibration for this run. Use the
-    STARPU_CALIBRATE environment variable to control this.
-[starpu][_starpu_simgrid_execute_job][assert failure] Codelet
-    matvecmult does not have a perfmodel, or is not calibrated enough
-@end smallexample
-
-The number of devices can be chosen as usual with @code{STARPU_NCPU},
-@code{STARPU_NCUDA}, and @code{STARPU_NOPENCL}.  For now, only the number of
-CPUs can be arbitrarily chosen. The numbers of CUDA and OpenCL devices have to be
-lower than the real numbers on the current machine.
-
-The amount of simulated GPU memory is for now unbound by default, but
-it can be chosen by hand through the @code{STARPU_LIMIT_CUDA_MEM},
-@code{STARPU_LIMIT_CUDA_devid_MEM}, @code{STARPU_LIMIT_OPENCL_MEM}, and
-@code{STARPU_LIMIT_OPENCL_devid_MEM} environment variables.
-
-The Simgrid default stack size is small; to increase it use the
-parameter @code{--cfg=contexts/stack_size}, for example:
-
-@smallexample
-$ ./example --cfg=contexts/stack_size:8192
-TEST FAILED !!!
-@end smallexample
-
-Note: of course, if the application uses @code{gettimeofday} to make its
-performance measurements, the real time will be used, which will be bogus. To
-get the simulated time, it has to use @code{starpu_timing_now} which returns the
-virtual timestamp in us.
-
-@subsection Simulation on another machine
-
-The simgrid support even permits performing simulations on another machine,
-typically your desktop. To achieve this, one still needs to perform the Calibration
-step on the actual machine to be simulated, then copy the resulting performance
-models to your desktop machine (the @code{$STARPU_HOME/.starpu} directory). One can then perform the
-Simulation step on the desktop machine, by setting the @code{STARPU_HOSTNAME}
-environment variable to the name of the actual machine, to make StarPU use the
-performance models of the simulated machine even on the desktop machine.
-
-If the desktop machine does not have CUDA or OpenCL, StarPU is still able to
-use simgrid to simulate execution with CUDA/OpenCL devices, but the application
-source code will probably disable the CUDA and OpenCL codelets in that
-case. Since during simgrid execution, the functions of the codelet are actually
-not called, one can use dummy functions such as the following to still permit
-CUDA or OpenCL execution:
-
-@smallexample
-static struct starpu_codelet cl11 =
-@{
-	.cpu_funcs = @{chol_cpu_codelet_update_u11, NULL@},
-	.cpu_funcs_name = @{"chol_cpu_codelet_update_u11", NULL@},
-#ifdef STARPU_USE_CUDA
-	.cuda_funcs = @{chol_cublas_codelet_update_u11, NULL@},
-#elif defined(STARPU_SIMGRID)
-	.cuda_funcs = @{(void*)1, NULL@},
-#endif
-	.nbuffers = 1,
-	.modes = @{STARPU_RW@},
-	.model = &chol_model_11
-@};
-@end smallexample

+ 0 - 130
doc/texinfo/chapters/sc_hypervisor.texi

@@ -1,130 +0,0 @@
-@c -*-texinfo-*-
-
-@c This file is part of the StarPU Handbook.
-@c Copyright (C) 2011--2013 Institut National de Recherche en Informatique et Automatique
-@c See the file starpu.texi for copying conditions.
-
-@cindex Scheduling Context Hypervisor
-
-@menu
-* What is the Hypervisor::
-* Start the Hypervisor::
-* Interrogate the runtime::
-* Trigger the Hypervisor::
-* Resizing strategies::
-@end menu
-
-@node What is the Hypervisor
-@section What is the Hypervisor
-StarPU proposes a platform for constructing Scheduling Contexts, for deleting and modifying them dynamically.
-A parallel kernel can thus be isolated into a scheduling context, and interferences between several parallel kernels are avoided.
-If the user knows exactly how many workers each scheduling context needs, he can assign them to the contexts at their creation time or modify them during the execution of the program.
-
-The Scheduling Context Hypervisor Plugin is available for users whose applications do not exhibit regular parallelism, who cannot know in advance the exact size of the contexts and need to resize the contexts according to the behavior of the parallel kernels.
-The Hypervisor receives information from StarPU concerning the execution of the tasks, the efficiency of the resources, etc., and it decides accordingly when and how the contexts should be resized.
-Basic strategies for resizing scheduling contexts already exist, but a platform for implementing additional custom ones is available.
-
-@node Start the Hypervisor
-@section Start the Hypervisor
-The Hypervisor must be initialised once at the beginning of the application. At this point a resizing policy should be indicated. This strategy depends on the information the application is able to provide to the hypervisor as well
-as on the accuracy needed for the resizing procedure. For example, the application may be able to provide an estimation of the workload of the contexts. In this situation the hypervisor may decide what resources the contexts need.
-However, if no information is provided, the hypervisor evaluates the behavior of the resources and of the application and makes a guess about the future.
-The hypervisor resizes only the registered contexts, as sketched below.
-
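-For instance, a sketch following the pattern of the hypervisor examples
-(@code{sched_ctx1} and its estimated workload @code{flops1} are assumed to be
-defined by the application; the policy name selects the resizing strategy):
-
-@cartouche
-@smallexample
-struct sc_hypervisor_policy policy;
-policy.custom = 0;
-policy.name = "idle";  /* name of the resizing strategy */
-void *perf_counters = sc_hypervisor_init(&policy);
-
-/* only registered contexts are resized by the hypervisor */
-sc_hypervisor_register_ctx(sched_ctx1, flops1);
-@end smallexample
-@end cartouche
-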
-@node Interrogate the runtime
-@section Interrogate the runtime
-The runtime provides the hypervisor with information concerning the behavior of the resources and the application. This is done by using the performance counters: callbacks indicating when the resources are idle or not efficient, and when the application submits tasks or becomes too slow.
-
-@node Trigger the Hypervisor
-@section Trigger the Hypervisor
-The resizing is triggered either when the application requires it or when the initial distribution of resources alters the performance of the application (the application is too slow, or the resources are idle for too long a time, thresholds being indicated by the user). When this happens, different resizing strategies are applied that target minimizing the total execution time of the application, the instant speed or the idle time of the resources.
-
-@node Resizing strategies
-@section Resizing strategies
-
-The plugin proposes several strategies for resizing the scheduling context.
-
-The @b{Application driven} strategy uses the user's input concerning the moment when he wants to resize the contexts.
-Thus, the user tags the task that should trigger the resizing process. The corresponding field in the @code{starpu_task} data structure is @code{hypervisor_tag}; it can either be set directly, or
-through the macro @code{STARPU_HYPERVISOR_TAG} in the @code{starpu_insert_task} function.
-
-@cartouche
-@smallexample
-task.hypervisor_tag = 2;
-@end smallexample
-@end cartouche
-
-or
-
-@cartouche
-@smallexample
-starpu_insert_task(&codelet,
-		    ...,
-		    STARPU_HYPERVISOR_TAG, 2,
-                    0);
-@end smallexample
-@end cartouche
-
-Then the user has to indicate that when a task with the specified tag is executed the context should be resized.
-
-@cartouche
-@smallexample
-sc_hypervisor_resize(sched_ctx, 2);
-@end smallexample
-@end cartouche
-
-The user can use the same tag to change the resizing configuration of the contexts if he considers it necessary.
-@cartouche
-@smallexample
-sc_hypervisor_ioctl(sched_ctx,
-                    HYPERVISOR_MIN_WORKERS, 6,
-                    HYPERVISOR_MAX_WORKERS, 12,
-                    HYPERVISOR_TIME_TO_APPLY, 2,
-                    NULL);
-@end smallexample
-@end cartouche
-
-
-The @b{Idleness} based strategy resizes the scheduling contexts every time one of their workers stays idle
-for a period longer than the one imposed by the user (@pxref{The user's input in the resizing process}).
-
-@cartouche
-@smallexample
-int workerids[3] = @{1, 3, 10@};
-int workerids2[9] = @{0, 2, 4, 5, 6, 7, 8, 9, 11@};
-sc_hypervisor_ioctl(sched_ctx_id,
-            HYPERVISOR_MAX_IDLE, workerids, 3, 10000.0,
-            HYPERVISOR_MAX_IDLE, workerids2, 9, 50000.0,
-            NULL);
-@end smallexample
-@end cartouche
-
-The @b{Gflops rate} based strategy resizes the scheduling contexts such that they all finish at the same time.
-The velocity of each of them is considered, and once one of them is significantly slower the resizing process is triggered.
-In order to do these computations the user has to input the total number of instructions needed to be executed by the
-parallel kernels and the number of instructions to be executed by each task.
-The number of flops to be executed by a context is passed as a parameter when the context is registered to the hypervisor
-(@code{sc_hypervisor_register_ctx(sched_ctx_id, flops)}), and the number to be executed by each task is passed when the task is submitted.
-The corresponding field in the @code{starpu_task} data structure is @code{flops} and
-the corresponding macro in @code{starpu_insert_task} function is
-@code{STARPU_FLOPS} (but take care of passing a double, not an integer, otherwise
-parameter passing will be bogus). When the task is executed
-the resizing process is triggered.
-@cartouche
-@smallexample
-task.flops = 100;
-@end smallexample
-@end cartouche
-
-or
-
-@cartouche
-@smallexample
-starpu_insert_task(&codelet,
-                    ...,
-                    STARPU_FLOPS, (double) 100,
-                    0);
-@end smallexample
-@end cartouche
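-
-For instance, a minimal sketch of the registration call mentioned above (the @code{1e9} value is purely illustrative):
-
-@cartouche
-@smallexample
-/* the whole context will execute about 1 Gflop in total */
-sc_hypervisor_register_ctx(sched_ctx_id, 1e9);
-@end smallexample
-@end cartouche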
-
-

+ 0 - 47
doc/texinfo/chapters/scaling-vector-example.texi

@@ -1,47 +0,0 @@
-@c -*-texinfo-*-
-
-@c This file is part of the StarPU Handbook.
-@c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
-@c Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
-@c Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
-@c See the file starpu.texi for copying conditions.
-
-@menu
-* Main application::
-* CPU Kernel::
-* CUDA Kernel::
-* OpenCL Kernel::
-@end menu
-
-@node Main application
-@section Main application
-
-@include chapters/vector_scal_c.texi
-
-@node CPU Kernel
-@section CPU Kernel
-
-@include chapters/vector_scal_cpu.texi
-
-@node CUDA Kernel
-@section CUDA Kernel
-
-@include chapters/vector_scal_cuda.texi
-
-@node OpenCL Kernel
-@section OpenCL Kernel
-
-@menu
-* Invoking the kernel::
-* Source of the kernel::
-@end menu
-
-@node Invoking the kernel
-@subsection Invoking the kernel
-
-@include chapters/vector_scal_opencl.texi
-
-@node Source of the kernel
-@subsection Source of the kernel
-
-@include chapters/vector_scal_opencl_codelet.texi

+ 0 - 116
doc/texinfo/chapters/sched_ctx.texi

@@ -1,116 +0,0 @@
-@c -*-texinfo-*-
-
-@c This file is part of the StarPU Handbook.
-@c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
-@c Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
-@c Copyright (C) 2011--2013 Institut National de Recherche en Informatique et Automatique
-@c See the file starpu.texi for copying conditions.
-
-TODO: improve!
-
-@menu
-* General Idea::
-* Create a Context::
-* Modify a Context::
-* Delete a Context::
-* Empty Context::
-* Contexts Sharing Workers::
-@end menu
-
-@node General Idea
-@section General Idea
-Scheduling contexts represent abstract sets of workers that allow the programmer to control the distribution of computational resources (i.e. CPUs and
-GPUs) to concurrent parallel kernels. The main goal is to minimize interference between the execution of multiple parallel kernels, by partitioning the underlying pool of workers using contexts.
-
-@node Create a Context
-@section Create a Context
-By default, the application submits tasks to an initial context, which has all the computation resources available to StarPU (all the workers).
-If the application programmer plans to launch several parallel kernels simultaneously, by default these kernels will be executed within this initial context, using a single scheduling policy (@pxref{Task scheduling policy}).
-Meanwhile, if the application programmer is aware of the demands of these kernels and of the specificities of the machine used to execute them, the workers can be divided between several contexts.
-These scheduling contexts will isolate the execution of each kernel and will permit the use of a scheduling policy proper to each one of them.
-In order to create the contexts, you have to know the identifiers of the workers running within StarPU.
-By passing a set of workers together with the scheduling policy to the function @code{starpu_sched_ctx_create}, you will get an identifier of the newly created context, which you will use to indicate the context you want to submit the tasks to.
-
-@cartouche
-@smallexample
-/* @b{the list of resources the context will manage} */
-int workerids[3] = @{1, 3, 10@};
-
-/* @b{indicate the scheduling policy to be used within the context, the list of 
-   workers assigned to it, the number of workers, the name of the context} */
-int id_ctx = starpu_sched_ctx_create("dmda", workerids, 3, "my_ctx");
-
-/* @b{let StarPU know that the following tasks will be submitted to this context} */
-starpu_sched_ctx_set_task_context(id_ctx);
-
-/* @b{submit the task to StarPU} */
-starpu_task_submit(task);
-
-@end smallexample
-@end cartouche
-
-Note: Parallel greedy and parallel heft scheduling policies do not support the existence of several disjoint contexts on the machine. 
-Combined workers are constructed depending on the entire topology of the machine, not only the part belonging to a context.
-
-
-@node Modify a Context
-@section Modify a Context
-A scheduling context can be modified dynamically. The application may change its requirements during the execution, and the programmer can add workers to a context or remove those no longer needed.
-In the following example we have two scheduling contexts @code{sched_ctx1} and @code{sched_ctx2}. After executing part of the tasks, some of the workers of @code{sched_ctx1} will be moved to context @code{sched_ctx2}.
-
-@cartouche
-@smallexample
-/* @b{the list of resources that context 1 will give away} */
-int workerids[3] = @{1, 3, 10@};
-
-/* @b{add the workers to context 2} */
-starpu_sched_ctx_add_workers(workerids, 3, sched_ctx2);
-
-/* @b{remove the workers from context 1} */
-starpu_sched_ctx_remove_workers(workerids, 3, sched_ctx1);
-
-@end smallexample
-@end cartouche
-
-@node Delete a Context 
-@section Delete a Context
-When a context is no longer needed it must be deleted. The application can indicate which context should inherit the resources of the deleted one.
-All the tasks of the context should be executed before doing this. If the application needs to avoid a barrier before moving the resources from the deleted context to the inheritor one, the application can just indicate
-when the last task was submitted. In that case, the resources will be moved as soon as this last task finishes executing, but the context should still be deleted at some point of the application.
-
-@cartouche
-@smallexample
-/* @b{when context 2 is deleted, context 1 will inherit its resources} */
-starpu_sched_ctx_set_inheritor(sched_ctx2, sched_ctx1);
-
-/* @b{submit tasks to context 2} */
-for (i = 0; i < ntasks; i++)
-    starpu_task_submit_to_ctx(task[i], sched_ctx2);
-
-/* @b{indicate that context 2 has finished submitting, and that} */
-/* @b{as soon as the last task of context 2 has finished executing} */
-/* @b{its workers can be moved to the inheritor context} */
-starpu_sched_ctx_finished_submit(sched_ctx2);
-
-/* @b{wait for the tasks of both contexts to finish} */
-starpu_task_wait_for_all();
-
-/* @b{delete context 2} */
-starpu_sched_ctx_delete(sched_ctx2);
-
-/* @b{delete context 1} */
-starpu_sched_ctx_delete(sched_ctx1);
-@end smallexample
-@end cartouche
-
-@node Empty Context
-@section Empty Context
-A context may not have any resources at the beginning or at a certain moment of the execution. Tasks can still be submitted to these contexts, and they will be executed as soon as the contexts have resources.
-A list of tasks pending to be executed is kept, and when workers are added to the contexts these tasks are submitted. However, if no resources are ever allocated, the program will not terminate.
-If these tasks do not have a high priority, the programmer can prevent the application from submitting them by calling the function @code{starpu_sched_ctx_stop_task_submission}.
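-
-A minimal sketch of this call (it takes no argument and applies to the submitting application):
-
-@cartouche
-@smallexample
-/* @b{no resources yet: keep the remaining tasks from being submitted} */
-starpu_sched_ctx_stop_task_submission();
-@end smallexample
-@end cartouche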
-
-@node Contexts Sharing Workers
-@section Contexts Sharing Workers
-Contexts may share workers when a single context cannot execute efficiently enough on these workers alone, or when the application decides to express a hierarchy of contexts. The workers apply
-a round-robin algorithm to choose the context from which they will ``pop'' next. By using the function @code{void starpu_sched_ctx_set_turn_to_other_ctx(int workerid, unsigned sched_ctx_id)},
-the programmer can force the worker @code{workerid} to ``pop'' from the context @code{sched_ctx_id} next.
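-
-For instance, a minimal sketch (the worker identifier is illustrative):
-
-@cartouche
-@smallexample
-/* @b{make worker 2 ``pop'' its next task from sched_ctx_id} */
-starpu_sched_ctx_set_turn_to_other_ctx(2, sched_ctx_id);
-@end smallexample
-@end cartouche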

+ 0 - 17
doc/texinfo/chapters/socl.texi

@@ -1,17 +0,0 @@
-@c -*-texinfo-*-
-
-@c This file is part of the StarPU Handbook.
-@c Copyright (C) 2012  Centre National de la Recherche Scientifique
-@c Copyright (C) 2012  Univ. of Bordeaux
-@c See the file starpu.texi for copying conditions.
-
-SOCL is an OpenCL implementation based on StarPU. It gives a unified access to
-every available OpenCL device: applications can now share entities such as
-Events, Contexts or Command Queues between several OpenCL implementations.
-
-In addition, command queues that are created without specifying a device provide
-automatic scheduling of the submitted commands on OpenCL devices contained in
-the context to which the command queue is attached.
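-
-A hedged sketch of what this enables (assuming, as described above, that SOCL accepts a command queue creation with no device specified; error handling omitted):
-
-@cartouche
-@smallexample
-cl_platform_id platform;          /* the SOCL platform */
-clGetPlatformIDs(1, &platform, NULL);
-
-cl_context_properties props[] =
-        @{ CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 @};
-cl_context context = clCreateContextFromType(props, CL_DEVICE_TYPE_ALL,
-                                             NULL, NULL, NULL);
-
-/* no device given: SOCL schedules the submitted commands automatically */
-cl_command_queue queue = clCreateCommandQueue(context, NULL, 0, NULL);
-@end smallexample
-@end cartouche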
-
-Note: as of StarPU @value{VERSION}, this is still an area under
-development and subject to change.

+ 0 - 111
doc/texinfo/chapters/tips-tricks.texi

@@ -1,111 +0,0 @@
-@c -*-texinfo-*-
-
-@c This file is part of the StarPU Handbook.
-@c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
-@c Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
-@c Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
-@c See the file starpu.texi for copying conditions.
-
-@menu
-* Per-worker library initialization::  How to initialize a computation library once for each worker?
-* Limit memory::
-* Thread Binding on NetBSD::
-@end menu
-
-@node Per-worker library initialization
-@section How to initialize a computation library once for each worker?
-
-Some libraries need to be initialized once for each concurrent instance that
-may run on the machine. This is the case, for instance, of a C++ computation class which is not
-thread-safe by itself, but for which several instantiated objects of that class
-can be used concurrently. This can be handled in StarPU by initializing one such
-object per worker. For instance, the libstarpufft example does the following to
-be able to use FFTW.
-
-A global array stores the instantiated objects:
-
-@cartouche
-@smallexample
-fftw_plan plan_cpu[STARPU_NMAXWORKERS];
-@end smallexample
-@end cartouche
-
-At initialisation time of libstarpufft, the objects are initialized:
-
-@cartouche
-@smallexample
-int workerid;
-for (workerid = 0; workerid < starpu_worker_get_count(); workerid++) @{
-    switch (starpu_worker_get_type(workerid)) @{
-        case STARPU_CPU_WORKER:
-            plan_cpu[workerid] = fftw_plan(...);
-            break;
-    @}
-@}
-@end smallexample
-@end cartouche
-
-And in the codelet body, they are used:
-
-@cartouche
-@smallexample
-static void fft(void *descr[], void *_args)
-@{
-    int workerid = starpu_worker_get_id();
-    fftw_plan plan = plan_cpu[workerid];
-    ...
-
-    fftw_execute(plan, ...);
-@}
-@end smallexample
-@end cartouche
-
-Another approach, which may be needed, is to execute some code from the workers
-themselves thanks to @code{starpu_execute_on_each_worker}. This may be required
-for CUDA to behave properly due to threading issues. For instance, StarPU's
-@code{starpu_cublas_init} looks like the following to call
-@code{cublasInit} from the workers themselves:
-
-@cartouche
-@smallexample
-static void init_cublas_func(void *args STARPU_ATTRIBUTE_UNUSED)
-@{
-    cublasStatus cublasst = cublasInit();
-    cublasSetKernelStream(starpu_cuda_get_local_stream());
-@}
-void starpu_cublas_init(void)
-@{
-    starpu_execute_on_each_worker(init_cublas_func, NULL, STARPU_CUDA);
-@}
-@end smallexample
-@end cartouche
-
-@node Limit memory
-@section How to limit memory per node
-
-TODO
-
-Talk about
-@code{STARPU_LIMIT_CUDA_devid_MEM}, @code{STARPU_LIMIT_CUDA_MEM},
-@code{STARPU_LIMIT_OPENCL_devid_MEM}, @code{STARPU_LIMIT_OPENCL_MEM}
-and @code{STARPU_LIMIT_CPU_MEM}
-
-@code{starpu_memory_get_available}
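-
-Until this section is written, here is a hedged sketch: the limits are set in the environment before launching the application (the unit is assumed to be MB, to be checked against the reference documentation, and the binary name is illustrative):
-
-@example
-$ STARPU_LIMIT_CUDA_MEM=1024 ./my_application
-@end example
-
-The function @code{starpu_memory_get_available} can then be called at runtime to query how much memory remains available on a given node.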
-
-@node Thread Binding on NetBSD
-@section Thread Binding on NetBSD
-
-When using StarPU on a NetBSD machine, if the topology
-discovery library @code{hwloc} is used, thread binding will fail. To
-prevent the problem, you should use at least version 1.7 of
-@code{hwloc}, and also issue the following call:
-
-@example
-$ sysctl -w security.models.extensions.user_set_cpu_affinity=1
-@end example
-
-Or add the following line to the file @code{/etc/sysctl.conf}:
-
-@example
-security.models.extensions.user_set_cpu_affinity=1
-@end example

+ 0 - 118
doc/texinfo/chapters/vector_scal_c.texi

@@ -1,118 +0,0 @@
-@c -*-texinfo-*-
-
-@c This file is part of the StarPU Handbook.
-@c Copyright (C) 2009-2011, 2013  Université de Bordeaux 1
-@c Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
-@c See the file starpu.texi for copying conditions.
-
-@smallexample
-/*
- * This example demonstrates how to use StarPU to scale an array by a factor.
- * It shows how to manipulate data with StarPU's data management library.
- *  1- how to declare a piece of data to StarPU (starpu_vector_data_register)
- *  2- how to describe which data are accessed by a task (task->handles[0])
- *  3- how a kernel can manipulate the data (buffers[0].vector.ptr)
- */
-#include <starpu.h>
-
-#define    NX    2048
-
-extern void scal_cpu_func(void *buffers[], void *_args);
-extern void scal_sse_func(void *buffers[], void *_args);
-extern void scal_cuda_func(void *buffers[], void *_args);
-extern void scal_opencl_func(void *buffers[], void *_args);
-
-static struct starpu_codelet cl = @{
-    .where = STARPU_CPU | STARPU_CUDA | STARPU_OPENCL,
-    /* CPU implementation of the codelet */
-    .cpu_funcs = @{ scal_cpu_func, scal_sse_func, NULL @},
-    .cpu_funcs_name = @{ "scal_cpu_func", "scal_sse_func", NULL @},
-#ifdef STARPU_USE_CUDA
-    /* CUDA implementation of the codelet */
-    .cuda_funcs = @{ scal_cuda_func, NULL @},
-#endif
-#ifdef STARPU_USE_OPENCL
-    /* OpenCL implementation of the codelet */
-    .opencl_funcs = @{ scal_opencl_func, NULL @},
-#endif
-    .nbuffers = 1,
-    .modes = @{ STARPU_RW @}
-@};
-
-#ifdef STARPU_USE_OPENCL
-struct starpu_opencl_program programs;
-#endif
-
-int main(int argc, char **argv)
-@{
-    /* We consider a vector of floats that is initialized just like any
-      * other C data */
-    float vector[NX];
-    unsigned i;
-    for (i = 0; i < NX; i++)
-        vector[i] = 1.0f;
-
-    fprintf(stderr, "BEFORE: First element was %f\n", vector[0]);
-
-    /* Initialize StarPU with default configuration */
-    starpu_init(NULL);
-
-#ifdef STARPU_USE_OPENCL
-        starpu_opencl_load_opencl_from_file(
-               "examples/basic_examples/vector_scal_opencl_kernel.cl", &programs, NULL);
-#endif
-
-    /* Tell StarPU to associate the "vector" vector with the "vector_handle"
-     * identifier. When a task needs to access a piece of data, it should
-     * refer to the handle that is associated to it.
-     * In the case of the "vector" data interface:
-     *  - the first argument of the registration method is a pointer to the
-     *    handle that should describe the data
-     *  - the second argument is the memory node where the data (i.e. "vector")
-     *    resides initially: STARPU_MAIN_RAM stands for an address in main memory, as
-     *    opposed to an address on a GPU for instance.
-     *  - the third argument is the address of the vector in RAM
-     *  - the fourth argument is the number of elements in the vector
-     *  - the fifth argument is the size of each element.
-     */
-    starpu_data_handle_t vector_handle;
-    starpu_vector_data_register(&vector_handle, STARPU_MAIN_RAM, (uintptr_t)vector,
-                                NX, sizeof(vector[0]));
-
-    float factor = 3.14;
-
-    /* create a synchronous task: any call to starpu_task_submit will block
-      * until it is terminated */
-    struct starpu_task *task = starpu_task_create();
-    task->synchronous = 1;
-
-    task->cl = &cl;
-
-    /* the codelet manipulates one buffer in RW mode */
-    task->handles[0] = vector_handle;
-
-    /* an argument is passed to the codelet, beware that this is a
-     * READ-ONLY buffer and that the codelet may be given a pointer to a
-     * COPY of the argument */
-    task->cl_arg = &factor;
-    task->cl_arg_size = sizeof(factor);
-
-    /* execute the task on any eligible computational resource */
-    starpu_task_submit(task);
-
-    /* StarPU does not need to manipulate the array anymore so we can stop
-      * monitoring it */
-    starpu_data_unregister(vector_handle);
-
-#ifdef STARPU_USE_OPENCL
-    starpu_opencl_unload_opencl(&programs);
-#endif
-
-    /* terminate StarPU, no task can be submitted after */
-    starpu_shutdown();
-
-    fprintf(stderr, "AFTER First element is %f\n", vector[0]);
-
-    return 0;
-@}
-@end smallexample

+ 0 - 68
doc/texinfo/chapters/vector_scal_cpu.texi

@@ -1,68 +0,0 @@
-@c -*-texinfo-*-
-
-@c This file is part of the StarPU Handbook.
-@c Copyright (C) 2009-2011  Université de Bordeaux 1
-@c Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
-@c See the file starpu.texi for copying conditions.
-
-@smallexample
-#include <starpu.h>
-#include <xmmintrin.h>
-
-/* This kernel takes a buffer and scales it by a constant factor */
-void scal_cpu_func(void *buffers[], void *cl_arg)
-@{
-    unsigned i;
-    float *factor = cl_arg;
-
-    /*
-     * The "buffers" array matches the task->handles array: for instance
-     * task->handles[0] is a handle that corresponds to a data with
-     * vector "interface", so that the first entry of the array in the
-     * codelet is a pointer to a structure describing such a vector (i.e.
-     * struct starpu_vector_interface *). Here, we therefore manipulate
-     * the buffers[0] element as a vector: nx gives the number of elements
-     * in the array, ptr gives the location of the array (that was possibly
-     * migrated/replicated), and elemsize gives the size of each element.
-     */
-    struct starpu_vector_interface *vector = buffers[0];
-
-    /* length of the vector */
-    unsigned n = STARPU_VECTOR_GET_NX(vector);
-
-    /* get a pointer to the local copy of the vector: note that we have to
-     * cast it in (float *) since a vector could contain any type of
-     * elements so that the .ptr field is actually a uintptr_t */
-    float *val = (float *)STARPU_VECTOR_GET_PTR(vector);
-
-    /* scale the vector */
-    for (i = 0; i < n; i++)
-        val[i] *= *factor;
-@}
-
-void scal_sse_func(void *buffers[], void *cl_arg)
-@{
-    float *vector = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);
-    unsigned int n = STARPU_VECTOR_GET_NX(buffers[0]);
-    unsigned int n_iterations = n/4;
-
-    __m128 *VECTOR = (__m128*) vector;
-    __m128 FACTOR __attribute__((aligned(16)));
-    float factor = *(float *) cl_arg;
-    FACTOR = _mm_set1_ps(factor);
-
-    unsigned int i;
-    for (i = 0; i < n_iterations; i++)
-        VECTOR[i] = _mm_mul_ps(FACTOR, VECTOR[i]);
-
-    unsigned int remainder = n%4;
-    if (remainder != 0)
-    @{
-        unsigned int start = 4 * n_iterations;
-        for (i = start; i < start+remainder; ++i)
-        @{
-            vector[i] = factor * vector[i];
-        @}
-    @}
-@}
-@end smallexample

+ 0 - 35
doc/texinfo/chapters/vector_scal_cuda.texi

@@ -1,35 +0,0 @@
-@c -*-texinfo-*-
-
-@c This file is part of the StarPU Handbook.
-@c Copyright (C) 2009-2012  Université de Bordeaux 1
-@c Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
-@c See the file starpu.texi for copying conditions.
-
-@smallexample
-#include <starpu.h>
-
-static __global__ void vector_mult_cuda(unsigned n, float *val,
-                                        float factor)
-@{
-        unsigned i =  blockIdx.x*blockDim.x + threadIdx.x;
-        if (i < n)
-               val[i] *= factor;
-@}
-
-extern "C" void scal_cuda_func(void *buffers[], void *_args)
-@{
-        float *factor = (float *)_args;
-
-        /* length of the vector */
-        unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
-        /* local copy of the vector pointer */
-        float *val = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);
-        unsigned threads_per_block = 64;
-        unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
-
-        vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>
-	                (n, val, *factor);
-
-        cudaStreamSynchronize(starpu_cuda_get_local_stream());
-@}
-@end smallexample

+ 0 - 61
doc/texinfo/chapters/vector_scal_opencl.texi

@@ -1,61 +0,0 @@
-@c -*-texinfo-*-
-
-@c This file is part of the StarPU Handbook.
-@c Copyright (C) 2009-2011  Université de Bordeaux 1
-@c Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
-@c See the file starpu.texi for copying conditions.
-
-@smallexample
-#include <starpu.h>
-
-extern struct starpu_opencl_program programs;
-
-void scal_opencl_func(void *buffers[], void *_args)
-@{
-    float *factor = _args;
-    int id, devid, err;
-    cl_kernel kernel;
-    cl_command_queue queue;
-    cl_event event;
-
-    /* length of the vector */
-    unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
-    /* OpenCL copy of the vector pointer */
-    cl_mem val = (cl_mem)STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
-
-    id = starpu_worker_get_id();
-    devid = starpu_worker_get_devid(id);
-
-    err = starpu_opencl_load_kernel(&kernel, &queue, &programs, "vector_mult_opencl",
-                                    devid);
-    if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
-
-    err = clSetKernelArg(kernel, 0, sizeof(n), &n);
-    err |= clSetKernelArg(kernel, 1, sizeof(val), &val);
-    err |= clSetKernelArg(kernel, 2, sizeof(*factor), factor);
-    if (err) STARPU_OPENCL_REPORT_ERROR(err);
-
-    @{
-        size_t global=n;
-        size_t local;
-        size_t s;
-        cl_device_id device;
-
-        starpu_opencl_get_device(devid, &device);
-        err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE,
-                                        sizeof(local), &local, &s);
-        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
-        if (local > global) local=global;
-
-        err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0,
-                                     NULL, &event);
-        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
-    @}
-
-    clFinish(queue);
-    starpu_opencl_collect_stats(event);
-    clReleaseEvent(event);
-
-    starpu_opencl_release_kernel(kernel);
-@}
-@end smallexample

+ 0 - 16
doc/texinfo/chapters/vector_scal_opencl_codelet.texi

@@ -1,16 +0,0 @@
-@c -*-texinfo-*-
-
-@c This file is part of the StarPU Handbook.
-@c Copyright (C) 2009-2011  Université de Bordeaux 1
-@c Copyright (C) 2010, 2011, 2013  Centre National de la Recherche Scientifique
-@c See the file starpu.texi for copying conditions.
-
-@smallexample
-__kernel void vector_mult_opencl(int nx, __global float* val, float factor)
-@{
-        const int i = get_global_id(0);
-        if (i < nx) @{
-                val[i] *= factor;
-        @}
-@}
-@end smallexample

+ 0 - 40
doc/texinfo/dev/starpu_check_documented.py

@@ -1,40 +0,0 @@
-#!/usr/bin/python
-
-import os
-
-class bcolors:
-    FAILURE = '\033[91m'
-    NORMAL = '\033[0m'
-
-def loadFunctionsAndDatatypes(flist, dtlist, fname):
-    f = open(fname, 'r')
-    for line in f:
-        mline = line[:-1]
-        if mline.count("@deftypefun "):
-            if mline.count("fft") == 0:
-                func = mline.replace("@deftypefun ", "").replace("*} ", "*").replace("@var{", "").replace("}", "").replace("{", "").replace(" (", "(", 1)
-                flist.append(list([func, fname]))
-        if mline.count("@deftp"):
-            datatype = mline.replace("@deftp {Data Type} {", "").replace("}", "")
-            dtlist.append(list([datatype, fname]))
-    f.close()
-
-functions = []
-datatypes = []
-
-loadFunctionsAndDatatypes(functions, datatypes, "doc/starpu.texi")
-for docfile in os.listdir('doc/chapters'):
-    if docfile.count(".texi"):
-        loadFunctionsAndDatatypes(functions, datatypes, "doc/chapters/"+docfile)
-
-for function in functions:
-    x = os.system("fgrep -l \"" + function[0] + "\" include/*.h mpi/include/*.h starpufft/*h sc_hypervisor/include/*.h > /dev/null")
-    if x != 0:
-        print "Function <" + bcolors.FAILURE + function[0] + bcolors.NORMAL + " > documented in <" + function[1] + "> does not exist in StarPU's API"
-
-for datatype in datatypes:
-    x = os.system("fgrep -l \"" + datatype[0] + "\" include/*.h mpi/include/*.h starpufft/*h sc_hypervisor/include/*.h > /dev/null")
-    if x != 0:
-        print "Datatype <" + bcolors.FAILURE + datatype[0] + bcolors.NORMAL + "> documented in <" + datatype[1] + "> does not exist in StarPU's API"
-
-

+ 0 - 78
doc/texinfo/dev/starpu_check_undocumented.sh

@@ -1,78 +0,0 @@
-#!/bin/bash
-# Note: expects Coccinelle's spatch command in the PATH
-# See: http://coccinelle.lip6.fr/
-
-# StarPU --- Runtime system for heterogeneous multicore architectures.
-#
-# Copyright (C) 2011, 2012, 2013 Centre National de la Recherche Scientifique
-# Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
-#
-# StarPU is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Lesser General Public License as published by
-# the Free Software Foundation; either version 2.1 of the License, or (at
-# your option) any later version.
-#
-# StarPU is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-#
-# See the GNU Lesser General Public License in COPYING.LGPL for more details.
-
-stcolor=$(tput sgr0)
-redcolor=$(tput setaf 1)
-greencolor=$(tput setaf 2)
-
-H_FILES=$(find include mpi/include -name '*.h')
-
-functions=$(spatch -very_quiet -sp_file tools/dev/starpu_funcs.cocci $H_FILES)
-for func in $functions ; do
-	fname=$(echo $func|awk -F ',' '{print $1}')
-	location=$(echo $func|awk -F ',' '{print $2}')
-	x=$(grep "$fname (" doc/starpu.texi doc/chapters/*texi | grep deftypefun)
-	if test "$x" == "" ; then
-		echo "function ${redcolor}${fname}${stcolor} at location ${redcolor}$location${stcolor} is not (or incorrectly) documented"
-#	else
-#		echo "function ${greencolor}${fname}${stcolor} at location $location is correctly documented"
-	fi
-done
-
-echo
-
-structs=$(grep "struct starpu" $H_FILES | grep -v "[;|,|(|)]" | awk '{print $2}')
-for struct in $structs ; do
-    x=$(grep "$struct\b" doc/starpu.texi doc/chapters/*texi | grep deftp)
-    if test "$x" == "" ; then
-	echo "struct ${redcolor}${struct}${stcolor} is not (or incorrectly) documented"
-    fi
-done
-
-echo
-
-enums=$(grep "enum starpu" $H_FILES | grep -v "[;|,|(|)]" | awk '{print $2}')
-for enum in $enums ; do
-    x=$(grep "$enum\b" doc/starpu.texi doc/chapters/*texi | grep deftp)
-    if test "$x" == "" ; then
-	echo "enum ${redcolor}${enum}${stcolor} is not (or incorrectly) documented"
-    fi
-done
-
-echo
-
-macros=$(grep "define\b" $H_FILES |grep -v deprecated|grep "#" | grep -v "__" | sed 's/#[ ]*/#/g' | awk '{print $2}' | awk -F'(' '{print $1}' | sort|uniq)
-for macro in $macros ; do
-    x=$(grep "$macro\b" doc/starpu.texi doc/chapters/*texi | grep defmac)
-    if test "$x" == "" ; then
-	echo "macro ${redcolor}${macro}${stcolor} is not (or incorrectly) documented"
-    fi
-done
-
-echo
-
-variables=$(grep --exclude-dir=.svn -rs -E "(getenv|get_env)" src/| tr ' ' '\012'|grep -E "(getenv|get_env)" | grep "\"" | sed 's/.*("//' | sed 's/").*//'|sort|uniq)
-for variable in $variables ; do
-    x=$(grep "$variable" doc/starpu.texi doc/chapters/*texi | grep defvr)
-    if test "$x" == "" ; then
-	echo "variable ${redcolor}${variable}${stcolor} is not (or incorrectly) documented"
-    fi
-done
-

+ 0 - 28
doc/texinfo/dev/starpu_funcs.cocci

@@ -1,28 +0,0 @@
-// StarPU --- Runtime system for heterogeneous multicore architectures.
-//
-// Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
-//
-// StarPU is free software; you can redistribute it and/or modify
-// it under the terms of the GNU Lesser General Public License as published by
-// the Free Software Foundation; either version 2.1 of the License, or (at
-// your option) any later version.
-//
-// StarPU is distributed in the hope that it will be useful, but
-// WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-//
-// See the GNU Lesser General Public License in COPYING.LGPL for more details.
-
-@starpufunc@
-position p;
-type t;
-identifier f =~ "starpu";
-@@
-
-t f@p( ... );
-
-@ script:python @
-p << starpufunc.p;
-f << starpufunc.f;
-@@
-print "%s,%s:%s" % (f,p[0].file,p[0].line)

+ 0 - 160
doc/texinfo/starpu.css

@@ -1,160 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2009-2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
- *
- * Permission is granted to copy, distribute and/or modify this document
- * under the terms of the GNU Free Documentation License, Version 1.3
- * or any later version published by the Free Software Foundation;
- * with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts.
- * See the GNU Free Documentation License in COPYING.GFDL for more details.
- */
-
-body {
-	font-family: sans-serif;
-/*	margin-top: 0px; */
-}
-
-div.node {
-	text-align: center;
-	margin-top: 12px;
-	margin-bottom: 3px;
-	background: #eeeeff;
-	font-variant: small-caps;
-/*	position: fixed;*/
-	width: 100%;
-}
-div.node a {
-	text-decoration: none;
-	color: #0020a0;
-}
-div.node hr.node {
-	height: 4px;
-	background: #fe0;
-	border: 0px;
-	margin-top: 0px;
-	margin-bottom: 0px;
-}
-
-h1 {
-	font: bold normal 2.5em sans-serif ;
-	margin: 0px;
-	color: rgb(226,0,38);
-}
-h1.sub {
-	font: bold normal 2em sans-serif ;
-	text-align: right ;
-	color: #0020a0;
-}
-h1 a {
-	color: #0020a0;
-	text-decoration: none;
-}
-
-h2 {
-	font: bold normal 2em sans-serif ;
-	margin: 0px;
-	color: #0020a0;
-}
-h2.sub {
-	font: bold normal 2em sans-serif ;
-	text-align: right ;
-	color: #0020a0;
-}
-h2 a {
-	color: #0020a0;
-	text-decoration: none;
-}
-
-h3 {
-	font: bold normal small-caps 1.5em sans-serif ;
-	color: #0020a0;
-	margin-top: 8px;
-	margin-bottom: 8px;
-}
-
-h4 {
-	font: bold normal small-caps 1em sans-serif ;
-	color: #0020a0;
-	margin-top: 8px;
-	margin-bottom: 4px;
-}
-
-h6.mirrors {
-	text-align: right;
-	margin: 0px;
-	font-size: 10px;
-}
-
-div.section {
-	background: #eeeeff;
-	padding-left: 2px;
-	padding-bottom: 2px;
-	margin-top: 12px;
-	margin-bottom: 12px;
-}
-
-p {
-	margin-top: 8px;
-	margin-bottom: 4px;
-	margin-left: 6px;
-	margin-right: 6px;
-}
-
-hr {
-	height: 8px;
-	background: #fe0;
-	border: 0px;
-	margin-top: 6px;
-	margin-bottom: 6px;
-}
-
-pre {
-	font-size: 12px;
-	background: #dddddd;
-	padding: 3px;
-	padding-left: 0px;
-	margin-left: 12px;
-}
-
-table.cartouche {
-	font-size: 12px;
-	background: #dddddd;
-	padding: 3px;
-	padding-left: 0px;
-	margin-left: 12px;
-}
-
-a {
-	font-weight: bold;
-}
-
-div.publis-desc {
-	text-align: right;
-	font-style: italic;
-	font-size: 12px;
-	padding-left: 15%;
-}
-
-p.updated {
-	text-align: right;
-	font-size: 10px;
-	font-style: italic;
-}
-
-div.contents {
-	margin-top: 12px;
-	margin-bottom: 3px;
-	font-variant: small-caps;
-	padding-left: 1em;
-	padding-right: 1em;
-	padding-top: 1px;
-	padding-bottom: 1px;
-	margin-top: 12px;
-	margin-bottom: 12px;
-	margin-top:0px;
-	margin-left: auto;
-	margin-right: auto;
-	border-top: 4px solid rgb(204,209,222);
-	border-bottom: 4px solid rgb(204,209,222);
-}

+ 0 - 272
doc/texinfo/starpu.texi

@@ -1,272 +0,0 @@
-\input texinfo @c -*-texinfo-*-
-
-@c %**start of header
-@setfilename starpu.info
-@settitle StarPU Handbook
-@c %**end of header
-
-@include chapters/version.texi
-
-@copying
-Copyright @copyright{} 2009--2013  Universit@'e de Bordeaux 1
-
-@noindent
-Copyright @copyright{} 2010--2013  Centre National de la Recherche Scientifique
-
-@noindent
-Copyright @copyright{} 2011, 2012 Institut National de Recherche en Informatique et Automatique
-
-@quotation
-Permission is granted to copy, distribute and/or modify this document
-under the terms of the GNU Free Documentation License, Version 1.3
-or any later version published by the Free Software Foundation;
-with no Invariant Sections, no Front-Cover Texts, and no Back-Cover
-Texts.  A copy of the license is included in the section entitled ``GNU
-Free Documentation License''.
-@end quotation
-@end copying
-
-@setchapternewpage odd
-@dircategory Development
-@direntry
-* StarPU: (starpu).             StarPU Handbook
-@end direntry
-
-@titlepage
-@title StarPU Handbook
-@subtitle for StarPU @value{VERSION}
-
-@page
-@vskip 0pt plus 1fill
-
-@insertcopying
-
-@end titlepage
-
-@c @summarycontents
-@contents
-@page
-
-@ifnottex
-@node Top
-@top StarPU Handbook
-@end ifnottex
-
-This manual documents the usage of StarPU version @value{VERSION}.  It
-was last updated on @value{UPDATED}.
-
-@ifnottex
-@insertcopying
-@end ifnottex
-
-@comment
-@comment  When you add a new menu item, please keep the right hand
-@comment  aligned to the same column.  Do not use tabs.  This provides
-@comment  better formatting.
-@comment
-@menu
-* Introduction::                Getting started
-* Building and Installing StarPU::
-* Basic Examples::              Basic examples of the use of StarPU
-* Advanced Examples::           Advanced examples of the use of StarPU
-* Performance optimization::    How to optimize performance with StarPU
-* Performance feedback::        Performance debugging tools
-* Tips and Tricks::             Tips and tricks to know about
-* StarPU MPI support::          How to combine StarPU with MPI
-* StarPU FFT support::          How to perform FFT computations with StarPU
-* StarPU MIC/SCC support::      How to build and run StarPU applications on MIC and SCC
-* C Extensions::                Easier StarPU programming with GCC
-* SOCL OpenCL Extensions::      How to use OpenCL on top of StarPU
-* Scheduling Contexts in StarPU::         How to use Scheduling Context of StarPU
-* Scheduling Context Hypervisor::  How to use Scheduling Context Hypervisor with StarPU
-* StarPU's API::                The API to use StarPU
-* Scheduling Context Hypervisor's API:: The API to use the Hypervisor
-* Configuration Options for StarPU::
-* Full source code for the 'Scaling a Vector' example::
-* GNU Free Documentation License::  How you can copy and share this manual.
-
-* Concept Index::               Index of programming concepts.
-* Function Index::              Index of C functions.
-* Datatype Index::              Index of C datatypes.
-* Configuration Index::         Index of configuration options.
-@end menu
-
-@c ---------------------------------------------------------------------
-@c Introduction to StarPU
-@c ---------------------------------------------------------------------
-
-@node Introduction
-@chapter Introduction to StarPU
-@include chapters/introduction.texi
-
-@c ---------------------------------------------------------------------
-@c Installing StarPU
-@c ---------------------------------------------------------------------
-
-@node Building and Installing StarPU
-@chapter Building and Installing StarPU
-@include chapters/installing.texi
-
-@c ---------------------------------------------------------------------
-@c Basic Examples
-@c ---------------------------------------------------------------------
-
-@node Basic Examples
-@chapter Basic Examples
-@include chapters/basic-examples.texi
-
-@c ---------------------------------------------------------------------
-@c Advanced Examples
-@c ---------------------------------------------------------------------
-
-@node Advanced Examples
-@chapter Advanced Examples
-@include chapters/advanced-examples.texi
-
-@c ---------------------------------------------------------------------
-@c Performance options
-@c ---------------------------------------------------------------------
-
-@node Performance optimization
-@chapter How to optimize performance with StarPU
-@include chapters/perf-optimization.texi
-
-@c ---------------------------------------------------------------------
-@c Performance feedback
-@c ---------------------------------------------------------------------
-
-@node Performance feedback
-@chapter Performance feedback
-@include chapters/perf-feedback.texi
-
-@c ---------------------------------------------------------------------
-@c Tips and Tricks
-@c ---------------------------------------------------------------------
-
-@node Tips and Tricks
-@chapter Tips and Tricks to know about
-@include chapters/tips-tricks.texi
-
-@c ---------------------------------------------------------------------
-@c MPI support
-@c ---------------------------------------------------------------------
-
-@node StarPU MPI support
-@chapter StarPU MPI support
-@include chapters/mpi-support.texi
-
-@c ---------------------------------------------------------------------
-@c FFT support
-@c ---------------------------------------------------------------------
-
-@node StarPU FFT support
-@chapter StarPU FFT support
-@include chapters/fft-support.texi
-
-@c ---------------------------------------------------------------------
-@c MIC/SCC support
-@c ---------------------------------------------------------------------
-
-@node StarPU MIC/SCC support
-@chapter StarPU MIC/SCC support
-@include chapters/mic-scc-support.texi
-
-@c ---------------------------------------------------------------------
-@c C Extensions
-@c ---------------------------------------------------------------------
-
-@node C Extensions
-@chapter C Extensions
-@include chapters/c-extensions.texi
-
-@c ---------------------------------------------------------------------
-@c SOCL
-@c ---------------------------------------------------------------------
-
-@node SOCL OpenCL Extensions
-@chapter SOCL OpenCL Extensions
-@include chapters/socl.texi
-
-@c ---------------------------------------------------------------------
-@c Scheduling Contexts in StarPU
-@c ---------------------------------------------------------------------
-
-@node Scheduling Contexts in StarPU
-@chapter Scheduling Contexts in StarPU
-@include chapters/sched_ctx.texi
-
-@c ---------------------------------------------------------------------
-@c Scheduling Context Hypervisor
-@c ---------------------------------------------------------------------
-
-@node Scheduling Context Hypervisor
-@chapter Scheduling Context Hypervisor
-@include chapters/sc_hypervisor.texi
-
-@c ---------------------------------------------------------------------
-@c StarPU API
-@c ---------------------------------------------------------------------
-
-@node StarPU's API
-@chapter StarPU's API
-@include chapters/api.texi
-
-@c ---------------------------------------------------------------------
-@c Scheduling Context Hypervisor's API
-@c ---------------------------------------------------------------------
-
-@node Scheduling Context Hypervisor's API
-@chapter Scheduling Context Hypervisor's API
-@include chapters/hypervisor_api.texi
-
-@c ---------------------------------------------------------------------
-@c Configuration options
-@c ---------------------------------------------------------------------
-
-@node Configuration Options for StarPU
-@chapter Configuration Options for StarPU
-@include chapters/configuration.texi
-
-@c ---------------------------------------------------------------------
-@c Appendices
-@c ---------------------------------------------------------------------
-
-@c ---------------------------------------------------------------------
-@c Full source code for the 'Scaling a Vector' example
-@c ---------------------------------------------------------------------
-
-@node Full source code for the 'Scaling a Vector' example
-@appendix Full source code for the 'Scaling a Vector' example
-@include chapters/scaling-vector-example.texi
-
-@c ---------------------------------------------------------------------
-@c License
-@c ---------------------------------------------------------------------
-
-@node GNU Free Documentation License
-@appendix GNU Free Documentation License
-@include chapters/fdl-1.3.texi
-
-@c ---------------------------------------------------------------------
-@c Indices
-@c ---------------------------------------------------------------------
-
-@c comment it out for now, it is too small to be kept for now. See how
-@c it can be merged with the glossary section in the introduction
-@node Concept Index
-@unnumbered Concept Index
-@printindex cp
-
-@node Function Index
-@unnumbered Function Index
-@printindex fn
-
-@node Datatype Index
-@unnumbered Datatype Index
-@printindex tp
-
-@node Configuration Index
-@unnumbered Configuration Index
-@printindex vr
-
-@bye

+ 3 - 0
examples/Makefile.am

@@ -49,6 +49,7 @@ EXTRA_DIST = 					\
 	sched_ctx_utils/sched_ctx_utils.c			\
 	sched_ctx/sched_ctx.c					\
 	sched_ctx/parallel_code.c				\
+	sched_ctx/dummy_sched_with_ctx.c			\
 	incrementer/incrementer_kernels_opencl_kernel.cl 	\
 	basic_examples/variable_kernels_opencl_kernel.cl	\
 	matvecmult/matvecmult_kernel.cl				\
@@ -188,6 +189,7 @@ examplebin_PROGRAMS +=				\
 	profiling/profiling			\
 	sched_ctx/sched_ctx			\
 	sched_ctx/parallel_code			\
+	sched_ctx/dummy_sched_with_ctx		\
 	reductions/dot_product			\
 	reductions/minmax_reduction		\
 	mandelbrot/mandelbrot			\
@@ -262,6 +264,7 @@ STARPU_EXAMPLES +=				\
 	scheduler/dummy_sched			\
 	sched_ctx/sched_ctx			\
 	sched_ctx/parallel_code			\
+	sched_ctx/dummy_sched_with_ctx		\
 	reductions/dot_product			\
 	reductions/minmax_reduction
 

+ 183 - 0
examples/sched_ctx/dummy_sched_with_ctx.c

@@ -0,0 +1,183 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <starpu_scheduler.h>
+
+#define NTASKS	32000
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
+struct dummy_sched_data
+{
+	struct starpu_task_list sched_list;
+     	starpu_pthread_mutex_t policy_mutex;
+};
+
+static void init_dummy_sched(unsigned sched_ctx_id)
+{
+	starpu_sched_ctx_create_worker_collection(sched_ctx_id, STARPU_WORKER_LIST);
+
+	struct dummy_sched_data *data = (struct dummy_sched_data*)malloc(sizeof(struct dummy_sched_data));
+
+
+	/* Create a linked-list of tasks and a condition variable to protect it */
+	starpu_task_list_init(&data->sched_list);
+
+	starpu_sched_ctx_set_policy_data(sched_ctx_id, (void*)data);
+
+	starpu_pthread_mutex_init(&data->policy_mutex, NULL);
+	FPRINTF(stderr, "Initialising Dummy scheduler\n");
+}
+
+static void deinit_dummy_sched(unsigned sched_ctx_id)
+{
+	struct dummy_sched_data *data = (struct dummy_sched_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
+
+	STARPU_ASSERT(starpu_task_list_empty(&data->sched_list));
+
+	starpu_sched_ctx_delete_worker_collection(sched_ctx_id);
+
+	starpu_pthread_mutex_destroy(&data->policy_mutex);
+
+	free(data);
+
+	FPRINTF(stderr, "Destroying Dummy scheduler\n");
+}
+
+static int push_task_dummy(struct starpu_task *task)
+{
+	unsigned sched_ctx_id = task->sched_ctx;
+	struct dummy_sched_data *data = (struct dummy_sched_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
+
+	/* NB: In this simplistic strategy, we assume that the context in which
+	   we push a task has at least one worker */
+
+
+	/* lock all workers when pushing tasks on a list where all
+	   of them would pop for tasks */
+        starpu_pthread_mutex_lock(&data->policy_mutex);
+
+	starpu_task_list_push_front(&data->sched_list, task);
+
+	starpu_push_task_end(task);
+	starpu_pthread_mutex_unlock(&data->policy_mutex);
+
+
+        /* workers block when there are no tasks; */
+        /* wake those waiting for a task */
+        unsigned worker = 0;
+	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
+
+        struct starpu_sched_ctx_iterator it;
+        if(workers->init_iterator)
+		workers->init_iterator(workers, &it);
+
+	while(workers->has_next(workers, &it))
+        {
+                worker = workers->get_next(workers, &it);
+		starpu_pthread_mutex_t *sched_mutex;
+                starpu_pthread_cond_t *sched_cond;
+                starpu_worker_get_sched_condition(worker, &sched_mutex, &sched_cond);
+		starpu_pthread_mutex_lock(sched_mutex);
+                starpu_pthread_cond_signal(sched_cond);
+                starpu_pthread_mutex_unlock(sched_mutex);
+        }
+
+	return 0;
+}
+
+/* The mutex associated to the calling worker is already taken by StarPU */
+static struct starpu_task *pop_task_dummy(unsigned sched_ctx_id)
+{
+	/* NB: In this simplistic strategy, we assume that all workers are able
+	 * to execute all tasks, otherwise, it would have been necessary to go
+	 * through the entire list until we find a task that is executable from
+	 * the calling worker. So we just take the head of the list and give it
+	 * to the worker. */
+	struct dummy_sched_data *data = (struct dummy_sched_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
+	starpu_pthread_mutex_lock(&data->policy_mutex);
+	struct starpu_task *task = starpu_task_list_pop_back(&data->sched_list);
+	starpu_pthread_mutex_unlock(&data->policy_mutex);
+	return task;
+}
+
+static struct starpu_sched_policy dummy_sched_policy =
+{
+	.init_sched = init_dummy_sched,
+	.add_workers = NULL,
+	.remove_workers = NULL,
+	.deinit_sched = deinit_dummy_sched,
+	.push_task = push_task_dummy,
+	.pop_task = pop_task_dummy,
+	.post_exec_hook = NULL,
+	.pop_every_task = NULL,
+	.policy_name = "dummy",
+	.policy_description = "dummy scheduling strategy"
+};
+
+static void dummy_func(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg STARPU_ATTRIBUTE_UNUSED)
+{
+}
+
+static struct starpu_codelet dummy_codelet =
+{
+	.cpu_funcs = {dummy_func, NULL},
+	.cuda_funcs = {dummy_func, NULL},
+        .opencl_funcs = {dummy_func, NULL},
+	.model = NULL,
+	.nbuffers = 0,
+	.name = "dummy",
+};
+
+
+int main(int argc, char **argv)
+{
+	int ntasks = NTASKS;
+	int ret;
+/* 	struct starpu_conf conf; */
+
+/* 	starpu_conf_init(&conf); */
+/* 	conf.sched_policy = &dummy_sched_policy, */
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	unsigned sched_ctx = starpu_sched_ctx_create_with_custom_policy(&dummy_sched_policy, NULL, -1, "dummy");
+#ifdef STARPU_QUICK_CHECK
+	ntasks /= 100;
+#endif
+
+	starpu_sched_ctx_set_context(&sched_ctx);
+	int i;
+	for (i = 0; i < ntasks; i++)
+	{
+		struct starpu_task *task = starpu_task_create();
+
+		task->cl = &dummy_codelet;
+		task->cl_arg = NULL;
+
+		ret = starpu_task_submit(task);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	starpu_task_wait_for_all();
+
+	starpu_shutdown();
+
+	return 0;
+}

+ 3 - 0
include/starpu_perfmodel.h

@@ -171,6 +171,9 @@ void starpu_bus_print_affinity(FILE *f);
 
 double starpu_get_bandwidth_RAM_CUDA(unsigned cudadev);
 double starpu_get_latency_RAM_CUDA(unsigned cudadev);
+double starpu_get_bandwidth_CUDA_RAM(unsigned cudadev);
+double starpu_get_latency_CUDA_RAM(unsigned cudadev);
+
 
 #ifdef __cplusplus
 }

+ 8 - 1
include/starpu_sched_ctx.h

@@ -24,8 +24,12 @@ extern "C"
 {
 #endif
 
+
 unsigned starpu_sched_ctx_create(const char *policy_name, int *workerids_ctx, int nworkers_ctx, const char *sched_ctx_name);
 
+struct starpu_sched_policy;
+unsigned starpu_sched_ctx_create_with_custom_policy(struct starpu_sched_policy *policy, int *workerids, int nworkers, const char *sched_name);
+
 unsigned starpu_sched_ctx_create_inside_interval(const char *policy_name, const char *sched_name, int min_ncpus, int max_ncpus, int min_ngpus, int max_ngpus, unsigned allow_overlap);
 
 void starpu_sched_ctx_add_workers(int *workerids_ctx, int nworkers_ctx, unsigned sched_ctx_id);
@@ -44,6 +48,7 @@ void starpu_sched_ctx_stop_task_submission(void);
 
 void starpu_sched_ctx_finished_submit(unsigned sched_ctx_id);
 
+unsigned starpu_sched_ctx_get_workers_list(unsigned sched_ctx_id, int **workerids);
 
 struct starpu_sched_ctx_performance_counters
 {
@@ -52,7 +57,7 @@ struct starpu_sched_ctx_performance_counters
 	void (*notify_pushed_task)(unsigned sched_ctx_id, int worker);
 	void (*notify_poped_task)(unsigned sched_ctx_id, int worker, struct starpu_task *task, size_t data_size, uint32_t footprint);
 	void (*notify_post_exec_hook)(unsigned sched_ctx_id, int taskid);
-	void (*notify_submitted_job)(struct starpu_task *task, uint32_t footprint);
+	void (*notify_submitted_job)(struct starpu_task *task, uint32_t footprint, size_t data_size);
 	void (*notify_delete_context)(unsigned sched_ctx);
 };
 
@@ -82,6 +87,8 @@ unsigned starpu_sched_ctx_get_nshared_workers(unsigned sched_ctx_id, unsigned sc
 
 unsigned starpu_sched_ctx_contains_worker(int workerid, unsigned sched_ctx_id);
 
+unsigned starpu_sched_ctx_contains_type_of_worker(enum starpu_worker_archtype arch, unsigned sched_ctx_id);
+
 unsigned starpu_sched_ctx_overlapping_ctxs_on_worker(int workerid);
 
 unsigned starpu_sched_ctx_is_ctxs_turn(int workerid, unsigned sched_ctx_id);

+ 21 - 17
mpi/src/starpu_mpi.c

@@ -766,8 +766,9 @@ static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req)
 
 	_STARPU_MPI_LOG_IN();
 
-	_STARPU_MPI_DEBUG(2, "complete MPI request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n",
-			  req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr, _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
+	_STARPU_MPI_DEBUG(2, "complete MPI request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d internal_req %p\n",
+			  req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr,
+			  _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype, req->internal_req);
 
 	if (req->request_type == RECV_REQ || req->request_type == SEND_REQ)
 	{
@@ -812,12 +813,6 @@ static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req)
 		req->envelope = NULL;
 	}
 
-	if (req->internal_req)
-	{
-		free(req->internal_req);
-		req->internal_req = NULL;
-	}
-
 	/* Execute the specified callback, if any */
 	if (req->callback)
 		req->callback(req->callback_arg);
@@ -869,12 +864,13 @@ static void _starpu_mpi_copy_cb(void* arg)
 	starpu_data_unregister_submit(args->copy_handle);
 
 	_STARPU_MPI_DEBUG(3, "Done, handling request %p termination of the already received request\n",args->req);
+	// If the request is detached, we need to call _starpu_mpi_handle_request_termination
+	// ourselves, as it will not be called automatically: the request is not in the detached_requests list
 	if (args->req->detached)
 		_starpu_mpi_handle_request_termination(args->req);
 	// else: If the request is not detached its termination will
 	// be handled when calling starpu_mpi_wait
 
-
 	free(args);
 }
 
@@ -976,13 +972,18 @@ static unsigned _starpu_mpi_progression_hook_func(void *arg STARPU_ATTRIBUTE_UNU
 {
 	unsigned may_block = 1;
 
-	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+	STARPU_PTHREAD_MUTEX_LOCK(&detached_requests_mutex);
 	if (!_starpu_mpi_req_list_empty(detached_requests))
 	{
+		STARPU_PTHREAD_MUTEX_UNLOCK(&detached_requests_mutex);
+		STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 		STARPU_PTHREAD_COND_SIGNAL(&cond_progression);
+		STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 		may_block = 0;
 	}
-	STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+	else
+		STARPU_PTHREAD_MUTEX_UNLOCK(&detached_requests_mutex);
+
 
 	return may_block;
 }
@@ -1038,6 +1039,9 @@ static void _starpu_mpi_test_detached_requests(void)
 		if (flag)
 		{
 			_starpu_mpi_req_list_erase(detached_requests, req);
+#ifdef STARPU_DEVEL
+#warning FIXME: when do we free internal requests
+#endif
 			if (!req->is_internal_req)
 				free(req);
 		}
@@ -1130,12 +1134,12 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 	}
 
 	{
-	     int rank, worldsize;
-	     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-	     MPI_Comm_size(MPI_COMM_WORLD, &worldsize);
-	     TRACE_MPI_START(rank, worldsize);
+		int rank, worldsize;
+		MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+		MPI_Comm_size(MPI_COMM_WORLD, &worldsize);
+		TRACE_MPI_START(rank, worldsize);
 #ifdef STARPU_USE_FXT
-	     starpu_profiling_set_id(rank);
+		starpu_profiling_set_id(rank);
 #endif //STARPU_USE_FXT
 	}
 
@@ -1267,7 +1271,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 				 * the data handle, then submit the corresponding receive with _starpu_mpi_handle_new_request. */
 				else
 				{
-					_STARPU_MPI_DEBUG(3, "Found !\n");
+					_STARPU_MPI_DEBUG(3, "A matching receive has been found for the incoming data with tag %d\n", recv_env->mpi_tag);
 
 					delete_req(found_req);
 

+ 3 - 0
mpi/tests/Makefile.am

@@ -198,5 +198,8 @@ mpi_reduction_SOURCES += mpi_reduction_kernels.c
 user_defined_datatype_SOURCES = user_defined_datatype.c
 user_defined_datatype_SOURCES += $(top_srcdir)/examples/interface/complex_interface.c
 
+mpi_earlyrecv2_SOURCES = mpi_earlyrecv2.c
+mpi_earlyrecv2_SOURCES += $(top_srcdir)/examples/interface/complex_interface.c
+
 showcheck:
 	-cat $(TEST_LOGS) /dev/null

+ 2 - 0
mpi/tests/datatypes.c

@@ -239,6 +239,7 @@ int main(int argc, char **argv)
 
 			starpu_data_unregister(matrix_handle[0]);
 			starpu_data_unregister(matrix_handle[1]);
+			free(matrix);
 		}
 
 		{
@@ -266,6 +267,7 @@ int main(int argc, char **argv)
 
 			starpu_data_unregister(block_handle[0]);
 			starpu_data_unregister(block_handle[1]);
+			free(block);
 		}
 	}
 	else if (rank == 1)

+ 159 - 41
mpi/tests/mpi_earlyrecv2.c

@@ -18,81 +18,199 @@
 #include <starpu_mpi.h>
 #include "helper.h"
 #include <unistd.h>
+#include <interface/complex_interface.h>
 
-//#define NB 1000
 #define NB 10
 
-int main(int argc, char **argv)
-{
-	int ret, rank, size, i;
-	starpu_data_handle_t tab_handle[NB];
-	int value[NB];
+static starpu_pthread_mutex_t mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
+static starpu_pthread_cond_t cond = STARPU_PTHREAD_COND_INITIALIZER;
 
-	MPI_Init(NULL, NULL);
-	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-	MPI_Comm_size(MPI_COMM_WORLD, &size);
-
-	if (size%2 != 0)
-	{
-		if (rank == 0)
-			FPRINTF(stderr, "We need a even number of processes.\n");
+void callback(void *arg)
+{
+	unsigned *received = arg;
 
-		MPI_Finalize();
-		return STARPU_TEST_SKIPPED;
-	}
+	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+	*received = *received + 1;
+	FPRINTF_MPI("Requests %d received\n", *received);
+	STARPU_PTHREAD_COND_SIGNAL(&cond);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+}
 
-	ret = starpu_init(NULL);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(NULL, NULL, 0);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
-
-	for(i=0 ; i<NB ; i++)
-	{
-		value[i]=i*rank;
-		starpu_variable_data_register(&tab_handle[i], STARPU_MAIN_RAM, (uintptr_t)&value[i], sizeof(int));
-		starpu_data_set_tag(tab_handle[i], i);
-	}
+typedef void (*check_func)(starpu_data_handle_t handle, int i, int rank, int *error);
 
+int exchange(int rank, starpu_data_handle_t *handles, check_func func, int detached)
+{
 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
+	int i;
 
 	if (rank%2)
 	{
-		starpu_mpi_send(tab_handle[0], other_rank, 0, MPI_COMM_WORLD);
-		starpu_mpi_send(tab_handle[NB-1], other_rank, NB-1, MPI_COMM_WORLD);
+		starpu_mpi_send(handles[0], other_rank, 0, MPI_COMM_WORLD);
+		starpu_mpi_send(handles[NB-1], other_rank, NB-1, MPI_COMM_WORLD);
 		for(i=1 ; i<NB-1 ; i++)
 		{
-			starpu_mpi_send(tab_handle[i], other_rank, i, MPI_COMM_WORLD);
+			starpu_mpi_send(handles[i], other_rank, i, MPI_COMM_WORLD);
 		}
+		return 0;
 	}
 	else
 	{
+		int ret=0;
 		starpu_mpi_req req[NB];
-		memset(req, 0, NB*sizeof(starpu_mpi_req));
+		int received = 0;
+
+		if (detached)
+		{
+			starpu_mpi_irecv_detached(handles[0], other_rank, 0, MPI_COMM_WORLD, callback, &received);
+		}
+		else
+		{
+			memset(req, 0, NB*sizeof(starpu_mpi_req));
+			starpu_mpi_irecv(handles[0], &req[0], other_rank, 0, MPI_COMM_WORLD);
+			STARPU_ASSERT(req[0] != NULL);
+		}
 
-		starpu_mpi_irecv(tab_handle[0], &req[0], other_rank, 0, MPI_COMM_WORLD);
-		STARPU_ASSERT(req[0] != NULL);
 		// We sleep to make sure that the data for tag 9 (NB-1) arrives before the matching recv is posted
 		usleep(2000000);
 		for(i=1 ; i<NB ; i++)
 		{
-			starpu_mpi_irecv(tab_handle[i], &req[i], other_rank, i, MPI_COMM_WORLD);
-			STARPU_ASSERT(req[i] != NULL);
+			if (detached)
+			{
+				starpu_mpi_irecv_detached(handles[i], other_rank, i, MPI_COMM_WORLD, callback, &received);
+			}
+			else
+			{
+				starpu_mpi_irecv(handles[i], &req[i], other_rank, i, MPI_COMM_WORLD);
+				STARPU_ASSERT(req[i] != NULL);
+			}
+		}
+
+		if (detached)
+		{
+			STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+			while (received != NB)
+			{
+			     FPRINTF_MPI("Received %d messages\n", received);
+			     STARPU_PTHREAD_COND_WAIT(&cond, &mutex);
+			}
+			STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 		}
-		for(i=0 ; i<NB ; i++)
+		else
 		{
-			starpu_mpi_wait(&req[i], NULL);
-			int *rvalue = (int *)starpu_data_get_local_ptr(tab_handle[i]);
-			STARPU_ASSERT_MSG(*rvalue==i*other_rank, "Incorrect received value: %d != %d\n", *rvalue, i*other_rank);
+			for(i=0 ; i<NB ; i++)
+			{
+			     starpu_mpi_wait(&req[i], NULL);
+			     func(handles[i], i, rank, &ret);
+			}
 		}
+		return ret;
 	}
+}
+
+void check_variable(starpu_data_handle_t handle, int i, int rank, int *error)
+{
+	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
+
+	int *rvalue = (int *)starpu_data_get_local_ptr(handle);
+	if (*rvalue != i*other_rank)
+	{
+		FPRINTF_MPI("Incorrect received value: %d != %d\n", *rvalue, i*other_rank);
+		*error = 1;
+	}
+}
+
+int exchange_variable(int rank, int detached)
+{
+	int ret, i;
+	starpu_data_handle_t tab_handle[NB];
+	int value[NB];
 
+	FPRINTF_MPI("Exchanging variable data with detached=%d\n", detached);
+
+	for(i=0 ; i<NB ; i++)
+	{
+		value[i]=i*rank;
+		starpu_variable_data_register(&tab_handle[i], STARPU_MAIN_RAM, (uintptr_t)&value[i], sizeof(int));
+		starpu_data_set_tag(tab_handle[i], i);
+	}
+	ret = exchange(rank, tab_handle, check_variable, detached);
 	for(i=0 ; i<NB ; i++)
 		starpu_data_unregister(tab_handle[i]);
 
+	return ret;
+}
+
+void check_complex(starpu_data_handle_t handle, int i, int rank, int *error)
+{
+	double *real = starpu_complex_get_real(handle);
+	double *imaginary = starpu_complex_get_imaginary(handle);
+
+	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
+
+	if ((*real != ((i*other_rank)+12)) || (*imaginary != ((i*other_rank)+45)))
+	{
+		FPRINTF_MPI("Incorrect received value: %f != %d || %f != %d\n", *real, ((i*other_rank)+12), *imaginary, ((i*other_rank)+45));
+		*error = 1;
+	}
+}
+
+int exchange_complex(int rank, int detached)
+{
+	int ret, i;
+	starpu_data_handle_t handle[NB];
+	double real[NB];
+	double imaginary[NB];
+
+	FPRINTF_MPI("Exchanging complex data with detached=%d\n", detached);
+
+	for(i=0 ; i<NB ; i++)
+	{
+		real[i] = (i*rank)+12;
+		imaginary[i] = (i*rank)+45;
+		starpu_complex_data_register(&handle[i], STARPU_MAIN_RAM, &real[i], &imaginary[i], 1);
+		starpu_data_set_tag(handle[i], i);
+	}
+	ret = exchange(rank, handle, check_complex, detached);
+	for(i=0 ; i<NB ; i++)
+		starpu_data_unregister(handle[i]);
+
+	return ret;
+}
+
+int main(int argc, char **argv)
+{
+	int ret, rank, size;
+
+	MPI_Init(NULL, NULL);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	if (size%2 != 0)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need a even number of processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	ret = exchange_variable(rank, 0);
+	if (ret == 0)
+		ret = exchange_variable(rank, 1);
+	if (ret == 0)
+		ret = exchange_complex(rank, 0);
+	if (ret == 0)
+		ret = exchange_complex(rank, 1);
+
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 
 	MPI_Finalize();
 
-	return 0;
+	return ret;
 }

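Distilled from the test above, the canonical completion pattern for detached requests: starpu_mpi_irecv_detached returns no request to wait on, so the callback counts finished receives under a mutex and the main thread blocks on a condition variable until all NB of them have arrived.

	static starpu_pthread_mutex_t mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
	static starpu_pthread_cond_t cond = STARPU_PTHREAD_COND_INITIALIZER;
	static unsigned received = 0;

	static void recv_callback(void *arg)
	{
		(void)arg;
		STARPU_PTHREAD_MUTEX_LOCK(&mutex);
		received++;			/* one more detached receive done */
		STARPU_PTHREAD_COND_SIGNAL(&cond);
		STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
	}

	/* after posting the NB detached receives: */
	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
	while (received != NB)
		STARPU_PTHREAD_COND_WAIT(&cond, &mutex);
	STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);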
+ 2 - 1
sc_hypervisor/examples/Makefile.am

@@ -20,7 +20,8 @@ AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(STARPU_GLPK_LDFLA
 
 noinst_PROGRAMS =				\
 	app_driven_test/app_driven_test		\
-	lp_test/lp_test
+	lp_test/lp_test				\
+	lp_test/lp_resize_test
 
 if !NO_BLAS_LIB
 noinst_PROGRAMS +=				\

+ 5 - 4
sc_hypervisor/examples/app_driven_test/app_driven_test.c

@@ -37,7 +37,7 @@ pthread_mutex_t mut[2];
  * argument of the codelet (task->cl_arg). Here, "buffers" is unused as there
  * are no data input/output managed by the DSM (cl.nbuffers = 0) */
 
-void cpu_func(void *buffers[], void *cl_arg)
+void cpu_func(__attribute__((unused))void *buffers[], void *cl_arg)
 {
 	struct params *params = (struct params *) cl_arg;
 
@@ -88,7 +88,7 @@ void* submit_tasks_thread(void *arg)
 			printf("require resize for sched_ctx %d at tag %d\n", sched_ctx, tag);
 			/* specify that the contexts should be resized when the task having this
 			   particular tag will finish executing */
-			sc_hypervisor_resize(sched_ctx, tag);
+			sc_hypervisor_post_resize_request(sched_ctx, tag);
 		}
 
 		params[i].sched_ctx = sched_ctx;
@@ -97,11 +97,12 @@ void* submit_tasks_thread(void *arg)
 		task[i]->cl_arg = &params[i];
 		task[i]->cl_arg_size = sizeof(params);
 
-		starpu_task_submit(task[i]);
+		int ret = starpu_task_submit(task[i]);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	}
 
 	starpu_task_wait_for_all();
-	return;
+	return NULL;
 }
 
 int main()

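The renaming from sc_hypervisor_resize to sc_hypervisor_post_resize_request makes the semantics explicit: the call merely records a request, and the hypervisor acts on it through its post-exec hook once a task carrying the same tag completes. A sketch of the call site, with an illustrative tag value:

	int tag = 10 * i;	/* application-chosen tag also attached to the task */
	/* resize sched_ctx once the task tagged `tag` has finished executing */
	sc_hypervisor_post_resize_request(sched_ctx, tag);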
+ 137 - 0
sc_hypervisor/examples/lp_test/lp_resize_test.c

@@ -0,0 +1,137 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2012  INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <starpu.h>
+#include <sc_hypervisor.h>
+
+#define NTASKS 1000
+#define NINCR 10
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
+
+unsigned val[2];
+pthread_mutex_t mut[2];
+
+/* Every implementation of a codelet must have this prototype, the first
+ * argument (buffers) describes the buffers/streams that are managed by the
+ * DSM; the second argument references read-only data that is passed as an
+ * argument of the codelet (task->cl_arg). Here, "buffers" is unused as there
+ * are no data input/output managed by the DSM (cl.nbuffers = 0) */
+
+void cpu_func(__attribute__((unused))void *buffers[], void *cl_arg)
+{
+	unsigned sched_ctx = *((unsigned *) cl_arg);
+
+	int i;
+	for(i = 0; i < NINCR; i++)
+	{
+		pthread_mutex_lock(&mut[sched_ctx - 1]);
+		val[sched_ctx - 1]++;
+		pthread_mutex_unlock(&mut[sched_ctx - 1]);
+	}
+}
+
+struct starpu_codelet cl = {0};
+
+void* submit_tasks_thread(void *arg)
+{
+	unsigned sched_ctx = *((unsigned*)arg);
+	starpu_sched_ctx_set_context(&sched_ctx);
+
+	struct starpu_task *task[NTASKS];
+	int i;
+	for(i = 0; i < NTASKS; i++)
+	{
+		task[i] = starpu_task_create();
+		cl.cpu_funcs[0] = cpu_func;
+		cl.nbuffers = 0;
+
+		task[i]->cl = &cl;
+
+		task[i]->cl_arg = &sched_ctx;
+		task[i]->cl_arg_size = sizeof(unsigned);
+
+		task[i]->flops = NINCR*1000000000.0;
+		int ret = starpu_task_submit(task[i]);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+		if(i == NTASKS/2)
+			sc_hypervisor_resize_ctxs(NULL, -1, NULL, -1);
+	}
+
+	starpu_task_wait_for_all();
+	return NULL;
+}
+
+int main()
+{
+	int ret = starpu_init(NULL);
+
+	if (ret == -ENODEV)
+		return 77;
+
+	/* create contexts */
+	unsigned sched_ctx1 = starpu_sched_ctx_create("dmda", NULL, 0, "sched_ctx1");
+	unsigned sched_ctx2 = starpu_sched_ctx_create("dmda", NULL, 0, "sched_ctx2");
+
+	/* initialize the hypervisor */
+	struct sc_hypervisor_policy policy;
+	policy.custom = 0;
+	/* indicate which strategy to use
+	   in this particular case we use feft_lp, which lets the hypervisor
+	   redistribute the workers over the contexts by solving a linear program */
+	policy.name = "feft_lp";
+	void *perf_counters = sc_hypervisor_init(&policy);
+
+	/* let starpu know which performance counters it should use
+	   to inform the hypervisor how the application and the resources are performing */
+	starpu_sched_ctx_set_perf_counters(sched_ctx1, (struct starpu_sched_ctx_performance_counters*)perf_counters);
+	starpu_sched_ctx_set_perf_counters(sched_ctx2, (struct starpu_sched_ctx_performance_counters*)perf_counters);
+
+	double flops1 = NTASKS*NINCR*1000000000.0;
+	double flops2 = NTASKS*NINCR*1000000000.0;
+	/* register the contexts that should be managed by the hypervisor
+	   and indicate an approximate amount of workload if known;
+	   in this case we know the total number of flops each context will execute */
+	sc_hypervisor_register_ctx(sched_ctx1, flops1);
+	sc_hypervisor_register_ctx(sched_ctx2, flops2);
+	/* the lp strategy allows sizing the contexts because we know the total number
+	   of flops to be executed */
+	sc_hypervisor_size_ctxs(NULL, -1, NULL, -1);
+
+	starpu_pthread_t tid[2];
+
+	val[0] = 0;
+	val[1] = 0;
+	pthread_mutex_init(&mut[0], NULL);
+	pthread_mutex_init(&mut[1], NULL);
+
+	/* we create two threads to simulate simultaneous submission of tasks */
+	starpu_pthread_create(&tid[0], NULL, submit_tasks_thread, (void*)&sched_ctx1);
+	starpu_pthread_create(&tid[1], NULL, submit_tasks_thread, (void*)&sched_ctx2);
+
+	starpu_pthread_join(tid[0], NULL);
+	starpu_pthread_join(tid[1], NULL);
+
+	/* free starpu and hypervisor data */
+	starpu_shutdown();
+	sc_hypervisor_shutdown();
+
+	FPRINTF(stdout, "ctx = %d executed %d counter_tests out of %d \n", sched_ctx1, val[0], NTASKS*NINCR);
+	FPRINTF(stdout, "ctx = %d executed %d counter_tests out of %d \n", sched_ctx2, val[1], NTASKS*NINCR);
+	return 0;
+}

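Judging from the _try_resizing implementations further down, the NULL/-1 arguments to sc_hypervisor_resize_ctxs mean "all registered contexts" and "all workers". Both call forms, under that assumption:

	/* reconsider the distribution over every context and every worker */
	sc_hypervisor_resize_ctxs(NULL, -1, NULL, -1);

	/* or restrict the reevaluation to an explicit set of contexts */
	int ctxs[2] = { (int)sched_ctx1, (int)sched_ctx2 };
	sc_hypervisor_resize_ctxs(ctxs, 2, NULL, -1);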
+ 4 - 2
sc_hypervisor/examples/lp_test/lp_test.c

@@ -32,7 +32,7 @@ pthread_mutex_t mut[2];
  * argument of the codelet (task->cl_arg). Here, "buffers" is unused as there
  * are no data input/output managed by the DSM (cl.nbuffers = 0) */
 
-void cpu_func(void *buffers[], void *cl_arg)
+void cpu_func(__attribute__((unused))void *buffers[], void *cl_arg)
 {
 	unsigned sched_ctx = *((unsigned *) cl_arg);
 
@@ -66,7 +66,9 @@ void* submit_tasks_thread(void *arg)
 		task[i]->cl_arg_size = sizeof(unsigned);
 
 		task[i]->flops = NINCR*1000000000.0;
-		starpu_task_submit(task[i]);
+		int ret = starpu_task_submit(task[i]);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
 	}
 
 	starpu_task_wait_for_all();

+ 11 - 5
sc_hypervisor/include/sc_hypervisor.h

@@ -20,6 +20,7 @@
 #include <starpu.h>
 #include <sc_hypervisor_config.h>
 #include <sc_hypervisor_monitoring.h>
+#include <math.h>
 
 #ifdef __cplusplus
 extern "C"
@@ -45,10 +46,12 @@ struct sc_hypervisor_policy
 	/* indicate whether the policy was created by the user or not */
 	unsigned custom;
 
-	/* if knwing the future the hypervisor can find the good 
-	   distribution of workers on contexts even at the begining of the program */
+	/* Distribute workers to contexts even at the beginning of the program */
 	void (*size_ctxs)(int *sched_ctxs, int nsched_ctxs , int *workers, int nworkers);
 
+	/* Resize the contexts on an explicit request (see sc_hypervisor_resize_ctxs) */
+	void (*resize_ctxs)(int *sched_ctxs, int nsched_ctxs , int *workers, int nworkers);
+
 	/* the hypervisor takes a decision when the worker was idle for another cycle in this ctx */
 	void (*handle_idle_cycle)(unsigned sched_ctx, int worker);
 
@@ -65,7 +68,7 @@ struct sc_hypervisor_policy
 	void (*handle_post_exec_hook)(unsigned sched_ctx, int task_tag);
 
 	/* the hypervisor takes a decision when a job was submitted in this ctx */
-	void (*handle_submitted_job)(struct starpu_codelet *cl, unsigned sched_ctx, uint32_t footprint);
+	void (*handle_submitted_job)(struct starpu_codelet *cl, unsigned sched_ctx, uint32_t footprint, size_t data_size);
 	
 	/* the hypervisor takes a decision when a certain ctx was deleted */
 	void (*end_ctx)(unsigned sched_ctx);
@@ -84,7 +87,10 @@ void sc_hypervisor_register_ctx(unsigned sched_ctx, double total_flops);
 void sc_hypervisor_unregister_ctx(unsigned sched_ctx);
 
 /* submit a resizing request to be applied when the task tagged with task_tag finishes executing */
-void sc_hypervisor_resize(unsigned sched_ctx, int task_tag);
+void sc_hypervisor_post_resize_request(unsigned sched_ctx, int task_tag);
+
+/* reevaluate the distribution of the resources and resize them if needed */
+void sc_hypervisor_resize_ctxs(int *sched_ctxs, int nsched_ctxs , int *workers, int nworkers);
 
 /* don't allow the hypervisor to resize a context */
 void sc_hypervisor_stop_resize(unsigned sched_ctx);
@@ -120,7 +126,7 @@ void sc_hypervisor_free_size_req(void);
 unsigned sc_hypervisor_can_resize(unsigned sched_ctx);
 
 /* indicate the types of tasks a context will execute in order to better decide the sizing of ctxs */
-void sc_hypervisor_set_type_of_task(struct starpu_codelet *cl, unsigned sched_ctx, uint32_t footprint);
+void sc_hypervisor_set_type_of_task(struct starpu_codelet *cl, unsigned sched_ctx, uint32_t footprint, size_t data_size);
 
 #ifdef __cplusplus
 }

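A custom policy now distinguishes the initial placement (size_ctxs) from explicit re-evaluations (resize_ctxs). A minimal sketch of a user-defined policy filling both hooks (handler bodies omitted, unused hooks left NULL):

	static void my_size_ctxs(int *sched_ctxs, int ns, int *workers, int nw) { /* ... */ }
	static void my_resize_ctxs(int *sched_ctxs, int ns, int *workers, int nw) { /* ... */ }

	struct sc_hypervisor_policy my_policy =
	{
		.name = "my_policy",
		.custom = 1,		/* user-provided, not a built-in policy */
		.size_ctxs = my_size_ctxs,
		.resize_ctxs = my_resize_ctxs,
	};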
+ 2 - 2
sc_hypervisor/include/sc_hypervisor_lp.h

@@ -44,7 +44,7 @@ double sc_hypervisor_lp_get_tmax(int nw, int *workers);
 void sc_hypervisor_lp_round_double_to_int(int ns, int nw, double res[ns][nw], int res_rounded[ns][nw]);
 
 /* redistribute the resources in contexts by assigning the first x available resources to each one */
-void sc_hypervisor_lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw], double res[ns][nw]);
+void sc_hypervisor_lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw], double res[ns][nw], int *sched_ctxs);
 
 /* make the first distribution of resources in contexts by assigning the first x available resources to each one */
 void sc_hypervisor_lp_distribute_resources_in_ctxs(int* sched_ctxs, int ns, int nw, int res_rounded[ns][nw], double res[ns][nw], int *workers, int nworkers);
@@ -64,7 +64,7 @@ unsigned sc_hypervisor_lp_execute_dichotomy(int ns, int nw, double w_in_s[ns][nw
 #ifdef STARPU_HAVE_GLPK_H
 /* linear program that returns 1/tmax, and computes in the table res the number of workers needed by each context
    such that the system ends up with the smallest tmax */
-double sc_hypervisor_lp_simulate_distrib_flops(int nsched_ctxs, int ntypes_of_workers, double velocity[nsched_ctxs][ntypes_of_workers], 
+double sc_hypervisor_lp_simulate_distrib_flops(int nsched_ctxs, int ntypes_of_workers, double speed[nsched_ctxs][ntypes_of_workers], 
 					       double flops[nsched_ctxs], double res[nsched_ctxs][ntypes_of_workers], int total_nw[ntypes_of_workers]);
 
 /* linear program that simulates a distribution of tasks that minimises the execution time of the tasks in the pool */

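With the added sched_ctxs parameter the caller now states explicitly which contexts the rounded LP solution applies to, instead of the function implicitly fetching the global list. The typical call sequence, as used by the policies below:

	double res[ns][nw];		/* fractional workers per (context, worker type) */
	int res_rounded[ns][nw];	/* integral counts derived from res */
	sc_hypervisor_lp_round_double_to_int(ns, nw, res, res_rounded);
	sc_hypervisor_lp_redistribute_resources_in_ctxs(ns, nw, res_rounded, res, sched_ctxs);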
+ 7 - 6
sc_hypervisor/include/sc_hypervisor_monitoring.h

@@ -77,8 +77,9 @@ struct sc_hypervisor_wrapper
 	/* nr of tasks executed on each worker in this ctx */
 	int elapsed_tasks[STARPU_NMAXWORKERS];
 
-	/* the average speed of workers when they belonged to this context */
-	double ref_velocity[STARPU_NMAXWORKERS];
+	/* the average speed of each type of worker while it belonged to this context
+	   (index 0 - cuda, index 1 - cpu) */
+	double ref_speed[2];
 
 	/* number of flops submitted to this ctx */
 	double submitted_flops;
@@ -118,11 +119,11 @@ double sc_hypervisor_get_elapsed_flops_per_sched_ctx(struct sc_hypervisor_wrappe
 /* get the number of flops executed by a context since the beginning */
 double sc_hypervisor_get_total_elapsed_flops_per_sched_ctx(struct sc_hypervisor_wrapper* sc_w);
 
-/* compute an average value of the cpu/cuda velocity */
-double sc_hypervisorsc_hypervisor_get_velocity_per_worker_type(struct sc_hypervisor_wrapper* sc_w, enum starpu_worker_archtype arch);
+/* compute an average value of the cpu/cuda speed */
+double sc_hypervisorsc_hypervisor_get_speed_per_worker_type(struct sc_hypervisor_wrapper* sc_w, enum starpu_worker_archtype arch);
 
-/* compte the actual velocity of all workers of a specific type of worker */
-double sc_hypervisor_get_velocity(struct sc_hypervisor_wrapper *sc_w, enum starpu_worker_archtype arch);
+/* compute the actual speed of all workers of a specific type */
+double sc_hypervisor_get_speed(struct sc_hypervisor_wrapper *sc_w, enum starpu_worker_archtype arch);
 
 #ifdef __cplusplus
 }

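ref_speed collapses the old per-worker ref_velocity array into one running average per worker type, index 0 for CUDA and index 1 for CPU as the comment states. A hedged accessor sketch:

	static double ref_speed_of(struct sc_hypervisor_wrapper *sc_w,
				   enum starpu_worker_archtype arch)
	{
		/* 0 - cuda, 1 - cpu, matching the ref_speed[] convention */
		return sc_w->ref_speed[arch == STARPU_CUDA_WORKER ? 0 : 1];
	}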
+ 14 - 13
sc_hypervisor/include/sc_hypervisor_policy.h

@@ -29,7 +29,7 @@ extern "C"
 #define HYPERVISOR_START_REDIM_SAMPLE 0.1
 #define SC_NOTHING 0
 #define SC_IDLE 1
-#define SC_VELOCITY 2
+#define SC_SPEED 2
 
 struct sc_hypervisor_policy_task_pool
 {
@@ -37,11 +37,12 @@ struct sc_hypervisor_policy_task_pool
 	uint32_t footprint;
 	unsigned sched_ctx_id;
 	unsigned long n;
+	size_t data_size;
 	struct sc_hypervisor_policy_task_pool *next;
 };
 
 /* add task information to a task wrapper linked list */
-void sc_hypervisor_policy_add_task_to_pool(struct starpu_codelet *cl, unsigned sched_ctx, uint32_t footprint, struct sc_hypervisor_policy_task_pool **task_pools);
+	void sc_hypervisor_policy_add_task_to_pool(struct starpu_codelet *cl, unsigned sched_ctx, uint32_t footprint, struct sc_hypervisor_policy_task_pool **task_pools, size_t data_size);
 
 /* remove task information from a task wrapper linked list */
 void sc_hypervisor_policy_remove_task_from_pool(struct starpu_task *task, uint32_t footprint, struct sc_hypervisor_policy_task_pool **task_pools);
@@ -73,8 +74,8 @@ unsigned sc_hypervisor_policy_resize(unsigned sender_sched_ctx, unsigned receive
 /* check the policy's constraints in order to resize and find a context willing to accept the resources */
 unsigned sc_hypervisor_policy_resize_to_unknown_receiver(unsigned sender_sched_ctx, unsigned now);
 
-/* compute the velocity of a context */
-double sc_hypervisor_get_ctx_velocity(struct sc_hypervisor_wrapper* sc_w);
+/* compute the speed of a context */
+double sc_hypervisor_get_ctx_speed(struct sc_hypervisor_wrapper* sc_w);
 
 /* get the time of execution of the slowest context */
 double sc_hypervisor_get_slowest_ctx_exec_time(void);
@@ -82,14 +83,14 @@ double sc_hypervisor_get_slowest_ctx_exec_time(void);
 /* get the time of execution of the fastest context */
 double sc_hypervisor_get_fastest_ctx_exec_time(void);
 
-/* compute the velocity of a workers in a context */
-double sc_hypervisor_get_velocity_per_worker(struct sc_hypervisor_wrapper *sc_w, unsigned worker); 
+/* compute the speed of a worker in a context */
+double sc_hypervisor_get_speed_per_worker(struct sc_hypervisor_wrapper *sc_w, unsigned worker); 
 
-/* compute the velocity of a type of worker in a context */
-double sc_hypervisor_get_velocity_per_worker_type(struct sc_hypervisor_wrapper* sc_w, enum starpu_worker_archtype arch);
+/* compute the speed of a type of worker in a context */
+double sc_hypervisor_get_speed_per_worker_type(struct sc_hypervisor_wrapper* sc_w, enum starpu_worker_archtype arch);
 
-/* compute the velocity of a type of worker in a context depending on its history */ 
-double sc_hypervisor_get_ref_velocity_per_worker_type(struct sc_hypervisor_wrapper* sc_w, enum starpu_worker_archtype arch);
+/* compute the speed of a type of worker in a context depending on its history */ 
+double sc_hypervisor_get_ref_speed_per_worker_type(struct sc_hypervisor_wrapper* sc_w, enum starpu_worker_archtype arch);
 
 /* get the list of workers grouped by type */
 void sc_hypervisor_group_workers_by_type(int *workers, int nworkers, int ntypes_of_workers, int total_nw[ntypes_of_workers]);
@@ -100,10 +101,10 @@ unsigned sc_hypervisor_criteria_fulfilled(unsigned sched_ctx, int worker);
 /* check if worker was idle long enough */
 unsigned sc_hypervisor_check_idle(unsigned sched_ctx, int worker);
 
-/* check if there is a velocity gap btw ctxs */
-unsigned sc_hypervisor_check_velocity_gap_btw_ctxs(void);
+/* check if there is a speed gap between ctxs */
+unsigned sc_hypervisor_check_speed_gap_btw_ctxs(void);
 
-/* check what triggers resizing (idle, velocity, etc.)*/
+/* check what triggers resizing (idle, speed, etc.) */
 unsigned sc_hypervisor_get_resize_criteria();
 
 #ifdef __cplusplus

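The extra data_size parameter lets a policy keep the per-task data footprint next to the codelet, footprint and task count in the pool. A sketch of a policy's submitted-job hook forwarding it (task_pools is assumed to be the policy's own list head, as in teft_lp below):

	static struct sc_hypervisor_policy_task_pool *task_pools = NULL;

	static void my_handle_submitted_job(struct starpu_codelet *cl, unsigned sched_ctx,
					    uint32_t footprint, size_t data_size)
	{
		sc_hypervisor_policy_add_task_to_pool(cl, sched_ctx, footprint,
						      &task_pools, data_size);
	}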
+ 43 - 30
sc_hypervisor/src/hypervisor_policies/debit_lp_policy.c

@@ -20,12 +20,12 @@
 #include <math.h>
 #include <sys/time.h>
 
-static double _glp_resolve(int ns, int nw, double velocity[ns][nw], double w_in_s[ns][nw], int *workers, unsigned integer);
+static double _glp_resolve(int ns, int nw, double speed[ns][nw], double w_in_s[ns][nw], int *workers, unsigned integer);
 
 
-static unsigned _compute_max_velocity(int ns, int nw, double w_in_s[ns][nw], int *in_sched_ctxs, int *workers)
+static unsigned _compute_max_speed(int ns, int nw, double w_in_s[ns][nw], int *in_sched_ctxs, int *workers)
 {
-	double velocity[ns][nw];
+	double speed[ns][nw];
 
 	int *sched_ctxs = in_sched_ctxs == NULL ? sc_hypervisor_get_sched_ctxs() : in_sched_ctxs;
 	
@@ -41,7 +41,7 @@ static unsigned _compute_max_velocity(int ns, int nw, double w_in_s[ns][nw], int
 			int worker = workers == NULL ? w : workers[w];
 
 			enum starpu_worker_archtype arch = starpu_worker_get_type(worker);
-			velocity[s][w] = sc_hypervisor_get_velocity(sc_w, arch);
+			speed[s][w] = sc_hypervisor_get_speed(sc_w, arch);
 		}
 	}
 	
@@ -50,7 +50,7 @@ static unsigned _compute_max_velocity(int ns, int nw, double w_in_s[ns][nw], int
 	struct timeval end_time;
 	gettimeofday(&start_time, NULL);
 
-	double res = _glp_resolve(ns, nw, velocity, w_in_s, workers, 1);
+	double res = _glp_resolve(ns, nw, speed, w_in_s, workers, 1);
 	gettimeofday(&end_time, NULL);
 
 	long diff_s = end_time.tv_sec  - start_time.tv_sec;
@@ -68,7 +68,7 @@ static unsigned _compute_max_velocity(int ns, int nw, double w_in_s[ns][nw], int
  */
 #ifdef STARPU_HAVE_GLPK_H
 #include <glpk.h>
-static double _glp_resolve(int ns, int nw, double velocity[ns][nw], double w_in_s[ns][nw], int *workers, unsigned integer)
+static double _glp_resolve(int ns, int nw, double speed[ns][nw], double w_in_s[ns][nw], int *workers, unsigned integer)
 {
 	int w, s;
 	glp_prob *lp;
@@ -76,7 +76,7 @@ static double _glp_resolve(int ns, int nw, double velocity[ns][nw], double w_in_
 	lp = glp_create_prob();
 	glp_set_prob_name(lp, "StarPU theoretical bound");
 	glp_set_obj_dir(lp, GLP_MAX);
-	glp_set_obj_name(lp, "total velocity");
+	glp_set_obj_name(lp, "total speed");
 
 	{
 		int ne = 2 * ns * nw /* worker execution time */
@@ -115,10 +115,10 @@ static double _glp_resolve(int ns, int nw, double velocity[ns][nw], double w_in_
 
 
 		int curr_row_idx = 0;
-		/* Total worker velocity */
+		/* Total worker speed */
 		glp_add_rows(lp, 1);
 
-		/*sum(x[s][w]*velocity[s][w]) >= vmax */
+		/*sum(x[s][w]*speed[s][w]) >= vmax */
 		char name[32], title[64];
 		starpu_worker_get_name(w, name, sizeof(name));
 		snprintf(title, sizeof(title), "worker %s", name);
@@ -131,7 +131,7 @@ static double _glp_resolve(int ns, int nw, double velocity[ns][nw], double w_in_
 				/* x[s][w] */
 				ia[n] = curr_row_idx + 1;
 				ja[n] = s*nw+w+1;
-				ar[n] = velocity[s][w];
+				ar[n] = speed[s][w];
 				n++;
 			}
 		}
@@ -225,25 +225,27 @@ static double _glp_resolve(int ns, int nw, double velocity[ns][nw], double w_in_
 }
 
 
-static void _try_resizing(void)
+static void _try_resizing(int *sched_ctxs, int nsched_ctxs , int *workers, int nworkers)
 {
-	int ns = sc_hypervisor_get_nsched_ctxs();
-	int nw = starpu_worker_get_count(); /* Number of different workers */
+	int ns = sched_ctxs == NULL ? sc_hypervisor_get_nsched_ctxs() : nsched_ctxs;
+	int nw = workers == NULL ? starpu_worker_get_count() : nworkers; /* Number of different workers */
 	
+	sched_ctxs = sched_ctxs == NULL ? sc_hypervisor_get_sched_ctxs() : sched_ctxs;
+
 	double w_in_s[ns][nw];
-	unsigned found_sol = _compute_max_velocity(ns, nw,  w_in_s, NULL, NULL);
+	unsigned found_sol = _compute_max_speed(ns, nw,  w_in_s, sched_ctxs, workers);
 	/* if we did find at least one solution redistribute the resources */
 	if(found_sol)
 	{
 		int w, s;
-		double nworkers[ns][2];
-		int nworkers_rounded[ns][2];
+		double nworkers_per_ctx[ns][2];
+		int nworkers_per_ctx_rounded[ns][2];
 		for(s = 0; s < ns; s++)
 		{
-			nworkers[s][0] = 0.0;
-			nworkers[s][1] = 0.0;
-			nworkers_rounded[s][0] = 0;
-			nworkers_rounded[s][1] = 0;
+			nworkers_per_ctx[s][0] = 0.0;
+			nworkers_per_ctx[s][1] = 0.0;
+			nworkers_per_ctx_rounded[s][0] = 0;
+			nworkers_per_ctx_rounded[s][1] = 0;
 			
 		}
 		
@@ -255,15 +257,15 @@ static void _try_resizing(void)
 				
 				if(arch == STARPU_CUDA_WORKER)
 				{
-					nworkers[s][0] += w_in_s[s][w];
+					nworkers_per_ctx[s][0] += w_in_s[s][w];
 					if(w_in_s[s][w] >= 0.3)
-						nworkers_rounded[s][0]++;
+						nworkers_per_ctx_rounded[s][0]++;
 				}
 				else
 				{
-					nworkers[s][1] += w_in_s[s][w];
+					nworkers_per_ctx[s][1] += w_in_s[s][w];
 					if(w_in_s[s][w] > 0.5)
-						nworkers_rounded[s][1]++;
+						nworkers_per_ctx_rounded[s][1]++;
 				}
 			}
 		}
@@ -271,7 +273,7 @@ static void _try_resizing(void)
 /* 					printf("%d: cpus = %lf gpus = %lf cpus_round = %d gpus_round = %d\n", s, nworkers[s][1], nworkers[s][0], */
 /* 					       nworkers_rounded[s][1], nworkers_rounded[s][0]); */
 		
-		sc_hypervisor_lp_redistribute_resources_in_ctxs(ns, 2, nworkers_rounded, nworkers);
+		sc_hypervisor_lp_redistribute_resources_in_ctxs(ns, 2, nworkers_per_ctx_rounded, nworkers_per_ctx, sched_ctxs);
 		
 	}
 }
@@ -282,11 +284,11 @@ static void debit_lp_handle_poped_task(unsigned sched_ctx, int worker, struct st
         if(ret != EBUSY)
 	{
 		unsigned criteria = sc_hypervisor_get_resize_criteria();
-		if(criteria != SC_NOTHING && criteria == SC_VELOCITY)
+		if(criteria != SC_NOTHING && criteria == SC_SPEED)
 		{
-			if(sc_hypervisor_check_velocity_gap_btw_ctxs())
+			if(sc_hypervisor_check_speed_gap_btw_ctxs())
 			{
-				_try_resizing();
+				_try_resizing(NULL, -1, NULL, -1);
 			}
 		}
                 starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
@@ -304,7 +306,7 @@ static debit_lp_handle_idle_cycle(unsigned sched_ctx, int worker)
 
 			if(sc_hypervisor_check_idle(sched_ctx, worker))
                         {
-                                _try_resizing();
+                                _try_resizing(NULL, -1, NULL, -1);
//                              sc_hypervisor_move_workers(sched_ctx, 3 - sched_ctx, &worker, 1, 1);
                         }
@@ -313,18 +315,29 @@ static debit_lp_handle_idle_cycle(unsigned sched_ctx, int worker)
         }
 }
 
+static void debit_lp_resize_ctxs(int *sched_ctxs, int nsched_ctxs , int *workers, int nworkers)
+{
+	int ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
+	if(ret != EBUSY)
+	{
+		_try_resizing(sched_ctxs, nsched_ctxs, workers, nworkers);
+		starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
+	}
+}
+
 static void debit_lp_end_ctx(unsigned sched_ctx)
 {
 	struct sc_hypervisor_wrapper* sc_w = sc_hypervisor_get_wrapper(sched_ctx);
 	int worker;
 /* 	for(worker = 0; worker < 12; worker++) */
-/* 		printf("%d/%d: speed %lf\n", worker, sched_ctx, sc_w->ref_velocity[worker]); */
+/* 		printf("%d/%d: speed %lf\n", worker, sched_ctx, sc_w->ref_speed[worker]); */
 
 	return;
 }
 
 struct sc_hypervisor_policy debit_lp_policy = {
 	.size_ctxs = NULL,
+	.resize_ctxs = debit_lp_resize_ctxs,
 	.handle_poped_task = debit_lp_handle_poped_task,
 	.handle_pushed_task = NULL,
 	.handle_idle_cycle = debit_lp_handle_idle_cycle,

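The rounding of the fractional LP solution w_in_s is asymmetric: a context is granted a GPU as soon as its fraction reaches 0.3, but a CPU only above 0.5, presumably so that the few CUDA workers do not end up unassigned. The rule, isolated from _try_resizing above:

	if (arch == STARPU_CUDA_WORKER)
	{
		nworkers_per_ctx[s][0] += w_in_s[s][w];
		if (w_in_s[s][w] >= 0.3)	/* generous threshold for GPUs */
			nworkers_per_ctx_rounded[s][0]++;
	}
	else
	{
		nworkers_per_ctx[s][1] += w_in_s[s][w];
		if (w_in_s[s][w] > 0.5)		/* majority threshold for CPUs */
			nworkers_per_ctx_rounded[s][1]++;
	}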
+ 49 - 22
sc_hypervisor/src/hypervisor_policies/feft_lp_policy.c

@@ -20,13 +20,14 @@
 #include <sys/time.h>
 
 #ifdef STARPU_HAVE_GLPK_H
-static void _try_resizing(void)
+static void _try_resizing(int *sched_ctxs, int nsched_ctxs)
 {
 	/* for vite */
 	starpu_trace_user_event(2);
+	int ns = sched_ctxs == NULL ? sc_hypervisor_get_nsched_ctxs() : nsched_ctxs;
+	sched_ctxs = sched_ctxs == NULL ? sc_hypervisor_get_sched_ctxs() : sched_ctxs;
 
-	int nsched_ctxs = sc_hypervisor_get_nsched_ctxs();
-	double nworkers[nsched_ctxs][2];
+	double nworkers_per_ctx[ns][2];
 	int nw = 1;
 #ifdef STARPU_USE_CUDA
 	int ncuda = starpu_worker_get_count_by_type(STARPU_CUDA_WORKER);
@@ -40,7 +41,7 @@ static void _try_resizing(void)
 	struct timeval end_time;
 	gettimeofday(&start_time, NULL);
 	
-	double vmax = sc_hypervisor_lp_get_nworkers_per_ctx(nsched_ctxs, nw, nworkers, total_nw);
+	double vmax = sc_hypervisor_lp_get_nworkers_per_ctx(ns, nw, nworkers_per_ctx, total_nw);
 	gettimeofday(&end_time, NULL);
 	
 	long diff_s = end_time.tv_sec  - start_time.tv_sec;
@@ -50,43 +51,44 @@ static void _try_resizing(void)
 	
 	if(vmax != 0.0)
 	{
-		int nworkers_rounded[nsched_ctxs][nw];
-		sc_hypervisor_lp_round_double_to_int(nsched_ctxs, nw, nworkers, nworkers_rounded);
-		sc_hypervisor_lp_redistribute_resources_in_ctxs(nsched_ctxs, nw, nworkers_rounded, nworkers);
+		int nworkers_per_ctx_rounded[ns][nw];
+		sc_hypervisor_lp_round_double_to_int(ns, nw, nworkers_per_ctx, nworkers_per_ctx_rounded);
+		sc_hypervisor_lp_redistribute_resources_in_ctxs(ns, nw, nworkers_per_ctx_rounded, nworkers_per_ctx, sched_ctxs);
 	}
-	
 }
-static void feft_lp_handle_poped_task(unsigned sched_ctx, int worker, struct starpu_task *task, uint32_t footprint)
+
+static void feft_lp_handle_poped_task(__attribute__((unused))unsigned sched_ctx, __attribute__((unused))int worker, 
+				      __attribute__((unused))struct starpu_task *task, __attribute__((unused))uint32_t footprint)
 {
 	int ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
 	if(ret != EBUSY)
 	{
 		unsigned criteria = sc_hypervisor_get_resize_criteria();
-		if(criteria != SC_NOTHING && criteria == SC_VELOCITY)
+		if(criteria != SC_NOTHING && criteria == SC_SPEED)
 		{
-			if(sc_hypervisor_check_velocity_gap_btw_ctxs())
+			if(sc_hypervisor_check_speed_gap_btw_ctxs())
 			{
-				_try_resizing();
+				_try_resizing(NULL, -1);
 			}
 		}
 		starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
 	}
 
 }
-static void feft_lp_size_ctxs(int *sched_ctxs, int ns, int *workers, int nworkers)
+static void feft_lp_size_ctxs(int *sched_ctxs, int nsched_ctxs, int *workers, int nworkers)
 {
-	int nsched_ctxs = sched_ctxs == NULL ? sc_hypervisor_get_nsched_ctxs() : ns;
+	int ns = sched_ctxs == NULL ? sc_hypervisor_get_nsched_ctxs() : nsched_ctxs;
 	int nw = 1;
 #ifdef STARPU_USE_CUDA
 	int ncuda = starpu_worker_get_count_by_type(STARPU_CUDA_WORKER);
 	nw = ncuda != 0 ? 2 : 1;
 #endif
-	double nworkers_per_type[nsched_ctxs][nw];
+	double nworkers_per_type[ns][nw];
 	int total_nw[nw];
 	sc_hypervisor_group_workers_by_type(workers, nworkers, nw, total_nw);
 
 	starpu_pthread_mutex_lock(&act_hypervisor_mutex);
-	double vmax = sc_hypervisor_lp_get_nworkers_per_ctx(nsched_ctxs, nw, nworkers_per_type, total_nw);
+	double vmax = sc_hypervisor_lp_get_nworkers_per_ctx(ns, nw, nworkers_per_type, total_nw);
 	if(vmax != 0.0)
 	{
 // 		printf("********size\n");
@@ -100,8 +102,8 @@ static void feft_lp_size_ctxs(int *sched_ctxs, int ns, int *workers, int nworker
 /* 				printf("ctx %d/worker type %d: n = %lf \n", i, 1, nworkers_per_type[i][1]); */
 /* #endif */
 /* 		} */
-		int nworkers_per_type_rounded[nsched_ctxs][nw];
-		sc_hypervisor_lp_round_double_to_int(nsched_ctxs, nw, nworkers_per_type, nworkers_per_type_rounded);
+		int nworkers_per_type_rounded[ns][nw];
+		sc_hypervisor_lp_round_double_to_int(ns, nw, nworkers_per_type, nworkers_per_type_rounded);
 /*       	for( i = 0; i < nsched_ctxs; i++) */
 /* 		{ */
 /* 			printf("ctx %d/worker type %d: n = %d \n", i, 0, nworkers_per_type_rounded[i][0]); */
@@ -125,14 +127,14 @@ static void feft_lp_size_ctxs(int *sched_ctxs, int ns, int *workers, int nworker
 			}
 		}
 		if(has_workers)
-			sc_hypervisor_lp_redistribute_resources_in_ctxs(nsched_ctxs, nw, nworkers_per_type_rounded, nworkers_per_type);
+			sc_hypervisor_lp_redistribute_resources_in_ctxs(ns, nw, nworkers_per_type_rounded, nworkers_per_type, current_sched_ctxs);
 		else
-			sc_hypervisor_lp_distribute_resources_in_ctxs(sched_ctxs, nsched_ctxs, nw, nworkers_per_type_rounded, nworkers_per_type, workers, nworkers);
+			sc_hypervisor_lp_distribute_resources_in_ctxs(sched_ctxs, ns, nw, nworkers_per_type_rounded, nworkers_per_type, workers, nworkers);
 	}
 	starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
 }
 
-static feft_lp_handle_idle_cycle(unsigned sched_ctx, int worker)
+static void feft_lp_handle_idle_cycle(unsigned sched_ctx, int worker)
 {
 	int ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
 	if(ret != EBUSY)
@@ -143,7 +145,7 @@ static feft_lp_handle_idle_cycle(unsigned sched_ctx, int worker)
 			
 			if(sc_hypervisor_check_idle(sched_ctx, worker))
 			{
-				_try_resizing();
+				_try_resizing(NULL, -1);
 //				sc_hypervisor_move_workers(sched_ctx, 3 - sched_ctx, &worker, 1, 1);
 			}
 		}
@@ -151,8 +153,33 @@ static feft_lp_handle_idle_cycle(unsigned sched_ctx, int worker)
 	}
 }
 
+static void feft_lp_resize_ctxs(int *sched_ctxs, int nsched_ctxs , 
+				__attribute__((unused))int *workers, __attribute__((unused))int nworkers)
+{
+	int ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
+	if(ret != EBUSY)
+	{
+		struct sc_hypervisor_wrapper* sc_w  = NULL;
+		int s = 0;
+		for(s = 0; s < nsched_ctxs; s++)
+		{
+			 sc_w = sc_hypervisor_get_wrapper(sched_ctxs[s]);
+			
+			 if((sc_w->submitted_flops + (0.1*sc_w->total_flops)) < sc_w->total_flops)
+			 {
+				 starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
+				 return;
+			 }
+		}
+
+		_try_resizing(sched_ctxs, nsched_ctxs);
+		starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
+	}
+}
+
 struct sc_hypervisor_policy feft_lp_policy = {
 	.size_ctxs = feft_lp_size_ctxs,
+	.resize_ctxs = feft_lp_resize_ctxs,
 	.handle_poped_task = feft_lp_handle_poped_task,
 	.handle_pushed_task = NULL,
 	.handle_idle_cycle = feft_lp_handle_idle_cycle, //NULL,

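The guard in feft_lp_resize_ctxs postpones the LP until every context has submitted at least 90% of its declared flops: submitted + 0.1*total < total is just submitted < 0.9*total. For example, with total_flops = 100 Gflop a context that has submitted 85 Gflop makes the function return early, while at 92 Gflop the resizing proceeds. The per-context condition, rewritten for readability:

	/* equivalent to the guard above: resize only near the end of submission */
	if (sc_w->submitted_flops < 0.9 * sc_w->total_flops)
		return;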
+ 2 - 2
sc_hypervisor/src/hypervisor_policies/gflops_rate_policy.c

@@ -58,10 +58,10 @@ static int* _get_workers_to_move(unsigned sender_sched_ctx, unsigned receiver_sc
 	struct sc_hypervisor_wrapper* sender_sc_w = sc_hypervisor_get_wrapper(sender_sched_ctx);
 	struct sc_hypervisor_wrapper* receiver_sc_w = sc_hypervisor_get_wrapper(receiver_sched_ctx);
         int *workers = NULL;
-        double v_receiver = sc_hypervisor_get_ctx_velocity(receiver_sc_w);
+        double v_receiver = sc_hypervisor_get_ctx_speed(receiver_sc_w);
         double receiver_remainig_flops = receiver_sc_w->remaining_flops;
         double sender_exp_end = _get_exp_end(sender_sched_ctx);
-        double sender_v_cpu = sc_hypervisor_get_velocity_per_worker_type(sender_sc_w, STARPU_CPU_WORKER);
+        double sender_v_cpu = sc_hypervisor_get_speed_per_worker_type(sender_sc_w, STARPU_CPU_WORKER);
         double v_for_rctx = (receiver_remainig_flops/(sender_exp_end - starpu_timing_now())) - v_receiver;
 
         int nworkers_needed = v_for_rctx/sender_v_cpu;

+ 52 - 41
sc_hypervisor/src/hypervisor_policies/ispeed_lp_policy.c

@@ -22,7 +22,7 @@
 
 struct ispeed_lp_data
 {
-	double **velocity;
+	double **speed;
 	double *flops;
 	double **flops_on_w;
 	int *workers;
@@ -38,11 +38,10 @@ static double _glp_resolve (int ns, int nw, double final_w_in_s[ns][nw],
 {
 	struct ispeed_lp_data *sd = (struct ispeed_lp_data *)specific_data;
 
-	double **velocity = sd->velocity;
+	double **speed = sd->speed;
 	double *flops = sd->flops;
 	
 	double **final_flops_on_w = sd->flops_on_w;
-        int *workers = sd->workers;
 	
 	double w_in_s[ns][nw];
 	double flops_on_w[ns][nw];
@@ -110,7 +109,7 @@ static double _glp_resolve (int ns, int nw, double final_w_in_s[ns][nw],
 				/* nflosp[s][w] */
 				ia[n] = curr_row_idx+s*nw+w+1;
 				ja[n] = colnum(w, s);
-				ar[n] = 1 / velocity[s][w];
+				ar[n] = 1 / speed[s][w];
 
 				n++;
 				
@@ -257,19 +256,18 @@ static double _glp_resolve (int ns, int nw, double final_w_in_s[ns][nw],
 static unsigned _compute_flops_distribution_over_ctxs(int ns, int nw, double w_in_s[ns][nw], double **flops_on_w, int *in_sched_ctxs, int *workers)
 {
 //	double flops[ns];
-//	double velocity[ns][nw];
+//	double speed[ns][nw];
 	double *flops = (double*)malloc(ns*sizeof(double));
-	double **velocity = (double **)malloc(ns*sizeof(double*));
+	double **speed = (double **)malloc(ns*sizeof(double*));
 	int i;
 	for(i = 0; i < ns; i++)
-		velocity[i] = (double*)malloc(nw*sizeof(double));
+		speed[i] = (double*)malloc(nw*sizeof(double));
 
 	int *sched_ctxs = in_sched_ctxs == NULL ? sc_hypervisor_get_sched_ctxs() : in_sched_ctxs;
 	
 	int w,s;
 
 	struct sc_hypervisor_wrapper* sc_w = NULL;
-	double total_flops = 0.0;
 	for(s = 0; s < ns; s++)
 	{
 		sc_w = sc_hypervisor_get_wrapper(sched_ctxs[s]);
@@ -278,24 +276,24 @@ static unsigned _compute_flops_distribution_over_ctxs(int ns, int nw, double w_i
 			w_in_s[s][w] = 0.0;
 			int worker = workers == NULL ? w : workers[w];
 
-			velocity[s][w] = sc_hypervisor_get_velocity_per_worker(sc_w, worker);
-			if(velocity[s][w] == -1.0)
+			speed[s][w] = sc_hypervisor_get_speed_per_worker(sc_w, worker);
+			if(speed[s][w] == -1.0)
 			{
 				enum starpu_worker_archtype arch = starpu_worker_get_type(worker);
-				velocity[s][w] = sc_hypervisor_get_velocity(sc_w, arch);
+				speed[s][w] = sc_hypervisor_get_speed(sc_w, arch);
 				if(arch == STARPU_CUDA_WORKER)
 				{
 					unsigned worker_in_ctx = starpu_sched_ctx_contains_worker(worker, sc_w->sched_ctx);
 					if(!worker_in_ctx)
 					{
-						double transfer_velocity = starpu_get_bandwidth_RAM_CUDA(worker) / 1000;
-						velocity[s][w] = (velocity[s][w] * transfer_velocity) / (velocity[s][w] + transfer_velocity);
+						double transfer_speed = starpu_get_bandwidth_RAM_CUDA(worker) / 1000;
+						speed[s][w] = (speed[s][w] * transfer_speed) / (speed[s][w] + transfer_speed);
 					}
 				}
 
 			}
 			
-//			printf("v[w%d][s%d] = %lf\n",w, s, velocity[s][w]);
+//			printf("v[w%d][s%d] = %lf\n",w, s, speed[s][w]);
 		}
 		struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(sched_ctxs[s]);
 		flops[s] = config->ispeed_ctx_sample/1000000000; /* in gflops */
@@ -310,7 +308,7 @@ static unsigned _compute_flops_distribution_over_ctxs(int ns, int nw, double w_i
 	double tmin = 0.0;
 
         struct ispeed_lp_data specific_data;
-        specific_data.velocity = velocity;
+        specific_data.speed = speed;
         specific_data.flops = flops;
         specific_data.flops_on_w = flops_on_w;
         specific_data.workers = workers;
@@ -319,16 +317,18 @@ static unsigned _compute_flops_distribution_over_ctxs(int ns, int nw, double w_i
 								tmin, tmax, smallest_tmax, _glp_resolve);
 
 	for(i = 0; i < ns; i++)
-		free(velocity[i]);
-	free(velocity);
+		free(speed[i]);
+	free(speed);
 	
 	return found_sol;
 }
 
-static void _try_resizing(void)
+static void _try_resizing(int *sched_ctxs, int nsched_ctxs , int *workers, int nworkers)
 {
-	int ns = sc_hypervisor_get_nsched_ctxs();
-	int nw = starpu_worker_get_count(); /* Number of different workers */
+	int ns = sched_ctxs == NULL ? sc_hypervisor_get_nsched_ctxs() : nsched_ctxs;
+	int nw = nworkers == -1 ? (int)starpu_worker_get_count() : nworkers; /* Number of different workers */
+
+	sched_ctxs = sched_ctxs == NULL ? sc_hypervisor_get_sched_ctxs() : sched_ctxs;
 	
 	double w_in_s[ns][nw];
 //			double flops_on_w[ns][nw];
@@ -337,19 +337,19 @@ static void _try_resizing(void)
 	for(i = 0; i < ns; i++)
 		flops_on_w[i] = (double*)malloc(nw*sizeof(double));
 	
-	unsigned found_sol = _compute_flops_distribution_over_ctxs(ns, nw,  w_in_s, flops_on_w, NULL, NULL);
+	unsigned found_sol = _compute_flops_distribution_over_ctxs(ns, nw,  w_in_s, flops_on_w, sched_ctxs, workers);
 	/* if we did find at least one solution redistribute the resources */
 	if(found_sol)
 	{
 		int w, s;
-		double nworkers[ns][2];
-		int nworkers_rounded[ns][2];
+		double nworkers_per_ctx[ns][2];
+		int nworkers_per_ctx_rounded[ns][2];
 		for(s = 0; s < ns; s++)
 		{
-			nworkers[s][0] = 0.0;
-			nworkers[s][1] = 0.0;
-			nworkers_rounded[s][0] = 0;
-			nworkers_rounded[s][1] = 0;
+			nworkers_per_ctx[s][0] = 0.0;
+			nworkers_per_ctx[s][1] = 0.0;
+			nworkers_per_ctx_rounded[s][0] = 0;
+			nworkers_per_ctx_rounded[s][1] = 0;
 			
 		}
 		
@@ -361,15 +361,15 @@ static void _try_resizing(void)
 				
 				if(arch == STARPU_CUDA_WORKER)
 				{
-					nworkers[s][0] += w_in_s[s][w];
+					nworkers_per_ctx[s][0] += w_in_s[s][w];
 					if(w_in_s[s][w] >= 0.3)
-						nworkers_rounded[s][0]++;
+						nworkers_per_ctx_rounded[s][0]++;
 				}
 				else
 				{
-					nworkers[s][1] += w_in_s[s][w];
+					nworkers_per_ctx[s][1] += w_in_s[s][w];
 					if(w_in_s[s][w] > 0.5)
-						nworkers_rounded[s][1]++;
+						nworkers_per_ctx_rounded[s][1]++;
 				}
 			}
 		}
@@ -377,31 +377,32 @@ static void _try_resizing(void)
 /* 					printf("%d: cpus = %lf gpus = %lf cpus_round = %d gpus_round = %d\n", s, nworkers[s][1], nworkers[s][0], */
 /* 					       nworkers_rounded[s][1], nworkers_rounded[s][0]); */
 		
-		sc_hypervisor_lp_redistribute_resources_in_ctxs(ns, 2, nworkers_rounded, nworkers);
+		sc_hypervisor_lp_redistribute_resources_in_ctxs(ns, 2, nworkers_per_ctx_rounded, nworkers_per_ctx, sched_ctxs);
 	}
 	for(i = 0; i < ns; i++)
 		free(flops_on_w[i]);
 	free(flops_on_w);
 }
 
-static void ispeed_lp_handle_poped_task(unsigned sched_ctx, int worker, struct starpu_task *task, uint32_t footprint)
+static void ispeed_lp_handle_poped_task(__attribute__((unused))unsigned sched_ctx, __attribute__((unused))int worker, 
+					__attribute__((unused))struct starpu_task *task, __attribute__((unused))uint32_t footprint)
 {
         int ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
         if(ret != EBUSY)
         {
                 unsigned criteria = sc_hypervisor_get_resize_criteria();
-                if(criteria != SC_NOTHING && criteria == SC_VELOCITY)
+                if(criteria != SC_NOTHING && criteria == SC_SPEED)
                 {
-                        if(sc_hypervisor_check_velocity_gap_btw_ctxs())
+                        if(sc_hypervisor_check_speed_gap_btw_ctxs())
                         {
-                                _try_resizing();
+                                _try_resizing(NULL, -1, NULL, -1);
                         }
                 }
                 starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
         }
 }
 
-static ispeed_lp_handle_idle_cycle(unsigned sched_ctx, int worker)
+static void ispeed_lp_handle_idle_cycle(unsigned sched_ctx, int worker)
 {
         int ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
         if(ret != EBUSY)
@@ -412,7 +413,7 @@ static ispeed_lp_handle_idle_cycle(unsigned sched_ctx, int worker)
 
 			if(sc_hypervisor_check_idle(sched_ctx, worker))
                         {
-                                _try_resizing();
+                                _try_resizing(NULL, -1, NULL, -1);
//                              sc_hypervisor_move_workers(sched_ctx, 3 - sched_ctx, &worker, 1, 1);
                         }
                 }
@@ -420,19 +421,29 @@ static ispeed_lp_handle_idle_cycle(unsigned sched_ctx, int worker)
         }
 }
 
+static void ispeed_lp_resize_ctxs(int *sched_ctxs, int nsched_ctxs , int *workers, int nworkers)
+{
+	int ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
+	if(ret != EBUSY)
+	{
+		_try_resizing(sched_ctxs, nsched_ctxs, workers, nworkers);
+		starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
+	}
+}
 
 static void ispeed_lp_end_ctx(unsigned sched_ctx)
 {
-	struct sc_hypervisor_wrapper* sc_w = sc_hypervisor_get_wrapper(sched_ctx);
-	int worker;
+/* 	struct sc_hypervisor_wrapper* sc_w = sc_hypervisor_get_wrapper(sched_ctx); */
+/* 	int worker; */
 /* 	for(worker = 0; worker < 12; worker++) */
-/* 		printf("%d/%d: speed %lf\n", worker, sched_ctx, sc_w->ref_velocity[worker]); */
+/* 		printf("%d/%d: speed %lf\n", worker, sched_ctx, sc_w->ref_speed[worker]); */
 
 	return;
 }
 
 struct sc_hypervisor_policy ispeed_lp_policy = {
 	.size_ctxs = NULL,
+	.resize_ctxs = ispeed_lp_resize_ctxs,
 	.handle_poped_task = ispeed_lp_handle_poped_task,
 	.handle_pushed_task = NULL,
 	.handle_idle_cycle = ispeed_lp_handle_idle_cycle,

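The CUDA correction above composes the compute speed with the RAM-to-GPU transfer bandwidth serially: when each unit of work must be both transferred and computed, the times add up, so the effective rate is

	1/s_eff = 1/s_comp + 1/s_xfer   =>   s_eff = (s_comp * s_xfer) / (s_comp + s_xfer)

which is exactly the speed[s][w] update applied to workers that are not yet part of the context.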
+ 19 - 19
sc_hypervisor/src/hypervisor_policies/ispeed_policy.c

@@ -22,16 +22,16 @@ static unsigned _get_fastest_sched_ctx(void)
 	int nsched_ctxs = sc_hypervisor_get_nsched_ctxs();
 
 	int fastest_sched_ctx = STARPU_NMAX_SCHED_CTXS;
-	double curr_velocity = 0.0;
-	double biggest_velocity = 0.0;
+	double curr_speed = 0.0;
+	double biggest_speed = 0.0;
 	int i;
 	for(i = 0; i < nsched_ctxs; i++)
 	{
-		curr_velocity = sc_hypervisor_get_ctx_velocity(sc_hypervisor_get_wrapper(sched_ctxs[i]));
-		if( curr_velocity > biggest_velocity)
+		curr_speed = sc_hypervisor_get_ctx_speed(sc_hypervisor_get_wrapper(sched_ctxs[i]));
+		if( curr_speed > biggest_speed)
 		{
 			fastest_sched_ctx = sched_ctxs[i];
-			biggest_velocity = curr_velocity;
+			biggest_speed = curr_speed;
 		}
 	}
 
@@ -43,16 +43,16 @@ static unsigned _get_slowest_sched_ctx(void)
 	int *sched_ctxs = sc_hypervisor_get_sched_ctxs();
 	int nsched_ctxs = sc_hypervisor_get_nsched_ctxs();
 
-	double smallest_velocity = sc_hypervisor_get_ctx_velocity(sc_hypervisor_get_wrapper(sched_ctxs[0]));
-	unsigned slowest_sched_ctx = smallest_velocity == -1.0  ? STARPU_NMAX_SCHED_CTXS : sched_ctxs[0];
-	double curr_velocity = 0.0;
+	double smallest_speed = sc_hypervisor_get_ctx_speed(sc_hypervisor_get_wrapper(sched_ctxs[0]));
+	unsigned slowest_sched_ctx = smallest_speed == -1.0  ? STARPU_NMAX_SCHED_CTXS : sched_ctxs[0];
+	double curr_speed = 0.0;
 	int i;
 	for(i = 1; i < nsched_ctxs; i++)
 	{
-		curr_velocity = sc_hypervisor_get_ctx_velocity(sc_hypervisor_get_wrapper(sched_ctxs[i]));
-		if((curr_velocity < smallest_velocity || smallest_velocity == 0.0) && curr_velocity != -1.0)
+		curr_speed = sc_hypervisor_get_ctx_speed(sc_hypervisor_get_wrapper(sched_ctxs[i]));
+		if((curr_speed < smallest_speed || smallest_speed == 0.0) && curr_speed != -1.0)
 		{
-			smallest_velocity = curr_velocity;
+			smallest_speed = curr_speed;
 			slowest_sched_ctx = sched_ctxs[i];
 		}
 	}
@@ -104,8 +104,8 @@ static int* _get_slowest_workers(unsigned sched_ctx, int *nworkers, enum starpu_
 
 					if(!considered)
 					{
-						double worker_velocity = sc_hypervisor_get_velocity_per_worker(sc_w, worker);
-						if(worker_velocity != -1.0)
+						double worker_speed = sc_hypervisor_get_speed_per_worker(sc_w, worker);
+						if(worker_speed != -1.0)
 						{
 							/* the first iteration*/
 							if(curr_workers[index] < 0)
@@ -119,9 +119,9 @@ static int* _get_slowest_workers(unsigned sched_ctx, int *nworkers, enum starpu_
 							else if(config->priority[worker] ==
 								config->priority[curr_workers[index]])
 							{
-								double curr_worker_velocity = sc_hypervisor_get_velocity_per_worker(sc_w, curr_workers[index]);
-//								printf("speed[%d] = %lf speed[%d] = %lf\n", worker, worker_velocity, curr_workers[index], curr_worker_velocity);
-								if(worker_velocity < curr_worker_velocity && curr_worker_velocity != -1.0)
+								double curr_worker_speed = sc_hypervisor_get_speed_per_worker(sc_w, curr_workers[index]);
+//								printf("speed[%d] = %lf speed[%d] = %lf\n", worker, worker_speed, curr_workers[index], curr_worker_speed);
+								if(worker_speed < curr_worker_speed && curr_worker_speed != -1.0)
 								{
 									curr_workers[index] = worker;
 								}
@@ -161,9 +161,9 @@ static void ispeed_handle_poped_task(unsigned sched_ctx, int worker, struct star
 						double new_speed = 0.0;
 						int i;
 						for(i = 0; i < nworkers_to_move; i++)
-							new_speed += sc_hypervisor_get_velocity_per_worker(sc_hypervisor_get_wrapper(fastest_sched_ctx), workers_to_move[i]);
-						double fastest_speed = sc_hypervisor_get_ctx_velocity(sc_hypervisor_get_wrapper(fastest_sched_ctx));
-						double slowest_speed = sc_hypervisor_get_ctx_velocity(sc_hypervisor_get_wrapper(slowest_sched_ctx));
+							new_speed += sc_hypervisor_get_speed_per_worker(sc_hypervisor_get_wrapper(fastest_sched_ctx), workers_to_move[i]);
+						double fastest_speed = sc_hypervisor_get_ctx_speed(sc_hypervisor_get_wrapper(fastest_sched_ctx));
+						double slowest_speed = sc_hypervisor_get_ctx_speed(sc_hypervisor_get_wrapper(slowest_sched_ctx));
 //						printf("fast_speed(%d) %lf slow_speed(%d) %lf new speed(%d) %lf \n", fastest_sched_ctx, fastest_speed, slowest_sched_ctx, 
 //						       slowest_speed, workers_to_move[0], new_speed);
 						if(fastest_speed != -1.0 && slowest_speed != -1.0 && (slowest_speed + new_speed) <= (fastest_speed - new_speed))

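The worker move guarded at the end of ispeed_handle_poped_task is accepted only if it cannot invert the ranking: slowest + new <= fastest - new, i.e. the speed worth moving must be at most half the gap between the two contexts. For example, with fastest = 10 Gflop/s, slowest = 2 Gflop/s and candidate workers worth new = 3 Gflop/s, the test 2 + 3 <= 10 - 3 holds and the workers move; with new = 5 Gflop/s it fails and nothing moves.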
+ 47 - 14
sc_hypervisor/src/hypervisor_policies/teft_lp_policy.c

@@ -53,6 +53,7 @@ static double _compute_workers_distrib(int ns, int nw, double final_w_in_s[ns][n
 	double tasks[nw][nt];
 	double times[nw][nt];
 	
+	/* times in ms */
 	sc_hypervisor_get_tasks_times(nw, nt, times, workers, size_ctxs, task_pools);
 
 	double res = 0.0;
@@ -101,7 +102,8 @@ static void _size_ctxs(int *sched_ctxs, int nsched_ctxs , int *workers, int nwor
 
 	/* smallest possible tmax, difficult to obtain as we
 	   compute the nr of flops and not the tasks */
-	double possible_tmax = sc_hypervisor_lp_get_tmax(nw, workers);
+	/* the lp computes it in seconds; it is converted to ms just before returning */
+	double possible_tmax = sc_hypervisor_lp_get_tmax(nw, workers);
 	double smallest_tmax = possible_tmax / 3;
 	double tmax = possible_tmax * ns;
 	double tmin = smallest_tmax;
@@ -149,21 +151,24 @@ static void size_if_required()
 	}
 }
 
-static void teft_lp_handle_submitted_job(struct starpu_codelet *cl, unsigned sched_ctx, uint32_t footprint)
+static void teft_lp_handle_submitted_job(struct starpu_codelet *cl, unsigned sched_ctx, uint32_t footprint, size_t data_size)
 {
 	/* count the tasks of the same type */
 	starpu_pthread_mutex_lock(&mutex);
-	sc_hypervisor_policy_add_task_to_pool(cl, sched_ctx, footprint, &task_pools);
+	sc_hypervisor_policy_add_task_to_pool(cl, sched_ctx, footprint, &task_pools, data_size);
 	starpu_pthread_mutex_unlock(&mutex);
 
 	size_if_required();
 }
 
-static void _try_resizing(void)
+static void _try_resizing(int *sched_ctxs, int nsched_ctxs , int *workers, int nworkers)
 {
 	starpu_trace_user_event(2);
-	int ns = sc_hypervisor_get_nsched_ctxs();
-	int nw = starpu_worker_get_count(); /* Number of different workers */
+	int ns = sched_ctxs == NULL ? sc_hypervisor_get_nsched_ctxs() : nsched_ctxs;
+	int nw = workers == NULL ? (int)starpu_worker_get_count() : nworkers; /* Number of different workers */
+
+	sched_ctxs = sched_ctxs == NULL ? sc_hypervisor_get_sched_ctxs() : sched_ctxs;
+
 	int nt = 0; /* Number of different kinds of tasks */
 	
 //			starpu_pthread_mutex_lock(&mutex);
@@ -195,19 +200,21 @@ static void _try_resizing(void)
 	specific_data.tmp_task_pools = tmp_task_pools;
 	specific_data.size_ctxs = 0;
 
-			/* smallest possible tmax, difficult to obtain as we
-			   compute the nr of flops and not the tasks */
+	/* smallest possible tmax, difficult to obtain as we
+	   compute the nr of flops and not the tasks */
+	/* the lp computes it in seconds; it is converted to ms just before returning */
 	double possible_tmax = sc_hypervisor_lp_get_tmax(nw, NULL);
-	double smallest_tmax = possible_tmax / 3;
+	double smallest_tmax = 0.0;//possible_tmax / 3;
 	double tmax = possible_tmax * ns;
 	double tmin = smallest_tmax;
+
 	unsigned found_sol = sc_hypervisor_lp_execute_dichotomy(ns, nw, w_in_s, 1, (void*)&specific_data, 
 								tmin, tmax, smallest_tmax, _compute_workers_distrib);
 //			starpu_pthread_mutex_unlock(&mutex);
 	
 	/* if we did find at least one solution redistribute the resources */
 	if(found_sol)
-		sc_hypervisor_lp_place_resources_in_ctx(ns, nw, w_in_s, NULL, NULL, 0);
+		sc_hypervisor_lp_place_resources_in_ctx(ns, nw, w_in_s, sched_ctxs, workers, 0);
 	
 	struct sc_hypervisor_policy_task_pool *next = NULL;
 	struct sc_hypervisor_policy_task_pool *tmp_tp = tmp_task_pools;
@@ -222,6 +229,7 @@ static void _try_resizing(void)
 		free(tasks_per_worker[i]);
 	free(tasks_per_worker);
 }
+
 static void teft_lp_handle_poped_task(unsigned sched_ctx, int worker, struct starpu_task *task, uint32_t footprint)
 {
 	struct sc_hypervisor_wrapper* sc_w = sc_hypervisor_get_wrapper(sched_ctx);
@@ -236,12 +244,12 @@ static void teft_lp_handle_poped_task(unsigned sched_ctx, int worker, struct sta
 		}
 
 		unsigned criteria = sc_hypervisor_get_resize_criteria();
-		if(criteria != SC_NOTHING && criteria == SC_VELOCITY)
+		if(criteria != SC_NOTHING && criteria == SC_SPEED)
 		{
 			
-			if(sc_hypervisor_check_velocity_gap_btw_ctxs())
+			if(sc_hypervisor_check_speed_gap_btw_ctxs())
 			{
-				_try_resizing();
+				_try_resizing(NULL, -1, NULL, -1);
 			}
 		}
 
@@ -274,7 +282,7 @@ static int teft_lp_handle_idle_cycle(unsigned sched_ctx, int worker)
 			
 			if(sc_hypervisor_check_idle(sched_ctx, worker))
 			{
-				_try_resizing();
+				_try_resizing(NULL, -1, NULL, -1);
 //				sc_hypervisor_move_workers(sched_ctx, 3 - sched_ctx, &worker, 1, 1);
 			}
 		}
@@ -288,8 +296,33 @@ static void teft_lp_size_ctxs(int *sched_ctxs, int nsched_ctxs , int *workers, i
 	sc_hypervisor_save_size_req(sched_ctxs, nsched_ctxs, workers, nworkers);
 }
 
+static void teft_lp_resize_ctxs(int *sched_ctxs, int nsched_ctxs , int *workers, int nworkers)
+{
+	int ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
+	if(ret != EBUSY)
+	{
+		struct sc_hypervisor_wrapper* sc_w  = NULL;
+		int s = 0;
+		for(s = 0; s < nsched_ctxs; s++)
+		{
+			 sc_w = sc_hypervisor_get_wrapper(sched_ctxs[s]);
+			
+			if((sc_w->submitted_flops + (0.1*sc_w->total_flops)) < sc_w->total_flops)
+			{
+				starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
+				return;
+			}
+		}
+
+
+		_try_resizing(sched_ctxs, nsched_ctxs, workers, nworkers);
+		starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
+	}
+}
+
 struct sc_hypervisor_policy teft_lp_policy = {
 	.size_ctxs = teft_lp_size_ctxs,
+	.resize_ctxs = teft_lp_resize_ctxs,
 	.handle_poped_task = teft_lp_handle_poped_task,
 	.handle_pushed_task = NULL,
 	.handle_idle_cycle = teft_lp_handle_idle_cycle,

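sc_hypervisor_lp_execute_dichotomy bisects on the makespan: it repeatedly asks the callback (here _compute_workers_distrib) whether a candidate tmax between tmin and tmax admits a feasible distribution, and keeps tightening the bracket. A schematic of the search under that assumption — lp_feasible is a hypothetical stand-in for the callback, and the real stopping rule may differ:

	double lo = tmin, hi = tmax;
	while (hi - lo > smallest_tmax)		/* stop once the bracket is tight */
	{
		double mid = (lo + hi) / 2.0;
		if (lp_feasible(ns, nw, w_in_s, &specific_data, mid))
			hi = mid;		/* feasible: try a smaller makespan */
		else
			lo = mid;		/* infeasible: allow more time */
	}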
+ 1 - 1
sc_hypervisor/src/policies_utils/lp_programs.c

@@ -241,7 +241,7 @@ double sc_hypervisor_lp_simulate_distrib_tasks(int ns, int nw, int nt, double w_
 				w_in_s[s][w] = (double)glp_mip_col_val(lp, nw*nt+s*nw+w+1);
                         else
 				w_in_s[s][w] = glp_get_col_prim(lp, nw*nt+s*nw+w+1);
-//			printf("w_in_s[%d][%d]=%lf\n", s, w, w_in_s[s][w]);
+//			printf("w %d in ctx %d = %lf\n", w, s, w_in_s[s][w]);
 		}
 //	printf("\n");
 

+ 8 - 9
sc_hypervisor/src/policies_utils/lp_tools.c

@@ -41,13 +41,13 @@ double sc_hypervisor_lp_get_nworkers_per_ctx(int nsched_ctxs, int ntypes_of_work
 		int ncuda = starpu_worker_get_count_by_type(STARPU_CUDA_WORKER);
 		if(ncuda != 0)
 		{
-			v[i][0] = sc_hypervisor_get_velocity(sc_w, STARPU_CUDA_WORKER);
-			v[i][1] = sc_hypervisor_get_velocity(sc_w, STARPU_CPU_WORKER);
+			v[i][0] = sc_hypervisor_get_speed(sc_w, STARPU_CUDA_WORKER);
+			v[i][1] = sc_hypervisor_get_speed(sc_w, STARPU_CPU_WORKER);
 		}
 		else
-			v[i][0] = sc_hypervisor_get_velocity(sc_w, STARPU_CPU_WORKER);
+			v[i][0] = sc_hypervisor_get_speed(sc_w, STARPU_CPU_WORKER);
 #else
-		v[i][0] = sc_hypervisor_get_velocity(sc_w, STARPU_CPU_WORKER);
+		v[i][0] = sc_hypervisor_get_speed(sc_w, STARPU_CPU_WORKER);
 #endif // STARPU_USE_CUDA
 		
 		flops[i] = sc_w->remaining_flops < 0.0 ? 0.0 : sc_w->remaining_flops/1000000000; //sc_w->total_flops/1000000000; /* in gflops*/
@@ -83,7 +83,7 @@ double sc_hypervisor_lp_get_tmax(int nw, int *workers)
 	int nsched_ctxs = sc_hypervisor_get_nsched_ctxs();
 
 	double res[nsched_ctxs][ntypes_of_workers];
-	return sc_hypervisor_lp_get_nworkers_per_ctx(nsched_ctxs, ntypes_of_workers, res, total_nw) * 1000;
+	return sc_hypervisor_lp_get_nworkers_per_ctx(nsched_ctxs, ntypes_of_workers, res, total_nw) * 1000.0;
 }
 
 void sc_hypervisor_lp_round_double_to_int(int ns, int nw, double res[ns][nw], int res_rounded[ns][nw])
@@ -299,9 +299,8 @@ void _lp_find_workers_to_remove(int nw, int tmp_nw_move[nw], int tmp_workers_mov
 	}
 }
 
-void sc_hypervisor_lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw], double res[ns][nw])
+void sc_hypervisor_lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw], double res[ns][nw], int *sched_ctxs)
 {
-	int *sched_ctxs = sc_hypervisor_get_sched_ctxs();
 	int s, s2, w;
 	for(s = 0; s < ns; s++)
 	{
@@ -490,7 +489,7 @@ void sc_hypervisor_lp_place_resources_in_ctx(int ns, int nw, double w_in_s[ns][n
 	}
 	
 	if(!do_size)
-		sc_hypervisor_lp_redistribute_resources_in_ctxs(ns, 2, nworkers_rounded, nworkers);
+		sc_hypervisor_lp_redistribute_resources_in_ctxs(ns, 2, nworkers_rounded, nworkers, sched_ctxs_input);
 	else
 	{
 		int *current_sched_ctxs = sched_ctxs_input == NULL ? sc_hypervisor_get_sched_ctxs() : sched_ctxs_input;
@@ -507,7 +506,7 @@ void sc_hypervisor_lp_place_resources_in_ctx(int ns, int nw, double w_in_s[ns][n
 			}
 		}
 		if(has_workers)
-			sc_hypervisor_lp_redistribute_resources_in_ctxs(ns, 2, nworkers_rounded, nworkers);
+			sc_hypervisor_lp_redistribute_resources_in_ctxs(ns, 2, nworkers_rounded, nworkers, current_sched_ctxs);
 		else
 			sc_hypervisor_lp_distribute_resources_in_ctxs(current_sched_ctxs, ns, 2, nworkers_rounded, nworkers, workers_input, nw);
 	}

+ 36 - 24
sc_hypervisor/src/policies_utils/policy_tools.c

@@ -17,7 +17,6 @@
 #include "sc_hypervisor_policy.h"
 #include "sc_hypervisor_intern.h"
 #include "sc_hypervisor_lp.h"
-#include <math.h>
 
 static int _compute_priority(unsigned sched_ctx)
 {
@@ -365,7 +364,7 @@ double sc_hypervisor_get_slowest_ctx_exec_time(void)
 
 //		double elapsed_time  = (curr_time - sc_w->start_time)/1000000;
 		struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(sc_w->sched_ctx);
-		double elapsed_time = (config->ispeed_ctx_sample/1000000000.0)/sc_hypervisor_get_ctx_velocity(sc_w);
+		double elapsed_time = (config->ispeed_ctx_sample/1000000000.0)/sc_hypervisor_get_ctx_speed(sc_w);
 		if(elapsed_time > slowest_time)
 			slowest_time = elapsed_time;
 
@@ -388,7 +387,7 @@ double sc_hypervisor_get_fastest_ctx_exec_time(void)
 		sc_w = sc_hypervisor_get_wrapper(sched_ctxs[s]);
 
 		struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(sc_w->sched_ctx);
-		double elapsed_time = (config->ispeed_ctx_sample/1000000000.0)/sc_hypervisor_get_ctx_velocity(sc_w);
+		double elapsed_time = (config->ispeed_ctx_sample/1000000000.0)/sc_hypervisor_get_ctx_speed(sc_w);
 		
 		if(elapsed_time < fastest_time)
 			fastest_time = elapsed_time;
@@ -437,25 +436,38 @@ void sc_hypervisor_get_tasks_times(int nw, int nt, double times[nw][nt], int *wo
                                 times[w][t] = NAN;
 			else
 			{
-                                times[w][t] = length / 1000.;
+                                times[w][t] = (length / 1000.);
 
 				double transfer_time = 0.0;
+				unsigned worker_in_ctx = starpu_sched_ctx_contains_worker(worker, tp->sched_ctx_id);
 				enum starpu_worker_archtype arch = starpu_worker_get_type(worker);
-				if(arch == STARPU_CUDA_WORKER)
+				if(!worker_in_ctx && !size_ctxs)
 				{
-					unsigned worker_in_ctx = starpu_sched_ctx_contains_worker(worker, tp->sched_ctx_id);
-					if(!worker_in_ctx && !size_ctxs)
+					if(arch == STARPU_CUDA_WORKER)
 					{
-						double transfer_velocity = starpu_get_bandwidth_RAM_CUDA(worker);
-						transfer_time +=  (tp->footprint / transfer_velocity) / 1000. ;
+						double transfer_speed = starpu_get_bandwidth_RAM_CUDA(worker);
+						transfer_time +=  (tp->data_size / transfer_speed) / 1000. ;
+						double latency = starpu_get_latency_RAM_CUDA(worker);
+						transfer_time += latency/1000.;
+					}
+					else if(arch == STARPU_CPU_WORKER)
+					{
+						if(!starpu_sched_ctx_contains_type_of_worker(arch, tp->sched_ctx_id))
+						{
+							double transfer_speed = starpu_get_bandwidth_CUDA_RAM(worker);
+							transfer_time += (tp->data_size / transfer_speed) / 1000. ;
+							double latency = starpu_get_latency_CUDA_RAM(worker);
+							transfer_time += latency / 1000.;
+						}
 					}
-					double latency = starpu_get_latency_RAM_CUDA(worker);
-					transfer_time += latency/1000.;
-
 				}
+
 //				printf("%d/%d %s x %d time = %lf transfer_time = %lf\n", w, tp->sched_ctx_id, tp->cl->model->symbol, tp->n, times[w][t], transfer_time);
 				times[w][t] += transfer_time;
 			}
+//			printf("sc%d w%d task %s nt %d times %lf s\n", tp->sched_ctx_id, w, tp->cl->model->symbol, tp->n, times[w][t]);
                 }
         }
 }
@@ -476,8 +488,8 @@ unsigned sc_hypervisor_check_idle(unsigned sched_ctx, int worker)
 	return 0;
 }
 
-/* check if there is a big velocity gap between the contexts */
-unsigned sc_hypervisor_check_velocity_gap_btw_ctxs(void)
+/* check if there is a big speed gap between the contexts */
+unsigned sc_hypervisor_check_speed_gap_btw_ctxs(void)
 {
 	int *sched_ctxs = sc_hypervisor_get_sched_ctxs();
 	int nsched_ctxs = sc_hypervisor_get_nsched_ctxs();
@@ -524,8 +536,8 @@ unsigned sc_hypervisor_check_velocity_gap_btw_ctxs(void)
 			{
 				sc_w = sc_hypervisor_get_wrapper(sched_ctxs[i]);
 				double v[nw];
-				v[0] = sc_hypervisor_get_velocity(sc_w, STARPU_CUDA_WORKER);
-				v[1] = sc_hypervisor_get_velocity(sc_w, STARPU_CPU_WORKER);
+				v[0] = sc_hypervisor_get_speed(sc_w, STARPU_CUDA_WORKER);
+				v[1] = sc_hypervisor_get_speed(sc_w, STARPU_CPU_WORKER);
 				
 				optimal_v[i] = nworkers_per_type[i][0] * v[0] + nworkers_per_type[i][1]* v[1];
 				_set_optimal_v(i, optimal_v[i]);
@@ -542,7 +554,7 @@ unsigned sc_hypervisor_check_velocity_gap_btw_ctxs(void)
 		{
 			sc_w = sc_hypervisor_get_wrapper(sched_ctxs[i]);
 			
-			double ctx_v = sc_hypervisor_get_ctx_velocity(sc_w);
+			double ctx_v = sc_hypervisor_get_ctx_speed(sc_w);
 			if(ctx_v == -1.0)
 				return 0;
 		}
@@ -551,19 +563,19 @@ unsigned sc_hypervisor_check_velocity_gap_btw_ctxs(void)
 		{
 			sc_w = sc_hypervisor_get_wrapper(sched_ctxs[i]);
 			
-			double ctx_v = sc_hypervisor_get_ctx_velocity(sc_w);
+			double ctx_v = sc_hypervisor_get_ctx_speed(sc_w);
 			if(ctx_v != -1.0 && ((ctx_v < 0.8*optimal_v[i]) || ctx_v > 1.2*optimal_v[i])) 
 				return 1;
 		}
 	}
-	else /* if we have not been able to compute a theoretical velocity consider the env variable
-		SC_MAX_VELOCITY_GAP and compare the speed of the contexts, whenever the difference
+	else /* if we have not been able to compute a theoretical speed, consider the env variable
+		SC_HYPERVISOR_MAX_SPEED_GAP and compare the speeds of the contexts; whenever the difference
 		between them is greater than the max value the function returns true */
 	{
 		for(i = 0; i < nsched_ctxs; i++)
 		{
 			sc_w = sc_hypervisor_get_wrapper(sched_ctxs[i]);
-			double ctx_v = sc_hypervisor_get_ctx_velocity(sc_w);
+			double ctx_v = sc_hypervisor_get_ctx_speed(sc_w);
 			if(ctx_v != -1.0)
 			{
 				for(j = 0; j < nsched_ctxs; j++)
@@ -575,11 +587,11 @@ unsigned sc_hypervisor_check_velocity_gap_btw_ctxs(void)
 							return 1;
 						
 						other_sc_w = sc_hypervisor_get_wrapper(sched_ctxs[j]);
-						double other_ctx_v = sc_hypervisor_get_ctx_velocity(other_sc_w);
+						double other_ctx_v = sc_hypervisor_get_ctx_speed(other_sc_w);
 						if(other_ctx_v != -1.0)
 						{
 							double gap = ctx_v < other_ctx_v ? other_ctx_v / ctx_v : ctx_v / other_ctx_v;
-							double max_vel = _get_max_velocity_gap();
+							double max_vel = _get_max_speed_gap();
 							if(gap > max_vel)
 								return 1;
 						}
@@ -601,7 +613,7 @@ unsigned sc_hypervisor_criteria_fulfilled(unsigned sched_ctx, int worker)
 		if(criteria == SC_IDLE)
 			return sc_hypervisor_check_idle(sched_ctx, worker);
 		else
-			return sc_hypervisor_check_velocity_gap_btw_ctxs();
+			return sc_hypervisor_check_speed_gap_btw_ctxs();
 	}
 	else
 		return 0;
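
For the fallback branch described in the comment above, the decision boils down to a ratio test between two context speeds. A worked sketch with illustrative values (check_gap_example is a hypothetical helper; 12 and 3 GFlop/s are made-up measurements, and 50 is SC_SPEED_MAX_GAP_DEFAULT):

	static void check_gap_example(void)
	{
		/* illustrative values: one ctx at 12 GFlop/s, the other at 3 */
		double ctx_v = 12.0, other_ctx_v = 3.0;
		double gap = ctx_v < other_ctx_v ? other_ctx_v / ctx_v : ctx_v / other_ctx_v;
		/* gap == 4.0, below the default max gap of 50,
		 * so no resizing would be triggered */
		if (gap > 50.0 /* what _get_max_speed_gap() returns by default */)
			; /* would call _try_resizing() here */
	}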

+ 39 - 46
sc_hypervisor/src/policies_utils/speed.c

@@ -19,7 +19,7 @@
 #include <math.h>
 
 
-double sc_hypervisor_get_ctx_velocity(struct sc_hypervisor_wrapper* sc_w)
+double sc_hypervisor_get_ctx_speed(struct sc_hypervisor_wrapper* sc_w)
 {
 	struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(sc_w->sched_ctx);
         double elapsed_flops = sc_hypervisor_get_elapsed_flops_per_sched_ctx(sc_w);
@@ -43,7 +43,7 @@ double sc_hypervisor_get_ctx_velocity(struct sc_hypervisor_wrapper* sc_w)
 	return -1.0;
 }
 
-double sc_hypervisor_get_velocity_per_worker(struct sc_hypervisor_wrapper *sc_w, unsigned worker)
+double sc_hypervisor_get_speed_per_worker(struct sc_hypervisor_wrapper *sc_w, unsigned worker)
 {
 	if(!starpu_sched_ctx_contains_worker(worker, sc_w->sched_ctx))
 		return -1.0;
@@ -74,8 +74,8 @@ double sc_hypervisor_get_velocity_per_worker(struct sc_hypervisor_wrapper *sc_w,
 /* /\* 			if(!worker_in_ctx) *\/ */
 /* /\* 			{ *\/ */
 
-/* /\* 				double transfer_velocity = starpu_get_bandwidth_RAM_CUDA(worker); *\/ */
-/* /\* 				elapsed_time +=  (elapsed_data_used / transfer_velocity) / 1000000 ; *\/ */
+/* /\* 				double transfer_speed = starpu_get_bandwidth_RAM_CUDA(worker); *\/ */
+/* /\* 				elapsed_time +=  (elapsed_data_used / transfer_speed) / 1000000 ; *\/ */
 /* /\* 			} *\/ */
 /* 			double latency = starpu_get_latency_RAM_CUDA(worker); */
 /* //			printf("%d/%d: latency %lf elapsed_time before %lf ntasks %d\n", worker, sc_w->sched_ctx, latency, elapsed_time, elapsed_tasks); */
@@ -84,7 +84,6 @@ double sc_hypervisor_get_velocity_per_worker(struct sc_hypervisor_wrapper *sc_w,
 /* 		} */
 			
                 double vel  = (elapsed_flops/elapsed_time);/* in Gflops/s */
-		sc_w->ref_velocity[worker] = sc_w->ref_velocity[worker] > 1.0 ? (sc_w->ref_velocity[worker] + vel) / 2 : vel; 
                 return vel;
         }
 
@@ -94,8 +93,8 @@ double sc_hypervisor_get_velocity_per_worker(struct sc_hypervisor_wrapper *sc_w,
 }
 
 
-/* compute an average value of the cpu/cuda velocity */
-double sc_hypervisor_get_velocity_per_worker_type(struct sc_hypervisor_wrapper* sc_w, enum starpu_worker_archtype arch)
+/* compute an average value of the cpu/cuda speed */
+double sc_hypervisor_get_speed_per_worker_type(struct sc_hypervisor_wrapper* sc_w, enum starpu_worker_archtype arch)
 {
 	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sc_w->sched_ctx);
         int worker;
@@ -104,7 +103,7 @@ double sc_hypervisor_get_velocity_per_worker_type(struct sc_hypervisor_wrapper*
 	if(workers->init_iterator)
                 workers->init_iterator(workers, &it);
 
-	double velocity = 0.0;
+	double speed = 0.0;
 	unsigned nworkers = 0;
         while(workers->has_next(workers, &it))
 	{
@@ -112,57 +111,51 @@ double sc_hypervisor_get_velocity_per_worker_type(struct sc_hypervisor_wrapper*
                 enum starpu_worker_archtype req_arch = starpu_worker_get_type(worker);
                 if(arch == req_arch)
                 {
-			double _vel = sc_hypervisor_get_velocity_per_worker(sc_w, worker);
-			if(_vel == -1.0) return -1.0;
-			velocity += _vel;
-			nworkers++;
+			double _vel = sc_hypervisor_get_speed_per_worker(sc_w, worker);
+			if(_vel > 0.0)
+			{
+				speed += _vel;
+				nworkers++;
+
+			}
 		}
-	}
-			
+	}			
 
-        return (nworkers != 0 ? velocity / nworkers : -1.0);
+	speed = ((nworkers != 0 && speed > 0.1) ? speed / nworkers : -1.0);
+	if(speed != -1.0)
+	{
+		if(arch == STARPU_CUDA_WORKER)
+			sc_w->ref_speed[0] = sc_w->ref_speed[0] > 1.0 ? (sc_w->ref_speed[0] + speed) / 2 : speed; 
+		else
+			sc_w->ref_speed[1] = sc_w->ref_speed[1] > 1.0 ? (sc_w->ref_speed[1] + speed) / 2 : speed; 
+	}
+	return speed;
 }
 
-/* compute an average value of the cpu/cuda old velocity */
-double sc_hypervisor_get_ref_velocity_per_worker_type(struct sc_hypervisor_wrapper* sc_w, enum starpu_worker_archtype arch)
+/* compute an average value of the cpu/cuda old speed */
+double sc_hypervisor_get_ref_speed_per_worker_type(struct sc_hypervisor_wrapper* sc_w, enum starpu_worker_archtype arch)
 {
-	double ref_velocity = 0.0;
-	unsigned nw = 0;
-
-	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sc_w->sched_ctx);
-	int worker;
-
-	struct starpu_sched_ctx_iterator it;
-	if(workers->init_iterator)
-		workers->init_iterator(workers, &it);
+	if(arch == STARPU_CUDA_WORKER && sc_w->ref_speed[0] > 0.0)
+		return sc_w->ref_speed[0];
+	else if(arch == STARPU_CPU_WORKER && sc_w->ref_speed[1] > 0.0)
+		return sc_w->ref_speed[1];
 
-	while(workers->has_next(workers, &it))
-	{
-		worker = workers->get_next(workers, &it);
-                enum starpu_worker_archtype req_arch = starpu_worker_get_type(worker);
-                if(arch == req_arch)
-                {
-			if(sc_w->ref_velocity[worker] < 1.0) return -1.0;
-			ref_velocity += sc_w->ref_velocity[worker];
-			nw++;
-		}
-	}
-	
-	return (nw != 0 ? ref_velocity / nw : -1.0);
+	return -1.0;
 }
 
-double sc_hypervisor_get_velocity(struct sc_hypervisor_wrapper *sc_w, enum starpu_worker_archtype arch)
+double sc_hypervisor_get_speed(struct sc_hypervisor_wrapper *sc_w, enum starpu_worker_archtype arch)
 {
 
-	double velocity = sc_hypervisor_get_velocity_per_worker_type(sc_w, arch);
-	if(velocity == -1.0)
+	double speed = sc_hypervisor_get_speed_per_worker_type(sc_w, arch);
+	if(speed == -1.0)
 	{
-		velocity = sc_hypervisor_get_ref_velocity_per_worker_type(sc_w, arch);
+		speed = sc_hypervisor_get_ref_speed_per_worker_type(sc_w, arch);
 	}
-	if(velocity == -1.0)
+	if(speed == -1.0)
 	{
-		velocity = arch == STARPU_CPU_WORKER ? 5.0 : 100.0;
+		speed = arch == STARPU_CPU_WORKER ? 5.0 : 100.0;
 	}
        
-	return velocity;
+	return speed;
 }
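
With this rework, sc_hypervisor_get_speed can no longer return -1.0: it falls back from the currently measured speed to the per-architecture ref_speed, and finally to the hard-coded defaults (5 GFlop/s for CPU workers, 100 for CUDA). A usage sketch from policy code (print_ctx_speeds is a hypothetical helper):

	#include <stdio.h>
	#include <sc_hypervisor_monitoring.h>

	static void print_ctx_speeds(unsigned sched_ctx)
	{
		struct sc_hypervisor_wrapper *sc_w = sc_hypervisor_get_wrapper(sched_ctx);
		double cpu_v  = sc_hypervisor_get_speed(sc_w, STARPU_CPU_WORKER);
		double cuda_v = sc_hypervisor_get_speed(sc_w, STARPU_CUDA_WORKER);
		/* both values are usable immediately, even before calibration */
		printf("ctx %u: cpu %lf GFlop/s, cuda %lf GFlop/s\n", sched_ctx, cpu_v, cuda_v);
	}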

+ 2 - 1
sc_hypervisor/src/policies_utils/task_pool.c

@@ -17,7 +17,7 @@
 
 #include "sc_hypervisor_policy.h"
 
-void sc_hypervisor_policy_add_task_to_pool(struct starpu_codelet *cl, unsigned sched_ctx, uint32_t footprint, struct sc_hypervisor_policy_task_pool **task_pools)
+void sc_hypervisor_policy_add_task_to_pool(struct starpu_codelet *cl, unsigned sched_ctx, uint32_t footprint, struct sc_hypervisor_policy_task_pool **task_pools, size_t data_size)
 {
 	struct sc_hypervisor_policy_task_pool *tp = NULL;
 
@@ -35,6 +35,7 @@ void sc_hypervisor_policy_add_task_to_pool(struct starpu_codelet *cl, unsigned s
 		tp->sched_ctx_id = sched_ctx;
 		tp->n = 0;
 		tp->next = *task_pools;
+		tp->data_size = data_size;
 		*task_pools = tp;
 	}
 

+ 23 - 14
sc_hypervisor/src/sc_hypervisor.c

@@ -28,7 +28,7 @@ static void notify_pushed_task(unsigned sched_ctx, int worker);
 static void notify_poped_task(unsigned sched_ctx, int worker, struct starpu_task *task, size_t data_size, uint32_t footprint);
 static void notify_post_exec_hook(unsigned sched_ctx, int taskid);
 static void notify_idle_end(unsigned sched_ctx, int  worker);
-static void notify_submitted_job(struct starpu_task *task, unsigned footprint);
+static void notify_submitted_job(struct starpu_task *task, unsigned footprint, size_t data_size);
 static void notify_delete_context(unsigned sched_ctx);
 
 extern struct sc_hypervisor_policy idle_policy;
@@ -63,6 +63,7 @@ static void _load_hypervisor_policy(struct sc_hypervisor_policy *policy)
 
 	hypervisor.policy.name = policy->name;
 	hypervisor.policy.size_ctxs = policy->size_ctxs;
+	hypervisor.policy.resize_ctxs = policy->resize_ctxs;
 	hypervisor.policy.handle_poped_task = policy->handle_poped_task;
 	hypervisor.policy.handle_pushed_task = policy->handle_pushed_task;
 	hypervisor.policy.handle_idle_cycle = policy->handle_idle_cycle;
@@ -134,10 +135,10 @@ struct starpu_sched_ctx_performance_counters* sc_hypervisor_init(struct sc_hyper
 {
 	hypervisor.min_tasks = 0;
 	hypervisor.nsched_ctxs = 0;
-	char* vel_gap = getenv("SC_HYPERVISOR_MAX_VELOCITY_GAP");
-	hypervisor.max_velocity_gap = vel_gap ? atof(vel_gap) : SC_VELOCITY_MAX_GAP_DEFAULT;
+	char* vel_gap = getenv("SC_HYPERVISOR_MAX_SPEED_GAP");
+	hypervisor.max_speed_gap = vel_gap ? atof(vel_gap) : SC_SPEED_MAX_GAP_DEFAULT;
 	char* crit =  getenv("SC_HYPERVISOR_TRIGGER_RESIZE");
-	hypervisor.resize_criteria = !crit ? SC_IDLE : strcmp(crit,"idle") == 0 ? SC_IDLE : (strcmp(crit,"speed") == 0 ? SC_VELOCITY : SC_NOTHING);
+	hypervisor.resize_criteria = !crit ? SC_IDLE : strcmp(crit,"idle") == 0 ? SC_IDLE : (strcmp(crit,"speed") == 0 ? SC_SPEED : SC_NOTHING);
 
 	starpu_pthread_mutex_init(&act_hypervisor_mutex, NULL);
 	hypervisor.start_executing_time = starpu_timing_now();
@@ -164,6 +165,9 @@ struct starpu_sched_ctx_performance_counters* sc_hypervisor_init(struct sc_hyper
 		starpu_pthread_mutex_init(&hypervisor.sched_ctx_w[i].mutex, NULL);
 		hypervisor.optimal_v[i] = 0.0;
 
+		hypervisor.sched_ctx_w[i].ref_speed[0] = -1.0;
+		hypervisor.sched_ctx_w[i].ref_speed[1] = -1.0;
+
 		int j;
 		for(j = 0; j < STARPU_NMAXWORKERS; j++)
 		{
@@ -177,7 +181,6 @@ struct starpu_sched_ctx_performance_counters* sc_hypervisor_init(struct sc_hyper
 			hypervisor.sched_ctx_w[i].elapsed_tasks[j] = 0;
 			hypervisor.sched_ctx_w[i].total_elapsed_flops[j] = 0.0;
 			hypervisor.sched_ctx_w[i].worker_to_be_removed[j] = 0;
-			hypervisor.sched_ctx_w[i].ref_velocity[j] = -1.0;
 		}
 	}
 
@@ -231,8 +234,8 @@ static void _print_current_time()
 			{
 				struct sc_hypervisor_wrapper *sc_w = &hypervisor.sched_ctx_w[hypervisor.sched_ctxs[i]];
 				
-				double cpu_speed = sc_hypervisor_get_velocity(sc_w, STARPU_CPU_WORKER);
-				double cuda_speed = sc_hypervisor_get_velocity(sc_w, STARPU_CUDA_WORKER);
+				double cpu_speed = sc_hypervisor_get_speed(sc_w, STARPU_CPU_WORKER);
+				double cuda_speed = sc_hypervisor_get_speed(sc_w, STARPU_CUDA_WORKER);
 				int ncpus = sc_hypervisor_get_nworkers_ctx(sc_w->sched_ctx, STARPU_CPU_WORKER);
 				int ncuda = sc_hypervisor_get_nworkers_ctx(sc_w->sched_ctx, STARPU_CUDA_WORKER);
 				fprintf(stdout, "%d: cpu_v = %lf cuda_v = %lf ncpus = %d ncuda = %d\n", hypervisor.sched_ctxs[i], cpu_speed, cuda_speed, ncpus, ncuda);
@@ -352,9 +355,9 @@ void sc_hypervisor_unregister_ctx(unsigned sched_ctx)
 }
 
 
-double _get_max_velocity_gap()
+double _get_max_speed_gap()
 {
-	return hypervisor.max_velocity_gap;
+	return hypervisor.max_speed_gap;
 }
 
 unsigned sc_hypervisor_get_resize_criteria()
@@ -716,7 +719,7 @@ static unsigned _ack_resize_completed(unsigned sched_ctx, int worker)
 
 /* Enqueue a resize request for 'sched_ctx', to be executed when the
  * 'task_tag' tasks of 'sched_ctx' complete.  */
-void sc_hypervisor_resize(unsigned sched_ctx, int task_tag)
+void sc_hypervisor_post_resize_request(unsigned sched_ctx, int task_tag)
 {
 	struct resize_request_entry *entry;
 
@@ -731,6 +734,12 @@ void sc_hypervisor_resize(unsigned sched_ctx, int task_tag)
 	starpu_pthread_mutex_unlock(&hypervisor.resize_mut[sched_ctx]);
 }
 
+void sc_hypervisor_resize_ctxs(int *sched_ctxs, int nsched_ctxs , int *workers, int nworkers)
+{
+	if(hypervisor.policy.resize_ctxs)
+		hypervisor.policy.resize_ctxs(sched_ctxs, nsched_ctxs, workers, nworkers);
+}
+
 /* notifies the hypervisor that the worker is no longer idle and a new task was pushed on its queue */
 static void notify_idle_end(unsigned sched_ctx, int worker)
 {
@@ -866,21 +875,21 @@ static void notify_post_exec_hook(unsigned sched_ctx, int task_tag)
 	return;
 }
 
-static void notify_submitted_job(struct starpu_task *task, uint32_t footprint)
+static void notify_submitted_job(struct starpu_task *task, uint32_t footprint, size_t data_size)
 {
 	starpu_pthread_mutex_lock(&act_hypervisor_mutex);
 	hypervisor.sched_ctx_w[task->sched_ctx].submitted_flops += task->flops;
 	starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
 
 	if(hypervisor.policy.handle_submitted_job && !type_of_tasks_known)
-		hypervisor.policy.handle_submitted_job(task->cl, task->sched_ctx, footprint);
+		hypervisor.policy.handle_submitted_job(task->cl, task->sched_ctx, footprint, data_size);
 }
 
-void sc_hypervisor_set_type_of_task(struct starpu_codelet *cl, unsigned sched_ctx, uint32_t footprint)
+void sc_hypervisor_set_type_of_task(struct starpu_codelet *cl, unsigned sched_ctx, uint32_t footprint, size_t data_size)
 {
 	type_of_tasks_known = 1;
 	if(hypervisor.policy.handle_submitted_job)
-		hypervisor.policy.handle_submitted_job(cl, sched_ctx, footprint);
+		hypervisor.policy.handle_submitted_job(cl, sched_ctx, footprint, data_size);
 }
 
 static void notify_delete_context(unsigned sched_ctx)
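
After the rename, application-driven resizing posts its request through sc_hypervisor_post_resize_request. A sketch in the spirit of sc_hypervisor/examples/app_driven_test/app_driven_test.c (request_resize is a hypothetical helper and the tag value 42 is illustrative):

	static void request_resize(unsigned sched_ctx)
	{
		struct starpu_task *task = starpu_task_create();
		task->hypervisor_tag = 42; /* illustrative tag */
		/* ... set the codelet and handles, then submit the task ... */
		sc_hypervisor_post_resize_request(sched_ctx, 42);
	}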

+ 4 - 4
sc_hypervisor/src/sc_hypervisor_intern.h

@@ -17,7 +17,7 @@
 #include <sc_hypervisor.h>
 #include <common/uthash.h>
 
-#define SC_VELOCITY_MAX_GAP_DEFAULT 50
+#define SC_SPEED_MAX_GAP_DEFAULT 50
 
 struct size_request
 {
@@ -78,8 +78,8 @@ struct sc_hypervisor
 	/* time when the hypervisor started */
 	double start_executing_time;
 
-	/* max velocity diff btw ctx before triggering resizing */
-	double max_velocity_gap;
+	/* max speed diff btw ctx before triggering resizing */
+	double max_speed_gap;
 	
 	/* criteria to trigger resizing */
 	unsigned resize_criteria;
@@ -101,7 +101,7 @@ void _add_config(unsigned sched_ctx);
 
 void _remove_config(unsigned sched_ctx);
 
-double _get_max_velocity_gap();
+double _get_max_speed_gap();
 
 double _get_optimal_v(unsigned sched_ctx);
 void _set_optimal_v(unsigned sched_ctx, double optimal_v);

+ 27 - 27
src/common/list.h

@@ -24,48 +24,48 @@
  * *********************************************************
  * LIST_TYPE(FOO, content);
  *  - declares the following types:
- *      + for cells: FOO
- *      + for lists: FOO_list
- *      + for iterators: FOO
+ *      + for cells: struct FOO
+ *      + for lists: struct FOO_list
+ *      + for iterators: struct FOO
  *  - declares the following accessors:
  *     * create a cell
- *   FOO_t      FOO_new(void);
+ *   struct FOO*	FOO_new(void);
  *     * delete a cell
- *   void       FOO_delete(FOO_t);
+ *   void		FOO_delete(struct FOO*);
  *     * create an (empty) list
- *   FOO_list_t FOO_list_new(void);
+ *   struct FOO_list*	FOO_list_new(void);
  *     * delete a list
- *   void       FOO_list_delete(FOO_list_t);
+ *   void		FOO_list_delete(struct FOO_list*);
  *     * test whether a list is empty
- *   int        FOO_list_empty(FOO_list_t);
+ *   int		FOO_list_empty(struct FOO_list*);
  *     * remove an element from the list
- *   void       FOO_list_erase(FOO_list_t, FOO_t);
+ *   void		FOO_list_erase(struct FOO_list*, struct FOO*);
  *     * append an element at the tail of the list
- *   void       FOO_list_push_back(FOO_list_t, FOO_t);
+ *   void		FOO_list_push_back(struct FOO_list*, struct FOO*);
  *     * add an element at the head of the list
- *   void       FOO_list_push_front(FOO_list_t, FOO_t);
+ *   void		FOO_list_push_front(struct FOO_list*, struct FOO*);
  *     * append the second list at the end of the first list
- *   FOO_t      FOO_list_push_list_back(FOO_list_t, FOO_list_t);
+ *   struct FOO*	FOO_list_push_list_back(struct FOO_list*, struct FOO_list*);
  *     * add the first list at the beginning of the second list
- *   FOO_t      FOO_list_push_list_front(FOO_list_t, FOO_list_t);
+ *   struct FOO*	FOO_list_push_list_front(struct FOO_list*, struct FOO_list*);
  *     * remove the element at the tail of the list
- *   FOO_t      FOO_list_pop_back(FOO_list_t);
+ *   struct FOO*	FOO_list_pop_back(struct FOO_list*);
  *     * remove the element at the head of the list
- *   FOO_t      FOO_list_pop_front(FOO_list_t);
+ *   struct FOO*	FOO_list_pop_front(struct FOO_list*);
  *     * return the element at the tail of the list
- *   FOO_t      FOO_list_back(FOO_list_t);
+ *   struct FOO*	FOO_list_back(struct FOO_list*);
  *     * return the element at the head of the list
- *   FOO_t      FOO_list_front(FOO_list_t);
+ *   struct FOO*	FOO_list_front(struct FOO_list*);
  *     * check that the linked list is consistent
- *   int	FOO_list_check(FOO_list_t);
+ *   int		FOO_list_check(struct FOO_list*);
  *     *
- *   FOO_t      FOO_list_begin(FOO_list_t);
+ *   struct FOO*	FOO_list_begin(struct FOO_list*);
  *     *
- *   FOO_t      FOO_list_end(FOO_list_t);
+ *   struct FOO*	FOO_list_end(struct FOO_list*);
  *     *
- *   FOO_t      FOO_list_next(FOO_t)
+ *   struct FOO*	FOO_list_next(struct FOO*)
  *     *
- *   int        FOO_list_size(FOO_list_t)
+ *   int		FOO_list_size(struct FOO_list*)
  * *********************************************************
  * Usage examples:
  *  - initially, we have:
@@ -79,16 +79,16 @@
  *      int a;
  *      int b;
  *    );
- *    which creates the types ma_structure_t and ma_structure_list_t.
+ *    which creates the types struct ma_structure and struct ma_structure_list.
  *  - allocating an empty list:
- *  ma_structure_list_t l = ma_structure_list_new();
+ *  struct ma_structure_list * l = ma_structure_list_new();
  *  - adding an element 'e' at the head of the list 'l':
- *  ma_structure_t e = ma_structure_new();
+ *  struct ma_structure * e = ma_structure_new();
  *  e->a = 0;
- *  e->b = 1;
+ *  e->b = 0;
  *  ma_structure_list_push_front(l, e);
  *  - list iterator:
- *  ma_structure i;
+ *  struct ma_structure * i;
  *  for(i  = ma_structure_list_begin(l);
  *      i != ma_structure_list_end(l);
  *      i  = ma_structure_list_next(i))
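
Written out against the new struct-based API, the example from this comment compiles as follows (a sketch; the list is deliberately leaked for brevity):

	LIST_TYPE(ma_structure,
		int a;
		int b;
	);

	void example(void)
	{
		struct ma_structure_list *l = ma_structure_list_new();
		struct ma_structure *e = ma_structure_new();
		e->a = 0;
		e->b = 0;
		ma_structure_list_push_front(l, e);

		struct ma_structure *i;
		for (i = ma_structure_list_begin(l);
		     i != ma_structure_list_end(l);
		     i = ma_structure_list_next(i))
			i->a++;
	}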

+ 12 - 2
src/core/perfmodel/perfmodel_bus.c

@@ -1381,12 +1381,22 @@ static void write_bus_bandwidth_file_content(void)
 
 double starpu_get_bandwidth_RAM_CUDA(unsigned cudadev)
 {
-	return bandwidth_matrix[0][cudadev+1];
+	return bandwidth_matrix[STARPU_MAIN_RAM][cudadev+1];
 }
 
 double starpu_get_latency_RAM_CUDA(unsigned cudadev)
 {
-	return latency_matrix[0][cudadev+1];
+	return latency_matrix[STARPU_MAIN_RAM][cudadev+1];
+}
+
+double starpu_get_bandwidth_CUDA_RAM(unsigned cudadev)
+{
+	return bandwidth_matrix[cudadev+1][STARPU_MAIN_RAM];
+}
+
+double starpu_get_latency_CUDA_RAM(unsigned cudadev)
+{
+	return latency_matrix[cudadev+1][STARPU_MAIN_RAM];
 }
 
 void starpu_bus_print_bandwidth(FILE *f)
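
These getters expose the per-device bus characteristics the hypervisor needs to price data movement. A sketch of the transfer-time estimate they feed, following the scaling used in sc_hypervisor_get_tasks_times() above (estimate_cuda_transfer is a hypothetical helper; the units are assumed to match the bus performance matrices):

	static double estimate_cuda_transfer(unsigned cudadev, size_t data_size)
	{
		double bw  = starpu_get_bandwidth_RAM_CUDA(cudadev);
		double lat = starpu_get_latency_RAM_CUDA(cudadev);
		/* size/bandwidth plus latency, scaled as in the policy code */
		return (data_size / bw) / 1000. + lat / 1000.;
	}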

+ 50 - 12
src/core/sched_ctx.c

@@ -33,8 +33,6 @@ static unsigned _starpu_worker_get_first_free_sched_ctx(struct _starpu_worker *w
 
 static unsigned _starpu_worker_get_sched_ctx_id(struct _starpu_worker *worker, unsigned sched_ctx_id);
 
-static unsigned _get_workers_list(struct _starpu_sched_ctx *sched_ctx, int **workerids);
-
 static void _starpu_worker_gets_into_ctx(unsigned sched_ctx_id, struct _starpu_worker *worker)
 {
 	unsigned worker_sched_ctx_id = _starpu_worker_get_sched_ctx_id(worker, sched_ctx_id);
@@ -163,7 +161,7 @@ static void _starpu_add_workers_to_sched_ctx(struct _starpu_sched_ctx *sched_ctx
 			workers->add(workers, worker);
 			workers_to_add[i] = worker;
 		}
-}
+	}
 
 	if(sched_ctx->sched_policy->add_workers)
 	{
@@ -205,7 +203,7 @@ static void _starpu_sched_ctx_free_scheduling_data(struct _starpu_sched_ctx *sch
 {
 	int *workerids = NULL;
 
-	unsigned nworkers_ctx = _get_workers_list(sched_ctx, &workerids);
+	unsigned nworkers_ctx = starpu_sched_ctx_get_workers_list(sched_ctx->id, &workerids);
 
 	if(nworkers_ctx > 0 && sched_ctx->sched_policy->remove_workers)
 		sched_ctx->sched_policy->remove_workers(sched_ctx->id, workerids, nworkers_ctx);
@@ -242,7 +240,7 @@ static void _starpu_sched_ctx_create_hwloc_tree(struct _starpu_sched_ctx *sched_
 }
 #endif
 
-struct _starpu_sched_ctx*  _starpu_create_sched_ctx(const char *policy_name, int *workerids,
+struct _starpu_sched_ctx*  _starpu_create_sched_ctx(struct starpu_sched_policy *policy, int *workerids,
 				  int nworkers_ctx, unsigned is_initial_sched,
 				  const char *sched_name)
 {
@@ -279,7 +277,7 @@ struct _starpu_sched_ctx*  _starpu_create_sched_ctx(const char *policy_name, int
 	_starpu_barrier_counter_init(&sched_ctx->tasks_barrier, 0);
 
 	/* init the strategy structs and the worker_collection of the resources of the context */
-	_starpu_init_sched_policy(config, sched_ctx, policy_name);
+	_starpu_init_sched_policy(config, sched_ctx, policy);
 
 	/* construct the collection of workers(list/tree/etc.) */
 	sched_ctx->workers->init(sched_ctx->workers);
@@ -437,6 +435,9 @@ unsigned starpu_sched_ctx_create_inside_interval(const char *policy_name, const
 						 int min_ncpus, int max_ncpus, int min_ngpus, int max_ngpus,
 						 unsigned allow_overlap)
 {
+	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
+	struct starpu_sched_policy *selected_policy = _starpu_select_sched_policy(config, policy_name);
+
 	struct _starpu_sched_ctx *sched_ctx = NULL;
 	int workers[max_ncpus + max_ngpus];
 	int nw = 0;
@@ -449,7 +450,7 @@ unsigned starpu_sched_ctx_create_inside_interval(const char *policy_name, const
 	for(i = 0; i < nw; i++)
 		printf("%d ", workers[i]);
 	printf("\n");
-	sched_ctx = _starpu_create_sched_ctx(policy_name, workers, nw, 0, sched_name);
+	sched_ctx = _starpu_create_sched_ctx(selected_policy, workers, nw, 0, sched_name);
 	sched_ctx->min_ncpus = min_ncpus;
 	sched_ctx->max_ncpus = max_ncpus;
 	sched_ctx->min_ngpus = min_ngpus;
@@ -462,11 +463,27 @@ unsigned starpu_sched_ctx_create_inside_interval(const char *policy_name, const
 	return sched_ctx->id;
 
 }
+
 unsigned starpu_sched_ctx_create(const char *policy_name, int *workerids,
 				 int nworkers, const char *sched_name)
 {
+	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
+	struct starpu_sched_policy *selected_policy = _starpu_select_sched_policy(config, policy_name);
+
 	struct _starpu_sched_ctx *sched_ctx = NULL;
-	sched_ctx = _starpu_create_sched_ctx(policy_name, workerids, nworkers, 0, sched_name);
+	sched_ctx = _starpu_create_sched_ctx(selected_policy, workerids, nworkers, 0, sched_name);
+
+	_starpu_update_workers_with_ctx(sched_ctx->workers->workerids, sched_ctx->workers->nworkers, sched_ctx->id);
+#ifdef STARPU_USE_SC_HYPERVISOR
+	sched_ctx->perf_counters = NULL;
+#endif
+	return sched_ctx->id;
+}
+
+unsigned starpu_sched_ctx_create_with_custom_policy(struct starpu_sched_policy *policy, int *workerids, int nworkers, const char *sched_name)
+{
+	struct _starpu_sched_ctx *sched_ctx = NULL;
+	sched_ctx = _starpu_create_sched_ctx(policy, workerids, nworkers, 0, sched_name);
 
 	_starpu_update_workers_with_ctx(sched_ctx->workers->workerids, sched_ctx->workers->nworkers, sched_ctx->id);
 #ifdef STARPU_USE_SC_HYPERVISOR
@@ -518,7 +535,7 @@ void starpu_sched_ctx_delete(unsigned sched_ctx_id)
 	STARPU_ASSERT(sched_ctx->id != STARPU_NMAX_SCHED_CTXS);
 
 	int *workerids;
-	unsigned nworkers_ctx = _get_workers_list(sched_ctx, &workerids);
+	unsigned nworkers_ctx = starpu_sched_ctx_get_workers_list(sched_ctx->id, &workerids);
 	
 	/* if both of them have all the resources it is pointless */
 	/* trying to transfer resources from one ctx to the other */
@@ -541,7 +558,7 @@ void starpu_sched_ctx_delete(unsigned sched_ctx_id)
 
 	}
 
-	/* workerids is malloc-ed in _get_workers_list, don't forget to free it when
+	/* workerids is malloc-ed in starpu_sched_ctx_get_workers_list, don't forget to free it when
 	   you don't use it anymore */
 	free(workerids);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&changing_ctx_mutex[sched_ctx_id]);
@@ -777,7 +794,7 @@ void _starpu_decrement_nsubmitted_tasks_of_sched_ctx(unsigned sched_ctx_id)
 			if(sched_ctx->id != STARPU_NMAX_SCHED_CTXS)
 			{
 				int *workerids = NULL;
-				unsigned nworkers = _get_workers_list(sched_ctx, &workerids);
+				unsigned nworkers = starpu_sched_ctx_get_workers_list(sched_ctx->id, &workerids);
 				
 				if(nworkers > 0)
 				{
@@ -864,8 +881,9 @@ struct starpu_worker_collection* starpu_sched_ctx_create_worker_collection(unsig
 	return sched_ctx->workers;
 }
 
-static unsigned _get_workers_list(struct _starpu_sched_ctx *sched_ctx, int **workerids)
+unsigned starpu_sched_ctx_get_workers_list(unsigned sched_ctx_id, int **workerids)
 {
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 	struct starpu_worker_collection *workers = sched_ctx->workers;
 	*workerids = (int*)malloc(workers->nworkers*sizeof(int));
 	int worker;
@@ -995,6 +1013,26 @@ unsigned starpu_sched_ctx_contains_worker(int workerid, unsigned sched_ctx_id)
 	return 0;
 }
 
+unsigned starpu_sched_ctx_contains_type_of_worker(enum starpu_worker_archtype arch, unsigned sched_ctx_id)
+{
+	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
+	int worker;
+
+	struct starpu_sched_ctx_iterator it;
+	if(workers->init_iterator)
+		workers->init_iterator(workers, &it);
+
+	while(workers->has_next(workers, &it))
+	{
+		worker = workers->get_next(workers, &it);
+		enum starpu_worker_archtype curr_arch = starpu_worker_get_type(worker);
+		if(curr_arch == arch)
+			return 1;
+	}
+	return 0;
+
+}
+
 unsigned _starpu_worker_belongs_to_a_sched_ctx(int workerid, unsigned sched_ctx_id)
 {
 	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
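
The split between policy selection and context creation also enables the new starpu_sched_ctx_create_with_custom_policy, which takes a struct starpu_sched_policy directly instead of a registered policy name. A sketch, assuming a user-defined policy such as the one added in examples/sched_ctx/dummy_sched_with_ctx.c (create_custom_ctx is a hypothetical helper and the worker ids are illustrative):

	extern struct starpu_sched_policy dummy_sched_policy;

	static unsigned create_custom_ctx(void)
	{
		int workerids[2] = {0, 1}; /* illustrative worker ids */
		return starpu_sched_ctx_create_with_custom_policy(&dummy_sched_policy,
								  workerids, 2, "dummy");
	}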

+ 1 - 1
src/core/sched_ctx.h

@@ -124,7 +124,7 @@ void _starpu_init_sched_ctx_for_worker(unsigned workerid);
 void _starpu_delete_sched_ctx_for_worker(unsigned workerid);
 
 /* allocate all structures belonging to a context */
-struct _starpu_sched_ctx*  _starpu_create_sched_ctx(const char *policy_name, int *workerid, int nworkerids, unsigned is_init_sched, const char *sched_name);
+struct _starpu_sched_ctx*  _starpu_create_sched_ctx(struct starpu_sched_policy *policy, int *workerid, int nworkerids, unsigned is_init_sched, const char *sched_name);
 
 /* delete all sched_ctx */
 void _starpu_delete_all_sched_ctxs();

+ 3 - 6
src/core/sched_policy.c

@@ -126,7 +126,7 @@ static void display_sched_help_message(void)
 	 }
 }
 
-static struct starpu_sched_policy *select_sched_policy(struct _starpu_machine_config *config, const char *required_policy)
+struct starpu_sched_policy *_starpu_select_sched_policy(struct _starpu_machine_config *config, const char *required_policy)
 {
 	struct starpu_sched_policy *selected_policy = NULL;
 	struct starpu_conf *user_conf = config->conf;
@@ -155,7 +155,7 @@ static struct starpu_sched_policy *select_sched_policy(struct _starpu_machine_co
 	return &_starpu_sched_eager_policy;
 }
 
-void _starpu_init_sched_policy(struct _starpu_machine_config *config, struct _starpu_sched_ctx *sched_ctx, const char *required_policy)
+void _starpu_init_sched_policy(struct _starpu_machine_config *config, struct _starpu_sched_ctx *sched_ctx, struct starpu_sched_policy *selected_policy)
 {
 	/* Perhaps we have to display some help */
 	display_sched_help_message();
@@ -168,9 +168,6 @@ void _starpu_init_sched_policy(struct _starpu_machine_config *config, struct _st
 	/* Set calibrate flag */
 	_starpu_set_calibrate_flag(config->conf->calibrate);
 
-	struct starpu_sched_policy *selected_policy;
-	selected_policy = select_sched_policy(config, required_policy);
-
 	load_sched_policy(selected_policy, sched_ctx);
 
 	sched_ctx->sched_policy->init_sched(sched_ctx->id);
@@ -295,7 +292,7 @@ static int _starpu_push_task_on_specific_worker(struct starpu_task *task, int wo
 
 static int _starpu_nworkers_able_to_execute_task(struct starpu_task *task, struct _starpu_sched_ctx *sched_ctx)
 {
-	int worker = -1, nworkers = 0;
+	unsigned worker = 0, nworkers = 0;
 	struct starpu_worker_collection *workers = sched_ctx->workers;
 
 	struct starpu_sched_ctx_iterator it;

+ 3 - 1
src/core/sched_policy.h

@@ -27,10 +27,12 @@ struct starpu_machine_config;
 struct starpu_sched_policy *_starpu_get_sched_policy( struct _starpu_sched_ctx *sched_ctx);
 
 void _starpu_init_sched_policy(struct _starpu_machine_config *config,
-			       struct _starpu_sched_ctx *sched_ctx, const char *required_policy);
+			       struct _starpu_sched_ctx *sched_ctx, struct starpu_sched_policy *policy);
 
 void _starpu_deinit_sched_policy(struct _starpu_sched_ctx *sched_ctx);
 
+struct starpu_sched_policy *_starpu_select_sched_policy(struct _starpu_machine_config *config, const char *required_policy);
+
 int _starpu_push_task(struct _starpu_job *task);
 
 /* actually pushes the tasks to the specific worker or to the scheduler */

+ 11 - 2
src/core/task.c

@@ -237,9 +237,18 @@ int _starpu_submit_job(struct _starpu_job *j)
 	   && sched_ctx->perf_counters != NULL)
 	{
 		_starpu_compute_buffers_footprint(j->task->cl->model, STARPU_CPU_DEFAULT, 0, j);
-		sched_ctx->perf_counters->notify_submitted_job(j->task, j->footprint);
+		int i;
+		size_t data_size = 0;
+		for(i = 0; i < STARPU_NMAXBUFS; i++)
+		{
+			starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, i);
+			if (handle != NULL)
+				data_size += _starpu_data_get_size(handle);
+		}
+
+		sched_ctx->perf_counters->notify_submitted_job(j->task, j->footprint, data_size);
 	}
-#endif
+#endif//STARPU_USE_SC_HYPERVISOR
 
 	/* We retain handle reference count */
 	if (task->cl)

+ 5 - 1
src/core/workers.c

@@ -1002,7 +1002,11 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 	_starpu_initialize_current_task_key();
 
 	if (!is_a_sink)
-		_starpu_create_sched_ctx(config.conf->sched_policy_name, NULL, -1, 1, "init");
+	{
+		struct starpu_sched_policy *selected_policy = _starpu_select_sched_policy(&config, config.conf->sched_policy_name);
+		_starpu_create_sched_ctx(selected_policy, NULL, -1, 1, "init");
+	}
 
 	_starpu_initialize_registered_performance_models();
 

+ 1 - 1
tests/disk/disk_copy.c

@@ -41,7 +41,7 @@ int main(int argc, char **argv)
 	if (ret == -ENODEV) goto enodev;
 
 	/* register a disk */
-	int new_dd = starpu_disk_register(&starpu_disk_unistd_ops, (void *) "/tmp/", 1024*1024*200);
+	int new_dd = starpu_disk_register(&starpu_disk_stdio_ops, (void *) "/tmp/", 1024*1024*200);
 	/* can't write on /tmp/ */
 	if (new_dd == -ENOENT) goto enoent;
 	

+ 6 - 1
tools/gdbinit

@@ -2,7 +2,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 # Copyright (C) 2010-2013  Université de Bordeaux 1
-# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+# Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -15,6 +15,11 @@
 #
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
+# To set a breakpoint when starting gdb with option "-ex run",
+# here is what you need to do:
+#set breakpoint pending on
+#break starpu_mpi.c:419
+
 define starpu-print-job
   set language c
   set $job = (struct _starpu_job *)$arg0
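
For the breakpoint hint in the comment above, the matching invocation would be (./my_app is an illustrative binary name):

	gdb -ex "set breakpoint pending on" -ex "break starpu_mpi.c:419" -ex run ./my_app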