
merge trunk@9188:9396

Nathalie Furmento, 12 years ago
commit 294e2975e6
100 changed files with 7231 additions and 3462 deletions
  1. +2 -2  .gitignore
  2. +1 -0  AUTHORS
  3. +8 -3  ChangeLog
  4. +38 -12  INSTALL
  5. +1 -1  Makefile.am
  6. +14 -14  configure.ac
  7. +18 -18  doc/Makefile.am
  8. +0 -1177  doc/chapters/advanced-api.texi
  9. +193 -3  doc/chapters/advanced-examples.texi
  10. +4001 -0  doc/chapters/api.texi
  11. +19 -19  doc/chapters/basic-examples.texi
  12. +0 -47  doc/chapters/benchmarks.texi
  13. +1 -1  doc/chapters/c-extensions.texi
  14. +301 -0  doc/chapters/hypervisor_api.texi
  15. +234 -59  doc/chapters/installing.texi
  16. +1 -1  doc/chapters/introduction.texi
  17. +14 -111  doc/chapters/mpi-support.texi
  18. +5 -36  doc/chapters/perf-feedback.texi
  19. +0 -34  doc/chapters/perf-optimization.texi
  20. +116 -0  doc/chapters/sched_ctx.texi
  21. +24 -290  doc/chapters/sched_ctx_hypervisor.texi
  22. +18 -0  doc/chapters/tips-tricks.texi
  23. +0 -134  doc/chapters/using.texi
  24. +3 -3  doc/chapters/vector_scal_cuda.texi
  25. +3 -3  doc/chapters/vector_scal_opencl.texi
  26. +2 -2  doc/chapters/vector_scal_opencl_codelet.texi
  27. +35 -44  doc/starpu.texi
  28. +11 -6  doc/tutorial/Makefile
  29. +4 -1  doc/tutorial/README
  30. +28 -24  doc/tutorial/hello_world.c
  31. +43 -0  doc/tutorial/hello_world_plugin.c
  32. +70 -68  doc/tutorial/vector_scal.c
  33. +24 -25  doc/tutorial/vector_scal_cpu.c
  34. +4 -4  doc/tutorial/vector_scal_cuda.cu
  35. +37 -37  doc/tutorial/vector_scal_opencl.c
  36. +7 -0  examples/Makefile.am
  37. +2 -0  examples/basic_examples/multiformat.c
  38. +2 -0  examples/basic_examples/multiformat_conversion_codelets.c
  39. +2 -0  examples/basic_examples/multiformat_cuda.cu
  40. +1 -2  examples/basic_examples/multiformat_types.h
  41. +3 -5  examples/basic_examples/vector_scal.c
  42. +3 -3  examples/basic_examples/vector_scal_cuda.cu
  43. +4 -4  examples/basic_examples/vector_scal_opencl.c
  44. +2 -2  examples/basic_examples/vector_scal_opencl_kernel.cl
  45. +6 -1  examples/mult/xgemm.c
  46. +41 -24  examples/pi/pi.c
  47. +1 -3  examples/pi/pi.h
  48. +3 -2  examples/pi/pi_kernel.cu
  49. +37 -11  examples/pi/pi_redux.c
  50. +1 -0  examples/scheduler/dummy_sched.c
  51. +3 -3  gcc-plugin/examples/vector_scal/vector_scal_cuda.cu
  52. +1 -1  gcc-plugin/examples/vector_scal/vector_scal_opencl_kernel.cl
  53. +0 -1  include/starpu.h
  54. +116 -23  include/starpu_sched_ctx.h
  55. +0 -56  include/starpu_scheduler.h
  56. +3 -2  include/starpu_task_util.h
  57. +22 -12  include/starpu_thread.h
  58. +9 -1  include/starpu_util.h
  59. +36 -0  include/starpu_worker.h
  60. +2 -2  m4/gcc.m4
  61. +1 -0  mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c
  62. +1 -2  mpi/include/starpu_mpi.h
  63. +0 -4  mpi/tests/Makefile.am
  64. +5 -2  sched_ctx_hypervisor/Makefile.am
  65. +7 -6  sched_ctx_hypervisor/examples/Makefile.am
  66. +67 -27  sched_ctx_hypervisor/examples/app_driven_test/app_driven_test.c
  67. +0 -0  sc_hypervisor/examples/cholesky/cholesky.h
  68. +0 -0  sc_hypervisor/examples/cholesky/cholesky_grain_tag.c
  69. +0 -0  sc_hypervisor/examples/cholesky/cholesky_implicit.c
  70. +0 -0  sc_hypervisor/examples/cholesky/cholesky_kernels.c
  71. +0 -0  sc_hypervisor/examples/cholesky/cholesky_models.c
  72. +0 -0  sc_hypervisor/examples/cholesky/cholesky_tag.c
  73. +0 -0  sc_hypervisor/examples/cholesky/cholesky_tile_tag.c
  74. +134 -0  sc_hypervisor/examples/lp_test/lp_test.c
  75. +27 -27  sched_ctx_hypervisor/examples/sched_ctx_utils/sched_ctx_utils.c
  76. +0 -0  sc_hypervisor/examples/sched_ctx_utils/sched_ctx_utils.h
  77. +129 -0  sc_hypervisor/include/sc_hypervisor.h
  78. +99 -0  sc_hypervisor/include/sc_hypervisor_config.h
  79. +30 -10  sched_ctx_hypervisor/src/hypervisor_policies/lp_tools.h
  80. +128 -0  sc_hypervisor/include/sc_hypervisor_monitoring.h
  81. +104 -0  sc_hypervisor/include/sc_hypervisor_policy.h
  82. +13 -13  sched_ctx_hypervisor/src/Makefile.am
  83. +3 -6  sched_ctx_hypervisor/src/hypervisor_policies/app_driven_policy.c
  84. +14 -12  sched_ctx_hypervisor/src/hypervisor_policies/debit_lp_policy.c
  85. +138 -0  sc_hypervisor/src/hypervisor_policies/feft_lp_policy.c
  86. +31 -31  sched_ctx_hypervisor/src/hypervisor_policies/gflops_rate_policy.c
  87. +8 -8  sched_ctx_hypervisor/src/hypervisor_policies/idle_policy.c
  88. +121 -139  sched_ctx_hypervisor/src/hypervisor_policies/ispeed_lp_policy.c
  89. +19 -19  sched_ctx_hypervisor/src/hypervisor_policies/ispeed_policy.c
  90. +155 -255  sched_ctx_hypervisor/src/hypervisor_policies/lp2_policy.c
  91. +88 -0  sc_hypervisor/src/policies_utils/dichotomy.c
  92. +64 -47  sched_ctx_hypervisor/src/hypervisor_policies/lp_tools.c
  93. +69 -65  sched_ctx_hypervisor/src/hypervisor_policies/policy_tools.c
  94. +99 -0  sc_hypervisor/src/policies_utils/task_pool.c
  95. +14 -14  sched_ctx_hypervisor/src/sched_ctx_config.c
  96. +76 -73  sched_ctx_hypervisor/src/sched_ctx_hypervisor.c
  97. +7 -7  sched_ctx_hypervisor/src/sched_ctx_hypervisor_intern.h
  98. +0 -238  sched_ctx_hypervisor/include/sched_ctx_hypervisor.h
  99. +0 -120  sched_ctx_hypervisor/src/hypervisor_policies/lp_policy.c
  100. +0 -0  sched_ctx_hypervisor/src/hypervisor_policies/policy_tools.h

+ 2 - 2
.gitignore

@@ -13,8 +13,6 @@
 /doc/starpu.info
 *~
 ,*
-Makefile
-Makefile.in
 .libs
 .deps
 *.o
@@ -291,3 +289,5 @@ starpu.log
 /gcc-plugin/tests/registered
 /gcc-plugin/tests/warn-unregistered
 /cyclomatic-complexity.html
+./Makefile
+Makefile.in

+ 1 - 0
AUTHORS

@@ -17,3 +17,4 @@ Ludovic Stordeur <ludovic.stordeur@inria.fr>
 François Tessier <francois.tessier@inria.fr>
 Samuel Thibault <samuel.thibault@labri.fr>
 Pierre-André Wacrenier <wacrenier@labri.fr>
+Andra Hugo <andra.hugo@inria.fr>

+ 8 - 3
ChangeLog

@@ -73,8 +73,6 @@ New features:
         - When exchanging user-defined data interfaces, the size of
 	  the data is the size returned by the pack operation, i.e
 	  data with dynamic size can now be exchanged with StarPU-MPI.
-        - New functionality starpu_mpi_irecv_probe_detached which
-  	  first tests if the message is available before calling MPI_Recv
   * Add experimental simgrid support, to simulate execution with various
     number of CPUs, GPUs, amount of memory, etc.
   * Add support for OpenCL simulators (which provide simulated execution time)
@@ -122,7 +120,6 @@ New features:
     and a Simgrid one. Applications using StarPU and wishing to use
     the Simgrid StarPU features should use it.
 
-
 Small features:
   * Add starpu_worker_get_by_type and starpu_worker_get_by_devid
   * Add starpu_fxt_stop_profiling/starpu_fxt_start_profiling which permits to
@@ -137,6 +134,10 @@ Small features:
   * New configure option --enable-mpi-progression-hook to enable the
     activity polling method for StarPU-MPI.
   * Permit to disable sequential consistency for a given task.
+  * New batch files to run StarPU applications with Microsoft Visual C
+  * Add examples/release/Makefile to test StarPU examples against an
+    installed version of StarPU. That can also be used to test
+    examples using a previous API.
 
 Changes:
   * Fix the block filter functions.
@@ -179,6 +180,10 @@ Changes:
   * Rename function starpu_helper_cublas_shutdown to starpu_cublas_shutdown
   * Rename function starpu_allocate_buffer_on_node to starpu_malloc_on_node
   * Rename function starpu_free_buffer_on_node to starpu_free_on_node
+  * Rename getter and setter functions for minimum and maximum task
+    priorities
+  * starpu_scheduler.h is no longer automatically included by
+    starpu.h, it has to be manually included when needed
 
 Small changes:
   * STARPU_NCPU should now be used instead of STARPU_NCPUS. STARPU_NCPUS is

+ 38 - 12
INSTALL

@@ -19,6 +19,7 @@ Contents
 
 * Installing StarPU on a Unix machine
 * Installing StarPU on Windows
+* Running StarPU Applications on Microsoft Visual C
 
 
 
@@ -39,25 +40,30 @@ cygwin part.
 
 1. Install cygwin
 
-http://cygwin.com/install.html
+   http://cygwin.com/install.html
 
-Make sure the following packages are available:
-- (Devel)/subversion
-- (Devel)/libtool
-- (Devel)/gcc
-- (Devel)/make
-- your favorite editor (vi, emacs, ...)
-- (Devel)/gdb
-- (Archive)/zip
-- (Devel)/pkg-config
+   Make sure the following packages are available:
+   - (Devel)/subversion
+   - (Devel)/libtool
+   - (Devel)/gcc
+   - (Devel)/make
+   - your favorite editor (vi, emacs, ...)
+   - (Devel)/gdb
+   - (Archive)/zip
+   - (Devel)/pkg-config
 
 2. Install mingw
 
-http://www.mingw.org/
+   http://www.mingw.org/
 
 3. Install hwloc (not mandatory, but strongly recommended)
 
-http://www.open-mpi.org/projects/hwloc
+   http://www.open-mpi.org/projects/hwloc
+
+   Be careful which version you are installing. Even if your machine
+   runs windows 64 bits, if you are running a 32 bits mingw (check the
+   output of the command uname -a), you will need to install the 32
+   bits version of hwloc.
 
 4. Install Microsoft Visual C++ Studio Express
 
@@ -210,3 +216,23 @@ autogen.sh part.
    export PATH=<StarPU installation directory>/bin:$PATH
 
 
+Running StarPU Applications on Microsoft Visual C
+-------------------------------------------------
+
+Batch files are provided to run StarPU applications under Microsoft
+Visual C. They are installed in path_to_starpu/bin/msvc.
+
+To execute a StarPU application, you first need to set the environment
+variable STARPUPATH.
+
+cd c:\cygwin\home\ci\starpu\
+set STARPUPATH=c:\cygwin\home\ci\starpu\
+cd bin\msvc
+starpu_exec.bat starpu_simple.c
+
+The batch script will run Microsoft Visual C with a basic project file
+to run the given application.
+
+The batch script starpu_clean.bat can be used to delete all
+compilation generated files.
+

+ 1 - 1
Makefile.am

@@ -45,7 +45,7 @@ SUBDIRS += starpufft
 endif
 
 if STARPU_BUILD_SCHED_CTX_HYPERVISOR
-SUBDIRS += sched_ctx_hypervisor
+SUBDIRS += sc_hypervisor
 endif
 
 pkgconfigdir = $(libdir)/pkgconfig

+ 14 - 14
configure.ac

@@ -266,23 +266,23 @@ AC_DEFINE_UNQUOTED(STARPU_NMAX_SCHED_CTXS, [$max_sched_ctxs], [Maximum number of
 AC_ARG_ENABLE([sched_ctx_hypervisor],
   [AS_HELP_STRING([--enable-sched-ctx-hypervisor],
     [enable resizing contexts (experimental)])],
-  [enable_sched_ctx_hypervisor="yes"],
-  [enable_sched_ctx_hypervisor="no"])
+  [enable_sc_hypervisor="yes"],
+  [enable_sc_hypervisor="no"])
 
 #for pkgconfig
 AC_SUBST(STARPU_SCHED_CTX_HYPERVISOR)
-if test "x$enable_sched_ctx_hypervisor" = "xyes"; then
-  AC_DEFINE(STARPU_USE_SCHED_CTX_HYPERVISOR, [1], [enable sched_ctx_hypervisor lib])
-#   PKG_CHECK_MODULES([SCHED_CTX_HYPERVISOR], [libsched_ctx_hypervisor], [], build_sched_ctx_hypervisor="yes")
-   STARPU_SCHED_CTX_HYPERVISOR="-lsched_ctx_hypervisor"
-   build_sched_ctx_hypervisor="yes"
+if test "x$enable_sc_hypervisor" = "xyes"; then
+  AC_DEFINE(STARPU_USE_SCHED_CTX_HYPERVISOR, [1], [enable sc_hypervisor lib])
+#   PKG_CHECK_MODULES([SCHED_CTX_HYPERVISOR], [libsc_hypervisor], [], build_sc_hypervisor="yes")
+   STARPU_SCHED_CTX_HYPERVISOR="-lsc_hypervisor"
+   build_sc_hypervisor="yes"
 else
-   build_sched_ctx_hypervisor="no"
+   build_sc_hypervisor="no"
 fi
 
 
-AM_CONDITIONAL([STARPU_BUILD_SCHED_CTX_HYPERVISOR], [test "x$build_sched_ctx_hypervisor" = "xyes"])
-AM_CONDITIONAL([STARPU_USE_SCHED_CTX_HYPERVISOR], [test "x$build_sched_ctx_hypervisor" = "xyes"])
+AM_CONDITIONAL([STARPU_BUILD_SCHED_CTX_HYPERVISOR], [test "x$build_sc_hypervisor" = "xyes"])
+AM_CONDITIONAL([STARPU_USE_SCHED_CTX_HYPERVISOR], [test "x$build_sc_hypervisor" = "xyes"])
 ###############################################################################
 #                                                                             #
 #                                 CPUs settings                               #
@@ -1906,9 +1906,9 @@ AC_OUTPUT([
 	gcc-plugin/tests/Makefile
 	gcc-plugin/tests/run-test
 	gcc-plugin/examples/Makefile
-	sched_ctx_hypervisor/Makefile
-	sched_ctx_hypervisor/src/Makefile
-	sched_ctx_hypervisor/examples/Makefile
+	sc_hypervisor/Makefile
+	sc_hypervisor/src/Makefile
+	sc_hypervisor/examples/Makefile
 ])
 
 AC_MSG_NOTICE([
@@ -1947,7 +1947,7 @@ AC_MSG_NOTICE([
 	       GCC plug-in test suite (requires GNU Guile): $run_gcc_plugin_test_suite
 	       SOCL enabled:                                $build_socl
                SOCL test suite:                             $run_socl_check
-               Scheduler Hypervisor:                        $build_sched_ctx_hypervisor
+               Scheduler Hypervisor:                        $build_sc_hypervisor
                simgrid enabled:                             $enable_simgrid
                ayudame enabled:                             $ac_cv_header_Ayudame_h
 ])

+ 18 - 18
doc/Makefile.am

@@ -12,30 +12,30 @@
 
 info_TEXINFOS = starpu.texi
 
-chapters =	chapters/advanced-api.texi \
-	chapters/benchmarks.texi \
-	chapters/configuration.texi \
-	chapters/perf-feedback.texi \
-	chapters/vector_scal_cpu.texi \
+chapters =	\
 	chapters/advanced-examples.texi \
+	chapters/api.texi \
+	chapters/basic-examples.texi \
+	chapters/c-extensions.texi \
+	chapters/configuration.texi \
 	chapters/fdl-1.3.texi \
-	chapters/perf-optimization.texi \
-	chapters/vector_scal_c.texi \
-	chapters/basic-api.texi \
+	chapters/fft-support.texi \
+	chapters/hypervisor_api.texi \
 	chapters/installing.texi \
-	chapters/scaling-vector-example.texi \
-	chapters/vector_scal_cuda.texi \
-	chapters/basic-examples.texi \
 	chapters/introduction.texi \
-	chapters/tips-tricks.texi \
-	chapters/vector_scal_opencl_codelet.texi \
-	chapters/c-extensions.texi \
 	chapters/mpi-support.texi \
-	chapters/fft-support.texi \
-	chapters/using.texi \
-	chapters/vector_scal_opencl.texi \
+	chapters/perf-feedback.texi \
+	chapters/perf-optimization.texi \
+	chapters/scaling-vector-example.texi \
+	chapters/sched_ctx_hypervisor.texi \
+	chapters/sched_ctx.texi \
 	chapters/socl.texi \
-	chapters/sched_ctx_hypervisor.texi
+	chapters/tips-tricks.texi \
+	chapters/vector_scal_cpu.texi \
+	chapters/vector_scal_c.texi \
+	chapters/vector_scal_cuda.texi \
+	chapters/vector_scal_opencl_codelet.texi \
+	chapters/vector_scal_opencl.texi
 
 starpu_TEXINFOS = 		\
 	chapters/version.texi 	\

File diff suppressed because it is too large
+ 0 - 1177
doc/chapters/advanced-api.texi


+ 193 - 3
doc/chapters/advanced-examples.texi

@@ -12,14 +12,17 @@
 * Task and Worker Profiling::
 * Partitioning Data::
 * Performance model example::
-* Theoretical lower bound on execution time::
+* Theoretical lower bound on execution time example::
 * Insert Task Utility::
 * Data reduction::
 * Temporary buffers::
 * Parallel Tasks::
 * Debugging::
 * The multiformat interface::
+* Using the Driver API::
+* Defining a New Scheduling Policy::
 * On-GPU rendering::
+* Defining a New Data Interface::
 * More examples::               More examples shipped with StarPU
 @end menu
 
@@ -418,7 +421,9 @@ default. The @code{size_base} field of @code{struct starpu_perfmodel} however
 permits the application to override that, when for instance some of the data
 do not matter for task cost (e.g. mere reference table), or when using sparse
 structures (in which case it is the number of non-zeros which matter), or when
-there is some hidden parameter such as the number of iterations, etc.
+there is some hidden parameter such as the number of iterations, etc. The
+@code{examples/pi} example uses this to include the number of iterations in the
+base.
 
 How to use schedulers which can benefit from such performance model is explained
 in @ref{Task scheduling policy}.
@@ -441,7 +446,7 @@ for indexing history-based performance models.
 needs to be called to destroy the dummy task afterwards. See
 @code{tests/perfmodels/regression_based.c} for an example.
 
-@node Theoretical lower bound on execution time
+@node Theoretical lower bound on execution time example
 @section Theoretical lower bound on execution time
 
 For kernels with history-based performance models (and provided that they are completely calibrated), StarPU can very easily provide a theoretical lower
@@ -769,6 +774,8 @@ the other on the same worker. Also, if for instance GPU memory becomes scarce,
 StarPU will notice that it can free such buffers easily, since the content does
 not matter.
 
+The @code{examples/pi} example uses a scratch buffer for its temporary data.
+
 @node Parallel Tasks
 @section Parallel Tasks
 
@@ -1022,6 +1029,58 @@ extern "C" void multiformat_scal_cuda_func(void *buffers[], void *_args)
 
 A full example may be found in @code{examples/basic_examples/multiformat.c}.
 
+@node Using the Driver API
+@section Using the Driver API
+
+@pxref{Running drivers}
+
+@cartouche
+@smallexample
+int ret;
+struct starpu_driver d = @{
+    .type = STARPU_CUDA_WORKER,
+    .id.cuda_id = 0
+@};
+ret = starpu_driver_init(&d);
+if (ret != 0)
+    error();
+while (some_condition) @{
+    ret = starpu_driver_run_once(&d);
+    if (ret != 0)
+        error();
+@}
+ret = starpu_driver_deinit(&d);
+if (ret != 0)
+    error();
+@end smallexample
+@end cartouche
+
+@node Defining a New Scheduling Policy
+@section Defining a New Scheduling Policy
+
+A full example showing how to define a new scheduling policy is available in
+the StarPU sources in the directory @code{examples/scheduler/}.
+
+@pxref{Scheduling Policy}
+
+@cartouche
+@smallexample
+static struct starpu_sched_policy dummy_sched_policy = @{
+    .init_sched = init_dummy_sched,
+    .deinit_sched = deinit_dummy_sched,
+    .add_workers = dummy_sched_add_workers,
+    .remove_workers = dummy_sched_remove_workers,
+    .push_task = push_task_dummy,
+    .push_prio_task = NULL,
+    .pop_task = pop_task_dummy,
+    .post_exec_hook = NULL,
+    .pop_every_task = NULL,
+    .policy_name = "dummy",
+    .policy_description = "dummy scheduling strategy"
+@};
+@end smallexample
+@end cartouche
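+
+The policy can then be selected when initializing StarPU; a minimal sketch,
+assuming the @code{dummy_*} functions above are defined as in
+@code{examples/scheduler/dummy_sched.c}:
+
+@cartouche
+@smallexample
+struct starpu_conf conf;
+starpu_conf_init(&conf);
+conf.sched_policy = &dummy_sched_policy;
+starpu_init(&conf);
+@end smallexample
+@end cartouche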
+
 @node On-GPU rendering
 @section On-GPU rendering
 
@@ -1075,6 +1134,136 @@ starpu_data_unregister(handle);
 
 and display it e.g. in the callback function.
 
+@node Defining a New Data Interface
+@section Defining a New Data Interface
+
+Let's define a new data interface to manage complex numbers.
+
+@cartouche
+@smallexample
+/* interface for complex numbers */
+struct starpu_complex_interface
+@{
+        double *real;
+        double *imaginary;
+        int nx;
+@};
+@end smallexample
+@end cartouche
+
+Registering such a data to StarPU is easily done using the function
+@code{starpu_data_register} (@pxref{Basic Data Management API}). The last
+parameter of the function, @code{interface_complex_ops}, will be
+described below.
+
+@cartouche
+@smallexample
+void starpu_complex_data_register(starpu_data_handle_t *handle,
+     unsigned home_node, double *real, double *imaginary, int nx)
+@{
+        struct starpu_complex_interface complex =
+        @{
+                .real = real,
+                .imaginary = imaginary,
+                .nx = nx
+        @};
+
+        if (interface_complex_ops.interfaceid == STARPU_UNKNOWN_INTERFACE_ID)
+        @{
+                interface_complex_ops.interfaceid = starpu_data_interface_get_next_id();
+        @}
+
+        starpu_data_register(handle, home_node, &complex, &interface_complex_ops);
+@}
+@end smallexample
+@end cartouche
+
+Different operations need to be defined for a data interface through
+the type @code{struct starpu_data_interface_ops} (@pxref{Defining
+Interface}). We only define here the basic operations needed to
+run simple applications. The source code for the different functions
+can be found in the file
+@code{examples/interface/complex_interface.c}.
+
+@cartouche
+@smallexample
+static struct starpu_data_interface_ops interface_complex_ops =
+@{
+        .register_data_handle = complex_register_data_handle,
+        .allocate_data_on_node = complex_allocate_data_on_node,
+        .copy_methods = &complex_copy_methods,
+        .get_size = complex_get_size,
+        .footprint = complex_footprint,
+        .interfaceid = STARPU_UNKNOWN_INTERFACE_ID,
+        .interface_size = sizeof(struct starpu_complex_interface),
+@};
+@end smallexample
+@end cartouche
+
+Functions need to be defined to access the different fields of the
+complex interface from a StarPU data handle.
+
+@cartouche
+@smallexample
+double *starpu_complex_get_real(starpu_data_handle_t handle)
+@{
+        struct starpu_complex_interface *complex_interface =
+          (struct starpu_complex_interface *) starpu_data_get_interface_on_node(handle, 0);
+        return complex_interface->real;
+@}
+
+double *starpu_complex_get_imaginary(starpu_data_handle_t handle);
+int starpu_complex_get_nx(starpu_data_handle_t handle);
+@end smallexample
+@end cartouche
+
+Similar functions need to be defined to access the different fields of the
+complex interface from a @code{void *} pointer to be used within codelet
+implementations.
+
+@cartouche
+@smallexample
+#define STARPU_COMPLEX_GET_REAL(interface)	\
+        (((struct starpu_complex_interface *)(interface))->real)
+#define STARPU_COMPLEX_GET_IMAGINARY(interface)	\
+        (((struct starpu_complex_interface *)(interface))->imaginary)
+#define STARPU_COMPLEX_GET_NX(interface)	\
+        (((struct starpu_complex_interface *)(interface))->nx)
+@end smallexample
+@end cartouche
+
+Complex data interfaces can then be registered to StarPU.
+
+@cartouche
+@smallexample
+double real = 45.0;
+double imaginary = 12.0;
+starpu_complex_data_register(&handle1, 0, &real, &imaginary, 1);
+starpu_insert_task(&cl_display, STARPU_R, handle1, 0);
+@end smallexample
+@end cartouche
+
+and used by codelets.
+
+@cartouche
+@smallexample
+void display_complex_codelet(void *descr[], __attribute__ ((unused)) void *_args)
+@{
+        int nx = STARPU_COMPLEX_GET_NX(descr[0]);
+        double *real = STARPU_COMPLEX_GET_REAL(descr[0]);
+        double *imaginary = STARPU_COMPLEX_GET_IMAGINARY(descr[0]);
+        int i;
+
+        for(i=0 ; i<nx ; i++)
+        @{
+                fprintf(stderr, "Complex[%d] = %3.2f + %3.2f i\n", i, real[i], imaginary[i]);
+        @}
+@}
+@end smallexample
+@end cartouche
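+
+The corresponding codelet could be declared along the following lines (a
+minimal sketch; the complete version is part of the code in
+@code{examples/interface/}):
+
+@cartouche
+@smallexample
+struct starpu_codelet cl_display =
+@{
+        .cpu_funcs = @{display_complex_codelet, NULL@},
+        .nbuffers = 1,
+        .modes = @{STARPU_R@}
+@};
+@end smallexample
+@end cartouche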
+
+The whole code for this complex data interface is available in the
+directory @code{examples/interface/}.
 @node More examples
 @section More examples
 
@@ -1109,3 +1298,4 @@ More advanced examples include:
 @item @code{cholesky/}:
     Cholesky matrix factorization, see for instance @code{cholesky_implicit.c}.
 @end table
+

File diff suppressed because it is too large
+ 4001 - 0
doc/chapters/api.texi


+ 19 - 19
doc/chapters/basic-examples.texi

@@ -68,9 +68,9 @@ The code can then be compiled and linked with GCC and the
 @code{-fplugin} flag:
 
 @example
-$ gcc `pkg-config starpu-1.0 --cflags` hello-starpu.c \
-    -fplugin=`pkg-config starpu-1.0 --variable=gccplugin` \
-    `pkg-config starpu-1.0 --libs`
+$ gcc `pkg-config starpu-1.1 --cflags` hello-starpu.c \
+    -fplugin=`pkg-config starpu-1.1 --variable=gccplugin` \
+    `pkg-config starpu-1.1 --libs`
 @end example
 
 The code can also be compiled without the StarPU C extension and will
@@ -273,7 +273,7 @@ disabled thanks to @code{starpu_data_set_default_sequential_consistency_flag} or
 
 @smallexample
 $ make hello_world
-cc $(pkg-config --cflags starpu-1.0)  $(pkg-config --libs starpu-1.0) hello_world.c -o hello_world
+cc $(pkg-config --cflags starpu-1.1)  $(pkg-config --libs starpu-1.1) hello_world.c -o hello_world
 $ ./hello_world
 Hello world (params = @{1, 2.000000@} )
 Callback function (arg 42)
@@ -392,9 +392,9 @@ The program can be compiled and linked with GCC and the @code{-fplugin}
 flag:
 
 @example
-$ gcc `pkg-config starpu-1.0 --cflags` vector_scal.c \
-    -fplugin=`pkg-config starpu-1.0 --variable=gccplugin` \
-    `pkg-config starpu-1.0 --libs`
+$ gcc `pkg-config starpu-1.1 --cflags` vector_scal.c \
+    -fplugin=`pkg-config starpu-1.1 --variable=gccplugin` \
+    `pkg-config starpu-1.1 --libs`
 @end example
 
 And voil@`a!
@@ -442,8 +442,8 @@ vector_scal_opencl (unsigned size, float vector[size], float factor)
   if (err != CL_SUCCESS)
     STARPU_OPENCL_REPORT_ERROR (err);
 
-  err = clSetKernelArg (kernel, 0, sizeof (val), &val);
-  err |= clSetKernelArg (kernel, 1, sizeof (size), &size);
+  err = clSetKernelArg (kernel, 0, sizeof (size), &size);
+  err |= clSetKernelArg (kernel, 1, sizeof (val), &val);
   err |= clSetKernelArg (kernel, 2, sizeof (factor), &factor);
   if (err)
     STARPU_OPENCL_REPORT_ERROR (err);
@@ -512,7 +512,7 @@ the CUDA Kernel}).
 #include <stdlib.h>
 
 static __global__ void
-vector_mult_cuda (float *val, unsigned n, float factor)
+vector_mult_cuda (unsigned n, float *val, float factor)
 @{
   unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
 
@@ -528,7 +528,7 @@ vector_scal_cuda (size_t size, float vector[], float factor)
   unsigned nblocks = (size + threads_per_block - 1) / threads_per_block;
 
   vector_mult_cuda <<< nblocks, threads_per_block, 0,
-    starpu_cuda_get_local_stream () >>> (vector, size, factor);
+    starpu_cuda_get_local_stream () >>> (size, vector, factor);
 
   cudaStreamSynchronize (starpu_cuda_get_local_stream ());
 @}
@@ -666,7 +666,7 @@ constant factor from this pointer.
 
 @smallexample
 $ make vector_scal
-cc $(pkg-config --cflags starpu-1.0)  $(pkg-config --libs starpu-1.0)  vector_scal.c   -o vector_scal
+cc $(pkg-config --cflags starpu-1.1)  $(pkg-config --libs starpu-1.1)  vector_scal.c   -o vector_scal
 $ ./vector_scal
 0.000000 3.000000 6.000000 9.000000 12.000000
 @end smallexample
@@ -697,7 +697,7 @@ call.
 @smallexample
 #include <starpu.h>
 
-static __global__ void vector_mult_cuda(float *val, unsigned n,
+static __global__ void vector_mult_cuda(unsigned n, float *val,
                                         float factor)
 @{
     unsigned i =  blockIdx.x*blockDim.x + threadIdx.x;
@@ -717,7 +717,7 @@ extern "C" void scal_cuda_func(void *buffers[], void *_args)
     unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
 @i{    vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>}
-@i{                    (val, n, *factor);}
+@i{                    (n, val, *factor);}
 
 @i{    cudaStreamSynchronize(starpu_cuda_get_local_stream());}
 @}
@@ -732,7 +732,7 @@ tools to compile a OpenCL kernel stored in a file.
 
 @cartouche
 @smallexample
-__kernel void vector_mult_opencl(__global float* val, int nx, float factor)
+__kernel void vector_mult_opencl(int nx, __global float* val, float factor)
 @{
         const int i = get_global_id(0);
         if (i < nx) @{
@@ -773,8 +773,8 @@ void scal_opencl_func(void *buffers[], void *_args)
 @i{                    "vector_mult_opencl", devid);   /* @b{Name of the codelet defined above} */}
 @i{    if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);}
 
-@i{    err = clSetKernelArg(kernel, 0, sizeof(val), &val);}
-@i{    err |= clSetKernelArg(kernel, 1, sizeof(n), &n);}
+@i{    err = clSetKernelArg(kernel, 0, sizeof(n), &n);}
+@i{    err |= clSetKernelArg(kernel, 1, sizeof(val), &val);}
 @i{    err |= clSetKernelArg(kernel, 2, sizeof(*factor), factor);}
 @i{    if (err) STARPU_OPENCL_REPORT_ERROR(err);}
 
@@ -915,8 +915,8 @@ be compiled at run-time when calling the function
 
 @cartouche
 @smallexample
-CFLAGS  += $(shell pkg-config --cflags starpu-1.0)
-LDFLAGS += $(shell pkg-config --libs starpu-1.0)
+CFLAGS  += $(shell pkg-config --cflags starpu-1.1)
+LDFLAGS += $(shell pkg-config --libs starpu-1.1)
 CC       = gcc
 
 vector_scal: vector_scal.o vector_scal_cpu.o vector_scal_cuda.o vector_scal_opencl.o

+ 0 - 47
doc/chapters/benchmarks.texi

@@ -1,47 +0,0 @@
-@c -*-texinfo-*-
-
-@c This file is part of the StarPU Handbook.
-@c Copyright (C) 2012  University of Bordeaux
-@c See the file starpu.texi for copying conditions.
-
-@menu
-* Task size overhead::           Overhead of tasks depending on their size
-* Data transfer latency::        Latency of data transfers
-* Gemm::                         Matrix-matrix multiplication
-* Cholesky::                     Cholesky factorization
-* LU::                           LU factorization
-@end menu
-
-Some interesting benchmarks are installed among examples in
-/usr/lib/starpu/examples . Make sure to try various schedulers, for instance
-STARPU_SCHED=dmda
-
-@node Task size overhead
-@section Task size overhead
-
-This benchmark gives a glimpse into how big a size should be for StarPU overhead
-to be low enough.  Run @code{tasks_size_overhead.sh}, it will generate a plot
-of the speedup of tasks of various sizes, depending on the number of CPUs being
-used.
-
-@node Data transfer latency
-@section Data transfer latency
-
-@code{local_pingpong} performs a ping-pong between the first two CUDA nodes, and
-prints the measured latency.
-
-@node Gemm
-@section Matrix-matrix multiplication
-
-@code{sgemm} and @code{dgemm} perform a blocked matrix-matrix
-multiplication using BLAS and cuBLAS. They output the obtained GFlops.
-
-@node Cholesky
-@section Cholesky factorization
-
-@code{cholesky*} perform a Cholesky factorization (single precision). They use different dependency primitives.
-
-@node LU
-@section LU factorization
-
-@code{lu*} perform an LU factorization. They use different dependency primitives.

+ 1 - 1
doc/chapters/c-extensions.texi

@@ -29,7 +29,7 @@ When StarPU has been installed with its GCC plug-in, programs that use
 these extensions can be compiled this way:
 
 @example
-$ gcc -c -fplugin=`pkg-config starpu-1.0 --variable=gccplugin` foo.c
+$ gcc -c -fplugin=`pkg-config starpu-1.1 --variable=gccplugin` foo.c
 @end example
 
 @noindent

+ 301 - 0
doc/chapters/hypervisor_api.texi

@@ -0,0 +1,301 @@
+@c -*-texinfo-*-
+
+@c This file is part of the StarPU Handbook.
+@c Copyright (C) 2011--2013 Institut National de Recherche en Informatique et Automatique
+@c See the file starpu.texi for copying conditions.
+
+
+@cindex Scheduling Context Hypervisor's API
+
+@menu
+* Managing the hypervisor::				Initialize the hypervisor
+* Registering Scheduling Contexts to the hypervisor:: 	Contexts have to register to the hypervisor
+* The user's input in the resizing process:: 		The user can help the hypervisor decide how to resize
+* Performance Counters::              			StarPU provides information to the Hypervisor through performance counters
+* Defining a new hypervisor policy::      		New Policies can be implemented
+@end menu
+
+@node Managing the hypervisor
+@section Managing the hypervisor
+There is a single hypervisor that is in charge of resizing contexts and the resizing strategy is chosen at the initialization of the hypervisor. A single resize can be done at a time.
+
+@deftypefun {struct starpu_sched_ctx_performance_counters *} sched_ctx_hypervisor_init ({struct sched_ctx_hypervisor_policy *} @var{policy})
+Initializes the hypervisor to use the strategy provided as parameter and creates the performance counters (@pxref{Performance Counters}).
+These performance counters are actually callbacks that the contexts will use to notify the hypervisor of the information it needs.
+@end deftypefun
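+
+For instance, with a policy structure like the @code{dummy_policy} shown in
+@ref{Hypervisor example}, a minimal initialization sketch is:
+
+@cartouche
+@smallexample
+struct starpu_sched_ctx_performance_counters *perf_counters =
+        sched_ctx_hypervisor_init(&dummy_policy);
+@end smallexample
+@end cartouche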
+
+Note: The Hypervisor is actually a worker that takes this role once certain conditions trigger the resizing process (there is no additional thread assigned to the hypervisor).
+
+@deftypefun void sched_ctx_hypervisor_shutdown (void)
+Cleans up the hypervisor and all information concerning it. There is no synchronization between this function and starpu_shutdown(). It should thus be called after starpu_shutdown(),
+because until then the performance counters still need their callback functions to be allocated.
+@end deftypefun
+
+@node Registering Scheduling Contexts to the hypervisor
+@section Registering Scheduling Contexts to the hypervisor
+Scheduling Contexts that have to be resized by the hypervisor must first be registered with it. Whenever we want to exclude contexts from the resizing process, we have to unregister them from the hypervisor.
+
+@deftypefun void sched_ctx_hypervisor_register_ctx (unsigned @var{sched_ctx}, double @var{total_flops})
+Registers the context with the hypervisor and indicates the number of flops the context will execute (needed for the Gflops rate based strategy, @pxref{Resizing strategies}, or any other custom strategy needing it; for the others, 0.0 can be passed).
+@end deftypefun
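+
+For instance, a context expected to execute about @code{1e9} flops can be
+registered as follows (the figure is arbitrary):
+
+@cartouche
+@smallexample
+sched_ctx_hypervisor_register_ctx(sched_ctx, 1e9);
+@end smallexample
+@end cartouche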
+
+@deftypefun void sched_ctx_hypervisor_unregister_ctx (unsigned @var{sched_ctx})
+Unregisters the context from the hypervisor.
+@end deftypefun
+
+@node The user's input in the resizing process
+@section The user's input in the resizing process
+The user can forbid the resizing of a given context, and can later change his mind and allow it again (in which case the resizing is managed by the hypervisor, which may allow or forbid it).
+
+@deftypefun void sched_ctx_hypervisor_stop_resize (unsigned @var{sched_ctx})
+Forbid resizing of a context
+@end deftypefun
+
+@deftypefun void sched_ctx_hypervisor_start_resize (unsigned @var{sched_ctx})
+Allow resizing of a context
+@end deftypefun
+
+The user can then provide information to the hypervisor concerning the conditions of resizing.
+
+@deftypefun void sched_ctx_hypervisor_ioctl (unsigned @var{sched_ctx}, ...)
+Inputs conditions to the context @code{sched_ctx} with the following arguments.  The argument list must be zero-terminated.
+
+@defmac HYPERVISOR_MAX_IDLE
+This macro is used when calling sched_ctx_hypervisor_ioctl and must be followed by 3 arguments:
+an array of int containing the ids of the workers the condition applies to, an int indicating the size of the array, and a double value indicating
+the maximum idle time allowed for a worker before the resizing process is triggered.
+@end defmac
+
+@defmac HYPERVISOR_PRIORITY
+This macro is used when calling sched_ctx_hypervisor_ioctl and must be followed by 3 arguments:
+an array of int containing the ids of the workers the condition applies to, an int indicating the size of the array, and an int value indicating
+the priority of those workers.
+The workers with the smallest priority are moved first.
+@end defmac
+
+@defmac HYPERVISOR_MIN_WORKERS
+This macro is used when calling sched_ctx_hypervisor_ioctl and must be followed by 1 argument (int) indicating
+the minimum number of workers a context should have; below this limit the context cannot execute.
+@end defmac
+
+@defmac HYPERVISOR_MAX_WORKERS
+This macro is used when calling sched_ctx_hypervisor_ioctl and must be followed by 1 argument (int) indicating
+the maximum number of workers a context should have; above this limit the context cannot scale further.
+@end defmac
+
+@defmac HYPERVISOR_GRANULARITY
+This macro is used when calling sched_ctx_hypervisor_ioctl and must be followed by 1 argument (int) indicating
+the granularity of the resizing process (the number of workers to be moved from the context when it is resized).
+This parameter is ignored for the Gflops rate based strategy (@pxref{Resizing strategies}), for which the number of workers to move is computed by the strategy itself.
+@end defmac
+
+@defmac HYPERVISOR_FIXED_WORKERS
+This macro is used when calling sched_ctx_hypervisor_ioctl and must be followed by 2 arguments:
+an array of int containing the ids of the workers the condition applies to and an int indicating the size of the array.
+These workers are not allowed to be moved from the context.
+@end defmac
+
+@defmac HYPERVISOR_MIN_TASKS
+This macro is used when calling sched_ctx_hypervisor_ioctl and must be followed by 1 argument (int)
+indicating the minimum number of tasks that have to be executed before the context can be resized.
+This parameter is ignored for the Application Driven strategy (@pxref{Resizing strategies}), where the user indicates exactly when the resize should be done.
+@end defmac
+
+@defmac HYPERVISOR_NEW_WORKERS_MAX_IDLE
+This macro is used when calling sched_ctx_hypervisor_ioctl and must be followed by 1 argument, a double value indicating
+the maximum idle time allowed, in the current context, for workers that have just been moved from other contexts.
+@end defmac
+
+@defmac HYPERVISOR_TIME_TO_APPLY
+This macro is used when calling sched_ctx_hypervisor_ioctl and must be followed by 1 argument (int) indicating the tag
+that an executed task must have for this configuration to be taken into account.
+@end defmac
+@end deftypefun
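+
+For instance, a possible call could look as follows (the worker ids and
+threshold values below are arbitrary):
+
+@cartouche
+@smallexample
+int workers[2] = @{1, 3@};
+sched_ctx_hypervisor_ioctl(sched_ctx,
+        HYPERVISOR_MAX_IDLE, workers, 2, 10000.0,
+        HYPERVISOR_GRANULARITY, 2,
+        NULL);
+@end smallexample
+@end cartouche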
+
+@node Performance Counters
+@section Performance Counters
+
+The Scheduling Context Hypervisor plugin provides a series of performance counters to StarPU. By incrementing them, StarPU can help the hypervisor in the resizing decision-making process.
+
+@deftp {Data Type} {struct starpu_sched_ctx_performance_counters}
+@anchor{struct starpu_sched_ctx_performance_counters}
+
+@table @asis
+@item @code{void (*notify_idle_cycle)(unsigned sched_ctx_id, int worker, double idle_time)}
+Informs the hypervisor of how long a worker has been idle in the specified context
+@item @code{void (*notify_idle_end)(unsigned sched_ctx_id, int worker)}
+Informs the hypervisor that after a period of idle, the worker has just executed a task in the specified context.
+The idle counter is thus reset.
+@item @code{void (*notify_pushed_task)(unsigned sched_ctx_id, int worker)}
+Notifies the hypervisor a task has been scheduled on the queue of the worker corresponding to the specified context
+@item @code{void (*notify_poped_task)(unsigned sched_ctx_id, int worker, double flops)}
+Informs the hypervisor that a task executing a specified number of instructions has been popped from the worker
+@item @code{void (*notify_post_exec_hook)(unsigned sched_ctx_id, int taskid)}
+Notifies the hypervisor a task has just been executed
+
+@end table
+@end deftp
+
+TODO maybe they should be hidden to the user
+
+@node Defining a new hypervisor policy
+@section Defining a new hypervisor policy
+
+@menu
+* Hypervisor Policy API:: Hypervisor Policy API
+* Hypervisor example::
+@end menu
+
+@node Hypervisor Policy API
+@subsection Hypervisor Policy API
+
+While the Scheduling Context Hypervisor plugin comes with a variety of resizing policies (@pxref{Resizing strategies}),
+it may sometimes be desirable to implement custom
+policies to address specific problems.  The API described below allows
+users to write their own resizing policy.
+
+@deftp {Data Type} {struct sched_ctx_hypervisor_policy}
+This structure contains all the methods that implement a hypervisor resizing policy.
+
+@table @asis
+@item @code{const char* name}
+Indicates the name of the policy. If the policy is not a custom one, the hypervisor uses the predefined policy corresponding to this name.
+@item @code{unsigned custom}
+Indicates whether the policy is custom or not
+@item @code{void (*handle_idle_cycle)(unsigned sched_ctx_id, int worker)}
+It is called whenever the indicated worker executes another idle cycle in @code{sched_ctx}
+@item @code{void (*handle_pushed_task)(unsigned sched_ctx_id, int worker)}
+It is called whenever a task is pushed on the worker's queue corresponding to the context @code{sched_ctx}
+@item @code{void (*handle_poped_task)(unsigned sched_ctx_id, int worker)}
+It is called whenever a task is popped from the worker's queue corresponding to the context @code{sched_ctx}
+@item @code{void (*handle_idle_end)(unsigned sched_ctx_id, int worker)}
+It is called whenever a task is executed on the indicated worker and context after a long period of idle time
+@item @code{void (*handle_post_exec_hook)(unsigned sched_ctx_id, struct starpu_htbl32_node* resize_requests, int task_tag)}
+It is called whenever a tag task has just been executed. The table of resize requests is provided as well as the tag
+@end table
+@end deftp
+
+The hypervisor also provides a structure with the configuration information of each context, which can be used to construct new resizing strategies.
+
+@deftp {Data Type} {struct sched_ctx_hypervisor_policy_config }
+This structure contains all configuration information of a context
+
+@table @asis
+@item @code{int min_nworkers}
+Indicates the minimum number of workers needed by the context
+@item @code{int max_nworkers}
+Indicates the maximum number of workers needed by the context
+@item @code{int granularity}
+Indicates the resizing granularity (in number of workers) of the context
+@item @code{int priority[STARPU_NMAXWORKERS]}
+Indicates the priority of each worker in the context
+@item @code{double max_idle[STARPU_NMAXWORKERS]}
+Indicates the maximum idle time accepted before a resize is triggered
+@item @code{int fixed_workers[STARPU_NMAXWORKERS]}
+Indicates which workers can be moved and which ones are fixed
+@item @code{double new_workers_max_idle}
+Indicates the maximum idle time accepted before a resize is triggered for the workers that just arrived in the new context
+@end table
+@end deftp
+
+Additionally, the hypervisor provides a structure with information obtained from StarPU by means of the performance counters
+
+
+@deftp {Data Type} {struct sched_ctx_hypervisor_wrapper}
+This structure is a wrapper of the contexts available in StarPU
+and contains all information about a context obtained by incrementing the performance counters
+
+@table @asis
+@item @code{unsigned sched_ctx}
+The context wrapped
+@item @code{struct sched_ctx_hypervisor_policy_config *config}
+The corresponding resize configuration
+@item @code{double current_idle_time[STARPU_NMAXWORKERS]}
+The idle time counter of each worker of the context
+@item @code{int pushed_tasks[STARPU_NMAXWORKERS]}
+The number of pushed tasks of each worker of the context
+@item @code{int poped_tasks[STARPU_NMAXWORKERS]}
+The number of tasks popped from each worker of the context
+@item @code{double total_flops}
+The total number of flops to execute by the context
+@item @code{double total_elapsed_flops[STARPU_NMAXWORKERS]}
+The number of flops executed by each worker of the context
+@item @code{double elapsed_flops[STARPU_NMAXWORKERS]}
+The number of flops executed by each worker of the context from last resize
+@item @code{double remaining_flops}
+The number of flops that still have to be executed by the workers in the context
+@item @code{double start_time}
+The time when the context started executing
+@item @code{struct sched_ctx_hypervisor_resize_ack resize_ack}
+The structure confirming that the last resize finished and that a new one can be done
+@end table
+@end deftp
+
+@deftp {Data Type} {struct sched_ctx_hypervisor_resize_ack}
+This structure checks whether the workers moved to another context are actually taken into account in that context
+@table @asis
+@item @code{int receiver_sched_ctx}
+The context receiving the new workers
+@item @code{int *moved_workers}
+The workers moved to the receiver context
+@item @code{int nmoved_workers}
+The number of workers moved
+@item @code{int *acked_workers}
+If the value corresponding to a worker is 1, the worker has been taken into account in the new context; if it is 0, not yet
+@end table
+@end deftp
+
+The following functions can be used in the resizing strategies.
+
+@deftypefun void sched_ctx_hypervisor_move_workers (unsigned @var{sender_sched_ctx}, unsigned @var{receiver_sched_ctx}, {int *}@var{workers_to_move}, unsigned @var{nworkers_to_move}, unsigned @var{now});
+Moves workers from one context to another
+@end deftypefun
+
+@deftypefun {struct sched_ctx_hypervisor_policy_config *} sched_ctx_hypervisor_get_config (unsigned @var{sched_ctx});
+Returns the configuration structure of a context
+@end deftypefun
+
+@deftypefun {int *} sched_ctx_hypervisor_get_sched_ctxs ();
+Gets the contexts managed by the hypervisor
+@end deftypefun
+
+@deftypefun int sched_ctx_hypervisor_get_nsched_ctxs ();
+Gets the number of contexts managed by the hypervisor
+@end deftypefun
+
+@deftypefun {struct sched_ctx_hypervisor_wrapper *} sched_ctx_hypervisor_get_wrapper (unsigned @var{sched_ctx});
+Returns the wrapper corresponding to the context @code{sched_ctx}
+@end deftypefun
+
+@deftypefun double sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx ({struct sched_ctx_hypervisor_wrapper *} @var{sc_w});
+Returns the number of flops executed by a context since the last resize
+@end deftypefun
+
+@deftypefun {char *} sched_ctx_hypervisor_get_policy ();
+Returns the name of the resizing policy the hypervisor uses
+@end deftypefun
+
+@node Hypervisor example
+@subsection Hypervisor example
+
+@cartouche
+@smallexample
+
+struct sched_ctx_hypervisor_policy dummy_policy =
+@{
+       .handle_poped_task = dummy_handle_poped_task,
+       .handle_pushed_task = dummy_handle_pushed_task,
+       .handle_idle_cycle = dummy_handle_idle_cycle,
+       .handle_idle_end = dummy_handle_idle_end,
+       .handle_post_exec_hook = dummy_handle_post_exec_hook,
+       .custom = 1,
+       .name = "dummy"
+@};
+
+@end smallexample
+@end cartouche
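+
+A custom handler can use the monitoring structures above; for instance, a
+minimal sketch of @code{dummy_handle_idle_cycle} (where @code{receiver_ctx}
+is a hypothetical context chosen by the policy):
+
+@cartouche
+@smallexample
+static void dummy_handle_idle_cycle(unsigned sched_ctx_id, int worker)
+@{
+        struct sched_ctx_hypervisor_wrapper *sc_w =
+                sched_ctx_hypervisor_get_wrapper(sched_ctx_id);
+        struct sched_ctx_hypervisor_policy_config *config =
+                sched_ctx_hypervisor_get_config(sched_ctx_id);
+
+        /* move the worker if it has been idle for too long */
+        if (sc_w->current_idle_time[worker] > config->max_idle[worker])
+        @{
+                int workers_to_move[1] = @{ worker @};
+                sched_ctx_hypervisor_move_workers(sched_ctx_id, receiver_ctx,
+                                                  workers_to_move, 1, 0);
+        @}
+@}
+@end smallexample
+@end cartouche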
+
+@c Local Variables:
+@c TeX-master: "../starpu.texi"
+@c ispell-local-dictionary: "american"
+@c End:

+ 234 - 59
doc/chapters/installing.texi

@@ -1,4 +1,4 @@
-@c -*-texinfo-*-
+@c -*-texinfo-*-
 
 @c This file is part of the StarPU Handbook.
 @c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
@@ -7,23 +7,60 @@
 @c See the file starpu.texi for copying conditions.
 
 @menu
-* Downloading StarPU::
-* Configuration of StarPU::
-* Building and Installing StarPU::
+* Installing a Binary Package::
+* Installing from Source::
+* Setting up Your Own Code::
+* Benchmarking StarPU::
 @end menu
 
+@node Installing a Binary Package
+@section Installing a Binary Package
+
+One of the StarPU developers being a Debian Developer, the packages
+are well integrated and very uptodate. To see which packages are
+available, simply type:
+
+@example
+$ apt-cache search starpu
+@end example
+
+To install what you need, type:
+
+@example
+$ sudo apt-get install libstarpu-1.0 libstarpu-dev
+@end example
+
+@node Installing from Source
+@section Installing from Source
+
 StarPU can be built and installed by the standard means of the GNU
 autotools. The following chapter is intended to briefly remind how these tools
 can be used to install StarPU.
 
-@node Downloading StarPU
-@section Downloading StarPU
-
 @menu
+* Optional Dependencies::
 * Getting Sources::
-* Optional dependencies::
+* Configuring StarPU::
+* Building StarPU::
+* Installing StarPU::
 @end menu
 
+@node Optional Dependencies
+@subsection Optional Dependencies
+
+The @url{http://www.open-mpi.org/software/hwloc, @code{hwloc} topology
+discovery library} is not mandatory to use StarPU but strongly
+recommended.  It allows for topology aware scheduling, which improves
+performance.  @code{hwloc} is available in major free operating system
+distributions, and for most operating systems.
+
+If @code{hwloc} is not available on your system, the option
+@code{--without-hwloc} should be explicitely given when calling the
+@code{configure} script. If @code{hwloc} is installed with a @code{pkg-config} file,
+no option is required, it will be detected automatically, otherwise
+@code{--with-hwloc=prefix} should be used to specify the location
+of @code{hwloc}.
+
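+For instance, assuming @code{hwloc} was installed in
+@code{$HOME/soft/hwloc} without a @code{pkg-config} file:
+
+@example
+$ ./configure --with-hwloc=$HOME/soft/hwloc
+@end example
+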
 @node Getting Sources
 @subsection Getting Sources
 
@@ -56,52 +93,27 @@ are running on Windows, you will probably prefer to use
 svn checkout svn://scm.gforge.inria.fr/svn/starpu/trunk StarPU
 @end example
 
-@node Optional dependencies
-@subsection Optional dependencies
+@node Configuring StarPU
+@subsection Configuring StarPU
 
-The @url{http://www.open-mpi.org/software/hwloc, @code{hwloc} topology
-discovery library} is not mandatory to use StarPU but strongly
-recommended.  It allows for topology aware scheduling, which improves
-performance.  @code{hwloc} is available in major free operating system
-distributions, and for most operating systems.
-
-If @code{hwloc} is not available on your system, the option
-@code{--without-hwloc} should be explicitely given when calling the
-@code{configure} script. If @code{hwloc} is installed with a @code{pkg-config} file,
-no option is required, it will be detected automatically, otherwise
-@code{with-hwloc=prefix} should be used to specify the location
-of @code{hwloc}.
-
-@node Configuration of StarPU
-@section Configuration of StarPU
-
-@menu
-* Generating Makefiles and configuration scripts::
-* Running the configuration::
-@end menu
-
-@node Generating Makefiles and configuration scripts
-@subsection Generating Makefiles and configuration scripts
-
-This step is not necessary when using the tarball releases of StarPU.  If you
-are using the source code from the svn repository, you first need to generate
-the configure scripts and the Makefiles. This requires the
-availability of @code{autoconf}, @code{automake} >= 2.60, and @code{makeinfo}.
+Running @code{autogen.sh} is not necessary when using the tarball
+releases of StarPU.  If you are using the source code from the svn
+repository, you first need to generate the configure scripts and the
+Makefiles. This requires the availability of @code{autoconf},
+@code{automake} >= 2.60, and @code{makeinfo}.
 
 @example
 $ ./autogen.sh
 @end example
 
-@node Running the configuration
-@subsection Running the configuration
+You then need to configure StarPU. Details about options that are
+useful to give to @code{./configure} are given in @ref{Compilation
+configuration}.
 
 @example
 $ ./configure
 @end example
 
-Details about options that are useful to give to @code{./configure} are given in
-@ref{Compilation configuration}.
-
 By default, the files produced during the compilation are placed in
 the source directory. As the compilation generates a lot of files, it
 is advised to put them all in a separate directory. It is then
@@ -116,25 +128,13 @@ $ cd build
 $ ../configure
 @end example
 
-@node Building and Installing StarPU
-@section Building and Installing StarPU
-
-@menu
-* Building::
-* Sanity Checks::
-* Installing::
-@end menu
-
-@node Building
-@subsection Building
+@node Building StarPU
+@subsection Building StarPU
 
 @example
 $ make
 @end example
 
-@node Sanity Checks
-@subsection Sanity Checks
-
 Once everything is built, you may want to test the result. An
 extensive set of regression tests is provided with StarPU. Running the
 tests is done by calling @code{make check}. These tests are run every night
@@ -145,8 +145,8 @@ and the result from the main profile is publicly
 $ make check
 @end example
 
-@node Installing
-@subsection Installing
+@node Installing StarPU
+@subsection Installing StarPU
 
 In order to install StarPU at the location that was specified during
 configuration:
@@ -158,3 +158,178 @@ $ make install
 Libtool interface versioning information are included in
 libraries names (libstarpu-1.0.so, libstarpumpi-1.0.so and
 libstarpufft-1.0.so).
+
+@node Setting up Your Own Code
+@section Setting up Your Own Code
+
+@menu
+* Setting Flags for Compiling::
+* Running a Basic StarPU Application::
+* Kernel Threads Started by StarPU::
+* Enabling OpenCL::
+@end menu
+
+@node Setting Flags for Compiling
+@subsection Setting Flags for Compiling, Linking and Running Applications
+
+StarPU provides a pkg-config executable to obtain relevant compiler
+and linker flags.
+Compiling and linking an application against StarPU may require using
+specific flags or libraries (for instance @code{CUDA} or @code{libspe2}).
+To this end, it is possible to use the @code{pkg-config} tool.
+
+If StarPU was not installed at some standard location, the path of StarPU's
+library must be specified in the @code{PKG_CONFIG_PATH} environment variable so
+that @code{pkg-config} can find it. For example if StarPU was installed in
+@code{$prefix_dir}:
+
+@example
+$ PKG_CONFIG_PATH=$PKG_CONFIG_PATH:$prefix_dir/lib/pkgconfig
+@end example
+
+The flags required to compile or link against StarPU are then
+accessible with the following commands@footnote{It is still possible to use the API
+provided in the version 0.9 of StarPU by calling @code{pkg-config}
+with the @code{libstarpu} package. Similar packages are provided for
+@code{libstarpumpi} and @code{libstarpufft}.}:
+
+@example
+$ pkg-config --cflags starpu-1.1  # options for the compiler
+$ pkg-config --libs starpu-1.1    # options for the linker
+@end example
+
+Make sure that @code{pkg-config --libs starpu-1.1} actually produces some output
+before going further: @code{PKG_CONFIG_PATH} has to point to the place where
+@code{starpu-1.1.pc} was installed during @code{make install}.
+
+Also pass the @code{--static} option if the application is to be
+linked statically.
+
+It is also necessary to set the variable @code{LD_LIBRARY_PATH} to
+locate dynamic libraries at runtime.
+
+@example
+$ LD_LIBRARY_PATH=$prefix_dir/lib:$LD_LIBRARY_PATH
+@end example
+
+When using a Makefile, the following lines can be added to set the
+options for the compiler and the linker:
+
+@cartouche
+@example
+CFLAGS          +=      $$(pkg-config --cflags starpu-1.1)
+LDFLAGS         +=      $$(pkg-config --libs starpu-1.1)
+@end example
+@end cartouche
+
+@node Running a Basic StarPU Application
+@subsection Running a Basic StarPU Application
+
+Basic examples using StarPU are built in the directory
+@code{examples/basic_examples/} (and installed in
+@code{$prefix_dir/lib/starpu/examples/}). You can for example run the example
+@code{vector_scal}.
+
+@example
+$ ./examples/basic_examples/vector_scal
+BEFORE: First element was 1.000000
+AFTER: First element is 3.140000
+@end example
+
+When StarPU is used for the first time, the directory
+@code{$STARPU_HOME/.starpu/} is created; performance models will be stored in
+that directory (@pxref{STARPU_HOME}).
+
+Please note that buses are benchmarked when StarPU is launched for the
+first time. This may take a few minutes, or less if @code{hwloc} is
+installed. This step is done only once per user and per machine.
+
+@node Kernel Threads Started by StarPU
+@subsection Kernel Threads Started by StarPU
+
+StarPU automatically binds one thread per CPU core. It does not use
+SMT/hyperthreading because kernels are usually already optimized for using a
+full core, and using hyperthreading would make kernel calibration rather random.
+
+Since driving GPUs is a CPU-consuming task, StarPU dedicates one core per GPU.
+
+While StarPU tasks are executing, the application is not supposed to do
+computations in the threads it starts itself; tasks should be used instead.
+
+TODO: add a StarPU function to bind an application thread (e.g. the main thread)
+to a dedicated core (and thus disable the corresponding StarPU CPU worker).
+
+@node Enabling OpenCL
+@subsection Enabling OpenCL
+
+When both CUDA and OpenCL drivers are enabled, StarPU will launch an
+OpenCL worker for NVIDIA GPUs only if CUDA is not already running on them.
+This design choice was necessary as OpenCL and CUDA can not run at the
+same time on the same NVIDIA GPU, as there is currently no interoperability
+between them.
+
+To enable OpenCL, you need either to disable CUDA when configuring StarPU:
+
+@example
+$ ./configure --disable-cuda
+@end example
+
+or when running applications:
+
+@example
+$ STARPU_NCUDA=0 ./application
+@end example
+
+OpenCL will automatically be started on any device not yet used by
+CUDA. On a machine with 4 GPUs, it is therefore possible to
+enable CUDA on 2 devices, and OpenCL on the 2 other devices, by
+running:
+
+@example
+$ STARPU_NCUDA=2 ./application
+@end example
+
+@node Benchmarking StarPU
+@section Benchmarking StarPU
+
+Some interesting benchmarks are installed among examples in
+@code{$prefix_dir/lib/starpu/examples/}. Make sure to try various
+schedulers, for instance @code{STARPU_SCHED=dmda}.
+
+@menu
+* Task size overhead::
+* Data transfer latency::
+* Gemm::
+* Cholesky::
+* LU::
+@end menu
+
+@node Task size overhead
+@subsection Task size overhead
+
+This benchmark gives a glimpse into how big a size should be for StarPU overhead
+to be low enough.  Run @code{tasks_size_overhead.sh}, it will generate a plot
+of the speedup of tasks of various sizes, depending on the number of CPUs being
+used.
+
+@node Data transfer latency
+@subsection Data transfer latency
+
+@code{local_pingpong} performs a ping-pong between the first two CUDA nodes, and
+prints the measured latency.
+
+@node Gemm
+@subsection Matrix-matrix multiplication
+
+@code{sgemm} and @code{dgemm} perform a blocked matrix-matrix
+multiplication using BLAS and cuBLAS. They output the obtained GFlops.
+
+@node Cholesky
+@subsection Cholesky factorization
+
+@code{cholesky*} perform a Cholesky factorization (single precision). They use different dependency primitives.
+
+@node LU
+@subsection LU factorization
+
+@code{lu*} perform an LU factorization. They use different dependency primitives.

+ 1 - 1
doc/chapters/introduction.texi

@@ -68,7 +68,7 @@ StarPU takes particular care of scheduling tasks efficiently, using
 well-known algorithms from the literature (@pxref{Task scheduling
 policy}).  In addition, it allows scheduling experts, such as compiler
 or computational library developers, to implement custom scheduling
-policies in a portable fashion (@pxref{Scheduling Policy API}).
+policies in a portable fashion (@pxref{Defining a New Scheduling Policy}).
 
 The remainder of this section describes the main concepts used in StarPU.
 

+ 14 - 111
doc/chapters/mpi-support.texi

@@ -199,11 +199,11 @@ communications defined in StarPU-MPI.
 @node Exchanging User Defined Data Interface
 @section Exchanging User Defined Data Interface
 
-New data interfaces defined as explained in @ref{An example
-of data interface} can also be used within StarPU-MPI and exchanged
-between nodes. Two functions needs to be defined through
-the type @code{struct starpu_data_interface_ops} (@pxref{Data
-Interface API}). The pack function takes a handle and returns a
+New data interfaces defined as explained in @ref{Defining a New Data
+Interface} can also be used within StarPU-MPI and exchanged between
+nodes. Two functions need to be defined through
+the type @code{struct starpu_data_interface_ops} (@pxref{Defining
+Interface}). The pack function takes a handle and returns a
 contiguous memory buffer along with its size where data to be conveyed to another node
 should be copied. The reversed operation is implemented in the unpack
 function which takes a contiguous memory buffer and recreates the data
@@ -258,7 +258,6 @@ static struct starpu_data_interface_ops interface_complex_ops =
 @end smallexample
 @end cartouche
 
-@page
 @node MPI Insert Task Utility
 @section MPI Insert Task Utility
 
@@ -271,94 +270,14 @@ exchange the content of the handle. All MPI nodes then process the whole task
 graph, and StarPU automatically determines which node actually execute which
 task, and trigger the required MPI transfers.
 
-@deftypefun int starpu_data_set_tag (starpu_data_handle_t @var{handle}, int @var{tag})
-Tell StarPU-MPI which MPI tag to use when exchanging the data.
-@end deftypefun
-
-@deftypefun int starpu_data_get_tag (starpu_data_handle_t @var{handle})
-Returns the MPI tag to be used when exchanging the data.
-@end deftypefun
-
-@deftypefun starpu_data_handle_t starpu_data_get_data_handle_from_tag (int @var{tag})
-Returns the data handle associated to the MPI tag, or NULL if there is not.
-@end deftypefun
-
-@deftypefun int starpu_data_set_rank (starpu_data_handle_t @var{handle}, int @var{rank})
-Tell StarPU-MPI which MPI node "owns" a given data, that is, the node which will
-always keep an up-to-date value, and will by default execute tasks which write
-to it.
-@end deftypefun
-
-@deftypefun int starpu_data_get_rank (starpu_data_handle_t @var{handle})
-Returns the last value set by @code{starpu_data_set_rank}.
-@end deftypefun
-
-@defmac STARPU_EXECUTE_ON_NODE
-this macro is used when calling @code{starpu_mpi_insert_task}, and
-must be followed by a integer value which specified the node on which
-to execute the codelet.
-@end defmac
-
-@defmac STARPU_EXECUTE_ON_DATA
-this macro is used when calling @code{starpu_mpi_insert_task}, and
-must be followed by a data handle to specify that the node owning the
-given data will execute the codelet.
-@end defmac
-
-@deftypefun int starpu_mpi_insert_task (MPI_Comm @var{comm}, struct starpu_codelet *@var{codelet}, ...)
-Create and submit a task corresponding to @var{codelet} with the following
-arguments.  The argument list must be zero-terminated.
-
-The arguments following the codelets are the same types as for the
-function @code{starpu_insert_task} defined in @ref{Insert Task
-Utility}. The extra argument @code{STARPU_EXECUTE_ON_NODE} followed by an
-integer allows to specify the MPI node to execute the codelet. It is also
-possible to specify that the node owning a specific data will execute
-the codelet, by using @code{STARPU_EXECUTE_ON_DATA} followed by a data
-handle.
-
-The internal algorithm is as follows:
-@enumerate
-@item Find out which MPI node is going to execute the codelet.
-      @enumerate
-      @item If there is only one node owning data in W mode, it will
-      be selected;
-      @item If there is several nodes owning data in W node, the one
-      selected will be the one having the least data in R mode so as
-      to minimize the amount of data to be transfered;
-      @item The argument @code{STARPU_EXECUTE_ON_NODE} followed by an
-      integer can be used to specify the node;
-      @item The argument @code{STARPU_EXECUTE_ON_DATA} followed by a
-      data handle can be used to specify that the node owing the given
-      data will execute the codelet.
-      @end enumerate
-@item Send and receive data as requested. Nodes owning data which need to be
-read by the task are sending them to the MPI node which will execute it. The
-latter receives them.
-@item Execute the codelet. This is done by the MPI node selected in the
-1st step of the algorithm.
-@item If several MPI nodes own data to be written to, send written
-data back to their owners.
-@end enumerate
-
-The algorithm also includes a communication cache mechanism that
-allows not to send data twice to the same MPI node, unless the data
-has been modified. The cache can be disabled
-(@pxref{STARPU_MPI_CACHE}).
-@c todo parler plus du cache
-
-@end deftypefun
-
-@deftypefun void starpu_mpi_get_data_on_node (MPI_Comm @var{comm}, starpu_data_handle_t @var{data_handle}, int @var{node})
-Transfer data @var{data_handle} to MPI node @var{node}, sending it from its
-owner if needed. At least the target node and the owner have to call the
-function.
-@end deftypefun
+These functions are described in @ref{MPI Insert Task}.
 
 Here is a stencil example showing how to use @code{starpu_mpi_insert_task}. One
 first needs to define a distribution function which specifies the
 locality of the data. Note that the distribution information needs to
-be given to StarPU by calling @code{starpu_data_set_rank}.
+be given to StarPU by calling @code{starpu_data_set_rank}. An MPI tag
+should also be defined for each data handle by calling
+@code{starpu_data_set_tag}.
 
 @cartouche
 @smallexample
@@ -407,8 +326,10 @@ data which will be needed by the tasks that we will execute.
             else
                 /* I know it's useless to allocate anything for this */
                 data_handles[x][y] = NULL;
-            if (data_handles[x][y])
+            if (data_handles[x][y]) @{
                 starpu_data_set_rank(data_handles[x][y], mpi_rank);
+                starpu_data_set_tag(data_handles[x][y], x*X+y);
+            @}
         @}
     @}
 @end smallexample
@@ -446,27 +367,8 @@ execute them, or to send the required data).
 @node MPI Collective Operations
 @section MPI Collective Operations
 
-@deftypefun int starpu_mpi_scatter_detached (starpu_data_handle_t *@var{data_handles}, int @var{count}, int @var{root}, MPI_Comm @var{comm}, {void (*}@var{scallback})(void *), {void *}@var{sarg}, {void (*}@var{rcallback})(void *), {void *}@var{rarg})
-Scatter data among processes of the communicator based on the ownership of
-the data. For each data of the array @var{data_handles}, the
-process @var{root} sends the data to the process owning this data.
-Processes receiving data must have valid data handles to receive them.
-On completion of the collective communication, the @var{scallback} function is
-called with the argument @var{sarg} on the process @var{root}, the @var{rcallback} function is
-called with the argument @var{rarg} on any other process.
-@end deftypefun
-
-@deftypefun int starpu_mpi_gather_detached (starpu_data_handle_t *@var{data_handles}, int @var{count}, int @var{root}, MPI_Comm @var{comm}, {void (*}@var{scallback})(void *), {void *}@var{sarg}, {void (*}@var{rcallback})(void *), {void *}@var{rarg})
-Gather data from the different processes of the communicator onto the
-process @var{root}. Each process owning data handle in the array
-@var{data_handles} will send them to the process @var{root}. The
-process @var{root} must have valid data handles to receive the data.
-On completion of the collective communication, the @var{rcallback} function is
-called with the argument @var{rarg} on the process @var{root}, the @var{scallback} function is
-called with the argument @var{sarg} on any other process.
-@end deftypefun
+These functions are described in @ref{Collective Operations}.
 
-@page
 @cartouche
 @smallexample
 if (rank == root)
@@ -499,6 +401,7 @@ for(x = 0; x < nblocks ;  x++)
     @}
     if (data_handles[x]) @{
         starpu_data_set_rank(data_handles[x], mpi_rank);
+        starpu_data_set_tag(data_handles[x], x);
     @}
 @}
 

+ 5 - 36
doc/chapters/perf-feedback.texi

@@ -11,7 +11,7 @@
 * On-line::                     On-line performance feedback
 * Off-line::                    Off-line performance feedback
 * Codelet performance::         Performance of codelets
-* Theoretical lower bound on execution time API::
+* Theoretical lower bound on execution time::
 * Memory feedback::
 * Data statistics::
 @end menu
@@ -478,11 +478,10 @@ $ starpu_codelet_histo_profile distrib.data
 Which will create one pdf file per codelet and per input size, showing a
 histogram of the codelet execution time distribution.
 
-@node Theoretical lower bound on execution time API
+@node Theoretical lower bound on execution time
 @section Theoretical lower bound on execution time
 
-See @ref{Theoretical lower bound on execution time} for an example on how to use
-this API. It permits to record a trace of what tasks are needed to complete the
+StarPU can record a trace of what tasks are needed to complete the
 application, and then, by using a linear system, provide a theoretical lower
 bound of the execution time (i.e. with an ideal scheduling).
 
@@ -492,38 +491,8 @@ near to the bound computed with dependencies enabled (which takes a huge lot
 more time to compute), and thus provides a good-enough estimation of the ideal
 execution time.
 
-@deftypefun void starpu_bound_start (int @var{deps}, int @var{prio})
-Start recording tasks (resets stats).  @var{deps} tells whether
-dependencies should be recorded too (this is quite expensive)
-@end deftypefun
-
-@deftypefun void starpu_bound_stop (void)
-Stop recording tasks
-@end deftypefun
-
-@deftypefun void starpu_bound_print_dot ({FILE *}@var{output})
-Print the DAG that was recorded
-@end deftypefun
-
-@deftypefun void starpu_bound_compute ({double *}@var{res}, {double *}@var{integer_res}, int @var{integer})
-Get theoretical upper bound (in ms) (needs glpk support detected by @code{configure} script). It returns 0 if some performance models are not calibrated.
-@end deftypefun
-
-@deftypefun void starpu_bound_print_lp ({FILE *}@var{output})
-Emit the Linear Programming system on @var{output} for the recorded tasks, in
-the lp format
-@end deftypefun
-
-@deftypefun void starpu_bound_print_mps ({FILE *}@var{output})
-Emit the Linear Programming system on @var{output} for the recorded tasks, in
-the mps format
-@end deftypefun
-
-@deftypefun void starpu_bound_print ({FILE *}@var{output}, int @var{integer})
-Emit statistics of actual execution vs theoretical upper bound. @var{integer}
-permits to choose between integer solving (which takes a long time but is
-correct), and relaxed solving (which provides an approximate solution).
-@end deftypefun
+@ref{Theoretical lower bound on execution time} provides an example of how to
+use this functionality.
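+
+A typical usage sketch (the prototypes of the @code{starpu_bound} functions
+are given in the API chapter) could be:
+
+@cartouche
+@smallexample
+/* @b{start recording tasks, without dependencies (much cheaper)} */
+starpu_bound_start(0, 0);
+
+/* @b{... submit tasks and wait for their termination ...} */
+
+starpu_bound_stop();
+
+/* @b{emit the recorded linear programming system in the lp format} */
+starpu_bound_print_lp(stdout);
+@end smallexample
+@end cartouche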
 
 @node Memory feedback
 @section Memory feedback

+ 0 - 34
doc/chapters/perf-optimization.texi

@@ -14,7 +14,6 @@ TODO: improve!
 * Task submission::
 * Task priorities::
 * Task scheduling policy::
-* Task scheduling contexts::
 * Performance model calibration::
 * Task distribution vs Data transfer::
 * Data prefetch::
@@ -206,39 +205,6 @@ parallel tasks (still experimental).
 The @b{peager} (parallel eager) scheduler is similar to eager, it also
 supports parallel tasks (still experimental).
 
-@node Task scheduling contexts
-@section Task scheduling contexts
-Task scheduling contexts represent abstracts sets of workers that allow the programmers to control the distribution of computational resources (i.e. CPUs and
-GPUs) to concurrent parallel kernels. The main goal is to minimize interferences between the execution of multiple parallel kernels, by partitioning the underlying pool of workers using contexts.
-
-By default, the application submits tasks to an initial context, which disposes of all the computation ressources available to StarPU (all the workers). 
-If the application programmer plans to launch several parallel kernels simultaneusly, by default these kernels will be executed within this initial context, using a single scheduler policy(@pxref{Task scheduling policy}).
-Meanwhile, if the application programmer is aware of the demands of these kernels and of the specificity of the machine used to execute them, the workers can be divided between several contexts. 
-These scheduling contexts will isolate the execution of each kernel and they will permit the use of a scheduling policy proper to each one of them.
-In order to create the contexts, you have to know the indentifiers of the workers running within StarPU. 
-By passing a set of workers together with the scheduling policy to the function @code{starpu_sched_ctx_create}, you will get an identifier of the context created which you will use to indicate the context you want to submit the tasks to.
-
-@cartouche
-@smallexample
-/* @b{the list of ressources the context will manage} */
-int workerids[3] = @{1, 3, 10@};
-
-/* @b{indicate the scheduling policy to be used within the context, the list of 
-   workers assigned to it, the number of workers, the name of the context} */
-int id_ctx = starpu_sched_ctx_create("heft", workerids, 3, "my_ctx");
-
-/* @b{let StarPU know that the folowing tasks will be submitted to this context} */
-starpu_sched_ctx_set_task_context(id);
-
-/* @b{submit the task to StarPU} */
-starpu_task_submit(task);
-
-@end smallexample
-@end cartouche
-
-Note: Parallel greedy and parallel heft scheduling policies do not support the existence of several disjoint contexts on the machine. 
-Combined workers are constructed depending on the entire topology of the machine, not only the one belonging to a context.
-
 @node Performance model calibration
 @section Performance model calibration
 

+ 116 - 0
doc/chapters/sched_ctx.texi

@@ -0,0 +1,116 @@
+@c -*-texinfo-*-
+
+@c This file is part of the StarPU Handbook.
+@c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+@c Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+@c Copyright (C) 2011--2013 Institut National de Recherche en Informatique et Automatique
+@c See the file starpu.texi for copying conditions.
+
+TODO: improve!
+
+@menu
+* General Idea::
+* Create a Context::
+* Modify a Context::
+* Delete a Context::
+* Empty Context::
+* Contexts Sharing Workers::
+@end menu
+
+@node General Idea
+@section General Idea
+Scheduling contexts represent abstract sets of workers that allow the programmer to control the distribution of computational resources (i.e. CPUs and
+GPUs) to concurrent parallel kernels. The main goal is to minimize interference between the execution of multiple parallel kernels, by partitioning the underlying pool of workers using contexts.
+
+@node Create a Context
+@section Create a Context
+By default, the application submits tasks to an initial context, which has at its disposal all the computation resources available to StarPU (all the workers).
+If the application programmer plans to launch several parallel kernels simultaneously, by default these kernels will be executed within this initial context, using a single scheduling policy (@pxref{Task scheduling policy}).
+If the application programmer is aware of the demands of these kernels and of the specificities of the machine used to execute them, the workers can instead be divided between several contexts.
+These scheduling contexts will isolate the execution of each kernel and will permit the use of a scheduling policy proper to each one of them.
+In order to create the contexts, you have to know the identifiers of the workers running within StarPU.
+By passing a set of workers together with the scheduling policy to the function @code{starpu_sched_ctx_create}, you will get an identifier of the newly created context, which you will then use to indicate the context you want to submit the tasks to.
+
+@cartouche
+@smallexample
+/* @b{the list of resources the context will manage} */
+int workerids[3] = @{1, 3, 10@};
+
+/* @b{indicate the scheduling policy to be used within the context, the list of 
+   workers assigned to it, the number of workers, the name of the context} */
+int id_ctx = starpu_sched_ctx_create("dmda", workerids, 3, "my_ctx");
+
+/* @b{let StarPU know that the following tasks will be submitted to this context} */
+starpu_sched_ctx_set_task_context(id_ctx);
+
+/* @b{submit the task to StarPU} */
+starpu_task_submit(task);
+
+@end smallexample
+@end cartouche
+
+Note: Parallel greedy and parallel heft scheduling policies do not support the existence of several disjoint contexts on the machine.
+Combined workers are constructed depending on the entire topology of the machine, not only the part belonging to a context.
+
+
+@node Modify a Context
+@section Modify a Context
+A scheduling context can be modified dynamically. The application may change its requirements during the execution, and the programmer can add workers to a context or remove them if no longer needed.
+In the following example we have two scheduling contexts @code{sched_ctx1} and @code{sched_ctx2}. After executing a part of the tasks, some of the workers of @code{sched_ctx1} will be moved to context @code{sched_ctx2}.
+
+@cartouche
+@smallexample
+/* @b{the list of resources that context 1 will give away} */
+int workerids[3] = @{1, 3, 10@};
+
+/* @b{add the workers to context 2} */
+starpu_sched_ctx_add_workers(workerids, 3, sched_ctx2);
+
+/* @b{remove the workers from context 1} */
+starpu_sched_ctx_remove_workers(workerids, 3, sched_ctx1);
+
+@end smallexample
+@end cartouche
+
+@node Delete a Context
+@section Delete a Context
+When a context is no longer needed it must be deleted. The application can indicate which context should keep the resources of the deleted one.
+All the tasks of the context should have been executed before doing this. If the application needs to avoid a barrier before moving the resources from the deleted context to the inheritor one, it can simply indicate
+when the last task was submitted. The resources will then be moved as soon as this last task has finished executing, but the context should still be deleted at some point of the application.
+
+@cartouche
+@smallexample
+/* @b{when context 2 is deleted, context 1 will keep its resources} */
+starpu_sched_ctx_set_inheritor(sched_ctx2, sched_ctx1);
+
+/* @b{submit tasks to context 2} */
+for (i = 0; i < ntasks; i++)
+    starpu_task_submit_to_ctx(task[i], sched_ctx2);
+
+/* @b{indicate that context 2 finished submitting and that} */
+/* @b{as soon as the last task of context 2 finishes executing} */
+/* @b{its workers can be moved to the inheritor context} */
+starpu_sched_ctx_finished_submit(sched_ctx2);
+
+/* @b{wait for the tasks of both contexts to finish} */
+starpu_task_wait_for_all();
+
+/* @b{delete context 2} */
+starpu_sched_ctx_delete(sched_ctx2);
+
+/* @b{delete context 1} */
+starpu_sched_ctx_delete(sched_ctx1);
+@end smallexample
+@end cartouche
+
+@node Empty Context
+@section Empty Context
+A context may not have any resources at the beginning or at some point of the execution. Tasks can still be submitted to such contexts, and they will be executed as soon as the contexts have resources.
+A list of pending tasks is kept, and these tasks are submitted when workers are added to the context. However, if no resources are ever allocated, the program will not terminate.
+If these tasks do not have a high priority, the programmer can forbid the application from submitting them by calling the function @code{starpu_sched_ctx_stop_task_submission}.
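+
+A minimal sketch (the prototype of
+@code{starpu_sched_ctx_stop_task_submission} is assumed here to take no
+argument; see the API chapter for its exact definition):
+
+@cartouche
+@smallexample
+/* @b{the context has no resources: stop submitting tasks to it} */
+starpu_sched_ctx_stop_task_submission();
+@end smallexample
+@end cartouche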
+
+@node Contexts Sharing Workers
+@section Contexts Sharing Workers
+Contexts may share workers when a single context cannot execute efficiently enough alone on these workers or when the application decides to express a hierarchy of contexts. The workers apply
+a Round-Robin algorithm to choose the context from which they will ``pop'' next. By using the function @code{void starpu_sched_ctx_set_turn_to_other_ctx(int workerid, unsigned sched_ctx_id)}
+the programmer can impose that the worker @code{workerid} ``pop'' next from the context @code{sched_ctx_id}.
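+
+For instance, using the prototype given above, to impose that worker 2
+``pop'' next from @code{sched_ctx2}:
+
+@cartouche
+@smallexample
+/* @b{worker 2 will pop its next task from context sched_ctx2} */
+starpu_sched_ctx_set_turn_to_other_ctx(2, sched_ctx2);
+@end smallexample
+@end cartouche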

+ 24 - 290
doc/chapters/sched_ctx_hypervisor.texi

@@ -1,124 +1,44 @@
 @c -*-texinfo-*-
 
 @c This file is part of the StarPU Handbook.
-@c Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
+@c Copyright (C) 2011--2013 Institut National de Recherche en Informatique et Automatique
 @c See the file starpu.texi for copying conditions.
 
 @cindex Scheduling Context Hypervisor
 
+@menu
+* What is the Hypervisor::
+* Start the Hypervisor::
+* Interrogate the runtime::
+* Trigger the Hypervisor::
+* Resizing strategies::
+@end menu
+
+@node What is the Hypervisor
+@section What is the Hypervisor
 StarPU proposes a platform for constructing Scheduling Contexts, for deleting and modifying them dynamically.
 A parallel kernel can thus be isolated into a scheduling context, and interferences between several parallel kernels are avoided.
 If the user knows exactly how many workers each scheduling context needs, he can assign them to the contexts at their creation time or modify them during the execution of the program.
 
-The Scheduling Context Hypervisor Plugin is available for the users who do not dispose of a regular parallelism, who cannot know in advance the exact size of the context and need to resize the contexts according to the behavior of the parallel kernel.
+The Scheduling Context Hypervisor Plugin is available for users whose applications do not exhibit regular parallelism, who cannot know in advance the exact size of the contexts and need to resize them according to the behavior of the parallel kernels.
 The Hypervisor receives information from StarPU concerning the execution of the tasks, the efficiency of the resources, etc. and it decides accordingly when and how the contexts can be resized.
 Basic strategies of resizing scheduling contexts already exist but a platform for implementing additional custom ones is available.
 
-@menu
-* Managing the hypervisor::				Initialize the hypervisor
-* Registering Scheduling Contexts to the hypervisor:: 	Contexts have to register to the hypervisor
-* The user's input in the resizing process:: 		The user can help the hypervisor decide how to resize
-* Resizing strategies::					Several resizing strategies are proposed
-* Performance Counters::              			StarPU provides information to the Hypervisor through performance counters
-* Defining a new hypervisor policy::      		New Policies can be implemented
-@end menu
-
-@node Managing the hypervisor
-@section Managing the hypervisor
-There is a single hypervisor that is in charge of resizing contexts and the resizing strategy is chosen at the initialization of the hypervisor. A single resize can be done at a time.
-
-@deftypefun {struct starpu_sched_ctx_performance_counters *} sched_ctx_hypervisor_init ({struct sched_ctx_hypervisor_policy *} @var{policy})
-Initializes the hypervisor to use the strategy provided as parameter and creates the performance counters (see @pxref{Performance Counters}).
-These performance counters represent actually some callbacks that will be used by the contexts to notify the information needed by the hypervisor.
-@end deftypefun
-
-Note: The Hypervisor is actually a worker that takes this role once certain conditions trigger the resizing process (there is no additional thread assigned to the hypervisor).
-
-@deftypefun void sched_ctx_hypervisor_shutdown (void)
-The hypervisor and all information is freed. There is no synchronization between this function and starpu_shutdown. Thus, this should be done after starpu_shutdown(),
-because the performance counters will still need allocated callback functions.
-@end deftypefun
-
-@node Registering Scheduling Contexts to the hypervisor
-@section Registering Scheduling Contexts to the hypervisor
-Scheduling Contexts that have to be resized by the hypervisor must be first registered to the hypervisor. Whenever we want to exclude contexts from the resizing process we have to unregister them from the hypervisor.
-
-@deftypefun void sched_ctx_hypervisor_register_ctx (unsigned @var{sched_ctx}, double @var{total_flops})
-Register the context to the hypervisor, and indicate the number of flops the context will execute (needed for Gflops rate based strategy @pxref{Resizing strategies} or any other custom strategy needing it, for the others we can pass 0.0)
-@end deftypefun
-
-@deftypefun void sched_ctx_hypervisor_unregister_ctx (unsigned @var{sched_ctx})
-Unregister the context from the hypervisor
-@end deftypefun
-
-@node The user's input in the resizing process
-@section The user's input in the resizing process
-The user can totally forbid the resizing of a certain context or can then change his mind and allow it (in this case the resizing is managed by the hypervisor, that can forbid it or allow it)
-
-@deftypefun void sched_ctx_hypervisor_stop_resize (unsigned @var{sched_ctx})
-Forbid resizing of a context
-@end deftypefun
-
-@deftypefun void sched_ctx_hypervisor_start_resize (unsigned @var{sched_ctx})
-Allow resizing of a context
-@end deftypefun
+@node Start the Hypervisor
+@section Start the Hypervisor
+The Hypervisor must be initialized once at the beginning of the application. At this point a resizing policy should be indicated. This strategy depends on the information the application is able to provide to the hypervisor as well
+as on the accuracy needed for the resizing procedure. For example, the application may be able to provide an estimation of the workload of the contexts. In this situation the hypervisor may decide what resources the contexts need.
+However, if no information is provided, the hypervisor evaluates the behavior of the resources and of the application and makes a guess about the future.
+The hypervisor resizes only the registered contexts.
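+
+A minimal sketch of starting and stopping the hypervisor (the
+@code{dummy_policy} structure is assumed to have been filled in beforehand,
+as in the example of the hypervisor's API chapter):
+
+@cartouche
+@smallexample
+/* @b{initialize the hypervisor with the chosen resizing policy} */
+sched_ctx_hypervisor_init(&dummy_policy);
+
+/* @b{register the context to be resized, together with an estimation} */
+/* @b{of the number of flops it will execute (0.0 if unknown)} */
+sched_ctx_hypervisor_register_ctx(sched_ctx1, 0.0);
+
+/* @b{... execute the application ...} */
+
+/* @b{unregister the context, then shut the hypervisor down} */
+/* @b{(after starpu_shutdown())} */
+sched_ctx_hypervisor_unregister_ctx(sched_ctx1);
+sched_ctx_hypervisor_shutdown();
+@end smallexample
+@end cartouche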
 
-The user can then provide information to the hypervisor concerning the conditions of resizing.
+@node Interrogate the runtime
+@section Interrogate the runtime
+The runtime provides the hypervisor with information concerning the behavior of the resources and the application. This is done by using the performance counters, i.e. callbacks indicating when the resources are idle or not efficient, when the application submits tasks or when it becomes too slow.
 
-@deftypefun void sched_ctx_hypervisor_ioctl (unsigned @var{sched_ctx}, ...)
-Inputs conditions to the context @code{sched_ctx} with the following arguments.  The argument list must be zero-terminated.
+@node Trigger the Hypervisor
+@section Trigger the Hypervisor
+The resizing is triggered either when the application requires it or when the initial distribution of resources alters the performance of the application (the application is too slow, or the resources are idle for too long, with thresholds indicated by the user). When this happens, different resizing strategies are applied, targeting the minimization of the total execution time of the application, of the instant speed, or of the idle time of the resources.
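+
+For instance, a resize can be triggered when given workers of a context have
+been idle for too long, by setting a threshold through
+@code{sched_ctx_hypervisor_ioctl} and its @code{HYPERVISOR_MAX_IDLE}
+argument (a sketch; the threshold value below is arbitrary, and the
+zero-terminated argument list is described in the hypervisor's API chapter):
+
+@cartouche
+@smallexample
+int workerids[2] = @{0, 1@};
+
+/* @b{resize when workers 0 and 1 have been idle longer than the threshold} */
+sched_ctx_hypervisor_ioctl(sched_ctx1,
+                           HYPERVISOR_MAX_IDLE, workerids, 2, 5000.0,
+                           NULL);
+@end smallexample
+@end cartouche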
 
-@defmac HYPERVISOR_MAX_IDLE
-This macro is used when calling sched_ctx_hypervisor_ioctl and must be followed by 3 arguments:
-an array of int for the workerids to apply the condition, an int to indicate the size of the array, and a double value indicating
-the maximum idle time allowed for a worker before the resizing process should be triggered
-@end defmac
-
-@defmac HYPERVISOR_PRIORITY
-This macro is used when calling sched_ctx_hypervisor_ioctl and must be followed by 3 arguments:
-an array of int for the workerids to apply the condition, an int to indicate the size of the array, and an int value indicating
-the priority of the workers previously mentioned.
-The workers with the smallest priority are moved the first.
-@end defmac
-
-@defmac HYPERVISOR_MIN_WORKERS
-This macro is used when calling sched_ctx_hypervisor_ioctl and must be followed by 1 argument(int) indicating
-the minimum number of workers a context should have, underneath this limit the context cannot execute.
-@end defmac
-
-@defmac HYPERVISOR_MAX_WORKERS
-This macro is used when calling sched_ctx_hypervisor_ioctl and must be followed by 1 argument(int) indicating
-the maximum number of workers a context should have, above this limit the context would not be able to scale
-@end defmac
-
-@defmac HYPERVISOR_GRANULARITY
-This macro is used when calling sched_ctx_hypervisor_ioctl and must be followed by 1 argument(int) indicating
-the granularity of the resizing process (the number of workers should be moved from the context once it is resized)
-This parameter is ignore for the Gflops rate based strategy @pxref{Resizing strategies}, the number of workers that have to be moved is calculated by the strategy.
-@end defmac
-
-@defmac HYPERVISOR_FIXED_WORKERS
-This macro is used when calling sched_ctx_hypervisor_ioctl and must be followed by 2 arguments:
-an array of int for the workerids to apply the condition and an int to indicate the size of the array.
-These workers are not allowed to be moved from the context.
-@end defmac
-
-@defmac HYPERVISOR_MIN_TASKS
-This macro is used when calling sched_ctx_hypervisor_ioctl and must be followed by 1 argument (int)
-that indicated the minimum number of tasks that have to be executed before the context could be resized.
-This parameter is ignored for the Application Driven strategy @pxref{Resizing strategies} where the user indicates exactly when the resize should be done.
-@end defmac
-
-@defmac HYPERVISOR_NEW_WORKERS_MAX_IDLE
-This macro is used when calling sched_ctx_hypervisor_ioctl and must be followed by 1 argument, a double value indicating
-the maximum idle time allowed for workers that have just been moved from other contexts in the current context.
-@end defmac
-
-@defmac HYPERVISOR_TIME_TO_APPLY
-This macro is used when calling sched_ctx_hypervisor_ioctl and must be followed by 1 argument (int) indicating the tag
-an executed task should have such that this configuration should be taken into account.
-@end defmac
-@end deftypefun
 @node Resizing strategies
 @section Resizing strategies
 
@@ -205,190 +125,4 @@ starpu_insert_task(&codelet,
 @end smallexample
 @end cartouche
 
-@node Performance Counters
-@section Performance Counters
-
-The Scheduling Context Hypervisor Plugin provides a series of performance counters to StarPU. By incrementing them, StarPU can help the hypervisor in the resizing decision making process.
-
-@deftp {Data Type} {struct starpu_sched_ctx_performance_counters}
-@anchor{struct starpu_sched_ctx_performance_counters}
-
-@table @asis
-@item @code{void (*notify_idle_cycle)(unsigned sched_ctx_id, int worker, double idle_time)}
-Informs the hypervisor for how long a worker has been idle in the specified context
-@item @code{void (*notify_idle_end)(unsigned sched_ctx_id, int worker)}
-Informs the hypervisor that after a period of idle, the worker has just executed a task in the specified context.
-The idle counter it though reset.
-@item @code{void (*notify_pushed_task)(unsigned sched_ctx_id, int worker)}
-Notifies the hypervisor a task has been scheduled on the queue of the worker corresponding to the specified context
-@item @code{void (*notify_poped_task)(unsigned sched_ctx_id, int worker, double flops)}
-Informs the hypervisor a task executing a specified number of instructions has been poped from the worker
-@item @code{void (*notify_post_exec_hook)(unsigned sched_ctx_id, int taskid)}
-Notifies the hypervisor a task has just been executed
-
-@end table
-@end deftp
-
-TODO maybe they should be hidden to the user
-
-@node Defining a new hypervisor policy
-@section Defining a new hypervisor policy
-
-@menu
-* Hypervisor Policy API:: Hypervisor Policy API
-* Hypervisor example::
-@end menu
-
-@node Hypervisor Policy API
-@subsection Hypervisor Policy API
-
-While Scheduling Context Hypervisor Plugin comes with a variety of resizing policies (@pxref{Resizing strategies}),
-it may sometimes be desirable to implement custom
-policies to address specific problems.  The API described below allows
-users to write their own resizing policy.
-
-@deftp {Data Type} {struct sched_ctx_hypervisor_policy}
-This structure contains all the methods that implement a hypervisor resizing policy.
-
-@table @asis
-@item @code{const char* name}
-Indicates the name of the policy, if there is not a custom policy, the policy corresponding to this name will be used by the hypervisor
-@item @code{unsigned custom}
-Indicates whether the policy is custom or not
-@item @code{void (*handle_idle_cycle)(unsigned sched_ctx_id, int worker)}
-It is called whenever the indicated worker executes another idle cycle in @code{sched_ctx}
-@item @code{void (*handle_pushed_task)(unsigned sched_ctx_id, int worker)}
-It is called whenever a task is pushed on the worker's queue corresponding to the context @code{sched_ctx}
-@item @code{void (*handle_poped_task)(unsigned sched_ctx_id, int worker)}
-It is called whenever a task is poped from the worker's queue corresponding to the context @code{sched_ctx}
-@item @code{void (*handle_idle_end)(unsigned sched_ctx_id, int worker)}
-It is called whenever a task is executed on the indicated worker and context after a long period of idle time
-@item @code{void (*handle_post_exec_hook)(unsigned sched_ctx_id, struct starpu_htbl32_node* resize_requests, int task_tag)}
-It is called whenever a tag task has just been executed. The table of resize requests is provided as well as the tag
-@end table
-@end deftp
-
-The Hypervisor provides also a structure with configuration information of each context, which can be used to construct new resize strategies.
-
-@deftp {Data Type} {struct sched_ctx_hypervisor_policy_config }
-This structure contains all configuration information of a context
-
-@table @asis
-@item @code{int min_nworkers}
-Indicates the minimum number of workers needed by the context
-@item @code{int max_nworkers}
-Indicates the maximum number of workers needed by the context
-@item @code{int granularity}
-Indicates the workers granularity of the context
-@item @code{int priority[STARPU_NMAXWORKERS]}
-Indicates the priority of each worker in the context
-@item @code{double max_idle[STARPU_NMAXWORKERS]}
-Indicates the maximum idle time accepted before a resize is triggered
-@item @code{int fixed_workers[STARPU_NMAXWORKERS]}
-Indicates which workers can be moved and which ones are fixed
-@item @code{double new_workers_max_idle}
-Indicates the maximum idle time accepted before a resize is triggered for the workers that just arrived in the new context
-@end table
-@end deftp
-
-Additionally, the hypervisor provides a structure with information obtained from StarPU by means of the performance counters
-
-
-@deftp {Data Type} {struct sched_ctx_hypervisor_wrapper}
-This structure is a wrapper of the contexts available in StarPU
-and contains all information about a context obtained by incrementing the performance counters
-
-@table @asis
-@item @code{unsigned sched_ctx}
-The context wrapped
-@item @code{struct sched_ctx_hypervisor_policy_config *config}
-The corresponding resize configuration
-@item @code{double current_idle_time[STARPU_NMAXWORKERS]}
-The idle time counter of each worker of the context
-@item @code{int pushed_tasks[STARPU_NMAXWORKERS]}
-The number of pushed tasks of each worker of the context
-@item @code{int poped_tasks[STARPU_NMAXWORKERS]}
-The number of poped tasks of each worker of the context
-@item @code{double total_flops}
-The total number of flops to execute by the context
-@item @code{double total_elapsed_flops[STARPU_NMAXWORKERS]}
-The number of flops executed by each workers of the context
-@item @code{double elapsed_flops[STARPU_NMAXWORKERS]}
-The number of flops executed by each worker of the context from last resize
-@item @code{double remaining_flops}
-The number of flops that still have to be executed by the workers in the context
-@item @code{double start_time}
-The time when he started executed
-@item @code{struct sched_ctx_hypervisor_resize_ack resize_ack}
-The structure confirming the last resize finished and a new one can be done
-@end table
-@end deftp
-
-@deftp {Data Type} {struct sched_ctx_hypervisor_resize_ack}
-This structures checks if the workers moved to another context are actually taken into account in that context
-@table @asis
-@item @code{int receiver_sched_ctx}
-The context receiving the new workers
-@item @code{int *moved_workers}
-The workers moved to the receiver context
-@item @code{int nmoved_workers}
-The number of workers moved
-@item @code{int *acked_workers}
-If the value corresponding to a worker is 1, this one is taken into account in the new context if 0 not yet
-@end table
-@end deftp
-
-The following functions can be used in the resizing strategies.
-
-@deftypefun void sched_ctx_hypervisor_move_workers (unsigned @var{sender_sched_ctx}, unsigned @var{receiver_sched_ctx}, {int *}@var{workers_to_move}, unsigned @var{nworkers_to_move}, unsigned @var{now});
-Moves workers from one context to another
-@end deftypefun
-
-@deftypefun {struct sched_ctx_hypervisor_policy_config *} sched_ctx_hypervisor_get_config (unsigned @var{sched_ctx});
-Returns the configuration structure of a context
-@end deftypefun
-
-@deftypefun {int *} sched_ctx_hypervisor_get_sched_ctxs ();
-Gets the contexts managed by the hypervisor
-@end deftypefun
-
-@deftypefun int sched_ctx_hypervisor_get_nsched_ctxs ();
-Gets the number of contexts managed by the hypervisor
-@end deftypefun
-
-@deftypefun {struct sched_ctx_hypervisor_wrapper *} sched_ctx_hypervisor_get_wrapper (unsigned @var{sched_ctx});
-Returns the wrapper corresponding the context @code{sched_ctx}
-@end deftypefun
-
-@deftypefun double sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx ({struct sched_ctx_hypervisor_wrapper *} @var{sc_w});
-Returns the flops of a context elapsed from the last resize
-@end deftypefun
-
-@deftypefun {char *} sched_ctx_hypervisor_get_policy ();
-Returns the name of the resizing policy the hypervisor uses
-@end deftypefun
-
-@node Hypervisor example
-@subsection Hypervisor example
-
-@cartouche
-@smallexample
-
-struct sched_ctx_hypervisor_policy dummy_policy =
-@{
-       .handle_poped_task = dummy_handle_poped_task,
-       .handle_pushed_task = dummy_handle_pushed_task,
-       .handle_idle_cycle = dummy_handle_idle_cycle,
-       .handle_idle_end = dummy_handle_idle_end,
-       .handle_post_exec_hook = dummy_handle_post_exec_hook,
-       .custom = 1,
-       .name = "dummy"
-@};
-
-@end smallexample
-@end cartouche
 
-@c Local Variables:
-@c TeX-master: "../starpu.texi"
-@c ispell-local-dictionary: "american"
-@c End:

+ 18 - 0
doc/chapters/tips-tricks.texi

@@ -9,6 +9,7 @@
 @menu
 * Per-worker library initialization::  How to initialize a computation library once for each worker?
 * Limit memory::
+* Thread Binding on NetBSD::
 @end menu
 
 @node Per-worker library initialization
@@ -91,3 +92,20 @@ and @code{STARPU_LIMIT_CPU_MEM}
 
 @code{starpu_memory_get_available}
 
+@node Thread Binding on NetBSD
+@section Thread Binding on NetBSD
+
+When using StarPU on a NetBSD machine, if the topology
+discovery library @code{hwloc} is used, thread binding will fail. To
+prevent the problem, you should use at least version 1.7 of
+@code{hwloc}, and also issue the following call:
+
+@example
+$ sysctl -w security.models.extensions.user_set_cpu_affinity=1
+@end example
+
+Or add the following line to the file @code{/etc/sysctl.conf}:
+
+@example
+security.models.extensions.user_set_cpu_affinity=1
+@end example

+ 0 - 134
doc/chapters/using.texi

@@ -1,134 +0,0 @@
-@c -*-texinfo-*-
-
-@c This file is part of the StarPU Handbook.
-@c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
-@c Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
-@c Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
-@c See the file starpu.texi for copying conditions.
-
-@menu
-* Setting flags for compiling::
-* Running a basic StarPU application::
-* Kernel threads started by StarPU::
-* Enabling OpenCL::
-@end menu
-
-@node Setting flags for compiling
-@section Setting flags for compiling, linking and running applications
-
-StarPU provides a pkg-config executable to obtain relevant compiler
-and linker flags.
-Compiling and linking an application against StarPU may require to use
-specific flags or libraries (for instance @code{CUDA} or @code{libspe2}).
-To this end, it is possible to use the @code{pkg-config} tool.
-
-If StarPU was not installed at some standard location, the path of StarPU's
-library must be specified in the @code{PKG_CONFIG_PATH} environment variable so
-that @code{pkg-config} can find it. For example if StarPU was installed in
-@code{$prefix_dir}:
-
-@example
-$ PKG_CONFIG_PATH=$PKG_CONFIG_PATH:$prefix_dir/lib/pkgconfig
-@end example
-
-The flags required to compile or link against StarPU are then
-accessible with the following commands@footnote{It is still possible to use the API
-provided in the version 0.9 of StarPU by calling @code{pkg-config}
-with the @code{libstarpu} package. Similar packages are provided for
-@code{libstarpumpi} and @code{libstarpufft}.}:
-
-@example
-$ pkg-config --cflags starpu-1.0  # options for the compiler
-$ pkg-config --libs starpu-1.0    # options for the linker
-@end example
-
-Make sure that @code{pkg-config --libs starpu-1.0} actually produces some output
-before going further: @code{PKG_CONFIG_PATH} has to point to the place where
-@code{starpu-1.0.pc} was installed during @code{make install}.
-
-Also pass the @code{--static} option if the application is to be
-linked statically.
-
-It is also necessary to set the variable @code{LD_LIBRARY_PATH} to
-locate dynamic libraries at runtime.
-
-@example
-$ LD_LIBRARY_PATH=$prefix_dir/lib:$LD_LIBRARY_PATH
-@end example
-
-When using a Makefile, the following lines can be added to set the
-options for the compiler and the linker:
-
-@cartouche
-@example
-CFLAGS          +=      $$(pkg-config --cflags starpu-1.0)
-LDFLAGS         +=      $$(pkg-config --libs starpu-1.0)
-@end example
-@end cartouche
-
-@node Running a basic StarPU application
-@section Running a basic StarPU application
-
-Basic examples using StarPU are built in the directory
-@code{examples/basic_examples/} (and installed in
-@code{$prefix_dir/lib/starpu/examples/}). You can for example run the example
-@code{vector_scal}.
-
-@example
-$ ./examples/basic_examples/vector_scal
-BEFORE: First element was 1.000000
-AFTER: First element is 3.140000
-@end example
-
-When StarPU is used for the first time, the directory
-@code{$STARPU_HOME/.starpu/} is created, performance models will be stored in
-that directory (@pxref{STARPU_HOME}).
-
-Please note that buses are benchmarked when StarPU is launched for the
-first time. This may take a few minutes, or less if @code{hwloc} is
-installed. This step is done only once per user and per machine.
-
-@node Kernel threads started by StarPU
-@section Kernel threads started by StarPU
-
-StarPU automatically binds one thread per CPU core. It does not use
-SMT/hyperthreading because kernels are usually already optimized for using a
-full core, and using hyperthreading would make kernel calibration rather random.
-
-Since driving GPUs is a CPU-consuming task, StarPU dedicates one core per GPU
-
-While StarPU tasks are executing, the application is not supposed to do
-computations in the threads it starts itself, tasks should be used instead.
-
-TODO: add a StarPU function to bind an application thread (e.g. the main thread)
-to a dedicated core (and thus disable the corresponding StarPU CPU worker).
-
-@node Enabling OpenCL
-@section Enabling OpenCL
-
-When both CUDA and OpenCL drivers are enabled, StarPU will launch an
-OpenCL worker for NVIDIA GPUs only if CUDA is not already running on them.
-This design choice was necessary as OpenCL and CUDA can not run at the
-same time on the same NVIDIA GPU, as there is currently no interoperability
-between them.
-
-To enable OpenCL, you need either to disable CUDA when configuring StarPU:
-
-@example
-$ ./configure --disable-cuda
-@end example
-
-or when running applications:
-
-@example
-$ STARPU_NCUDA=0 ./application
-@end example
-
-OpenCL will automatically be started on any device not yet used by
-CUDA. So on a machine running 4 GPUS, it is therefore possible to
-enable CUDA on 2 devices, and OpenCL on the 2 other devices by doing
-so:
-
-@example
-$ STARPU_NCUDA=2 ./application
-@end example

+ 3 - 3
doc/chapters/vector_scal_cuda.texi

@@ -2,13 +2,13 @@
 
 @c This file is part of the StarPU Handbook.
 @c Copyright (C) 2009-2012  Université de Bordeaux 1
-@c Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+@c Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
 @c See the file starpu.texi for copying conditions.
 
 @smallexample
 #include <starpu.h>
 
-static __global__ void vector_mult_cuda(float *val, unsigned n,
+static __global__ void vector_mult_cuda(unsigned n, float *val,
                                         float factor)
 @{
         unsigned i =  blockIdx.x*blockDim.x + threadIdx.x;
@@ -28,7 +28,7 @@ extern "C" void scal_cuda_func(void *buffers[], void *_args)
         unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
         vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>
-	                (val, n, *factor);
+	                (n, val, *factor);
 
         cudaStreamSynchronize(starpu_cuda_get_local_stream());
 @}

+ 3 - 3
doc/chapters/vector_scal_opencl.texi

@@ -2,7 +2,7 @@
 
 @c This file is part of the StarPU Handbook.
 @c Copyright (C) 2009-2011  Université de Bordeaux 1
-@c Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+@c Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
 @c See the file starpu.texi for copying conditions.
 
 @smallexample
@@ -30,8 +30,8 @@ void scal_opencl_func(void *buffers[], void *_args)
                                     devid);
     if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
-    err = clSetKernelArg(kernel, 0, sizeof(val), &val);
-    err |= clSetKernelArg(kernel, 1, sizeof(n), &n);
+    err = clSetKernelArg(kernel, 0, sizeof(n), &n);
+    err |= clSetKernelArg(kernel, 1, sizeof(val), &val);
     err |= clSetKernelArg(kernel, 2, sizeof(*factor), factor);
     if (err) STARPU_OPENCL_REPORT_ERROR(err);
 

+ 2 - 2
doc/chapters/vector_scal_opencl_codelet.texi

@@ -2,11 +2,11 @@
 
 @c This file is part of the StarPU Handbook.
 @c Copyright (C) 2009-2011  Université de Bordeaux 1
-@c Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+@c Copyright (C) 2010, 2011, 2013  Centre National de la Recherche Scientifique
 @c See the file starpu.texi for copying conditions.
 
 @smallexample
-__kernel void vector_mult_opencl(__global float* val, int nx, float factor)
+__kernel void vector_mult_opencl(int nx, __global float* val, float factor)
 @{
         const int i = get_global_id(0);
         if (i < nx) @{

+ 35 - 44
doc/starpu.texi

@@ -65,23 +65,22 @@ was last updated on @value{UPDATED}.
 @comment  better formatting.
 @comment
 @menu
-* Introduction::                	Getting started
-* Installing StarPU::           	How to configure, build and install StarPU
-* Using StarPU::                	How to run StarPU application
-* Basic Examples::              	Basic examples of the use of StarPU
-* Advanced Examples::           	Advanced examples of the use of StarPU
-* Benchmarks::                  	Benchmarks worth running
-* Performance optimization::    	How to optimize performance with StarPU
-* Performance feedback::        	Performance debugging tools
-* Tips and Tricks::             	Tips and tricks to know about
-* StarPU MPI support::          	How to combine StarPU with MPI
-* StarPU FFT support::          	How to perform FFT computations with StarPU
-* C Extensions::                	Easier StarPU programming with GCC
-* SOCL OpenCL Extensions::      	How to use OpenCL on top of StarPU
-* Scheduling Context Hypervisor:: 	How to use Scheduling Context Hypervisor with StarPU
-* StarPU Basic API::            	The Basic API to use StarPU
-* StarPU Advanced API::         	Advanced use of StarPU
-* Configuring StarPU::          	How to configure StarPU
+* Introduction::                Getting started
+* Building and Installing StarPU::
+* Basic Examples::              Basic examples of the use of StarPU
+* Advanced Examples::           Advanced examples of the use of StarPU
+* Performance optimization::    How to optimize performance with StarPU
+* Performance feedback::        Performance debugging tools
+* Tips and Tricks::             Tips and tricks to know about
+* StarPU MPI support::          How to combine StarPU with MPI
+* StarPU FFT support::          How to perform FFT computations with StarPU
+* C Extensions::                Easier StarPU programming with GCC
+* SOCL OpenCL Extensions::      How to use OpenCL on top of StarPU
+* Scheduling Contexts in StarPU::         How to use Scheduling Contexts of StarPU
+* Scheduling Context Hypervisor::  How to use Scheduling Context Hypervisor with StarPU
+* StarPU's API::                The API to use StarPU
+* Scheduling Context Hypervisor's API:: The API to use the Hypervisor
+* Configuration Options for StarPU::
 * Full source code for the 'Scaling a Vector' example::
 * GNU Free Documentation License::  How you can copy and share this manual.
 
@@ -103,19 +102,11 @@ was last updated on @value{UPDATED}.
 @c Installing StarPU
 @c ---------------------------------------------------------------------
 
-@node Installing StarPU
-@chapter Installing StarPU
+@node Building and Installing StarPU
+@chapter Building and Installing StarPU
 @include chapters/installing.texi
 
 @c ---------------------------------------------------------------------
-@c Using StarPU
-@c ---------------------------------------------------------------------
-
-@node Using StarPU
-@chapter Using StarPU
-@include chapters/using.texi
-
-@c ---------------------------------------------------------------------
 @c Basic Examples
 @c ---------------------------------------------------------------------
 
@@ -132,14 +123,6 @@ was last updated on @value{UPDATED}.
 @include chapters/advanced-examples.texi
 
 @c ---------------------------------------------------------------------
-@c Benchmarks
-@c ---------------------------------------------------------------------
-
-@node Benchmarks
-@chapter Benchmarks
-@include chapters/benchmarks.texi
-
-@c ---------------------------------------------------------------------
 @c Performance options
 @c ---------------------------------------------------------------------
 
@@ -196,6 +179,14 @@ was last updated on @value{UPDATED}.
 @include chapters/socl.texi
 
 @c ---------------------------------------------------------------------
+@c Scheduling Contexts in StarPU
+@c ---------------------------------------------------------------------
+
+@node Scheduling Contexts in StarPU
+@chapter Scheduling Contexts in StarPU
+@include chapters/sched_ctx.texi
+
+@c ---------------------------------------------------------------------
 @c Scheduling Context Hypervisor
 @c ---------------------------------------------------------------------
 
@@ -207,24 +198,24 @@ was last updated on @value{UPDATED}.
 @c StarPU API
 @c ---------------------------------------------------------------------
 
-@node StarPU Basic API
-@chapter StarPU Basic API
-@include chapters/basic-api.texi
+@node StarPU's API
+@chapter StarPU's API
+@include chapters/api.texi
 
 @c ---------------------------------------------------------------------
-@c Advanced Topics
+@c Scheduling Context Hypervisor's API
 @c ---------------------------------------------------------------------
 
-@node StarPU Advanced API
-@chapter StarPU Advanced API
-@include chapters/advanced-api.texi
+@node Scheduling Context Hypervisor's API
+@chapter Scheduling Context Hypervisor's API
+@include chapters/hypervisor_api.texi
 
 @c ---------------------------------------------------------------------
 @c Configuration options
 @c ---------------------------------------------------------------------
 
-@node Configuring StarPU
-@chapter Configuring StarPU
+@node Configuration Options for StarPU
+@chapter Configuration Options for StarPU
 @include chapters/configuration.texi
 
 @c ---------------------------------------------------------------------

+ 11 - 6
doc/tutorial/Makefile

@@ -14,17 +14,19 @@
 #
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
-CFLAGS          +=      $$(pkg-config --cflags libstarpu-1.0)
-LDFLAGS         +=      $$(pkg-config --libs libstarpu-1.0)
+CFLAGS          +=      $$(pkg-config --cflags starpu-1.1)
+LDFLAGS         +=      $$(pkg-config --libs starpu-1.1)
 
-HAS_CUDA	=	$(shell pkg-config --libs libstarpu-1.0 |grep -i cuda)
+HAS_CUDA	=	$(shell pkg-config --libs starpu-1.1 |grep -i cuda)
 NVCC		?=	nvcc
-HAS_OPENCL	=	$(shell pkg-config --libs libstarpu-1.0 |grep -i opencl)
+HAS_OPENCL	=	$(shell pkg-config --libs starpu-1.1 |grep -i opencl)
 
 %.o: %.cu
 	nvcc $(CFLAGS) $< -c
 
-all: hello_world vector_scal
+TARGETS = hello_world vector_scal hello_world_plugin
+
+all: $(TARGETS)
 
 VECTOR_SCAL_PREREQUISITES	=	vector_scal.o vector_scal_cpu.o
 ifneq ($(strip $(HAS_CUDA)),)
@@ -40,5 +42,8 @@ endif
 vector_scal: $(VECTOR_SCAL_PREREQUISITES)
 	$(VECTOR_SCAL_COMPILER) $(LDFLAGS) $^ -o $@
 
+hello_world_plugin: hello_world_plugin.c
+	$(CC) $(CFLAGS) -fplugin=`pkg-config starpu-1.1 --variable=gccplugin` $(LDFLAGS) $^ -o $@
+
 clean:
-	rm -f hello_world vector_scal *.o
+	rm -f $(TARGETS) *.o

+ 4 - 1
doc/tutorial/README

@@ -1,7 +1,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 # Copyright (C) 2009-2011  Université de Bordeaux 1
-# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+# Copyright (C) 2010, 2011, 2013  Centre National de la Recherche Scientifique
 #
 # Redistribution  and  use  in  source and binary forms, with or without
 # modification,  are  permitted  provided  that the following conditions
@@ -44,3 +44,6 @@ Instructions on how to compile and run StarPU examples
 % STARPU_NCPU=0 ./vector_scal
 % STARPU_NCPU=0 STARPU_NCUDA=0 ./vector_scal
 
+% make hello_world_plugin
+% ./hello_world_plugin
+

+ 28 - 24
doc/tutorial/hello_world.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2011  Université de Bordeaux 1
- * Copyright (C) 2010-2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010-2011, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -19,52 +19,56 @@
 
 struct params
 {
-    int i;
-    float f;
+	int i;
+	float f;
 };
 
 void cpu_func(void *buffers[], void *cl_arg)
 {
-    struct params *params = cl_arg;
+	struct params *params = cl_arg;
 
-    printf("Hello world (params = {%i, %f} )\n", params->i, params->f);
+	printf("Hello world (params = {%i, %f} )\n", params->i, params->f);
 }
 
 struct starpu_codelet cl =
 {
-    .cpu_funcs = {cpu_func, NULL},
-    .nbuffers = 0
+	.cpu_funcs = {cpu_func, NULL},
+	.nbuffers = 0
 };
 
 void callback_func(void *callback_arg)
 {
-    printf("Callback function (arg %x)\n", callback_arg);
+	printf("Callback function (arg %x)\n", callback_arg);
 }
 
 int main(int argc, char **argv)
 {
-    /* initialize StarPU */
-    starpu_init(NULL);
+	int ret;
 
-    struct starpu_task *task = starpu_task_create();
+	/* initialize StarPU */
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-    task->cl = &cl; /* Pointer to the codelet defined above */
+	struct starpu_task *task = starpu_task_create();
 
-    struct params params = { 1, 2.0f };
-    task->cl_arg = &params;
-    task->cl_arg_size = sizeof(params);
+	task->cl = &cl; /* Pointer to the codelet defined above */
 
-    task->callback_func = callback_func;
-    task->callback_arg = 0x42;
+	struct params params = { 1, 2.0f };
+	task->cl_arg = &params;
+	task->cl_arg_size = sizeof(params);
 
-    /* starpu_task_submit will be a blocking call */
-    task->synchronous = 1;
+	task->callback_func = callback_func;
+	task->callback_arg = (void*) (uintptr_t) 0x42;
 
-    /* submit the task to StarPU */
-    starpu_task_submit(task);
+	/* starpu_task_submit will be a blocking call */
+	task->synchronous = 1;
 
-    /* terminate StarPU */
-    starpu_shutdown();
+	/* submit the task to StarPU */
+	ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 
-    return 0;
+	/* terminate StarPU */
+	starpu_shutdown();
+
+	return 0;
 }

+ 43 - 0
doc/tutorial/hello_world_plugin.c

@@ -0,0 +1,43 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdio.h>
+
+/* Task declaration. */
+static void my_task (int x) __attribute__ ((task));
+
+/* Definition of the CPU implementation of ‘my task’. */
+static void my_task (int x)
+{
+	printf ("Hello, world! With x = %d\n", x);
+}
+
+int main ()
+{
+/* Initialize StarPU. */
+#pragma starpu initialize
+
+/* Do an asynchronous call to ‘my task’. */
+	my_task (42);
+
+/* Wait for the call to complete. */
+#pragma starpu wait
+
+/* Terminate. */
+#pragma starpu shutdown
+	return 0;
+}
+

+ 70 - 68
doc/tutorial/vector_scal.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2010-2012  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -26,23 +26,23 @@
 
 #define    NX    2048
 
-extern void scal_cpu_func(void *buffers[], void *_args);
-extern void scal_cuda_func(void *buffers[], void *_args);
-extern void scal_opencl_func(void *buffers[], void *_args);
+extern void vector_scal_cpu(void *buffers[], void *_args);
+extern void vector_scal_cuda(void *buffers[], void *_args);
+extern void vector_scal_opencl(void *buffers[], void *_args);
 
 static struct starpu_codelet cl = {
-    /* CPU implementation of the codelet */
-    .cpu_funcs = {scal_cpu_func, NULL},
+	/* CPU implementation of the codelet */
+	.cpu_funcs = {vector_scal_cpu, NULL},
 #ifdef STARPU_USE_CUDA
-    /* CUDA implementation of the codelet */
-    .cuda_funcs = {scal_cuda_func, NULL},
+	/* CUDA implementation of the codelet */
+	.cuda_funcs = {vector_scal_cuda, NULL},
 #endif
 #ifdef STARPU_USE_OPENCL
-    /* OpenCL implementation of the codelet */
-    .opencl_funcs = {scal_opencl_func, NULL},
+	/* OpenCL implementation of the codelet */
+	.opencl_funcs = {vector_scal_opencl, NULL},
 #endif
-    .nbuffers = 1,
-    .modes = {STARPU_RW}
+	.nbuffers = 1,
+	.modes = {STARPU_RW}
 };
 
 #ifdef STARPU_USE_OPENCL
@@ -51,72 +51,74 @@ struct starpu_opencl_program programs;
 
 int main(int argc, char **argv)
 {
-    /* We consider a vector of float that is initialized just as any of C
-      * data */
-    float vector[NX];
-    unsigned i;
-    for (i = 0; i < NX; i++)
-        vector[i] = 1.0f;
+	/* We consider a vector of floats that is initialized just like any
+	 * other C array */
+	float vector[NX];
+	unsigned i;
+	for (i = 0; i < NX; i++)
+		vector[i] = 1.0f;
 
-    fprintf(stderr, "BEFORE : First element was %f\n", vector[0]);
+	fprintf(stderr, "BEFORE : First element was %f\n", vector[0]);
 
-    /* Initialize StarPU with default configuration */
-    starpu_init(NULL);
+	/* Initialize StarPU with default configuration */
+	int ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 #ifdef STARPU_USE_OPENCL
-        starpu_opencl_load_opencl_from_file("vector_scal_opencl_kernel.cl", &programs, NULL);
+	starpu_opencl_load_opencl_from_file("vector_scal_opencl_kernel.cl", &programs, NULL);
 #endif
 
-    /* Tell StaPU to associate the "vector" vector with the "vector_handle"
-     * identifier. When a task needs to access a piece of data, it should
-     * refer to the handle that is associated to it.
-     * In the case of the "vector" data interface:
-     *  - the first argument of the registration method is a pointer to the
-     *    handle that should describe the data
-     *  - the second argument is the memory node where the data (ie. "vector")
-     *    resides initially: 0 stands for an address in main memory, as
-     *    opposed to an adress on a GPU for instance.
-     *  - the third argument is the adress of the vector in RAM
-     *  - the fourth argument is the number of elements in the vector
-     *  - the fifth argument is the size of each element.
-     */
-    starpu_data_handle_t vector_handle;
-    starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector,
-                                NX, sizeof(vector[0]));
-
-    float factor = 3.14;
-
-    /* create a synchronous task: any call to starpu_task_submit will block
-      * until it is terminated */
-    struct starpu_task *task = starpu_task_create();
-    task->synchronous = 1;
-
-    task->cl = &cl;
-
-    /* the codelet manipulates one buffer in RW mode */
-    task->handles[0] = vector_handle;
-
-    /* an argument is passed to the codelet, beware that this is a
-     * READ-ONLY buffer and that the codelet may be given a pointer to a
-     * COPY of the argument */
-    task->cl_arg = &factor;
-    task->cl_arg_size = sizeof(factor);
-
-    /* execute the task on any eligible computational ressource */
-    starpu_task_submit(task);
-
-    /* StarPU does not need to manipulate the array anymore so we can stop
-      * monitoring it */
-    starpu_data_unregister(vector_handle);
+	/* Tell StarPU to associate the "vector" vector with the "vector_handle"
+	 * identifier. When a task needs to access a piece of data, it should
+	 * refer to the handle that is associated with it.
+	 * In the case of the "vector" data interface:
+	 *  - the first argument of the registration method is a pointer to the
+	 *    handle that should describe the data
+	 *  - the second argument is the memory node where the data (i.e. "vector")
+	 *    resides initially: 0 stands for an address in main memory, as
+	 *    opposed to an address on a GPU for instance.
+	 *  - the third argument is the address of the vector in RAM
+	 *  - the fourth argument is the number of elements in the vector
+	 *  - the fifth argument is the size of each element.
+	 */
+	starpu_data_handle_t vector_handle;
+	starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector,
+				    NX, sizeof(vector[0]));
+
+	float factor = 3.14;
+
+	/* create a synchronous task: any call to starpu_task_submit will block
+	 * until it is terminated */
+	struct starpu_task *task = starpu_task_create();
+	task->synchronous = 1;
+
+	task->cl = &cl;
+
+	/* the codelet manipulates one buffer in RW mode */
+	task->handles[0] = vector_handle;
+
+	/* an argument is passed to the codelet, beware that this is a
+	 * READ-ONLY buffer and that the codelet may be given a pointer to a
+	 * COPY of the argument */
+	task->cl_arg = &factor;
+	task->cl_arg_size = sizeof(factor);
+
+	/* execute the task on any eligible computational resource */
+	ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+	/* StarPU does not need to manipulate the array anymore so we can stop
+	 * monitoring it */
+	starpu_data_unregister(vector_handle);
 
 #ifdef STARPU_USE_OPENCL
-    starpu_opencl_unload_opencl(&programs);
+	starpu_opencl_unload_opencl(&programs);
 #endif
 
-    /* terminate StarPU, no task can be submitted after */
-    starpu_shutdown();
+	/* terminate StarPU, no task can be submitted after */
+	starpu_shutdown();
 
-    fprintf(stderr, "AFTER First element is %f\n", vector[0]);
+	fprintf(stderr, "AFTER First element is %f\n", vector[0]);
 
-    return 0;
+	return 0;
 }

+ 24 - 25
doc/tutorial/vector_scal_cpu.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -18,33 +18,32 @@
 #include <starpu.h>
 
 /* This kernel takes a buffer and scales it by a constant factor */
-void scal_cpu_func(void *buffers[], void *cl_arg)
+void vector_scal_cpu(void *buffers[], void *cl_arg)
 {
-    unsigned i;
-    float *factor = cl_arg;
+	unsigned i;
+	float *factor = cl_arg;
 
-    /*
-     * The "buffers" array matches the task->handles array: for instance
-     * task->handles[0] is a handle that corresponds to a data with
-     * vector "interface", so that the first entry of the array in the
-     * codelet  is a pointer to a structure describing such a vector (ie.
-     * struct starpu_vector_interface *). Here, we therefore manipulate
-     * the buffers[0] element as a vector: nx gives the number of elements
-     * in the array, ptr gives the location of the array (that was possibly
-     * migrated/replicated), and elemsize gives the size of each elements.
-     */
-    struct starpu_vector_interface *vector = buffers[0];
+	/*
+	 * The "buffers" array matches the task->handles array: for instance
+	 * task->handles[0] is a handle that corresponds to a data with
+	 * vector "interface", so that the first entry of the array in the
+	 * codelet is a pointer to a structure describing such a vector (i.e.
+	 * struct starpu_vector_interface *). Here, we therefore manipulate
+	 * the buffers[0] element as a vector: nx gives the number of elements
+	 * in the array, ptr gives the location of the array (that was possibly
+	 * migrated/replicated), and elemsize gives the size of each element.
+	 */
+	struct starpu_vector_interface *vector = buffers[0];
 
-    /* length of the vector */
-    unsigned n = STARPU_VECTOR_GET_NX(vector);
+	/* length of the vector */
+	unsigned n = STARPU_VECTOR_GET_NX(vector);
 
-    /* get a pointer to the local copy of the vector : note that we have to
-     * cast it in (float *) since a vector could contain any type of
-     * elements so that the .ptr field is actually a uintptr_t */
-    float *val = (float *)STARPU_VECTOR_GET_PTR(vector);
+	/* get a pointer to the local copy of the vector: note that we have to
+	 * cast it to (float *) since a vector could contain any type of
+	 * elements, so the .ptr field is actually a uintptr_t */
+	float *val = (float *)STARPU_VECTOR_GET_PTR(vector);
 
-    /* scale the vector */
-    for (i = 0; i < n; i++)
-        val[i] *= *factor;
+	/* scale the vector */
+	for (i = 0; i < n; i++)
+		val[i] *= *factor;
 }
-

+ 4 - 4
doc/tutorial/vector_scal_cuda.cu

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -17,19 +17,19 @@
 
 #include <starpu.h>
 
-static __global__ void vector_mult_cuda(float *val, unsigned n, float factor)
+static __global__ void vector_mult_cuda(float *val, unsigned int n, float factor)
 {
         unsigned i =  blockIdx.x*blockDim.x + threadIdx.x;
         if (i < n)
                val[i] *= factor;
 }
 
-extern "C" void scal_cuda_func(void *buffers[], void *_args)
+extern "C" void vector_scal_cuda(void *buffers[], void *_args)
 {
         float *factor = (float *)_args;
 
         /* length of the vector */
-        unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
+        unsigned int n = STARPU_VECTOR_GET_NX(buffers[0]);
         /* local copy of the vector pointer */
         float *val = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);
         unsigned threads_per_block = 64;

+ 37 - 37
doc/tutorial/vector_scal_opencl.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -19,41 +19,41 @@
 
 extern struct starpu_opencl_program programs;
 
-void scal_opencl_func(void *buffers[], void *_args)
+void vector_scal_opencl(void *buffers[], void *_args)
 {
-    float *factor = _args;
-    int id, devid, err;
-    cl_kernel kernel;
-    cl_command_queue queue;
-    cl_event event;
-
-    /* length of the vector */
-    unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
-    /* OpenCL copy of the vector pointer */
-    cl_mem val = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
-
-    id = starpu_worker_get_id();
-    devid = starpu_worker_get_devid(id);
-
-    err = starpu_opencl_load_kernel(&kernel, &queue, &programs,
-                    "vector_mult_opencl", devid);   /* Name of the codelet defined above */
-    if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
-
-    err = clSetKernelArg(kernel, 0, sizeof(val), &val);
-    err |= clSetKernelArg(kernel, 1, sizeof(n), &n);
-    err |= clSetKernelArg(kernel, 2, sizeof(*factor), factor);
-    if (err) STARPU_OPENCL_REPORT_ERROR(err);
-
-    {
-        size_t global=1;
-        size_t local=1;
-        err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
-        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
-    }
-
-    clFinish(queue);
-    starpu_opencl_collect_stats(event);
-    clReleaseEvent(event);
-
-    starpu_opencl_release_kernel(kernel);
+	float *factor = _args;
+	int id, devid, err;
+	cl_kernel kernel;
+	cl_command_queue queue;
+	cl_event event;
+
+	/* length of the vector */
+	unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
+	/* OpenCL copy of the vector pointer */
+	cl_mem val = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
+
+	id = starpu_worker_get_id();
+	devid = starpu_worker_get_devid(id);
+
+	err = starpu_opencl_load_kernel(&kernel, &queue, &programs,
+					"vector_mult_opencl", devid);   /* Name of the codelet defined above */
+	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = clSetKernelArg(kernel, 0, sizeof(val), &val);
+	err |= clSetKernelArg(kernel, 1, sizeof(n), &n);
+	err |= clSetKernelArg(kernel, 2, sizeof(*factor), factor);
+	if (err) STARPU_OPENCL_REPORT_ERROR(err);
+
+	{
+		size_t global=1;
+		size_t local=1;
+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
+		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+	}
+
+	clFinish(queue);
+	starpu_opencl_collect_stats(event);
+	clReleaseEvent(event);
+
+	starpu_opencl_release_kernel(kernel);
 }

+ 7 - 0
examples/Makefile.am

@@ -183,6 +183,7 @@ examplebin_PROGRAMS +=				\
 	sched_ctx/sched_ctx			\
 	reductions/dot_product			\
 	reductions/minmax_reduction		\
+	mandelbrot/mandelbrot			\
 	ppm_downscaler/ppm_downscaler		\
 	ppm_downscaler/yuv_downscaler
 
@@ -791,6 +792,12 @@ endif
 # Mandelbrot Set #
 ##################
 
+mandelbrot_mandelbrot_CPPFLAGS = $(AM_CFLAGS) $(AM_CPPFLAGS)
+if HAVE_X11
+mandelbrot_mandelbrot_CPPFLAGS += $(X_CFLAGS)
+mandelbrot_mandelbrot_LDADD = $(X_PRE_LIBS) $(X_LIBS) $(X_EXTRA_LIBS) -lX11
+endif
+
 ################
 # Top Examples #
 ################

+ 2 - 0
examples/basic_examples/multiformat.c

@@ -25,6 +25,8 @@ static int ncuda = 0;
 static int nopencl = 0;
 #endif
 
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
 static struct point array_of_structs[N_ELEMENTS];
 static starpu_data_handle_t array_of_structs_handle;
 

+ 2 - 0
examples/basic_examples/multiformat_conversion_codelets.c

@@ -17,6 +17,8 @@
 #include <starpu.h>
 #include "multiformat_types.h"
 
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
 #ifdef STARPU_USE_CUDA
 void cuda_to_cpu(void *buffers[], void *arg)
 {

+ 2 - 0
examples/basic_examples/multiformat_cuda.cu

@@ -17,6 +17,8 @@
 #include <starpu.h>
 #include "multiformat_types.h"
 
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
 static __global__ void multiformat_cuda(struct struct_of_arrays *soa, unsigned n)
 {
         unsigned i =  blockIdx.x*blockDim.x + threadIdx.x;

+ 1 - 2
examples/basic_examples/multiformat_types.h

@@ -13,6 +13,7 @@
  *
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
+
 #ifndef MULTIFORMAT_TYPES_H
 #define MULTIFORMAT_TYPES_H
 
@@ -28,6 +29,4 @@ struct point
 	float x, y;
 };
 
-#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
-
 #endif

+ 3 - 5
examples/basic_examples/vector_scal.c

@@ -171,13 +171,11 @@ int main(int argc, char **argv)
 	/* terminate StarPU, no task can be submitted after */
 	starpu_shutdown();
 
+	ret = approximately_equal(vector[1], (1+1.0f) * factor) && approximately_equal(vector[NX-1], (NX-1+1.0f) * factor);
 	FPRINTF(stderr, "[AFTER] 1-th element     : %3.2f (should be %3.2f)\n", vector[1], (1+1.0f) * factor);
 	FPRINTF(stderr, "[AFTER] (NX-1)-th element: %3.2f (should be %3.2f)\n", vector[NX-1], (NX-1+1.0f) * factor);
-
-	return ((approximately_equal(vector[1], (1+1.0f) * factor)
-		 && approximately_equal(vector[NX-1], (NX-1+1.0f) * factor))
-		? EXIT_SUCCESS
-		: EXIT_FAILURE);
+	FPRINTF(stderr, "[AFTER] Computation is%s correct\n", ret?"":" NOT");
+	return (ret ? EXIT_SUCCESS : EXIT_FAILURE);
 
 enodev:
 	return 77;

+ 3 - 3
examples/basic_examples/vector_scal_cuda.cu

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2010  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -21,7 +21,7 @@
 
 #include <starpu.h>
 
-static __global__ void vector_mult_cuda(float *val, unsigned n,
+static __global__ void vector_mult_cuda(unsigned n, float *val,
                                         float factor)
 {
         unsigned i =  blockIdx.x*blockDim.x + threadIdx.x;
@@ -41,7 +41,7 @@ extern "C" void scal_cuda_func(void *buffers[], void *_args)
 	unsigned threads_per_block = 64;
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
-        vector_mult_cuda<<<nblocks,threads_per_block,0,starpu_cuda_get_local_stream()>>>(val, n, *factor);
+        vector_mult_cuda<<<nblocks,threads_per_block,0,starpu_cuda_get_local_stream()>>>(n, val, *factor);
 
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 4 - 4
examples/basic_examples/vector_scal_opencl.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2010  Institut National de Recherche en Informatique et Automatique
  * Copyright (C) 2011  Université de Bordeaux 1
  *
@@ -34,7 +34,7 @@ void scal_opencl_func(void *buffers[], void *_args)
 	cl_event event;
 
 	/* length of the vector */
-	unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
+	unsigned int n = STARPU_VECTOR_GET_NX(buffers[0]);
 	/* OpenCL copy of the vector pointer */
 	cl_mem val = (cl_mem)STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
 
@@ -44,8 +44,8 @@ void scal_opencl_func(void *buffers[], void *_args)
 	err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, "vector_mult_opencl", devid);
 	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
-	err = clSetKernelArg(kernel, 0, sizeof(val), &val);
-	err |= clSetKernelArg(kernel, 1, sizeof(n), &n);
+	err = clSetKernelArg(kernel, 0, sizeof(n), &n);
+	err |= clSetKernelArg(kernel, 1, sizeof(val), &val);
 	err |= clSetKernelArg(kernel, 2, sizeof(*factor), factor);
 	if (err) STARPU_OPENCL_REPORT_ERROR(err);
 

+ 2 - 2
examples/basic_examples/vector_scal_opencl_kernel.cl

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -14,7 +14,7 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-__kernel void vector_mult_opencl(__global float* val, int nx, float factor)
+__kernel void vector_mult_opencl(unsigned int nx, __global float* val, float factor)
 {
         const int i = get_global_id(0);
         if (i < nx)

+ 6 - 1
examples/mult/xgemm.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2009-2013  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
@@ -277,6 +277,11 @@ static void parse_args(int argc, char **argv)
 			fprintf(stderr,"Currently selected: %ux%u * %ux%u and %ux%u blocks, %u iterations\n", zdim, ydim, xdim, zdim, nslicesx, nslicesy, niter);
 			exit(EXIT_SUCCESS);
 		}
+		else
+		{
+			fprintf(stderr,"Unrecognized option %s", argv[i]);
+			exit(EXIT_FAILURE);
+		}
 	}
 }
 

+ 41 - 24
examples/pi/pi.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2011, 2013  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
@@ -30,10 +30,12 @@ void cuda_kernel(void **descr, void *cl_arg);
 /* default value */
 static unsigned ntasks = 1024;
 
+static unsigned long long nshot_per_task = 16*1024*1024ULL;
+
 static void cpu_kernel(void *descr[], void *cl_arg)
 {
 	unsigned *directions = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
-	unsigned nx = NSHOT_PER_TASK;
+	unsigned nx = nshot_per_task;
 
 	TYPE *random_numbers = malloc(2*nx*sizeof(TYPE));
 	sobolCPU(2*nx/n_dimensions, n_dimensions, directions, random_numbers);
@@ -64,7 +66,7 @@ static void cpu_kernel(void *descr[], void *cl_arg)
 /* The amount of work does not depend on the data size at all :) */
 static size_t size_base(struct starpu_task *task, unsigned nimpl)
 {
-	return NSHOT_PER_TASK;
+	return nshot_per_task;
 }
 
 static void parse_args(int argc, char **argv)
@@ -77,9 +79,42 @@ static void parse_args(int argc, char **argv)
 			char *argptr;
 			ntasks = strtol(argv[++i], &argptr, 10);
 		}
+
+		if (strcmp(argv[i], "-nshot") == 0)
+		{
+			char *argptr;
+			nshot_per_task = strtol(argv[++i], &argptr, 10);
+		}
+		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0)
+		{
+			fprintf(stderr,"Usage: %s [options...]\n", argv[0]);
+			fprintf(stderr,"\n");
+			fprintf(stderr,"Options:\n");
+			fprintf(stderr,"-ntasks <n>		select the number of tasks\n");
+			fprintf(stderr,"-nshot <n>		select the number of shot per task\n");
+			exit(0);
+		}
 	}
 }
 
+static struct starpu_perfmodel model =
+{
+	.type = STARPU_HISTORY_BASED,
+	.size_base = size_base,
+	.symbol = "monte_carlo_pi"
+};
+
+static struct starpu_codelet pi_cl =
+{
+	.cpu_funcs = {cpu_kernel, NULL},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {cuda_kernel, NULL},
+#endif
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_W},
+	.model = &model
+};
+
 int main(int argc, char **argv)
 {
 	unsigned i;
@@ -120,24 +155,6 @@ int main(int argc, char **argv)
 	
 	starpu_data_partition(cnt_array_handle, &f);
 
-	static struct starpu_perfmodel model =
-	{
-		.type = STARPU_HISTORY_BASED,
-		.size_base = size_base,
-		.symbol = "monte_carlo_pi"
-	};
-
-	struct starpu_codelet cl =
-	{
-		.cpu_funcs = {cpu_kernel, NULL},
-#ifdef STARPU_USE_CUDA
-		.cuda_funcs = {cuda_kernel, NULL},
-#endif
-		.nbuffers = 2,
-		.modes = {STARPU_R, STARPU_W},
-		.model = &model
-	};
-
 	struct timeval start;
 	struct timeval end;
 
@@ -147,7 +164,7 @@ int main(int argc, char **argv)
 	{
 		struct starpu_task *task = starpu_task_create();
 
-		task->cl = &cl;
+		task->cl = &pi_cl;
 
 		STARPU_ASSERT(starpu_data_get_sub_data(cnt_array_handle, 1, i));
 
@@ -174,14 +191,14 @@ int main(int argc, char **argv)
 
 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
 
-	unsigned long total_shot_cnt = ntasks * NSHOT_PER_TASK;
+	unsigned long total_shot_cnt = ntasks * nshot_per_task;
 
 	/* Total surface : Pi * r^ 2 = Pi*1^2, total square surface : 2^2 = 4, probability to impact the disk: pi/4 */
 	FPRINTF(stderr, "Pi approximation : %f (%ld / %ld)\n", ((TYPE)total_cnt*4)/(total_shot_cnt), total_cnt, total_shot_cnt);
 	FPRINTF(stderr, "Total time : %f ms\n", timing/1000.0);
 	FPRINTF(stderr, "Speed : %f GShot/s\n", total_shot_cnt/(1e3*timing));
 
-	if (!getenv("STARPU_SSILENT")) starpu_display_codelet_stats(&cl);
+	if (!getenv("STARPU_SSILENT")) starpu_display_codelet_stats(&pi_cl);
 
 	starpu_shutdown();
 

+ 1 - 3
examples/pi/pi.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -21,8 +21,6 @@
 #include <starpu.h>
 #include <stdio.h>
 
-#define NSHOT_PER_TASK	(16*1024*1024ULL)
-
 #define TYPE	float
 
 /* extern "C" void cuda_kernel(void *descr[], void *cl_arg); */

+ 3 - 2
examples/pi/pi_kernel.cu

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -102,7 +102,8 @@ extern "C" void cuda_kernel(void *descr[], void *cl_arg)
 	cudaError_t cures;
 
 	unsigned *directions = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
-	unsigned nx = NSHOT_PER_TASK;
+	unsigned long long *nshot_per_task = (unsigned long long *) cl_arg;
+	unsigned nx = *nshot_per_task;
 
 	/* Generate Random numbers */
 	float *random_numbers;

+ 37 - 11
examples/pi/pi_redux.c

@@ -30,7 +30,7 @@
 #include <curand.h>
 #endif
 
-#define NSHOT_PER_TASK	(1024*1024)
+static unsigned long long nshot_per_task = 16*1024*1024ULL;
 
 /* default value */
 static unsigned long ntasks = 1024;
@@ -92,6 +92,12 @@ static void init_rng(void *arg __attribute__((unused)))
 	}
 }
 
+/* The amount of work does not depend on the data size at all :) */
+static size_t size_base(struct starpu_task *task, unsigned nimpl)
+{
+	return nshot_per_task;
+}
+
 static void parse_args(int argc, char **argv)
 {
 	int i;
@@ -103,6 +109,12 @@ static void parse_args(int argc, char **argv)
 			ntasks = strtol(argv[++i], &argptr, 10);
 		}
 
+		if (strcmp(argv[i], "-nshot") == 0)
+		{
+			char *argptr;
+			nshot_per_task = strtol(argv[++i], &argptr, 10);
+		}
+
 		if (strcmp(argv[i], "-noredux") == 0)
 		{
 			use_redux = 0;
@@ -114,7 +126,7 @@ static void parse_args(int argc, char **argv)
 			ntasks_warmup = 8; /* arbitrary number of warmup tasks */
 		}
 
-		if (strcmp(argv[i], "-h") == 0)
+		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0)
 		{
 			fprintf(stderr, "Usage: %s [-ntasks n] [-noredux] [-warmup] [-h]\n", argv[0]);
 			exit(-1);
@@ -139,8 +151,8 @@ static void pi_func_cpu(void *descr[], void *cl_arg __attribute__ ((unused)))
 	unsigned long local_cnt = 0;
 
 	/* Fill the scratchpad with random numbers */
-	int i;
-	for (i = 0; i < NSHOT_PER_TASK; i++)
+	unsigned i;
+	for (i = 0; i < nshot_per_task; i++)
 	{
 		double randx, randy;
 
@@ -176,17 +188,24 @@ static void pi_func_cuda(void *descr[], void *cl_arg __attribute__ ((unused)))
 	/* Fill the scratchpad with random numbers. Note that both x and y
 	 * arrays are in stored the same vector. */
 	float *scratchpad_xy = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
-	res = curandGenerateUniform(curandgens[workerid], scratchpad_xy, 2*NSHOT_PER_TASK);
+	res = curandGenerateUniform(curandgens[workerid], scratchpad_xy, 2*nshot_per_task);
 	STARPU_ASSERT(res == CURAND_STATUS_SUCCESS);
 
 	float *x = &scratchpad_xy[0];
-	float *y = &scratchpad_xy[NSHOT_PER_TASK];
+	float *y = &scratchpad_xy[nshot_per_task];
 
 	unsigned long *shot_cnt = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[1]);
-	pi_redux_cuda_kernel(x, y, NSHOT_PER_TASK, shot_cnt);
+	pi_redux_cuda_kernel(x, y, nshot_per_task, shot_cnt);
 }
 #endif
 
+static struct starpu_perfmodel pi_model =
+{
+	.type = STARPU_HISTORY_BASED,
+	.size_base = size_base,
+	.symbol = "monte_carlo_pi_scratch"
+};
+
 static struct starpu_codelet pi_cl =
 {
 	.cpu_funcs = {pi_func_cpu, NULL},
@@ -195,7 +214,14 @@ static struct starpu_codelet pi_cl =
 #endif
 	.nbuffers = 2,
 	.modes    = {STARPU_SCRATCH, STARPU_RW},
-	.model = NULL
+	.model = &pi_model
+};
+
+static struct starpu_perfmodel pi_model_redux =
+{
+	.type = STARPU_HISTORY_BASED,
+	.size_base = size_base,
+	.symbol = "monte_carlo_pi_scratch_redux"
 };
 
 static struct starpu_codelet pi_cl_redux =
@@ -206,7 +232,7 @@ static struct starpu_codelet pi_cl_redux =
 #endif
 	.nbuffers = 2,
 	.modes    = {STARPU_SCRATCH, STARPU_REDUX},
-	.model = NULL
+	.model = &pi_model_redux
 };
 
 /*
@@ -297,7 +323,7 @@ int main(int argc, char **argv)
 	/* Create a scratchpad data */
 	starpu_data_handle_t xy_scratchpad_handle;
 	starpu_vector_data_register(&xy_scratchpad_handle, -1, (uintptr_t)NULL,
-		2*NSHOT_PER_TASK, sizeof(float));
+		2*nshot_per_task, sizeof(float));
 
 	/* Create a variable that will be used to count the number of shots
 	 * that actually hit the unit circle when shooting randomly in
@@ -349,7 +375,7 @@ int main(int argc, char **argv)
 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
 	/* Total surface : Pi * r^ 2 = Pi*1^2, total square surface : 2^2 = 4,
 	 * probability to impact the disk: pi/4 */
-	unsigned long total = (ntasks + ntasks_warmup)*NSHOT_PER_TASK;
+	unsigned long total = (ntasks + ntasks_warmup)*nshot_per_task;
 	double pi_approx = ((double)shot_cnt*4.0)/total;
 
 	FPRINTF(stderr, "Reductions? %s\n", use_redux?"yes":"no");

+ 1 - 0
examples/scheduler/dummy_sched.c

@@ -16,6 +16,7 @@
  */
 
 #include <starpu.h>
+#include <starpu_scheduler.h>
 
 #define NTASKS	32000
 #define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)

+ 3 - 3
gcc-plugin/examples/vector_scal/vector_scal_cuda.cu

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2012 Institut National de Recherche en Informatique et Automatique
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2010  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -23,7 +23,7 @@
 #include <stdlib.h>
 
 static __global__ void
-vector_mult_cuda (float *val, unsigned n, float factor)
+vector_mult_cuda (unsigned int n, float *val, float factor)
 {
   unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
 
@@ -38,7 +38,7 @@ vector_scal_cuda (size_t size, float vector[], float factor)
   unsigned nblocks = (size + threads_per_block - 1) / threads_per_block;
 
   vector_mult_cuda <<< nblocks, threads_per_block, 0,
-    starpu_cuda_get_local_stream () >>> (vector, size, factor);
+       starpu_cuda_get_local_stream () >>> (size, vector, factor);
 
   cudaStreamSynchronize (starpu_cuda_get_local_stream ());
 }

+ 1 - 1
gcc-plugin/examples/vector_scal/vector_scal_opencl_kernel.cl

@@ -16,7 +16,7 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-__kernel void vector_mult_opencl(__global float* val, unsigned int nx, float factor)
+__kernel void vector_mult_opencl(unsigned int nx, __global float* val, float factor)
 {
         const int i = get_global_id(0);
         if (i < nx) {

+ 0 - 1
include/starpu.h

@@ -55,7 +55,6 @@ typedef UINT_PTR uintptr_t;
 #include <util/starpu_task_list_inline.h>
 #endif
 #include <starpu_task_util.h>
-#include <starpu_scheduler.h>
 #include <starpu_sched_ctx.h>
 #include <starpu_expert.h>
 #include <starpu_rand.h>

+ 116 - 23
include/starpu_sched_ctx.h

@@ -24,73 +24,166 @@ extern "C"
 {
 #endif
 
+/*
+ * MANAGEMENT OF SCHEDULING CONTEXTS
+ */
+
+/* create a context indicating the scheduling policy, the workers it should have and an optional name */
+unsigned starpu_sched_ctx_create(const char *policy_name, int *workerids_ctx, int nworkers_ctx, const char *sched_ctx_name);
+
+/* create a context indicating an approximate interval of resources */
+unsigned starpu_sched_ctx_create_inside_interval(const char *policy_name, const char *sched_name,
+						 int min_ncpus, int max_ncpus, int min_ngpus, int max_ngpus,
+						 unsigned allow_overlap);
+
+/* add workers to a context */
+void starpu_sched_ctx_add_workers(int *workerids_ctx, int nworkers_ctx, unsigned sched_ctx_id);
+
+/* remove workers from a context */
+void starpu_sched_ctx_remove_workers(int *workerids_ctx, int nworkers_ctx, unsigned sched_ctx_id);
+
+/* delete a certain context */
+void starpu_sched_ctx_delete(unsigned sched_ctx_id);
+
+/* indicate which context will inherit the resources of this context when it is deleted */
+void starpu_sched_ctx_set_inheritor(unsigned sched_ctx_id, unsigned inheritor);
+
+/* mutex synchronising several simultaneous modifications of a context */
+starpu_pthread_mutex_t* starpu_sched_ctx_get_changing_ctx_mutex(unsigned sched_ctx_id);
+
+/* indicate that the current thread is submitting only to the current context */
+void starpu_sched_ctx_set_context(unsigned *sched_ctx_id);
+
+/* find out to which context the current thread is submitting */
+unsigned starpu_sched_ctx_get_context(void);
+
+/* stop submitting tasks from the empty context list until the next time the context
+   has time to check the empty context list */
+void starpu_sched_ctx_stop_task_submission(void);
+
+/* tell StarPU that the application has finished submitting to this context, so that
+   the workers can be moved to the inheritor as soon as possible */
+void starpu_sched_ctx_finished_submit(unsigned sched_ctx_id);
+
+
+/*
+ * CONNECTION WITH THE HYPERVISOR
+ */
+
+/* performance counters used by StarPU to tell the hypervisor
+   how the application and the resources are performing */
 struct starpu_sched_ctx_performance_counters
 {
+	/* tell the hypervisor for how long a worker was idle in a certain context */ 
 	void (*notify_idle_cycle)(unsigned sched_ctx_id, int worker, double idle_time);
+	/* tell the hypervisor when a worker stopped being idle in a certain context */
 	void (*notify_idle_end)(unsigned sched_ctx_id, int worker);
+	/* tell the hypervisor when a task was pushed on a worker in a certain context */ 
 	void (*notify_pushed_task)(unsigned sched_ctx_id, int worker);
+	/* tell the hypervisor when a task was popped from a worker in a certain context */
 	void (*notify_poped_task)(unsigned sched_ctx_id, int worker, struct starpu_task *task, size_t data_size, uint32_t footprint);
+	/* tell the hypervisor when a task finished executing in a certain context */
 	void (*notify_post_exec_hook)(unsigned sched_ctx_id, int taskid);
+	/* tell the hypervisor when a task was submitted to a certain context */
 	void (*notify_submitted_job)(struct starpu_task *task, uint32_t footprint);
+	/* tell the hypervisor when a context was deleted */
 	void (*notify_delete_context)(unsigned sched_ctx);
 };
 
 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
+/* give StarPU the pointer to the performance counters */
 void starpu_sched_ctx_set_perf_counters(unsigned sched_ctx_id, struct starpu_sched_ctx_performance_counters *perf_counters);
+/* callback that lets the scheduling policy tell the hypervisor that a task was pushed on a worker */
 void starpu_sched_ctx_call_pushed_task_cb(int workerid, unsigned sched_ctx_id);
 #endif //STARPU_USE_SCHED_CTX_HYPERVISOR
 
-unsigned starpu_sched_ctx_create(const char *policy_name, int *workerids_ctx, int nworkers_ctx, const char *sched_ctx_name);
-
-unsigned starpu_sched_ctx_create_inside_interval(const char *policy_name, const char *sched_name,
-						 int min_ncpus, int max_ncpus, int min_ngpus, int max_ngpus,
-						 unsigned allow_overlap);
-
-void starpu_sched_ctx_delete(unsigned sched_ctx_id);
-
-void starpu_sched_ctx_add_workers(int *workerids_ctx, int nworkers_ctx, unsigned sched_ctx_id);
+/* let the hypervisor inform StarPU that it is initialised */
+void starpu_sched_ctx_notify_hypervisor_exists(void);
 
-void starpu_sched_ctx_remove_workers(int *workerids_ctx, int nworkers_ctx, unsigned sched_ctx_id);
+/* ask StarPU whether it has been informed that the hypervisor is initialised */
+unsigned starpu_sched_ctx_check_if_hypervisor_exists(void);
 
+/*
+ * POLICY DATA
+ */
+/* allow the scheduling policy to have its own data in a context, like a private list of tasks, mutexes, conds, etc. */
 void starpu_sched_ctx_set_policy_data(unsigned sched_ctx_id, void *policy_data);
 
+/* return the scheduling policy private data */
 void* starpu_sched_ctx_get_policy_data(unsigned sched_ctx_id);
 
+
+/*
+ * WORKERS IN CONTEXT
+ */
+/* create a worker collection for a context; the only type available for now is STARPU_WORKER_LIST, which corresponds to a simple list */
 struct starpu_worker_collection* starpu_sched_ctx_create_worker_collection(unsigned sched_ctx_id, int type);
 
+/* free the worker collection when removing the context */
 void starpu_sched_ctx_delete_worker_collection(unsigned sched_ctx_id);
 
+/* return the worker collection */
 struct starpu_worker_collection* starpu_sched_ctx_get_worker_collection(unsigned sched_ctx_id);
 
-starpu_pthread_mutex_t* starpu_sched_ctx_get_changing_ctx_mutex(unsigned sched_ctx_id);
-
-void starpu_sched_ctx_set_context(unsigned *sched_ctx_id);
-
-unsigned starpu_sched_ctx_get_context(void);
-
-void starpu_sched_ctx_notify_hypervisor_exists(void);
-
-unsigned starpu_sched_ctx_check_if_hypervisor_exists(void);
-
+/* return the number of workers in the sched_ctx's collection */
 unsigned starpu_sched_ctx_get_nworkers(unsigned sched_ctx_id);
 
+/* return the number of shared workers in the sched_ctx's collection */
 unsigned starpu_sched_ctx_get_nshared_workers(unsigned sched_ctx_id, unsigned sched_ctx_id2);
 
+/* return 1 if the worker belongs to the context and 0 otherwise */
 unsigned starpu_sched_ctx_contains_worker(int workerid, unsigned sched_ctx_id);
 
+/* check if a worker is shared between several contexts */
 unsigned starpu_sched_ctx_overlapping_ctxs_on_worker(int workerid);
 
+/* manage sharing of resources between contexts: check which ctx has its turn to pop */
 unsigned starpu_sched_ctx_is_ctxs_turn(int workerid, unsigned sched_ctx_id);
 
+/* manage sharing of resources between contexts: by default a round-robin strategy
+   is used, but the user can intervene to tell which ctx has its turn to pop */
 void starpu_sched_ctx_set_turn_to_other_ctx(int workerid, unsigned sched_ctx_id);
 
+/* when time-sharing a resource, indicate how long a worker has been active in
+   the current sched_ctx */
 double starpu_sched_ctx_get_max_time_worker_on_ctx(void);
 
-void starpu_sched_ctx_stop_task_submission(void);
+/*
+ *	Priorities
+ */
 
-void starpu_sched_ctx_set_inheritor(unsigned sched_ctx_id, unsigned inheritor);
+/* get min priority for the scheduler of the global context */
+int starpu_sched_get_min_priority(void);
+
+/* get max priority for the scheduler of the global context */
+int starpu_sched_get_max_priority(void);
+
+/* set min priority for the scheduler of the global context */
+int starpu_sched_set_min_priority(int min_prio);
+
+/* set max priority for the scheduler of the global context */
+int starpu_sched_set_max_priority(int max_prio);
+
+/* get min priority for the scheduler of the scheduling context indicated */
+int starpu_sched_ctx_get_min_priority(unsigned sched_ctx_id);
+
+/* get max priority for the scheduler of the scheduling context indicated */
+int starpu_sched_ctx_get_max_priority(unsigned sched_ctx_id);
+
+/* set min priority for the scheduler of the scheduling context indicated */
+int starpu_sched_ctx_set_min_priority(unsigned sched_ctx_id, int min_prio);
+
+/* set max priority for the scheduler of the scheduling context indicated */
+int starpu_sched_ctx_set_max_priority(unsigned sched_ctx_id, int max_prio);
+
+/* Provided for legacy reasons */
+#define STARPU_MIN_PRIO		(starpu_sched_get_min_priority())
+#define STARPU_MAX_PRIO		(starpu_sched_get_max_priority())
+
+/* By convention, the default priority level should be 0 so that we can
+ * statically allocate tasks with a default priority. */
+#define STARPU_DEFAULT_PRIO	0
 
-void starpu_sched_ctx_finished_submit(unsigned sched_ctx_id);
 
 #ifdef __cplusplus
 }
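
The entry points above are easier to read together. Below is a minimal sketch of their intended use, not code from this commit: it assumes a machine with at least two CPU workers (ids 0 and 1), the "eager" scheduling policy, and a hypothetical codelet my_cl defined elsewhere.

#include <starpu.h>
#include <starpu_sched_ctx.h>

extern struct starpu_codelet my_cl;	/* hypothetical codelet */

static void submit_one(unsigned ctx)
{
	/* tasks submitted by this thread now go to 'ctx' */
	starpu_sched_ctx_set_context(&ctx);

	struct starpu_task *task = starpu_task_create();
	task->cl = &my_cl;
	starpu_task_submit(task);
}

int main(void)
{
	int workers1[] = {0}, workers2[] = {1};

	starpu_init(NULL);

	/* two contexts, each owning one CPU worker */
	unsigned ctx1 = starpu_sched_ctx_create("eager", workers1, 1, "ctx1");
	unsigned ctx2 = starpu_sched_ctx_create("eager", workers2, 1, "ctx2");

	/* ctx2 will inherit the workers of ctx1 when ctx1 goes away */
	starpu_sched_ctx_set_inheritor(ctx1, ctx2);

	submit_one(ctx1);
	submit_one(ctx2);

	/* no more submissions to ctx1: its workers may move to ctx2 */
	starpu_sched_ctx_finished_submit(ctx1);

	starpu_task_wait_for_all();
	starpu_sched_ctx_delete(ctx1);
	starpu_sched_ctx_delete(ctx2);
	starpu_shutdown();
	return 0;
}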

+ 0 - 56
include/starpu_scheduler.h

@@ -20,10 +20,6 @@
 
 #include <starpu.h>
 
-#ifdef STARPU_HAVE_HWLOC
-#include <hwloc.h>
-#endif
-
 #ifdef __cplusplus
 extern "C"
 {
@@ -31,38 +27,6 @@ extern "C"
 
 struct starpu_task;
 
-struct starpu_machine_topology
-{
-	unsigned nworkers;
-
-	unsigned ncombinedworkers;
-
-	unsigned nsched_ctxs;
-#ifdef STARPU_HAVE_HWLOC
-	hwloc_topology_t hwtopology;
-#else
-	/* We maintain ABI compatibility with and without hwloc */
-	void *dummy;
-#endif
-
-	unsigned nhwcpus;
-	unsigned nhwcudagpus;
-	unsigned nhwopenclgpus;
-
-	unsigned ncpus;
-	unsigned ncudagpus;
-	unsigned nopenclgpus;
-
-	/* Where to bind workers ? */
-	unsigned workers_bindid[STARPU_NMAXWORKERS];
-
-	/* Which GPU(s) do we use for CUDA ? */
-	unsigned workers_cuda_gpuid[STARPU_NMAXWORKERS];
-
-	/* Which GPU(s) do we use for OpenCL ? */
-	unsigned workers_opencl_gpuid[STARPU_NMAXWORKERS];
-};
-
 /* This structure contains all the methods that implement a scheduling policy.
  * An application may specify which scheduling strategy in the "sched_policy"
  * field of the starpu_conf structure passed to the starpu_init function. */
@@ -137,24 +101,6 @@ int starpu_push_local_task(int workerid, struct starpu_task *task, int back);
 int starpu_push_task_end(struct starpu_task *task);
 
 /*
- *	Priorities
- */
-
-/* Provided for legacy reasons */
-#define STARPU_MIN_PRIO		(starpu_sched_get_min_priority())
-#define STARPU_MAX_PRIO		(starpu_sched_get_max_priority())
-
-/* By convention, the default priority level should be 0 so that we can
- * statically allocate tasks with a default priority. */
-#define STARPU_DEFAULT_PRIO	0
-
-int starpu_sched_get_min_priority(void);
-int starpu_sched_get_max_priority(void);
-
-void starpu_sched_set_min_priority(int min_prio);
-void starpu_sched_set_max_priority(int max_prio);
-
-/*
  *	Parallel tasks
  */
 
@@ -178,8 +124,6 @@ int starpu_prefetch_task_input_on_node(struct starpu_task *task, unsigned node);
  *	Performance predictions
  */
 
-/* Return the current date in us */
-double starpu_timing_now(void);
 /* Returns the perfmodel footprint for the task */
 uint32_t starpu_task_footprint(struct starpu_perfmodel *model, struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl);
 /* Returns expected task duration in us */

+ 3 - 2
include/starpu_task_util.h

@@ -43,9 +43,10 @@ void starpu_create_sync_task(starpu_tag_t sync_tag, unsigned ndeps, starpu_tag_t
 #define STARPU_EXECUTE_ON_NODE	(1<<9)	/* Used by MPI to define which task is going to execute the codelet */
 #define STARPU_EXECUTE_ON_DATA	(1<<10)	/* Used by MPI to define which task is going to execute the codelet */
 #define STARPU_DATA_ARRAY       (1<<11) /* Array of data handles */
-#define STARPU_TAG       (1<<12) /* Tag */
+#define STARPU_TAG              (1<<12) /* Tag */
 #define STARPU_HYPERVISOR_TAG	(1<<13)	/* Used to tag a task after whose execution we'll execute  a code */
-#define STARPU_FLOPS	(1<<14)	/* Used to specify the number of flops needed to be executed by a task */
+#define STARPU_FLOPS	        (1<<14)	/* Used to specify the number of flops needed to be executed by a task */
+#define STARPU_SCHED_CTX	(1<<15)	/* Used to specify the sched_ctx to which the task will be submitted */
 
 /* Wrapper to create a task. */
 int starpu_insert_task(struct starpu_codelet *cl, ...);
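
The new STARPU_SCHED_CTX flag plugs into the starpu_insert_task() argument list like the other pairs above. A hedged sketch, assuming the STARPU_VALUE and STARPU_RW conventions from the rest of this header, a previously created context ctx, and a hypothetical codelet my_cl:

#include <starpu.h>

extern struct starpu_codelet my_cl;	/* hypothetical codelet */

void submit_to_ctx(starpu_data_handle_t handle, float factor, unsigned ctx)
{
	/* the task is queued in the scheduling context 'ctx' */
	starpu_insert_task(&my_cl,
			   STARPU_RW, handle,
			   STARPU_VALUE, &factor, sizeof(factor),
			   STARPU_SCHED_CTX, ctx,
			   0);
}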

+ 22 - 12
include/starpu_thread.h

@@ -21,7 +21,7 @@
 #ifdef STARPU_SIMGRID
 #include <xbt/synchro_core.h>
 #include <msg/msg.h>
-#else
+#elif !defined(_MSC_VER)
 #include <pthread.h>
 #endif
 
@@ -45,7 +45,7 @@ int starpu_pthread_attr_init(starpu_pthread_attr_t *attr);
 int starpu_pthread_attr_destroy(starpu_pthread_attr_t *attr);
 int starpu_pthread_attr_setdetachstate(starpu_pthread_attr_t *attr, int detachstate);
 
-#else /* STARPU_SIMGRID */
+#elif !defined(_MSC_VER) /* STARPU_SIMGRID */
 
 typedef pthread_t starpu_pthread_t;
 typedef pthread_attr_t starpu_pthread_attr_t;
@@ -56,7 +56,8 @@ typedef pthread_attr_t starpu_pthread_attr_t;
 #define starpu_pthread_attr_destroy pthread_attr_destroy
 #define starpu_pthread_attr_setdetachstate pthread_attr_setdetachstate
 
-#endif /* STARPU_SIMGRID */
+#endif /* STARPU_SIMGRID, _MSC_VER */
+
 /*
  * Encapsulation of the pthread_mutex_* functions.
  */
@@ -73,7 +74,7 @@ int starpu_pthread_mutex_lock(starpu_pthread_mutex_t *mutex);
 int starpu_pthread_mutex_unlock(starpu_pthread_mutex_t *mutex);
 int starpu_pthread_mutex_trylock(starpu_pthread_mutex_t *mutex);
 
-#else /* !STARPU_SIMGRID */
+#elif !defined(_MSC_VER) /* !STARPU_SIMGRID */
 
 typedef pthread_mutex_t starpu_pthread_mutex_t;
 typedef pthread_mutexattr_t starpu_pthread_mutexattr_t;
@@ -85,7 +86,8 @@ typedef pthread_mutexattr_t starpu_pthread_mutexattr_t;
 #define starpu_pthread_mutex_trylock pthread_mutex_trylock
 
 #define STARPU_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER
-#endif /* STARPU_SIMGRID */
+
+#endif /* STARPU_SIMGRID, _MSC_VER */
 
 /*
  * Encapsulation of the pthread_key_* functions.
@@ -98,7 +100,7 @@ int starpu_pthread_key_delete(starpu_pthread_key_t key);
 int starpu_pthread_setspecific(starpu_pthread_key_t key, const void *pointer);
 void *starpu_pthread_getspecific(starpu_pthread_key_t key);
 
-#else /* STARPU_SIMGRID */
+#elif !defined(_MSC_VER) /* STARPU_SIMGRID */
 
 typedef pthread_key_t starpu_pthread_key_t;
 
@@ -107,13 +109,14 @@ typedef pthread_key_t starpu_pthread_key_t;
 #define starpu_pthread_setspecific pthread_setspecific
 #define starpu_pthread_getspecific pthread_getspecific
 
-#endif /* STARPU_SIMGRID */
+#endif /* STARPU_SIMGRID, _MSC_VER */
 
 /*
  * Encapsulation of the pthread_cond_* functions.
  */
 
 #ifdef STARPU_SIMGRID
+
 typedef xbt_cond_t starpu_pthread_cond_t;
 typedef int starpu_pthread_condattr_t;
 #define STARPU_PTHREAD_COND_INITIALIZER NULL
@@ -125,7 +128,8 @@ int starpu_pthread_cond_wait(starpu_pthread_cond_t *cond, starpu_pthread_mutex_t
 int starpu_pthread_cond_timedwait(starpu_pthread_cond_t *cond, starpu_pthread_mutex_t *mutex, const struct timespec *abstime);
 int starpu_pthread_cond_destroy(starpu_pthread_cond_t *cond);
 
-#else /* STARPU_SIMGRID */
+#elif !defined(_MSC_VER) /* STARPU_SIMGRID */
+
 typedef pthread_cond_t starpu_pthread_cond_t;
 typedef pthread_condattr_t starpu_pthread_condattr_t;
 #define STARPU_PTHREAD_COND_INITIALIZER PTHREAD_COND_INITIALIZER
@@ -137,14 +141,14 @@ typedef pthread_condattr_t starpu_pthread_condattr_t;
 #define starpu_pthread_cond_timedwait pthread_cond_timedwait
 #define starpu_pthread_cond_destroy pthread_cond_destroy
 
-#endif /* STARPU_SIMGRID */
-
+#endif /* STARPU_SIMGRID, _MSC_VER */
 
 /*
  * Encapsulation of the pthread_rwlock_* functions.
  */
 
 #ifdef STARPU_SIMGRID
+
 typedef xbt_mutex_t starpu_pthread_rwlock_t;
 typedef int starpu_pthread_rwlockattr_t;
 
@@ -154,7 +158,7 @@ int starpu_pthread_rwlock_rdlock(starpu_pthread_rwlock_t *rwlock);
 int starpu_pthread_rwlock_wrlock(starpu_pthread_rwlock_t *rwlock);
 int starpu_pthread_rwlock_unlock(starpu_pthread_rwlock_t *rwlock);
 
-#else /* STARPU_SIMGRID */
+#elif !defined(_MSC_VER) /* STARPU_SIMGRID */
 
 typedef pthread_rwlock_t starpu_pthread_rwlock_t;
 typedef pthread_rwlockattr_t starpu_pthread_rwlockattr_t;
@@ -165,7 +169,13 @@ typedef pthread_rwlockattr_t starpu_pthread_rwlockattr_t;
 #define starpu_pthread_rwlock_wrlock pthread_rwlock_wrlock
 #define starpu_pthread_rwlock_unlock pthread_rwlock_unlock
 
-#endif /* STARPU_SIMGRID */
+#endif /* STARPU_SIMGRID, _MSC_VER */
+
+#ifdef _MSC_VER
+typedef void* starpu_pthread_rwlock_t;
+typedef void* starpu_pthread_mutex_t;
+typedef void* starpu_pthread_cond_t;
+#endif /* _MSC_VER */
 
 #ifdef __cplusplus
 }
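
With the _MSC_VER branches added above, portable code can use a single spelling of the synchronisation primitives whatever the backend (pthreads, SimGrid, or the MSVC stubs). A minimal sketch, assuming a build where the static initializer is functional:

#include <starpu.h>

static starpu_pthread_mutex_t mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
static unsigned counter;

void increment(void)
{
	/* compiles unchanged against pthreads or the SimGrid backend */
	starpu_pthread_mutex_lock(&mutex);
	counter++;
	starpu_pthread_mutex_unlock(&mutex);
}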

+ 9 - 1
include/starpu_util.h

@@ -82,7 +82,12 @@ extern "C"
 #endif
 
 #define STARPU_ABORT() do {                                          \
-	fprintf(stderr, "[starpu][abort] %s:%d %s\n", __FILE__, __LINE__, __starpu_func__); \
+	fprintf(stderr, "[starpu][abort][%s@%s:%d]\n", __starpu_func__, __FILE__, __LINE__); \
+	abort();                                                     \
+} while(0)
+
+#define STARPU_ABORT_MSG(msg, ...) do {					\
+	fprintf(stderr, "[starpu][abort][%s@%s:%d] " msg "\n", __starpu_func__, __FILE__, __LINE__, ## __VA_ARGS__); \
 	abort();                                                     \
 } while(0)
 
@@ -263,6 +268,9 @@ void starpu_execute_on_specific_workers(void (*func)(void*), void * arg, unsigne
  * copied, and it is given the callback_arg pointer as argument.*/
 int starpu_data_cpy(starpu_data_handle_t dst_handle, starpu_data_handle_t src_handle, int asynchronous, void (*callback_func)(void*), void *callback_arg);
 
+/* Return the current date in us */
+double starpu_timing_now(void);
+
 #ifdef __cplusplus
 }
 #endif
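
STARPU_ABORT_MSG() behaves like STARPU_ABORT() but appends a printf-style message to the report, and starpu_timing_now() (moved here from starpu_scheduler.h) returns the current date in us. A small illustrative sketch, not taken from the tree:

#include <starpu.h>
#include <stdlib.h>

void *xmalloc(size_t size)
{
	double start = starpu_timing_now();
	void *ptr = malloc(size);
	if (!ptr)
		/* prints "[starpu][abort][xmalloc@file:line] ..." and aborts */
		STARPU_ABORT_MSG("could not allocate %lu bytes (after %.0f us)",
				 (unsigned long) size, starpu_timing_now() - start);
	return ptr;
}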

+ 36 - 0
include/starpu_worker.h

@@ -21,6 +21,11 @@
 #include <stdlib.h>
 #include <starpu_config.h>
 
+#ifdef STARPU_HAVE_HWLOC
+#include <hwloc.h>
+#endif
+
+
 #ifdef __cplusplus
 extern "C"
 {
@@ -39,6 +44,37 @@ struct starpu_sched_ctx_iterator
 	int cursor;
 };
 
+struct starpu_machine_topology
+{
+	unsigned nworkers;
+
+	unsigned ncombinedworkers;
+
+	unsigned nsched_ctxs;
+#ifdef STARPU_HAVE_HWLOC
+	hwloc_topology_t hwtopology;
+#else
+	/* We maintain ABI compatibility with and without hwloc */
+	void *dummy;
+#endif
+
+	unsigned nhwcpus;
+	unsigned nhwcudagpus;
+	unsigned nhwopenclgpus;
+
+	unsigned ncpus;
+	unsigned ncudagpus;
+	unsigned nopenclgpus;
+
+	/* Where to bind workers ? */
+	unsigned workers_bindid[STARPU_NMAXWORKERS];
+
+	/* Which GPU(s) do we use for CUDA ? */
+	unsigned workers_cuda_gpuid[STARPU_NMAXWORKERS];
+
+	/* Which GPU(s) do we use for OpenCL ? */
+	unsigned workers_opencl_gpuid[STARPU_NMAXWORKERS];
+};
 
 /* generic structure used by the scheduling contexts to iterate the workers */
 struct starpu_worker_collection

+ 2 - 2
m4/gcc.m4

@@ -1,6 +1,6 @@
 dnl -*- Autoconf -*-
 dnl
-dnl Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
+dnl Copyright (C) 2011, 2012, 2013 Inria
 dnl
 dnl StarPU is free software; you can redistribute it and/or modify
 dnl it under the terms of the GNU Lesser General Public License as published by
@@ -188,7 +188,7 @@ AC_DEFUN([STARPU_GCC_PLUGIN_SUPPORT], [
 
 
     AC_DEFINE_UNQUOTED([STARPU_INCLUDE_DIR],
-      ["`eval "echo $includedir"`/starpu/$STARPU_EFFECTIVE_VERSION"],
+      ["`test "x$prefix" = xNONE && prefix=$ac_default_prefix ; eval "echo $includedir"`/starpu/$STARPU_EFFECTIVE_VERSION"],
       [Define to the directory where StarPU's headers are installed.])
 
     dnl Now, `gcc' or `g++'?

+ 1 - 0
mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c

@@ -22,6 +22,7 @@
 #include "mpi_cholesky_models.h"
 #include "mpi_cholesky_codelets.h"
 #include "mpi_cholesky_kernels.h"
+#include <sys/time.h>
 
 /*
  *	Create the codelets

+ 1 - 2
mpi/include/starpu_mpi.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009-2012  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -36,7 +36,6 @@ int starpu_mpi_send(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI
 int starpu_mpi_recv(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, MPI_Status *status);
 int starpu_mpi_isend_detached(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg);
 int starpu_mpi_irecv_detached(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg);
-int starpu_mpi_irecv_probe_detached(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg);
 int starpu_mpi_wait(starpu_mpi_req *req, MPI_Status *status);
 int starpu_mpi_test(starpu_mpi_req *req, int *flag, MPI_Status *status);
 int starpu_mpi_barrier(MPI_Comm comm);
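
The detached calls kept above post a request and run an optional callback on completion, leaving no request object to wait on. A hedged sketch of a one-way transfer built on them (the ranks, the tag value and the 'done' flag are illustrative):

#include <starpu_mpi.h>

static void sent_cb(void *arg)
{
	/* invoked by StarPU-MPI once the send has completed */
	*(int *) arg = 1;
}

void transfer(starpu_data_handle_t handle, int rank)
{
	static int done = 0;

	if (rank == 0)
		starpu_mpi_isend_detached(handle, 1, 42, MPI_COMM_WORLD,
					  sent_cb, &done);
	else if (rank == 1)
		starpu_mpi_irecv_detached(handle, 0, 42, MPI_COMM_WORLD,
					  NULL, NULL);

	starpu_mpi_barrier(MPI_COMM_WORLD);
}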

+ 0 - 4
mpi/tests/Makefile.am

@@ -83,7 +83,6 @@ starpu_mpi_TESTS =				\
 	mpi_irecv				\
 	mpi_isend_detached			\
 	mpi_irecv_detached			\
-	mpi_probe				\
 	mpi_detached_tag			\
 	ring					\
 	ring_async				\
@@ -108,7 +107,6 @@ noinst_PROGRAMS =				\
 	mpi_irecv				\
 	mpi_isend_detached			\
 	mpi_irecv_detached			\
-	mpi_probe				\
 	mpi_detached_tag			\
 	ring					\
 	ring_async				\
@@ -134,8 +132,6 @@ mpi_isend_detached_LDADD =			\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 mpi_irecv_detached_LDADD =			\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
-mpi_probe_LDADD =			\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 mpi_detached_tag_LDADD =				\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 pingpong_LDADD =					\

+ 5 - 2
sched_ctx_hypervisor/Makefile.am

@@ -17,8 +17,11 @@ SUBDIRS = src examples
 
 versincludedir = $(includedir)/starpu/$(STARPU_EFFECTIVE_VERSION)
 
-versinclude_HEADERS = include/sched_ctx_hypervisor.h
-
+versinclude_HEADERS = 	include/sc_hypervisor.h			\
+			include/sc_hypervisor_config.h 		\
+			include/sc_hypervisor_monitoring.h 	\
+			include/sc_hypervisor_policy.h 		\
+			include/sc_hypervisor_lp.h
 showcheck:
 	for i in $(SUBDIRS) ; do \
 		make -C $$i showcheck ; \

+ 7 - 6
sched_ctx_hypervisor/examples/Makefile.am

@@ -14,14 +14,15 @@
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
 AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS)
-LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@ $(top_builddir)/sched_ctx_hypervisor/src/libsched_ctx_hypervisor.la
-AM_CPPFLAGS = -I$(top_srcdir)/include -I$(top_srcdir)/examples -I$(top_builddir)/include -I$(top_srcdir)/sched_ctx_hypervisor/include -I$(top_srcdir)/sched_ctx_hypervisor/examples
+LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@ $(top_builddir)/sc_hypervisor/src/libsc_hypervisor.la
+AM_CPPFLAGS = -I$(top_srcdir)/include -I$(top_srcdir)/examples -I$(top_builddir)/include -I$(top_srcdir)/sc_hypervisor/include -I$(top_srcdir)/sc_hypervisor/examples
 AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(STARPU_GLPK_LDFLAGS)
 
 if !NO_BLAS_LIB
 noinst_PROGRAMS =				\
-	cholesky/cholesky_implicit  \
-	app_driven_test/app_driven_test
+	cholesky/cholesky_implicit  		\
+	app_driven_test/app_driven_test		\
+	lp_test/lp_test
 
 noinst_HEADERS = 				\
 	cholesky/cholesky.h			\
@@ -38,14 +39,14 @@ cholesky_cholesky_implicit_SOURCES =		\
 	$(top_srcdir)/examples/common/blas.c
 
 cholesky_cholesky_implicit_LDADD =		\
-	$(top_builddir)/sched_ctx_hypervisor/src/libsched_ctx_hypervisor.la \
+	$(top_builddir)/sc_hypervisor/src/libsc_hypervisor.la \
 	$(STARPU_BLAS_LDFLAGS)
 
 app_driven_test_app_driven_test_SOURCES =		\
 	app_driven_test/app_driven_test.c
 
 app_driven_test_app_driven_test_LDADD =		\
-	$(top_builddir)/sched_ctx_hypervisor/src/libsched_ctx_hypervisor.la
+	$(top_builddir)/sc_hypervisor/src/libsc_hypervisor.la
 
 endif
 

+ 67 - 27
sched_ctx_hypervisor/examples/app_driven_test/app_driven_test.c

@@ -17,46 +17,58 @@
 #include <stdio.h>
 #include <stdint.h>
 #include <starpu.h>
-#include <sched_ctx_hypervisor.h>
+#include <sc_hypervisor.h>
 
+#define NTASKS 1000
+#define NINCR 10
 #define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
 
-/* Every implementation of a codelet must have this prototype, the first
- * argument (buffers) describes the buffers/streams that are managed by the
- * DSM; the second arguments references read-only data that is passed as an
- * argument of the codelet (task->cl_arg). Here, "buffers" is unused as there
- * are no data input/output managed by the DSM (cl.nbuffers = 0) */
 struct params
 {
 	unsigned sched_ctx;
-    int task_tag;
+	int task_tag;
 };
 
+unsigned val[2];
+pthread_mutex_t mut[2];
+
+/* Every implementation of a codelet must have this prototype: the first
+ * argument (buffers) describes the buffers/streams that are managed by the
+ * DSM; the second argument references read-only data that is passed as an
+ * argument of the codelet (task->cl_arg). Here, "buffers" is unused as there
+ * are no data input/output managed by the DSM (cl.nbuffers = 0) */
+
 void cpu_func(void *buffers[], void *cl_arg)
 {
 	struct params *params = (struct params *) cl_arg;
 
 	int i;
-	for(i = 0; i < 1000; i++);
-	FPRINTF(stdout, "Hello world sched_ctx = %d task_tag = %d \n", params->sched_ctx, params->task_tag);
+	for(i = 0; i < NINCR; i++)
+	{
+		pthread_mutex_lock(&mut[params->sched_ctx - 1]);
+		val[params->sched_ctx - 1]++;
+		pthread_mutex_unlock(&mut[params->sched_ctx - 1]);
+	}
+	if(params->task_tag != 0)
+		FPRINTF(stdout, "Task with tag %d executed in ctx = %d %d counter_tests\n", params->task_tag, params->sched_ctx, val[params->sched_ctx - 1]);
 }
 
-struct starpu_codelet cl = {};
+struct starpu_codelet cl = {0};
 
+/* the management of the tags is done by the user,
+   who must take care that the tags are unique */
 int tag = 1;
-void* start_thread(void *arg)
+void* submit_tasks_thread(void *arg)
 {
 	unsigned sched_ctx = *((unsigned*)arg);
 	starpu_sched_ctx_set_context(&sched_ctx);
 
-	struct starpu_task *task[10];
-	struct params params[10];
+	struct starpu_task *task[NTASKS];
+	struct params params[NTASKS];
 	int i;
-	for(i = 0; i < 10; i++)
+	for(i = 0; i < NTASKS; i++)
 	{
-		int j;
-		for(j = 0; j < 1000; j++);
 		task[i] = starpu_task_create();
-
+//		usleep(5000);
 		cl.cpu_funcs[0] = cpu_func;
 		cl.nbuffers = 0;
 
@@ -64,14 +76,19 @@ void* start_thread(void *arg)
 
 		if(sched_ctx == 1 && i == 5)
 		{
+			/* tag the tasks whose execution will start the resizing process */
 			task[i]->hypervisor_tag = tag;
-			sched_ctx_hypervisor_ioctl(sched_ctx,
+			/* indicate the settings the context should have
+			   when the resizing is done */
+			sc_hypervisor_ioctl(sched_ctx,
 						   HYPERVISOR_TIME_TO_APPLY, tag,
 						   HYPERVISOR_MIN_WORKERS, 2,
 						   HYPERVISOR_MAX_WORKERS, 12,
 						   HYPERVISOR_NULL);
 			printf("require resize for sched_ctx %d at tag %d\n", sched_ctx, tag);
-			sched_ctx_hypervisor_resize(sched_ctx, tag);
+			/* specify that the contexts should be resized when the task
+			   carrying this particular tag finishes executing */
+			sc_hypervisor_resize(sched_ctx, tag);
 		}
 
 		params[i].sched_ctx = sched_ctx;
@@ -84,6 +101,7 @@ void* start_thread(void *arg)
 	}
 
 	starpu_task_wait_for_all();
+	return NULL;
 }
 
 int main()
@@ -104,27 +122,49 @@ int main()
 	for(i = 0; i < nres2; i++)
 		ressources2[i] = nres1+i;
 
-	unsigned sched_ctx1 = starpu_sched_ctx_create("heft", ressources1, nres1, "sched_ctx1");
-	unsigned sched_ctx2 = starpu_sched_ctx_create("heft", ressources2, nres2, "sched_ctx2");
+	/* create contexts */
+	unsigned sched_ctx1 = starpu_sched_ctx_create("dmda", ressources1, nres1, "sched_ctx1");
+	unsigned sched_ctx2 = starpu_sched_ctx_create("dmda", ressources2, nres2, "sched_ctx2");
 
-	struct sched_ctx_hypervisor_policy policy;
+	/* initialize the hypervisor */
+	struct sc_hypervisor_policy policy;
 	policy.custom = 0;
+	/* indicate which strategy to use: here we use app_driven, which lets
+	   the user resize the ctxs dynamically at chosen moments of the
+	   execution of the application */
 	policy.name = "app_driven";
-	void *perf_counters = sched_ctx_hypervisor_init(&policy);
+	void *perf_counters = sc_hypervisor_init(&policy);
 
+	/* let StarPU know which performance counters it should use to inform
+	   the hypervisor how the application and the resources are behaving */
 	starpu_sched_ctx_set_perf_counters(sched_ctx1, (struct starpu_sched_ctx_performance_counters*)perf_counters);
 	starpu_sched_ctx_set_perf_counters(sched_ctx2, (struct starpu_sched_ctx_performance_counters*)perf_counters);
-	sched_ctx_hypervisor_register_ctx(sched_ctx1, 0.0);
-	sched_ctx_hypervisor_register_ctx(sched_ctx2, 0.0);
+
+	/* register the contexts that should be managed by the hypervisor
+	   and indicate an approximate amount of workload if known;
+	   here it is unknown, so we pass 0 */
+	sc_hypervisor_register_ctx(sched_ctx1, 0.0);
+	sc_hypervisor_register_ctx(sched_ctx2, 0.0);
 
 	starpu_pthread_t tid[2];
 
-	starpu_pthread_create(&tid[0], NULL, start_thread, (void*)&sched_ctx1);
-	starpu_pthread_create(&tid[1], NULL, start_thread, (void*)&sched_ctx2);
+	val[0] = 0;
+	val[1] = 0;
+	pthread_mutex_init(&mut[0], NULL);
+	pthread_mutex_init(&mut[1], NULL);
+
+	/* we create two threads to simulate simultaneous submission of tasks */
+	starpu_pthread_create(&tid[0], NULL, submit_tasks_thread, (void*)&sched_ctx1);
+	starpu_pthread_create(&tid[1], NULL, submit_tasks_thread, (void*)&sched_ctx2);
 
 	starpu_pthread_join(tid[0], NULL);
 	starpu_pthread_join(tid[1], NULL);
 
+	/* free starpu and hypervisor data */
 	starpu_shutdown();
-	sched_ctx_hypervisor_shutdown();
+	sc_hypervisor_shutdown();
+
+	FPRINTF(stdout, "ctx = %d executed %d counter_tests out of %d \n", sched_ctx1, val[0], NTASKS*NINCR);
+	FPRINTF(stdout, "ctx = %d executed %d counter_tests out of %d \n", sched_ctx2, val[1], NTASKS*NINCR);
+	return 0;
 }
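The app-driven protocol exercised above boils down to three calls on the task that triggers the resize. A condensed sketch under the same API; `request_resize_at` is a hypothetical helper and the worker bounds are example values:

#include <starpu.h>
#include <sc_hypervisor.h>

/* hypothetical helper: ask for a resize to happen when `task` completes */
static void request_resize_at(unsigned ctx, int tag, struct starpu_task *task)
{
	task->hypervisor_tag = tag;                    /* mark the triggering task */
	sc_hypervisor_ioctl(ctx,
			    HYPERVISOR_TIME_TO_APPLY, tag, /* apply at this tag */
			    HYPERVISOR_MIN_WORKERS, 2,     /* illustrative bounds */
			    HYPERVISOR_MAX_WORKERS, 12,
			    HYPERVISOR_NULL);
	sc_hypervisor_resize(ctx, tag);                /* fires once the task finishes */
}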

sched_ctx_hypervisor/examples/cholesky/cholesky.h → sc_hypervisor/examples/cholesky/cholesky.h


sched_ctx_hypervisor/examples/cholesky/cholesky_grain_tag.c → sc_hypervisor/examples/cholesky/cholesky_grain_tag.c


sched_ctx_hypervisor/examples/cholesky/cholesky_implicit.c → sc_hypervisor/examples/cholesky/cholesky_implicit.c


sched_ctx_hypervisor/examples/cholesky/cholesky_kernels.c → sc_hypervisor/examples/cholesky/cholesky_kernels.c


sched_ctx_hypervisor/examples/cholesky/cholesky_models.c → sc_hypervisor/examples/cholesky/cholesky_models.c


sched_ctx_hypervisor/examples/cholesky/cholesky_tag.c → sc_hypervisor/examples/cholesky/cholesky_tag.c


sched_ctx_hypervisor/examples/cholesky/cholesky_tile_tag.c → sc_hypervisor/examples/cholesky/cholesky_tile_tag.c


+ 134 - 0
sc_hypervisor/examples/lp_test/lp_test.c

@@ -0,0 +1,134 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2012  INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <starpu.h>
+#include <sc_hypervisor.h>
+
+#define NTASKS 1000
+#define NINCR 10
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
+
+unsigned val[2];
+pthread_mutex_t mut[2];
+
+/* Every implementation of a codelet must have this prototype: the first
+ * argument (buffers) describes the buffers/streams that are managed by the
+ * DSM; the second argument references read-only data that is passed as an
+ * argument of the codelet (task->cl_arg). Here, "buffers" is unused as there
+ * are no data input/output managed by the DSM (cl.nbuffers = 0) */
+
+void cpu_func(void *buffers[], void *cl_arg)
+{
+	unsigned sched_ctx = *((unsigned *) cl_arg);
+
+	int i;
+	for(i = 0; i < NINCR; i++)
+	{
+		pthread_mutex_lock(&mut[sched_ctx - 1]);
+		val[sched_ctx - 1]++;
+		pthread_mutex_unlock(&mut[sched_ctx - 1]);
+	}
+}
+
+struct starpu_codelet cl = {0};
+
+void* submit_tasks_thread(void *arg)
+{
+	unsigned sched_ctx = *((unsigned*)arg);
+	starpu_sched_ctx_set_context(&sched_ctx);
+
+	struct starpu_task *task[NTASKS];
+	int i;
+	for(i = 0; i < NTASKS; i++)
+	{
+		task[i] = starpu_task_create();
+		cl.cpu_funcs[0] = cpu_func;
+		cl.nbuffers = 0;
+
+		task[i]->cl = &cl;
+
+		task[i]->cl_arg = &sched_ctx;
+		task[i]->cl_arg_size = sizeof(unsigned);
+
+		task[i]->flops = NINCR*1000000000.0;
+		starpu_task_submit(task[i]);
+	}
+
+	starpu_task_wait_for_all();
+	return NULL;
+}
+
+int main()
+{
+	int ret = starpu_init(NULL);
+
+	if (ret == -ENODEV)
+		return 77;
+
+
+	/* create contexts */
+	unsigned sched_ctx1 = starpu_sched_ctx_create("dmda", NULL, 0, "sched_ctx1");
+	unsigned sched_ctx2 = starpu_sched_ctx_create("dmda", NULL, 0, "sched_ctx2");
+
+	/* initialize the hypervisor */
+	struct sc_hypervisor_policy policy;
+	policy.custom = 0;
+	/* indicate which strategy to use: here we use feft_lp, which solves a
+	   linear program to decide how many workers each ctx should get */
+	policy.name = "feft_lp";
+	void *perf_counters = sc_hypervisor_init(&policy);
+
+	/* let StarPU know which performance counters it should use to inform
+	   the hypervisor how the application and the resources are behaving */
+	starpu_sched_ctx_set_perf_counters(sched_ctx1, (struct starpu_sched_ctx_performance_counters*)perf_counters);
+	starpu_sched_ctx_set_perf_counters(sched_ctx2, (struct starpu_sched_ctx_performance_counters*)perf_counters);
+
+	double flops1 = NTASKS*NINCR*1000000000.0;
+	double flops2 = NTASKS*NINCR*1000000000.0;
+	/* register the contexts that should be managed by the hypervisor
+	   and indicate the amount of workload; here the total number of
+	   flops to be executed is known */
+	sc_hypervisor_register_ctx(sched_ctx1, flops1);
+	sc_hypervisor_register_ctx(sched_ctx2, flops2);
+	/* the lp strategy can size the contexts because we know the total
+	   number of flops to be executed */
+	sc_hypervisor_size_ctxs(NULL, -1, NULL, -1);
+
+	starpu_pthread_t tid[2];
+
+	val[0] = 0;
+	val[1] = 0;
+	pthread_mutex_init(&mut[0], NULL);
+	pthread_mutex_init(&mut[1], NULL);
+
+	/* we create two threads to simulate simultaneous submission of tasks */
+	starpu_pthread_create(&tid[0], NULL, submit_tasks_thread, (void*)&sched_ctx1);
+	starpu_pthread_create(&tid[1], NULL, submit_tasks_thread, (void*)&sched_ctx2);
+
+	starpu_pthread_join(tid[0], NULL);
+	starpu_pthread_join(tid[1], NULL);
+
+	/* free starpu and hypervisor data */
+	starpu_shutdown();
+	sc_hypervisor_shutdown();
+
+	FPRINTF(stdout, "ctx = %d executed %d counter_tests out of %d \n", sched_ctx1, val[0], NTASKS*NINCR);
+	FPRINTF(stdout, "ctx = %d executed %d counter_tests out of %d \n", sched_ctx2, val[1], NTASKS*NINCR);
+	return 0;
+}
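Unlike the app-driven test, this one can size the contexts up front because the flops hints are consistent: each task declares NINCR*1e9 flops and each context registers NTASKS times that. A standalone sketch of the arithmetic (hypothetical program, same constants):

#include <stdio.h>

#define NTASKS 1000
#define NINCR  10

int main(void)
{
	double task_flops = NINCR * 1000000000.0; /* value set on each task[i]->flops */
	double ctx_flops  = NTASKS * task_flops;  /* hint passed to sc_hypervisor_register_ctx */
	printf("per task: %g flops, per context: %g flops\n", task_flops, ctx_flops);
	return 0;
}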

+ 27 - 27
sched_ctx_hypervisor/examples/sched_ctx_utils/sched_ctx_utils.c

@@ -16,7 +16,7 @@
 
 #include "sched_ctx_utils.h"
 #include <starpu.h>
-#include "sched_ctx_hypervisor.h"
+#include "sc_hypervisor.h"
 #define NSAMPLES 3
 
 unsigned size1;
@@ -106,14 +106,14 @@ void* start_bench(void *val)
 	/* { */
 	/* 	starpu_pthread_mutex_lock(&mut); */
 	/* 	if(first){ */
-	/* 		sched_ctx_hypervisor_unregiser_ctx(p->ctx); */
+	/* 		sc_hypervisor_unregiser_ctx(p->ctx); */
 	/* 		starpu_sched_ctx_delete(p->ctx, p->the_other_ctx); */
 	/* 	} */
 
 	/* 	first = 0; */
 	/* 	starpu_pthread_mutex_unlock(&mut); */
 	/* } */
-	sched_ctx_hypervisor_stop_resize(p->the_other_ctx);
+	sc_hypervisor_stop_resize(p->the_other_ctx);
 	rv[p->id].flops /= NSAMPLES;
 	rv[p->id].avg_timing /= NSAMPLES;
 }
@@ -238,10 +238,10 @@ void start_2ndbench(void (*bench)(float*, unsigned, unsigned))
 
 void construct_contexts(void (*bench)(float*, unsigned, unsigned))
 {
-	struct sched_ctx_hypervisor_policy policy;
+	struct sc_hypervisor_policy policy;
 	policy.custom = 0;
 	policy.name = "idle";
-	struct starpu_sched_ctx_performance_counters *perf_counters = sched_ctx_hypervisor_init(&policy);
+	struct starpu_sched_ctx_performance_counters *perf_counters = sc_hypervisor_init(&policy);
 	int nworkers1 = cpu1 + gpu + gpu1;
 	int nworkers2 = cpu2 + gpu + gpu2;
 	unsigned n_all_gpus = gpu + gpu1 + gpu2;
@@ -270,9 +270,9 @@ void construct_contexts(void (*bench)(float*, unsigned, unsigned))
 	starpu_sched_ctx_set_perf_counters(p1.ctx, perf_counters);
 	p2.the_other_ctx = (int)p1.ctx;
 	p1.nworkers = nworkers1;
-	sched_ctx_hypervisor_register_ctx(p1.ctx, 0.0);
+	sc_hypervisor_register_ctx(p1.ctx, 0.0);
 
-	/* sched_ctx_hypervisor_ioctl(p1.ctx, */
+	/* sc_hypervisor_ioctl(p1.ctx, */
 	/* 			   HYPERVISOR_MAX_IDLE, p1.workers, p1.nworkers, 5000.0, */
 	/* 			   HYPERVISOR_MAX_IDLE, p1.workers, gpu+gpu1, 100000.0, */
 	/* 			   HYPERVISOR_EMPTY_CTX_MAX_IDLE, p1.workers, p1.nworkers, 500000.0, */
@@ -283,7 +283,7 @@ void construct_contexts(void (*bench)(float*, unsigned, unsigned))
 	/* 			   HYPERVISOR_MAX_WORKERS, 12, */
 	/* 			   NULL); */
 
-	sched_ctx_hypervisor_ioctl(p1.ctx,
+	sc_hypervisor_ioctl(p1.ctx,
 				   HYPERVISOR_GRANULARITY, 2,
 				   HYPERVISOR_MIN_TASKS, 1000,
 				   HYPERVISOR_MIN_WORKERS, 6,
@@ -306,9 +306,9 @@ void construct_contexts(void (*bench)(float*, unsigned, unsigned))
 	starpu_sched_ctx_set_perf_counters(p2.ctx, perf_counters);
 	p1.the_other_ctx = (int)p2.ctx;
 	p2.nworkers = 0;
-	sched_ctx_hypervisor_register_ctx(p2.ctx, 0.0);
+	sc_hypervisor_register_ctx(p2.ctx, 0.0);
 
-	/* sched_ctx_hypervisor_ioctl(p2.ctx, */
+	/* sc_hypervisor_ioctl(p2.ctx, */
 	/* 			   HYPERVISOR_MAX_IDLE, p2.workers, p2.nworkers, 2000.0, */
 	/* 			   HYPERVISOR_MAX_IDLE, p2.workers, gpu+gpu2, 5000.0, */
 	/* 			   HYPERVISOR_EMPTY_CTX_MAX_IDLE, p1.workers, p1.nworkers, 500000.0, */
@@ -319,7 +319,7 @@ void construct_contexts(void (*bench)(float*, unsigned, unsigned))
 	/* 			   HYPERVISOR_MAX_WORKERS, 8, */
 	/* 			   NULL); */
 
-	sched_ctx_hypervisor_ioctl(p2.ctx,
+	sc_hypervisor_ioctl(p2.ctx,
 				   HYPERVISOR_GRANULARITY, 2,
 				   HYPERVISOR_MIN_TASKS, 500,
 				   HYPERVISOR_MIN_WORKERS, 0,
@@ -337,36 +337,36 @@ void set_hypervisor_conf(int event, int task_tag)
 /* 		{ */
 /* 			if(it < 2) */
 /* 			{ */
-/* 				sched_ctx_hypervisor_ioctl(p2.ctx, */
+/* 				sc_hypervisor_ioctl(p2.ctx, */
 /* 							   HYPERVISOR_MIN_WORKERS, 2, */
 /* 							   HYPERVISOR_MAX_WORKERS, 4, */
 /* 							   HYPERVISOR_TIME_TO_APPLY, task_tag, */
 /* 							   NULL); */
 
 /* 				printf("%d: set max %d for tag %d\n", p2.ctx, 4, task_tag); */
-/* 				sched_ctx_hypervisor_ioctl(p1.ctx, */
+/* 				sc_hypervisor_ioctl(p1.ctx, */
 /* 							   HYPERVISOR_MIN_WORKERS, 6, */
 /* 							   HYPERVISOR_MAX_WORKERS, 8, */
 /* 							   HYPERVISOR_TIME_TO_APPLY, task_tag, */
 /* 							   NULL); */
 /* 				printf("%d: set max %d for tag %d\n", p1.ctx, 8, task_tag); */
-/* 				sched_ctx_hypervisor_resize(p1.ctx, task_tag); */
+/* 				sc_hypervisor_resize(p1.ctx, task_tag); */
 /* 			} */
 /* 			if(it == 2) */
 /* 			{ */
-/* 				sched_ctx_hypervisor_ioctl(p2.ctx, */
+/* 				sc_hypervisor_ioctl(p2.ctx, */
 /* 							   HYPERVISOR_MIN_WORKERS, 12, */
 /* 							   HYPERVISOR_MAX_WORKERS, 12, */
 /* 							   HYPERVISOR_TIME_TO_APPLY, task_tag, */
 /* 							   NULL); */
 /* 				printf("%d: set max %d for tag %d\n", p2.ctx, 12, task_tag); */
-/* 				sched_ctx_hypervisor_ioctl(p1.ctx, */
+/* 				sc_hypervisor_ioctl(p1.ctx, */
 /* 							   HYPERVISOR_MIN_WORKERS, 0, */
 /* 							   HYPERVISOR_MAX_WORKERS, 0, */
 /* 							   HYPERVISOR_TIME_TO_APPLY, task_tag, */
 /* 							   NULL); */
 /* 				printf("%d: set max %d for tag %d\n", p1.ctx, 0, task_tag); */
-/* 				sched_ctx_hypervisor_resize(p1.ctx, task_tag); */
+/* 				sc_hypervisor_resize(p1.ctx, task_tag); */
 /* 			} */
 /* 			it++; */
 
@@ -378,19 +378,19 @@ void set_hypervisor_conf(int event, int task_tag)
 /* 		{ */
 /* 			if(it2 < 3) */
 /* 			{ */
-/* 				sched_ctx_hypervisor_ioctl(p1.ctx, */
+/* 				sc_hypervisor_ioctl(p1.ctx, */
 /* 							   HYPERVISOR_MIN_WORKERS, 6, */
 /* 							   HYPERVISOR_MAX_WORKERS, 12, */
 /* 							   HYPERVISOR_TIME_TO_APPLY, task_tag, */
 /* 							   NULL); */
 /* 				printf("%d: set max %d for tag %d\n", p1.ctx, 12, task_tag); */
-/* 				sched_ctx_hypervisor_ioctl(p2.ctx, */
+/* 				sc_hypervisor_ioctl(p2.ctx, */
 /* 							   HYPERVISOR_MIN_WORKERS, 0, */
 /* 							   HYPERVISOR_MAX_WORKERS, 0, */
 /* 							   HYPERVISOR_TIME_TO_APPLY, task_tag, */
 /* 							   NULL); */
 /* 				printf("%d: set max %d for tag %d\n", p2.ctx, 0, task_tag); */
-/* 				sched_ctx_hypervisor_resize(p2.ctx, task_tag); */
+/* 				sc_hypervisor_resize(p2.ctx, task_tag); */
 /* 			} */
 /* 			it2++; */
 /* 		} */
@@ -401,7 +401,7 @@ void set_hypervisor_conf(int event, int task_tag)
 	/* 	if(event == START_BENCH) */
 	/* 	{ */
 	/* 		int workers[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; */
-	/* 		sched_ctx_hypervisor_ioctl(p1.ctx, */
+	/* 		sc_hypervisor_ioctl(p1.ctx, */
 	/* 					   HYPERVISOR_MAX_IDLE, workers, 12, 800000.0, */
 	/* 					   HYPERVISOR_TIME_TO_APPLY, task_tag, */
 	/* 					   NULL); */
@@ -411,7 +411,7 @@ void set_hypervisor_conf(int event, int task_tag)
 	/* 		if(it2 < 2) */
 	/* 		{ */
 	/* 			int workers[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; */
-	/* 			sched_ctx_hypervisor_ioctl(p2.ctx, */
+	/* 			sc_hypervisor_ioctl(p2.ctx, */
 	/* 						   HYPERVISOR_MAX_IDLE, workers, 12, 500.0, */
 	/* 						   HYPERVISOR_MAX_IDLE, workers, 3, 200.0, */
 	/* 						   HYPERVISOR_TIME_TO_APPLY, task_tag, */
@@ -420,7 +420,7 @@ void set_hypervisor_conf(int event, int task_tag)
 	/* 		if(it2 == 2) */
 	/* 		{ */
 	/* 			int workers[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; */
-	/* 			sched_ctx_hypervisor_ioctl(p2.ctx, */
+	/* 			sc_hypervisor_ioctl(p2.ctx, */
 	/* 						   HYPERVISOR_MAX_IDLE, workers, 12, 1000.0, */
 	/* 						   HYPERVISOR_MAX_IDLE, workers, 3, 500.0, */
 	/* 						   HYPERVISOR_TIME_TO_APPLY, task_tag, */
@@ -434,7 +434,7 @@ void set_hypervisor_conf(int event, int task_tag)
 	/* 	if(event == START_BENCH) */
 	/* 	{ */
 	/* 		int workers[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; */
-	/* 		sched_ctx_hypervisor_ioctl(p1.ctx, */
+	/* 		sc_hypervisor_ioctl(p1.ctx, */
 	/* 					   HYPERVISOR_MAX_IDLE, workers, 12, 1500.0, */
 	/* 					   HYPERVISOR_MAX_IDLE, workers, 3, 4000.0, */
 	/* 					   HYPERVISOR_TIME_TO_APPLY, task_tag, */
@@ -445,7 +445,7 @@ void set_hypervisor_conf(int event, int task_tag)
 	/* 		if(it < 2) */
 	/* 		{ */
 	/* 			int workers[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; */
-	/* 			sched_ctx_hypervisor_ioctl(p1.ctx, */
+	/* 			sc_hypervisor_ioctl(p1.ctx, */
 	/* 						   HYPERVISOR_MAX_IDLE, workers, 12, 100.0, */
 	/* 						   HYPERVISOR_MAX_IDLE, workers, 3, 5000.0, */
 	/* 						   HYPERVISOR_TIME_TO_APPLY, task_tag, */
@@ -454,7 +454,7 @@ void set_hypervisor_conf(int event, int task_tag)
 	/* 		if(it == 2) */
 	/* 		{ */
 	/* 			int workers[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; */
-	/* 			sched_ctx_hypervisor_ioctl(p1.ctx, */
+	/* 			sc_hypervisor_ioctl(p1.ctx, */
 	/* 						   HYPERVISOR_MAX_IDLE, workers, 12, 5000.0, */
 	/* 						   HYPERVISOR_MAX_IDLE, workers, 3, 10000.0, */
 	/* 						   HYPERVISOR_TIME_TO_APPLY, task_tag, */
@@ -471,7 +471,7 @@ void end_contexts()
 {
 	free(p1.workers);
 	free(p2.workers);
-	sched_ctx_hypervisor_shutdown();
+	sc_hypervisor_shutdown();
 }
 
 void parse_args_ctx(int argc, char **argv)

sched_ctx_hypervisor/examples/sched_ctx_utils/sched_ctx_utils.h → sc_hypervisor/examples/sched_ctx_utils/sched_ctx_utils.h


+ 129 - 0
sc_hypervisor/include/sc_hypervisor.h

@@ -0,0 +1,129 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011 - 2013  INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef SCHED_CTX_HYPERVISOR_H
+#define SCHED_CTX_HYPERVISOR_H
+
+#include <starpu.h>
+#include <sc_hypervisor_config.h>
+#include <sc_hypervisor_monitoring.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/* synchronise the hypervisor when several workers try to update its information */
+starpu_pthread_mutex_t act_hypervisor_mutex;
+
+
+/* Forward declaration of an internal data structure
+ * FIXME: Remove when no longer exposed.  */
+/* the resizing is not done instantly; a request is kept and executed
+   when possible */
+struct resize_request_entry;
+
+/* resizing policy for the contexts */
+struct sc_hypervisor_policy
+{
+	/* name of the strategy */
+	const char* name;
+
+	/* indicates whether the policy was created by the user or not */
+	unsigned custom;
+
+	/* if the future is known, the hypervisor can find a good
+	   distribution of workers over contexts even at the beginning of the program */
+	void (*size_ctxs)(int *sched_ctxs, int nsched_ctxs , int *workers, int nworkers);
+
+	/* the hypervisor takes a decision when a worker has been idle for another cycle in this ctx */
+	void (*handle_idle_cycle)(unsigned sched_ctx, int worker);
+
+	/* the hypervisor takes a decision when another task was pushed on this worker in this ctx */
+	void (*handle_pushed_task)(unsigned sched_ctx, int worker);
+
+	/* the hypervisor takes a decision when another task was popped from this worker in this ctx */
+	void (*handle_poped_task)(unsigned sched_ctx, int worker, struct starpu_task *task, uint32_t footprint);
+
+	/* the hypervisor takes a decision when the worker stopped being idle in this ctx */
+	void (*handle_idle_end)(unsigned sched_ctx, int worker);
+
+	/* the hypervisor takes a decision when a certain task finished executing in this ctx */
+	void (*handle_post_exec_hook)(unsigned sched_ctx, int task_tag);
+
+	/* the hypervisor takes a decision when a job was submitted in this ctx */
+	void (*handle_submitted_job)(struct starpu_codelet *cl, unsigned sched_ctx, uint32_t footprint);
+	
+	/* the hypervisor takes a decision when a certain ctx was deleted */
+	void (*end_ctx)(unsigned sched_ctx);
+};
+
+/* start the hypervisor, indicating the resizing policy to use */
+struct starpu_sched_ctx_performance_counters *sc_hypervisor_init(struct sc_hypervisor_policy *policy);
+
+/* shutdown the hypervisor */
+void sc_hypervisor_shutdown(void);
+
+/* only registered contexts are resized by the hypervisor */
+void sc_hypervisor_register_ctx(unsigned sched_ctx, double total_flops);
+
+/* unregister a context from the hypervisor's list */
+void sc_hypervisor_unregister_ctx(unsigned sched_ctx);
+
+/* submit a resizing request, applied when the task tagged with task_tag is executed */
+void sc_hypervisor_resize(unsigned sched_ctx, int task_tag);
+
+/* don't allow the hypervisor to resize a context */
+void sc_hypervisor_stop_resize(unsigned sched_ctx);
+
+/* allow the hypervisor to resize a context */
+void sc_hypervisor_start_resize(unsigned sched_ctx);
+
+/* get the name of the current policy of the hypervisor */
+const char *sc_hypervisor_get_policy();
+
+/* ask the hypervisor to add workers to a sched_ctx */
+void sc_hypervisor_add_workers_to_sched_ctx(int* workers_to_add, unsigned nworkers_to_add, unsigned sched_ctx);
+
+/* ask the hypervisor to remove workers from a sched_ctx */
+void sc_hypervisor_remove_workers_from_sched_ctx(int* workers_to_remove, unsigned nworkers_to_remove, unsigned sched_ctx, unsigned now);
+
+/* ask the hypervisor to move workers from one context to another */
+void sc_hypervisor_move_workers(unsigned sender_sched_ctx, unsigned receiver_sched_ctx, int *workers_to_move, unsigned nworkers_to_move, unsigned now);
+
+/* ask the hypervisor to choose a distribution of workers in the required contexts */
+void sc_hypervisor_size_ctxs(int *sched_ctxs, int nsched_ctxs, int *workers, int nworkers);
+
+/* check whether there are pending resizing requests */
+unsigned sc_hypervisor_get_size_req(int **sched_ctxs, int* nsched_ctxs, int **workers, int *nworkers);
+
+/* save a resizing request */
+void sc_hypervisor_save_size_req(int *sched_ctxs, int nsched_ctxs, int *workers, int nworkers);
+
+/* clear the list of pending resizing requests */
+void sc_hypervisor_free_size_req(void);
+
+/* check whether a context can be resized */
+unsigned sc_hypervisor_can_resize(unsigned sched_ctx);
+
+/* indicate the types of tasks a context will execute in order to better decide the sizing of ctxs */
+void sc_hypervisor_set_type_of_task(struct starpu_codelet *cl, unsigned sched_ctx, uint32_t footprint);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
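Taken together, bringing a context under the hypervisor's control takes four calls from this header plus the sched_ctx API. A minimal sketch assuming the built-in "idle" strategy and omitting error handling; `start_managed_ctx` is an illustrative name:

#include <starpu.h>
#include <sc_hypervisor.h>

/* hypothetical helper: create a context and put it under the hypervisor */
static unsigned start_managed_ctx(void)
{
	unsigned ctx = starpu_sched_ctx_create("dmda", NULL, 0, "managed_ctx");

	struct sc_hypervisor_policy policy = { .name = "idle", .custom = 0 };
	struct starpu_sched_ctx_performance_counters *perf_counters = sc_hypervisor_init(&policy);

	starpu_sched_ctx_set_perf_counters(ctx, perf_counters);
	sc_hypervisor_register_ctx(ctx, 0.0); /* workload unknown, pass 0 */
	return ctx;
}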

+ 99 - 0
sc_hypervisor/include/sc_hypervisor_config.h

@@ -0,0 +1,99 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011 - 2013  INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef SCHED_CTX_HYPERVISOR_CONFIG_H
+#define SCHED_CTX_HYPERVISOR_CONFIG_H
+
+#include <sc_hypervisor.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/* ioctl properties */
+#define HYPERVISOR_MAX_IDLE -1
+#define HYPERVISOR_MIN_WORKING -2
+#define HYPERVISOR_PRIORITY -3
+#define HYPERVISOR_MIN_WORKERS -4
+#define HYPERVISOR_MAX_WORKERS -5
+#define HYPERVISOR_GRANULARITY -6
+#define HYPERVISOR_FIXED_WORKERS -7
+#define HYPERVISOR_MIN_TASKS -8
+#define HYPERVISOR_NEW_WORKERS_MAX_IDLE -9
+#define HYPERVISOR_TIME_TO_APPLY -10
+#define HYPERVISOR_EMPTY_CTX_MAX_IDLE -11
+#define HYPERVISOR_NULL -12
+#define	HYPERVISOR_ISPEED_W_SAMPLE -13
+#define HYPERVISOR_ISPEED_CTX_SAMPLE -14
+
+
+#define MAX_IDLE_TIME 5000000000
+#define MIN_WORKING_TIME 500
+
+struct sc_hypervisor_policy_config
+{
+	/* below this limit we cannot resize */
+	int min_nworkers;
+
+	/* above this limit we cannot resize */
+	int max_nworkers;
+
+	/* resize granularity */
+	int granularity;
+
+	/* priority for a worker to stay in this context */
+	/* the smaller the priority the faster it will be moved */
+	/* to another context */
+	int priority[STARPU_NMAXWORKERS];
+
+	/* above this limit the priority of the worker is reduced */
+	double max_idle[STARPU_NMAXWORKERS];
+
+	/* below this limit the priority of the worker is reduced */
+	double min_working[STARPU_NMAXWORKERS];
+
+	/* workers that will not move */
+	int fixed_workers[STARPU_NMAXWORKERS];
+
+	/* max idle for the workers that will be added during the resizing process */
+	double new_workers_max_idle;
+
+	/* above this limit we allow removing all the workers of the context */
+	double empty_ctx_max_idle[STARPU_NMAXWORKERS];
+
+	/* sample used to compute the instant speed per worker */
+	double ispeed_w_sample[STARPU_NMAXWORKERS];
+
+	/* sample used to compute the instant speed per ctx */
+	double ispeed_ctx_sample;
+
+};
+
+/* assign a certain configuration to a context */
+void sc_hypervisor_set_config(unsigned sched_ctx, void *config);
+
+/* get the configuration of a context */
+struct sc_hypervisor_policy_config *sc_hypervisor_get_config(unsigned sched_ctx);
+
+/* override particular parameters of a context's configuration */
+void sc_hypervisor_ioctl(unsigned sched_ctx, ...);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
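sc_hypervisor_ioctl takes a property list terminated by HYPERVISOR_NULL, as the examples earlier in this patch show. A sketch that bounds how a registered context may be resized; the numeric values are illustrative, not tuned:

#include <sc_hypervisor.h>

/* hypothetical helper: constrain the resizing of a registered context */
static void bound_ctx(unsigned ctx)
{
	sc_hypervisor_ioctl(ctx,
			    HYPERVISOR_MIN_WORKERS, 2,  /* never shrink below 2 workers */
			    HYPERVISOR_MAX_WORKERS, 8,  /* never grow beyond 8 workers */
			    HYPERVISOR_GRANULARITY, 2,  /* move at most 2 workers per step */
			    HYPERVISOR_NULL);           /* terminates the property list */
}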

+ 30 - 10
sched_ctx_hypervisor/src/hypervisor_policies/lp_tools.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012  INRIA
+ * Copyright (C) 2010-2013  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -14,9 +14,17 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "policy_tools.h"
+#ifndef SCHED_CTX_HYPERVISOR_LP_H
+#define SCHED_CTX_HYPERVISOR_LP_H
+
+#include <sc_hypervisor.h>
 #include <starpu_config.h>
 
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
 /*
  * GNU Linear Programming Kit backend
  */
@@ -25,26 +33,38 @@
 #endif //STARPU_HAVE_GLPK_H
 
 /* returns 1/tmax, and computes in the table res the number of workers needed by each context such that the system ends up with the smallest tmax */
-double _lp_compute_nworkers_per_ctx(int nsched_ctxs, int ntypes_of_workers, double velocity[nsched_ctxs][ntypes_of_workers], double flops[nsched_ctxs], 
+double sc_hypervisor_lp_compute_nworkers_per_ctx(int nsched_ctxs, int ntypes_of_workers, double velocity[nsched_ctxs][ntypes_of_workers], double flops[nsched_ctxs], 
 				    double res[nsched_ctxs][ntypes_of_workers], int total_nw[ntypes_of_workers]);
 
 /* returns tmax, and computes in the table res the number of workers needed by each context such that the system ends up with the smallest tmax */
-double _lp_get_nworkers_per_ctx(int nsched_ctxs, int ntypes_of_workers, double res[nsched_ctxs][ntypes_of_workers], int total_nw[ntypes_of_workers]);
+double sc_hypervisor_lp_get_nworkers_per_ctx(int nsched_ctxs, int ntypes_of_workers, double res[nsched_ctxs][ntypes_of_workers], int total_nw[ntypes_of_workers]);
 
 /* returns tmax of the system */
-double _lp_get_tmax(int nw, int *workers);
+double sc_hypervisor_lp_get_tmax(int nw, int *workers);
 
 /* the linear program determines a fractional number of resources for each ctx; we round them depending on the type of resource */
-void _lp_round_double_to_int(int ns, int nw, double res[ns][nw], int res_rounded[ns][nw]);
+void sc_hypervisor_lp_round_double_to_int(int ns, int nw, double res[ns][nw], int res_rounded[ns][nw]);
 
 /* redistribute the resources among the contexts by assigning the first x available resources to each one */
-void _lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw], double res[ns][nw]);
+void sc_hypervisor_lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw], double res[ns][nw]);
 
 /* make the first distribution of resources in the contexts by assigning the first x available resources to each one */
-void _lp_distribute_resources_in_ctxs(int* sched_ctxs, int ns, int nw, int res_rounded[ns][nw], double res[ns][nw], int *workers, int nworkers);
+void sc_hypervisor_lp_distribute_resources_in_ctxs(int* sched_ctxs, int ns, int nw, int res_rounded[ns][nw], double res[ns][nw], int *workers, int nworkers);
 
 /* place resources in contexts depending on whether they already have workers or not */
-void _lp_place_resources_in_ctx(int ns, int nw, double w_in_s[ns][nw], int *sched_ctxs, int *workers, unsigned do_size);
+void sc_hypervisor_lp_place_resources_in_ctx(int ns, int nw, double w_in_s[ns][nw], int *sched_ctxs, int *workers, unsigned do_size);
 
 /* dichotomy between t1 and t2 */
-double _find_tmax(double t1, double t2);
+double sc_hypervisor_lp_find_tmax(double t1, double t2);
+
+/* execute the lp through dichotomy */
+unsigned sc_hypervisor_lp_execute_dichotomy(int ns, int nw, double w_in_s[ns][nw], unsigned solve_lp_integer, void *specific_data,
+					    double tmin, double tmax, double smallest_tmax,
+					    double (*lp_estimated_distrib_func)(int ns, int nw, double draft_w_in_s[ns][nw], 
+									     unsigned is_integer, double tmax, void *specific_data));
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
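These helpers are meant to be chained: solve the LP, round the fractional counts, then redistribute. A sketch with hard-coded values for 2 contexts and 2 worker types, standing in for an actual LP solution:

#include <sc_hypervisor_lp.h>

/* hypothetical values: fractional worker counts as an LP solve returns them,
   for 2 contexts and 2 worker types (e.g. CUDA and CPU) */
static void round_and_apply(void)
{
	double res[2][2] = { { 1.6, 4.2 }, { 0.4, 7.8 } };
	int res_rounded[2][2];

	sc_hypervisor_lp_round_double_to_int(2, 2, res, res_rounded);
	sc_hypervisor_lp_redistribute_resources_in_ctxs(2, 2, res_rounded, res);
}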

+ 128 - 0
sc_hypervisor/include/sc_hypervisor_monitoring.h

@@ -0,0 +1,128 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011 - 2013  INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef SCHED_CTX_HYPERVISOR_MONITORING_H
+#define SCHED_CTX_HYPERVISOR_MONITORING_H
+
+#include <sc_hypervisor.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/* structure indicating when the moving of workers was actually done
+   (moved workers can be seen in the new ctx) */
+struct sc_hypervisor_resize_ack
+{
+	/* receiver context */
+	int receiver_sched_ctx;
+	/* list of workers required to be moved */
+	int *moved_workers;
+	/* number of workers required to be moved */
+	int nmoved_workers;
+	/* list of workers that actually got in the receiver ctx */
+	int *acked_workers;
+};
+
+/* wrapper attached to a sched_ctx storing monitoring information */
+struct sc_hypervisor_wrapper
+{
+	/* the sched_ctx it monitors */
+	unsigned sched_ctx;
+
+	/* user configuration meant to limit resizing */
+	struct sc_hypervisor_policy_config *config;
+
+	/* idle time of workers in this context */
+	double current_idle_time[STARPU_NMAXWORKERS];
+	
+	/* list of workers that will leave this context (lazy resizing process) */
+	int worker_to_be_removed[STARPU_NMAXWORKERS];
+
+	/* number of tasks pushed on each worker in this ctx */
+	int pushed_tasks[STARPU_NMAXWORKERS];
+
+	/* number of tasks popped from each worker in this ctx */
+	int poped_tasks[STARPU_NMAXWORKERS];
+
+	/* number of flops the context has to execute */
+	double total_flops;
+
+	/* number of flops executed since the beginning until now */
+	double total_elapsed_flops[STARPU_NMAXWORKERS];
+
+	/* number of flops executed since last resizing */
+	double elapsed_flops[STARPU_NMAXWORKERS];
+
+	/* data quantity executed on each worker in this ctx */
+	size_t elapsed_data[STARPU_NMAXWORKERS];
+
+	/* nr of tasks executed on each worker in this ctx */
+	int elapsed_tasks[STARPU_NMAXWORKERS];
+
+	/* the average speed of workers when they belonged to this context */
+	double ref_velocity[STARPU_NMAXWORKERS];
+
+	/* number of flops submitted to this ctx */
+	double submitted_flops;
+
+	/* number of flops that still have to be executed in this ctx */
+	double remaining_flops;
+	
+	/* the start time of the resizing sample of this context */
+	double start_time;
+
+	/* the first time a task was pushed to this context */
+	double real_start_time;
+
+	/* the workers don't leave the current ctx until the receiver ctx
+	   acks the arrival of these workers */
+	struct sc_hypervisor_resize_ack resize_ack;
+
+	/* mutex to protect the ack of workers */
+	starpu_pthread_mutex_t mutex;
+};
+
+/* return the wrapper of a context, which holds its monitoring information */
+struct sc_hypervisor_wrapper *sc_hypervisor_get_wrapper(unsigned sched_ctx);
+
+/* get the list of registered contexts */
+int *sc_hypervisor_get_sched_ctxs();
+
+/* get the number of registered contexts */
+int sc_hypervisor_get_nsched_ctxs();
+
+/* get the number of workers of a certain architecture in a context */
+int sc_hypervisor_get_nworkers_ctx(unsigned sched_ctx, enum starpu_archtype arch);
+
+/* get the number of flops executed by a context since the last resizing (reset to 0 when a resizing is done) */
+double sc_hypervisor_get_elapsed_flops_per_sched_ctx(struct sc_hypervisor_wrapper *sc_w);
+
+/* get the number of flops executed by a context since the beginning */
+double sc_hypervisor_get_total_elapsed_flops_per_sched_ctx(struct sc_hypervisor_wrapper* sc_w);
+
+/* compute an average value of the cpu/cuda velocity */
+double sc_hypervisor_get_velocity_per_worker_type(struct sc_hypervisor_wrapper* sc_w, enum starpu_archtype arch);
+
+/* compute the actual velocity of all workers of a specific type */
+double sc_hypervisor_get_velocity(struct sc_hypervisor_wrapper *sc_w, enum starpu_archtype arch);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
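The wrappers make it straightforward to poll per-context progress. A sketch that prints, for every registered context, the flops executed so far against the declared total; `report_progress` is an illustrative name:

#include <stdio.h>
#include <sc_hypervisor.h>

/* hypothetical helper: print the progress of every registered context */
static void report_progress(void)
{
	int nctxs = sc_hypervisor_get_nsched_ctxs();
	int *ctxs = sc_hypervisor_get_sched_ctxs();
	int i;
	for (i = 0; i < nctxs; i++)
	{
		struct sc_hypervisor_wrapper *sc_w = sc_hypervisor_get_wrapper(ctxs[i]);
		double done = sc_hypervisor_get_total_elapsed_flops_per_sched_ctx(sc_w);
		printf("ctx %u: %g of %g flops executed\n", sc_w->sched_ctx, done, sc_w->total_flops);
	}
}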

+ 104 - 0
sc_hypervisor/include/sc_hypervisor_policy.h

@@ -0,0 +1,104 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2012  INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef SCHED_CTX_HYPERVISOR_POLICY_H
+#define SCHED_CTX_HYPERVISOR_POLICY_H
+
+#include <sc_hypervisor.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+
+#define HYPERVISOR_REDIM_SAMPLE 0.02
+#define HYPERVISOR_START_REDIM_SAMPLE 0.1
+
+/* task wrapper linked list */
+struct sc_hypervisor_policy_task_pool
+{
+	/* Which codelet has been executed */
+	struct starpu_codelet *cl;
+	/* Task footprint key */
+	uint32_t footprint;
+	/* Context the task belongs to */
+	unsigned sched_ctx_id;
+	/* Number of tasks of this kind */
+	unsigned long n;
+	/* Other task kinds */
+	struct sc_hypervisor_policy_task_pool *next;
+};
+
+/* add task information to a task wrapper linked list */
+void sc_hypervisor_policy_add_task_to_pool(struct starpu_codelet *cl, unsigned sched_ctx, uint32_t footprint, struct sc_hypervisor_policy_task_pool **task_pools);
+
+/* remove task information from a task wrapper linked list */
+void sc_hypervisor_policy_remove_task_from_pool(struct starpu_task *task, uint32_t footprint, struct sc_hypervisor_policy_task_pool **task_pools);
+
+/* clone a task wrapper linked list */
+struct sc_hypervisor_policy_task_pool* sc_hypervisor_policy_clone_task_pool(struct sc_hypervisor_policy_task_pool *tp);
+
+/* find the context with the lowest priority in order to move some workers */
+unsigned sc_hypervisor_find_lowest_prio_sched_ctx(unsigned req_sched_ctx, int nworkers_to_move);
+
+/* find the most idle workers of a context */
+int* sc_hypervisor_get_idlest_workers(unsigned sched_ctx, int *nworkers, enum starpu_archtype arch);
+
+/* find the most idle workers in a list */
+int* sc_hypervisor_get_idlest_workers_in_list(int *start, int *workers, int nall_workers,  int *nworkers, enum starpu_archtype arch);
+
+/* find workers that can be moved from a context (if the constraints of min, max, etc allow this) */
+unsigned sc_hypervisor_get_movable_nworkers(struct sc_hypervisor_policy_config *config, unsigned sched_ctx, enum starpu_archtype arch);
+
+/* compute how many workers should be moved from this context */
+int sc_hypervisor_compute_nworkers_to_move(unsigned req_sched_ctx);
+
+/* check the policy's constraints in order to resize */
+unsigned sc_hypervisor_policy_resize(unsigned sender_sched_ctx, unsigned receiver_sched_ctx, unsigned force_resize, unsigned now);
+
+/* check the policy's constraints in order to resize and find a context willing to accept the resources */
+unsigned sc_hypervisor_policy_resize_to_unknown_receiver(unsigned sender_sched_ctx, unsigned now);
+
+/* compute the velocity of a context */
+double sc_hypervisor_get_ctx_velocity(struct sc_hypervisor_wrapper* sc_w);
+
+/* get the time of execution of the slowest context */
+double sc_hypervisor_get_slowest_ctx_exec_time(void);
+
+/* get the time of execution of the fastest context */
+double sc_hypervisor_get_fastest_ctx_exec_time(void);
+
+/* compute the velocity of a worker in a context */
+double sc_hypervisor_get_velocity_per_worker(struct sc_hypervisor_wrapper *sc_w, unsigned worker); 
+
+/* compute the velocity of a type of worker in a context */
+double sc_hypervisor_get_velocity_per_worker_type(struct sc_hypervisor_wrapper* sc_w, enum starpu_archtype arch);
+
+/* compute the velocity of a type of worker in a context depending on its history */ 
+double sc_hypervisor_get_ref_velocity_per_worker_type(struct sc_hypervisor_wrapper* sc_w, enum starpu_archtype arch);
+
+/* check if there are contexts much more delayed than others */
+int sc_hypervisor_has_velocity_gap_btw_ctxs(void);
+
+/* get the list of workers grouped by type */
+void sc_hypervisor_group_workers_by_type(int *workers, int nworkers, int ntypes_of_workers, int total_nw[ntypes_of_workers]);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
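A custom policy is a struct of hooks built on these helpers. A minimal sketch, assuming a struct with custom = 1 is handed to sc_hypervisor_init just like the built-in ones, and that unset hooks may stay NULL:

#include <sc_hypervisor_policy.h>

/* hypothetical hook: when a worker keeps idling and the contexts drift
   apart in speed, fall back to the generic resizing logic */
static void my_handle_idle_cycle(unsigned sched_ctx, int worker)
{
	(void)worker;
	if (sc_hypervisor_has_velocity_gap_btw_ctxs())
		sc_hypervisor_policy_resize_to_unknown_receiver(sched_ctx, 0);
}

struct sc_hypervisor_policy my_policy =
{
	.name = "my_policy",   /* hypothetical name */
	.custom = 1,           /* user-defined, not looked up among the built-ins */
	.handle_idle_cycle = my_handle_idle_cycle,
	/* all other hooks left NULL */
};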

+ 13 - 13
sched_ctx_hypervisor/src/Makefile.am

@@ -14,30 +14,30 @@
 
 AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS)
 LIBS = $(top_builddir)/src/libstarpu-@STARPU_EFFECTIVE_VERSION@.la
-AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/src/ -I$(top_builddir)/src/ -I$(top_builddir)/include -I$(top_srcdir)/sched_ctx_hypervisor/include/ -I$(top_srcdir)/sched_ctx_hypervisor/src
+AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/src/ -I$(top_builddir)/src/ -I$(top_builddir)/include -I$(top_srcdir)/sc_hypervisor/include/ -I$(top_srcdir)/sc_hypervisor/src
 AM_LDFLAGS = $(STARPU_CUDA_LDFLAGS) $(STARPU_OPENCL_LDFLAGS)
 
-lib_LTLIBRARIES = libsched_ctx_hypervisor.la
+lib_LTLIBRARIES = libsc_hypervisor.la
 
-libsched_ctx_hypervisor_la_LIBADD = $(top_builddir)/src/libstarpu-@STARPU_EFFECTIVE_VERSION@.la
+libsc_hypervisor_la_LIBADD = $(top_builddir)/src/libstarpu-@STARPU_EFFECTIVE_VERSION@.la
 
-libsched_ctx_hypervisor_la_SOURCES = 			\
-	sched_ctx_hypervisor.c				\
-	sched_ctx_config.c				\
-	hypervisor_policies/policy_tools.c		\
-	hypervisor_policies/lp_tools.c			\
+libsc_hypervisor_la_SOURCES = 				\
+	sc_hypervisor.c					\
+	sc_config.c					\
+	policies_utils/policy_tools.c			\
+	policies_utils/task_pool.c			\
+	policies_utils/lp_tools.c			\
+	policies_utils/dichotomy.c			\
 	hypervisor_policies/idle_policy.c		\
 	hypervisor_policies/app_driven_policy.c		\
 	hypervisor_policies/gflops_rate_policy.c	\
-	hypervisor_policies/lp_policy.c			\
-	hypervisor_policies/lp2_policy.c		\
+	hypervisor_policies/feft_lp_policy.c		\
+	hypervisor_policies/teft_lp_policy.c		\
 	hypervisor_policies/ispeed_policy.c		\
 	hypervisor_policies/ispeed_lp_policy.c		\
 	hypervisor_policies/debit_lp_policy.c
 
-noinst_HEADERS = sched_ctx_hypervisor_intern.h		\
-	hypervisor_policies/policy_tools.h		\
-	hypervisor_policies/lp_tools.h
+noinst_HEADERS = sc_hypervisor_intern.h		
 
 showcheck:
 	-cat /dev/null

+ 3 - 6
sched_ctx_hypervisor/src/hypervisor_policies/app_driven_policy.c

@@ -13,17 +13,14 @@
  *
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
-
-#include "policy_tools.h"
-
-#include <sched_ctx_hypervisor_intern.h>
+#include <sc_hypervisor_policy.h>
 
 static void app_driven_handle_post_exec_hook(unsigned sched_ctx, int task_tag)
 {
-	_resize_to_unknown_receiver(sched_ctx, 1);
+	sc_hypervisor_policy_resize_to_unknown_receiver(sched_ctx, 1);
 }
 
-struct sched_ctx_hypervisor_policy app_driven_policy =
+struct sc_hypervisor_policy app_driven_policy =
 {
 	.size_ctxs = NULL,
 	.handle_poped_task = NULL,

+ 14 - 12
sched_ctx_hypervisor/src/hypervisor_policies/debit_lp_policy.c

@@ -15,8 +15,10 @@
  */
 
 #include <starpu_config.h>
-#include "lp_tools.h"
+#include "sc_hypervisor_lp.h"
+#include "sc_hypervisor_policy.h"
 #include <math.h>
+#include <sys/time.h>
 
 static double _glp_resolve(int ns, int nw, double velocity[ns][nw], double w_in_s[ns][nw], int *workers, unsigned integer);
 
@@ -25,21 +27,21 @@ static unsigned _compute_max_velocity(int ns, int nw, double w_in_s[ns][nw], int
 {
 	double velocity[ns][nw];
 
-	int *sched_ctxs = in_sched_ctxs == NULL ? sched_ctx_hypervisor_get_sched_ctxs() : in_sched_ctxs;
+	int *sched_ctxs = in_sched_ctxs == NULL ? sc_hypervisor_get_sched_ctxs() : in_sched_ctxs;
 	
 	int w,s;
 
-	struct sched_ctx_hypervisor_wrapper* sc_w = NULL;
+	struct sc_hypervisor_wrapper* sc_w = NULL;
 	for(s = 0; s < ns; s++)
 	{
-		sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctxs[s]);
+		sc_w = sc_hypervisor_get_wrapper(sched_ctxs[s]);
 		for(w = 0; w < nw; w++)
 		{
 			w_in_s[s][w] = 0.0;
 			int worker = workers == NULL ? w : workers[w];
 
 			enum starpu_archtype arch = starpu_worker_get_type(worker);
-			velocity[s][w] = sched_ctx_hypervisor_get_velocity(sc_w, arch);
+			velocity[s][w] = sc_hypervisor_get_velocity(sc_w, arch);
 		}
 	}
 	
@@ -225,14 +227,14 @@ static double _glp_resolve(int ns, int nw, double velocity[ns][nw], double w_in_
 
 static void debit_lp_handle_poped_task(unsigned sched_ctx, int worker, struct starpu_task *task, uint32_t footprint)
 {
-	struct sched_ctx_hypervisor_wrapper* sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctx);
-	_get_velocity_per_worker(sc_w, worker);
+	struct sc_hypervisor_wrapper* sc_w = sc_hypervisor_get_wrapper(sched_ctx);
+	sc_hypervisor_get_velocity_per_worker(sc_w, worker);
 	int ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
 	if(ret != EBUSY)
 	{
-		if(_velocity_gap_btw_ctxs())
+		if(sc_hypervisor_has_velocity_gap_btw_ctxs())
 		{
-			int ns = sched_ctx_hypervisor_get_nsched_ctxs();
+			int ns = sc_hypervisor_get_nsched_ctxs();
 			int nw = starpu_worker_get_count(); /* Number of different workers */
 
 			double w_in_s[ns][nw];
@@ -276,7 +278,7 @@ static void debit_lp_handle_poped_task(unsigned sched_ctx, int worker, struct st
 /* 					printf("%d: cpus = %lf gpus = %lf cpus_round = %d gpus_round = %d\n", s, nworkers[s][1], nworkers[s][0], */
 /* 					       nworkers_rounded[s][1], nworkers_rounded[s][0]); */
 
-				_lp_redistribute_resources_in_ctxs(ns, 2, nworkers_rounded, nworkers);
+				sc_hypervisor_lp_redistribute_resources_in_ctxs(ns, 2, nworkers_rounded, nworkers);
 
 			}
 		}
@@ -286,7 +288,7 @@ static void debit_lp_handle_poped_task(unsigned sched_ctx, int worker, struct st
 
 static void debit_lp_end_ctx(unsigned sched_ctx)
 {
-	struct sched_ctx_hypervisor_wrapper* sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctx);
+	struct sc_hypervisor_wrapper* sc_w = sc_hypervisor_get_wrapper(sched_ctx);
 	int worker;
 /* 	for(worker = 0; worker < 12; worker++) */
 /* 		printf("%d/%d: speed %lf\n", worker, sched_ctx, sc_w->ref_velocity[worker]); */
@@ -294,7 +296,7 @@ static void debit_lp_end_ctx(unsigned sched_ctx)
 	return;
 }
 
-struct sched_ctx_hypervisor_policy debit_lp_policy = {
+struct sc_hypervisor_policy debit_lp_policy = {
 	.size_ctxs = NULL,
 	.handle_poped_task = debit_lp_handle_poped_task,
 	.handle_pushed_task = NULL,

+ 138 - 0
sc_hypervisor/src/hypervisor_policies/feft_lp_policy.c

@@ -0,0 +1,138 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011 - 2013  INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "sc_hypervisor_lp.h"
+#include <starpu_config.h>
+#include <sys/time.h>
+
+#ifdef STARPU_HAVE_GLPK_H
+static void feft_lp_handle_poped_task(unsigned sched_ctx, int worker, struct starpu_task *task, uint32_t footprint)
+{
+	if(sc_hypervisor_has_velocity_gap_btw_ctxs())
+	{
+		int nsched_ctxs = sc_hypervisor_get_nsched_ctxs();
+
+		double nworkers[nsched_ctxs][2];
+
+		int ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
+		if(ret != EBUSY)
+		{
+			int nw = 1;
+#ifdef STARPU_USE_CUDA
+			int ncuda = starpu_worker_get_count_by_type(STARPU_CUDA_WORKER);
+			nw = ncuda != 0 ? 2 : 1;
+#endif
+			int total_nw[nw];
+			sc_hypervisor_group_workers_by_type(NULL, -1, nw, total_nw);
+
+
+			struct timeval start_time;
+			struct timeval end_time;
+			gettimeofday(&start_time, NULL);
+
+			double vmax = sc_hypervisor_lp_get_nworkers_per_ctx(nsched_ctxs, nw, nworkers, total_nw);
+			gettimeofday(&end_time, NULL);
+
+			long diff_s = end_time.tv_sec  - start_time.tv_sec;
+			long diff_us = end_time.tv_usec  - start_time.tv_usec;
+
+			float timing = (float)(diff_s*1000000 + diff_us)/1000;
+
+			if(vmax != 0.0)
+			{
+				int nworkers_rounded[nsched_ctxs][nw];
+				sc_hypervisor_lp_round_double_to_int(nsched_ctxs, nw, nworkers, nworkers_rounded);
+				sc_hypervisor_lp_redistribute_resources_in_ctxs(nsched_ctxs, nw, nworkers_rounded, nworkers);
+			}
+			starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
+		}
+	}
+}
+static void feft_lp_size_ctxs(int *sched_ctxs, int ns, int *workers, int nworkers)
+{
+	int nsched_ctxs = sched_ctxs == NULL ? sc_hypervisor_get_nsched_ctxs() : ns;
+	int nw = 1;
+#ifdef STARPU_USE_CUDA
+	int ncuda = starpu_worker_get_count_by_type(STARPU_CUDA_WORKER);
+	nw = ncuda != 0 ? 2 : 1;
+#endif
+	double nworkers_per_type[nsched_ctxs][nw];
+	int total_nw[nw];
+	sc_hypervisor_group_workers_by_type(workers, nworkers, nw, total_nw);
+
+	starpu_pthread_mutex_lock(&act_hypervisor_mutex);
+	double vmax = sc_hypervisor_lp_get_nworkers_per_ctx(nsched_ctxs, nw, nworkers_per_type, total_nw);
+	if(vmax != 0.0)
+	{
+// 		printf("********size\n");
+/* 		int i; */
+/* 		for( i = 0; i < nsched_ctxs; i++) */
+/* 		{ */
+/* 			printf("ctx %d/worker type %d: n = %lf \n", i, 0, nworkers_per_type[i][0]); */
+/* #ifdef STARPU_USE_CUDA */
+/* 			int ncuda = starpu_worker_get_count_by_type(STARPU_CUDA_WORKER); */
+/* 			if(ncuda != 0) */
+/* 				printf("ctx %d/worker type %d: n = %lf \n", i, 1, nworkers_per_type[i][1]); */
+/* #endif */
+/* 		} */
+		int nworkers_per_type_rounded[nsched_ctxs][nw];
+		sc_hypervisor_lp_round_double_to_int(nsched_ctxs, nw, nworkers_per_type, nworkers_per_type_rounded);
+/*       	for( i = 0; i < nsched_ctxs; i++) */
+/* 		{ */
+/* 			printf("ctx %d/worker type %d: n = %d \n", i, 0, nworkers_per_type_rounded[i][0]); */
+/* #ifdef STARPU_USE_CUDA */
+/* 			int ncuda = starpu_worker_get_count_by_type(STARPU_CUDA_WORKER); */
+/* 			if(ncuda != 0) */
+/* 				printf("ctx %d/worker type %d: n = %d \n", i, 1, nworkers_per_type_rounded[i][1]); */
+/* #endif */
+/* 		} */
+		int *current_sched_ctxs = sched_ctxs == NULL ? sc_hypervisor_get_sched_ctxs() : 
+			sched_ctxs;
+
+		unsigned has_workers = 0;
+		int s;
+		for(s = 0; s < ns; s++)
+		{
+			int nworkers_ctx = sc_hypervisor_get_nworkers_ctx(current_sched_ctxs[s], 
+									     STARPU_ANY_WORKER);
+			if(nworkers_ctx != 0)
+			{
+				has_workers = 1;
+				break;
+			}
+		}
+		if(has_workers)
+			sc_hypervisor_lp_redistribute_resources_in_ctxs(nsched_ctxs, nw, nworkers_per_type_rounded, nworkers_per_type);
+		else
+			sc_hypervisor_lp_distribute_resources_in_ctxs(sched_ctxs, nsched_ctxs, nw, nworkers_per_type_rounded, nworkers_per_type, workers, nworkers);
+	}
+	starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
+}
+
+struct sc_hypervisor_policy feft_lp_policy = {
+	.size_ctxs = feft_lp_size_ctxs,
+	.handle_poped_task = feft_lp_handle_poped_task,
+	.handle_pushed_task = NULL,
+	.handle_idle_cycle = NULL,
+	.handle_idle_end = NULL,
+	.handle_post_exec_hook = NULL,
+	.handle_submitted_job = NULL,
+	.end_ctx = NULL,
+	.custom = 0,
+	.name = "feft_lp"
+};
+
+#endif /* STARPU_HAVE_GLPK_H */
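The gettimeofday bracketing around the LP solve above is a recurring pattern; factored out, it is just a microsecond difference scaled to milliseconds. A sketch of the same arithmetic (`elapsed_ms` is an illustrative name):

#include <sys/time.h>

/* elapsed time between two gettimeofday() samples, in milliseconds */
static float elapsed_ms(const struct timeval *start, const struct timeval *end)
{
	long diff_s = end->tv_sec - start->tv_sec;
	long diff_us = end->tv_usec - start->tv_usec;
	return (float)(diff_s * 1000000 + diff_us) / 1000;
}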

+ 31 - 31
sched_ctx_hypervisor/src/hypervisor_policies/gflops_rate_policy.c

@@ -14,11 +14,11 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "policy_tools.h"
+#include "sc_hypervisor_policy.h"
 
 static double _get_total_elapsed_flops_per_sched_ctx(unsigned sched_ctx)
 {
-	struct sched_ctx_hypervisor_wrapper* sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctx);
+	struct sc_hypervisor_wrapper* sc_w = sc_hypervisor_get_wrapper(sched_ctx);
 	double ret_val = 0.0;
 	int i;
 	for(i = 0; i < STARPU_NMAXWORKERS; i++)
@@ -28,8 +28,8 @@ static double _get_total_elapsed_flops_per_sched_ctx(unsigned sched_ctx)
 
 double _get_exp_end(unsigned sched_ctx)
 {
-	struct sched_ctx_hypervisor_wrapper *sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctx);
-	double elapsed_flops = sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(sc_w);
+	struct sc_hypervisor_wrapper *sc_w = sc_hypervisor_get_wrapper(sched_ctx);
+	double elapsed_flops = sc_hypervisor_get_elapsed_flops_per_sched_ctx(sc_w);
 
 	if( elapsed_flops >= 1.0)
 	{
@@ -44,7 +44,7 @@ double _get_exp_end(unsigned sched_ctx)
 /* computes the instructions left to be executed out of the total instructions to execute */
 double _get_flops_left_pct(unsigned sched_ctx)
 {
-	struct sched_ctx_hypervisor_wrapper *wrapper = sched_ctx_hypervisor_get_wrapper(sched_ctx);
+	struct sc_hypervisor_wrapper *wrapper = sc_hypervisor_get_wrapper(sched_ctx);
 	double total_elapsed_flops = _get_total_elapsed_flops_per_sched_ctx(sched_ctx);
 	if(wrapper->total_flops == total_elapsed_flops || total_elapsed_flops > wrapper->total_flops)
 		return 0.0;
@@ -55,13 +55,13 @@ double _get_flops_left_pct(unsigned sched_ctx)
 /* select the workers needed to be moved in order to force the sender and the receiver context to finish simultaneously */
 static int* _get_workers_to_move(unsigned sender_sched_ctx, unsigned receiver_sched_ctx, int *nworkers)
 {
-	struct sched_ctx_hypervisor_wrapper* sender_sc_w = sched_ctx_hypervisor_get_wrapper(sender_sched_ctx);
-	struct sched_ctx_hypervisor_wrapper* receiver_sc_w = sched_ctx_hypervisor_get_wrapper(receiver_sched_ctx);
+	struct sc_hypervisor_wrapper* sender_sc_w = sc_hypervisor_get_wrapper(sender_sched_ctx);
+	struct sc_hypervisor_wrapper* receiver_sc_w = sc_hypervisor_get_wrapper(receiver_sched_ctx);
         int *workers = NULL;
-        double v_receiver = _get_ctx_velocity(receiver_sc_w);
+        double v_receiver = sc_hypervisor_get_ctx_velocity(receiver_sc_w);
         double receiver_remainig_flops = receiver_sc_w->remaining_flops;
         double sender_exp_end = _get_exp_end(sender_sched_ctx);
-        double sender_v_cpu = _get_velocity_per_worker_type(sender_sc_w, STARPU_CPU_WORKER);
+        double sender_v_cpu = sc_hypervisor_get_velocity_per_worker_type(sender_sc_w, STARPU_CPU_WORKER);
         double v_for_rctx = (receiver_remainig_flops/(sender_exp_end - starpu_timing_now())) - v_receiver;
 
         int nworkers_needed = v_for_rctx/sender_v_cpu;
@@ -69,11 +69,11 @@ static int* _get_workers_to_move(unsigned sender_sched_ctx, unsigned receiver_sc
 /*             v_receiver, v_for_rctx, sender_v_cpu, nworkers_needed); */
         if(nworkers_needed > 0)
         {
-                struct sched_ctx_hypervisor_policy_config *sender_config = sched_ctx_hypervisor_get_config(sender_sched_ctx);
-                unsigned potential_moving_cpus = _get_potential_nworkers(sender_config, sender_sched_ctx, STARPU_CPU_WORKER);
-                unsigned potential_moving_gpus = _get_potential_nworkers(sender_config, sender_sched_ctx, STARPU_CUDA_WORKER);
+                struct sc_hypervisor_policy_config *sender_config = sc_hypervisor_get_config(sender_sched_ctx);
+                unsigned potential_moving_cpus = sc_hypervisor_get_movable_nworkers(sender_config, sender_sched_ctx, STARPU_CPU_WORKER);
+                unsigned potential_moving_gpus = sc_hypervisor_get_movable_nworkers(sender_config, sender_sched_ctx, STARPU_CUDA_WORKER);
                 unsigned sender_nworkers = starpu_sched_ctx_get_nworkers(sender_sched_ctx);
-                struct sched_ctx_hypervisor_policy_config *config = sched_ctx_hypervisor_get_config(receiver_sched_ctx);
+                struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(receiver_sched_ctx);
                 unsigned nworkers_ctx = starpu_sched_ctx_get_nworkers(receiver_sched_ctx);
 
                 if(nworkers_needed < (potential_moving_cpus + 5 * potential_moving_gpus))
@@ -87,10 +87,10 @@ static int* _get_workers_to_move(unsigned sender_sched_ctx, unsigned receiver_sc
                                 {
                                         int ngpus = nworkers_needed / 5;
                                         int *gpus;
-                                        gpus = _get_first_workers(sender_sched_ctx, &ngpus, STARPU_CUDA_WORKER);
+                                        gpus = sc_hypervisor_get_idlest_workers(sender_sched_ctx, &ngpus, STARPU_CUDA_WORKER);
                                         int ncpus = nworkers_needed - ngpus;
                                         int *cpus;
-                                        cpus = _get_first_workers(sender_sched_ctx, &ncpus, STARPU_CPU_WORKER);
+                                        cpus = sc_hypervisor_get_idlest_workers(sender_sched_ctx, &ncpus, STARPU_CPU_WORKER);
                                         workers = (int*)malloc(nworkers_needed*sizeof(int));
                                         int i;
 					printf("%d: gpus: ", nworkers_needed);
@@ -115,7 +115,7 @@ static int* _get_workers_to_move(unsigned sender_sched_ctx, unsigned receiver_sc
                 {
			/* if the needed number of workers is too big, we only move the number of workers
			   corresponding to the granularity set by the user */
-                        int nworkers_to_move = _get_nworkers_to_move(sender_sched_ctx);
+                        int nworkers_to_move = sc_hypervisor_compute_nworkers_to_move(sender_sched_ctx);
 
                         if(sender_nworkers - nworkers_to_move >= sender_config->min_nworkers)
                         {
@@ -125,7 +125,7 @@ static int* _get_workers_to_move(unsigned sender_sched_ctx, unsigned receiver_sc
 
                                 if(nworkers_to_move > 0)
                                 {
-                                        workers = _get_first_workers(sender_sched_ctx, &nworkers_to_move, STARPU_ANY_WORKER);
+                                        workers = sc_hypervisor_get_idlest_workers(sender_sched_ctx, &nworkers_to_move, STARPU_ANY_WORKER);
                                         *nworkers = nworkers_to_move;
                                 }
                         }
@@ -147,9 +147,9 @@ static unsigned _gflops_rate_resize(unsigned sender_sched_ctx, unsigned receiver
                 int *workers_to_move =  _get_workers_to_move(sender_sched_ctx, receiver_sched_ctx, &nworkers_to_move);
 		if(nworkers_to_move > 0)
                 {
-                        sched_ctx_hypervisor_move_workers(sender_sched_ctx, receiver_sched_ctx, workers_to_move, nworkers_to_move, 0);
+                        sc_hypervisor_move_workers(sender_sched_ctx, receiver_sched_ctx, workers_to_move, nworkers_to_move, 0);
 
-                        struct sched_ctx_hypervisor_policy_config *new_config = sched_ctx_hypervisor_get_config(receiver_sched_ctx);
+                        struct sc_hypervisor_policy_config *new_config = sc_hypervisor_get_config(receiver_sched_ctx);
                         int i;
                         for(i = 0; i < nworkers_to_move; i++)
                                 new_config->max_idle[workers_to_move[i]] = new_config->max_idle[workers_to_move[i]] !=MAX_IDLE_TIME ? new_config->max_idle[workers_to_move[i]] :  new_config->new_workers_max_idle;
@@ -165,8 +165,8 @@ static unsigned _gflops_rate_resize(unsigned sender_sched_ctx, unsigned receiver
 
 static int _find_fastest_sched_ctx()
 {
-	int *sched_ctxs = sched_ctx_hypervisor_get_sched_ctxs();
-	int nsched_ctxs = sched_ctx_hypervisor_get_nsched_ctxs();
+	int *sched_ctxs = sc_hypervisor_get_sched_ctxs();
+	int nsched_ctxs = sc_hypervisor_get_nsched_ctxs();
 
 	double first_exp_end = _get_exp_end(sched_ctxs[0]);
 	int fastest_sched_ctx = first_exp_end == -1.0  ? -1 : sched_ctxs[0];
@@ -188,8 +188,8 @@ static int _find_fastest_sched_ctx()
 
 static int _find_slowest_sched_ctx()
 {
-	int *sched_ctxs = sched_ctx_hypervisor_get_sched_ctxs();
-	int nsched_ctxs = sched_ctx_hypervisor_get_nsched_ctxs();
+	int *sched_ctxs = sc_hypervisor_get_sched_ctxs();
+	int nsched_ctxs = sc_hypervisor_get_nsched_ctxs();
 
 	int slowest_sched_ctx = -1;
 	double curr_exp_end = 0.0;
@@ -214,8 +214,8 @@ static int _find_slowest_sched_ctx()
 
 static int _find_slowest_available_sched_ctx(unsigned sched_ctx)
 {
-	int *sched_ctxs = sched_ctx_hypervisor_get_sched_ctxs();
-	int nsched_ctxs = sched_ctx_hypervisor_get_nsched_ctxs();
+	int *sched_ctxs = sc_hypervisor_get_sched_ctxs();
+	int nsched_ctxs = sc_hypervisor_get_nsched_ctxs();
 
 	int slowest_sched_ctx = -1;
 	double curr_exp_end = 0.0;
@@ -256,12 +256,12 @@ static void gflops_rate_resize(unsigned sched_ctx)
 			double slowest_flops_left_pct = _get_flops_left_pct(slowest_sched_ctx);
 			if(slowest_flops_left_pct != 0.0f)
 			{
-				struct sched_ctx_hypervisor_policy_config* config = sched_ctx_hypervisor_get_config(sched_ctx);
+				struct sc_hypervisor_policy_config* config = sc_hypervisor_get_config(sched_ctx);
 				config->min_nworkers = 0;
 				config->max_nworkers = 0;
 				printf("ctx %d finished & gives away the res to %d; slow_left %lf\n", sched_ctx, slowest_sched_ctx, slowest_flops_left_pct);
-				_resize(sched_ctx, slowest_sched_ctx, 1, 1);
-				sched_ctx_hypervisor_stop_resize(slowest_sched_ctx);
+				sc_hypervisor_policy_resize(sched_ctx, slowest_sched_ctx, 1, 1);
+				sc_hypervisor_stop_resize(slowest_sched_ctx);
 			}
 		}
 	}
@@ -280,8 +280,8 @@ static void gflops_rate_resize(unsigned sched_ctx)
 			if(fast_flops_left_pct < 0.8)
 			{
 
-				struct sched_ctx_hypervisor_wrapper *sc_w = sched_ctx_hypervisor_get_wrapper(slowest_sched_ctx);
-				double elapsed_flops = sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(sc_w);
+				struct sc_hypervisor_wrapper *sc_w = sc_hypervisor_get_wrapper(slowest_sched_ctx);
+				double elapsed_flops = sc_hypervisor_get_elapsed_flops_per_sched_ctx(sc_w);
 				if((elapsed_flops/sc_w->total_flops) > 0.1)
 					_gflops_rate_resize(fastest_sched_ctx, slowest_sched_ctx, 0);
 			}
@@ -294,7 +294,7 @@ static void gflops_rate_handle_poped_task(unsigned sched_ctx, int worker)
 	gflops_rate_resize(sched_ctx);
 }
 
-struct sched_ctx_hypervisor_policy gflops_rate_policy = {
+struct sc_hypervisor_policy gflops_rate_policy = {
 	.size_ctxs = NULL,
 	.handle_poped_task = gflops_rate_handle_poped_task,
 	.handle_pushed_task = NULL,

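The `_get_workers_to_move` hunk above carries the policy's core arithmetic: the receiver context must gain enough speed to burn its remaining flops by the sender's expected end time, and that speed gap is converted into a count of sender CPU workers (a GPU counting as five CPUs, per the `5 * potential_moving_gpus` test). A self-contained sketch of that computation with hypothetical numbers; none of these names belong to the hypervisor API:

#include <stdio.h>

/* extra speed the receiver needs, expressed in sender CPU workers */
static int nworkers_needed_for_receiver(double receiver_remaining_flops,
					double sender_exp_end, double now,
					double v_receiver, double sender_v_cpu)
{
	double v_for_rctx = receiver_remaining_flops / (sender_exp_end - now) - v_receiver;
	return (int)(v_for_rctx / sender_v_cpu);
}

int main(void)
{
	/* hypothetical: 100 Gflops left, 10 s until the sender ends, receiver
	 * running at 5 Gflop/s, one sender CPU worth 2 Gflop/s -> 2 workers */
	printf("%d\n", nworkers_needed_for_receiver(100.0, 10.0, 0.0, 5.0, 2.0));
	return 0;
}
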
+ 8 - 8
sched_ctx_hypervisor/src/hypervisor_policies/idle_policy.c

@@ -14,12 +14,12 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "policy_tools.h"
+#include "sc_hypervisor_policy.h"
 
 unsigned worker_belong_to_other_sched_ctx(unsigned sched_ctx, int worker)
 {
-	int *sched_ctxs = sched_ctx_hypervisor_get_sched_ctxs();
-	int nsched_ctxs = sched_ctx_hypervisor_get_nsched_ctxs();
+	int *sched_ctxs = sc_hypervisor_get_sched_ctxs();
+	int nsched_ctxs = sc_hypervisor_get_nsched_ctxs();
 
 	int i;
 	for(i = 0; i < nsched_ctxs; i++)
@@ -30,18 +30,18 @@ unsigned worker_belong_to_other_sched_ctx(unsigned sched_ctx, int worker)
 
 void idle_handle_idle_cycle(unsigned sched_ctx, int worker)
 {
-	struct sched_ctx_hypervisor_wrapper* sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctx);
-	struct sched_ctx_hypervisor_policy_config *config = sc_w->config;
+	struct sc_hypervisor_wrapper* sc_w = sc_hypervisor_get_wrapper(sched_ctx);
+	struct sc_hypervisor_policy_config *config = sc_w->config;
 	if(config != NULL &&  sc_w->current_idle_time[worker] > config->max_idle[worker])
 	{
 		if(worker_belong_to_other_sched_ctx(sched_ctx, worker))
-			sched_ctx_hypervisor_remove_workers_from_sched_ctx(&worker, 1, sched_ctx, 1);
+			sc_hypervisor_remove_workers_from_sched_ctx(&worker, 1, sched_ctx, 1);
 		else
-			_resize_to_unknown_receiver(sched_ctx, 0);
+			sc_hypervisor_policy_resize_to_unknown_receiver(sched_ctx, 0);
 	}
 }
 
-struct sched_ctx_hypervisor_policy idle_policy =
+struct sc_hypervisor_policy idle_policy =
 {
 	.size_ctxs = NULL,
 	.handle_poped_task = NULL,

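The idle policy above boils down to one decision per over-budget worker: if some other context already contains the worker, it is simply removed from this one; otherwise the context resizes toward a receiver that is not yet known. A reduced sketch of that branch, with illustrative names only:

enum idle_action { IDLE_KEEP, IDLE_GIVE_BACK, IDLE_SHRINK_CTX };

static enum idle_action decide_on_idle(double current_idle_time, double max_idle,
				       unsigned worker_in_other_ctx)
{
	if (current_idle_time <= max_idle)
		return IDLE_KEEP;                     /* still within the idle budget */
	return worker_in_other_ctx ? IDLE_GIVE_BACK   /* another ctx owns it too */
				   : IDLE_SHRINK_CTX; /* resize toward an unknown receiver */
}
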
+ 121 - 139
sched_ctx_hypervisor/src/hypervisor_policies/ispeed_lp_policy.c

@@ -15,142 +15,38 @@
  */
 
 #include <starpu_config.h>
-#include "lp_tools.h"
+#include "sc_hypervisor_lp.h"
+#include "sc_hypervisor_policy.h"
 #include <math.h>
+#include <sys/time.h>
 
-static double _glp_resolve(int ns, int nw, double velocity[ns][nw], double flops[ns], double tmax, double flops_on_w[ns][nw], double w_in_s[ns][nw], int *workers, unsigned integer);
-
-static unsigned _compute_flops_distribution_over_ctxs(int ns, int nw, double w_in_s[ns][nw], double flops_on_w[ns][nw], int *in_sched_ctxs, int *workers)
+struct ispeed_lp_data
 {
-	double draft_w_in_s[ns][nw];
-	double draft_flops_on_w[ns][nw];
-	double flops[ns];
-	double velocity[ns][nw];
-
-	int *sched_ctxs = in_sched_ctxs == NULL ? sched_ctx_hypervisor_get_sched_ctxs() : in_sched_ctxs;
-	
-	int w,s;
-
-	struct sched_ctx_hypervisor_wrapper* sc_w = NULL;
-	double total_flops = 0.0;
-	for(s = 0; s < ns; s++)
-	{
-		sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctxs[s]);
-		for(w = 0; w < nw; w++)
-		{
-			w_in_s[s][w] = 0.0;
-			draft_w_in_s[s][w] = 0.0;
-			flops_on_w[s][w] = 0.0;
-			draft_flops_on_w[s][w] = 0.0;
-			int worker = workers == NULL ? w : workers[w];
-
-			velocity[s][w] = _get_velocity_per_worker(sc_w, worker);
-			if(velocity[s][w] == -1.0)
-			{
-				enum starpu_archtype arch = starpu_worker_get_type(worker);
-				velocity[s][w] = sched_ctx_hypervisor_get_velocity(sc_w, arch);
-				if(arch == STARPU_CUDA_WORKER)
-				{
-					unsigned worker_in_ctx = starpu_sched_ctx_contains_worker(worker, sc_w->sched_ctx);
-					if(!worker_in_ctx)
-					{
-						double transfer_velocity = starpu_get_bandwidth_RAM_CUDA(worker) / 1000;
-						velocity[s][w] = (velocity[s][w] * transfer_velocity) / (velocity[s][w] + transfer_velocity);
-					}
-				}
-
-			}
-			
-//			printf("v[w%d][s%d] = %lf\n",w, s, velocity[s][w]);
-		}
-		struct sched_ctx_hypervisor_policy_config *config = sched_ctx_hypervisor_get_config(sched_ctxs[s]);
-		flops[s] = config->ispeed_ctx_sample/1000000000; /* in gflops */
-	}
-	
-	/* take the exec time of the slowest ctx 
-	   as starting point and then try to minimize it
-	   as increasing it a little for the faster ctxs */
-	double tmax = _get_slowest_ctx_exec_time();
- 	double smallest_tmax = _get_fastest_ctx_exec_time(); //tmax - 0.5*tmax; 
-//	printf("tmax %lf smallest %lf\n", tmax, smallest_tmax);
-
-	double res = 1.0;
-	unsigned has_sol = 0;
-	double tmin = 0.0;
-	double old_tmax = 0.0;
-	unsigned found_sol = 0;
-
-	struct timeval start_time;
-	struct timeval end_time;
-	int nd = 0;
-	gettimeofday(&start_time, NULL);
-
-	/* we fix tmax and we do not treat it as an unknown
-	   we just vary by dichotomy its values*/
-	while(tmax > 1.0)
-	{
-		/* find solution and save the values in draft tables
-		   only if there is a solution for the system we save them
-		   in the proper table */
-		res = _glp_resolve(ns, nw, velocity, flops, tmax, draft_flops_on_w, draft_w_in_s, workers, 1);
-		if(res != 0.0)
-		{
-			for(s = 0; s < ns; s++)
-				for(w = 0; w < nw; w++)
-				{
-					w_in_s[s][w] = draft_w_in_s[s][w];
-					flops_on_w[s][w] = draft_flops_on_w[s][w];
-				}
-			has_sol = 1;
-			found_sol = 1;
-		}
-		else
-			has_sol = 0;
-
-		/* if we have a solution with this tmax try a smaller value
-		   bigger than the old min */
-		if(has_sol)
-		{
-			if(old_tmax != 0.0 && (old_tmax - tmax) < 0.5)
-				break;
-			old_tmax = tmax;
-		}
-		else /*else try a bigger one but smaller than the old tmax */
-		{
-			tmin = tmax;
-			if(old_tmax != 0.0)
-				tmax = old_tmax;
-		}
-		if(tmin == tmax) break;
-		tmax = _find_tmax(tmin, tmax);
-
-		if(tmax < smallest_tmax)
-		{
-			tmax = old_tmax;
-			tmin = smallest_tmax;
-			tmax = _find_tmax(tmin, tmax);
-		}
-		nd++;
-	}
-	gettimeofday(&end_time, NULL);
-
-	long diff_s = end_time.tv_sec  - start_time.tv_sec;
-	long diff_us = end_time.tv_usec  - start_time.tv_usec;
-
-	float timing = (float)(diff_s*1000000 + diff_us)/1000;
-
-//        fprintf(stdout, "nd = %d total time: %f ms \n", nd, timing);
-
-	return found_sol;
-}
+	double **velocity;
+	double *flops;
+	double **flops_on_w;
+	int *workers;
+};
 
 /*
  * GNU Linear Programming Kit backend
  */
 #ifdef STARPU_HAVE_GLPK_H
 #include <glpk.h>
-static double _glp_resolve(int ns, int nw, double velocity[ns][nw], double flops[ns], double tmax, double flops_on_w[ns][nw], double w_in_s[ns][nw], int *workers, unsigned integer)
+static double _glp_resolve (int ns, int nw, double final_w_in_s[ns][nw],
+			    unsigned is_integer, double tmax, void *specific_data)
 {
+	struct ispeed_lp_data *sd = (struct ispeed_lp_data *)specific_data;
+
+	double **velocity = sd->velocity;
+	double *flops = sd->flops;
+	
+	double **final_flops_on_w = sd->flops_on_w;
+        int *workers = sd->workers;
+	
+	double w_in_s[ns][nw];
+	double flops_on_w[ns][nw];
+
 	int w, s;
 	glp_prob *lp;
 
@@ -186,7 +82,7 @@ static double _glp_resolve(int ns, int nw, double velocity[ns][nw], double flops
 
 				snprintf(name, sizeof(name), "w%ds%dn", w, s);
 				glp_set_col_name(lp, nw*ns+colnum(w,s), name);
-				if (integer)
+				if (is_integer)
 				{
                                         glp_set_col_kind(lp, nw*ns+colnum(w, s), GLP_IV);
 					glp_set_col_bnds(lp, nw*ns+colnum(w,s), GLP_DB, 0, 1);
@@ -264,7 +160,7 @@ static double _glp_resolve(int ns, int nw, double velocity[ns][nw], double flops
 				ar[n] = 1;
 				n++;
 			}
-			if(integer)				
+			if(is_integer)				
 				glp_set_row_bnds(lp, curr_row_idx+w+1, GLP_FX, 1, 1);
 			else
 				glp_set_row_bnds(lp, curr_row_idx+w+1, GLP_FX, 1.0, 1.0);
@@ -309,7 +205,7 @@ static double _glp_resolve(int ns, int nw, double velocity[ns][nw], double flops
 		return 0.0;
 	}
 
-        if (integer)
+        if (is_integer)
         {
                 glp_iocp iocp;
                 glp_init_iocp(&iocp);
@@ -340,7 +236,7 @@ static double _glp_resolve(int ns, int nw, double velocity[ns][nw], double flops
 		for(w = 0; w < nw; w++)
 		{
 			flops_on_w[s][w] = glp_get_col_prim(lp, colnum(w, s));
-			if (integer)
+			if (is_integer)
 				w_in_s[s][w] = (double)glp_mip_col_val(lp, nw*ns+colnum(w, s));
 			else
 				w_in_s[s][w] = glp_get_col_prim(lp, nw*ns+colnum(w,s));
@@ -348,25 +244,109 @@ static double _glp_resolve(int ns, int nw, double velocity[ns][nw], double flops
 		}
 
 	glp_delete_prob(lp);
+	for(s = 0; s < ns; s++)
+		for(w = 0; w < nw; w++)
+		{
+			final_w_in_s[s][w] = w_in_s[s][w];
+			final_flops_on_w[s][w] = flops_on_w[s][w];
+		}
+
 	return res;
 }
 
+static unsigned _compute_flops_distribution_over_ctxs(int ns, int nw, double w_in_s[ns][nw], double **flops_on_w, int *in_sched_ctxs, int *workers)
+{
+//	double flops[ns];
+//	double velocity[ns][nw];
+	double *flops = (double*)malloc(ns*sizeof(double));
+	double **velocity = (double **)malloc(ns*sizeof(double*));
+	int i;
+	for(i = 0; i < ns; i++)
+		velocity[i] = (double*)malloc(nw*sizeof(double));
+
+	int *sched_ctxs = in_sched_ctxs == NULL ? sc_hypervisor_get_sched_ctxs() : in_sched_ctxs;
+	
+	int w,s;
+
+	struct sc_hypervisor_wrapper* sc_w = NULL;
+	double total_flops = 0.0;
+	for(s = 0; s < ns; s++)
+	{
+		sc_w = sc_hypervisor_get_wrapper(sched_ctxs[s]);
+		for(w = 0; w < nw; w++)
+		{
+			w_in_s[s][w] = 0.0;
+			int worker = workers == NULL ? w : workers[w];
+
+			velocity[s][w] = sc_hypervisor_get_velocity_per_worker(sc_w, worker);
+			if(velocity[s][w] == -1.0)
+			{
+				enum starpu_archtype arch = starpu_worker_get_type(worker);
+				velocity[s][w] = sc_hypervisor_get_velocity(sc_w, arch);
+				if(arch == STARPU_CUDA_WORKER)
+				{
+					unsigned worker_in_ctx = starpu_sched_ctx_contains_worker(worker, sc_w->sched_ctx);
+					if(!worker_in_ctx)
+					{
+						double transfer_velocity = starpu_get_bandwidth_RAM_CUDA(worker) / 1000;
+						velocity[s][w] = (velocity[s][w] * transfer_velocity) / (velocity[s][w] + transfer_velocity);
+					}
+				}
+
+			}
+			
+//			printf("v[w%d][s%d] = %lf\n",w, s, velocity[s][w]);
+		}
+		struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(sched_ctxs[s]);
+		flops[s] = config->ispeed_ctx_sample/1000000000; /* in gflops */
+	}
+	
+	/* take the exec time of the slowest ctx
+	   as the starting point and then try to minimize it
+	   while increasing it a little for the faster ctxs */
+	double tmax = sc_hypervisor_get_slowest_ctx_exec_time();
+ 	double smallest_tmax = sc_hypervisor_get_fastest_ctx_exec_time(); //tmax - 0.5*tmax; 
+//	printf("tmax %lf smallest %lf\n", tmax, smallest_tmax);
+	double tmin = 0.0;
+
+        struct ispeed_lp_data specific_data;
+        specific_data.velocity = velocity;
+        specific_data.flops = flops;
+        specific_data.flops_on_w = flops_on_w;
+        specific_data.workers = workers;
+
+        unsigned found_sol = sc_hypervisor_lp_execute_dichotomy(ns, nw, w_in_s, 1, (void*)&specific_data, 
+								tmin, tmax, smallest_tmax, _glp_resolve);
+
+	for(i = 0; i < ns; i++)
+		free(velocity[i]);
+	free(velocity);
+	
+	return found_sol;
+}
+
+
 
 static void ispeed_lp_handle_poped_task(unsigned sched_ctx, int worker, struct starpu_task *task, uint32_t footprint)
 {
-	struct sched_ctx_hypervisor_wrapper* sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctx);
-	_get_velocity_per_worker(sc_w, worker);
+	struct sc_hypervisor_wrapper* sc_w = sc_hypervisor_get_wrapper(sched_ctx);
+	sc_hypervisor_get_velocity_per_worker(sc_w, worker);
 	int ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
 	if(ret != EBUSY)
 	{
-		if(_velocity_gap_btw_ctxs())
+		if(sc_hypervisor_has_velocity_gap_btw_ctxs())
 		{
-			int ns = sched_ctx_hypervisor_get_nsched_ctxs();
+			int ns = sc_hypervisor_get_nsched_ctxs();
 			int nw = starpu_worker_get_count(); /* Number of different workers */
 
 			double w_in_s[ns][nw];
-			double flops_on_w[ns][nw];
+//			double flops_on_w[ns][nw];
+			double **flops_on_w = (double**)malloc(ns*sizeof(double*));
+			int i;
+			for(i = 0; i < ns; i++)
+				flops_on_w[i] = (double*)malloc(nw*sizeof(double));
 
+			printf("ns = %d nw = %d\n", ns, nw);
 			unsigned found_sol = _compute_flops_distribution_over_ctxs(ns, nw,  w_in_s, flops_on_w, NULL, NULL);
			/* if we did find at least one solution, redistribute the resources */
 			if(found_sol)
@@ -407,9 +387,11 @@ static void ispeed_lp_handle_poped_task(unsigned sched_ctx, int worker, struct s
 /* 					printf("%d: cpus = %lf gpus = %lf cpus_round = %d gpus_round = %d\n", s, nworkers[s][1], nworkers[s][0], */
 /* 					       nworkers_rounded[s][1], nworkers_rounded[s][0]); */
 
-				_lp_redistribute_resources_in_ctxs(ns, 2, nworkers_rounded, nworkers);
-
+				sc_hypervisor_lp_redistribute_resources_in_ctxs(ns, 2, nworkers_rounded, nworkers);
 			}
+			for(i = 0; i < ns; i++)
+				free(flops_on_w[i]);
+			free(flops_on_w);
 		}
 		starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
 	}
@@ -417,7 +399,7 @@ static void ispeed_lp_handle_poped_task(unsigned sched_ctx, int worker, struct s
 
 static void ispeed_lp_end_ctx(unsigned sched_ctx)
 {
-	struct sched_ctx_hypervisor_wrapper* sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctx);
+	struct sc_hypervisor_wrapper* sc_w = sc_hypervisor_get_wrapper(sched_ctx);
 	int worker;
 /* 	for(worker = 0; worker < 12; worker++) */
 /* 		printf("%d/%d: speed %lf\n", worker, sched_ctx, sc_w->ref_velocity[worker]); */
@@ -425,7 +407,7 @@ static void ispeed_lp_end_ctx(unsigned sched_ctx)
 	return;
 }
 
-struct sched_ctx_hypervisor_policy ispeed_lp_policy = {
+struct sc_hypervisor_policy ispeed_lp_policy = {
 	.size_ctxs = NULL,
 	.handle_poped_task = ispeed_lp_handle_poped_task,
 	.handle_pushed_task = NULL,

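A recurring mechanical change in the ispeed_lp rewrite above is replacing stack VLAs such as `double flops_on_w[ns][nw]` with heap-allocated arrays of row pointers, so the matrices fit behind the `void *specific_data` handle passed to the generic solver. A minimal sketch of the allocate/free pattern in use; error handling is omitted, as in the surrounding code:

#include <stdlib.h>

static double **alloc_matrix(int ns, int nw)
{
	double **m = (double **)malloc(ns * sizeof(double *));
	int s, w;
	for (s = 0; s < ns; s++)
	{
		m[s] = (double *)malloc(nw * sizeof(double));
		for (w = 0; w < nw; w++)
			m[s][w] = 0.0; /* the policies start from zeroed drafts */
	}
	return m;
}

static void free_matrix(double **m, int ns)
{
	int s;
	for (s = 0; s < ns; s++)
		free(m[s]);
	free(m);
}
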
+ 19 - 19
sched_ctx_hypervisor/src/hypervisor_policies/ispeed_policy.c

@@ -14,12 +14,12 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "policy_tools.h"
+#include "sc_hypervisor_policy.h"
 
 static unsigned _get_fastest_sched_ctx(void)
 {
-	int *sched_ctxs = sched_ctx_hypervisor_get_sched_ctxs();
-	int nsched_ctxs = sched_ctx_hypervisor_get_nsched_ctxs();
+	int *sched_ctxs = sc_hypervisor_get_sched_ctxs();
+	int nsched_ctxs = sc_hypervisor_get_nsched_ctxs();
 
 	int fastest_sched_ctx = STARPU_NMAX_SCHED_CTXS;
 	double curr_velocity = 0.0;
@@ -27,7 +27,7 @@ static unsigned _get_fastest_sched_ctx(void)
 	int i;
 	for(i = 0; i < nsched_ctxs; i++)
 	{
-		curr_velocity = _get_ctx_velocity(sched_ctx_hypervisor_get_wrapper(sched_ctxs[i]));
+		curr_velocity = sc_hypervisor_get_ctx_velocity(sc_hypervisor_get_wrapper(sched_ctxs[i]));
 		if( curr_velocity > biggest_velocity)
 		{
 			fastest_sched_ctx = sched_ctxs[i];
@@ -40,16 +40,16 @@ static unsigned _get_fastest_sched_ctx(void)
 
 static unsigned _get_slowest_sched_ctx(void)
 {
-	int *sched_ctxs = sched_ctx_hypervisor_get_sched_ctxs();
-	int nsched_ctxs = sched_ctx_hypervisor_get_nsched_ctxs();
+	int *sched_ctxs = sc_hypervisor_get_sched_ctxs();
+	int nsched_ctxs = sc_hypervisor_get_nsched_ctxs();
 
-	double smallest_velocity = _get_ctx_velocity(sched_ctx_hypervisor_get_wrapper(sched_ctxs[0]));
+	double smallest_velocity = sc_hypervisor_get_ctx_velocity(sc_hypervisor_get_wrapper(sched_ctxs[0]));
 	unsigned slowest_sched_ctx = smallest_velocity == -1.0  ? STARPU_NMAX_SCHED_CTXS : sched_ctxs[0];
 	double curr_velocity = 0.0;
 	int i;
 	for(i = 1; i < nsched_ctxs; i++)
 	{
-		curr_velocity = _get_ctx_velocity(sched_ctx_hypervisor_get_wrapper(sched_ctxs[i]));
+		curr_velocity = sc_hypervisor_get_ctx_velocity(sc_hypervisor_get_wrapper(sched_ctxs[i]));
 		if((curr_velocity < smallest_velocity || smallest_velocity == 0.0) && curr_velocity != -1.0)
 		{
 			smallest_velocity = curr_velocity;
@@ -64,8 +64,8 @@ static unsigned _get_slowest_sched_ctx(void)
 /* get first nworkers with the highest idle time in the context */
 static int* _get_slowest_workers(unsigned sched_ctx, int *nworkers, enum starpu_archtype arch)
 {
-	struct sched_ctx_hypervisor_wrapper* sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctx);
-	struct sched_ctx_hypervisor_policy_config *config = sched_ctx_hypervisor_get_config(sched_ctx);
+	struct sc_hypervisor_wrapper* sc_w = sc_hypervisor_get_wrapper(sched_ctx);
+	struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(sched_ctx);
 
 	int *curr_workers = (int*)malloc((*nworkers) * sizeof(int));
 	int i;
@@ -104,7 +104,7 @@ static int* _get_slowest_workers(unsigned sched_ctx, int *nworkers, enum starpu_
 
 					if(!considered)
 					{
-						double worker_velocity = _get_velocity_per_worker(sc_w, worker);
+						double worker_velocity = sc_hypervisor_get_velocity_per_worker(sc_w, worker);
 						if(worker_velocity != -1.0)
 						{
 							/* the first iteration*/
@@ -119,7 +119,7 @@ static int* _get_slowest_workers(unsigned sched_ctx, int *nworkers, enum starpu_
 							else if(config->priority[worker] ==
 								config->priority[curr_workers[index]])
 							{
-								double curr_worker_velocity = _get_velocity_per_worker(sc_w, curr_workers[index]);
+								double curr_worker_velocity = sc_hypervisor_get_velocity_per_worker(sc_w, curr_workers[index]);
 //								printf("speed[%d] = %lf speed[%d] = %lf\n", worker, worker_velocity, curr_workers[index], curr_worker_velocity);
 								if(worker_velocity < curr_worker_velocity && curr_worker_velocity != -1.0)
 								{
@@ -146,13 +146,13 @@ static void ispeed_handle_poped_task(unsigned sched_ctx, int worker, struct star
 	int ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
 	if(ret != EBUSY)
 	{
-		if(_velocity_gap_btw_ctxs())
+		if(sc_hypervisor_has_velocity_gap_btw_ctxs())
 		{
 			unsigned fastest_sched_ctx = _get_fastest_sched_ctx();
 			unsigned slowest_sched_ctx = _get_slowest_sched_ctx();
 			if(fastest_sched_ctx != STARPU_NMAX_SCHED_CTXS && slowest_sched_ctx != STARPU_NMAX_SCHED_CTXS && fastest_sched_ctx != slowest_sched_ctx)
 			{
-				int nworkers_to_move = _get_nworkers_to_move(fastest_sched_ctx);
+				int nworkers_to_move = sc_hypervisor_compute_nworkers_to_move(fastest_sched_ctx);
 				if(nworkers_to_move > 0)
 				{
 					int *workers_to_move = _get_slowest_workers(fastest_sched_ctx, &nworkers_to_move, STARPU_ANY_WORKER);
@@ -161,14 +161,14 @@ static void ispeed_handle_poped_task(unsigned sched_ctx, int worker, struct star
 						double new_speed = 0.0;
 						int i;
 						for(i = 0; i < nworkers_to_move; i++)
-							new_speed += _get_velocity_per_worker(sched_ctx_hypervisor_get_wrapper(fastest_sched_ctx), workers_to_move[i]);
-						double fastest_speed = _get_ctx_velocity(sched_ctx_hypervisor_get_wrapper(fastest_sched_ctx));
-						double slowest_speed = _get_ctx_velocity(sched_ctx_hypervisor_get_wrapper(slowest_sched_ctx));
+							new_speed += sc_hypervisor_get_velocity_per_worker(sc_hypervisor_get_wrapper(fastest_sched_ctx), workers_to_move[i]);
+						double fastest_speed = sc_hypervisor_get_ctx_velocity(sc_hypervisor_get_wrapper(fastest_sched_ctx));
+						double slowest_speed = sc_hypervisor_get_ctx_velocity(sc_hypervisor_get_wrapper(slowest_sched_ctx));
 //						printf("fast_speed(%d) %lf slow_speed(%d) %lf new speed(%d) %lf \n", fastest_sched_ctx, fastest_speed, slowest_sched_ctx, 
 //						       slowest_speed, workers_to_move[0], new_speed);
 						if(fastest_speed != -1.0 && slowest_speed != -1.0 && (slowest_speed + new_speed) <= (fastest_speed - new_speed))
 						{
-							sched_ctx_hypervisor_move_workers(fastest_sched_ctx, slowest_sched_ctx, workers_to_move, nworkers_to_move, 0);
+							sc_hypervisor_move_workers(fastest_sched_ctx, slowest_sched_ctx, workers_to_move, nworkers_to_move, 0);
 						}
 					}
 					
@@ -181,7 +181,7 @@ static void ispeed_handle_poped_task(unsigned sched_ctx, int worker, struct star
 	}
 }
 
-struct sched_ctx_hypervisor_policy ispeed_policy = {
+struct sc_hypervisor_policy ispeed_policy = {
 	.size_ctxs = NULL,
 	.handle_poped_task = ispeed_handle_poped_task,
 	.handle_pushed_task = NULL,

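`_get_fastest_sched_ctx` and `_get_slowest_sched_ctx` above are the same scan with the comparison flipped; a velocity of -1.0 means no measurement exists yet and disqualifies a context. A condensed sketch of that scan over a plain array (illustrative, not the hypervisor API):

/* returns the index of the fastest (or slowest) measured context, -1 if none */
static int find_extreme_ctx(const double *velocity, int nctxs, int want_fastest)
{
	int i, best = -1;
	double best_v = 0.0;
	for (i = 0; i < nctxs; i++)
	{
		if (velocity[i] == -1.0)
			continue; /* not measured yet */
		if (best == -1 ||
		    ( want_fastest && velocity[i] > best_v) ||
		    (!want_fastest && velocity[i] < best_v))
		{
			best = i;
			best_v = velocity[i];
		}
	}
	return best;
}
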
+ 155 - 255
sched_ctx_hypervisor/src/hypervisor_policies/lp2_policy.c

@@ -15,234 +15,28 @@
  */
 
 #include <starpu_config.h>
-#include "lp_tools.h"
+#include "sc_hypervisor_lp.h"
+#include "sc_hypervisor_policy.h"
 #include <math.h>
+#include <sys/time.h>
 
-static struct bound_task_pool *task_pools = NULL;
+static struct sc_hypervisor_policy_task_pool *task_pools = NULL;
 
 static starpu_pthread_mutex_t mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
-static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double tmax, double w_in_s[ns][nw], int *in_sched_ctxs, int *workers, unsigned interger,
-			   struct bound_task_pool *tmp_task_pools, unsigned size_ctxs);
-static unsigned _compute_task_distribution_over_ctxs(int ns, int nw, int nt, double w_in_s[ns][nw], double tasks[nw][nt], 
-						     int *sched_ctxs, int *workers, struct bound_task_pool *tmp_task_pools, unsigned size_ctxs)
-{
-	double draft_tasks[nw][nt];
-	double draft_w_in_s[ns][nw];
-
-	int w,t, s;
-	for(w = 0; w < nw; w++)
-		for(t = 0; t < nt; t++)
-		{
-			tasks[w][t] = 0.0;
-			draft_tasks[w][t] = 0.0;
-		}
-
-	for(s = 0; s < ns; s++)
-		for(w = 0; w < nw; w++)
-		{
-			w_in_s[s][w] = 0.0;
-			draft_w_in_s[s][w] = 0.0;
-		}
-
-	/* smallest possible tmax, difficult to obtain as we
-	   compute the nr of flops and not the tasks */
-	double possible_tmax = _lp_get_tmax(nw, workers);
-	double smallest_tmax = possible_tmax / 3;
-	double tmax = possible_tmax * ns;
-	double res = 1.0;
-	unsigned has_sol = 0;
-	double tmin = smallest_tmax;
-	double old_tmax = 0.0;
-	unsigned found_sol = 0;
-
-//	printf("tmin = %lf tmax = %lf \n", tmin, tmax);
-	struct timeval start_time;
-	struct timeval end_time;
-	int nd = 0;
-	gettimeofday(&start_time, NULL);
-
-	/* we fix tmax and we do not treat it as an unknown
-	   we just vary by dichotomy its values*/
-	while(tmax > 1.0)
-	{
-		/* find solution and save the values in draft tables
-		   only if there is a solution for the system we save them
-		   in the proper table */
-		res = _glp_resolve(ns, nw, nt, draft_tasks, tmax, draft_w_in_s, sched_ctxs, workers, 1, tmp_task_pools, size_ctxs);
-		if(res != 0.0)
-		{
-			for(w = 0; w < nw; w++)
-				for(t = 0; t < nt; t++)
-					tasks[w][t] = draft_tasks[w][t];
-			for(s = 0; s < ns; s++)
-				for(w = 0; w < nw; w++)
-					w_in_s[s][w] = draft_w_in_s[s][w];
-			has_sol = 1;
-			found_sol = 1;
-		}
-		else
-			has_sol = 0;
-
-		/* if we have a solution with this tmax try a smaller value
-		   bigger than the old min */
-		if(has_sol)
-		{
-			if(old_tmax != 0.0 && (old_tmax - tmax) < 0.5)
-				break;
-			old_tmax = tmax;
-		}
-		else /*else try a bigger one but smaller than the old tmax */
-		{
-			tmin = tmax;
-			if(old_tmax != 0.0)
-				tmax = old_tmax;
-		}
-		if(tmin == tmax) break;
-		tmax = _find_tmax(tmin, tmax);
-
-		if(tmax < smallest_tmax)
-		{
-			tmax = old_tmax;
-			tmin = smallest_tmax;
-			tmax = _find_tmax(tmin, tmax);
-		}
-		nd++;
-	}
-	gettimeofday(&end_time, NULL);
-
-	long diff_s = end_time.tv_sec  - start_time.tv_sec;
-	long diff_us = end_time.tv_usec  - start_time.tv_usec;
-
-	float timing = (float)(diff_s*1000000 + diff_us)/1000;
-
-//        fprintf(stdout, "nd = %d total time: %f ms \n", nd, timing);
-	return found_sol;
-}
-
-
-static void _size_ctxs(int *sched_ctxs, int nsched_ctxs , int *workers, int nworkers)
-{
-	int ns = sched_ctxs == NULL ? sched_ctx_hypervisor_get_nsched_ctxs() : nsched_ctxs;
-	int nw = workers == NULL ? (int)starpu_worker_get_count() : nworkers; /* Number of different workers */
-	int nt = 0; /* Number of different kinds of tasks */
-	starpu_pthread_mutex_lock(&mutex);
-	struct bound_task_pool * tp;
-	for (tp = task_pools; tp; tp = tp->next)
-		nt++;
-
-	double w_in_s[ns][nw];
-	double tasks[nw][nt];
-	unsigned found_sol = _compute_task_distribution_over_ctxs(ns, nw, nt, w_in_s, tasks, sched_ctxs, workers, task_pools, 1);
-	starpu_pthread_mutex_unlock(&mutex);
-	/* if we did find at least one solution redistribute the resources */
-	if(found_sol)
-		_lp_place_resources_in_ctx(ns, nw, w_in_s, sched_ctxs, workers, 1);
-}
-
-static void size_if_required()
-{
-	int nsched_ctxs, nworkers;
-	int *sched_ctxs, *workers;
-	unsigned has_req = sched_ctx_hypervisor_get_size_req(&sched_ctxs, &nsched_ctxs, &workers, &nworkers);
-
-	if(has_req)
-	{
-		struct sched_ctx_hypervisor_wrapper* sc_w = NULL;
-		unsigned ready_to_size = 1;
-		int s;
-		starpu_pthread_mutex_lock(&act_hypervisor_mutex);
-		for(s = 0; s < nsched_ctxs; s++)
-		{
-			sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctxs[s]);
-			if(sc_w->submitted_flops < sc_w->total_flops)
-				ready_to_size = 0;
-		}
-
-		if(ready_to_size)
-			_size_ctxs(sched_ctxs, nsched_ctxs, workers, nworkers);
-		starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
-	}
-}
-
-static void lp2_handle_submitted_job(struct starpu_task *task, uint32_t footprint)
-{
-	/* count the tasks of the same type */
-	starpu_pthread_mutex_lock(&mutex);
-	struct bound_task_pool *tp = NULL;
-
-	for (tp = task_pools; tp; tp = tp->next)
-	{
-		if (tp && tp->cl == task->cl && tp->footprint == footprint && tp->sched_ctx_id == task->sched_ctx)
-			break;
-	}
-
-	if (!tp)
-	{
-		tp = (struct bound_task_pool *) malloc(sizeof(struct bound_task_pool));
-		tp->cl = task->cl;
-		tp->footprint = footprint;
-		tp->sched_ctx_id = task->sched_ctx;
-		tp->n = 0;
-		tp->next = task_pools;
-		task_pools = tp;
-	}
-
-	/* One more task of this kind */
-	tp->n++;
-	starpu_pthread_mutex_unlock(&mutex);
-
-	size_if_required();
-}
 
-static void _remove_task_from_pool(struct starpu_task *task, uint32_t footprint)
+struct teft_lp_data 
 {
-	/* count the tasks of the same type */
-	struct bound_task_pool *tp = NULL;
-
-	for (tp = task_pools; tp; tp = tp->next)
-	{
-		if (tp && tp->cl == task->cl && tp->footprint == footprint && tp->sched_ctx_id == task->sched_ctx)
-			break;
-	}
-
-	if (tp)
-	{
-		if(tp->n > 1)
-			tp->n--;
-		else
-		{
-			if(tp == task_pools)
-			{
-				struct bound_task_pool *next_tp = NULL;
-				if(task_pools->next)
-					next_tp = task_pools->next;
-
-				free(tp);
-				tp = NULL;
-				
-				if(next_tp)
-					task_pools = next_tp;
-				
-			}
-			else
-			{
-				struct bound_task_pool *prev_tp = NULL;
-				for (prev_tp = task_pools; prev_tp; prev_tp = prev_tp->next)
-				{
-					if (prev_tp->next == tp)
-						prev_tp->next = tp->next;
-				}
-				
-				free(tp);
-				tp = NULL;
-			}
-		}
-	}
-}
+	int nt;
+	double **tasks;
+	int *in_sched_ctxs;
+	int *workers;
+	struct sc_hypervisor_policy_task_pool *tmp_task_pools;
+	unsigned size_ctxs;
+};
 
 static void _get_tasks_times(int nw, int nt, double times[nw][nt], int *workers, unsigned size_ctxs)
 {
-        struct bound_task_pool *tp;
+        struct sc_hypervisor_policy_task_pool *tp;
         int w, t;
         for (w = 0; w < nw; w++)
         {
@@ -279,17 +73,31 @@ static void _get_tasks_times(int nw, int nt, double times[nw][nt], int *workers,
         }
 }
 
+
+
 /*
  * GNU Linear Programming Kit backend
  */
 #ifdef STARPU_HAVE_GLPK_H
 #include <glpk.h>
-static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double tmax, double w_in_s[ns][nw], int *in_sched_ctxs, int *workers, unsigned integer,
-			   struct bound_task_pool *tmp_task_pools, unsigned size_ctxs)
+static double _glp_resolve(int ns, int nw, double final_w_in_s[ns][nw], 
+			   unsigned is_integer, double tmax, void *specific_data)
 {
+	struct teft_lp_data *sd = (struct teft_lp_data *)specific_data;
+
+	int nt = sd->nt;
+	double **final_tasks = sd->tasks;
+	int *in_sched_ctxs = sd->in_sched_ctxs;
+	int *workers = sd->workers;
+	struct sc_hypervisor_policy_task_pool *tmp_task_pools = sd->tmp_task_pools;
+	unsigned size_ctxs = sd->size_ctxs;
+	
+	double w_in_s[ns][nw];
+	double tasks[nw][nt];
+	
 	if(tmp_task_pools == NULL)
 		return 0.0;
-	struct bound_task_pool * tp;
+	struct sc_hypervisor_policy_task_pool * tp;
 	int t, w, s;
 	glp_prob *lp;
 
@@ -337,7 +145,7 @@ static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double
 				char name[32];
 				snprintf(name, sizeof(name), "w%ds%dn", w, s);
 				glp_set_col_name(lp, nw*nt+s*nw+w+1, name);
-				if (integer)
+				if (is_integer)
                                 {
                                         glp_set_col_kind(lp, nw*nt+s*nw+w+1, GLP_IV);
                                         glp_set_col_bnds(lp, nw*nt+s*nw+w+1, GLP_DB, 0, 1);
@@ -346,7 +154,7 @@ static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double
 					glp_set_col_bnds(lp, nw*nt+s*nw+w+1, GLP_DB, 0.0, 1.0);
 			}
 
-		int *sched_ctxs = in_sched_ctxs == NULL ? sched_ctx_hypervisor_get_sched_ctxs() : in_sched_ctxs;
+		int *sched_ctxs = in_sched_ctxs == NULL ? sc_hypervisor_get_sched_ctxs() : in_sched_ctxs;
 
 		int curr_row_idx = 0;
 		/* Total worker execution time */
@@ -433,7 +241,7 @@ static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double
 				ar[n] = 1;
 				n++;
 			}
-			if(integer)
+			if(is_integer)
                                 glp_set_row_bnds(lp, curr_row_idx+w+1, GLP_FX, 1, 1);
 			else
 				glp_set_row_bnds(lp, curr_row_idx+w+1, GLP_FX, 1.0, 1.0);
@@ -474,7 +282,7 @@ static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double
 	}
 
 
-	if (integer)
+	if (is_integer)
         {
                 glp_iocp iocp;
                 glp_init_iocp(&iocp);
@@ -503,7 +311,7 @@ static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double
 	for(s = 0; s < ns; s++)
 		for(w = 0; w < nw; w++)
 		{
-			if (integer)
+			if (is_integer)
 				w_in_s[s][w] = (double)glp_mip_col_val(lp, nw*nt+s*nw+w+1);
                         else
 				w_in_s[s][w] = glp_get_col_prim(lp, nw*nt+s*nw+w+1);
@@ -512,22 +320,94 @@ static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double
 //	printf("\n");
 
 	glp_delete_prob(lp);
+	if(res != 0.0)
+	{
+		for(s = 0; s < ns; s++)
+			for(w = 0; w < nw; w++)
+				final_w_in_s[s][w] = w_in_s[s][w];
+
+		for(w = 0; w < nw; w++)
+			for(t = 0; t < nt; t++)
+				final_tasks[w][t] = tasks[w][t];
+	}
 	return res;
 }
+	
+static void _size_ctxs(int *sched_ctxs, int nsched_ctxs , int *workers, int nworkers)
+{
+	int ns = sched_ctxs == NULL ? sc_hypervisor_get_nsched_ctxs() : nsched_ctxs;
+	int nw = workers == NULL ? (int)starpu_worker_get_count() : nworkers; /* Number of different workers */
+	int nt = 0; /* Number of different kinds of tasks */
+	starpu_pthread_mutex_lock(&mutex);
+	struct sc_hypervisor_policy_task_pool * tp;
+	for (tp = task_pools; tp; tp = tp->next)
+		nt++;
 
-static struct bound_task_pool* _clone_linked_list(struct bound_task_pool *tp)
+	double w_in_s[ns][nw];
+	double tasks[nw][nt];
+
+	struct teft_lp_data specific_data;
+	specific_data.nt = nt;
+	specific_data.tasks = tasks;
+	specific_data.in_sched_ctxs = sched_ctxs;
+	specific_data.workers = workers;
+	specific_data.tmp_task_pools = task_pools;
+	specific_data.size_ctxs = 1;
+
+	/* smallest possible tmax, difficult to obtain as we
+	   compute the number of flops and not the tasks */
+	double possible_tmax = sc_hypervisor_lp_get_tmax(nw, workers);
+	double smallest_tmax = possible_tmax / 3;
+	double tmax = possible_tmax * ns;
+	double tmin = smallest_tmax;
+
+	unsigned found_sol = sc_hypervisor_lp_execute_dichotomy(ns, nw, w_in_s, 1, (void*)&specific_data, 
+								tmin, tmax, smallest_tmax, _glp_resolve);
+
+	starpu_pthread_mutex_unlock(&mutex);
+	/* if we did find at least one solution, redistribute the resources */
+	if(found_sol)
+		sc_hypervisor_lp_place_resources_in_ctx(ns, nw, w_in_s, sched_ctxs, workers, 1);
+}
+
+static void size_if_required()
+{
+	int nsched_ctxs, nworkers;
+	int *sched_ctxs, *workers;
+	unsigned has_req = sc_hypervisor_get_size_req(&sched_ctxs, &nsched_ctxs, &workers, &nworkers);
+
+	if(has_req)
+	{
+		struct sc_hypervisor_wrapper* sc_w = NULL;
+		unsigned ready_to_size = 1;
+		int s;
+		starpu_pthread_mutex_lock(&act_hypervisor_mutex);
+		for(s = 0; s < nsched_ctxs; s++)
+		{
+			sc_w = sc_hypervisor_get_wrapper(sched_ctxs[s]);
+			if(sc_w->submitted_flops < sc_w->total_flops)
+				ready_to_size = 0;
+		}
+
+		if(ready_to_size)
+			_size_ctxs(sched_ctxs, nsched_ctxs, workers, nworkers);
+		starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
+	}
+}
+
+static void teft_lp_handle_submitted_job(struct starpu_codelet *cl, unsigned sched_ctx, uint32_t footprint)
 {
-	if(tp == NULL) return NULL;
+	/* count the tasks of the same type */
+	starpu_pthread_mutex_lock(&mutex);
+	sc_hypervisor_policy_add_task_to_pool(cl, sched_ctx, footprint, &task_pools);
+	starpu_pthread_mutex_unlock(&mutex);
 
-	struct bound_task_pool *tmp_tp = (struct bound_task_pool*)malloc(sizeof(struct bound_task_pool));
-	memcpy(tmp_tp, tp, sizeof(struct bound_task_pool));
-	tmp_tp->next = _clone_linked_list(tp->next);
-	return tmp_tp;
+	size_if_required();
 }
 
-static void lp2_handle_poped_task(unsigned sched_ctx, int worker, struct starpu_task *task, uint32_t footprint)
+static void teft_lp_handle_poped_task(unsigned sched_ctx, int worker, struct starpu_task *task, uint32_t footprint)
 {
-	struct sched_ctx_hypervisor_wrapper* sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctx);
+	struct sc_hypervisor_wrapper* sc_w = sc_hypervisor_get_wrapper(sched_ctx);
 
 	int ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
 	if(ret != EBUSY)
@@ -538,9 +418,9 @@ static void lp2_handle_poped_task(unsigned sched_ctx, int worker, struct starpu_
 			return;
 		}
 
-		if(_velocity_gap_btw_ctxs())
+		if(sc_hypervisor_has_velocity_gap_btw_ctxs())
 		{
-			int ns = sched_ctx_hypervisor_get_nsched_ctxs();
+			int ns = sc_hypervisor_get_nsched_ctxs();
 			int nw = starpu_worker_get_count(); /* Number of different workers */
 			int nt = 0; /* Number of different kinds of tasks */
 
@@ -551,25 +431,44 @@ static void lp2_handle_poped_task(unsigned sched_ctx, int worker, struct starpu_
			   that the linear program won't segfault if the list of
			   submitted tasks changes during the exec */
 
-			struct bound_task_pool *tp = NULL;
-			struct bound_task_pool *tmp_task_pools = _clone_linked_list(task_pools);
+			struct sc_hypervisor_policy_task_pool *tp = NULL;
+			struct sc_hypervisor_policy_task_pool *tmp_task_pools = sc_hypervisor_policy_clone_task_pool(task_pools);
 
 			for (tp = task_pools; tp; tp = tp->next)
 				nt++;
 
 
 			double w_in_s[ns][nw];
-			double tasks_per_worker[nw][nt];
-
-			unsigned found_sol = _compute_task_distribution_over_ctxs(ns, nw, nt, w_in_s, tasks_per_worker, NULL, NULL, tmp_task_pools, 0);
+//			double tasks_per_worker[nw][nt];
+			double **tasks_per_worker=(double**)malloc(nw*sizeof(double*));
+			int i;
+			for(i = 0; i < nw; i++)
+				tasks_per_worker[i] = (double*)malloc(nt*sizeof(double));
+
+			struct teft_lp_data specific_data;
+			specific_data.nt = nt;
+			specific_data.tasks = tasks_per_worker;
+			specific_data.in_sched_ctxs = NULL;
+			specific_data.workers = NULL;
+			specific_data.tmp_task_pools = tmp_task_pools;
+			specific_data.size_ctxs = 0;
+
+			/* smallest possible tmax, difficult to obtain as we
+			   compute the number of flops and not the tasks */
+			double possible_tmax = sc_hypervisor_lp_get_tmax(nw, NULL);
+			double smallest_tmax = possible_tmax / 3;
+			double tmax = possible_tmax * ns;
+			double tmin = smallest_tmax;
+			unsigned found_sol = sc_hypervisor_lp_execute_dichotomy(ns, nw, w_in_s, 1, (void*)&specific_data, 
+								tmin, tmax, smallest_tmax, _glp_resolve);
 //			starpu_pthread_mutex_unlock(&mutex);
 
			/* if we did find at least one solution, redistribute the resources */
 			if(found_sol)
-				_lp_place_resources_in_ctx(ns, nw, w_in_s, NULL, NULL, 0);
+				sc_hypervisor_lp_place_resources_in_ctx(ns, nw, w_in_s, NULL, NULL, 0);
 
-			struct bound_task_pool *next = NULL;
-			struct bound_task_pool *tmp_tp = tmp_task_pools;
+			struct sc_hypervisor_policy_task_pool *next = NULL;
+			struct sc_hypervisor_policy_task_pool *tmp_tp = tmp_task_pools;
 			while(tmp_task_pools)
 			{
 				next = tmp_tp->next;
@@ -577,35 +476,36 @@ static void lp2_handle_poped_task(unsigned sched_ctx, int worker, struct starpu_
 				tmp_tp = next;
 				tmp_task_pools = next;
 			}
-			
-
+			for(i = 0; i < nw; i++)
+				free(tasks_per_worker[i]);
+			free(tasks_per_worker);
 		}
 		starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
 	}
	/* too expensive to take this mutex; a correct value for the number of tasks is not compulsory */
 //	starpu_pthread_mutex_lock(&mutex);
-	_remove_task_from_pool(task, footprint);
+	sc_hypervisor_policy_remove_task_from_pool(task, footprint, &task_pools);
 //	starpu_pthread_mutex_unlock(&mutex);
 
 }
 
 
-static void lp2_size_ctxs(int *sched_ctxs, int nsched_ctxs , int *workers, int nworkers)
+static void teft_lp_size_ctxs(int *sched_ctxs, int nsched_ctxs , int *workers, int nworkers)
 {
-	sched_ctx_hypervisor_save_size_req(sched_ctxs, nsched_ctxs, workers, nworkers);
+	sc_hypervisor_save_size_req(sched_ctxs, nsched_ctxs, workers, nworkers);
 }
 
-struct sched_ctx_hypervisor_policy lp2_policy = {
-	.size_ctxs = lp2_size_ctxs,
-	.handle_poped_task = lp2_handle_poped_task,
+struct sc_hypervisor_policy teft_lp_policy = {
+	.size_ctxs = teft_lp_size_ctxs,
+	.handle_poped_task = teft_lp_handle_poped_task,
 	.handle_pushed_task = NULL,
 	.handle_idle_cycle = NULL,
 	.handle_idle_end = NULL,
 	.handle_post_exec_hook = NULL,
-	.handle_submitted_job = lp2_handle_submitted_job,
+	.handle_submitted_job = teft_lp_handle_submitted_job,
 	.end_ctx = NULL,
 	.custom = 0,
-	.name = "lp2"
+	.name = "teft_lp"
 };
 
 #endif /* STARPU_HAVE_GLPK_H */

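Besides the lp2 -> teft_lp rename, the hunk above retires the policy's inline task-pool bookkeeping in favour of `sc_hypervisor_policy_add_task_to_pool`, `sc_hypervisor_policy_remove_task_from_pool` and `sc_hypervisor_policy_clone_task_pool`. Judging from the removed code, the pool is a singly linked list keyed by (codelet, footprint, context) with a per-kind counter; a hedged sketch of the add path under that assumption, with illustrative names throughout:

#include <stdint.h>
#include <stdlib.h>

struct task_pool
{
	void *cl;            /* codelet identity */
	uint32_t footprint;
	unsigned sched_ctx;
	int n;               /* tasks of this kind currently submitted */
	struct task_pool *next;
};

static void pool_add(struct task_pool **head, void *cl,
		     unsigned sched_ctx, uint32_t footprint)
{
	struct task_pool *tp;
	for (tp = *head; tp; tp = tp->next)
		if (tp->cl == cl && tp->footprint == footprint && tp->sched_ctx == sched_ctx)
			break;
	if (!tp)
	{
		/* first task of this kind: prepend a new pool entry */
		tp = (struct task_pool *)malloc(sizeof(*tp));
		tp->cl = cl;
		tp->footprint = footprint;
		tp->sched_ctx = sched_ctx;
		tp->n = 0;
		tp->next = *head;
		*head = tp;
	}
	tp->n++;
}
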
+ 88 - 0
sc_hypervisor/src/policies_utils/dichotomy.c

@@ -0,0 +1,88 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011 - 2013  INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "sc_hypervisor_lp.h"
+#include "sc_hypervisor_policy.h"
+#include <math.h>
+#include <sys/time.h>
+
+unsigned sc_hypervisor_lp_execute_dichotomy(int ns, int nw, double w_in_s[ns][nw], unsigned solve_lp_integer, void *specific_data,
+					    double tmin, double tmax, double smallest_tmax,
+					    double (*lp_estimated_distrib_func)(int ns, int nw, double draft_w_in_s[ns][nw], 
+									     unsigned is_integer, double tmax, void *specifc_data))
+{
+	double res = 1.0;
+	unsigned has_sol = 0;
+	double old_tmax = 0.0;
+	unsigned found_sol = 0;
+
+	struct timeval start_time;
+	struct timeval end_time;
+	int nd = 0;
+	gettimeofday(&start_time, NULL);
+
+	/* we fix tmax and do not treat it as an unknown;
+	   we just vary its value by dichotomy */
+	while(tmax > 1.0)
+	{
+		/* ask the solver for a solution with this tmax; only when
+		   the system is feasible are the values saved into the
+		   final table */
+		res = lp_estimated_distrib_func(ns, nw, w_in_s, solve_lp_integer, tmax, specific_data);
+		if(res != 0.0)
+		{
+			has_sol = 1;
+			found_sol = 1;
+		}
+		else
+			has_sol = 0;
+
+		/* if we have a solution with this tmax, try a smaller value
+		   still bigger than the old min */
+		if(has_sol)
+		{
+			if(old_tmax != 0.0 && (old_tmax - tmax) < 0.5)
+				break;
+			old_tmax = tmax;
+		}
+		else /* otherwise try a bigger one, still smaller than the old tmax */
+		{
+			tmin = tmax;
+			if(old_tmax != 0.0)
+				tmax = old_tmax;
+		}
+		if(tmin == tmax) break;
+		tmax = sc_hypervisor_lp_find_tmax(tmin, tmax);
+
+		if(tmax < smallest_tmax)
+		{
+			tmax = old_tmax;
+			tmin = smallest_tmax;
+			tmax = sc_hypervisor_lp_find_tmax(tmin, tmax);
+		}
+		nd++;
+	}
+	gettimeofday(&end_time, NULL);
+
+	long diff_s = end_time.tv_sec  - start_time.tv_sec;
+	long diff_us = end_time.tv_usec  - start_time.tv_usec;
+
+	float timing = (float)(diff_s*1000000 + diff_us)/1000;
+
+//        fprintf(stdout, "nd = %d total time: %f ms \n", nd, timing);
+	return found_sol;
+}
+

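The new driver above owns nothing but the search over tmax; everything problem-specific reaches it through the solver callback and the opaque `specific_data` pointer. A hedged usage sketch with a toy feasibility callback standing in for a real GLPK solve; the 4.0 threshold and the [1.0, 16.0] search window are arbitrary:

#include "sc_hypervisor_lp.h"

/* toy solver: pretends a schedule exists whenever tmax is large enough,
 * and fills w_in_s uniformly when it does */
static double resolve_cb(int ns, int nw, double draft_w_in_s[ns][nw],
			 unsigned is_integer, double tmax, void *specific_data)
{
	(void)is_integer;
	(void)specific_data;
	if (tmax < 4.0)
		return 0.0; /* no solution below the toy threshold */
	int s, w;
	for (s = 0; s < ns; s++)
		for (w = 0; w < nw; w++)
			draft_w_in_s[s][w] = 1.0 / ns;
	return 1.0;
}

static void example_call(void)
{
	int ns = 2, nw = 4;
	double w_in_s[ns][nw];
	unsigned found = sc_hypervisor_lp_execute_dichotomy(ns, nw, w_in_s, 1, NULL,
							    1.0, 16.0, 2.0, resolve_cb);
	(void)found; /* w_in_s holds the last feasible distribution when found != 0 */
}
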
+ 64 - 47
sched_ctx_hypervisor/src/hypervisor_policies/lp_tools.c

@@ -15,12 +15,13 @@
  */
 
 #include <math.h>
-#include "lp_tools.h"
+#include "sc_hypervisor_lp.h"
+#include "sc_hypervisor_policy.h"
 #include <starpu_config.h>
 
 #ifdef STARPU_HAVE_GLPK_H
 
-double _lp_compute_nworkers_per_ctx(int ns, int nw, double v[ns][nw], double flops[ns], double res[ns][nw], int  total_nw[nw])
+double sc_hypervisor_lp_compute_nworkers_per_ctx(int ns, int nw, double v[ns][nw], double flops[ns], double res[ns][nw], int  total_nw[nw])
 {
 	int integer = 1;
 	int s, w;
@@ -220,44 +221,53 @@ double _lp_compute_nworkers_per_ctx(int ns, int nw, double v[ns][nw], double flo
 
 #endif //STARPU_HAVE_GLPK_H
 
-double _lp_get_nworkers_per_ctx(int nsched_ctxs, int ntypes_of_workers, double res[nsched_ctxs][ntypes_of_workers], int total_nw[ntypes_of_workers])
+double sc_hypervisor_lp_get_nworkers_per_ctx(int nsched_ctxs, int ntypes_of_workers, double res[nsched_ctxs][ntypes_of_workers], int total_nw[ntypes_of_workers])
 {
-	int *sched_ctxs = sched_ctx_hypervisor_get_sched_ctxs();
+	int *sched_ctxs = sc_hypervisor_get_sched_ctxs();
 #ifdef STARPU_HAVE_GLPK_H
 	double v[nsched_ctxs][ntypes_of_workers];
 	double flops[nsched_ctxs];
 
 	int i = 0;
-	struct sched_ctx_hypervisor_wrapper* sc_w;
+	struct sc_hypervisor_wrapper* sc_w;
 	for(i = 0; i < nsched_ctxs; i++)
 	{
-		sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctxs[i]);
-		v[i][0] = sched_ctx_hypervisor_get_velocity(sc_w, STARPU_CUDA_WORKER);
-		v[i][1] = sched_ctx_hypervisor_get_velocity(sc_w, STARPU_CPU_WORKER);
-
+		sc_w = sc_hypervisor_get_wrapper(sched_ctxs[i]);
+#ifdef STARPU_USE_CUDA
+		int ncuda = starpu_worker_get_count_by_type(STARPU_CUDA_WORKER);
+		if(ncuda != 0)
+		{
+			v[i][0] = sc_hypervisor_get_velocity(sc_w, STARPU_CUDA_WORKER);
+			v[i][1] = sc_hypervisor_get_velocity(sc_w, STARPU_CPU_WORKER);
+		}
+		else
+			v[i][0] = sc_hypervisor_get_velocity(sc_w, STARPU_CPU_WORKER);
+#else
+		v[i][0] = sc_hypervisor_get_velocity(sc_w, STARPU_CPU_WORKER);
+#endif // STARPU_USE_CUDA
 		flops[i] = sc_w->remaining_flops/1000000000; //sc_w->total_flops/1000000000; /* in gflops*/
 //		printf("%d: flops %lf\n", sched_ctxs[i], flops[i]);
 	}
 
-	return 1/_lp_compute_nworkers_per_ctx(nsched_ctxs, ntypes_of_workers, v, flops, res, total_nw);
-#else
+	return 1/sc_hypervisor_lp_compute_nworkers_per_ctx(nsched_ctxs, ntypes_of_workers, v, flops, res, total_nw);
+#else//STARPU_HAVE_GLPK_H
 	return 0.0;
-#endif
+#endif//STARPU_HAVE_GLPK_H
 }
 
-double _lp_get_tmax(int nw, int *workers)
+double sc_hypervisor_lp_get_tmax(int nw, int *workers)
 {
 	int ntypes_of_workers = 2;
 	int total_nw[ntypes_of_workers];
-	_get_total_nw(workers, nw, 2, total_nw);
+	sc_hypervisor_group_workers_by_type(workers, nw, 2, total_nw);
 
-	int nsched_ctxs = sched_ctx_hypervisor_get_nsched_ctxs();
+	int nsched_ctxs = sc_hypervisor_get_nsched_ctxs();
 
 	double res[nsched_ctxs][ntypes_of_workers];
-	return _lp_get_nworkers_per_ctx(nsched_ctxs, ntypes_of_workers, res, total_nw) * 1000;
+	return sc_hypervisor_lp_get_nworkers_per_ctx(nsched_ctxs, ntypes_of_workers, res, total_nw) * 1000;
 }
 
-void _lp_round_double_to_int(int ns, int nw, double res[ns][nw], int res_rounded[ns][nw])
+void sc_hypervisor_lp_round_double_to_int(int ns, int nw, double res[ns][nw], int res_rounded[ns][nw])
 {
 	int s, w;
 	double left_res[nw];
@@ -330,11 +340,11 @@ void _lp_find_workers_to_give_away(int nw, int ns, unsigned sched_ctx, int sched
 		
 		if(w == 1)
 		{
-			int nworkers_ctx = sched_ctx_hypervisor_get_nworkers_ctx(sched_ctx, arch);
+			int nworkers_ctx = sc_hypervisor_get_nworkers_ctx(sched_ctx, arch);
 			if(nworkers_ctx > res_rounded[sched_ctx_idx][w])
 			{
 				int nworkers_to_move = nworkers_ctx - res_rounded[sched_ctx_idx][w];
-				int *workers_to_move = _get_first_workers(sched_ctx, &nworkers_to_move, arch);
+				int *workers_to_move = sc_hypervisor_get_idlest_workers(sched_ctx, &nworkers_to_move, arch);
 				int i;
 				for(i = 0; i < nworkers_to_move; i++)
 					tmp_workers_move[w][tmp_nw_move[w]++] = workers_to_move[i];
@@ -343,7 +353,7 @@ void _lp_find_workers_to_give_away(int nw, int ns, unsigned sched_ctx, int sched
 		}
 		else
 		{
-			double nworkers_ctx = sched_ctx_hypervisor_get_nworkers_ctx(sched_ctx, arch) * 1.0;
+			double nworkers_ctx = sc_hypervisor_get_nworkers_ctx(sched_ctx, arch) * 1.0;
 			if(nworkers_ctx > res[sched_ctx_idx][w])
 			{
 				double nworkers_to_move = nworkers_ctx - res[sched_ctx_idx][w];
@@ -352,7 +362,7 @@ void _lp_find_workers_to_give_away(int nw, int ns, unsigned sched_ctx, int sched
 				double diff = nworkers_to_move - x_double;
 				if(diff == 0.0)
 				{
-					int *workers_to_move = _get_first_workers(sched_ctx, &x, arch);
+					int *workers_to_move = sc_hypervisor_get_idlest_workers(sched_ctx, &x, arch);
 					if(x > 0)
 					{
 						int i;
@@ -365,7 +375,7 @@ void _lp_find_workers_to_give_away(int nw, int ns, unsigned sched_ctx, int sched
 				else
 				{
 					x+=1;
-					int *workers_to_move = _get_first_workers(sched_ctx, &x, arch);
+					int *workers_to_move = sc_hypervisor_get_idlest_workers(sched_ctx, &x, arch);
 					if(x > 0)
 					{
 						int i;
@@ -401,7 +411,7 @@ void _lp_find_workers_to_accept(int nw, int ns, unsigned sched_ctx, int sched_ct
 		if(w == 0) arch = STARPU_CUDA_WORKER;
 		if(w == 1) arch = STARPU_CPU_WORKER;
 		
-		int nw_ctx2 = sched_ctx_hypervisor_get_nworkers_ctx(sched_ctx, arch);
+		int nw_ctx2 = sc_hypervisor_get_nworkers_ctx(sched_ctx, arch);
 		int nw_needed = res_rounded[sched_ctx_idx][w] - nw_ctx2;
 		
 		if( nw_needed > 0 && tmp_nw_move[w] > 0)
@@ -470,9 +480,9 @@ void _lp_find_workers_to_remove(int nw, int tmp_nw_move[nw], int tmp_workers_mov
 	}
 }
 
-void _lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw], double res[ns][nw])
+void sc_hypervisor_lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw], double res[ns][nw])
 {
-	int *sched_ctxs = sched_ctx_hypervisor_get_sched_ctxs();
+	int *sched_ctxs = sc_hypervisor_get_sched_ctxs();
 	int s, s2, w;
 	for(s = 0; s < ns; s++)
 	{
@@ -522,13 +532,13 @@ void _lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw],
 				
 				if(nw_move > 0)
 				{
-					sched_ctx_hypervisor_move_workers(sched_ctxs[s], sched_ctxs[s2], workers_move, nw_move, 0);
+					sc_hypervisor_move_workers(sched_ctxs[s], sched_ctxs[s2], workers_move, nw_move, 0);
 					nw_move = 0;
 				}
 
 				if(nw_add > 0)
 				{
-					sched_ctx_hypervisor_add_workers_to_sched_ctx(workers_add, nw_add, sched_ctxs[s2]);
+					sc_hypervisor_add_workers_to_sched_ctx(workers_add, nw_add, sched_ctxs[s2]);
 					nw_add = 0;
 				}
 			}
@@ -542,11 +552,11 @@ void _lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw],
 		_lp_find_workers_to_remove(nw, tmp_nw_move, tmp_workers_move, 
 					   &nw_move, workers_move);
 		if(nw_move > 0)
-			sched_ctx_hypervisor_remove_workers_from_sched_ctx(workers_move, nw_move, sched_ctxs[s], 0);
+			sc_hypervisor_remove_workers_from_sched_ctx(workers_move, nw_move, sched_ctxs[s], 0);
 	}
 }
 
-void _lp_distribute_resources_in_ctxs(int* sched_ctxs, int ns, int nw, int res_rounded[ns][nw], double res[ns][nw], int *workers, int nworkers)
+void sc_hypervisor_lp_distribute_resources_in_ctxs(int* sched_ctxs, int ns, int nw, int res_rounded[ns][nw], double res[ns][nw], int *workers, int nworkers)
 {
 	unsigned current_nworkers = workers == NULL ? starpu_worker_get_count() : (unsigned)nworkers;
 	int s, w;
@@ -561,13 +571,23 @@ void _lp_distribute_resources_in_ctxs(int* sched_ctxs, int ns, int nw, int res_r
 		for(w = 0; w < nw; w++)
 		{
 			enum starpu_archtype arch;
-			if(w == 0) arch = STARPU_CUDA_WORKER;
-			if(w == 1) arch = STARPU_CPU_WORKER;
 
+#ifdef STARPU_USE_CUDA
+			int ncuda = starpu_worker_get_count_by_type(STARPU_CUDA_WORKER);
+			if(ncuda != 0)
+			{
+				if(w == 0) arch = STARPU_CUDA_WORKER;
+				if(w == 1) arch = STARPU_CPU_WORKER;
+			}
+			else
+				if(w == 0) arch = STARPU_CPU_WORKER;
+#else
+			if(w == 0) arch = STARPU_CPU_WORKER;
+#endif //STARPU_USE_CUDA
 			if(w == 1)
 			{
 				int nworkers_to_add = res_rounded[s][w];
-				int *workers_to_add = _get_first_workers_in_list(&start[w], workers, current_nworkers, &nworkers_to_add, arch);
+				int *workers_to_add = sc_hypervisor_get_idlest_workers_in_list(&start[w], workers, current_nworkers, &nworkers_to_add, arch);
 				int i;
 				for(i = 0; i < nworkers_to_add; i++)
 					workers_add[nw_add++] = workers_to_add[i];
@@ -582,7 +602,7 @@ void _lp_distribute_resources_in_ctxs(int* sched_ctxs, int ns, int nw, int res_r
 				double diff = nworkers_to_add - x_double;
 				if(diff == 0.0)
 				{
-					int *workers_to_add = _get_first_workers_in_list(&start[w], workers, current_nworkers, &x, arch);
+					int *workers_to_add = sc_hypervisor_get_idlest_workers_in_list(&start[w], workers, current_nworkers, &x, arch);
 					int i;
 					for(i = 0; i < x; i++)
 						workers_add[nw_add++] = workers_to_add[i];
@@ -591,7 +611,7 @@ void _lp_distribute_resources_in_ctxs(int* sched_ctxs, int ns, int nw, int res_r
 				else
 				{
 					x+=1;
-					int *workers_to_add = _get_first_workers_in_list(&start[w], workers, current_nworkers, &x, arch);
+					int *workers_to_add = sc_hypervisor_get_idlest_workers_in_list(&start[w], workers, current_nworkers, &x, arch);
 					int i;
 					if(diff >= 0.3)
 						for(i = 0; i < x; i++)
@@ -606,16 +626,16 @@ void _lp_distribute_resources_in_ctxs(int* sched_ctxs, int ns, int nw, int res_r
 		}
 		if(nw_add > 0)
 		{
-			sched_ctx_hypervisor_add_workers_to_sched_ctx(workers_add, nw_add, sched_ctxs[s]);
-			sched_ctx_hypervisor_start_resize(sched_ctxs[s]);
+			sc_hypervisor_add_workers_to_sched_ctx(workers_add, nw_add, sched_ctxs[s]);
+			sc_hypervisor_start_resize(sched_ctxs[s]);
 		}
 
-//		sched_ctx_hypervisor_stop_resize(current_sched_ctxs[s]);
+//		sc_hypervisor_stop_resize(current_sched_ctxs[s]);
 	}
 }
 
 /* nw = all the workers (either in a list or on the whole machine) */
-void _lp_place_resources_in_ctx(int ns, int nw, double w_in_s[ns][nw], int *sched_ctxs_input, int *workers_input, unsigned do_size)
+void sc_hypervisor_lp_place_resources_in_ctx(int ns, int nw, double w_in_s[ns][nw], int *sched_ctxs_input, int *workers_input, unsigned do_size)
 {
 	int w, s;
 	double nworkers[ns][2];
@@ -650,19 +670,16 @@ void _lp_place_resources_in_ctx(int ns, int nw, double w_in_s[ns][nw], int *sche
 		}
 	}
 	
-/* 	for(s = 0; s < ns; s++) */
-/* 		printf("%d: cpus = %d gpus = %d \n", s, nworkers_rounded[s][1], nworkers_rounded[s][0]); */
-
 	if(!do_size)
-		_lp_redistribute_resources_in_ctxs(ns, 2, nworkers_rounded, nworkers);
+		sc_hypervisor_lp_redistribute_resources_in_ctxs(ns, 2, nworkers_rounded, nworkers);
 	else
 	{
-		int *current_sched_ctxs = sched_ctxs_input == NULL ? sched_ctx_hypervisor_get_sched_ctxs() : sched_ctxs_input;
+		int *current_sched_ctxs = sched_ctxs_input == NULL ? sc_hypervisor_get_sched_ctxs() : sched_ctxs_input;
 
 		unsigned has_workers = 0;
 		for(s = 0; s < ns; s++)
 		{
-			int nworkers_ctx = sched_ctx_hypervisor_get_nworkers_ctx(current_sched_ctxs[s], 
+			int nworkers_ctx = sc_hypervisor_get_nworkers_ctx(current_sched_ctxs[s], 
 										 STARPU_ANY_WORKER);
 			if(nworkers_ctx != 0)
 			{
@@ -671,14 +688,14 @@ void _lp_place_resources_in_ctx(int ns, int nw, double w_in_s[ns][nw], int *sche
 			}
 		}
 		if(has_workers)
-			_lp_redistribute_resources_in_ctxs(ns, 2, nworkers_rounded, nworkers);
+			sc_hypervisor_lp_redistribute_resources_in_ctxs(ns, 2, nworkers_rounded, nworkers);
 		else
-			_lp_distribute_resources_in_ctxs(current_sched_ctxs, ns, 2, nworkers_rounded, nworkers, workers_input, nw);
+			sc_hypervisor_lp_distribute_resources_in_ctxs(current_sched_ctxs, ns, 2, nworkers_rounded, nworkers, workers_input, nw);
 	}
 	return;
 }
 
-double _find_tmax(double t1, double t2)
+double sc_hypervisor_lp_find_tmax(double t1, double t2)
 {
 	return t1 + ((t2 - t1)/2);
 }
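
As a usage note (not part of the patch): this midpoint helper is the building block for bisecting on the target makespan until the linear program becomes feasible. A minimal self-contained sketch, with lp_feasible() as a hypothetical stand-in for the policy's GLPK solve:

#include <stdio.h>

static double sc_hypervisor_lp_find_tmax(double t1, double t2)
{
	return t1 + ((t2 - t1)/2); /* body copied from the hunk above */
}

/* hypothetical feasibility test: a real policy would solve the LP here */
static int lp_feasible(double tmax)
{
	return tmax >= 42.0;
}

int main(void)
{
	double tmin = 0.0, tsup = 1000.0;
	int i;
	for (i = 0; i < 30; i++) /* fixed iteration budget */
	{
		double tmax = sc_hypervisor_lp_find_tmax(tmin, tsup);
		if (lp_feasible(tmax))
			tsup = tmax; /* feasible: try a tighter deadline */
		else
			tmin = tmax; /* infeasible: relax the deadline */
	}
	printf("tmax ~ %f\n", sc_hypervisor_lp_find_tmax(tmin, tsup));
	return 0;
}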

+ 69 - 65
sched_ctx_hypervisor/src/hypervisor_policies/policy_tools.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012  INRIA
+ * Copyright (C) 2010-2013  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -14,13 +14,12 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-/* #include <sched_ctx_hypervisor.h> */
 
-#include "policy_tools.h"
+#include "sc_hypervisor_policy.h"
 
 static int _compute_priority(unsigned sched_ctx)
 {
-	struct sched_ctx_hypervisor_policy_config *config = sched_ctx_hypervisor_get_config(sched_ctx);
+	struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(sched_ctx);
 
 	int total_priority = 0;
 
@@ -40,25 +39,25 @@ static int _compute_priority(unsigned sched_ctx)
 	return total_priority;
 }
 
-/* find the context with the slowest priority */
-unsigned _find_poor_sched_ctx(unsigned req_sched_ctx, int nworkers_to_move)
+/* find the context with the lowest priority */
+unsigned sc_hypervisor_find_lowest_prio_sched_ctx(unsigned req_sched_ctx, int nworkers_to_move)
 {
 	int i;
 	int highest_priority = -1;
 	int current_priority = 0;
 	unsigned sched_ctx = STARPU_NMAX_SCHED_CTXS;
-	int *sched_ctxs = sched_ctx_hypervisor_get_sched_ctxs();
-	int nsched_ctxs = sched_ctx_hypervisor_get_nsched_ctxs();
+	int *sched_ctxs = sc_hypervisor_get_sched_ctxs();
+	int nsched_ctxs = sc_hypervisor_get_nsched_ctxs();
 
 
-	struct sched_ctx_hypervisor_policy_config *config = NULL;
+	struct sc_hypervisor_policy_config *config = NULL;
 
 	for(i = 0; i < nsched_ctxs; i++)
 	{
 		if(sched_ctxs[i] != STARPU_NMAX_SCHED_CTXS && sched_ctxs[i] != req_sched_ctx)
 		{
 			unsigned nworkers = starpu_sched_ctx_get_nworkers(sched_ctxs[i]);
-			config  = sched_ctx_hypervisor_get_config(sched_ctxs[i]);
+			config  = sc_hypervisor_get_config(sched_ctxs[i]);
 			if((nworkers + nworkers_to_move) <= config->max_nworkers)
 			{
 				current_priority = _compute_priority(sched_ctxs[i]);
@@ -74,7 +73,7 @@ unsigned _find_poor_sched_ctx(unsigned req_sched_ctx, int nworkers_to_move)
 	return sched_ctx;
 }
 
-int* _get_first_workers_in_list(int *start, int *workers, int nall_workers,  int *nworkers, enum starpu_archtype arch)
+int* sc_hypervisor_get_idlest_workers_in_list(int *start, int *workers, int nall_workers,  int *nworkers, enum starpu_archtype arch)
 {
 	int *curr_workers = (int*)malloc((*nworkers)*sizeof(int));
 
@@ -102,10 +101,10 @@ int* _get_first_workers_in_list(int *start, int *workers, int nall_workers,  int
 }
 
 /* get the first nworkers with the highest idle time in the context */
-int* _get_first_workers(unsigned sched_ctx, int *nworkers, enum starpu_archtype arch)
+int* sc_hypervisor_get_idlest_workers(unsigned sched_ctx, int *nworkers, enum starpu_archtype arch)
 {
-	struct sched_ctx_hypervisor_wrapper* sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctx);
-	struct sched_ctx_hypervisor_policy_config *config = sched_ctx_hypervisor_get_config(sched_ctx);
+	struct sc_hypervisor_wrapper* sc_w = sc_hypervisor_get_wrapper(sched_ctx);
+	struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(sched_ctx);
 
 	int *curr_workers = (int*)malloc((*nworkers) * sizeof(int));
 	int i;
@@ -177,7 +176,7 @@ int* _get_first_workers(unsigned sched_ctx, int *nworkers, enum starpu_archtype
 }
 
 /* get the number of workers in the context that are allowed to be moved (that are not fixed) */
-unsigned _get_potential_nworkers(struct sched_ctx_hypervisor_policy_config *config, unsigned sched_ctx, enum starpu_archtype arch)
+unsigned sc_hypervisor_get_movable_nworkers(struct sc_hypervisor_policy_config *config, unsigned sched_ctx, enum starpu_archtype arch)
 {
 	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx);
 
@@ -204,13 +203,13 @@ unsigned _get_potential_nworkers(struct sched_ctx_hypervisor_policy_config *conf
 /* compute the number of workers that should be moved depending on:
    - the min/max number of workers in a context imposed by the user,
    - the resource granularity imposed by the user for the resizing process */
-int _get_nworkers_to_move(unsigned req_sched_ctx)
+int sc_hypervisor_compute_nworkers_to_move(unsigned req_sched_ctx)
 {
-       	struct sched_ctx_hypervisor_policy_config *config = sched_ctx_hypervisor_get_config(req_sched_ctx);
+       	struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(req_sched_ctx);
 	unsigned nworkers = starpu_sched_ctx_get_nworkers(req_sched_ctx);
 	unsigned nworkers_to_move = 0;
 
-	unsigned potential_moving_workers = _get_potential_nworkers(config, req_sched_ctx, STARPU_ANY_WORKER);
+	unsigned potential_moving_workers = sc_hypervisor_get_movable_nworkers(config, req_sched_ctx, STARPU_ANY_WORKER);
 	if(potential_moving_workers > 0)
 	{
 		if(potential_moving_workers <= config->min_nworkers)
@@ -248,7 +247,7 @@ int _get_nworkers_to_move(unsigned req_sched_ctx)
 	return nworkers_to_move;
 }
 
-unsigned _resize(unsigned sender_sched_ctx, unsigned receiver_sched_ctx, unsigned force_resize, unsigned now)
+unsigned sc_hypervisor_policy_resize(unsigned sender_sched_ctx, unsigned receiver_sched_ctx, unsigned force_resize, unsigned now)
 {
 	int ret = 1;
 	if(force_resize)
@@ -257,18 +256,18 @@ unsigned _resize(unsigned sender_sched_ctx, unsigned receiver_sched_ctx, unsigne
 		ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
 	if(ret != EBUSY)
 	{
-		int nworkers_to_move = _get_nworkers_to_move(sender_sched_ctx);
+		int nworkers_to_move = sc_hypervisor_compute_nworkers_to_move(sender_sched_ctx);
 		if(nworkers_to_move > 0)
 		{
 			unsigned poor_sched_ctx = STARPU_NMAX_SCHED_CTXS;
 			if(receiver_sched_ctx == STARPU_NMAX_SCHED_CTXS)
 			{
-				poor_sched_ctx = _find_poor_sched_ctx(sender_sched_ctx, nworkers_to_move);
+				poor_sched_ctx = sc_hypervisor_find_lowest_prio_sched_ctx(sender_sched_ctx, nworkers_to_move);
 			}
 			else
 			{
 				poor_sched_ctx = receiver_sched_ctx;
-				struct sched_ctx_hypervisor_policy_config *config = sched_ctx_hypervisor_get_config(poor_sched_ctx);
+				struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(poor_sched_ctx);
 				unsigned nworkers = starpu_sched_ctx_get_nworkers(poor_sched_ctx);
 				unsigned nshared_workers = starpu_sched_ctx_get_nshared_workers(sender_sched_ctx, poor_sched_ctx);
 				if((nworkers+nworkers_to_move-nshared_workers) > config->max_nworkers)
@@ -277,10 +276,10 @@ unsigned _resize(unsigned sender_sched_ctx, unsigned receiver_sched_ctx, unsigne
 			}
 			if(poor_sched_ctx != STARPU_NMAX_SCHED_CTXS)
 			{
-				int *workers_to_move = _get_first_workers(sender_sched_ctx, &nworkers_to_move, STARPU_ANY_WORKER);
-				sched_ctx_hypervisor_move_workers(sender_sched_ctx, poor_sched_ctx, workers_to_move, nworkers_to_move, now);
+				int *workers_to_move = sc_hypervisor_get_idlest_workers(sender_sched_ctx, &nworkers_to_move, STARPU_ANY_WORKER);
+				sc_hypervisor_move_workers(sender_sched_ctx, poor_sched_ctx, workers_to_move, nworkers_to_move, now);
 
-				struct sched_ctx_hypervisor_policy_config *new_config = sched_ctx_hypervisor_get_config(poor_sched_ctx);
+				struct sc_hypervisor_policy_config *new_config = sc_hypervisor_get_config(poor_sched_ctx);
 				int i;
 				for(i = 0; i < nworkers_to_move; i++)
 					new_config->max_idle[workers_to_move[i]] = new_config->max_idle[workers_to_move[i]] !=MAX_IDLE_TIME ? new_config->max_idle[workers_to_move[i]] :  new_config->new_workers_max_idle;
@@ -296,12 +295,12 @@ unsigned _resize(unsigned sender_sched_ctx, unsigned receiver_sched_ctx, unsigne
 }
 
 
-unsigned _resize_to_unknown_receiver(unsigned sender_sched_ctx, unsigned now)
+unsigned sc_hypervisor_policy_resize_to_unknown_receiver(unsigned sender_sched_ctx, unsigned now)
 {
-	return _resize(sender_sched_ctx, STARPU_NMAX_SCHED_CTXS, 0, now);
+	return sc_hypervisor_policy_resize(sender_sched_ctx, STARPU_NMAX_SCHED_CTXS, 0, now);
 }
 
-static double _get_ispeed_sample_for_type_of_worker(struct sched_ctx_hypervisor_wrapper* sc_w, enum starpu_archtype req_arch)
+static double _get_ispeed_sample_for_type_of_worker(struct sc_hypervisor_wrapper* sc_w, enum starpu_archtype req_arch)
 {
 	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sc_w->sched_ctx);
         int worker;
@@ -318,7 +317,7 @@ static double _get_ispeed_sample_for_type_of_worker(struct sched_ctx_hypervisor_
                 enum starpu_archtype arch = starpu_worker_get_type(worker);
                 if(arch == req_arch)
                 {
-			struct sched_ctx_hypervisor_policy_config *config = sched_ctx_hypervisor_get_config(sc_w->sched_ctx);
+			struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(sc_w->sched_ctx);
 			avg += config->ispeed_w_sample[worker];
 			n++;
 		}
@@ -330,7 +329,7 @@ static double _get_ispeed_sample_for_type_of_worker(struct sched_ctx_hypervisor_
 static double _get_ispeed_sample_for_sched_ctx(unsigned sched_ctx)
 {
 	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx);
-	struct sched_ctx_hypervisor_policy_config *config = sched_ctx_hypervisor_get_config(sched_ctx);
+	struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(sched_ctx);
         
 	int worker;
 	double ispeed_sample = 0.0;
@@ -348,13 +347,13 @@ static double _get_ispeed_sample_for_sched_ctx(unsigned sched_ctx)
 	return ispeed_sample;
 }
 
-double _get_ctx_velocity(struct sched_ctx_hypervisor_wrapper* sc_w)
+double sc_hypervisor_get_ctx_velocity(struct sc_hypervisor_wrapper* sc_w)
 {
-	struct sched_ctx_hypervisor_policy_config *config = sched_ctx_hypervisor_get_config(sc_w->sched_ctx);
-        double elapsed_flops = sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(sc_w);
+	struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(sc_w->sched_ctx);
+        double elapsed_flops = sc_hypervisor_get_elapsed_flops_per_sched_ctx(sc_w);
 	double sample = _get_ispeed_sample_for_sched_ctx(sc_w->sched_ctx);
 
-/* 	double total_elapsed_flops = sched_ctx_hypervisor_get_total_elapsed_flops_per_sched_ctx(sc_w); */
+/* 	double total_elapsed_flops = sc_hypervisor_get_total_elapsed_flops_per_sched_ctx(sc_w); */
 /* 	double prc = config->ispeed_ctx_sample != 0.0 ? elapsed_flops : elapsed_flops/sc_w->total_flops; */
 /* 	double redim_sample = config->ispeed_ctx_sample != 0.0 ? config->ispeed_ctx_sample :  */
 /* 		(elapsed_flops == total_elapsed_flops ? HYPERVISOR_START_REDIM_SAMPLE : HYPERVISOR_REDIM_SAMPLE); */
@@ -374,23 +373,23 @@ double _get_ctx_velocity(struct sched_ctx_hypervisor_wrapper* sc_w)
 	return -1.0;
 }
 
-double _get_slowest_ctx_exec_time(void)
+double sc_hypervisor_get_slowest_ctx_exec_time(void)
 {
-	int *sched_ctxs = sched_ctx_hypervisor_get_sched_ctxs();
-	int nsched_ctxs = sched_ctx_hypervisor_get_nsched_ctxs();
+	int *sched_ctxs = sc_hypervisor_get_sched_ctxs();
+	int nsched_ctxs = sc_hypervisor_get_nsched_ctxs();
 
 /* 	double curr_time = starpu_timing_now(); */
 	double slowest_time = 0.0;
 
 	int s;
-	struct sched_ctx_hypervisor_wrapper* sc_w;		
+	struct sc_hypervisor_wrapper* sc_w;		
 	for(s = 0; s < nsched_ctxs; s++)
 	{
-		sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctxs[s]);
+		sc_w = sc_hypervisor_get_wrapper(sched_ctxs[s]);
 
 //		double elapsed_time  = (curr_time - sc_w->start_time)/1000000;
-		struct sched_ctx_hypervisor_policy_config *config = sched_ctx_hypervisor_get_config(sc_w->sched_ctx);
-		double elapsed_time = (config->ispeed_ctx_sample/1000000000.0)/_get_ctx_velocity(sc_w);
+		struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(sc_w->sched_ctx);
+		double elapsed_time = (config->ispeed_ctx_sample/1000000000.0)/sc_hypervisor_get_ctx_velocity(sc_w);
 		if(elapsed_time > slowest_time)
 			slowest_time = elapsed_time;
 
@@ -398,22 +397,22 @@ double _get_slowest_ctx_exec_time(void)
 	return slowest_time;
 }
 
-double _get_fastest_ctx_exec_time(void)
+double sc_hypervisor_get_fastest_ctx_exec_time(void)
 {
-	int *sched_ctxs = sched_ctx_hypervisor_get_sched_ctxs();
-	int nsched_ctxs = sched_ctx_hypervisor_get_nsched_ctxs();
+	int *sched_ctxs = sc_hypervisor_get_sched_ctxs();
+	int nsched_ctxs = sc_hypervisor_get_nsched_ctxs();
 
 	double curr_time = starpu_timing_now();
  	double fastest_time = curr_time;
 
 	int s;
-	struct sched_ctx_hypervisor_wrapper* sc_w;		
+	struct sc_hypervisor_wrapper* sc_w;		
 	for(s = 0; s < nsched_ctxs; s++)
 	{
-		sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctxs[s]);
+		sc_w = sc_hypervisor_get_wrapper(sched_ctxs[s]);
 
-		struct sched_ctx_hypervisor_policy_config *config = sched_ctx_hypervisor_get_config(sc_w->sched_ctx);
-		double elapsed_time = (config->ispeed_ctx_sample/1000000000.0)/_get_ctx_velocity(sc_w);
+		struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(sc_w->sched_ctx);
+		double elapsed_time = (config->ispeed_ctx_sample/1000000000.0)/sc_hypervisor_get_ctx_velocity(sc_w);
 		
 		if(elapsed_time < fastest_time)
 			fastest_time = elapsed_time;
@@ -424,7 +423,7 @@ double _get_fastest_ctx_exec_time(void)
 }
 
 
-double _get_velocity_per_worker(struct sched_ctx_hypervisor_wrapper *sc_w, unsigned worker)
+double sc_hypervisor_get_velocity_per_worker(struct sc_hypervisor_wrapper *sc_w, unsigned worker)
 {
 	if(!starpu_sched_ctx_contains_worker(worker, sc_w->sched_ctx))
 		return -1.0;
@@ -432,10 +431,10 @@ double _get_velocity_per_worker(struct sched_ctx_hypervisor_wrapper *sc_w, unsig
         double elapsed_flops = sc_w->elapsed_flops[worker] / 1000000000.0; /*in gflops */
 	size_t elapsed_data_used = sc_w->elapsed_data[worker];
 	int elapsed_tasks = sc_w->elapsed_tasks[worker];
-	struct sched_ctx_hypervisor_policy_config *config = sched_ctx_hypervisor_get_config(sc_w->sched_ctx);
+	struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(sc_w->sched_ctx);
 	double sample = config->ispeed_w_sample[worker] / 1000000000.0; /*in gflops */
 
-	double ctx_elapsed_flops = sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(sc_w);
+	double ctx_elapsed_flops = sc_hypervisor_get_elapsed_flops_per_sched_ctx(sc_w);
 	double ctx_sample = config->ispeed_ctx_sample;
 	if(ctx_elapsed_flops > ctx_sample && elapsed_flops == 0.0)
 		return 0.00000000000001;
@@ -481,7 +480,7 @@ double _get_velocity_per_worker(struct sched_ctx_hypervisor_wrapper *sc_w, unsig
 
 }
 
-static double _get_best_elapsed_flops(struct sched_ctx_hypervisor_wrapper* sc_w, int *npus, enum starpu_archtype req_arch)
+static double _get_best_elapsed_flops(struct sc_hypervisor_wrapper* sc_w, int *npus, enum starpu_archtype req_arch)
 {
 	double ret_val = 0.0;
 	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sc_w->sched_ctx);
@@ -507,7 +506,7 @@ static double _get_best_elapsed_flops(struct sched_ctx_hypervisor_wrapper* sc_w,
 }
 
 /* compute an average value of the cpu/cuda velocity */
-double _get_velocity_per_worker_type(struct sched_ctx_hypervisor_wrapper* sc_w, enum starpu_archtype arch)
+double sc_hypervisor_get_velocity_per_worker_type(struct sc_hypervisor_wrapper* sc_w, enum starpu_archtype arch)
 {
         int npus = 0;
         double elapsed_flops = _get_best_elapsed_flops(sc_w, &npus, arch) / 1000000000.0 ; /* in gflops */
@@ -527,18 +526,18 @@ double _get_velocity_per_worker_type(struct sched_ctx_hypervisor_wrapper* sc_w,
 
 
 /* check if there is a big velocity gap between the contexts */
-int _velocity_gap_btw_ctxs()
+int sc_hypervisor_has_velocity_gap_btw_ctxs()
 {
-	int *sched_ctxs = sched_ctx_hypervisor_get_sched_ctxs();
-	int nsched_ctxs = sched_ctx_hypervisor_get_nsched_ctxs();
+	int *sched_ctxs = sc_hypervisor_get_sched_ctxs();
+	int nsched_ctxs = sc_hypervisor_get_nsched_ctxs();
 	int i = 0, j = 0;
-	struct sched_ctx_hypervisor_wrapper* sc_w;
-	struct sched_ctx_hypervisor_wrapper* other_sc_w;
+	struct sc_hypervisor_wrapper* sc_w;
+	struct sc_hypervisor_wrapper* other_sc_w;
 
 	for(i = 0; i < nsched_ctxs; i++)
 	{
-		sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctxs[i]);
-		double ctx_v = _get_ctx_velocity(sc_w);
+		sc_w = sc_hypervisor_get_wrapper(sched_ctxs[i]);
+		double ctx_v = sc_hypervisor_get_ctx_velocity(sc_w);
 		if(ctx_v != -1.0)
 		{
 			for(j = 0; j < nsched_ctxs; j++)
@@ -549,8 +548,8 @@ int _velocity_gap_btw_ctxs()
 					if(nworkers == 0) 
 						return 1;
 
-					other_sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctxs[j]);
-					double other_ctx_v = _get_ctx_velocity(other_sc_w);
+					other_sc_w = sc_hypervisor_get_wrapper(sched_ctxs[j]);
+					double other_ctx_v = sc_hypervisor_get_ctx_velocity(other_sc_w);
 					if(other_ctx_v != -1.0)
 					{
 						double gap = ctx_v < other_ctx_v ? other_ctx_v / ctx_v : ctx_v / other_ctx_v ;
@@ -567,7 +566,7 @@ int _velocity_gap_btw_ctxs()
 }
 
 
-void _get_total_nw(int *workers, int nworkers, int ntypes_of_workers, int total_nw[ntypes_of_workers])
+void sc_hypervisor_group_workers_by_type(int *workers, int nworkers, int ntypes_of_workers, int total_nw[ntypes_of_workers])
 {
 	int current_nworkers = workers == NULL ? starpu_worker_get_count() : nworkers;
 	int w;
@@ -578,8 +577,13 @@ void _get_total_nw(int *workers, int nworkers, int ntypes_of_workers, int total_
 	{
  		enum starpu_archtype arch = workers == NULL ? starpu_worker_get_type(w) :
 			starpu_worker_get_type(workers[w]);
-		if(arch == STARPU_CPU_WORKER)
-			total_nw[1]++;
+		if(ntypes_of_workers == 2)
+		{
+			if(arch == STARPU_CPU_WORKER)
+				total_nw[1]++;
+			else
+				total_nw[0]++;
+		}
 		else
 			total_nw[0]++;
 	}
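
To make the two-slot convention concrete, here is a standalone sketch of the bucketing rule that sc_hypervisor_group_workers_by_type() now applies, with worker types mocked instead of queried through starpu_worker_get_type(): with two worker types, slot 1 counts CPUs and slot 0 everything else; with a single type, all workers land in slot 0.

#include <stdio.h>

enum arch_mock { MOCK_CPU, MOCK_CUDA };

static void group_by_type(const enum arch_mock *w, int nw, int ntypes, int *total_nw)
{
	int i;
	for (i = 0; i < ntypes; i++)
		total_nw[i] = 0;
	for (i = 0; i < nw; i++)
	{
		if (ntypes == 2)
			total_nw[w[i] == MOCK_CPU ? 1 : 0]++; /* slot 0: accelerators, slot 1: cpus */
		else
			total_nw[0]++;
	}
}

int main(void)
{
	enum arch_mock workers[] = { MOCK_CUDA, MOCK_CPU, MOCK_CPU, MOCK_CPU };
	int total_nw[2];
	group_by_type(workers, 4, 2, total_nw);
	printf("gpus = %d cpus = %d\n", total_nw[0], total_nw[1]); /* gpus = 1 cpus = 3 */
	return 0;
}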

+ 99 - 0
sc_hypervisor/src/policies_utils/task_pool.c

@@ -0,0 +1,99 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2013  INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+
+#include "sc_hypervisor_policy.h"
+
+void sc_hypervisor_policy_add_task_to_pool(struct starpu_codelet *cl, unsigned sched_ctx, uint32_t footprint, struct sc_hypervisor_policy_task_pool **task_pools)
+{
+	struct sc_hypervisor_policy_task_pool *tp = NULL;
+
+	for (tp = *task_pools; tp; tp = tp->next)
+	{
+		if (tp && tp->cl == cl && tp->footprint == footprint && tp->sched_ctx_id == sched_ctx)
+			break;
+	}
+
+	if (!tp)
+	{
+		tp = (struct sc_hypervisor_policy_task_pool *) malloc(sizeof(struct sc_hypervisor_policy_task_pool));
+		tp->cl = cl;
+		tp->footprint = footprint;
+		tp->sched_ctx_id = sched_ctx;
+		tp->n = 0;
+		tp->next = *task_pools;
+		*task_pools = tp;
+	}
+
+	/* One more task of this kind */
+	tp->n++;
+}
+
+void sc_hypervisor_policy_remove_task_from_pool(struct starpu_task *task, uint32_t footprint, struct sc_hypervisor_policy_task_pool **task_pools)
+{
+	/* find the pool of tasks of the same kind */
+	struct sc_hypervisor_policy_task_pool *tp = NULL;
+
+	for (tp = *task_pools; tp; tp = tp->next)
+	{
+		if (tp && tp->cl == task->cl && tp->footprint == footprint && tp->sched_ctx_id == task->sched_ctx)
+			break;
+	}
+
+	if (tp)
+	{
+		if(tp->n > 1)
+			tp->n--;
+		else
+		{
+			if(tp == *task_pools)
+			{
+				struct sc_hypervisor_policy_task_pool *next_tp = NULL;
+				if((*task_pools)->next)
+					next_tp = (*task_pools)->next;
+
+				free(tp);
+				tp = NULL;
+				
+				if(next_tp)
+					*task_pools = next_tp;
+				
+			}
+			else
+			{
+				struct sc_hypervisor_policy_task_pool *prev_tp = NULL;
+				for (prev_tp = *task_pools; prev_tp; prev_tp = prev_tp->next)
+				{
+					if (prev_tp->next == tp)
+						prev_tp->next = tp->next;
+				}
+				
+				free(tp);
+				tp = NULL;
+			}
+		}
+	}
+}
+
+struct sc_hypervisor_policy_task_pool* sc_hypervisor_policy_clone_task_pool(struct sc_hypervisor_policy_task_pool *tp)
+{
+	if(tp == NULL) return NULL;
+
+	struct sc_hypervisor_policy_task_pool *tmp_tp = (struct sc_hypervisor_policy_task_pool*)malloc(sizeof(struct sc_hypervisor_policy_task_pool));
+	memcpy(tmp_tp, tp, sizeof(struct sc_hypervisor_policy_task_pool));
+	tmp_tp->next = sc_hypervisor_policy_clone_task_pool(tp->next);
+	return tmp_tp;
+}
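
A hedged usage sketch for these new helpers (the pool layout is inferred from the file above, the hook wiring mirrors the handle_submitted_job signature change made later in this patch, and the wrapper names below are invented for illustration):

#include "sc_hypervisor_policy.h"

/* one pool list per policy instance */
static struct sc_hypervisor_policy_task_pool *task_pools = NULL;

/* called from the policy's handle_submitted_job hook */
static void count_submitted(struct starpu_codelet *cl, unsigned sched_ctx, uint32_t footprint)
{
	/* one more task of this (codelet, footprint, context) kind */
	sc_hypervisor_policy_add_task_to_pool(cl, sched_ctx, footprint, &task_pools);
}

/* called from the policy's handle_poped_task hook */
static void count_poped(struct starpu_task *task, uint32_t footprint)
{
	/* one task of this kind has been scheduled out of the pool */
	sc_hypervisor_policy_remove_task_from_pool(task, footprint, &task_pools);
}

/* deep-copy the list so an LP solve can work on a stable snapshot */
static struct sc_hypervisor_policy_task_pool *snapshot_pools(void)
{
	return sc_hypervisor_policy_clone_task_pool(task_pools);
}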

+ 14 - 14
sched_ctx_hypervisor/src/sched_ctx_config.c

@@ -14,11 +14,11 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include <sched_ctx_hypervisor_intern.h>
+#include <sc_hypervisor_intern.h>
 
-static struct sched_ctx_hypervisor_policy_config* _create_config(void)
+static struct sc_hypervisor_policy_config* _create_config(void)
 {
-	struct sched_ctx_hypervisor_policy_config *config = (struct sched_ctx_hypervisor_policy_config *)malloc(sizeof(struct sched_ctx_hypervisor_policy_config));
+	struct sc_hypervisor_policy_config *config = (struct sc_hypervisor_policy_config *)malloc(sizeof(struct sc_hypervisor_policy_config));
 	config->min_nworkers = -1;
 	config->max_nworkers = -1;
 	config->new_workers_max_idle = -1.0;
@@ -39,7 +39,7 @@ static struct sched_ctx_hypervisor_policy_config* _create_config(void)
 	return config;
 }
 
-static void _update_config(struct sched_ctx_hypervisor_policy_config *old, struct sched_ctx_hypervisor_policy_config* new)
+static void _update_config(struct sc_hypervisor_policy_config *old, struct sc_hypervisor_policy_config* new)
 {
 	old->min_nworkers = new->min_nworkers != -1 ? new->min_nworkers : old->min_nworkers ;
 	old->max_nworkers = new->max_nworkers != -1 ? new->max_nworkers : old->max_nworkers ;
@@ -57,7 +57,7 @@ static void _update_config(struct sched_ctx_hypervisor_policy_config *old, struc
 	}
 }
 
-void sched_ctx_hypervisor_set_config(unsigned sched_ctx, void *config)
+void sc_hypervisor_set_config(unsigned sched_ctx, void *config)
 {
 	if(hypervisor.sched_ctx_w[sched_ctx].config != NULL && config != NULL)
 	{
@@ -73,7 +73,7 @@ void sched_ctx_hypervisor_set_config(unsigned sched_ctx, void *config)
 
 void _add_config(unsigned sched_ctx)
 {
-	struct sched_ctx_hypervisor_policy_config *config = _create_config();
+	struct sc_hypervisor_policy_config *config = _create_config();
 	config->min_nworkers = 0;
 	config->max_nworkers = STARPU_NMAXWORKERS;
 	config->new_workers_max_idle = MAX_IDLE_TIME;
@@ -89,27 +89,27 @@ void _add_config(unsigned sched_ctx)
 		config->min_working[i] = MIN_WORKING_TIME;
 	}
 
-	sched_ctx_hypervisor_set_config(sched_ctx, config);
+	sc_hypervisor_set_config(sched_ctx, config);
 }
 
 void _remove_config(unsigned sched_ctx)
 {
-	sched_ctx_hypervisor_set_config(sched_ctx, NULL);
+	sc_hypervisor_set_config(sched_ctx, NULL);
 }
 
-struct sched_ctx_hypervisor_policy_config* sched_ctx_hypervisor_get_config(unsigned sched_ctx)
+struct sc_hypervisor_policy_config* sc_hypervisor_get_config(unsigned sched_ctx)
 {
 	return hypervisor.sched_ctx_w[sched_ctx].config;
 }
 
-static struct sched_ctx_hypervisor_policy_config* _ioctl(unsigned sched_ctx, va_list varg_list, unsigned later)
+static struct sc_hypervisor_policy_config* _ioctl(unsigned sched_ctx, va_list varg_list, unsigned later)
 {
-	struct sched_ctx_hypervisor_policy_config *config = NULL;
+	struct sc_hypervisor_policy_config *config = NULL;
 
 	if(later)
 		config = _create_config();
 	else
-		config = sched_ctx_hypervisor_get_config(sched_ctx);
+		config = sc_hypervisor_get_config(sched_ctx);
 
 	assert(config != NULL);
 
@@ -215,7 +215,7 @@ static struct sched_ctx_hypervisor_policy_config* _ioctl(unsigned sched_ctx, va_
 }
 
 
-void sched_ctx_hypervisor_ioctl(unsigned sched_ctx, ...)
+void sc_hypervisor_ioctl(unsigned sched_ctx, ...)
 {
 	va_list varg_list;
 	va_start(varg_list, sched_ctx);
@@ -246,7 +246,7 @@ void sched_ctx_hypervisor_ioctl(unsigned sched_ctx, ...)
 	va_start(varg_list, sched_ctx);
 
 	/* if config not null => save hypervisor configuration and consider it later */
-	struct sched_ctx_hypervisor_policy_config *config = _ioctl(sched_ctx, varg_list, (task_tag > 0));
+	struct sc_hypervisor_policy_config *config = _ioctl(sched_ctx, varg_list, (task_tag > 0));
 	if(config != NULL)
 	{
 		struct configuration_entry *entry;
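
For reference, a call against the renamed entry point might look as follows; the property list and argument layout are assumed to follow the old sched_ctx_hypervisor_ioctl() convention (a HYPERVISOR_NULL-terminated varargs list, with the HYPERVISOR_* properties carried over unchanged), and the values are purely illustrative:

#include <sc_hypervisor.h>

static void tune_ctx(unsigned sched_ctx, int *workerids, int nworkers)
{
	sc_hypervisor_ioctl(sched_ctx,
			    /* per-worker idle threshold (illustrative value) */
			    HYPERVISOR_MAX_IDLE, workerids, nworkers, 10000000.0,
			    /* resize two workers at a time */
			    HYPERVISOR_GRANULARITY, 2,
			    /* never shrink below 1 worker or grow above 8 */
			    HYPERVISOR_MIN_WORKERS, 1,
			    HYPERVISOR_MAX_WORKERS, 8,
			    HYPERVISOR_NULL);
}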

+ 76 - 73
sched_ctx_hypervisor/src/sched_ctx_hypervisor.c

@@ -14,11 +14,12 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include <sched_ctx_hypervisor_intern.h>
+#include <sc_hypervisor_intern.h>
 #include <common/uthash.h>
 #include <starpu_config.h>
 
 unsigned imposed_resize = 0;
+unsigned type_of_tasks_known = 0;
 struct starpu_sched_ctx_performance_counters* perf_counters = NULL;
 
 static void notify_idle_cycle(unsigned sched_ctx, int worker, double idle_time);
@@ -29,25 +30,25 @@ static void notify_idle_end(unsigned sched_ctx, int  worker);
 static void notify_submitted_job(struct starpu_task *task, unsigned footprint);
 static void notify_delete_context(unsigned sched_ctx);
 
-extern struct sched_ctx_hypervisor_policy idle_policy;
-extern struct sched_ctx_hypervisor_policy app_driven_policy;
-extern struct sched_ctx_hypervisor_policy gflops_rate_policy;
+extern struct sc_hypervisor_policy idle_policy;
+extern struct sc_hypervisor_policy app_driven_policy;
+extern struct sc_hypervisor_policy gflops_rate_policy;
 #ifdef STARPU_HAVE_GLPK_H
-extern struct sched_ctx_hypervisor_policy lp_policy;
-extern struct sched_ctx_hypervisor_policy lp2_policy;
-extern struct sched_ctx_hypervisor_policy ispeed_lp_policy;
-extern struct sched_ctx_hypervisor_policy debit_lp_policy;
+extern struct sc_hypervisor_policy feft_lp_policy;
+extern struct sc_hypervisor_policy teft_lp_policy;
+extern struct sc_hypervisor_policy ispeed_lp_policy;
+extern struct sc_hypervisor_policy debit_lp_policy;
 #endif // STARPU_HAVE_GLPK_H
-extern struct sched_ctx_hypervisor_policy ispeed_policy;
+extern struct sc_hypervisor_policy ispeed_policy;
 
 
-static struct sched_ctx_hypervisor_policy *predefined_policies[] =
+static struct sc_hypervisor_policy *predefined_policies[] =
 {
         &idle_policy,
 	&app_driven_policy,
 #ifdef STARPU_HAVE_GLPK_H
-	&lp_policy,
-	&lp2_policy,
+	&feft_lp_policy,
+	&teft_lp_policy,
 	&ispeed_lp_policy,
 	&debit_lp_policy,
 #endif // STARPU_HAVE_GLPK_H
@@ -55,7 +56,7 @@ static struct sched_ctx_hypervisor_policy *predefined_policies[] =
 	&ispeed_policy
 };
 
-static void _load_hypervisor_policy(struct sched_ctx_hypervisor_policy *policy)
+static void _load_hypervisor_policy(struct sc_hypervisor_policy *policy)
 {
 	STARPU_ASSERT(policy);
 
@@ -71,7 +72,7 @@ static void _load_hypervisor_policy(struct sched_ctx_hypervisor_policy *policy)
 }
 
 
-static struct sched_ctx_hypervisor_policy *_find_hypervisor_policy_from_name(const char *policy_name)
+static struct sc_hypervisor_policy *_find_hypervisor_policy_from_name(const char *policy_name)
 {
 
 	if (!policy_name)
@@ -80,7 +81,7 @@ static struct sched_ctx_hypervisor_policy *_find_hypervisor_policy_from_name(con
 	unsigned i;
 	for (i = 0; i < sizeof(predefined_policies)/sizeof(predefined_policies[0]); i++)
 	{
-		struct sched_ctx_hypervisor_policy *p;
+		struct sc_hypervisor_policy *p;
 		p = predefined_policies[i];
 		if (p->name)
 		{
@@ -96,9 +97,9 @@ static struct sched_ctx_hypervisor_policy *_find_hypervisor_policy_from_name(con
 	return NULL;
 }
 
-static struct sched_ctx_hypervisor_policy *_select_hypervisor_policy(struct sched_ctx_hypervisor_policy* hypervisor_policy)
+static struct sc_hypervisor_policy *_select_hypervisor_policy(struct sc_hypervisor_policy* hypervisor_policy)
 {
-	struct sched_ctx_hypervisor_policy *selected_policy = NULL;
+	struct sc_hypervisor_policy *selected_policy = NULL;
 
 	if(hypervisor_policy && hypervisor_policy->custom)
 		return hypervisor_policy;
@@ -128,7 +129,7 @@ static struct sched_ctx_hypervisor_policy *_select_hypervisor_policy(struct sche
 
 
 /* initializes the performance counters that StarPU will use to retrieve hints for resizing */
-struct starpu_sched_ctx_performance_counters* sched_ctx_hypervisor_init(struct sched_ctx_hypervisor_policy *hypervisor_policy)
+struct starpu_sched_ctx_performance_counters* sc_hypervisor_init(struct sc_hypervisor_policy *hypervisor_policy)
 {
 	hypervisor.min_tasks = 0;
 	hypervisor.nsched_ctxs = 0;
@@ -171,7 +172,7 @@ struct starpu_sched_ctx_performance_counters* sched_ctx_hypervisor_init(struct s
 		}
 	}
 
-	struct sched_ctx_hypervisor_policy *selected_hypervisor_policy = _select_hypervisor_policy(hypervisor_policy);
+	struct sc_hypervisor_policy *selected_hypervisor_policy = _select_hypervisor_policy(hypervisor_policy);
 	_load_hypervisor_policy(selected_hypervisor_policy);
 
 	perf_counters = (struct starpu_sched_ctx_performance_counters*)malloc(sizeof(struct starpu_sched_ctx_performance_counters));
@@ -188,20 +189,20 @@ struct starpu_sched_ctx_performance_counters* sched_ctx_hypervisor_init(struct s
 	return perf_counters;
 }
 
-const char* sched_ctx_hypervisor_get_policy()
+const char* sc_hypervisor_get_policy()
 {
 	return hypervisor.policy.name;
 }
 
 /* the user can forbid the resizing process */
-void sched_ctx_hypervisor_stop_resize(unsigned sched_ctx)
+void sc_hypervisor_stop_resize(unsigned sched_ctx)
 {
 	imposed_resize = 1;
 	hypervisor.resize[sched_ctx] = 0;
 }
 
 /* the user can restart the resizing process */
-void sched_ctx_hypervisor_start_resize(unsigned sched_ctx)
+void sc_hypervisor_start_resize(unsigned sched_ctx)
 {
 	imposed_resize = 1;
 	hypervisor.resize[sched_ctx] = 1;
@@ -217,19 +218,19 @@ static void _print_current_time()
 	{
 		if(hypervisor.sched_ctxs[i] != STARPU_NMAX_SCHED_CTXS)
 		{
-			struct sched_ctx_hypervisor_wrapper *sc_w = &hypervisor.sched_ctx_w[hypervisor.sched_ctxs[i]];
+			struct sc_hypervisor_wrapper *sc_w = &hypervisor.sched_ctx_w[hypervisor.sched_ctxs[i]];
 
-			double cpu_speed = sched_ctx_hypervisor_get_velocity(sc_w, STARPU_CPU_WORKER);
-			double cuda_speed = sched_ctx_hypervisor_get_velocity(sc_w, STARPU_CUDA_WORKER);
-			int ncpus = sched_ctx_hypervisor_get_nworkers_ctx(sc_w->sched_ctx, STARPU_CPU_WORKER);
-			int ncuda = sched_ctx_hypervisor_get_nworkers_ctx(sc_w->sched_ctx, STARPU_CUDA_WORKER);
+			double cpu_speed = sc_hypervisor_get_velocity(sc_w, STARPU_CPU_WORKER);
+			double cuda_speed = sc_hypervisor_get_velocity(sc_w, STARPU_CUDA_WORKER);
+			int ncpus = sc_hypervisor_get_nworkers_ctx(sc_w->sched_ctx, STARPU_CPU_WORKER);
+			int ncuda = sc_hypervisor_get_nworkers_ctx(sc_w->sched_ctx, STARPU_CUDA_WORKER);
 			fprintf(stdout, "%d: cpu_v = %lf cuda_v = %lf ncpus = %d ncuda = %d\n", hypervisor.sched_ctxs[i], cpu_speed, cuda_speed, ncpus, ncuda);
 		}
 	}
 	return;
 }
 
-void sched_ctx_hypervisor_shutdown(void)
+void sc_hypervisor_shutdown(void)
 {
 //	printf("shutdown\n");
 	int i;
@@ -237,8 +238,8 @@ void sched_ctx_hypervisor_shutdown(void)
 	{
                 if(hypervisor.sched_ctxs[i] != STARPU_NMAX_SCHED_CTXS && hypervisor.nsched_ctxs > 0)
 		{
-			sched_ctx_hypervisor_stop_resize(hypervisor.sched_ctxs[i]);
-			sched_ctx_hypervisor_unregister_ctx(hypervisor.sched_ctxs[i]);
+			sc_hypervisor_stop_resize(hypervisor.sched_ctxs[i]);
+			sc_hypervisor_unregister_ctx(hypervisor.sched_ctxs[i]);
 			starpu_pthread_mutex_destroy(&hypervisor.sched_ctx_w[i].mutex);
 		}
 	}
@@ -256,7 +257,7 @@ void sched_ctx_hypervisor_shutdown(void)
 }
 
 /* the hypervisor is in charge only of the contexts registered to it */
-void sched_ctx_hypervisor_register_ctx(unsigned sched_ctx, double total_flops)
+void sc_hypervisor_register_ctx(unsigned sched_ctx, double total_flops)
 {
 	starpu_pthread_mutex_lock(&act_hypervisor_mutex);
 	hypervisor.configurations[sched_ctx] = NULL;
@@ -308,7 +309,7 @@ static void _rearange_sched_ctxs(int *sched_ctxs, int old_nsched_ctxs)
 }
 
 /* unregistered contexts will no longer be resized */
-void sched_ctx_hypervisor_unregister_ctx(unsigned sched_ctx)
+void sc_hypervisor_unregister_ctx(unsigned sched_ctx)
 {
 	if(hypervisor.policy.end_ctx)
 		hypervisor.policy.end_ctx(sched_ctx);
@@ -333,12 +334,12 @@ void sched_ctx_hypervisor_unregister_ctx(unsigned sched_ctx)
 	starpu_pthread_mutex_destroy(&hypervisor.conf_mut[sched_ctx]);
 	starpu_pthread_mutex_destroy(&hypervisor.resize_mut[sched_ctx]);
 	if(hypervisor.nsched_ctxs == 1)
-		sched_ctx_hypervisor_stop_resize(hypervisor.sched_ctxs[0]);
+		sc_hypervisor_stop_resize(hypervisor.sched_ctxs[0]);
 
 	starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
 }
 
-static double _get_best_total_elapsed_flops(struct sched_ctx_hypervisor_wrapper* sc_w, int *npus, enum starpu_archtype req_arch)
+static double _get_best_total_elapsed_flops(struct sc_hypervisor_wrapper* sc_w, int *npus, enum starpu_archtype req_arch)
 {
 	double ret_val = 0.0;
 	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sc_w->sched_ctx);
@@ -364,7 +365,7 @@ static double _get_best_total_elapsed_flops(struct sched_ctx_hypervisor_wrapper*
 }
 
 /* compute an average value of the cpu/cuda velocity */
-double sched_ctx_hypervisor_get_velocity_per_worker_type(struct sched_ctx_hypervisor_wrapper* sc_w, enum starpu_archtype arch)
+double sc_hypervisorsc_hypervisor_get_velocity_per_worker_type(struct sc_hypervisor_wrapper* sc_w, enum starpu_archtype arch)
 {
         int npus = 0;
         double elapsed_flops = _get_best_total_elapsed_flops(sc_w, &npus, arch) / 1000000000.0 ; /* in gflops */
@@ -383,7 +384,7 @@ double sched_ctx_hypervisor_get_velocity_per_worker_type(struct sched_ctx_hyperv
 }
 
 /* compute an average value of the cpu/cuda reference velocity */
-double _get_ref_velocity_per_worker_type(struct sched_ctx_hypervisor_wrapper* sc_w, enum starpu_archtype arch)
+double sc_hypervisor_get_ref_velocity_per_worker_type(struct sc_hypervisor_wrapper* sc_w, enum starpu_archtype arch)
 {
 	double ref_velocity = 0.0;
 	unsigned nw = 0;
@@ -436,7 +437,7 @@ static void _get_cpus(int *workers, int nworkers, int *cpus, int *ncpus)
 	}
 }
 
-int sched_ctx_hypervisor_get_nworkers_ctx(unsigned sched_ctx, enum starpu_archtype arch)
+int sc_hypervisor_get_nworkers_ctx(unsigned sched_ctx, enum starpu_archtype arch)
 {
 	int nworkers_ctx = 0;
 	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx);
@@ -470,7 +471,7 @@ static void _set_elapsed_flops_per_sched_ctx(unsigned sched_ctx, double val)
 	}
 }
 
-double sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(struct sched_ctx_hypervisor_wrapper* sc_w)
+double sc_hypervisor_get_elapsed_flops_per_sched_ctx(struct sc_hypervisor_wrapper* sc_w)
 {
 	double ret_val = 0.0;
 	int i;
@@ -479,7 +480,7 @@ double sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(struct sched_ctx_hyp
 	return ret_val;
 }
 
-double sched_ctx_hypervisor_get_total_elapsed_flops_per_sched_ctx(struct sched_ctx_hypervisor_wrapper* sc_w)
+double sc_hypervisor_get_total_elapsed_flops_per_sched_ctx(struct sc_hypervisor_wrapper* sc_w)
 {
 	double ret_val = 0.0;
 	int i;
@@ -492,24 +493,22 @@ double sched_ctx_hypervisor_get_total_elapsed_flops_per_sched_ctx(struct sched_c
 void _reset_resize_sample_info(unsigned sender_sched_ctx, unsigned receiver_sched_ctx)
 {
 	/* info concerning only the gflops_rate strategy */
-	struct sched_ctx_hypervisor_wrapper *sender_sc_w = &hypervisor.sched_ctx_w[sender_sched_ctx];
-	struct sched_ctx_hypervisor_wrapper *receiver_sc_w = &hypervisor.sched_ctx_w[receiver_sched_ctx];
+	struct sc_hypervisor_wrapper *sender_sc_w = &hypervisor.sched_ctx_w[sender_sched_ctx];
+	struct sc_hypervisor_wrapper *receiver_sc_w = &hypervisor.sched_ctx_w[receiver_sched_ctx];
 	
 	double start_time =  starpu_timing_now();
 	sender_sc_w->start_time = start_time;
-//	sender_sc_w->remaining_flops = sender_sc_w->remaining_flops - sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(sender_sc_w);
 	_set_elapsed_flops_per_sched_ctx(sender_sched_ctx, 0.0);
 
 	receiver_sc_w->start_time = start_time;
-//	receiver_sc_w->remaining_flops = receiver_sc_w->remaining_flops - sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(receiver_sc_w);
 	_set_elapsed_flops_per_sched_ctx(receiver_sched_ctx, 0.0);
 }
 
 /* actually move the workers: the cpus are moved, the gpus are only shared */
 /* forbid another resize request before this one is taken into account */
-void sched_ctx_hypervisor_move_workers(unsigned sender_sched_ctx, unsigned receiver_sched_ctx, int* workers_to_move, unsigned nworkers_to_move, unsigned now)
+void sc_hypervisor_move_workers(unsigned sender_sched_ctx, unsigned receiver_sched_ctx, int* workers_to_move, unsigned nworkers_to_move, unsigned now)
 {
-	if(nworkers_to_move > 0 && hypervisor.resize[sender_sched_ctx])// && hypervisor.resize[receiver_sched_ctx])
+	if(nworkers_to_move > 0 && hypervisor.resize[sender_sched_ctx])
 	{
 		_print_current_time();
 		unsigned j;
@@ -553,12 +552,11 @@ void sched_ctx_hypervisor_move_workers(unsigned sender_sched_ctx, unsigned recei
 				}
 
 				hypervisor.resize[sender_sched_ctx] = 0;
-//				hypervisor.resize[receiver_sched_ctx] = 0;
 
 				starpu_pthread_mutex_unlock(&hypervisor.sched_ctx_w[sender_sched_ctx].mutex);
 			}
 		}
-		struct sched_ctx_hypervisor_policy_config *new_config = sched_ctx_hypervisor_get_config(receiver_sched_ctx);
+		struct sc_hypervisor_policy_config *new_config = sc_hypervisor_get_config(receiver_sched_ctx);
 		unsigned i;
 		for(i = 0; i < nworkers_to_move; i++)
 			new_config->max_idle[workers_to_move[i]] = new_config->max_idle[workers_to_move[i]] !=MAX_IDLE_TIME ? new_config->max_idle[workers_to_move[i]] :  new_config->new_workers_max_idle;
@@ -567,7 +565,7 @@ void sched_ctx_hypervisor_move_workers(unsigned sender_sched_ctx, unsigned recei
 	return;
 }
 
-void sched_ctx_hypervisor_add_workers_to_sched_ctx(int* workers_to_add, unsigned nworkers_to_add, unsigned sched_ctx)
+void sc_hypervisor_add_workers_to_sched_ctx(int* workers_to_add, unsigned nworkers_to_add, unsigned sched_ctx)
 {
 	if(nworkers_to_add > 0 && hypervisor.resize[sched_ctx])
 	{
@@ -578,7 +576,7 @@ void sched_ctx_hypervisor_add_workers_to_sched_ctx(int* workers_to_add, unsigned
 			printf(" %d", workers_to_add[j]);
 		printf("\n");
 		starpu_sched_ctx_add_workers(workers_to_add, nworkers_to_add, sched_ctx);
-		struct sched_ctx_hypervisor_policy_config *new_config = sched_ctx_hypervisor_get_config(sched_ctx);
+		struct sc_hypervisor_policy_config *new_config = sc_hypervisor_get_config(sched_ctx);
 		unsigned i;
 		for(i = 0; i < nworkers_to_add; i++)
 			new_config->max_idle[workers_to_add[i]] = new_config->max_idle[workers_to_add[i]] != MAX_IDLE_TIME ? new_config->max_idle[workers_to_add[i]] :  new_config->new_workers_max_idle;
@@ -587,12 +585,12 @@ void sched_ctx_hypervisor_add_workers_to_sched_ctx(int* workers_to_add, unsigned
 	return;
 }
 
-unsigned sched_ctx_hypervisor_can_resize(unsigned sched_ctx)
+unsigned sc_hypervisor_can_resize(unsigned sched_ctx)
 {
 	return hypervisor.resize[sched_ctx];
 }
 
-void sched_ctx_hypervisor_remove_workers_from_sched_ctx(int* workers_to_remove, unsigned nworkers_to_remove, unsigned sched_ctx, unsigned now)
+void sc_hypervisor_remove_workers_from_sched_ctx(int* workers_to_remove, unsigned nworkers_to_remove, unsigned sched_ctx, unsigned now)
 {
 	if(nworkers_to_remove > 0 && hypervisor.resize[sched_ctx] && hypervisor.allow_remove[sched_ctx])
 	{
@@ -653,7 +651,7 @@ static unsigned _ack_resize_completed(unsigned sched_ctx, int worker)
 	if(worker != -1 && !starpu_sched_ctx_contains_worker(worker, sched_ctx))
 		return 0;
 
-	struct sched_ctx_hypervisor_resize_ack *resize_ack = NULL;
+	struct sc_hypervisor_resize_ack *resize_ack = NULL;
 	unsigned sender_sched_ctx = STARPU_NMAX_SCHED_CTXS;
 
 	int i;
@@ -661,7 +659,7 @@ static unsigned _ack_resize_completed(unsigned sched_ctx, int worker)
 	{
 		if(hypervisor.sched_ctxs[i] != STARPU_NMAX_SCHED_CTXS)
 		{
-			struct sched_ctx_hypervisor_wrapper *sc_w = &hypervisor.sched_ctx_w[hypervisor.sched_ctxs[i]];
+			struct sc_hypervisor_wrapper *sc_w = &hypervisor.sched_ctx_w[hypervisor.sched_ctxs[i]];
 			starpu_pthread_mutex_lock(&sc_w->mutex);
 			unsigned only_remove = 0;
 			if(sc_w->resize_ack.receiver_sched_ctx == -1 && hypervisor.sched_ctxs[i] != (int)sched_ctx &&
@@ -736,7 +734,6 @@ static unsigned _ack_resize_completed(unsigned sched_ctx, int worker)
 
 				hypervisor.resize[sender_sched_ctx] = 1;
 				hypervisor.allow_remove[receiver_sched_ctx] = 1;
-				//	hypervisor.resize[receiver_sched_ctx] = 1;
 				/* if the user allowed resizing, leave the decisions to the application */
 				if(imposed_resize)  imposed_resize = 0;
 
@@ -756,7 +753,7 @@ static unsigned _ack_resize_completed(unsigned sched_ctx, int worker)
 
 /* Enqueue a resize request for 'sched_ctx', to be executed when the
  * 'task_tag' tasks of 'sched_ctx' complete.  */
-void sched_ctx_hypervisor_resize(unsigned sched_ctx, int task_tag)
+void sc_hypervisor_resize(unsigned sched_ctx, int task_tag)
 {
 	struct resize_request_entry *entry;
 
@@ -787,7 +784,7 @@ static void notify_idle_cycle(unsigned sched_ctx, int worker, double idle_time)
 {
 	if(hypervisor.resize[sched_ctx])
 	{
-		struct sched_ctx_hypervisor_wrapper *sc_w = &hypervisor.sched_ctx_w[sched_ctx];
+		struct sc_hypervisor_wrapper *sc_w = &hypervisor.sched_ctx_w[sched_ctx];
 		sc_w->current_idle_time[worker] += idle_time;
 		if(hypervisor.policy.handle_idle_cycle)
 		{
@@ -828,7 +825,7 @@ static void notify_poped_task(unsigned sched_ctx, int worker, struct starpu_task
 	hypervisor.sched_ctx_w[sched_ctx].elapsed_data[worker] += data_size ;
 	hypervisor.sched_ctx_w[sched_ctx].elapsed_tasks[worker]++ ;
 	hypervisor.sched_ctx_w[sched_ctx].total_elapsed_flops[worker] += task->flops;
-	hypervisor.sched_ctx_w[sched_ctx].remaining_flops -= task->flops; //sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(&hypervisor.sched_ctx_w[sched_ctx]);
+	hypervisor.sched_ctx_w[sched_ctx].remaining_flops -= task->flops; //sc_hypervisor_get_elapsed_flops_per_sched_ctx(&hypervisor.sched_ctx_w[sched_ctx]);
 
 	if(hypervisor.resize[sched_ctx])
 	{	
@@ -862,9 +859,9 @@ static void notify_post_exec_hook(unsigned sched_ctx, int task_tag)
 
 		if (entry != NULL)
 		{
-			struct sched_ctx_hypervisor_policy_config *config = entry->configuration;
+			struct sc_hypervisor_policy_config *config = entry->configuration;
 
-			sched_ctx_hypervisor_set_config(conf_sched_ctx, config);
+			sc_hypervisor_set_config(conf_sched_ctx, config);
 			HASH_DEL(hypervisor.configurations[conf_sched_ctx], entry);
 			free(config);
 		}
@@ -882,8 +879,7 @@ static void notify_post_exec_hook(unsigned sched_ctx, int task_tag)
 			HASH_FIND_INT(hypervisor.resize_requests[sched_ctx], &task_tag, entry);
 			if (entry != NULL)
 			{
-				hypervisor.policy.handle_post_exec_hook(sched_ctx,
-									task_tag);
+				hypervisor.policy.handle_post_exec_hook(sched_ctx, task_tag);
 				HASH_DEL(hypervisor.resize_requests[sched_ctx], entry);
 				free(entry);
 			}
@@ -900,17 +896,24 @@ static void notify_submitted_job(struct starpu_task *task, uint32_t footprint)
 	hypervisor.sched_ctx_w[task->sched_ctx].submitted_flops += task->flops;
 	starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
 
+	if(hypervisor.policy.handle_submitted_job && !type_of_tasks_known)
+		hypervisor.policy.handle_submitted_job(task->cl, task->sched_ctx, footprint);
+}
+
+void sc_hypervisor_set_type_of_task(struct starpu_codelet *cl, unsigned sched_ctx, uint32_t footprint)
+{
+	type_of_tasks_known = 1;
 	if(hypervisor.policy.handle_submitted_job)
-		hypervisor.policy.handle_submitted_job(task, footprint);
+		hypervisor.policy.handle_submitted_job(cl, sched_ctx, footprint);
 }
 
 static void notify_delete_context(unsigned sched_ctx)
 {
 	_print_current_time();
-	sched_ctx_hypervisor_unregister_ctx(sched_ctx);
+	sc_hypervisor_unregister_ctx(sched_ctx);
 }
 
-void sched_ctx_hypervisor_size_ctxs(int *sched_ctxs, int nsched_ctxs, int *workers, int nworkers)
+void sc_hypervisor_size_ctxs(int *sched_ctxs, int nsched_ctxs, int *workers, int nworkers)
 {
 	starpu_pthread_mutex_lock(&act_hypervisor_mutex);
 	unsigned curr_nsched_ctxs = sched_ctxs == NULL ? hypervisor.nsched_ctxs : nsched_ctxs;
@@ -924,24 +927,24 @@ void sched_ctx_hypervisor_size_ctxs(int *sched_ctxs, int nsched_ctxs, int *worke
 		hypervisor.policy.size_ctxs(curr_sched_ctxs, curr_nsched_ctxs, workers, nworkers);
 }
 
-struct sched_ctx_hypervisor_wrapper* sched_ctx_hypervisor_get_wrapper(unsigned sched_ctx)
+struct sc_hypervisor_wrapper* sc_hypervisor_get_wrapper(unsigned sched_ctx)
 {
 	return &hypervisor.sched_ctx_w[sched_ctx];
 }
 
-int* sched_ctx_hypervisor_get_sched_ctxs()
+int* sc_hypervisor_get_sched_ctxs()
 {
 	return hypervisor.sched_ctxs;
 }
 
-int sched_ctx_hypervisor_get_nsched_ctxs()
+int sc_hypervisor_get_nsched_ctxs()
 {
 	int ns;
 	ns = hypervisor.nsched_ctxs;
 	return ns;
 }
 
-void sched_ctx_hypervisor_save_size_req(int *sched_ctxs, int nsched_ctxs, int *workers, int nworkers)
+void sc_hypervisor_save_size_req(int *sched_ctxs, int nsched_ctxs, int *workers, int nworkers)
 {
 	hypervisor.sr = (struct size_request*)malloc(sizeof(struct size_request));
 	hypervisor.sr->sched_ctxs = sched_ctxs;
@@ -950,7 +953,7 @@ void sched_ctx_hypervisor_save_size_req(int *sched_ctxs, int nsched_ctxs, int *w
 	hypervisor.sr->nworkers = nworkers;
 }
 
-unsigned sched_ctx_hypervisor_get_size_req(int **sched_ctxs, int* nsched_ctxs, int **workers, int *nworkers)
+unsigned sc_hypervisor_get_size_req(int **sched_ctxs, int* nsched_ctxs, int **workers, int *nworkers)
 {
 	if(hypervisor.sr != NULL)
 	{
@@ -963,7 +966,7 @@ unsigned sched_ctx_hypervisor_get_size_req(int **sched_ctxs, int* nsched_ctxs, i
 	return 0;
 }
 
-void sched_ctx_hypervisor_free_size_req(void)
+void sc_hypervisor_free_size_req(void)
 {
 	if(hypervisor.sr != NULL)
 	{
@@ -972,12 +975,12 @@ void sched_ctx_hypervisor_free_size_req(void)
 	}
 }
 
-double sched_ctx_hypervisor_get_velocity(struct sched_ctx_hypervisor_wrapper *sc_w, enum starpu_archtype arch)
+double sc_hypervisor_get_velocity(struct sc_hypervisor_wrapper *sc_w, enum starpu_archtype arch)
 {
 
-	double velocity = sched_ctx_hypervisor_get_velocity_per_worker_type(sc_w, arch);
+	double velocity = sc_hypervisorsc_hypervisor_get_velocity_per_worker_type(sc_w, arch);
 	if(velocity == -1.0)
-		velocity = _get_ref_velocity_per_worker_type(sc_w, arch);
+		velocity = sc_hypervisor_get_ref_velocity_per_worker_type(sc_w, arch);
 	if(velocity == -1.0)
 		velocity = arch == STARPU_CPU_WORKER ? 5.0 : 100.0;
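
End to end, the renamed public API is meant to be driven roughly as below. This is a sketch only: attaching the counters through starpu_sched_ctx_set_perf_counters() and passing NULL to let the hypervisor pick its default policy are assumptions, not shown in this patch.

#include <sc_hypervisor.h>

static void run_under_hypervisor(unsigned sched_ctx, double total_flops)
{
	/* NULL: assumed to select a default/environment-chosen policy */
	struct starpu_sched_ctx_performance_counters *counters =
		sc_hypervisor_init(NULL);

	/* hook the counters into the context so the hypervisor gets notified */
	starpu_sched_ctx_set_perf_counters(sched_ctx, counters);

	/* only registered contexts are resized */
	sc_hypervisor_register_ctx(sched_ctx, total_flops);

	/* ... submit and wait for the tasks; resizing happens behind the scenes ... */

	sc_hypervisor_unregister_ctx(sched_ctx);
	sc_hypervisor_shutdown();
}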
        

+ 7 - 7
sched_ctx_hypervisor/src/sched_ctx_hypervisor_intern.h

@@ -14,7 +14,7 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include <sched_ctx_hypervisor.h>
+#include <sc_hypervisor.h>
 #include <common/uthash.h>
 struct size_request
 {
@@ -46,21 +46,21 @@ struct configuration_entry
 	uint32_t task_tag;
 
 	/* Value: configuration of the scheduling context.  */
-	struct sched_ctx_hypervisor_policy_config *configuration;
+	struct sc_hypervisor_policy_config *configuration;
 
 	/* Bookkeeping.  */
 	UT_hash_handle hh;
 };
 
-struct sched_ctx_hypervisor
+struct sc_hypervisor
 {
-	struct sched_ctx_hypervisor_wrapper sched_ctx_w[STARPU_NMAX_SCHED_CTXS];
+	struct sc_hypervisor_wrapper sched_ctx_w[STARPU_NMAX_SCHED_CTXS];
 	int sched_ctxs[STARPU_NMAX_SCHED_CTXS];
 	unsigned nsched_ctxs;
 	unsigned resize[STARPU_NMAX_SCHED_CTXS];
 	unsigned allow_remove[STARPU_NMAX_SCHED_CTXS];
 	int min_tasks;
-	struct sched_ctx_hypervisor_policy policy;
+	struct sc_hypervisor_policy policy;
 
 	struct configuration_entry *configurations[STARPU_NMAX_SCHED_CTXS];
 
@@ -76,13 +76,13 @@ struct sched_ctx_hypervisor
 	double start_executing_time;
 };
 
-struct sched_ctx_hypervisor_adjustment
+struct sc_hypervisor_adjustment
 {
 	int workerids[STARPU_NMAXWORKERS];
 	int nworkers;
 };
 
-struct sched_ctx_hypervisor hypervisor;
+struct sc_hypervisor hypervisor;
 
 
 void _add_config(unsigned sched_ctx);
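
The configurations table above is a uthash map keyed by task_tag; a standalone mock of the intended save/consume pattern (mirroring the HASH_FIND_INT and HASH_DEL calls visible in sched_ctx_hypervisor.c, with a simplified entry type):

#include <stdint.h>
#include <stdlib.h>
#include "uthash.h" /* shipped as common/uthash.h in the tree */

struct entry
{
	uint32_t task_tag;   /* key */
	void *configuration; /* value: the saved config */
	UT_hash_handle hh;   /* bookkeeping */
};

static struct entry *table = NULL;

/* save a configuration to be applied when task_tag completes */
static void save_config(uint32_t task_tag, void *config)
{
	struct entry *e = malloc(sizeof(*e));
	e->task_tag = task_tag;
	e->configuration = config;
	HASH_ADD_INT(table, task_tag, e);
}

/* consume (find and delete) the configuration for task_tag, if any */
static void *take_config(uint32_t task_tag)
{
	struct entry *e;
	void *config = NULL;
	HASH_FIND_INT(table, &task_tag, e);
	if (e)
	{
		config = e->configuration;
		HASH_DEL(table, e);
		free(e);
	}
	return config;
}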

+ 0 - 238
sched_ctx_hypervisor/include/sched_ctx_hypervisor.h

@@ -1,238 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2011, 2012  INRIA
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#ifndef SCHED_CTX_HYPERVISOR_H
-#define SCHED_CTX_HYPERVISOR_H
-
-#include <starpu.h>
-
-#ifdef __cplusplus
-extern "C"
-{
-#endif
-
-#ifdef STARPU_DEVEL
-#  warning rename all objects to start with sched_ctx_hypervisor
-#endif
-
-/* ioctl properties*/
-#define HYPERVISOR_MAX_IDLE -1
-#define HYPERVISOR_MIN_WORKING -2
-#define HYPERVISOR_PRIORITY -3
-#define HYPERVISOR_MIN_WORKERS -4
-#define HYPERVISOR_MAX_WORKERS -5
-#define HYPERVISOR_GRANULARITY -6
-#define HYPERVISOR_FIXED_WORKERS -7
-#define HYPERVISOR_MIN_TASKS -8
-#define HYPERVISOR_NEW_WORKERS_MAX_IDLE -9
-#define HYPERVISOR_TIME_TO_APPLY -10
-#define HYPERVISOR_EMPTY_CTX_MAX_IDLE -11
-#define HYPERVISOR_NULL -12
-#define	HYPERVISOR_ISPEED_W_SAMPLE -13
-#define HYPERVISOR_ISPEED_CTX_SAMPLE -14
-
-starpu_pthread_mutex_t act_hypervisor_mutex;
-
-#define MAX_IDLE_TIME 5000000000
-#define MIN_WORKING_TIME 500
-
-struct sched_ctx_hypervisor_policy_config
-{
-	/* underneath this limit we cannot resize */
-	int min_nworkers;
-
-	/* above this limit we cannot resize */
-	int max_nworkers;
-
-	/*resize granularity */
-	int granularity;
-
-	/* priority for a worker to stay in this context */
-	/* the smaller the priority the faster it will be moved */
-	/* to another context */
-	int priority[STARPU_NMAXWORKERS];
-
-	/* above this limit the priority of the worker is reduced */
-	double max_idle[STARPU_NMAXWORKERS];
-
-	/* underneath this limit the priority of the worker is reduced */
-	double min_working[STARPU_NMAXWORKERS];
-
-	/* workers that will not move */
-	int fixed_workers[STARPU_NMAXWORKERS];
-
-	/* max idle for the workers that will be added during the resizing process*/
-	double new_workers_max_idle;
-
-	/* above this context we allow removing all workers */
-	double empty_ctx_max_idle[STARPU_NMAXWORKERS];
-
-	/* sample used to compute the instant speed per worker*/
-	double ispeed_w_sample[STARPU_NMAXWORKERS];
-
-	/* sample used to compute the instant speed per ctx*/
-	double ispeed_ctx_sample;
-
-};
-
-struct sched_ctx_hypervisor_resize_ack
-{
-	int receiver_sched_ctx;
-	int *moved_workers;
-	int nmoved_workers;
-	int *acked_workers;
-};
-
-/* wrapper attached to a sched_ctx storing monitoring information */
-struct sched_ctx_hypervisor_wrapper
-{
-	/* the sched_ctx it monitors */
-	unsigned sched_ctx;
-
-	/* user configuration meant to limit resizing */
-	struct sched_ctx_hypervisor_policy_config *config;
-
-	/* idle time of workers in this context */
-	double current_idle_time[STARPU_NMAXWORKERS];
-	
-	/* list of workers that will leave this contexts (lazy resizing process) */
-	int worker_to_be_removed[STARPU_NMAXWORKERS];
-
-	/* number of tasks pushed on each worker in this ctx */
-	int pushed_tasks[STARPU_NMAXWORKERS];
-
-	/* number of tasks poped from each worker in this ctx */
-	int poped_tasks[STARPU_NMAXWORKERS];
-
-	/* number of flops the context has to execute */
-	double total_flops;
-
-	/* number of flops executed since the biginning until now */
-	double total_elapsed_flops[STARPU_NMAXWORKERS];
-
-	/* number of flops executed since last resizing */
-	double elapsed_flops[STARPU_NMAXWORKERS];
-
-	/* data quantity executed on each worker in this ctx */
-	size_t elapsed_data[STARPU_NMAXWORKERS];
-
-	/* nr of tasks executed on each worker in this ctx */
-	int elapsed_tasks[STARPU_NMAXWORKERS];
-
-	/* the average speed of workers when they belonged to this context */
-	double ref_velocity[STARPU_NMAXWORKERS];
-
-	/* number of flops submitted to this ctx */
-	double submitted_flops;
-
-	/* number of flops that still have to be executed in this ctx */
-	double remaining_flops;
-	
-	/* the start time of the resizing sample of this context*/
-	double start_time;
-
-	/* the first time a task was pushed to this context*/
-	double real_start_time;
-
-	/* the workers don't leave the current ctx until the receiver ctx 
-	   doesn't ack the receive of these workers */
-	struct sched_ctx_hypervisor_resize_ack resize_ack;
-
-	/* mutex to protect the ack of workers */
-	starpu_pthread_mutex_t mutex;
-};
-
-/* Forward declaration of an internal data structure
- * FIXME: Remove when no longer exposed.  */
-struct resize_request_entry;
-
-struct sched_ctx_hypervisor_policy
-{
-	const char* name;
-	unsigned custom;
-	void (*size_ctxs)(int *sched_ctxs, int nsched_ctxs , int *workers, int nworkers);
-	void (*handle_idle_cycle)(unsigned sched_ctx, int worker);
-	void (*handle_pushed_task)(unsigned sched_ctx, int worker);
-	void (*handle_poped_task)(unsigned sched_ctx, int worker,struct starpu_task *task, uint32_t footprint);
-	void (*handle_idle_end)(unsigned sched_ctx, int worker);
-
-	void (*handle_post_exec_hook)(unsigned sched_ctx, int task_tag);
-
-	void (*handle_submitted_job)(struct starpu_task *task, unsigned footprint);
-	
-	void (*end_ctx)(unsigned sched_ctx);
-};
-
-struct starpu_sched_ctx_performance_counters *sched_ctx_hypervisor_init(struct sched_ctx_hypervisor_policy *policy);
-
-void sched_ctx_hypervisor_shutdown(void);
-
-void sched_ctx_hypervisor_register_ctx(unsigned sched_ctx, double total_flops);
-
-void sched_ctx_hypervisor_unregister_ctx(unsigned sched_ctx);
-
-void sched_ctx_hypervisor_resize(unsigned sched_ctx, int task_tag);
-
-void sched_ctx_hypervisor_move_workers(unsigned sender_sched_ctx, unsigned receiver_sched_ctx, int *workers_to_move, unsigned nworkers_to_move, unsigned now);
-
-void sched_ctx_hypervisor_stop_resize(unsigned sched_ctx);
-
-void sched_ctx_hypervisor_start_resize(unsigned sched_ctx);
-
-void sched_ctx_hypervisor_ioctl(unsigned sched_ctx, ...);
-
-void sched_ctx_hypervisor_set_config(unsigned sched_ctx, void *config);
-
-struct sched_ctx_hypervisor_policy_config *sched_ctx_hypervisor_get_config(unsigned sched_ctx);
-
-int *sched_ctx_hypervisor_get_sched_ctxs(void);
-
-int sched_ctx_hypervisor_get_nsched_ctxs(void);
-
-int sched_ctx_hypervisor_get_nworkers_ctx(unsigned sched_ctx, enum starpu_archtype arch);
-
-struct sched_ctx_hypervisor_wrapper *sched_ctx_hypervisor_get_wrapper(unsigned sched_ctx);
-
-double sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(struct sched_ctx_hypervisor_wrapper *sc_w);
-
-double sched_ctx_hypervisor_get_total_elapsed_flops_per_sched_ctx(struct sched_ctx_hypervisor_wrapper* sc_w);
-
-const char *sched_ctx_hypervisor_get_policy(void);
-
-void sched_ctx_hypervisor_add_workers_to_sched_ctx(int* workers_to_add, unsigned nworkers_to_add, unsigned sched_ctx);
-
-void sched_ctx_hypervisor_remove_workers_from_sched_ctx(int* workers_to_remove, unsigned nworkers_to_remove, unsigned sched_ctx, unsigned now);
-
-void sched_ctx_hypervisor_size_ctxs(int *sched_ctxs, int nsched_ctxs, int *workers, int nworkers);
-
-unsigned sched_ctx_hypervisor_get_size_req(int **sched_ctxs, int* nsched_ctxs, int **workers, int *nworkers);
-
-void sched_ctx_hypervisor_save_size_req(int *sched_ctxs, int nsched_ctxs, int *workers, int nworkers);
-
-void sched_ctx_hypervisor_free_size_req(void);
-
-unsigned sched_ctx_hypervisor_can_resize(unsigned sched_ctx);
-
-/* compute the average velocity of the workers of a given type (cpu/cuda) */
-double sched_ctx_hypervisor_get_velocity_per_worker_type(struct sched_ctx_hypervisor_wrapper* sc_w, enum starpu_archtype arch);
-
-double sched_ctx_hypervisor_get_velocity(struct sched_ctx_hypervisor_wrapper *sc_w, enum starpu_archtype arch);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
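
Taken together, a hedged sketch of the hypervisor lifecycle using only the entry points declared above (the policy, the context id and the flop estimate are placeholders; wiring the returned performance counters into the context is assumed to go through the starpu_sched_ctx API):

    /* sketch of the hypervisor lifecycle; `my_policy`, `ctx` and the
     * flop estimate are placeholders for application-specific values */
    struct sched_ctx_hypervisor_policy my_policy = { .name = "example", .custom = 1 };

    void run_with_hypervisor(unsigned ctx)
    {
        struct starpu_sched_ctx_performance_counters *counters =
            sched_ctx_hypervisor_init(&my_policy);
        (void) counters; /* assumed to be attached to ctx via the sched_ctx API */

        /* declare the context and the work it has to execute (placeholder flops) */
        sched_ctx_hypervisor_register_ctx(ctx, 1e9);

        /* ... submit tasks; the policy callbacks drive the resizing ... */

        /* resizing can be suspended/resumed around critical phases */
        sched_ctx_hypervisor_stop_resize(ctx);
        sched_ctx_hypervisor_start_resize(ctx);

        sched_ctx_hypervisor_unregister_ctx(ctx);
        sched_ctx_hypervisor_shutdown();
    }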

+ 0 - 120
sched_ctx_hypervisor/src/hypervisor_policies/lp_policy.c

@@ -1,120 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2011, 2012  INRIA
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include "lp_tools.h"
-#include <starpu_config.h>
-
-
-#ifdef STARPU_HAVE_GLPK_H
-static void lp_handle_poped_task(unsigned sched_ctx, int worker, struct starpu_task *task, uint32_t footprint)
-{
-	if(_velocity_gap_btw_ctxs())
-	{
-		int nsched_ctxs = sched_ctx_hypervisor_get_nsched_ctxs();
-
-		double nworkers[nsched_ctxs][2];
-
-		int ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
-		if(ret != EBUSY)
-		{
-			int total_nw[2];
-			_get_total_nw(NULL, -1, 2, total_nw);
-
-			struct timeval start_time;
-			struct timeval end_time;
-			gettimeofday(&start_time, NULL);
-
-			double vmax = _lp_get_nworkers_per_ctx(nsched_ctxs, 2, nworkers, total_nw);
-			gettimeofday(&end_time, NULL);
-
-			long diff_s = end_time.tv_sec  - start_time.tv_sec;
-			long diff_us = end_time.tv_usec  - start_time.tv_usec;
-
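-			/* elapsed time of the LP resolution, in ms */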
-			float timing = (float)(diff_s*1000000 + diff_us)/1000;
-
-			if(vmax != 0.0)
-			{
-				int nworkers_rounded[nsched_ctxs][2];
-				_lp_round_double_to_int(nsched_ctxs, 2, nworkers, nworkers_rounded);
-				_lp_redistribute_resources_in_ctxs(nsched_ctxs, 2, nworkers_rounded, nworkers);
-			}
-			starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
-		}
-	}
-}
-
-static void lp_size_ctxs(int *sched_ctxs, int ns, int *workers, int nworkers)
-{
-	int nsched_ctxs = sched_ctxs == NULL ? sched_ctx_hypervisor_get_nsched_ctxs() : ns;
-	double nworkers_per_type[nsched_ctxs][2];
-	int total_nw[2];
-	_get_total_nw(workers, nworkers, 2, total_nw);
-
-	starpu_pthread_mutex_lock(&act_hypervisor_mutex);
-	double vmax = _lp_get_nworkers_per_ctx(nsched_ctxs, 2, nworkers_per_type, total_nw);
-	if(vmax != 0.0)
-	{
-/*  		printf("********size\n"); */
-/* 		int i; */
-/* 		for( i = 0; i < nsched_ctxs; i++) */
-/* 		{ */
-/* 			printf("ctx %d/worker type %d: n = %lf \n", i, 0, nworkers_per_type[i][0]); */
-/* 			printf("ctx %d/worker type %d: n = %lf \n", i, 1, nworkers_per_type[i][1]); */
-/* 		} */
-		int nworkers_per_type_rounded[nsched_ctxs][2];
-		_lp_round_double_to_int(nsched_ctxs, 2, nworkers_per_type, nworkers_per_type_rounded);
-/*       		for( i = 0; i < nsched_ctxs; i++) */
-/* 		{ */
-/* 			printf("ctx %d/worker type %d: n = %d \n", i, 0, nworkers_per_type_rounded[i][0]); */
-/* 			printf("ctx %d/worker type %d: n = %d \n", i, 1, nworkers_per_type_rounded[i][1]); */
-/* 		} */
-		int *current_sched_ctxs = sched_ctxs == NULL ? sched_ctx_hypervisor_get_sched_ctxs() : 
-			sched_ctxs;
-
-		unsigned has_workers = 0;
-		int s;
-		for(s = 0; s < ns; s++)
-		{
-			int nworkers_ctx = sched_ctx_hypervisor_get_nworkers_ctx(current_sched_ctxs[s], 
-									     STARPU_ANY_WORKER);
-			if(nworkers_ctx != 0)
-			{
-				has_workers = 1;
-				break;
-			}
-		}
-		if(has_workers)
-			_lp_redistribute_resources_in_ctxs(nsched_ctxs, 2, nworkers_per_type_rounded, nworkers_per_type);
-		else
-			_lp_distribute_resources_in_ctxs(sched_ctxs, nsched_ctxs, 2, nworkers_per_type_rounded, nworkers_per_type, workers, nworkers);
-	}
-	starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
-}
-
-struct sched_ctx_hypervisor_policy lp_policy = {
-	.size_ctxs = lp_size_ctxs,
-	.handle_poped_task = lp_handle_poped_task,
-	.handle_pushed_task = NULL,
-	.handle_idle_cycle = NULL,
-	.handle_idle_end = NULL,
-	.handle_post_exec_hook = NULL,
-	.handle_submitted_job = NULL,
-	.end_ctx = NULL,
-	.custom = 0,
-	.name = "lp"
-};
-
-#endif /* STARPU_HAVE_GLPK_H */
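
Factored out of lp_handle_poped_task, the millisecond conversion used above reads, as a standalone sketch:

    #include <sys/time.h>

    /* elapsed milliseconds between two gettimeofday() samples, exactly
     * as computed inline in lp_handle_poped_task above */
    static float elapsed_ms(const struct timeval *start, const struct timeval *end)
    {
        long diff_s  = end->tv_sec  - start->tv_sec;
        long diff_us = end->tv_usec - start->tv_usec;
        return (float)(diff_s * 1000000 + diff_us) / 1000;
    }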

+ 0 - 0
sched_ctx_hypervisor/src/hypervisor_policies/policy_tools.h


Some files were not shown because too many files changed in this diff