Browse Source

merge branches/sched_ctx in trunk

Nathalie Furmento 12 years ago
parent
commit
e07abf83c9
100 changed files with 12698 additions and 1374 deletions
  1. 5 0
      Makefile.am
  2. 37 1
      configure.ac
  3. 2 1
      doc/Makefile.am
  4. 137 28
      doc/chapters/advanced-api.texi
  5. 14 2
      doc/chapters/configuration.texi
  6. 36 1
      doc/chapters/perf-optimization.texi
  7. 395 0
      doc/chapters/sched_ctx_hypervisor.texi
  8. 25 16
      doc/starpu.texi
  9. 7 1
      examples/Makefile.am
  10. 27 0
      examples/cholesky/cholesky.h
  11. 59 31
      examples/cholesky/cholesky_implicit.c
  12. 128 0
      examples/sched_ctx/sched_ctx.c
  13. 301 0
      examples/sched_ctx_utils/sched_ctx_utils.c
  14. 12 0
      examples/sched_ctx_utils/sched_ctx_utils.h
  15. 60 22
      examples/scheduler/dummy_sched.c
  16. 8 2
      include/starpu.h
  17. 4 0
      include/starpu_config.h.in
  18. 1 0
      include/starpu_perfmodel.h
  19. 128 0
      include/starpu_sched_ctx.h
  20. 25 22
      include/starpu_scheduler.h
  21. 26 1
      include/starpu_task.h
  22. 2 0
      include/starpu_task_util.h
  23. 2 2
      libstarpu.pc.in
  24. 21 0
      sched_ctx_hypervisor/Makefile.am
  25. 57 0
      sched_ctx_hypervisor/examples/Makefile.am
  26. 119 0
      sched_ctx_hypervisor/examples/app_driven_test/app_driven_test.c
  27. 159 0
      sched_ctx_hypervisor/examples/cholesky/cholesky.h
  28. 422 0
      sched_ctx_hypervisor/examples/cholesky/cholesky_grain_tag.c
  29. 371 0
      sched_ctx_hypervisor/examples/cholesky/cholesky_implicit.c
  30. 251 0
      sched_ctx_hypervisor/examples/cholesky/cholesky_kernels.c
  31. 160 0
      sched_ctx_hypervisor/examples/cholesky/cholesky_models.c
  32. 407 0
      sched_ctx_hypervisor/examples/cholesky/cholesky_tag.c
  33. 333 0
      sched_ctx_hypervisor/examples/cholesky/cholesky_tile_tag.c
  34. 525 0
      sched_ctx_hypervisor/examples/sched_ctx_utils/sched_ctx_utils.c
  35. 32 0
      sched_ctx_hypervisor/examples/sched_ctx_utils/sched_ctx_utils.h
  36. 167 0
      sched_ctx_hypervisor/include/sched_ctx_hypervisor.h
  37. 39 0
      sched_ctx_hypervisor/src/Makefile.am
  38. 36 0
      sched_ctx_hypervisor/src/hypervisor_policies/app_driven_policy.c
  39. 307 0
      sched_ctx_hypervisor/src/hypervisor_policies/gflops_rate_policy.c
  40. 54 0
      sched_ctx_hypervisor/src/hypervisor_policies/idle_policy.c
  41. 595 0
      sched_ctx_hypervisor/src/hypervisor_policies/lp2_policy.c
  42. 101 0
      sched_ctx_hypervisor/src/hypervisor_policies/lp_policy.c
  43. 420 0
      sched_ctx_hypervisor/src/hypervisor_policies/lp_tools.c
  44. 22 0
      sched_ctx_hypervisor/src/hypervisor_policies/lp_tools.h
  45. 401 0
      sched_ctx_hypervisor/src/hypervisor_policies/policy_tools.c
  46. 41 0
      sched_ctx_hypervisor/src/hypervisor_policies/policy_tools.h
  47. 506 0
      sched_ctx_hypervisor/src/hypervisor_policies/simple_policy.c
  48. 249 0
      sched_ctx_hypervisor/src/sched_ctx_config.c
  49. 826 0
      sched_ctx_hypervisor/src/sched_ctx_hypervisor.c
  50. 81 0
      sched_ctx_hypervisor/src/sched_ctx_hypervisor_intern.h
  51. 7 1
      src/Makefile.am
  52. 96 0
      src/common/barrier_counter.c
  53. 37 0
      src/common/barrier_counter.h
  54. 2 2
      src/common/fxt.h
  55. 4 4
      src/core/dependencies/implicit_data_deps.c
  56. 23 13
      src/core/jobs.c
  57. 10 7
      src/core/perfmodel/perfmodel.c
  58. 10 0
      src/core/perfmodel/perfmodel_history.c
  59. 994 0
      src/core/sched_ctx.c
  60. 139 0
      src/core/sched_ctx.h
  61. 240 46
      src/core/sched_policy.c
  62. 9 6
      src/core/sched_policy.h
  63. 100 25
      src/core/task.c
  64. 4 0
      src/core/task.h
  65. 7 0
      src/core/topology.c
  66. 126 18
      src/core/workers.c
  67. 29 6
      src/core/workers.h
  68. 2 1
      src/datawizard/filters.c
  69. 3 3
      src/datawizard/malloc.c
  70. 3 3
      src/datawizard/reduction.c
  71. 5 5
      src/datawizard/user_interactions.c
  72. 45 5
      src/debug/traces/starpu_fxt.c
  73. 90 0
      src/debug/traces/starpu_paje.c
  74. 11 10
      src/drivers/driver_common/driver_common.c
  75. 310 185
      src/sched_policies/deque_modeling_policy_data_aware.c
  76. 173 535
      src/sched_policies/detect_combined_workers.c
  77. 1 1
      src/sched_policies/detect_combined_workers.h
  78. 76 21
      src/sched_policies/eager_central_policy.c
  79. 82 25
      src/sched_policies/eager_central_priority_policy.c
  80. 641 0
      src/sched_policies/heft.c
  81. 117 60
      src/sched_policies/parallel_greedy.c
  82. 241 136
      src/sched_policies/parallel_heft.c
  83. 67 24
      src/sched_policies/random_policy.c
  84. 158 98
      src/sched_policies/work_stealing_policy.c
  85. 6 0
      src/top/starpu_top.c
  86. 1 1
      src/util/execute_on_all.c
  87. 2 1
      src/util/starpu_create_sync_task.c
  88. 1 1
      src/util/starpu_data_cpy.c
  89. 14 0
      src/util/starpu_insert_task_utils.c
  90. 166 0
      src/worker_collection/worker_list.c
  91. 1 1
      starpu-1.0.pc.in
  92. 39 0
      tests/cholesky_ctxs/all_sched.sh
  93. 108 0
      tests/cholesky_ctxs/comp.sh
  94. 57 0
      tests/cholesky_ctxs/comp_all.sh
  95. 58 0
      tests/cholesky_ctxs/evaluate_expression.sh
  96. 70 0
      tests/cholesky_ctxs/gnuplot_efficiency.sh
  97. 44 0
      tests/cholesky_ctxs/gnuplot_gflopsrate.sh
  98. 56 0
      tests/cholesky_ctxs/sched_no_ctxs.sh
  99. 70 0
      tests/cholesky_ctxs/sched_with_ctxs.sh
  100. 0 0
      tools/dev/experimental/test_return_values.sh

+ 5 - 0
Makefile.am

@@ -44,6 +44,10 @@ if BUILD_STARPUFFT
 SUBDIRS += starpufft
 endif
 
+if STARPU_BUILD_SCHED_CTX_HYPERVISOR
+SUBDIRS += sched_ctx_hypervisor
+endif
+
 pkgconfigdir = $(libdir)/pkgconfig
 pkgconfig_DATA = libstarpu.pc starpu-1.0.pc
 
@@ -66,6 +70,7 @@ versinclude_HEADERS = 				\
 	include/starpu_profiling.h		\
 	include/starpu_bound.h			\
 	include/starpu_scheduler.h		\
+	include/starpu_sched_ctx.h		\
 	include/starpu_top.h			\
 	include/starpu_deprecated_api.h         \
 	include/starpu_hash.h			\

+ 37 - 1
configure.ac

@@ -223,6 +223,38 @@ if test x$enable_libnuma = xyes; then
 fi
 
 ###############################################################################
+#									      #
+#				SCHED_CTX settings			      #
+#									      #
+###############################################################################
+AC_MSG_CHECKING(maximum number of sched_ctxs)
+AC_ARG_ENABLE(max_sched_ctxs, [AS_HELP_STRING([--enable-max-sched-ctxs=<number>],
+			[maximum number of sched_ctxs])],
+			max_sched_ctxs=$enableval, max_sched_ctxs=10)
+AC_MSG_RESULT($max_sched_ctxs)
+AC_DEFINE_UNQUOTED(STARPU_NMAX_SCHED_CTXS, [$max_sched_ctxs], [Maximum number of sched_ctxs supported])
+
+AC_ARG_ENABLE([sched_ctx_hypervisor],
+  [AS_HELP_STRING([--enable-sched-ctx-hypervisor],
+    [enable resizing contexts (experimental)])],
+  [enable_sched_ctx_hypervisor="yes"],
+  [enable_sched_ctx_hypervisor="no"])
+
+#for pkgconfig
+AC_SUBST(STARPU_SCHED_CTX_HYPERVISOR)
+if test "x$enable_sched_ctx_hypervisor" = "xyes"; then
+  AC_DEFINE(STARPU_USE_SCHED_CTX_HYPERVISOR, [1], [enable sched_ctx_hypervisor lib])
+#   PKG_CHECK_MODULES([SCHED_CTX_HYPERVISOR], [libsched_ctx_hypervisor], [], build_sched_ctx_hypervisor="yes")
+   STARPU_SCHED_CTX_HYPERVISOR="-lsched_ctx_hypervisor"
+   build_sched_ctx_hypervisor="yes"
+else
+   build_sched_ctx_hypervisor="no"
+fi
+
+
+AM_CONDITIONAL([STARPU_BUILD_SCHED_CTX_HYPERVISOR], [test "x$build_sched_ctx_hypervisor" = "xyes"])
+AM_CONDITIONAL([STARPU_USE_SCHED_CTX_HYPERVISOR], [test "x$build_sched_ctx_hypervisor" = "xyes"])
+###############################################################################
 #                                                                             #
 #                                 CPUs settings                               #
 #                                                                             #
@@ -985,7 +1017,7 @@ fi
 
 AC_CHECK_HEADERS([glpk.h])
 STARPU_HAVE_LIBRARY(GLPK, [glpk])
-
+AM_CONDITIONAL([STARPU_HAVE_GLPK], [test "x$build_sched_ctx_hypervisor" = "xyes"])
 AC_CHECK_HEADERS([Ayudame.h])
 
 ###############################################################################
@@ -1822,6 +1854,9 @@ AC_OUTPUT([
 	gcc-plugin/tests/Makefile
 	gcc-plugin/tests/run-test
 	gcc-plugin/examples/Makefile
+	sched_ctx_hypervisor/Makefile
+	sched_ctx_hypervisor/src/Makefile
+	sched_ctx_hypervisor/examples/Makefile
 ])
 
 AC_MSG_NOTICE([
@@ -1860,6 +1895,7 @@ AC_MSG_NOTICE([
 	       GCC plug-in test suite (requires GNU Guile): $run_gcc_plugin_test_suite
 	       SOCL enabled:                                $build_socl
                SOCL test suite:                             $run_socl_check
+               Scheduler Hypervisor:                        $build_sched_ctx_hypervisor
                simgrid enabled:                             $enable_simgrid
                ayudame enabled:                             $ac_cv_header_Ayudame_h
 ])

+ 2 - 1
doc/Makefile.am

@@ -35,7 +35,8 @@ starpu_TEXINFOS = chapters/advanced-api.texi \
 	chapters/using.texi \
 	chapters/vector_scal_opencl.texi \
 	chapters/socl.texi \
-	chapters/version.texi
+	chapters/version.texi \
+	chapters/sched_ctx_hypervisor.texi
 
 MAINTAINERCLEANFILES = starpu.pdf starpu.html
 

+ 137 - 28
doc/chapters/advanced-api.texi

@@ -12,6 +12,7 @@
 * Task Bundles::
 * Task Lists::
 * Using Parallel Tasks::
+* Scheduling Contexts::
 * Defining a new scheduling policy::
 * Running drivers::
 * Expert mode::
@@ -473,28 +474,6 @@ Get the description of a combined worker
 Variant of starpu_worker_can_execute_task compatible with combined workers
 @end deftypefun
 
-
-@node Defining a new scheduling policy
-@section Defining a new scheduling policy
-
-TODO
-
-A full example showing how to define a new scheduling policy is available in
-the StarPU sources in the directory @code{examples/scheduler/}.
-
-@menu
-* Scheduling Policy API:: Scheduling Policy API
-* Source code::
-@end menu
-
-@node Scheduling Policy API
-@subsection Scheduling Policy API
-
-While StarPU comes with a variety of scheduling policies (@pxref{Task
-scheduling policy}), it may sometimes be desirable to implement custom
-policies to address specific problems.  The API described below allows
-users to write their own scheduling policy.
-
 @deftp {Data Type} {struct starpu_machine_topology}
 @table @asis
 @item @code{unsigned nworkers}
@@ -549,10 +528,118 @@ driver.  It is either filled according to the user's explicit parameters (from
 starpu_conf) or according to the STARPU_WORKERS_OPENCLID env. variable. Otherwise,
 they are taken in ID order.
 
+@end table
+@end deftp
+
+@node Scheduling Contexts
+@section Scheduling Contexts
+StarPU permits on one hand grouping workers in combined workers in order to execute a parallel task and on the other hand grouping tasks in bundles that will be executed by a single specified worker.
+In contrast when we group workers in scheduling contexts we submit starpu tasks to them and we schedule them with the policy assigned to the context.
+Scheduling contexts can be created, deleted and modified dynamically.
+
+@deftypefun unsigned starpu_create_sched_ctx (const char *@var{policy_name}, int *@var{workerids_ctx}, int @var{nworkers_ctx}, const char *@var{sched_ctx_name})
+This function creates a scheduling context which uses the scheduling policy indicated in the first argument and assigns the workers indicated in the second argument to execute the tasks submitted to it.
+The return value represents the identifier of the context that has just been created. It will be further used to indicate the context the tasks will be submitted to. The return value should be at most @code{STARPU_NMAX_SCHED_CTXS}.
+@end deftypefun
+
+@deftypefun void starpu_delete_sched_ctx (unsigned @var{sched_ctx_id}, unsigned @var{inheritor_sched_ctx_id}) 
+Delete scheduling context @var{sched_ctx_id} and lets scheduling context @var{inheritor_sched_ctx_id} take over its workers.
+@end deftypefun
+
+@deftypefun void starpu_add_workers_to_sched_ctx ({int *}@var{workerids_ctx}, int @var{nworkers_ctx}, unsigned @var{sched_ctx})
+This function adds dynamically the workers indicated in the first argument to the context indicated in the last argument. The last argument cannot be greater than  @code{STARPU_NMAX_SCHED_CTXS}.
+@end deftypefun
+
+@deftypefun void starpu_remove_workers_from_sched_ctx ({int *}@var{workerids_ctx}, int @var{nworkers_ctx}, unsigned @var{sched_ctx})
+This function removes the workers indicated in the first argument from the context indicated in the last argument. The last argument cannot be greater than  @code{STARPU_NMAX_SCHED_CTXS}.
+@end deftypefun
+
+A scheduling context manages a collection of workers that can be stored using different data structures. Thus, a generic structure is available in order to simplify the choice of its type. 
+Only the list data structure is available, but further data structure implementations (like trees) are foreseen. 
+
+@deftp {Data Type} {struct worker_collection}
+@table @asis
+@item @code{void *workerids}
+The workerids managed by the collection
+@item @code{unsigned nworkers}
+The number of workerids
+@item @code{pthread_key_t cursor_key} (optional)
+The cursor needed to iterate the collection (depending on the data structure)
+@item @code{int type}
+The type of structure (currently WORKER_LIST is the only one available) 
+@item @code{unsigned (*has_next)(struct worker_collection *workers)}
+Checks if there is a next worker
+@item @code{int (*get_next)(struct worker_collection *workers)}
+Gets the next worker
+@item @code{int (*add)(struct worker_collection *workers, int worker)}
+Adds a worker to the collection
+@item @code{int (*remove)(struct worker_collection *workers, int worker)}
+Removes a worker from the collection
+@item @code{void* (*init)(struct worker_collection *workers)}
+Initialize the collection
+@item @code{void (*deinit)(struct worker_collection *workers)}
+Deinitialize the collection
+@item @code{void (*init_cursor)(struct worker_collection *workers)} (optional)
+Initialize the cursor if there is one
+@item @code{void (*deinit_cursor)(struct worker_collection *workers)} (optional)
+Deinitialize the cursor if there is one
 
 @end table
 @end deftp
 
+@deftypefun struct worker_collection* starpu_create_worker_collection_for_sched_ctx (unsigned @var{sched_ctx_id}, int @var{type})
+Creates a worker collection of the type indicated by the last parameter for the context specified through the first parameter.
+@end deftypefun
+
+@deftypefun void starpu_delete_worker_collection_for_sched_ctx (unsigned @var{sched_ctx_id})
+Deletes the worker collection of the specified scheduling context
+@end deftypefun
+ 
+@deftypefun struct worker_collection* starpu_get_worker_collection_of_sched_ctx (unsigned @var{sched_ctx_id})
+Returns the worker collection managed by the indicated context
+@end deftypefun
+
+@deftypefun pthread_mutex_t* starpu_get_changing_ctx_mutex (unsigned @var{sched_ctx_id})
+@end deftypefun
+
+@deftypefun void starpu_set_sched_ctx (unsigned *@var{sched_ctx})
+Sets the scheduling context the task will be submitted to
+@end deftypefun
+
+@deftypefun unsigned starpu_get_sched_ctx (void)
+Returns the scheduling contexts the tasks are currently submitted to
+@end deftypefun
+
+@deftypefun unsigned starpu_get_nworkers_of_sched_ctx (unsigned @var{sched_ctx})
+Returns the number of workers managed by the specified contexts
+(Usually needed to verify if it manages any workers or if it should be blocked)
+@end deftypefun
+
+@deftypefun unsigned starpu_get_nshared_workers (unsigned @var{sched_ctx_id}, unsigned @var{sched_ctx_id2})
+Returns the number of workers shared by two contexts
+@end deftypefun
+
+@node Defining a new scheduling policy
+@section Defining a new scheduling policy
+
+TODO
+
+A full example showing how to define a new scheduling policy is available in
+the StarPU sources in the directory @code{examples/scheduler/}.
+
+@menu
+* Scheduling Policy API:: Scheduling Policy API
+* Source code::
+@end menu
+
+@node Scheduling Policy API
+@subsection Scheduling Policy API
+
+While StarPU comes with a variety of scheduling policies (@pxref{Task
+scheduling policy}), it may sometimes be desirable to implement custom
+policies to address specific problems.  The API described below allows
+users to write their own scheduling policy.
+
 @deftp {Data Type} {struct starpu_sched_policy}
 This structure contains all the methods that implement a scheduling policy.  An
 application may specify which scheduling strategy in the @code{sched_policy}
@@ -560,10 +647,10 @@ field of the @code{starpu_conf} structure passed to the @code{starpu_init}
 function. The different fields are:
 
 @table @asis
-@item @code{void (*init_sched)(struct starpu_machine_topology *, struct starpu_sched_policy *)}
+@item @code{void (*init_sched)(unsigned sched_ctx_id)}
 Initialize the scheduling policy.
 
-@item @code{void (*deinit_sched)(struct starpu_machine_topology *, struct starpu_sched_policy *)}
+@item @code{void (*deinit_sched)(unsigned sched_ctx_id)}
 Cleanup the scheduling policy.
 
 @item @code{int (*push_task)(struct starpu_task *)}
@@ -576,14 +663,14 @@ is about to be executed by the worker. This method therefore permits to keep
 the state of of the scheduler coherent even when StarPU bypasses the scheduling
 strategy.
 
-@item @code{struct starpu_task *(*pop_task)(void)} (optional)
+@item @code{struct starpu_task *(*pop_task)(unsigned sched_ctx_id)} (optional)
 Get a task from the scheduler. The mutex associated to the worker is already
 taken when this method is called. If this method is defined as @code{NULL}, the
 worker will only execute tasks from its local queue. In this case, the
 @code{push_task} method should use the @code{starpu_push_local_task} method to
 assign tasks to the different workers.
 
-@item @code{struct starpu_task *(*pop_every_task)(void)}
+@item @code{struct starpu_task *(*pop_every_task)(unsigned sched_ctx_id)}
 Remove all available tasks from the scheduler (tasks are chained by the means
 of the prev and next fields of the starpu_task structure). The mutex associated
 to the worker is already taken when this method is called. This is currently
@@ -595,6 +682,12 @@ This method is called every time a task is starting.
 @item @code{void (*post_exec_hook)(struct starpu_task *)} (optional)
 This method is called every time a task has been executed.
 
+@item @code{void (*add_workers)(unsigned sched_ctx_id, int *workerids, unsigned nworkers)}
+Initialize scheduling structures corresponding to each worker used by the policy.
+
+@item @code{void (*remove_workers)(unsigned sched_ctx_id, int *workerids, unsigned nworkers)}
+Deinitialize scheduling structures corresponding to each worker used by the policy.
+
 @item @code{const char *policy_name} (optional)
 Name of the policy.
 
@@ -603,8 +696,8 @@ Description of the policy.
 @end table
 @end deftp
 
-@deftypefun void starpu_worker_set_sched_condition (int @var{workerid}, pthread_cond_t *@var{sched_cond}, pthread_mutex_t *@var{sched_mutex})
-This function specifies the condition variable associated to a worker
+@deftypefun void starpu_worker_set_sched_condition (unsigned @var{sched_ctx_id}, int @var{workerid}, {pthread_cond_t *}@var{sched_cond}, pthread_mutex_t *@var{sched_mutex})
+This function specifies the condition variable associated to a worker per context
 When there is no available task for a worker, StarPU blocks this worker on a
 condition variable. This function specifies which condition variable (and the
 associated mutex) should be used to block (and to wake up) a worker. Note that
@@ -615,6 +708,20 @@ The initialization method of a scheduling strategy (@code{init_sched}) must
 call this function once per worker.
 @end deftypefun
 
+@deftypefun void starpu_worker_get_sched_condition (unsigned @var{sched_ctx_id}, int @var{workerid}, {pthread_cond_t **}@var{sched_cond}, {pthread_mutex_t **}@var{sched_mutex})
+This function returns the condition variables associated to a worker in a context
+It is used in the policy to access to the local queue of the worker
+@end deftypefun
+
+@deftypefun void starpu_set_sched_ctx_policy_data (unsigned @var{sched_ctx}, {void*} @var{policy_data})
+Each scheduling policy uses some specific data (queues, variables, additional condition variables). 
+It is stored in a local structure. This function assigns it to a scheduling context.
+@end deftypefun
+
+@deftypefun void* starpu_get_sched_ctx_policy_data (unsigned @var{sched_ctx})
+Returns the policy data previously assigned to a context
+@end deftypefun
+
 @deftypefun void starpu_sched_set_min_priority (int @var{min_prio})
 Defines the minimum priority level supported by the scheduling policy. The
 default minimum priority level is the same as the default priority level which
@@ -690,6 +797,8 @@ Returns expected conversion time in ms (multiformat interface only)
 static struct starpu_sched_policy dummy_sched_policy = @{
     .init_sched = init_dummy_sched,
     .deinit_sched = deinit_dummy_sched,
+    .add_workers = dummy_sched_add_workers,
+    .remove_workers = dummy_sched_remove_workers,
     .push_task = push_task_dummy,
     .push_prio_task = NULL,
     .pop_task = pop_task_dummy,

+ 14 - 2
doc/chapters/configuration.texi

@@ -7,8 +7,8 @@
 @c See the file starpu.texi for copying conditions.
 
 @menu
-* Compilation configuration::
-* Execution configuration through environment variables::
+* Compilation configuration::   
+* Execution configuration through environment variables::  
 @end menu
 
 @node Compilation configuration
@@ -162,6 +162,12 @@ target device.  This information is then available as the
 @code{STARPU_MAXIMPLEMENTATIONS} macro.
 @end defvr
 
+@defvr {Configure option} --enable-max-sched-ctxs=@var{count}
+Allow for at most @var{count} scheduling contexts
+This information is then available as the
+@code{STARPU_NMAX_SCHED_CTXS} macro.
+@end defvr
+
 @defvr {Configure option} --disable-asynchronous-copy
 Disable asynchronous copies between CPU and GPU devices.
 The AMD implementation of OpenCL is known to
@@ -285,6 +291,12 @@ MKL website} provides a script to determine the linking flags.
 Disable the build of examples.
 @end defvr
 
+
+@defvr {Configure option} --enable-sched-ctx-hypervisor
+Enables the Scheduling Context Hypervisor plugin (@pxref{Scheduling Context Hypervisor}).
+By default, it is disabled.
+@end defvr
+
 @defvr {Configure option} --enable-memory-stats
 Enable memory statistics (@pxref{Memory feedback}).
 @end defvr

+ 36 - 1
doc/chapters/perf-optimization.texi

@@ -14,6 +14,7 @@ TODO: improve!
 * Task submission::
 * Task priorities::
 * Task scheduling policy::
+* Task scheduling contexts::
 * Performance model calibration::
 * Task distribution vs Data transfer::
 * Data prefetch::
@@ -199,6 +200,39 @@ parallel tasks (still experimental).
 The @b{pgreedy} (parallel greedy) scheduler is similar to greedy, it also
 supports parallel tasks (still experimental).
 
+@node Task scheduling contexts
+@section Task scheduling contexts
+Task scheduling contexts represent abstract sets of workers that allow the programmers to control the distribution of computational resources (i.e. CPUs and
+GPUs) to concurrent parallel kernels. The main goal is to minimize interferences between the execution of multiple parallel kernels, by partitioning the underlying pool of workers using contexts.
+
+By default, the application submits tasks to an initial context, which disposes of all the computation resources available to StarPU (all the workers). 
+If the application programmer plans to launch several parallel kernels simultaneously, by default these kernels will be executed within this initial context, using a single scheduling policy (@pxref{Task scheduling policy}).
+Meanwhile, if the application programmer is aware of the demands of these kernels and of the specificity of the machine used to execute them, the workers can be divided between several contexts. 
+These scheduling contexts will isolate the execution of each kernel and they will permit the use of a scheduling policy proper to each one of them.
+In order to create the contexts, you have to know the identifiers of the workers running within StarPU. 
+By passing a set of workers together with the scheduling policy to the function @code{starpu_create_sched_ctx}, you will get an identifier of the context created which you will use to indicate the context you want to submit the tasks to.
+
+@cartouche
+@smallexample
+/* @b{the list of resources the context will manage} */
+int workerids[3] = @{1, 3, 10@};
+
+/* @b{indicate the scheduling policy to be used within the context, the list of 
+   workers assigned to it, the number of workers, the name of the context} */
+int id_ctx = starpu_create_sched_ctx("heft", workerids, 3, "my_ctx");
+
+/* @b{let StarPU know that the following tasks will be submitted to this context} */
+starpu_set_sched_ctx(&id_ctx);
+
+/* @b{submit the task to StarPU} */
+starpu_task_submit(task);
+
+@end smallexample
+@end cartouche
+
+Note: Parallel greedy and parallel heft scheduling policies do not support the existence of several disjoint contexts on the machine. 
+Combined workers are constructed depending on the entire topology of the machine, not only the one belonging to a context.
+
 @node Performance model calibration
 @section Performance model calibration
 
@@ -410,7 +444,8 @@ detailed in the next chapter. The various informations should be checked for.
 @itemize
 @item What does the Gantt diagram look like? (see @ref{Gantt diagram})
 @itemize
-  @item If it's mostly green (running tasks), then the machine is properly
+  @item If it's mostly green (tasks running in the initial context) or context specific 
+  color prevailing, then the machine is properly
   utilized, and perhaps the codelets are just slow. Check their performance, see
   @ref{Codelet performance}.
   @item If it's mostly purple (FetchingInput), tasks keep waiting for data

+ 395 - 0
doc/chapters/sched_ctx_hypervisor.texi

@@ -0,0 +1,395 @@
+@c -*-texinfo-*-
+
+@c This file is part of the StarPU Handbook.
+@c Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
+@c See the file starpu.texi for copying conditions.
+
+@cindex Scheduling Context Hypervisor
+
+StarPU proposes a platform for constructing Scheduling Contexts, for deleting and modifying them dynamically.
+A parallel kernel, can thus be isolated into a scheduling context and interferences between several parallel kernels are avoided.
+If the user knows exactly how many workers each scheduling context needs, he can assign them to the contexts at their creation time or modify them during the execution of the program.
+
+The Scheduling Context Hypervisor Plugin is available for the users who do not dispose of a regular parallelism, who cannot know in advance the exact size of the context and need to resize the contexts according to the behavior of the parallel kernel.
+The Hypervisor receives information from StarPU concerning the execution of the tasks, the efficiency of the resources, etc. and it decides accordingly when and how the contexts can be resized.
+Basic strategies of resizing scheduling contexts already exist but a platform for implementing additional custom ones is available.
+
+@menu
+* Managing the hypervisor::				Initialize the hypervisor
+* Registering Scheduling Contexts to the hypervisor:: 	Contexts have to register to the hypervisor
+* The user's input in the resizing process:: 		The user can help the hypervisor decide how to resize
+* Resizing strategies::					Several resizing strategies are proposed
+* Performance Counters::              			StarPU provides information to the Hypervisor through performance counters
+* Defining a new hypervisor policy::      		New Policies can be implemented
+@end menu
+
+@node Managing the hypervisor
+@section Managing the hypervisor
+There is a single hypervisor that is in charge of resizing contexts and the resizing strategy is chosen at the initialization of the hypervisor. A single resize can be done at a time.
+
+@deftypefun struct starpu_performance_counters* sched_ctx_hypervisor_init ({struct hypervisor_policy*} @var{policy})
+Initializes the hypervisor to use the strategy provided as parameter and creates the performance counters (see @pxref{Performance Counters}).
+These performance counters represent actually some callbacks that will be used by the contexts to notify the information needed by the hypervisor.
+@end deftypefun
+
+Note: The Hypervisor is actually a worker that takes this role once certain conditions trigger the resizing process (there is no additional thread assigned to the hypervisor).
+
+@deftypefun void sched_ctx_hypervisor_shutdown (void)
+The hypervisor and all its information are freed. There is no synchronization between this function and starpu_shutdown. Thus, this should be done after starpu_shutdown(), 
+because the performance counters will still need allocated callback functions.
+@end deftypefun
+
+@node Registering Scheduling Contexts to the hypervisor
+@section Registering Scheduling Contexts to the hypervisor
+Scheduling Contexts that have to be resized by the hypervisor must be first registered to the hypervisor. Whenever we want to exclude contexts from the resizing process we have to unregister them from the hypervisor.
+
+@deftypefun void sched_ctx_hypervisor_register_ctx (unsigned @var{sched_ctx}, double @var{total_flops})
+Register the context to the hypervisor, and indicate the number of flops the context will execute (needed for Gflops rate based strategy @pxref{Resizing strategies} or any other custom strategy needing it, for the others we can pass 0.0)
+@end deftypefun
+
+@deftypefun void sched_ctx_hypervisor_unregister_ctx (unsigned @var{sched_ctx})
+Unregister the context from the hypervisor
+@end deftypefun
+
+@node The user's input in the resizing process
+@section The user's input in the resizing process
+The user can totally forbid the resizing of a certain context or can then change his mind and allow it (in this case the resizing is managed by the hypervisor, that can forbid it or allow it)
+
+@deftypefun void sched_ctx_hypervisor_stop_resize (unsigned @var{sched_ctx})
+Forbid resizing of a context
+@end deftypefun
+
+@deftypefun void sched_ctx_hypervisor_start_resize (unsigned @var{sched_ctx})
+Allow resizing of a context
+@end deftypefun
+
+The user can then provide information to the hypervisor concerning the conditions of resizing.
+
+@deftypefun void sched_ctx_hypervisor_ioctl (unsigned @var{sched_ctx}, ...)
+Inputs conditions to the context @code{sched_ctx} with the following arguments.  The argument list must be zero-terminated.
+
+@defmac HYPERVISOR_MAX_IDLE
+This macro is used when calling sched_ctx_hypervisor_ioctl and must be followed by 3 arguments: 
+an array of int for the workerids to apply the condition, an int to indicate the size of the array, and a double value indicating 
+the maximum idle time allowed for a worker before the resizing process should be triggered
+@end defmac
+
+@defmac HYPERVISOR_PRIORITY
+This macro is used when calling sched_ctx_hypervisor_ioctl and must be followed by 3 arguments: 
+an array of int for the workerids to apply the condition, an int to indicate the size of the array, and an int value indicating 
+the priority of the workers previously mentioned.
+The workers with the smallest priority are moved first.
+@end defmac
+
+@defmac HYPERVISOR_MIN_WORKERS
+This macro is used when calling sched_ctx_hypervisor_ioctl and must be followed by 1 argument(int) indicating
+the minimum number of workers a context should have, underneath this limit the context cannot execute.
+@end defmac
+
+@defmac HYPERVISOR_MAX_WORKERS
+This macro is used when calling sched_ctx_hypervisor_ioctl and must be followed by 1 argument(int) indicating 
+the maximum number of workers a context should have, above this limit the context would not be able to scale
+@end defmac
+
+@defmac HYPERVISOR_GRANULARITY
+This macro is used when calling sched_ctx_hypervisor_ioctl and must be followed by 1 argument(int) indicating
+the granularity of the resizing process (the number of workers should be moved from the context once it is resized)
+This parameter is ignored for the Gflops rate based strategy (@pxref{Resizing strategies}), where the number of workers that have to be moved is calculated by the strategy.
+@end defmac
+
+@defmac HYPERVISOR_FIXED_WORKERS 
+This macro is used when calling sched_ctx_hypervisor_ioctl and must be followed by 2 arguments: 
+an array of int for the workerids to apply the condition and an int to indicate the size of the array.
+These workers are not allowed to be moved from the context.
+@end defmac
+
+@defmac HYPERVISOR_MIN_TASKS
+This macro is used when calling sched_ctx_hypervisor_ioctl and must be followed by 1 argument (int)
+that indicates the minimum number of tasks that have to be executed before the context could be resized.
+This parameter is ignored for the Application Driven strategy @pxref{Resizing strategies} where the user indicates exactly when the resize should be done.
+@end defmac
+
+@defmac HYPERVISOR_NEW_WORKERS_MAX_IDLE
+This macro is used when calling sched_ctx_hypervisor_ioctl and must be followed by 1 argument, a double value indicating 
+the maximum idle time allowed for workers that have just been moved from other contexts in the current context.
+@end defmac
+
+@defmac HYPERVISOR_TIME_TO_APPLY
+This macro is used when calling sched_ctx_hypervisor_ioctl and must be followed by 1 argument (int) indicating the tag
+an executed task should have such that this configuration should be taken into account.
+@end defmac
+@end deftypefun
+@node Resizing strategies
+@section Resizing strategies
+
+The plugin proposes several strategies for resizing the scheduling context.
+
+The @b{Application driven} strategy uses the user's input concerning the moment when he wants to resize the contexts. 
+Thus, the user tags the task that should trigger the resizing process. This can be done either by directly setting the corresponding field of the @code{starpu_task} data structure, @code{hypervisor_tag}, or by
+using the macro @code{STARPU_HYPERVISOR_TAG} in the @code{starpu_insert_task} function. 
+
+@cartouche
+@smallexample
+task.hypervisor_tag = 2;
+@end smallexample
+@end cartouche
+
+or
+
+@cartouche
+@smallexample
+@{starpu_insert_task(&codelet,
+		    ...,
+		    STARPU_HYPERVISOR_TAG, 2,
+                    0);
+@}
+@end smallexample
+@end cartouche
+
+Then the user has to indicate that when a task with the specified tag is executed the contexts should resize.
+
+@cartouche
+@smallexample
+sched_ctx_hypervisor_resize(sched_ctx, 2);
+@end smallexample
+@end cartouche
+
+The user can use the same tag to change the resizing configuration of the contexts if he considers it necessary.
+@cartouche
+@smallexample
+sched_ctx_hypervisor_ioctl(sched_ctx,
+                          HYPERVISOR_MIN_WORKERS, 6,
+                          HYPERVISOR_MAX_WORKERS, 12,
+                          HYPERVISOR_TIME_TO_APPLY, 2,
+                          NULL);
+@end smallexample
+@end cartouche
+
+
+The @b{Idleness} based strategy resizes the scheduling contexts every time one of their workers stays idle 
+for a period longer than the one imposed by the user (see @pxref{The user's input in the resizing process})
+
+@cartouche
+@smallexample
+int workerids[3] = @{1, 3, 10@};
+int workerids2[9] = @{0, 2, 4, 5, 6, 7, 8, 9, 11@};
+sched_ctx_hypervisor_ioctl(sched_ctx_id,
+                           HYPERVISOR_MAX_IDLE, workerids, 3, 10000.0,
+			   HYPERVISOR_MAX_IDLE, workerids2, 9, 50000.0,
+                           NULL);
+@end smallexample
+@end cartouche
+
+The @b{Gflops rate} based strategy resizes the scheduling contexts such that they all finish at the same time. 
+The velocity of each of them is considered and once one of them is significantly slower the resizing process is triggered.
+In order to do these computations the user has to input the total number of instructions needed to be executed by the 
+parallel kernels and the number of instructions to be executed by each task.
+The number of flops to be executed by a context are passed as parameter when they are registered to the hypervisor,
+ (@code{sched_ctx_hypervisor_register_ctx(sched_ctx_id, flops)}) and the one to be executed by each task are passed when the task is submitted.
+The corresponding field in the @code{starpu_task} data structure is @code{flops} and 
+the corresponding macro in @code{starpu_insert_task} function is @code{STARPU_FLOPS}. When the task is executed 
+the resizing process is triggered.
+@cartouche
+@smallexample
+task.flops = 100;
+@end smallexample
+@end cartouche
+
+or
+
+@cartouche
+@smallexample
+starpu_insert_task(&codelet,
+		    ...,
+		    STARPU_FLOPS, 100,
+                    0);
+@end smallexample
+@end cartouche
+
+@node Performance Counters
+@section Performance Counters
+
+The Scheduling Context Hypervisor Plugin provides a series of performance counters to StarPU. By incrementing them, StarPU can help the hypervisor in the resizing decision making process.
+
+@deftp {Data Type} {struct starpu_performance_counters}
+@anchor{struct starpu_performance_counters}
+
+@table @asis
+@item @code{void (*notify_idle_cycle)(unsigned sched_ctx, int worker, double idle_time)}
+Informs the hypervisor for how long a worker has been idle in the specified context
+@item @code{void (*notify_idle_end)(unsigned sched_ctx, int worker)}
+Informs the hypervisor that after a period of idle, the worker has just executed a task in the specified context.
+The idle counter is thus reset.
+@item @code{void (*notify_pushed_task)(unsigned sched_ctx, int worker)}
+Notifies the hypervisor a task has been scheduled on the queue of the worker corresponding to the specified context
+@item @code{void (*notify_poped_task)(unsigned sched_ctx, int worker, double flops)}
+Informs the hypervisor a task executing a specified number of instructions has been popped from the worker
+@item @code{void (*notify_post_exec_hook)(unsigned sched_ctx, int taskid)}
+Notifies the hypervisor a task has just been executed
+
+@end table
+@end deftp
+
+TODO maybe they should be hidden to the user
+
+@node Defining a new hypervisor policy
+@section Defining a new hypervisor policy
+
+@menu
+* Hypervisor Policy API:: Hypervisor Policy API
+* Hypervisor example::
+@end menu
+
+@node Hypervisor Policy API
+@subsection Hypervisor Policy API
+
+While Scheduling Context Hypervisor Plugin comes with a variety of resizing policies (@pxref{Resizing strategies}), 
+it may sometimes be desirable to implement custom
+policies to address specific problems.  The API described below allows
+users to write their own resizing policy.
+
+@deftp {Data Type} {struct hypervisor_policy}
+This structure contains all the methods that implement a hypervisor resizing policy. 
+
+@table @asis
+@item @code{const char* name}
+Indicates the name of the policy, if there is not a custom policy, the policy corresponding to this name will be used by the hypervisor
+@item @code{unsigned custom}
+Indicates whether the policy is custom or not
+@item @code{void (*handle_idle_cycle)(unsigned sched_ctx, int worker)}
+It is called whenever the indicated worker executes another idle cycle in @code{sched_ctx}
+@item @code{void (*handle_pushed_task)(unsigned sched_ctx, int worker)}
+It is called whenever a task is pushed on the worker's queue corresponding to the context @code{sched_ctx}
+@item @code{void (*handle_poped_task)(unsigned sched_ctx, int worker)}
+It is called whenever a task is poped from the worker's queue corresponding to the context @code{sched_ctx}
+@item @code{void (*handle_idle_end)(unsigned sched_ctx, int worker)}
+It is called whenever a task is executed on the indicated worker and context after a long period of idle time
+@item @code{void (*handle_post_exec_hook)(unsigned sched_ctx, struct starpu_htbl32_node* resize_requests, int task_tag)}
+It is called whenever a tag task has just been executed. The table of resize requests is provided as well as the tag
+@end table
+@end deftp
+
+The Hypervisor provides also a structure with configuration information of each context, which can be used to construct new resize strategies.
+
+@deftp {Data Type} {struct policy_config }
+This structure contains all configuration information of a context
+
+@table @asis
+@item @code{int min_nworkers}
+Indicates the minimum number of workers needed by the context
+@item @code{int max_nworkers}
+Indicates the maximum number of workers needed by the context
+@item @code{int granularity}
+Indicates the workers granularity of the context
+@item @code{int priority[STARPU_NMAXWORKERS]}
+Indicates the priority of each worker in the context
+@item @code{double max_idle[STARPU_NMAXWORKERS]}
+Indicates the maximum idle time accepted before a resize is triggered
+@item @code{int fixed_workers[STARPU_NMAXWORKERS]}
+Indicates which workers can be moved and which ones are fixed
+@item @code{double new_workers_max_idle}
+Indicates the maximum idle time accepted before a resize is triggered for the workers that just arrived in the new context
+@end table
+@end deftp
+
+Additionally, the hypervisor provides a structure with information obtained from StarPU by means of the performance counters
+
+
+@deftp {Data Type} {struct sched_ctx_wrapper}
+This structure is a wrapper of the contexts available in StarPU
+and contains all information about a context obtained by incrementing the performance counters
+
+@table @asis
+@item @code{unsigned sched_ctx}
+The context wrapped
+@item @code{struct policy_config *config}
+The corresponding resize configuration
+@item @code{double current_idle_time[STARPU_NMAXWORKERS]}
+The idle time counter of each worker of the context
+@item @code{int pushed_tasks[STARPU_NMAXWORKERS]}
+The number of pushed tasks of each worker of the context
+@item @code{int poped_tasks[STARPU_NMAXWORKERS]}
+The number of poped tasks of each worker of the context
+@item @code{double total_flops}
+The total number of flops to execute by the context
+@item @code{double total_elapsed_flops[STARPU_NMAXWORKERS]}
+The number of flops executed by each workers of the context
+@item @code{double elapsed_flops[STARPU_NMAXWORKERS]}
+The number of flops executed by each worker of the context from last resize
+@item @code{double remaining_flops}
+The number of flops that still have to be executed by the workers in the context
+@item @code{double start_time}
+The time when the context started executing
+@item @code{struct resize_ack resize_ack}
+The structure confirming the last resize finished and a new one can be done
+@end table
+@end deftp
+
+@deftp {Data Type} {struct resize_ack}
+This structure checks if the workers moved to another context are actually taken into account in that context
+@table @asis
+@item @code{int receiver_sched_ctx}
+The context receiving the new workers
+@item @code{int *moved_workers}
+The workers moved to the receiver context
+@item @code{int nmoved_workers}
+The number of workers moved
+@item @code{int *acked_workers}
+If the value corresponding to a worker is 1, that worker is already taken into account in the new context; if it is 0, not yet
+@end table
+@end deftp
+
+The following functions can be used in the resizing strategies.
+
+@deftypefun void sched_ctx_hypervisor_move_workers (unsigned @var{sender_sched_ctx}, unsigned @var{receiver_sched_ctx}, {int *}@var{workers_to_move}, unsigned @var{nworkers_to_move});
+Moves workers from one context to another
+@end deftypefun
+
+@deftypefun {struct policy_config*} sched_ctx_hypervisor_get_config (unsigned @var{sched_ctx});
+Returns the configuration structure of a context
+@end deftypefun
+
+@deftypefun {int*} sched_ctx_hypervisor_get_sched_ctxs ();
+Gets the contexts managed by the hypervisor
+@end deftypefun
+
+@deftypefun int sched_ctx_hypervisor_get_nsched_ctxs ();
+Gets the number of contexts managed by the hypervisor
+@end deftypefun
+
+@deftypefun {struct sched_ctx_wrapper*} sched_ctx_hypervisor_get_wrapper (unsigned @var{sched_ctx});
+Returns the wrapper corresponding to the context @code{sched_ctx}
+@end deftypefun
+
+@deftypefun double sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx ({struct sched_ctx_wrapper*} @var{sc_w});
+Returns the flops of a context elapsed from the last resize
+@end deftypefun
+
+@deftypefun {char*} sched_ctx_hypervisor_get_policy ();
+Returns the name of the resizing policy the hypervisor uses
+@end deftypefun
+
+@node Hypervisor example
+@subsection Hypervisor example
+
+@cartouche
+@smallexample
+
+struct hypervisor_policy dummy_policy = 
+@{
+       .handle_poped_task = dummy_handle_poped_task,
+       .handle_pushed_task = dummy_handle_pushed_task,
+       .handle_idle_cycle = dummy_handle_idle_cycle,
+       .handle_idle_end = dummy_handle_idle_end,
+       .handle_post_exec_hook = dummy_handle_post_exec_hook,
+       .custom = 1,
+       .name = "dummy"
+@};
+
+@end smallexample
+@end cartouche
+
+@c Local Variables:
+@c TeX-master: "../starpu.texi"
+@c ispell-local-dictionary: "american"
+@c End:

+ 25 - 16
doc/starpu.texi

@@ -65,22 +65,23 @@ was last updated on @value{UPDATED}.
 @comment  better formatting.
 @comment
 @menu
-* Introduction::                Getting started
-* Installing StarPU::           How to configure, build and install StarPU
-* Using StarPU::                How to run StarPU application
-* Basic Examples::              Basic examples of the use of StarPU
-* Advanced Examples::           Advanced examples of the use of StarPU
-* Benchmarks::                  Benchmarks worth running
-* Performance optimization::    How to optimize performance with StarPU
-* Performance feedback::        Performance debugging tools
-* Tips and Tricks::             Tips and tricks to know about
-* StarPU MPI support::          How to combine StarPU with MPI
-* StarPU FFT support::          How to perform FFT computations with StarPU
-* C Extensions::                Easier StarPU programming with GCC
-* SOCL OpenCL Extensions::      How to use OpenCL on top of StarPU
-* StarPU Basic API::            The Basic API to use StarPU
-* StarPU Advanced API::         Advanced use of StarPU
-* Configuring StarPU::          How to configure StarPU
+* Introduction::                	Getting started
+* Installing StarPU::           	How to configure, build and install StarPU
+* Using StarPU::                	How to run StarPU application
+* Basic Examples::              	Basic examples of the use of StarPU
+* Advanced Examples::           	Advanced examples of the use of StarPU
+* Benchmarks::                  	Benchmarks worth running
+* Performance optimization::    	How to optimize performance with StarPU
+* Performance feedback::        	Performance debugging tools
+* Tips and Tricks::             	Tips and tricks to know about
+* StarPU MPI support::          	How to combine StarPU with MPI
+* StarPU FFT support::          	How to perform FFT computations with StarPU
+* C Extensions::                	Easier StarPU programming with GCC
+* SOCL OpenCL Extensions::      	How to use OpenCL on top of StarPU
+* Scheduling Context Hypervisor:: 	How to use Scheduling Context Hypervisor with StarPU
+* StarPU Basic API::            	The Basic API to use StarPU
+* StarPU Advanced API::         	Advanced use of StarPU
+* Configuring StarPU::          	How to configure StarPU
 * Full source code for the 'Scaling a Vector' example::
 * GNU Free Documentation License::  How you can copy and share this manual.
 
@@ -195,6 +196,14 @@ was last updated on @value{UPDATED}.
 @include chapters/socl.texi
 
 @c ---------------------------------------------------------------------
+@c Scheduling Context Hypervisor
+@c ---------------------------------------------------------------------
+
+@node Scheduling Context Hypervisor
+@chapter Scheduling Context Hypervisor
+@include chapters/sched_ctx_hypervisor.texi
+
+@c ---------------------------------------------------------------------
 @c StarPU API
 @c ---------------------------------------------------------------------
 

+ 7 - 1
examples/Makefile.am

@@ -3,7 +3,7 @@
 # Copyright (C) 2009, 2010-2012  Université de Bordeaux 1
 # Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
 # Copyright (C) 2011  Télécom-SudParis
-# Copyright (C) 2012 INRIA
+# Copyright (C) 2011-2012  INRIA
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -47,6 +47,8 @@ EXTRA_DIST = 					\
 	lu/xlu_implicit_pivot.c			\
 	lu/xlu_kernels.c			\
 	lu/lu_example.c				\
+	sched_ctx_utils/sched_ctx_utils.c		\
+	sched_ctx/sched_ctx.c		\
 	incrementer/incrementer_kernels_opencl_kernel.cl 	\
 	basic_examples/variable_kernels_opencl_kernel.cl	\
 	matvecmult/matvecmult_kernel.cl				\
@@ -99,6 +101,7 @@ noinst_HEADERS = 				\
 	lu/complex_double.h			\
 	lu/blas_complex.h			\
 	cholesky/cholesky.h			\
+	sched_ctx_utils/sched_ctx_utils.h	\
 	common/blas_model.h			\
 	common/blas.h				\
 	mult/simple.h				\
@@ -184,6 +187,7 @@ examplebin_PROGRAMS +=				\
 	matvecmult/matvecmult			\
 	profiling/profiling			\
 	scheduler/dummy_sched			\
+	sched_ctx/sched_ctx			\
 	reductions/dot_product			\
 	reductions/minmax_reduction		\
 	ppm_downscaler/ppm_downscaler		\
@@ -250,6 +254,7 @@ STARPU_EXAMPLES +=				\
 	matvecmult/matvecmult			\
 	profiling/profiling			\
 	scheduler/dummy_sched			\
+	sched_ctx/sched_ctx				\
 	reductions/dot_product			\
 	reductions/minmax_reduction
 
@@ -512,6 +517,7 @@ cholesky_cholesky_implicit_SOURCES =		\
 	cholesky/cholesky_implicit.c		\
 	cholesky/cholesky_models.c		\
 	cholesky/cholesky_kernels.c		\
+	sched_ctx_utils/sched_ctx_utils.c	\
 	common/blas.c
 
 cholesky_cholesky_implicit_LDADD =		\

+ 27 - 0
examples/cholesky/cholesky.h

@@ -64,6 +64,10 @@ static unsigned check = 0;
 static unsigned bound = 0;
 static unsigned bound_deps = 0;
 static unsigned bound_lp = 0;
+static unsigned with_ctxs = 0;
+static unsigned with_noctxs = 0;
+static unsigned chole1 = 0;
+static unsigned chole2 = 0;
 
 void chol_cpu_codelet_update_u11(void **, void *);
 void chol_cpu_codelet_update_u21(void **, void *);
@@ -84,6 +88,29 @@ static void __attribute__((unused)) parse_args(int argc, char **argv)
 	int i;
 	for (i = 1; i < argc; i++)
 	{
+		if (strcmp(argv[i], "-with_ctxs") == 0) 
+		{
+			with_ctxs = 1;
+			break;
+		}
+		if (strcmp(argv[i], "-with_noctxs") == 0) 
+		{
+			with_noctxs = 1;
+			break;
+		}
+		
+		if (strcmp(argv[i], "-chole1") == 0) 
+		{
+			chole1 = 1;
+			break;
+		}
+
+		if (strcmp(argv[i], "-chole2") == 0) 
+		{
+			chole2 = 1;
+			break;
+		}
+
 		if (strcmp(argv[i], "-size") == 0)
 		{
 		        char *argptr;

+ 59 - 31
examples/cholesky/cholesky_implicit.c

@@ -17,7 +17,7 @@
  */
 
 #include "cholesky.h"
-
+#include "../sched_ctx_utils/sched_ctx_utils.h"
 /*
  *	Create the codelets
  */
@@ -137,24 +137,31 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 
 	end = starpu_timing_now();
 
+	//double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
 	double timing = end - start;
-	FPRINTF(stderr, "Computation took (in ms)\n");
-	FPRINTF(stdout, "%2.2f\n", timing/1000);
-
 	unsigned long n = starpu_matrix_get_nx(dataA);
 
 	double flop = (1.0f*n*n*n)/3.0f;
-	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
-	if (bound_lp)
-	{
-		FILE *f = fopen("cholesky.lp", "w");
-		starpu_bound_print_lp(f);
-	}
-	if (bound)
+
+	if(with_ctxs || with_noctxs || chole1 || chole2)
+		update_sched_ctx_timing_results((flop/timing/1000.0f), (timing/1000000.0f));
+	else
 	{
-		double res;
-		starpu_bound_compute(&res, NULL, 0);
-		FPRINTF(stderr, "Theoretical GFlops: %2.2f\n", (flop/res/1000000.0f));
+		FPRINTF(stderr, "Computation took (in ms)\n");
+		FPRINTF(stdout, "%2.2f\n", timing/1000);
+	
+		FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+		if (bound_lp)
+		{
+			FILE *f = fopen("cholesky.lp", "w");
+			starpu_bound_print_lp(f);
+		}
+		if (bound)
+		{
+			double res;
+			starpu_bound_compute(&res, NULL, 0);
+			FPRINTF(stderr, "Theoretical GFlops: %2.2f\n", (flop/res/1000000.0f));
+		}
 	}
 	return 0;
 }
@@ -189,24 +196,9 @@ static int cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 	return ret;
 }
 
-int main(int argc, char **argv)
+static void execute_cholesky(unsigned size, unsigned nblocks)
 {
 	int ret;
-
-	/* create a simple definite positive symetric matrix example
-	 *
-	 *	Hilbert matrix : h(i,j) = 1/(i+j+1)
-	 * */
-
-	parse_args(argc, argv);
-
-	ret = starpu_init(NULL);
-	if (ret == -ENODEV)
-		return 77;
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-
-	starpu_helper_cublas_init();
-
 	float *mat = NULL;
 	unsigned i,j;
 
@@ -321,9 +313,45 @@ int main(int argc, char **argv)
 	        }
 		free(test_mat);
 	}
+	starpu_free(mat);
+}
+
+int main(int argc, char **argv)
+{
+	/* create a simple definite positive symetric matrix example
+	 *
+	 *	Hilbert matrix : h(i,j) = 1/(i+j+1)
+	 * */
+
+	parse_args(argc, argv);
+
+	if(with_ctxs || with_noctxs || chole1 || chole2)
+		parse_args_ctx(argc, argv);
+
+	int ret;
+	ret = starpu_init(NULL);
+
+	if (ret == -ENODEV)
+                return 77;
+        STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	starpu_helper_cublas_init();
+
+	if(with_ctxs)
+	{
+		construct_contexts(execute_cholesky);
+		start_2benchs(execute_cholesky);
+	}
+	else if(with_noctxs)
+		start_2benchs(execute_cholesky);
+	else if(chole1)
+		start_1stbench(execute_cholesky);
+	else if(chole2)
+		start_2ndbench(execute_cholesky);
+	else
+		execute_cholesky(size, nblocks);
 
 	starpu_helper_cublas_shutdown();
-	starpu_free(mat);
 	starpu_shutdown();
 
 	return ret;

+ 128 - 0
examples/sched_ctx/sched_ctx.c

@@ -0,0 +1,128 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include<starpu.h>
+#include<pthread.h>
+
+#define NTASKS 1000
+int tasks_executed = 0;
+pthread_mutex_t mut;
+
+static void sched_ctx_func(void *descr[] __attribute__ ((unused)), void *arg __attribute__ ((unused)))
+{
+	pthread_mutex_lock(&mut);
+	tasks_executed++;
+	pthread_mutex_unlock(&mut);
+}
+
+static struct starpu_codelet sched_ctx_codelet =
+{
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
+	.cpu_funcs = {sched_ctx_func, NULL},
+	.cuda_funcs = {sched_ctx_func, NULL},
+	.opencl_funcs = {sched_ctx_func, NULL},
+	.model = NULL,
+	.nbuffers = 0
+};
+
+
+int main(int argc, char **argv)
+{
+	int ntasks = NTASKS;
+	int ret;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+#ifdef STARPU_QUICK_CHECK
+	ntasks /= 100;
+#endif
+
+	pthread_mutex_init(&mut, NULL);
+	int nprocs1 = 1;
+	int nprocs2 = 1;
+	int procs1[20], procs2[20];
+	procs1[0] = 0;
+	procs2[0] = 0;
+
+#ifdef STARPU_USE_CPU
+	unsigned ncpus =  starpu_cpu_worker_get_count();
+	starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, procs1, ncpus);
+
+	nprocs1 = ncpus;
+#endif
+
+#ifdef STARPU_USE_CUDA
+	unsigned ncuda = starpu_cuda_worker_get_count();
+	starpu_worker_get_ids_by_type(STARPU_CUDA_WORKER, procs2, ncuda);
+
+	nprocs2 = ncuda;
+#endif
+
+	/*create contexts however you want*/
+	unsigned sched_ctx1 = starpu_create_sched_ctx("heft", procs1, nprocs1, "ctx1");
+	unsigned sched_ctx2 = starpu_create_sched_ctx("heft", procs2, nprocs2, "ctx2");
+
+	/*indicate what to do with the resources when context 2 finishes (it depends on your application)*/
+	starpu_sched_ctx_set_inheritor(sched_ctx2, sched_ctx1);
+
+	unsigned i;
+	for (i = 0; i < ntasks/2; i++)
+	{
+		struct starpu_task *task = starpu_task_create();
+
+		task->cl = &sched_ctx_codelet;
+		task->cl_arg = NULL;
+
+		/*submit tasks to context*/
+		ret = starpu_task_submit_to_ctx(task,sched_ctx1);
+
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	/* tell starpu when you finished submitting tasks to this context
+	   in order to allow moving resources from this context to the inheritor one
+	   when its corresponding tasks finished executing */
+
+#warning TODO: to be fixed
+//	starpu_sched_ctx_finished_submit(sched_ctx1);
+
+	for (i = 0; i < ntasks/2; i++)
+	{
+		struct starpu_task *task = starpu_task_create();
+
+		task->cl = &sched_ctx_codelet;
+		task->cl_arg = NULL;
+
+		ret = starpu_task_submit_to_ctx(task,sched_ctx2);
+
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+#warning TODO: to be fixed
+//	starpu_sched_ctx_finished_submit(sched_ctx2);
+
+	/* wait for all tasks at the end*/
+	starpu_task_wait_for_all();
+
+	printf("tasks executed %d out of %d\n", tasks_executed, ntasks);
+	starpu_shutdown();
+
+	return 0;
+}

+ 301 - 0
examples/sched_ctx_utils/sched_ctx_utils.c

@@ -0,0 +1,301 @@
+#include "sched_ctx_utils.h"
+#include <starpu.h>
+
+/* Problem sizes (matrix order) and blocking factors for the two benchmarks */
+unsigned size1;
+unsigned size2;
+unsigned nblocks1;
+unsigned nblocks2;
+/* number of CPU workers private to benchmark 1 / benchmark 2 */
+unsigned cpu1;
+unsigned cpu2;
+/* number of GPU workers shared by both contexts, and private to each one */
+unsigned gpu;
+unsigned gpu1;
+unsigned gpu2;
+
+/* Description of one benchmark run and its scheduling context */
+typedef struct {
+	unsigned id;	/* result slot index in rv[] (0 or 1) */
+	unsigned ctx;	/* sched_ctx id; 0 when no context is used */
+	int the_other_ctx;	/* id of the peer context (inheritor on delete) */
+	int *procs;	/* worker ids owned by this context */
+	int nprocs;	/* number of entries in procs */
+	void (*bench)(unsigned, unsigned);	/* benchmark body: (size, nblocks) */
+	unsigned size;
+	unsigned nblocks;
+} params;
+
+/* Accumulated results of one benchmark */
+typedef struct {
+	double flops;
+	double avg_timing;
+} retvals;
+
+#define NSAMPLES 1
+int first = 1;	/* guards the one-shot context deletion in start_bench() */
+pthread_mutex_t mut;	/* protects `first` */
+retvals rv[2];	/* results for benchmark 0 and benchmark 1 */
+params p1, p2;
+
+/* thread-specific pointer to the running benchmark's `id` field */
+pthread_key_t key;
+
+/* Reset benchmark parameters and result accumulators to their defaults.
+ * Called by parse_args_ctx() before applying command-line overrides. */
+void init()
+{
+	size1 = 4*1024;
+	size2 = 4*1024;
+	nblocks1 = 16;
+	nblocks2 = 16;
+	cpu1 = 0;
+	cpu2 = 0;
+	gpu = 0;
+	gpu1 = 0;
+	gpu2 = 0;
+
+	rv[0].flops = 0.0;
+	rv[1].flops = 0.0;
+	/* fix: both slots must be cleared; previously rv[1].avg_timing was
+	 * assigned twice and rv[0].avg_timing was never reset */
+	rv[0].avg_timing = 0.0;
+	rv[1].avg_timing = 0.0;
+
+	p1.ctx = 0;
+	p2.ctx = 0;
+
+	p1.id = 0;
+	p2.id = 1;
+	/* per-thread key used by update_sched_ctx_timing_results() to find
+	 * which result slot belongs to the calling benchmark thread */
+	pthread_key_create(&key, NULL);
+}
+
+/* Accumulate one run's results into the slot of the calling benchmark
+ * thread; the slot index is the per-thread id installed with
+ * pthread_setspecific(key, ...) in start_bench(). */
+void update_sched_ctx_timing_results(double flops, double avg_timing)
+{
+	unsigned *id = pthread_getspecific(key);
+	rv[*id].flops += flops;
+	rv[*id].avg_timing += avg_timing;
+}
+
+/* Thread entry point: run one benchmark NSAMPLES times inside its
+ * scheduling context, then average the recorded results.
+ * `val` is a params* describing the context and the benchmark to run. */
+void* start_bench(void *val){
+	params *p = (params*)val;
+	int i;
+
+	/* let update_sched_ctx_timing_results() know which rv[] slot is ours */
+	pthread_setspecific(key, &p->id);
+
+	if(p->ctx != 0)
+		starpu_set_sched_ctx(&p->ctx);
+
+	for(i = 0; i < NSAMPLES; i++)
+		p->bench(p->size, p->nblocks);
+
+	if(p->ctx != 0)
+	{
+		/* only the first thread to finish deletes its context, handing
+		 * its workers over to the other (inheritor) context */
+		pthread_mutex_lock(&mut);
+		if(first){
+			starpu_delete_sched_ctx(p->ctx, p->the_other_ctx);
+		}
+		
+		first = 0;
+		pthread_mutex_unlock(&mut);
+	}
+
+	/* turn the accumulated sums into per-sample averages */
+	rv[p->id].flops /= NSAMPLES;
+	rv[p->id].avg_timing /= NSAMPLES;
+
+	return NULL;
+}
+
+/* Run both benchmarks concurrently, one pthread per scheduling context,
+ * and print the averaged flops/timings plus the total wall-clock time. */
+void start_2benchs(void (*bench)(unsigned, unsigned))
+{
+	p1.bench = bench;
+	p1.size = size1;
+	printf("size %d\n", size1);
+	p1.nblocks = nblocks1;
+	
+	p2.bench = bench;
+	p2.size = size2;
+	printf("size %d\n", size2);
+	p2.nblocks = nblocks2;
+	
+	pthread_t tid[2];
+	/* protects the `first` one-shot flag used in start_bench() */
+	pthread_mutex_init(&mut, NULL);
+
+	struct timeval start;
+	struct timeval end;
+
+	gettimeofday(&start, NULL);
+
+	pthread_create(&tid[0], NULL, (void*)start_bench, (void*)&p1);
+	pthread_create(&tid[1], NULL, (void*)start_bench, (void*)&p2);
+ 
+	pthread_join(tid[0], NULL);
+	pthread_join(tid[1], NULL);
+
+	gettimeofday(&end, NULL);
+
+	pthread_mutex_destroy(&mut);
+  
+	/* elapsed wall-clock time, converted from microseconds to seconds */
+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	timing /= 1000000;
+
+	printf("%2.2f %2.2f ", rv[0].flops, rv[1].flops);
+	printf("%2.2f %2.2f %2.2f\n", rv[0].avg_timing, rv[1].avg_timing, timing);
+
+}
+
+/* Run only the first benchmark, in the calling thread and without creating
+ * any scheduling context, then print its averaged results and elapsed time. */
+void start_1stbench(void (*bench)(unsigned, unsigned))
+{
+	p1.bench = bench;
+	p1.size = size1;
+	p1.nblocks = nblocks1;
+	
+	struct timeval start;
+	struct timeval end;
+
+	gettimeofday(&start, NULL);
+
+	start_bench((void*)&p1);
+
+	gettimeofday(&end, NULL);
+
+	/* NOTE(review): `mut` is destroyed here but is only initialized in
+	 * start_2benchs(); destroying an uninitialized mutex is undefined
+	 * behaviour -- confirm this path is never taken on its own */
+	pthread_mutex_destroy(&mut);
+  
+	/* elapsed wall-clock time, converted from microseconds to seconds */
+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	timing /= 1000000;
+
+	printf("%2.2f ", rv[0].flops);
+	printf("%2.2f %2.2f\n", rv[0].avg_timing, timing);
+}
+
+/* Run only the second benchmark, in the calling thread and without creating
+ * any scheduling context, then print its averaged results and elapsed time. */
+void start_2ndbench(void (*bench)(unsigned, unsigned))
+{
+	p2.bench = bench;
+	p2.size = size2;
+	p2.nblocks = nblocks2;
+	
+	struct timeval start;
+	struct timeval end;
+
+	gettimeofday(&start, NULL);
+
+	start_bench((void*)&p2);
+
+	gettimeofday(&end, NULL);
+
+	/* NOTE(review): `mut` is destroyed here but is only initialized in
+	 * start_2benchs(); destroying an uninitialized mutex is undefined
+	 * behaviour -- confirm this path is never taken on its own */
+	pthread_mutex_destroy(&mut);
+  
+	/* elapsed wall-clock time, converted from microseconds to seconds */
+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	timing /= 1000000;
+
+	printf("%2.2f ", rv[1].flops);
+	printf("%2.2f %2.2f\n", rv[1].avg_timing, timing);
+}
+
+/* Build the two scheduling contexts and record which workers each one owns.
+ * Worker ids are laid out as: shared gpus, ctx1-only gpus, ctx2-only gpus,
+ * then cpus -- NOTE(review): this assumed numbering should be confirmed
+ * against starpu_worker_get_ids_by_type().
+ * The `bench` argument is currently unused. */
+void construct_contexts(void (*bench)(unsigned, unsigned))
+{
+	int nprocs1 = cpu1 + gpu + gpu1;
+	int nprocs2 = cpu2 + gpu + gpu2;
+	unsigned n_all_gpus = gpu + gpu1 + gpu2;
+
+	/* fix: allocate the worker-id arrays on the heap; the previous stack
+	 * (VLA) arrays were stored into p1.procs/p2.procs and dangled as soon
+	 * as this function returned. They live for the whole run, so they are
+	 * intentionally never freed. */
+	int *procs = (int*)malloc(nprocs1 * sizeof(int));
+	int i;
+	int k = 0;
+
+	/* gpus shared by both contexts */
+	for(i = 0; i < gpu; i++)
+	{
+		procs[k++] = i;
+		printf("%d ", i);
+	}
+
+	/* gpus private to context 1 */
+	for(i = gpu; i < gpu + gpu1; i++)
+	{
+		procs[k++] = i;
+		printf("%d ", i);
+	}
+
+	/* cpus for context 1 (cpu ids start after all gpus) */
+	for(i = n_all_gpus; i < n_all_gpus + cpu1; i++)
+	{
+		procs[k++] = i;
+		printf("%d ", i);
+	}
+	printf("\n ");
+
+	p1.ctx = starpu_create_sched_ctx("heft", procs, nprocs1, "sched_ctx1");
+	p2.the_other_ctx = (int)p1.ctx;
+	p1.procs = procs;
+	p1.nprocs = nprocs1;
+
+	int *procs2 = (int*)malloc(nprocs2 * sizeof(int));
+	k = 0;
+
+	/* gpus shared by both contexts */
+	for(i = 0; i < gpu; i++)
+	{
+		procs2[k++] = i;
+		printf("%d ", i);
+	}
+
+	/* gpus private to context 2 */
+	for(i = gpu + gpu1; i < gpu + gpu1 + gpu2; i++)
+	{
+		procs2[k++] = i;
+		printf("%d ", i);
+	}
+
+	/* cpus for context 2, after context 1's cpus */
+	for(i = n_all_gpus  + cpu1; i < n_all_gpus + cpu1 + cpu2; i++)
+	{
+		procs2[k++] = i;
+		printf("%d ", i);
+	}
+	printf("\n");
+
+	p2.ctx = starpu_create_sched_ctx("heft", procs2, nprocs2, "sched_ctx2");
+	p1.the_other_ctx = (int)p2.ctx;
+	p2.procs = procs2;
+	p2.nprocs = nprocs2;
+}
+
+
+/* Initialize the defaults (via init()) and then parse command-line
+ * overrides: -size1/-size2, -nblocks1/-nblocks2, -cpu1/-cpu2 and
+ * -gpu/-gpu1/-gpu2, each expecting a following integer value.
+ * NOTE(review): if an option is the last argument, argv[++i] is the
+ * terminating NULL and strtol() will crash -- confirm callers always
+ * pass a value. */
+void parse_args_ctx(int argc, char **argv)
+{
+	init();
+	int i;
+	for (i = 1; i < argc; i++) {
+		if (strcmp(argv[i], "-size1") == 0) {
+			char *argptr;
+			size1 = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-nblocks1") == 0) {
+			char *argptr;
+			nblocks1 = strtol(argv[++i], &argptr, 10);
+		}
+		
+		if (strcmp(argv[i], "-size2") == 0) {
+			char *argptr;
+			size2 = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-nblocks2") == 0) {
+			char *argptr;
+			nblocks2 = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-cpu1") == 0) {
+			char *argptr;
+			cpu1 = strtol(argv[++i], &argptr, 10);
+		}    
+
+		if (strcmp(argv[i], "-cpu2") == 0) {
+			char *argptr;
+			cpu2 = strtol(argv[++i], &argptr, 10);
+		}    
+
+		if (strcmp(argv[i], "-gpu") == 0) {
+			char *argptr;
+			gpu = strtol(argv[++i], &argptr, 10);
+		}    
+
+		if (strcmp(argv[i], "-gpu1") == 0) {
+			char *argptr;
+			gpu1 = strtol(argv[++i], &argptr, 10);
+		}    
+
+		if (strcmp(argv[i], "-gpu2") == 0) {
+			char *argptr;
+			gpu2 = strtol(argv[++i], &argptr, 10);
+		}    
+	}
+}
+
+

+ 12 - 0
examples/sched_ctx_utils/sched_ctx_utils.h

@@ -0,0 +1,12 @@
+#include <limits.h>
+#include <string.h>
+#include <math.h>
+#include <sys/time.h>
+#include <stdlib.h>
+
+/* Set defaults and parse the -sizeN, -nblocksN, -cpuN, -gpu* options. */
+void parse_args_ctx(int argc, char **argv);
+/* Record one run's results into the calling benchmark thread's slot. */
+void update_sched_ctx_timing_results(double gflops, double timing);
+/* Create the two scheduling contexts (the bench argument is currently unused). */
+void construct_contexts(void (*bench)(unsigned size, unsigned nblocks));
+/* Run both benchmarks concurrently, one thread per scheduling context. */
+void start_2benchs(void (*bench)(unsigned size, unsigned nblocks));
+/* Run only the first / only the second benchmark, without contexts. */
+void start_1stbench(void (*bench)(unsigned size, unsigned nblocks));
+void start_2ndbench(void (*bench)(unsigned size, unsigned nblocks));

+ 60 - 22
examples/scheduler/dummy_sched.c

@@ -21,65 +21,103 @@
 #define NTASKS	32000
 #define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
 
-struct starpu_task_list sched_list;
+typedef struct dummy_sched_data {
+	struct starpu_task_list sched_list;
+	pthread_mutex_t sched_mutex;
+	pthread_cond_t sched_cond;
+} dummy_sched_data;
 
-static pthread_cond_t sched_cond;
-static pthread_mutex_t sched_mutex;
+/* Context hook called when workers join this scheduling context: bind each
+ * added worker to the policy's single shared mutex/condition pair so it can
+ * sleep/wake on the common task list. */
+static void dummy_sched_add_workers(unsigned sched_ctx_id, int *workerids, unsigned nworkers)
+{
+	struct dummy_sched_data *data = (struct dummy_sched_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
+	
+	unsigned i;
+	int workerid;
+	for(i = 0; i < nworkers; i++)
+	{
+		workerid = workerids[i];
+		starpu_worker_set_sched_condition(sched_ctx_id, workerid, &data->sched_mutex,  &data->sched_cond);
+	}
+}
 
-static void init_dummy_sched(struct starpu_machine_topology *topology,
-			struct starpu_sched_policy *policy)
+static void dummy_sched_remove_workers(unsigned sched_ctx_id, int *workerids, unsigned nworkers)
 {
+	unsigned i;
+	int workerid;
+	for(i = 0; i < nworkers; i++)
+	{
+		workerid = workerids[i];
+		starpu_worker_set_sched_condition(sched_ctx_id, workerid, NULL,  NULL);
+	}
+}
+
+static void init_dummy_sched(unsigned sched_ctx_id)
+{
+	starpu_create_worker_collection_for_sched_ctx(sched_ctx_id, WORKER_LIST);
+
+	struct dummy_sched_data *data = (struct dummy_sched_data*)malloc(sizeof(struct dummy_sched_data));
+	
+
 	/* Create a linked-list of tasks and a condition variable to protect it */
-	starpu_task_list_init(&sched_list);
+	starpu_task_list_init(&data->sched_list);
 
-	pthread_mutex_init(&sched_mutex, NULL);
-	pthread_cond_init(&sched_cond, NULL);
+	pthread_mutex_init(&data->sched_mutex, NULL);
+	pthread_cond_init(&data->sched_cond, NULL);
 
-	unsigned workerid;
-	for (workerid = 0; workerid < topology->nworkers; workerid++)
-		starpu_worker_set_sched_condition(workerid, &sched_cond, &sched_mutex);
+	starpu_set_sched_ctx_policy_data(sched_ctx_id, (void*)data);		
 
 	FPRINTF(stderr, "Initialising Dummy scheduler\n");
 }
 
-static void deinit_dummy_sched(struct starpu_machine_topology *topology,
-				struct starpu_sched_policy *policy)
+static void deinit_dummy_sched(unsigned sched_ctx_id)
 {
-	STARPU_ASSERT(starpu_task_list_empty(&sched_list));
+	struct dummy_sched_data *data = (struct dummy_sched_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
 
-	pthread_cond_destroy(&sched_cond);
-	pthread_mutex_destroy(&sched_mutex);
+	STARPU_ASSERT(starpu_task_list_empty(&data->sched_list));
 
+	pthread_cond_destroy(&data->sched_cond);
+	pthread_mutex_destroy(&data->sched_mutex);
+
+	starpu_delete_worker_collection_for_sched_ctx(sched_ctx_id);
+
+	free(data);
+	
 	FPRINTF(stderr, "Destroying Dummy scheduler\n");
 }
 
 static int push_task_dummy(struct starpu_task *task)
 {
-	pthread_mutex_lock(&sched_mutex);
+	unsigned sched_ctx_id = task->sched_ctx;
+	struct dummy_sched_data *data = (struct dummy_sched_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
+
+	pthread_mutex_lock(&data->sched_mutex);
 
-	starpu_task_list_push_front(&sched_list, task);
+	starpu_task_list_push_front(&data->sched_list, task);
 
-	pthread_cond_signal(&sched_cond);
+	pthread_cond_signal(&data->sched_cond);
 
-	pthread_mutex_unlock(&sched_mutex);
+	pthread_mutex_unlock(&data->sched_mutex);
 
 	return 0;
 }
 
 /* The mutex associated to the calling worker is already taken by StarPU */
-static struct starpu_task *pop_task_dummy(void)
+static struct starpu_task *pop_task_dummy(unsigned sched_ctx_id)
 {
 	/* NB: In this simplistic strategy, we assume that all workers are able
 	 * to execute all tasks, otherwise, it would have been necessary to go
 	 * through the entire list until we find a task that is executable from
 	 * the calling worker. So we just take the head of the list and give it
 	 * to the worker. */
-	return starpu_task_list_pop_back(&sched_list);
+	struct dummy_sched_data *data = (struct dummy_sched_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
+	return starpu_task_list_pop_back(&data->sched_list);
 }
 
 static struct starpu_sched_policy dummy_sched_policy =
 {
 	.init_sched = init_dummy_sched,
+	.add_workers = dummy_sched_add_workers,
+	.remove_workers = dummy_sched_remove_workers,
 	.deinit_sched = deinit_dummy_sched,
 	.push_task = push_task_dummy,
 	.pop_task = pop_task_dummy,

+ 8 - 2
include/starpu.h

@@ -51,6 +51,7 @@ typedef unsigned long long uint64_t;
 #endif
 #include <starpu_task_util.h>
 #include <starpu_scheduler.h>
+#include <starpu_sched_ctx.h>
 #include <starpu_expert.h>
 #include <starpu_rand.h>
 #include <starpu_cuda.h>
@@ -72,6 +73,9 @@ extern "C"
 
 enum starpu_archtype
 {
+#ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
+	STARPU_ANY_WORKER, /* any worker, used in the hypervisor */
+#endif
 	STARPU_CPU_WORKER,    /* CPU core */
 	STARPU_CUDA_WORKER,   /* NVIDIA CUDA device */
 	STARPU_OPENCL_WORKER, /* OpenCL device */
@@ -174,6 +178,7 @@ void starpu_topology_print(FILE *f);
  * StarPU tasks). The returned value should be at most STARPU_NMAXWORKERS. */
 unsigned starpu_worker_get_count(void);
 unsigned starpu_combined_worker_get_count(void);
+unsigned starpu_worker_is_combined_worker(int id);
 
 unsigned starpu_cpu_worker_get_count(void);
 unsigned starpu_cuda_worker_get_count(void);
@@ -230,8 +235,9 @@ void starpu_worker_get_name(int id, char *dst, size_t maxlen);
  *  identifier (as returned by the starpu_worker_get_id() function)
  */
 int starpu_worker_get_devid(int id);
-
-int starpu_driver_run(struct starpu_driver *d);
+void starpu_profiling_init();
+void starpu_display_stats();
+int starpu_driver_run(struct starpu_driver *);
 void starpu_drivers_request_termination(void);
 
 int starpu_driver_init(struct starpu_driver *d);

+ 4 - 0
include/starpu_config.h.in

@@ -72,7 +72,11 @@
 #undef STARPU_MAXOPENCLDEVS
 #undef STARPU_MAXGORDONDEVS
 #undef STARPU_NMAXWORKERS
+#undef STARPU_NMAX_SCHED_CTXS
 #undef STARPU_MAXIMPLEMENTATIONS
+#undef STARPU_USE_SCHED_CTX_HYPERVISOR
+/* Define to 1 if you have the <glpk.h> header file. */
+#undef HAVE_GLPK_H
 
 #undef STARPU_HAVE_LIBNUMA
 

+ 1 - 0
include/starpu_perfmodel.h

@@ -203,6 +203,7 @@ int starpu_perfmodel_load_symbol(const char *symbol, struct starpu_perfmodel *mo
 void starpu_perfmodel_debugfilepath(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, char *path, size_t maxlen, unsigned nimpl);
 void starpu_perfmodel_get_arch_name(enum starpu_perf_archtype arch, char *archname, size_t maxlen, unsigned nimpl);
 
+double starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, uint32_t footprint);
 int starpu_perfmodel_list(FILE *output);
 void starpu_perfmodel_print(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, unsigned nimpl, char *parameter, uint32_t *footprint, FILE *output);
 int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char *parameter, uint32_t *footprint, FILE *output);

+ 128 - 0
include/starpu_sched_ctx.h

@@ -0,0 +1,128 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010 - 2012  INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_SCHED_CTX_H__
+#define __STARPU_SCHED_CTX_H__
+
+#include <starpu.h>
+
+/* generic structure used by the scheduling contexts to iterate over the workers */
+struct worker_collection {
+	/* hidden data structure used to memorize the workers */
+	void *workerids;
+	/* the number of workers in the collection */
+	unsigned nworkers;
+	/* the current cursor of the collection*/
+	pthread_key_t cursor_key;
+	/* the type of structure (WORKER_LIST,...) */
+	int type;
+	/* checks if there is another element in collection */
+	unsigned (*has_next)(struct worker_collection *workers);
+	/* return the next element in the collection */
+	int (*get_next)(struct worker_collection *workers);
+	/* add a new element in the collection */
+	int (*add)(struct worker_collection *workers, int worker);
+	/* remove an element from the collection */
+	int (*remove)(struct worker_collection *workers, int worker);
+	/* initialize the structure */
+	void* (*init)(struct worker_collection *workers);
+	/* free the structure */
+	void (*deinit)(struct worker_collection *workers);
+	/* initialize the cursor if there is one */
+	void (*init_cursor)(struct worker_collection *workers);
+	/* free the cursor if there is one */
+	void (*deinit_cursor)(struct worker_collection *workers);
+};
+
+/* types of structures the worker collection can implement */
+#define WORKER_LIST 0
+
+struct starpu_performance_counters {
+	void (*notify_idle_cycle)(unsigned sched_ctx, int worker, double idle_time);
+	void (*notify_idle_end)(unsigned sched_ctx, int worker);
+	void (*notify_pushed_task)(unsigned sched_ctx, int worker);
+	void (*notify_poped_task)(unsigned sched_ctx, int worker, double flops);
+	void (*notify_post_exec_hook)(unsigned sched_ctx, int taskid);
+	void (*notify_submitted_job)(struct starpu_task *task, uint32_t footprint);
+};
+
+#ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
+void starpu_set_perf_counters(unsigned sched_ctx_id, struct starpu_performance_counters *perf_counters);
+void starpu_call_poped_task_cb(int workerid, unsigned sched_ctx_id, double flops);
+void starpu_call_pushed_task_cb(int workerid, unsigned sched_ctx_id);
+#endif //STARPU_USE_SCHED_CTX_HYPERVISOR
+
+unsigned starpu_create_sched_ctx(const char *policy_name, int *workerids_ctx, int nworkers_ctx, const char *sched_name);
+
+unsigned starpu_create_sched_ctx_inside_interval(const char *policy_name, const char *sched_name, 
+						 int min_ncpus, int max_ncpus, int min_ngpus, int max_ngpus,
+						 unsigned allow_overlap);
+
+void starpu_delete_sched_ctx(unsigned sched_ctx_id, unsigned inheritor_sched_ctx_id);
+
+void starpu_add_workers_to_sched_ctx(int *workerids_ctx, int nworkers_ctx, unsigned sched_ctx);
+
+void starpu_remove_workers_from_sched_ctx(int *workerids_ctx, int nworkers_ctx, unsigned sched_ctx);
+
+void starpu_set_sched_ctx_policy_data(unsigned sched_ctx, void* policy_data);
+
+void* starpu_get_sched_ctx_policy_data(unsigned sched_ctx);
+
+void starpu_worker_set_sched_condition(unsigned sched_ctx, int workerid, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond);
+
+void starpu_worker_get_sched_condition(unsigned sched_ctx, int workerid, pthread_mutex_t **sched_mutex, pthread_cond_t **sched_cond);
+
+void starpu_worker_init_sched_condition(unsigned sched_ctx, int workerid);
+
+void starpu_worker_deinit_sched_condition(unsigned sched_ctx, int workerid);
+
+struct worker_collection* starpu_create_worker_collection_for_sched_ctx(unsigned sched_ctx_id, int type);
+	
+void starpu_delete_worker_collection_for_sched_ctx(unsigned sched_ctx_id); 
+
+struct worker_collection* starpu_get_worker_collection_of_sched_ctx(unsigned sched_ctx_id);
+
+pthread_mutex_t* starpu_get_changing_ctx_mutex(unsigned sched_ctx_id);
+
+void starpu_set_sched_ctx(unsigned *sched_ctx);
+
+unsigned starpu_get_sched_ctx(void);
+
+void starpu_notify_hypervisor_exists(void);
+
+unsigned starpu_check_if_hypervisor_exists(void);
+
+unsigned starpu_get_nworkers_of_sched_ctx(unsigned sched_ctx);
+
+unsigned starpu_get_nshared_workers(unsigned sched_ctx_id, unsigned sched_ctx_id2);
+
+unsigned starpu_worker_belongs_to_sched_ctx(int workerid, unsigned sched_ctx_id);
+
+unsigned starpu_are_overlapping_ctxs_on_worker(int workerid);
+
+unsigned starpu_is_ctxs_turn(int workerid, unsigned sched_ctx_id);
+
+void starpu_set_turn_to_other_ctx(int workerid, unsigned sched_ctx_id);
+
+double starpu_get_max_time_worker_on_ctx(void);
+
+void starpu_stop_task_submission(void);
+
+void starpu_sched_ctx_set_inheritor(unsigned sched_ctx, unsigned inheritor);
+
+void starpu_sched_ctx_finished_submit(unsigned sched_ctx_id);
+
+#endif /* __STARPU_SCHED_CTX_H__ */

+ 25 - 22
include/starpu_scheduler.h

@@ -41,6 +41,7 @@ struct starpu_machine_topology
 
 	unsigned ncombinedworkers;
 
+	unsigned nsched_ctxs;
 #ifdef STARPU_HAVE_HWLOC
 	hwloc_topology_t hwtopology;
 #else
@@ -49,8 +50,8 @@ struct starpu_machine_topology
 #endif
 
 	unsigned nhwcpus;
-        unsigned nhwcudagpus;
-        unsigned nhwopenclgpus;
+	unsigned nhwcudagpus;
+	unsigned nhwopenclgpus;
 
 	unsigned ncpus;
 	unsigned ncudagpus;
@@ -73,10 +74,10 @@ struct starpu_machine_topology
 struct starpu_sched_policy
 {
 	/* Initialize the scheduling policy. */
-	void (*init_sched)(struct starpu_machine_topology *, struct starpu_sched_policy *);
+	void (*init_sched)(unsigned sched_ctx_id);
 
 	/* Cleanup the scheduling policy. */
-	void (*deinit_sched)(struct starpu_machine_topology *, struct starpu_sched_policy *);
+	void (*deinit_sched)(unsigned sched_ctx_id);
 
 	/* Insert a task into the scheduler. */
 	int (*push_task)(struct starpu_task *);
@@ -90,13 +91,13 @@ struct starpu_sched_policy
 
 	/* Get a task from the scheduler. The mutex associated to the worker is
 	 * already taken when this method is called. */
-	struct starpu_task *(*pop_task)(void);
+	struct starpu_task *(*pop_task)(unsigned sched_ctx);
 
 	 /* Remove all available tasks from the scheduler (tasks are chained by
 	  * the means of the prev and next fields of the starpu_task
 	  * structure). The mutex associated to the worker is already taken
 	  * when this method is called. */
-	struct starpu_task *(*pop_every_task)(void);
+	struct starpu_task *(*pop_every_task)(unsigned sched_ctx);
 
 	/* This method is called every time a task is starting. (optional) */
 	void (*pre_exec_hook)(struct starpu_task *);
@@ -104,6 +105,12 @@ struct starpu_sched_policy
 	/* This method is called every time a task has been executed. (optional) */
 	void (*post_exec_hook)(struct starpu_task *);
 
+	/* Initialize scheduling structures corresponding to each worker. */
+	void (*add_workers)(unsigned sched_ctx_id, int *workerids, unsigned nworkers);
+
+	/* Deinitialize scheduling structures corresponding to each worker. */
+	void (*remove_workers)(unsigned sched_ctx_id, int *workerids, unsigned nworkers);
+
 	/* Name of the policy (optional) */
 	const char *policy_name;
 
@@ -111,16 +118,6 @@ struct starpu_sched_policy
 	const char *policy_description;
 };
 
-/* When there is no available task for a worker, StarPU blocks this worker on a
-condition variable. This function specifies which condition variable (and the
-associated mutex) should be used to block (and to wake up) a worker. Note that
-multiple workers may use the same condition variable. For instance, in the case
-of a scheduling strategy with a single task queue, the same condition variable
-would be used to block and wake up all workers.  The initialization method of a
-scheduling strategy (init_sched) must call this function once per worker. */
-#if !defined(_MSC_VER) && !defined(STARPU_SIMGRID)
-void starpu_worker_set_sched_condition(int workerid, pthread_cond_t *sched_cond, pthread_mutex_t *sched_mutex);
-#endif
 
 /* Check if the worker specified by workerid can execute the codelet. */
 int starpu_worker_can_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl);
@@ -174,23 +171,23 @@ int starpu_prefetch_task_input_on_node(struct starpu_task *task, uint32_t node);
  *	Performance predictions
  */
 
-/* Return the current date in µs */
+/* Return the current date in us */
 double starpu_timing_now(void);
-/* Returns expected task duration in µs */
+/* Returns expected task duration in us */
 double starpu_task_expected_length(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl);
 /* Returns an estimated speedup factor relative to CPU speed */
 double starpu_worker_get_relative_speedup(enum starpu_perf_archtype perf_archtype);
-/* Returns expected data transfer time in µs */
+/* Returns expected data transfer time in us */
 double starpu_task_expected_data_transfer_time(uint32_t memory_node, struct starpu_task *task);
-/* Predict the transfer time (in µs) to move a handle to a memory node */
+/* Predict the transfer time (in us) to move a handle to a memory node */
 double starpu_data_expected_transfer_time(starpu_data_handle_t handle, unsigned memory_node, enum starpu_access_mode mode);
 /* Returns expected power consumption in J */
 double starpu_task_expected_power(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl);
 /* Returns expected conversion time in ms (multiformat interface only) */
 double starpu_task_expected_conversion_time(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl);
-/* Return the expected duration of the entire task bundle in µs. */
+/* Return the expected duration of the entire task bundle in us. */
 double starpu_task_bundle_expected_length(starpu_task_bundle_t bundle, enum starpu_perf_archtype arch, unsigned nimpl);
-/* Return the time (in µs) expected to transfer all data used within the bundle */
+/* Return the time (in us) expected to transfer all data used within the bundle */
 double starpu_task_bundle_expected_data_transfer_time(starpu_task_bundle_t bundle, unsigned memory_node);
 /* Return the expected power consumption of the entire task bundle in J. */
 double starpu_task_bundle_expected_power(starpu_task_bundle_t bundle, enum starpu_perf_archtype arch, unsigned nimpl);
@@ -199,4 +196,10 @@ double starpu_task_bundle_expected_power(starpu_task_bundle_t bundle, enum starp
 }
 #endif
 
+/* /\* Waits until all the tasks of a worker, already submitted, have been executed *\/ */
+/* int starpu_wait_for_all_tasks_of_worker(int workerid); */
+
+/* /\* Waits until all the tasks of a bunch of workers have been executed *\/ */
+/* int starpu_wait_for_all_tasks_of_workers(int *workerids_ctx, int nworkers_ctx); */
+
 #endif /* __STARPU_SCHEDULER_H__ */

+ 26 - 1
include/starpu_task.h

@@ -3,6 +3,7 @@
  * Copyright (C) 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
+ * Copyright (C) 2011  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -200,6 +201,21 @@ struct starpu_task
 	 * have not been properly initialised.
 	 */
 	int magic;
+
+	/* Scheduling context */
+	unsigned sched_ctx;
+
+	/* flag distinguishing tasks created internally for StarPU management
+	   purposes from those submitted by the application */
+	unsigned control_task;
+
+	int hypervisor_tag;
+	
+	double flops;
+
+	unsigned already_pushed;
+
+	unsigned scheduled;
 };
 
 /* It is possible to initialize statically allocated tasks with this value.
@@ -225,7 +241,13 @@ struct starpu_task
 	.predicted = -1.0,				\
 	.predicted_transfer = -1.0,			\
 	.starpu_private = NULL,				\
-	.magic = 42                  			\
+	.magic = 42,                  			\
+	.sched_ctx = 0,					\
+	.control_task = 0,				\
+	.hypervisor_tag = 0,				\
+	.flops = 0.0,					\
+	.already_pushed = 0,				\
+		.scheduled = 0				\
 }
 
 /*
@@ -293,6 +315,7 @@ struct starpu_task *starpu_task_create(void);
  * allocated task results in an undefined behaviour. */
 void starpu_task_destroy(struct starpu_task *task);
 int starpu_task_submit(struct starpu_task *task) STARPU_WARN_UNUSED_RESULT;
+int starpu_task_submit_to_ctx(struct starpu_task *task, unsigned sched_ctx_id);
 
 /* This function blocks until the task was executed. It is not possible to
  * synchronize with a task more than once. It is not possible to wait
@@ -305,6 +328,8 @@ int starpu_task_wait(struct starpu_task *task) STARPU_WARN_UNUSED_RESULT;
  * been executed. */
 int starpu_task_wait_for_all(void);
 
+int starpu_task_wait_for_all_in_ctx(unsigned sched_ctx);
+
 /* This function waits until there is no more ready task. */
 int starpu_task_wait_for_no_ready(void);
 

+ 2 - 0
include/starpu_task_util.h

@@ -44,6 +44,8 @@ void starpu_create_sync_task(starpu_tag_t sync_tag, unsigned ndeps, starpu_tag_t
 #define STARPU_EXECUTE_ON_DATA	(1<<10)	/* Used by MPI to define which task is going to execute the codelet */
 #define STARPU_DATA_ARRAY       (1<<11) /* Array of data handles */
 #define STARPU_TAG       (1<<12) /* Tag */
+#define STARPU_HYPERVISOR_TAG	(1<<13)	/* Used to tag a task after whose execution a piece of code will be executed */
+#define STARPU_HYPERVISOR_FLOPS	(1<<14)	/* Used to specify the number of flops needed to be executed by a task */
 
 /* Wrapper to create a task. */
 int starpu_insert_task(struct starpu_codelet *cl, ...);

+ 2 - 2
libstarpu.pc.in

@@ -23,6 +23,6 @@ Name: starpu
 Description: offers support for heterogeneous multicore architecture
 Version: @PACKAGE_VERSION@
 Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_CPPFLAGS@ -DSTARPU_USE_DEPRECATED_API
-Libs: -L${libdir} -lstarpu-@STARPU_EFFECTIVE_VERSION@ @STARPU_OPENCL_LDFLAGS@ @STARPU_CUDA_LDFLAGS@
+Libs: -L${libdir} -lstarpu-@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_LDFLAGS@ @STARPU_OPENCL_LDFLAGS@ @STARPU_SCHED_CTX_HYPERVISOR@
 Libs.private: @LDFLAGS@ @LIBS@
-Requires: @HWLOC_REQUIRES@
+Requires: @HWLOC_REQUIRES@

+ 21 - 0
sched_ctx_hypervisor/Makefile.am

@@ -0,0 +1,21 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2011,2012 Institut National de Recherche en Informatique et Automatique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+# Build the hypervisor library first, then its example programs.
+SUBDIRS = src examples
+
+# Install the public header under $(includedir)/starpu/<effective-version>,
+# alongside the main StarPU headers.
+versincludedir = $(includedir)/starpu/$(STARPU_EFFECTIVE_VERSION)
+
+versinclude_HEADERS = include/sched_ctx_hypervisor.h
+

+ 57 - 0
sched_ctx_hypervisor/examples/Makefile.am

@@ -0,0 +1,57 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2011,2012 Institut National de Recherche en Informatique et Automatique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+# The cholesky example needs BLAS kernels, so only build the examples
+# when StarPU was configured with a BLAS library.
+if !NO_BLAS_LIB
+noinst_PROGRAMS =				\
+	cholesky/cholesky_implicit  \
+	app_driven_test/app_driven_test
+
+noinst_HEADERS = 				\
+	cholesky/cholesky.h			\
+	sched_ctx_utils/sched_ctx_utils.h
+endif
+
+# Link every program against the in-tree libstarpu and the hypervisor library.
+AM_LDFLAGS = $(top_builddir)/src/libstarpu-1.0.la 
+
+LIBS = $(top_builddir)/sched_ctx_hypervisor/src/libsched_ctx_hypervisor.la
+
+AM_CPPFLAGS =						\
+  -I$(top_srcdir)/include				\
+  -I$(top_srcdir)/examples				\
+  -I$(top_builddir)/include				\
+  -I$(top_srcdir)/sched_ctx_hypervisor/include		\
+  $(STARPU_OPENCL_CPPFLAGS) $(STARPU_CUDA_CPPFLAGS)
+
+
+if !NO_BLAS_LIB
+
+cholesky_cholesky_implicit_SOURCES =		\
+	cholesky/cholesky_implicit.c		\
+	cholesky/cholesky_models.c		\
+	cholesky/cholesky_kernels.c		\
+	sched_ctx_utils/sched_ctx_utils.c	\
+	$(top_srcdir)/examples/common/blas.c
+
+cholesky_cholesky_implicit_LDADD =		\
+	$(top_builddir)/sched_ctx_hypervisor/src/libsched_ctx_hypervisor.la \
+	$(STARPU_BLAS_LDFLAGS)
+
+app_driven_test_app_driven_test_SOURCES =		\
+	app_driven_test/app_driven_test.c		
+
+app_driven_test_app_driven_test_LDADD =		\
+	$(top_builddir)/sched_ctx_hypervisor/src/libsched_ctx_hypervisor.la 
+
+endif

+ 119 - 0
sched_ctx_hypervisor/examples/app_driven_test/app_driven_test.c

@@ -0,0 +1,119 @@
+#include <stdio.h>
+#include <stdint.h>
+#include <starpu.h>
+#include <sched_ctx_hypervisor.h>
+
+#include <pthread.h>
+
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
+
+/* Every implementation of a codelet must have this prototype, the first
+ * argument (buffers) describes the buffers/streams that are managed by the
+ * DSM; the second argument references read-only data that is passed as an
+ * argument of the codelet (task->cl_arg). Here, "buffers" is unused as there
+ * are no data input/output managed by the DSM (cl.nbuffers = 0) */
+/* Per-task argument block: identifies the scheduling context the task was
+ * submitted to and the hypervisor tag attached to the task. */
+struct params
+{
+	unsigned sched_ctx;	/* id of the scheduling context */
+    int task_tag;	/* hypervisor tag (task->hypervisor_tag) */
+};
+
+/* CPU implementation of the codelet: spins on a tiny busy-wait loop as an
+ * artificial workload, then prints the context id and hypervisor tag
+ * received through cl_arg.  `buffers' is unused (cl.nbuffers = 0). */
+void cpu_func(void *buffers[], void *cl_arg)
+{
+    struct params *params = (struct params *) cl_arg;
+
+	int i;
+	/* tiny artificial workload so the task is not instantaneous */
+	for(i = 0; i < 1000; i++); 
+    FPRINTF(stdout, "Hello world sched_ctx = %d task_tag = %d \n", params->sched_ctx, params->task_tag);
+}
+
+struct starpu_codelet cl = {};
+
+int tag = 1;
+/* Worker thread: binds itself to the scheduling context received in `arg'
+ * and submits 10 tasks to it.  In context 1 only, the 6th task additionally
+ * requests an application-driven resize (min 2 / max 12 workers) through
+ * the hypervisor, to be applied when the tagged task executes. */
+void* start_thread(void *arg)
+{
+	unsigned sched_ctx = *((unsigned*)arg);
+	starpu_set_sched_ctx(&sched_ctx);
+
+	struct starpu_task *task[10];
+	struct params params[10];
+	int i;
+
+	/* the codelet is shared by every task; configure it once */
+	cl.where = STARPU_CPU;
+	cl.cpu_funcs[0] = cpu_func;
+	cl.nbuffers = 0;
+
+	for(i = 0; i < 10; i++)
+	{
+		int j;
+		for(j = 0; j < 1000; j++);	/* small delay between submissions */
+		task[i] = starpu_task_create();
+
+		task[i]->cl = &cl;
+
+		if(sched_ctx == 1 && i == 5)
+		{
+			task[i]->hypervisor_tag = tag;
+			sched_ctx_hypervisor_ioctl(sched_ctx,
+									   HYPERVISOR_TIME_TO_APPLY, tag,
+									   HYPERVISOR_MIN_WORKERS, 2,
+									   HYPERVISOR_MAX_WORKERS, 12,
+									   HYPERVISOR_NULL);
+			printf("require resize for sched_ctx %d at tag %d\n", sched_ctx, tag);
+			sched_ctx_hypervisor_resize(sched_ctx, tag);
+		}
+
+		params[i].sched_ctx = sched_ctx;
+		params[i].task_tag = task[i]->hypervisor_tag;
+
+		task[i]->cl_arg = &params[i];
+		/* bug fix: pass the size of one struct, not of the whole 10-element
+		 * array (sizeof(params) was 10x too large) */
+		task[i]->cl_arg_size = sizeof(params[i]);
+
+		starpu_task_submit(task[i]);
+	}
+
+	starpu_task_wait_for_all();
+
+	/* bug fix: the function is declared void* but fell off the end without
+	 * a return statement; return NULL for pthread_join */
+	return NULL;
+}
+
+/* Create two disjoint 6-worker scheduling contexts, hook them to the
+ * hypervisor with the "app_driven" resizing policy, and drive each context
+ * from its own submission thread.  Returns 77 (test-skip code) when no
+ * device is available. */
+int main()
+{
+	int ret = starpu_init(NULL);
+
+	if (ret == -ENODEV)
+		return 77;
+
+	/* split the first 12 workers into two disjoint sets of 6 */
+	int nres1 = 6;
+	int nres2 = 6;
+	int ressources1[nres1];
+	int ressources2[nres2];
+	int i;
+	for(i = 0; i < nres1; i++)
+		ressources1[i] = i;
+
+	for(i = 0; i < nres2; i++)
+		ressources2[i] = nres1+i;
+
+	unsigned sched_ctx1 = starpu_create_sched_ctx("heft", ressources1, nres1, "sched_ctx1");
+	unsigned sched_ctx2 = starpu_create_sched_ctx("heft", ressources2, nres2, "sched_ctx2");
+
+	/* start the hypervisor with the application-driven resizing policy and
+	 * install its performance counters in both contexts */
+	struct hypervisor_policy policy;
+	policy.custom = 0;
+	policy.name = "app_driven";
+	void *perf_counters = sched_ctx_hypervisor_init(&policy);
+
+	starpu_set_perf_counters(sched_ctx1, (struct starpu_performance_counters*)perf_counters);
+	starpu_set_perf_counters(sched_ctx2, (struct starpu_performance_counters*)perf_counters);
+	sched_ctx_hypervisor_register_ctx(sched_ctx1, 0.0);
+	sched_ctx_hypervisor_register_ctx(sched_ctx2, 0.0);
+
+	/* one submitting thread per context */
+	pthread_t tid[2];
+
+	pthread_create(&tid[0], NULL, start_thread, (void*)&sched_ctx1);
+	pthread_create(&tid[1], NULL, start_thread, (void*)&sched_ctx2);
+
+	pthread_join(tid[0], NULL);
+	pthread_join(tid[1], NULL);
+
+	/* NOTE(review): StarPU is shut down before the hypervisor here, as in
+	 * the original code; verify whether the hypervisor should be stopped
+	 * first, since its perf counters are still installed in the contexts */
+	starpu_shutdown();
+	sched_ctx_hypervisor_shutdown();
+
+	/* bug fix: main fell off the end without returning a status */
+	return 0;
+}

+ 159 - 0
sched_ctx_hypervisor/examples/cholesky/cholesky.h

@@ -0,0 +1,159 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012  INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __DW_CHOLESKY_H__
+#define __DW_CHOLESKY_H__
+
+#include <limits.h>
+#include <string.h>
+#include <math.h>
+#include <sys/time.h>
+#ifdef STARPU_USE_CUDA
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cublas.h>
+#endif
+
+#include <common/blas.h>
+#include <starpu.h>
+#include <starpu_bound.h>
+
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+#define NMAXBLOCKS	32
+
+#define TAG11(k)	((starpu_tag_t)( (1ULL<<60) | (unsigned long long)(k)))
+#define TAG21(k,j)	((starpu_tag_t)(((3ULL<<60) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(j))))
+#define TAG22(k,i,j)	((starpu_tag_t)(((4ULL<<60) | ((unsigned long long)(k)<<32) 	\
+					| ((unsigned long long)(i)<<16)	\
+					| (unsigned long long)(j))))
+
+#define TAG11_AUX(k, prefix)	((starpu_tag_t)( (((unsigned long long)(prefix))<<60)  |  (1ULL<<56) | (unsigned long long)(k)))
+#define TAG21_AUX(k,j, prefix)	((starpu_tag_t)( (((unsigned long long)(prefix))<<60)  			\
+					|  ((3ULL<<56) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(j))))
+#define TAG22_AUX(k,i,j, prefix)    ((starpu_tag_t)(  (((unsigned long long)(prefix))<<60)	\
+					|  ((4ULL<<56) | ((unsigned long long)(k)<<32)  	\
+					| ((unsigned long long)(i)<<16) 			\
+					| (unsigned long long)(j))))
+
+#define BLOCKSIZE	(size/nblocks)
+
+#define BLAS3_FLOP(n1,n2,n3)    \
+        (2*((uint64_t)n1)*((uint64_t)n2)*((uint64_t)n3))
+
+static unsigned size = 4*1024;
+static unsigned nblocks = 16;
+static unsigned nbigblocks = 8;
+static unsigned pinned = 0;
+static unsigned noprio = 0;
+static unsigned check = 0;
+static unsigned bound = 0;
+static unsigned with_ctxs = 0;
+static unsigned with_noctxs = 0;
+static unsigned chole1 = 0;
+static unsigned chole2 = 0;
+
+void chol_cpu_codelet_update_u11(void **, void *);
+void chol_cpu_codelet_update_u21(void **, void *);
+void chol_cpu_codelet_update_u22(void **, void *);
+
+#ifdef STARPU_USE_CUDA
+void chol_cublas_codelet_update_u11(void *descr[], void *_args);
+void chol_cublas_codelet_update_u21(void *descr[], void *_args);
+void chol_cublas_codelet_update_u22(void *descr[], void *_args);
+#endif
+
+extern struct starpu_perfmodel chol_model_11;
+extern struct starpu_perfmodel chol_model_21;
+extern struct starpu_perfmodel chol_model_22;
+
+/* Parse the example's command-line options into the file-static flags
+ * above (size, nblocks, nbigblocks, pinned, noprio, bound, check, and the
+ * scheduling-context mode selectors). */
+static void __attribute__((unused)) parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++)
+	{
+		/* NOTE(review): each mode selector below `break's out of the whole
+		 * loop, so any option placed after it on the command line is
+		 * ignored by this function -- presumably parse_args_ctx() handles
+		 * the rest in those modes, but confirm that is intended. */
+		if (strcmp(argv[i], "-with_ctxs") == 0) 
+		{
+			with_ctxs = 1;
+			break;
+		}
+		if (strcmp(argv[i], "-with_noctxs") == 0) 
+		{
+			with_noctxs = 1;
+			break;
+		}
+		
+		if (strcmp(argv[i], "-chole1") == 0) 
+		{
+			chole1 = 1;
+			break;
+		}
+
+		if (strcmp(argv[i], "-chole2") == 0) 
+		{
+			chole2 = 1;
+			break;
+		}
+
+		if (strcmp(argv[i], "-size") == 0)
+		{
+		        char *argptr;
+			size = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-nblocks") == 0)
+		{
+		        char *argptr;
+			nblocks = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-nbigblocks") == 0)
+		{
+		        char *argptr;
+			nbigblocks = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-pin") == 0)
+		{
+			pinned = 1;
+		}
+
+		if (strcmp(argv[i], "-no-prio") == 0)
+		{
+			noprio = 1;
+		}
+
+		if (strcmp(argv[i], "-bound") == 0)
+		{
+			bound = 1;
+		}
+
+		if (strcmp(argv[i], "-check") == 0)
+		{
+			check = 1;
+		}
+
+		if (strcmp(argv[i], "-h") == 0)
+		{
+			printf("usage : %s [-pin] [-size size] [-nblocks nblocks] [-check]\n", argv[0]);
+		}
+	}
+}
+
+#endif /* __DW_CHOLESKY_H__ */

+ 422 - 0
sched_ctx_hypervisor/examples/cholesky/cholesky_grain_tag.c

@@ -0,0 +1,422 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "cholesky.h"
+
+/*
+ *	Some useful functions
+ */
+
+/* Allocate an argument-less task carrying the explicit tag `id', so that
+ * inter-task dependencies can later be declared on the tag. */
+static struct starpu_task *create_task(starpu_tag_t id)
+{
+	struct starpu_task *task = starpu_task_create();
+		task->cl_arg = NULL;
+		task->use_tag = 1;	/* enable explicit tag-based dependencies */
+		task->tag_id = id;
+
+	return task;
+}
+
+/*
+ *	Create the codelets
+ */
+
+static struct starpu_codelet cl11 =
+{
+	.modes = { STARPU_RW },
+	.where = STARPU_CPU|STARPU_CUDA,
+	.cpu_funcs = {chol_cpu_codelet_update_u11, NULL},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {chol_cublas_codelet_update_u11, NULL},
+#endif
+	.nbuffers = 1,
+	.model = &chol_model_11
+};
+
+static struct starpu_task * create_task_11(starpu_data_handle_t dataA, unsigned k, unsigned reclevel)
+{
+/*	FPRINTF(stdout, "task 11 k = %d TAG = %llx\n", k, (TAG11(k))); */
+
+	struct starpu_task *task = create_task(TAG11_AUX(k, reclevel));
+
+	task->cl = &cl11;
+
+	/* which sub-data is manipulated ? */
+	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, k);
+
+	/* this is an important task */
+	task->priority = STARPU_MAX_PRIO;
+
+	/* enforce dependencies ... */
+	if (k > 0)
+	{
+		starpu_tag_declare_deps(TAG11_AUX(k, reclevel), 1, TAG22_AUX(k-1, k, k, reclevel));
+	}
+
+	return task;
+}
+
+static struct starpu_codelet cl21 =
+{
+	.modes = { STARPU_R, STARPU_RW },
+	.where = STARPU_CPU|STARPU_CUDA,
+	.cpu_funcs = {chol_cpu_codelet_update_u21, NULL},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {chol_cublas_codelet_update_u21, NULL},
+#endif
+	.nbuffers = 2,
+	.model = &chol_model_21
+};
+
+static void create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned j, unsigned reclevel)
+{
+	int ret;
+
+	struct starpu_task *task = create_task(TAG21_AUX(k, j, reclevel));
+
+	task->cl = &cl21;
+
+	/* which sub-data is manipulated ? */
+	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, k);
+	task->handles[1] = starpu_data_get_sub_data(dataA, 2, k, j);
+
+	if (j == k+1)
+	{
+		task->priority = STARPU_MAX_PRIO;
+	}
+
+	/* enforce dependencies ... */
+	if (k > 0)
+	{
+		starpu_tag_declare_deps(TAG21_AUX(k, j, reclevel), 2, TAG11_AUX(k, reclevel), TAG22_AUX(k-1, k, j, reclevel));
+	}
+	else
+	{
+		starpu_tag_declare_deps(TAG21_AUX(k, j, reclevel), 1, TAG11_AUX(k, reclevel));
+	}
+
+	ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+}
+
+static struct starpu_codelet cl22 =
+{
+	.modes = { STARPU_R, STARPU_R, STARPU_RW },
+	.where = STARPU_CPU|STARPU_CUDA,
+	.cpu_funcs = {chol_cpu_codelet_update_u22, NULL},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {chol_cublas_codelet_update_u22, NULL},
+#endif
+	.nbuffers = 3,
+	.model = &chol_model_22
+};
+
+static void create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned i, unsigned j, unsigned reclevel)
+{
+	int ret;
+
+/*	FPRINTF(stdout, "task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22_AUX(k,i,j)); */
+
+	struct starpu_task *task = create_task(TAG22_AUX(k, i, j, reclevel));
+
+	task->cl = &cl22;
+
+	/* which sub-data is manipulated ? */
+	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, i);
+	task->handles[1] = starpu_data_get_sub_data(dataA, 2, k, j);
+	task->handles[2] = starpu_data_get_sub_data(dataA, 2, i, j);
+
+	if ( (i == k + 1) && (j == k +1) )
+	{
+		task->priority = STARPU_MAX_PRIO;
+	}
+
+	/* enforce dependencies ... */
+	if (k > 0)
+	{
+		starpu_tag_declare_deps(TAG22_AUX(k, i, j, reclevel), 3, TAG22_AUX(k-1, i, j, reclevel), TAG21_AUX(k, i, reclevel), TAG21_AUX(k, j, reclevel));
+	}
+	else
+	{
+		starpu_tag_declare_deps(TAG22_AUX(k, i, j, reclevel), 2, TAG21_AUX(k, i, reclevel), TAG21_AUX(k, j, reclevel));
+	}
+
+	ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+}
+
+
+
+/*
+ *	code to bootstrap the factorization
+ *	and construct the DAG
+ */
+
+static void cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned nbigblocks, unsigned reclevel)
+{
+	int ret;
+
+	/* create a new codelet */
+	struct starpu_task *entry_task = NULL;
+
+	/* create all the DAG nodes */
+	unsigned i,j,k;
+
+	starpu_data_handle_t dataA;
+
+	/* monitor and partition the A matrix into blocks :
+	 * one block is now determined by 2 unsigned (i,j) */
+	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(float));
+
+	starpu_data_set_sequential_consistency_flag(dataA, 0);
+
+	struct starpu_data_filter f =
+	{
+		.filter_func = starpu_vertical_block_filter_func,
+		.nchildren = nblocks
+	};
+
+	struct starpu_data_filter f2 =
+	{
+		.filter_func = starpu_block_filter_func,
+		.nchildren = nblocks
+	};
+
+	starpu_data_map_filters(dataA, 2, &f, &f2);
+
+	for (k = 0; k < nbigblocks; k++)
+	{
+		struct starpu_task *task = create_task_11(dataA, k, reclevel);
+		/* we defer the launch of the first task */
+		if (k == 0)
+		{
+			entry_task = task;
+		}
+		else
+		{
+			ret = starpu_task_submit(task);
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+		}
+
+		for (j = k+1; j<nblocks; j++)
+		{
+			create_task_21(dataA, k, j, reclevel);
+
+			for (i = k+1; i<nblocks; i++)
+			{
+				if (i <= j)
+					create_task_22(dataA, k, i, j, reclevel);
+			}
+		}
+	}
+
+	/* schedule the codelet */
+	ret = starpu_task_submit(entry_task);
+	if (STARPU_UNLIKELY(ret == -ENODEV))
+	{
+		FPRINTF(stderr, "No worker may execute this task\n");
+		exit(-1);
+	}
+
+	if (nblocks == nbigblocks)
+	{
+		/* stall the application until the end of computations */
+		starpu_tag_wait(TAG11_AUX(nblocks-1, reclevel));
+		starpu_data_unpartition(dataA, 0);
+		starpu_data_unregister(dataA);
+		return;
+	}
+	else
+	{
+		STARPU_ASSERT(reclevel == 0);
+		unsigned ndeps_tags = (nblocks - nbigblocks)*(nblocks - nbigblocks);
+
+		starpu_tag_t *tag_array = malloc(ndeps_tags*sizeof(starpu_tag_t));
+		STARPU_ASSERT(tag_array);
+
+		unsigned ind = 0;
+		for (i = nbigblocks; i < nblocks; i++)
+		for (j = nbigblocks; j < nblocks; j++)
+		{
+			if (i <= j)
+				tag_array[ind++] = TAG22_AUX(nbigblocks - 1, i, j, reclevel);
+		}
+
+		starpu_tag_wait_array(ind, tag_array);
+
+		free(tag_array);
+
+		starpu_data_unpartition(dataA, 0);
+		starpu_data_unregister(dataA);
+
+		float *newmatA = &matA[nbigblocks*(size/nblocks)*(ld+1)];
+
+		cholesky_grain_rec(newmatA, size/nblocks*(nblocks - nbigblocks), ld, (nblocks - nbigblocks)*2, (nblocks - nbigblocks)*2, reclevel+1);
+	}
+}
+
+static void initialize_system(float **A, unsigned dim, unsigned pinned)
+{
+	int ret;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		exit(77);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	starpu_helper_cublas_init();
+
+	if (pinned)
+	{
+		starpu_malloc((void **)A, dim*dim*sizeof(float));
+	}
+	else
+	{
+		*A = malloc(dim*dim*sizeof(float));
+	}
+}
+
+void cholesky_grain(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned nbigblocks, unsigned pinned)
+{
+	struct timeval start;
+	struct timeval end;
+
+	gettimeofday(&start, NULL);
+
+	cholesky_grain_rec(matA, size, ld, nblocks, nbigblocks, 0);
+
+	gettimeofday(&end, NULL);
+
+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	FPRINTF(stderr, "Computation took (in ms)\n");
+	FPRINTF(stdout, "%2.2f\n", timing/1000);
+
+	double flop = (1.0f*size*size*size)/3.0f;
+	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+
+}
+
+static void shutdown_system(float **matA, unsigned pinned)
+{
+	if (pinned)
+	{
+	     starpu_free(*matA);
+	}
+	else
+	{
+	     free(*matA);
+	}
+
+	starpu_helper_cublas_shutdown();
+	starpu_shutdown();
+}
+
+int main(int argc, char **argv)
+{
+	/* create a simple definite positive symetric matrix example
+	 *
+	 *	Hilbert matrix : h(i,j) = 1/(i+j+1)
+	 * */
+
+	parse_args(argc, argv);
+
+	float *mat;
+	initialize_system(&mat, size, pinned);
+
+	unsigned i,j;
+	for (i = 0; i < size; i++)
+	{
+		for (j = 0; j < size; j++)
+		{
+			mat[j +i*size] = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
+			/* mat[j +i*size] = ((i == j)?1.0f*size:0.0f); */
+		}
+	}
+
+
+#ifdef CHECK_OUTPUT
+	FPRINTF(stdout, "Input :\n");
+
+	for (j = 0; j < size; j++)
+	{
+		for (i = 0; i < size; i++)
+		{
+			if (i <= j)
+			{
+				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
+			}
+			else
+			{
+				FPRINTF(stdout, ".\t");
+			}
+		}
+		FPRINTF(stdout, "\n");
+	}
+#endif
+
+	cholesky_grain(mat, size, size, nblocks, nbigblocks, pinned);
+
+#ifdef CHECK_OUTPUT
+	FPRINTF(stdout, "Results :\n");
+
+	for (j = 0; j < size; j++)
+	{
+		for (i = 0; i < size; i++)
+		{
+			if (i <= j)
+			{
+				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
+			}
+			else
+			{
+				FPRINTF(stdout, ".\t");
+				mat[j+i*size] = 0.0f; /* debug */
+			}
+		}
+		FPRINTF(stdout, "\n");
+	}
+
+	FPRINTF(stderr, "compute explicit LLt ...\n");
+	float *test_mat = malloc(size*size*sizeof(float));
+	STARPU_ASSERT(test_mat);
+
+	SSYRK("L", "N", size, size, 1.0f,
+				mat, size, 0.0f, test_mat, size);
+
+	FPRINTF(stderr, "comparing results ...\n");
+	for (j = 0; j < size; j++)
+	{
+		for (i = 0; i < size; i++)
+		{
+			if (i <= j)
+			{
+                                FPRINTF(stdout, "%2.2f\t", test_mat[j +i*size]);
+			}
+			else
+			{
+				FPRINTF(stdout, ".\t");
+			}
+		}
+		FPRINTF(stdout, "\n");
+	}
+	free(test_mat);
+#endif
+
+	shutdown_system(&mat, pinned);
+	return 0;
+}

+ 371 - 0
sched_ctx_hypervisor/examples/cholesky/cholesky_implicit.c

@@ -0,0 +1,371 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012  INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "cholesky.h"
+#include "../sched_ctx_utils/sched_ctx_utils.h"
+/*
+ *	Create the codelets
+ */
+
+static struct starpu_codelet cl11 =
+{
+	.where = STARPU_CPU|STARPU_CUDA,
+	.type = STARPU_SEQ,
+	.cpu_funcs = {chol_cpu_codelet_update_u11, NULL},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {chol_cublas_codelet_update_u11, NULL},
+#endif
+	.nbuffers = 1,
+	.modes = {STARPU_RW},
+	.model = &chol_model_11
+};
+
+static struct starpu_codelet cl21 =
+{
+	.where = STARPU_CPU|STARPU_CUDA,
+	.type = STARPU_SEQ,
+	.cpu_funcs = {chol_cpu_codelet_update_u21, NULL},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {chol_cublas_codelet_update_u21, NULL},
+#endif
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_RW},
+	.model = &chol_model_21
+};
+
+static struct starpu_codelet cl22 =
+{
+	.where = STARPU_CPU|STARPU_CUDA,
+	.type = STARPU_SEQ,
+	.max_parallelism = INT_MAX,
+	.cpu_funcs = {chol_cpu_codelet_update_u22, NULL},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {chol_cublas_codelet_update_u22, NULL},
+#endif
+	.nbuffers = 3,
+	.modes = {STARPU_R, STARPU_R, STARPU_RW},
+	.model = &chol_model_22
+};
+
+/*
+ *	code to bootstrap the factorization
+ *	and construct the DAG
+ */
+
+/* Task callback: from this point on, run the 22 (GEMM) codelet in SPMD
+ * (parallel) mode instead of sequentially. */
+static void callback_turn_spmd_on(void *arg __attribute__ ((unused)))
+{
+	cl22.type = STARPU_SPMD;
+}
+
+int hypervisor_tag = 1;
+static void _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
+{
+	int ret;
+	struct timeval start;
+	struct timeval end;
+
+	unsigned i,j,k;
+
+	int prio_level = noprio?STARPU_DEFAULT_PRIO:STARPU_MAX_PRIO;
+
+	gettimeofday(&start, NULL);
+
+	if (bound)
+		starpu_bound_start(0, 0);
+	/* create all the DAG nodes */
+	for (k = 0; k < nblocks; k++)
+	{
+                starpu_data_handle_t sdatakk = starpu_data_get_sub_data(dataA, 2, k, k);
+		if(k == 0 && with_ctxs)
+		{
+			 ret = starpu_insert_task(&cl11,
+					   STARPU_PRIORITY, prio_level,
+					   STARPU_RW, sdatakk,
+					   STARPU_CALLBACK, (k == 3*nblocks/4)?callback_turn_spmd_on:NULL,
+					   STARPU_HYPERVISOR_TAG, hypervisor_tag,
+					   0);
+			set_hypervisor_conf(START_BENCH, hypervisor_tag++);
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+		}
+		else
+			starpu_insert_task(&cl11,
+					   STARPU_PRIORITY, prio_level,
+					   STARPU_RW, sdatakk,
+					   STARPU_CALLBACK, (k == 3*nblocks/4)?callback_turn_spmd_on:NULL,
+					   0);
+
+		for (j = k+1; j<nblocks; j++)
+		{
+                        starpu_data_handle_t sdatakj = starpu_data_get_sub_data(dataA, 2, k, j);
+
+                        ret = starpu_insert_task(&cl21,
+						 STARPU_PRIORITY, (j == k+1)?prio_level:STARPU_DEFAULT_PRIO,
+						 STARPU_R, sdatakk,
+						 STARPU_RW, sdatakj,
+						 0);
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+
+			for (i = k+1; i<nblocks; i++)
+			{
+				if (i <= j)
+                                {
+					starpu_data_handle_t sdataki = starpu_data_get_sub_data(dataA, 2, k, i);
+					starpu_data_handle_t sdataij = starpu_data_get_sub_data(dataA, 2, i, j);
+
+					if(k == (nblocks-2) && j == (nblocks-1) &&
+					   i == (k + 1) && with_ctxs)
+					{
+						ret = starpu_insert_task(&cl22,
+								   STARPU_PRIORITY, ((i == k+1) && (j == k+1))?prio_level:STARPU_DEFAULT_PRIO,
+								   STARPU_R, sdataki,
+								   STARPU_R, sdatakj,
+								   STARPU_RW, sdataij,
+								   STARPU_HYPERVISOR_TAG, hypervisor_tag,
+								   0);
+						STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+						set_hypervisor_conf(END_BENCH, hypervisor_tag++);
+					}
+					
+					else
+						ret = starpu_insert_task(&cl22,
+								   STARPU_PRIORITY, ((i == k+1) && (j == k+1))?prio_level:STARPU_DEFAULT_PRIO,
+								   STARPU_R, sdataki,
+								   STARPU_R, sdatakj,
+								   STARPU_RW, sdataij,
+								   0);
+						STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+					
+                   }
+			}
+		}
+	}
+
+	starpu_task_wait_for_all();
+	if (bound)
+		starpu_bound_stop();
+
+	starpu_data_unpartition(dataA, 0);
+
+	gettimeofday(&end, NULL);
+
+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+
+	unsigned long n = starpu_matrix_get_nx(dataA);
+
+	double flop = (1.0f*n*n*n)/3.0f;
+
+	if(with_ctxs || with_noctxs || chole1 || chole2)
+		update_sched_ctx_timing_results((flop/timing/1000.0f), (timing/1000000.0f));
+	else
+	{
+		FPRINTF(stderr, "Computation took (in ms)\n");
+		FPRINTF(stdout, "%2.2f\n", timing/1000);
+	
+		FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+		if (bound)
+		{
+			double res;
+			starpu_bound_compute(&res, NULL, 0);
+			FPRINTF(stderr, "Theoretical GFlops: %2.2f\n", (flop/res/1000000.0f));
+		}
+	}
+}
+
+static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
+{
+	starpu_data_handle_t dataA;
+
+	/* monitor and partition the A matrix into blocks :
+	 * one block is now determined by 2 unsigned (i,j) */
+	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(float));
+
+	struct starpu_data_filter f =
+	{
+		.filter_func = starpu_vertical_block_filter_func,
+		.nchildren = nblocks
+	};
+
+	struct starpu_data_filter f2 =
+	{
+		.filter_func = starpu_block_filter_func,
+		.nchildren = nblocks
+	};
+
+	starpu_data_map_filters(dataA, 2, &f, &f2);
+
+	_cholesky(dataA, nblocks);
+
+	starpu_data_unregister(dataA);
+}
+
+/* Build a well-conditioned symmetric positive-definite test matrix
+ * (Hilbert matrix plus size*I), run the blocked Cholesky factorization on
+ * it, and optionally (-check) verify L*Lt against the original matrix. */
+static void execute_cholesky(unsigned size, unsigned nblocks)
+{
+	float *mat;
+	starpu_malloc((void **)&mat, (size_t)size*size*sizeof(float));
+
+	unsigned i,j;
+	for (i = 0; i < size; i++)
+	{
+		for (j = 0; j < size; j++)
+		{
+			mat[j +i*size] = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
+			/* mat[j +i*size] = ((i == j)?1.0f*size:0.0f); */
+		}
+	}
+
+/* #define PRINT_OUTPUT */
+#ifdef PRINT_OUTPUT
+	FPRINTF(stdout, "Input :\n");
+
+	for (j = 0; j < size; j++)
+	{
+		for (i = 0; i < size; i++)
+		{
+			if (i <= j)
+			{
+				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
+			}
+			else
+			{
+				FPRINTF(stdout, ".\t");
+			}
+		}
+		FPRINTF(stdout, "\n");
+	}
+#endif
+
+	cholesky(mat, size, size, nblocks);
+
+#ifdef PRINT_OUTPUT
+	FPRINTF(stdout, "Results :\n");
+	for (j = 0; j < size; j++)
+	{
+		for (i = 0; i < size; i++)
+		{
+			if (i <= j)
+			{
+				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
+			}
+			else
+			{
+				FPRINTF(stdout, ".\t");
+				mat[j+i*size] = 0.0f; /* debug */
+			}
+		}
+		FPRINTF(stdout, "\n");
+	}
+#endif
+
+	if (check)
+	{
+		FPRINTF(stderr, "compute explicit LLt ...\n");
+		/* zero the strict upper triangle so SSYRK sees a clean L factor */
+		for (j = 0; j < size; j++)
+		{
+			for (i = 0; i < size; i++)
+			{
+				if (i > j)
+				{
+					mat[j+i*size] = 0.0f; /* debug */
+				}
+			}
+		}
+		float *test_mat = malloc(size*size*sizeof(float));
+		STARPU_ASSERT(test_mat);
+
+		SSYRK("L", "N", size, size, 1.0f,
+					mat, size, 0.0f, test_mat, size);
+
+		FPRINTF(stderr, "comparing results ...\n");
+#ifdef PRINT_OUTPUT
+		for (j = 0; j < size; j++)
+		{
+			for (i = 0; i < size; i++)
+			{
+				if (i <= j)
+				{
+					FPRINTF(stdout, "%2.2f\t", test_mat[j +i*size]);
+				}
+				else
+				{
+					FPRINTF(stdout, ".\t");
+				}
+			}
+			FPRINTF(stdout, "\n");
+		}
+#endif
+
+		for (j = 0; j < size; j++)
+		{
+			for (i = 0; i < size; i++)
+			{
+				if (i <= j)
+				{
+	                                float orig = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
+	                                /* bug fix: abs() takes an int and truncated the
+	                                 * float difference to 0 or +/-N, defeating the
+	                                 * tolerance test below; use fabsf() instead */
+	                                float err = fabsf(test_mat[j +i*size] - orig);
+	                                if (err > 0.00001)
+					{
+	                                        FPRINTF(stderr, "Error[%u, %u] --> %2.2f != %2.2f (err %2.2f)\n", i, j, test_mat[j +i*size], orig, err);
+	                                        assert(0);
+	                                }
+	                        }
+			}
+	        }
+		free(test_mat);
+	}
+	starpu_free(mat);
+}
+
+int main(int argc, char **argv)
+{
+	/* create a simple definite positive symetric matrix example
+	 *
+	 *	Hilbert matrix : h(i,j) = 1/(i+j+1)
+	 * */
+
+	parse_args(argc, argv);
+
+	if(with_ctxs || with_noctxs || chole1 || chole2)
+		parse_args_ctx(argc, argv);
+
+	starpu_init(NULL);
+
+	starpu_helper_cublas_init();
+
+	if(with_ctxs)
+	{
+		construct_contexts(execute_cholesky);
+		start_2benchs(execute_cholesky);
+	}
+	else if(with_noctxs)
+		start_2benchs(execute_cholesky);
+	else if(chole1)
+		start_1stbench(execute_cholesky);
+	else if(chole2)
+		start_2ndbench(execute_cholesky);
+	else
+		execute_cholesky(size, nblocks);
+
+	starpu_helper_cublas_shutdown();
+	starpu_shutdown();
+
+	if(with_ctxs)
+		end_contexts();
+
+	return 0;
+}

+ 251 - 0
sched_ctx_hypervisor/examples/cholesky/cholesky_kernels.c

@@ -0,0 +1,251 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_config.h>
+#include "cholesky.h"
+//#include "../common/blas.h"
+#ifdef STARPU_USE_CUDA
+#include <starpu_cuda.h>
+#ifdef STARPU_HAVE_MAGMA
+#include "magma.h"
+#include "magma_lapack.h"
+#endif
+#endif
+
+/*
+ *   U22 
+ */
+
+static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, __attribute__((unused)) void *_args)
+{
+	/* printf("22\n"); */
+	float *left 	= (float *)STARPU_MATRIX_GET_PTR(descr[0]);
+	float *right 	= (float *)STARPU_MATRIX_GET_PTR(descr[1]);
+	float *center 	= (float *)STARPU_MATRIX_GET_PTR(descr[2]);
+
+	unsigned dx = STARPU_MATRIX_GET_NY(descr[2]);
+	unsigned dy = STARPU_MATRIX_GET_NX(descr[2]);
+	unsigned dz = STARPU_MATRIX_GET_NY(descr[0]);
+
+	unsigned ld21 = STARPU_MATRIX_GET_LD(descr[0]);
+	unsigned ld12 = STARPU_MATRIX_GET_LD(descr[1]);
+	unsigned ld22 = STARPU_MATRIX_GET_LD(descr[2]);
+
+	if (s == 0)
+	{
+		int worker_size = starpu_combined_worker_get_size();
+
+		if (worker_size == 1)
+		{
+			/* Sequential CPU kernel */
+			SGEMM("N", "T", dy, dx, dz, -1.0f, left, ld21, 
+				right, ld12, 1.0f, center, ld22);
+		}
+		else
+		{
+			/* Parallel CPU kernel */
+			int rank = starpu_combined_worker_get_rank();
+
+			int block_size = (dx + worker_size - 1)/worker_size;
+			int new_dx = STARPU_MIN(dx, block_size*(rank+1)) - block_size*rank;
+			
+			float *new_left = &left[block_size*rank];
+			float *new_center = &center[block_size*rank];
+
+			SGEMM("N", "T", dy, new_dx, dz, -1.0f, new_left, ld21, 
+				right, ld12, 1.0f, new_center, ld22);
+		}
+	}
+	else
+	{
+		/* CUDA kernel */
+#ifdef STARPU_USE_CUDA
+		cublasSgemm('n', 't', dy, dx, dz, 
+				-1.0f, left, ld21, right, ld12, 
+				 1.0f, center, ld22);
+		cudaStreamSynchronize(starpu_cuda_get_local_stream());
+#endif
+
+	}
+}
+
+void chol_cpu_codelet_update_u22(void *descr[], void *_args)
+{
+	chol_common_cpu_codelet_update_u22(descr, 0, _args);
+}
+
+#ifdef STARPU_USE_CUDA
+void chol_cublas_codelet_update_u22(void *descr[], void *_args)
+{
+	chol_common_cpu_codelet_update_u22(descr, 1, _args);
+}
+#endif /* STARPU_USE_CUDA */
+
+/* 
+ * U21
+ */
+
+static inline void chol_common_codelet_update_u21(void *descr[], int s, __attribute__((unused)) void *_args)
+{
+/*	printf("21\n"); */
+	float *sub11;
+	float *sub21;
+
+	sub11 = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
+	sub21 = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
+
+	unsigned ld11 = STARPU_MATRIX_GET_LD(descr[0]);
+	unsigned ld21 = STARPU_MATRIX_GET_LD(descr[1]);
+
+	unsigned nx21 = STARPU_MATRIX_GET_NY(descr[1]);
+	unsigned ny21 = STARPU_MATRIX_GET_NX(descr[1]);
+
+	switch (s)
+	{
+		case 0:
+			STRSM("R", "L", "T", "N", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
+			break;
+#ifdef STARPU_USE_CUDA
+		case 1:
+			cublasStrsm('R', 'L', 'T', 'N', nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+			break;
+#endif
+		default:
+			STARPU_ABORT();
+			break;
+	}
+}
+
+void chol_cpu_codelet_update_u21(void *descr[], void *_args)
+{
+	 chol_common_codelet_update_u21(descr, 0, _args);
+}
+
+#ifdef STARPU_USE_CUDA
+void chol_cublas_codelet_update_u21(void *descr[], void *_args)
+{
+	chol_common_codelet_update_u21(descr, 1, _args);
+}
+#endif 
+
+/*
+ *	U11
+ */
+
+static inline void chol_common_codelet_update_u11(void *descr[], int s, __attribute__((unused)) void *_args) 
+{
+/*	printf("11\n"); */
+	float *sub11;
+
+	sub11 = (float *)STARPU_MATRIX_GET_PTR(descr[0]); 
+
+	unsigned nx = STARPU_MATRIX_GET_NY(descr[0]);
+	unsigned ld = STARPU_MATRIX_GET_LD(descr[0]);
+
+	unsigned z;
+
+	switch (s)
+	{
+		case 0:
+
+			/*
+			 *	- alpha 11 <- lambda 11 = sqrt(alpha11)
+			 *	- alpha 21 <- l 21	= alpha 21 / lambda 11
+			 *	- A22 <- A22 - l21 trans(l21)
+			 */
+
+			for (z = 0; z < nx; z++)
+			{
+				float lambda11;
+				lambda11 = sqrt(sub11[z+z*ld]);
+				sub11[z+z*ld] = lambda11;
+
+				STARPU_ASSERT(lambda11 != 0.0f);
+		
+				SSCAL(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
+		
+				SSYR("L", nx - z - 1, -1.0f, 
+							&sub11[(z+1)+z*ld], 1,
+							&sub11[(z+1)+(z+1)*ld], ld);
+			}
+			break;
+#ifdef STARPU_USE_CUDA
+		case 1:
+#ifdef STARPU_HAVE_MAGMA
+			{
+			int ret;
+			int info;
+			ret = magma_spotrf_gpu('L', nx, sub11, ld, &info);
+			if (ret != MAGMA_SUCCESS)
+			{
+				fprintf(stderr, "Error in Magma: %d\n", ret);
+				STARPU_ABORT();
+			}
+			cudaError_t cures = cudaThreadSynchronize();
+			STARPU_ASSERT(!cures);
+			}
+#else
+			{
+
+			float *lambda11;
+			cudaHostAlloc((void **)&lambda11, sizeof(float), 0);
+
+			for (z = 0; z < nx; z++)
+			{
+				
+				cudaMemcpyAsync(lambda11, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
+				cudaStreamSynchronize(starpu_cuda_get_local_stream());
+
+				STARPU_ASSERT(*lambda11 != 0.0f);
+				
+				*lambda11 = sqrt(*lambda11);
+
+/*				cublasSetVector(1, sizeof(float), lambda11, sizeof(float), &sub11[z+z*ld], sizeof(float)); */
+				cudaMemcpyAsync(&sub11[z+z*ld], lambda11, sizeof(float), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
+
+				cublasSscal(nx - z - 1, 1.0f/(*lambda11), &sub11[(z+1)+z*ld], 1);
+
+				cublasSsyr('U', nx - z - 1, -1.0f,
+							&sub11[(z+1)+z*ld], 1,
+							&sub11[(z+1)+(z+1)*ld], ld);
+			}
+
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+			cudaFreeHost(lambda11);
+			}
+#endif
+			break;
+#endif
+		default:
+			STARPU_ABORT();
+			break;
+	}
+}
+
+
+void chol_cpu_codelet_update_u11(void *descr[], void *_args)
+{
+	chol_common_codelet_update_u11(descr, 0, _args);
+}
+
+#ifdef STARPU_USE_CUDA
+void chol_cublas_codelet_update_u11(void *descr[], void *_args)
+{
+	chol_common_codelet_update_u11(descr, 1, _args);
+}
+#endif/* STARPU_USE_CUDA */

+ 160 - 0
sched_ctx_hypervisor/examples/cholesky/cholesky_models.c

@@ -0,0 +1,160 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Télécom-SudParis
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * As a convention, in this file, buffers[0] is represented by A,
+ * 				  buffers[1] is B ...
+ */
+
+/*
+ *	Performance (cost) models for the Cholesky kernels
+ */
+
+#include <starpu.h>
+#include "cholesky.h"
+
+/* #define USE_PERTURBATION	1 */
+
+/* When USE_PERTURBATION is defined, PERTURBATE() adds random noise of
+ * amplitude AMPL around the predicted cost; otherwise it is the identity. */
+#ifdef USE_PERTURBATION
+#define PERTURBATE(a)	((starpu_drand48()*2.0f*(AMPL) + 1.0f - (AMPL))*(a))
+#else
+#define PERTURBATE(a)	(a)
+#endif
+
+/* Predicted duration of the 11 (POTRF) kernel on a CPU core, cubic in
+ * the tile width n; the constants were calibrated experimentally. */
+static double cpu_chol_task_11_cost(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
+{
+	uint32_t n;
+
+	/* n is the width of the single (RW) tile */
+	n = starpu_matrix_get_nx(task->handles[0]);
+
+	double cost = (((double)(n)*n*n)/1000.0f*0.894/0.79176);
+
+#ifdef STARPU_MODEL_DEBUG
+	FPRINTF(stdout, "cpu_chol_task_11_cost n %d cost %e\n", n, cost);
+#endif
+
+	return PERTURBATE(cost);
+}
+
+/* Predicted duration of the 11 (POTRF) kernel on a CUDA device. */
+static double cuda_chol_task_11_cost(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
+{
+	uint32_t n;
+
+	n = starpu_matrix_get_nx(task->handles[0]);
+
+	double cost = (((double)(n)*n*n)/50.0f/10.75/5.088633/0.9883);
+
+#ifdef STARPU_MODEL_DEBUG
+	FPRINTF(stdout, "cuda_chol_task_11_cost n %d cost %e\n", n, cost);
+#endif
+
+	return PERTURBATE(cost);
+}
+
+/* Predicted duration of the 21 (TRSM) kernel on a CPU core, cubic in
+ * the tile width n; the constants were calibrated experimentally. */
+static double cpu_chol_task_21_cost(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
+{
+	uint32_t n;
+
+	n = starpu_matrix_get_nx(task->handles[0]);
+
+	double cost = (((double)(n)*n*n)/7706.674/0.95/0.9965);
+
+#ifdef STARPU_MODEL_DEBUG
+	FPRINTF(stdout, "cpu_chol_task_21_cost n %d cost %e\n", n, cost);
+#endif
+
+	return PERTURBATE(cost);
+}
+
+/* Predicted duration of the 21 (TRSM) kernel on a CUDA device. */
+static double cuda_chol_task_21_cost(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
+{
+	uint32_t n;
+
+	n = starpu_matrix_get_nx(task->handles[0]);
+
+	double cost = (((double)(n)*n*n)/50.0f/10.75/87.29520);
+
+#ifdef STARPU_MODEL_DEBUG
+	FPRINTF(stdout, "cuda_chol_task_21_cost n %d cost %e\n", n, cost);
+#endif
+
+	return PERTURBATE(cost);
+}
+
+/* Predicted duration of the 22 (trailing update) kernel on a CPU core,
+ * cubic in the tile width n; constants calibrated experimentally. */
+static double cpu_chol_task_22_cost(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
+{
+	uint32_t n;
+
+	n = starpu_matrix_get_nx(task->handles[0]);
+
+	double cost = (((double)(n)*n*n)/50.0f/10.75/8.0760);
+
+#ifdef STARPU_MODEL_DEBUG
+	FPRINTF(stdout, "cpu_chol_task_22_cost n %d cost %e\n", n, cost);
+#endif
+
+	return PERTURBATE(cost);
+}
+
+/* Predicted duration of the 22 (trailing update) kernel on a CUDA device. */
+static double cuda_chol_task_22_cost(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
+{
+	uint32_t n;
+
+	n = starpu_matrix_get_nx(task->handles[0]);
+
+	double cost = (((double)(n)*n*n)/50.0f/10.75/76.30666);
+
+#ifdef STARPU_MODEL_DEBUG
+	FPRINTF(stdout, "cuda_chol_task_22_cost n %d cost %e\n", n, cost);
+#endif
+
+	return PERTURBATE(cost);
+}
+
+/* History-based performance model for the 11 (POTRF) kernel, seeded
+ * with the explicit per-architecture cost functions above. */
+struct starpu_perfmodel chol_model_11 =
+{
+	.per_arch =
+	{
+		[STARPU_CPU_DEFAULT][0] = { .cost_function = cpu_chol_task_11_cost },
+		[STARPU_CUDA_DEFAULT][0] = { .cost_function = cuda_chol_task_11_cost }
+	},
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "chol_model_11"
+};
+
+/* History-based performance model for the 21 (TRSM) kernel. */
+struct starpu_perfmodel chol_model_21 =
+{
+	.per_arch =
+	{
+		[STARPU_CPU_DEFAULT][0] = { .cost_function = cpu_chol_task_21_cost },
+		[STARPU_CUDA_DEFAULT][0] = { .cost_function = cuda_chol_task_21_cost }
+	},
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "chol_model_21"
+};
+
+/* History-based performance model for the 22 (trailing update) kernel. */
+struct starpu_perfmodel chol_model_22 =
+{
+	.per_arch =
+	{
+		[STARPU_CPU_DEFAULT][0] = { .cost_function = cpu_chol_task_22_cost },
+		[STARPU_CUDA_DEFAULT][0] = { .cost_function = cuda_chol_task_22_cost }
+	},
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "chol_model_22"
+};

+ 407 - 0
sched_ctx_hypervisor/examples/cholesky/cholesky_tag.c

@@ -0,0 +1,407 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "cholesky.h"
+
+/*
+ *	Some useful functions
+ */
+
+/* Allocate an empty task whose completion is advertised through tag 'id';
+ * the caller fills in the codelet, handles and dependencies. */
+static struct starpu_task *create_task(starpu_tag_t id)
+{
+	struct starpu_task *task = starpu_task_create();
+		task->cl_arg = NULL;
+		task->use_tag = 1;
+		task->tag_id = id;
+
+	return task;
+}
+
+/*
+ *	Create the codelets
+ */
+
+/* Codelet for the 11 (POTRF) kernel: factorize the diagonal tile in place. */
+static struct starpu_codelet cl11 =
+{
+	.modes = { STARPU_RW },
+	.where = STARPU_CPU|STARPU_CUDA,
+	.cpu_funcs = {chol_cpu_codelet_update_u11, NULL},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {chol_cublas_codelet_update_u11, NULL},
+#endif
+	.nbuffers = 1,
+	.model = &chol_model_11
+};
+
+/* Build (but do not submit) the 11 task of iteration k; returning the
+ * task lets the caller defer the submission of the very first one. */
+static struct starpu_task * create_task_11(starpu_data_handle_t dataA, unsigned k)
+{
+/*	FPRINTF(stdout, "task 11 k = %d TAG = %llx\n", k, (TAG11(k))); */
+
+	struct starpu_task *task = create_task(TAG11(k));
+
+	task->cl = &cl11;
+
+	/* which sub-data is manipulated ? */
+	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, k);
+
+	/* this is an important task */
+	if (!noprio)
+		task->priority = STARPU_MAX_PRIO;
+
+	/* enforce dependencies ... */
+	if (k > 0)
+	{
+		starpu_tag_declare_deps(TAG11(k), 1, TAG22(k-1, k, k));
+	}
+
+	return task;
+}
+
+/* Codelet for the 21 (TRSM) kernel: triangular solve of tile (k,j)
+ * against the already-factorized diagonal tile (k,k). */
+static struct starpu_codelet cl21 =
+{
+	.modes = { STARPU_R, STARPU_RW },
+	.where = STARPU_CPU|STARPU_CUDA,
+	.cpu_funcs = {chol_cpu_codelet_update_u21, NULL},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {chol_cublas_codelet_update_u21, NULL},
+#endif
+	.nbuffers = 2,
+	.model = &chol_model_21
+};
+
+/* Create and submit the 21 task for column j of iteration k. */
+static void create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned j)
+{
+	struct starpu_task *task = create_task(TAG21(k, j));
+
+	task->cl = &cl21;
+
+	/* which sub-data is manipulated ? */
+	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, k);
+	task->handles[1] = starpu_data_get_sub_data(dataA, 2, k, j);
+
+	if (!noprio && (j == k+1))
+	{
+		task->priority = STARPU_MAX_PRIO;
+	}
+
+	/* enforce dependencies ... */
+	if (k > 0)
+	{
+		starpu_tag_declare_deps(TAG21(k, j), 2, TAG11(k), TAG22(k-1, k, j));
+	}
+	else
+	{
+		starpu_tag_declare_deps(TAG21(k, j), 1, TAG11(k));
+	}
+
+	int ret = starpu_task_submit(task);
+        if (STARPU_UNLIKELY(ret == -ENODEV))
+	{
+                FPRINTF(stderr, "No worker may execute this task\n");
+                /* NOTE(review): exits with status 0 on -ENODEV — presumably
+                 * the "skip" convention; confirm this is intended. */
+                exit(0);
+        }
+
+}
+
+/* Codelet for the 22 (trailing update) kernel: update tile (i,j) with
+ * the product of tiles (k,i) and (k,j). */
+static struct starpu_codelet cl22 =
+{
+	.modes = { STARPU_R, STARPU_R, STARPU_RW },
+	.where = STARPU_CPU|STARPU_CUDA,
+	.cpu_funcs = {chol_cpu_codelet_update_u22, NULL},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {chol_cublas_codelet_update_u22, NULL},
+#endif
+	.nbuffers = 3,
+	.model = &chol_model_22
+};
+
+/* Create and submit the 22 task updating tile (i,j) at iteration k. */
+static void create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned i, unsigned j)
+{
+/*	FPRINTF(stdout, "task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j)); */
+
+	struct starpu_task *task = create_task(TAG22(k, i, j));
+
+	task->cl = &cl22;
+
+	/* which sub-data is manipulated ? */
+	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, i);
+	task->handles[1] = starpu_data_get_sub_data(dataA, 2, k, j);
+	task->handles[2] = starpu_data_get_sub_data(dataA, 2, i, j);
+
+	if (!noprio && (i == k + 1) && (j == k +1) )
+	{
+		task->priority = STARPU_MAX_PRIO;
+	}
+
+	/* enforce dependencies ... */
+	if (k > 0)
+	{
+		starpu_tag_declare_deps(TAG22(k, i, j), 3, TAG22(k-1, i, j), TAG21(k, i), TAG21(k, j));
+	}
+	else
+	{
+		starpu_tag_declare_deps(TAG22(k, i, j), 2, TAG21(k, i), TAG21(k, j));
+	}
+
+	int ret = starpu_task_submit(task);
+        if (STARPU_UNLIKELY(ret == -ENODEV))
+	{
+                FPRINTF(stderr, "No worker may execute this task\n");
+                exit(0);
+        }
+}
+
+
+
+/*
+ *	code to bootstrap the factorization
+ *	and construct the DAG
+ */
+
+/* Build the whole tag-based Cholesky DAG over the partitioned matrix,
+ * kick it off by submitting the deferred first 11 task, wait for the
+ * final tag and report timing and synthetic GFlops.  Note: the timer
+ * starts before task creation, so submission overhead is included. */
+static void _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
+{
+	struct timeval start;
+	struct timeval end;
+
+	struct starpu_task *entry_task = NULL;
+
+	/* create all the DAG nodes */
+	unsigned i,j,k;
+
+	gettimeofday(&start, NULL);
+
+	for (k = 0; k < nblocks; k++)
+	{
+		struct starpu_task *task = create_task_11(dataA, k);
+		/* we defer the launch of the first task */
+		if (k == 0)
+		{
+			entry_task = task;
+		}
+		else
+		{
+			int ret = starpu_task_submit(task);
+                        if (STARPU_UNLIKELY(ret == -ENODEV))
+			{
+                                FPRINTF(stderr, "No worker may execute this task\n");
+                                exit(0);
+                        }
+
+		}
+
+		for (j = k+1; j<nblocks; j++)
+		{
+			create_task_21(dataA, k, j);
+
+			for (i = k+1; i<nblocks; i++)
+			{
+				if (i <= j)
+					create_task_22(dataA, k, i, j);
+			}
+		}
+	}
+
+	/* schedule the codelet */
+	int ret = starpu_task_submit(entry_task);
+        if (STARPU_UNLIKELY(ret == -ENODEV))
+	{
+                FPRINTF(stderr, "No worker may execute this task\n");
+                exit(0);
+        }
+
+
+	/* stall the application until the end of computations */
+	starpu_tag_wait(TAG11(nblocks-1));
+
+	starpu_data_unpartition(dataA, 0);
+
+	gettimeofday(&end, NULL);
+
+
+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	FPRINTF(stderr, "Computation took (in ms)\n");
+	FPRINTF(stdout, "%2.2f\n", timing/1000);
+
+	unsigned n = starpu_matrix_get_nx(dataA);
+
+	double flop = (1.0f*n*n*n)/3.0f;
+	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+}
+
+/* Initialize StarPU and CUBLAS and allocate the dim x dim input matrix,
+ * pinned through starpu_malloc() when 'pinned' is set, plain malloc()
+ * otherwise.  Returns 0 on success, 77 when no worker is available. */
+static int initialize_system(float **A, unsigned dim, unsigned pinned)
+{
+	int ret;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	starpu_helper_cublas_init();
+
+	if (pinned)
+	{
+		starpu_malloc((void **)A, (size_t)dim*dim*sizeof(float));
+	}
+	else
+	{
+		/* Promote to size_t before multiplying, as the pinned branch
+		 * already does, so dim*dim cannot overflow 'unsigned'. */
+		*A = malloc((size_t)dim*dim*sizeof(float));
+	}
+	/* Fail loudly instead of crashing later if the allocation failed. */
+	STARPU_ASSERT(*A);
+	return 0;
+}
+
+/* Register the flat matrix with StarPU, cut it into nblocks x nblocks
+ * tiles with two chained filters, run the factorization, and unregister.
+ * Sequential consistency is disabled: dependencies come from tags. */
+static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
+{
+	starpu_data_handle_t dataA;
+
+	/* monitor and partition the A matrix into blocks :
+	 * one block is now determined by 2 unsigned (i,j) */
+	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(float));
+
+	starpu_data_set_sequential_consistency_flag(dataA, 0);
+
+	struct starpu_data_filter f =
+	{
+		.filter_func = starpu_vertical_block_filter_func,
+		.nchildren = nblocks
+	};
+
+	struct starpu_data_filter f2 =
+	{
+		.filter_func = starpu_block_filter_func,
+		.nchildren = nblocks
+	};
+
+	starpu_data_map_filters(dataA, 2, &f, &f2);
+
+	_cholesky(dataA, nblocks);
+
+	starpu_data_unregister(dataA);
+}
+
+/* Release the matrix (with the allocator matching initialize_system's
+ * 'pinned' choice), then shut CUBLAS and StarPU down. */
+static void shutdown_system(float **matA, unsigned pinned)
+{
+	if (pinned)
+	{
+		starpu_free(*matA);
+	}
+	else
+	{
+		free(*matA);
+	}
+
+	starpu_helper_cublas_shutdown();
+	starpu_shutdown();
+}
+
+/* Driver: build a diagonally-dominant Hilbert-like matrix, factorize it
+ * with the tag-based Cholesky, and (under CHECK_OUTPUT) verify L.Lt
+ * against the input by recomputing the product with SSYRK. */
+int main(int argc, char **argv)
+{
+	/* create a simple definite positive symetric matrix example
+	 *
+	 *	Hilbert matrix : h(i,j) = 1/(i+j+1)
+	 * */
+
+	parse_args(argc, argv);
+
+	float *mat;
+	int ret = initialize_system(&mat, size, pinned);
+	if (ret) return ret;
+
+	unsigned i,j;
+	for (i = 0; i < size; i++)
+	{
+		for (j = 0; j < size; j++)
+		{
+			mat[j +i*size] = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
+			/* mat[j +i*size] = ((i == j)?1.0f*size:0.0f); */
+		}
+	}
+
+
+#ifdef CHECK_OUTPUT
+	FPRINTF(stdout, "Input :\n");
+
+	for (j = 0; j < size; j++)
+	{
+		for (i = 0; i < size; i++)
+		{
+			if (i <= j)
+			{
+				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
+			}
+			else
+			{
+				FPRINTF(stdout, ".\t");
+			}
+		}
+		FPRINTF(stdout, "\n");
+	}
+#endif
+
+
+	cholesky(mat, size, size, nblocks);
+
+#ifdef CHECK_OUTPUT
+	FPRINTF(stdout, "Results :\n");
+
+	for (j = 0; j < size; j++)
+	{
+		for (i = 0; i < size; i++)
+		{
+			if (i <= j)
+			{
+				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
+			}
+			else
+			{
+				FPRINTF(stdout, ".\t");
+				mat[j+i*size] = 0.0f; /* debug */
+			}
+		}
+		FPRINTF(stdout, "\n");
+	}
+
+	FPRINTF(stderr, "compute explicit LLt ...\n");
+	float *test_mat = malloc(size*size*sizeof(float));
+	STARPU_ASSERT(test_mat);
+
+	SSYRK("L", "N", size, size, 1.0f,
+				mat, size, 0.0f, test_mat, size);
+
+	FPRINTF(stderr, "comparing results ...\n");
+	for (j = 0; j < size; j++)
+	{
+		for (i = 0; i < size; i++)
+		{
+			if (i <= j)
+			{
+				FPRINTF(stdout, "%2.2f\t", test_mat[j +i*size]);
+			}
+			else
+			{
+				FPRINTF(stdout, ".\t");
+			}
+		}
+		FPRINTF(stdout, "\n");
+	}
+	free(test_mat);
+#endif
+
+	shutdown_system(&mat, pinned);
+	return 0;
+}

+ 333 - 0
sched_ctx_hypervisor/examples/cholesky/cholesky_tile_tag.c

@@ -0,0 +1,333 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "cholesky.h"
+
+/* A [ y ] [ x ] */
+/* Lower-triangular tiles of the input matrix and their registered
+ * StarPU handles; only entries with x <= y are ever allocated. */
+float *A[NMAXBLOCKS][NMAXBLOCKS];
+starpu_data_handle_t A_state[NMAXBLOCKS][NMAXBLOCKS];
+
+/*
+ *	Some useful functions
+ */
+
+/* Allocate an empty task whose completion is advertised through tag 'id';
+ * the caller fills in the codelet, handles and dependencies. */
+static struct starpu_task *create_task(starpu_tag_t id)
+{
+	struct starpu_task *task = starpu_task_create();
+		task->cl_arg = NULL;
+		task->use_tag = 1;
+		task->tag_id = id;
+
+	return task;
+}
+
+/*
+ *	Create the codelets
+ */
+
+/* Codelet for the 11 (POTRF) kernel on pre-registered tiles; can also
+ * run on Cell/Gordon when SPU_FUNC_POTRF is available. */
+static struct starpu_codelet cl11 =
+{
+	.modes = { STARPU_RW },
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
+	.cpu_funcs = {chol_cpu_codelet_update_u11, NULL},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {chol_cublas_codelet_update_u11, NULL},
+#endif
+#ifdef STARPU_USE_GORDON
+#ifdef SPU_FUNC_POTRF
+	.gordon_func = SPU_FUNC_POTRF,
+#else
+#warning SPU_FUNC_POTRF is not available
+#endif
+#endif
+	.nbuffers = 1,
+	.model = &chol_model_11
+};
+
+/* Build (but do not submit) the 11 task of iteration k.
+ * NOTE(review): the 'nblocks' parameter is unused here. */
+static struct starpu_task * create_task_11(unsigned k, unsigned nblocks)
+{
+/*	FPRINTF(stdout, "task 11 k = %d TAG = %llx\n", k, (TAG11(k))); */
+
+	struct starpu_task *task = create_task(TAG11(k));
+	
+	task->cl = &cl11;
+
+	/* which sub-data is manipulated ? */
+	task->handles[0] = A_state[k][k];
+
+	/* this is an important task */
+	task->priority = STARPU_MAX_PRIO;
+
+	/* enforce dependencies ... */
+	if (k > 0)
+	{
+		starpu_tag_declare_deps(TAG11(k), 1, TAG22(k-1, k, k));
+	}
+
+	return task;
+}
+
+/* Codelet for the 21 (TRSM) kernel on pre-registered tiles. */
+static struct starpu_codelet cl21 =
+{
+	.modes = { STARPU_R, STARPU_RW },
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
+	.cpu_funcs = {chol_cpu_codelet_update_u21, NULL},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {chol_cublas_codelet_update_u21, NULL},
+#endif
+#ifdef STARPU_USE_GORDON
+#ifdef SPU_FUNC_STRSM
+	.gordon_func = SPU_FUNC_STRSM,
+#else
+#warning SPU_FUNC_STRSM is not available
+#endif
+#endif
+	.nbuffers = 2,
+	.model = &chol_model_21
+};
+
+/* Create and submit the 21 task for row j of iteration k. */
+static void create_task_21(unsigned k, unsigned j)
+{
+	int ret;
+
+	struct starpu_task *task = create_task(TAG21(k, j));
+
+	task->cl = &cl21;	
+
+	/* which sub-data is manipulated ? */
+	task->handles[0] = A_state[k][k];
+	task->handles[1] = A_state[j][k];
+
+	if (j == k+1)
+	{
+		task->priority = STARPU_MAX_PRIO;
+	}
+
+	/* enforce dependencies ... */
+	if (k > 0)
+	{
+		starpu_tag_declare_deps(TAG21(k, j), 2, TAG11(k), TAG22(k-1, k, j));
+	}
+	else
+	{
+		starpu_tag_declare_deps(TAG21(k, j), 1, TAG11(k));
+	}
+
+	ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+}
+
+/* Codelet for the 22 (trailing update) kernel on pre-registered tiles. */
+static struct starpu_codelet cl22 =
+{
+	.modes = { STARPU_R, STARPU_R, STARPU_RW },
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
+	.cpu_funcs = {chol_cpu_codelet_update_u22, NULL},
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {chol_cublas_codelet_update_u22, NULL},
+#endif
+#ifdef STARPU_USE_GORDON
+#ifdef SPU_FUNC_SGEMM
+	.gordon_func = SPU_FUNC_SGEMM,
+#else
+#warning SPU_FUNC_SGEMM is not available
+#endif
+#endif
+	.nbuffers = 3,
+	.model = &chol_model_22
+};
+
+/* Create and submit the 22 task updating tile (j,i) at iteration k. */
+static void create_task_22(unsigned k, unsigned i, unsigned j)
+{
+	int ret;
+
+/*	FPRINTF(stdout, "task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j)); */
+
+	struct starpu_task *task = create_task(TAG22(k, i, j));
+
+	task->cl = &cl22;
+
+	/* which sub-data is manipulated ? */
+	task->handles[0] = A_state[i][k];
+	task->handles[1] = A_state[j][k];
+	task->handles[2] = A_state[j][i];
+
+	if ( (i == k + 1) && (j == k +1) )
+	{
+		task->priority = STARPU_MAX_PRIO;
+	}
+
+	/* enforce dependencies ... */
+	if (k > 0)
+	{
+		starpu_tag_declare_deps(TAG22(k, i, j), 3, TAG22(k-1, i, j), TAG21(k, i), TAG21(k, j));
+	}
+	else
+	{
+		starpu_tag_declare_deps(TAG22(k, i, j), 2, TAG21(k, i), TAG21(k, j));
+	}
+
+	ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+}
+
+
+
+/*
+ *	code to bootstrap the factorization 
+ *	and construct the DAG
+ */
+
+/* Build the full tag-based DAG over the pre-registered tiles, submit the
+ * deferred first task, wait for the final tag and report performance.
+ * Unlike the partitioned variant, timing starts at the entry submission. */
+static void cholesky_no_stride(void)
+{
+	int ret;
+
+	struct timeval start;
+	struct timeval end;
+
+	struct starpu_task *entry_task = NULL;
+
+	/* create all the DAG nodes */
+	unsigned i,j,k;
+
+	for (k = 0; k < nblocks; k++)
+	{
+		struct starpu_task *task = create_task_11(k, nblocks);
+		/* we defer the launch of the first task */
+		if (k == 0)
+		{
+			entry_task = task;
+		}
+		else
+		{
+			ret = starpu_task_submit(task);
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+		}
+		
+		for (j = k+1; j<nblocks; j++)
+		{
+			create_task_21(k, j);
+
+			for (i = k+1; i<nblocks; i++)
+			{
+				if (i <= j)
+					create_task_22(k, i, j);
+			}
+		}
+	}
+
+	/* schedule the codelet */
+	gettimeofday(&start, NULL);
+	ret = starpu_task_submit(entry_task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+	/* stall the application until the end of computations */
+	starpu_tag_wait(TAG11(nblocks-1));
+
+	gettimeofday(&end, NULL);
+
+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	FPRINTF(stderr, "Computation took (in ms)\n");
+	FPRINTF(stdout, "%2.2f\n", timing/1000);
+
+	double flop = (1.0f*size*size*size)/3.0f;
+	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+}
+
+/* Driver: allocate and fill the lower-triangular tiles, register each
+ * one with StarPU, run the tile/tag Cholesky, then tear everything down. */
+int main(int argc, char **argv)
+{
+	unsigned x, y;
+	unsigned i, j;
+	int ret;
+
+	parse_args(argc, argv);
+	assert(nblocks <= NMAXBLOCKS);
+
+	FPRINTF(stderr, "BLOCK SIZE = %d\n", size / nblocks);
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	/* Disable sequential consistency */
+	starpu_data_set_default_sequential_consistency_flag(0);
+
+	starpu_helper_cublas_init();
+
+	for (y = 0; y < nblocks; y++)
+	for (x = 0; x < nblocks; x++)
+	{
+		if (x <= y)
+		{
+#ifdef STARPU_HAVE_POSIX_MEMALIGN
+			/* NOTE(review): posix_memalign's return value is ignored;
+			 * the assert below only checks the pointer. */
+			posix_memalign((void **)&A[y][x], 128, BLOCKSIZE*BLOCKSIZE*sizeof(float));
+#else
+			A[y][x] = malloc(BLOCKSIZE*BLOCKSIZE*sizeof(float));
+#endif
+			assert(A[y][x]);
+		}
+	}
+
+	/* create a simple definite positive symetric matrix example
+	 *
+	 *	Hilbert matrix : h(i,j) = 1/(i+j+1) ( + n In to make is stable ) 
+	 * */
+	for (y = 0; y < nblocks; y++)
+	for (x = 0; x < nblocks; x++)
+	if (x <= y)
+	{
+		for (i = 0; i < BLOCKSIZE; i++)
+		for (j = 0; j < BLOCKSIZE; j++)
+		{
+			A[y][x][i*BLOCKSIZE + j] =
+				(float)(1.0f/((float) (1.0+(x*BLOCKSIZE+i)+(y*BLOCKSIZE+j))));
+
+			/* make it a little more numerically stable ... ;) */
+			if ((x == y) && (i == j))
+				A[y][x][i*BLOCKSIZE + j] += (float)(2*size);
+		}
+	}
+
+	for (y = 0; y < nblocks; y++)
+	for (x = 0; x < nblocks; x++)
+	{
+		if (x <= y)
+		{
+			starpu_matrix_data_register(&A_state[y][x], 0, (uintptr_t)A[y][x], 
+				BLOCKSIZE, BLOCKSIZE, BLOCKSIZE, sizeof(float));
+		}
+	}
+
+	cholesky_no_stride();
+
+	for (y = 0; y < nblocks; y++)
+	for (x = 0; x < nblocks; x++)
+	{
+		if (x <= y)
+		{
+			starpu_data_unregister(A_state[y][x]);
+			free(A[y][x]);
+		}
+	}
+
+	starpu_helper_cublas_shutdown();
+
+	starpu_shutdown();
+	return 0;
+}
+
+

+ 525 - 0
sched_ctx_hypervisor/examples/sched_ctx_utils/sched_ctx_utils.c

@@ -0,0 +1,525 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012  INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "sched_ctx_utils.h"
+#include <starpu.h>
+#include "sched_ctx_hypervisor.h"
+/* Number of repetitions of each benchmark; results are averaged. */
+#define NSAMPLES 3
+
+/* Command-line configurable problem sizes and worker counts for the two
+ * concurrent benchmarks (see parse_args_ctx). */
+unsigned size1;
+unsigned size2;
+unsigned nblocks1;
+unsigned nblocks2;
+unsigned cpu1;
+unsigned cpu2;
+unsigned gpu;
+unsigned gpu1;
+unsigned gpu2;
+
+/* Per-benchmark description: its scheduling context, worker set and the
+ * kernel to run on each of the NSAMPLES pre-built input matrices. */
+typedef struct {
+	unsigned id;
+	unsigned ctx;
+	int the_other_ctx;
+	int *workers;
+	int nworkers;
+	void (*bench)(float*, unsigned, unsigned);
+	unsigned size;
+	unsigned nblocks;
+	float *mat[NSAMPLES];
+} params;
+
+/* Accumulated results for one benchmark. */
+typedef struct {
+	double flops;
+	double avg_timing;
+} retvals;
+
+int first = 1;
+pthread_mutex_t mut;
+retvals rv[2];
+params p1, p2;
+int it = 0;
+int it2 = 0;
+
+/* Thread-local slot holding the id (0/1) of the benchmark a thread runs. */
+pthread_key_t key;
+
+/* Set the default benchmark parameters and reset both per-context
+ * result accumulators. */
+void init()
+{
+	size1 = 4*1024;
+	size2 = 4*1024;
+	nblocks1 = 16;
+	nblocks2 = 16;
+	cpu1 = 0;
+	cpu2 = 0;
+	gpu = 0;
+	gpu1 = 0;
+	gpu2 = 0;
+
+	rv[0].flops = 0.0;
+	rv[1].flops = 0.0;
+	/* Fix: rv[0].avg_timing was never reset (rv[1].avg_timing was
+	 * cleared twice instead). */
+	rv[0].avg_timing = 0.0;
+	rv[1].avg_timing = 0.0;
+
+	p1.ctx = 0;
+	p2.ctx = 0;
+
+	p1.id = 0;
+	p2.id = 1;
+	pthread_key_create(&key, NULL);
+}
+
+/* Accumulate one sample's gflops and timing into the result slot of the
+ * benchmark bound to the calling thread (looked up via the TLS 'key'). */
+void update_sched_ctx_timing_results(double flops, double avg_timing)
+{
+	unsigned *id = pthread_getspecific(key);
+	rv[*id].flops += flops;
+	rv[*id].avg_timing += avg_timing;
+}
+
+/* Thread entry point: bind the thread to its scheduling context (when one
+ * was created), run the benchmark NSAMPLES times, stop hypervisor
+ * resizing and average the accumulated results. */
+void* start_bench(void *val){
+	params *p = (params*)val;
+	int i;
+
+	pthread_setspecific(key, &p->id);
+
+	if(p->ctx != 0)
+		starpu_set_sched_ctx(&p->ctx);
+
+	for(i = 0; i < NSAMPLES; i++)
+		p->bench(p->mat[i], p->size, p->nblocks);
+
+	sched_ctx_hypervisor_stop_resize(p->the_other_ctx);
+	rv[p->id].flops /= NSAMPLES;
+	rv[p->id].avg_timing /= NSAMPLES;
+
+	/* Fix: a void* function must return a value; the result is ignored
+	 * by pthread_join's callers. */
+	return NULL;
+}
+
+/* Allocate (through starpu_malloc, so possibly pinned) and fill a
+ * size x size Hilbert-like matrix made diagonally dominant so that the
+ * Cholesky factorization is numerically stable. */
+float* construct_matrix(unsigned size)
+{
+	float *mat;
+	starpu_malloc((void **)&mat, (size_t)size*size*sizeof(float));
+
+	unsigned i,j;
+	for (i = 0; i < size; i++)
+	{
+		for (j = 0; j < size; j++)
+		{
+			mat[j +i*size] = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
+			/* mat[j +i*size] = ((i == j)?1.0f*size:0.0f); */
+		}
+	}
+	return mat;
+}
+/* Run both benchmarks concurrently, one thread per scheduling context,
+ * then print their gflops, per-context average timings and the overall
+ * wall-clock time (in seconds). */
+void start_2benchs(void (*bench)(float*, unsigned, unsigned))
+{
+	p1.bench = bench;
+	p1.size = size1;
+	p1.nblocks = nblocks1;
+
+	p2.bench = bench;
+	p2.size = size2;
+	p2.nblocks = nblocks2;
+
+	int i;
+	for(i = 0; i < NSAMPLES; i++)
+	{
+		p1.mat[i] = construct_matrix(p1.size);
+		p2.mat[i] = construct_matrix(p2.size);
+	}
+
+	pthread_t tid[2];
+	pthread_mutex_init(&mut, NULL);
+
+	struct timeval start;
+	struct timeval end;
+
+	gettimeofday(&start, NULL);
+
+	/* Fix: start_bench already has the pthread start-routine type
+	 * void *(*)(void *); casting a function pointer through (void*)
+	 * is non-conforming and unnecessary. */
+	pthread_create(&tid[0], NULL, start_bench, (void*)&p1);
+	pthread_create(&tid[1], NULL, start_bench, (void*)&p2);
+
+	pthread_join(tid[0], NULL);
+	pthread_join(tid[1], NULL);
+
+	gettimeofday(&end, NULL);
+
+	pthread_mutex_destroy(&mut);
+
+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	timing /= 1000000;
+
+	printf("%2.2f %2.2f ", rv[0].flops, rv[1].flops);
+	printf("%2.2f %2.2f %2.2f\n", rv[0].avg_timing, rv[1].avg_timing, timing);
+
+}
+
+/* Run only the first benchmark, synchronously in the calling thread,
+ * and print its gflops, average timing and wall-clock time (seconds). */
+void start_1stbench(void (*bench)(float*, unsigned, unsigned))
+{
+	p1.bench = bench;
+	p1.size = size1;
+	p1.nblocks = nblocks1;
+
+	int i;
+	for(i = 0; i < NSAMPLES; i++)
+	{
+		p1.mat[i] = construct_matrix(p1.size);
+	}
+
+	struct timeval start;
+	struct timeval end;
+
+	gettimeofday(&start, NULL);
+
+	start_bench((void*)&p1);
+
+	gettimeofday(&end, NULL);
+
+	/* Fix: removed pthread_mutex_destroy(&mut) — 'mut' is only ever
+	 * initialized in start_2benchs(), and destroying an uninitialized
+	 * mutex is undefined behaviour. */
+
+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	timing /= 1000000;
+
+	printf("%2.2f ", rv[0].flops);
+	printf("%2.2f %2.2f\n", rv[0].avg_timing, timing);
+}
+
+/* Run only the second benchmark, synchronously in the calling thread,
+ * and print its gflops, average timing and wall-clock time (seconds). */
+void start_2ndbench(void (*bench)(float*, unsigned, unsigned))
+{
+	p2.bench = bench;
+	p2.size = size2;
+	p2.nblocks = nblocks2;
+	int i;
+	for(i = 0; i < NSAMPLES; i++)
+	{
+		p2.mat[i] = construct_matrix(p2.size);
+	}
+
+	struct timeval start;
+	struct timeval end;
+
+	gettimeofday(&start, NULL);
+
+	start_bench((void*)&p2);
+
+	gettimeofday(&end, NULL);
+
+	/* Fix: removed pthread_mutex_destroy(&mut) — 'mut' is only ever
+	 * initialized in start_2benchs(), and destroying an uninitialized
+	 * mutex is undefined behaviour. */
+
+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	timing /= 1000000;
+
+	printf("%2.2f ", rv[1].flops);
+	printf("%2.2f %2.2f\n", rv[1].avg_timing, timing);
+}
+
+/* Initialize the hypervisor with the "idle" policy and create the two
+ * scheduling contexts: p1 starts with all workers, p2 starts empty and
+ * is grown by the hypervisor.  NOTE(review): the 'bench' parameter is
+ * unused here. */
+void construct_contexts(void (*bench)(float*, unsigned, unsigned))
+{
+	struct hypervisor_policy policy;
+	policy.custom = 0;
+	policy.name = "idle";
+	struct starpu_performance_counters *perf_counters = sched_ctx_hypervisor_init(&policy);
+	int nworkers1 = cpu1 + gpu + gpu1;
+	int nworkers2 = cpu2 + gpu + gpu2;
+	unsigned n_all_gpus = gpu + gpu1 + gpu2;
+
+
+	int i;
+	int k = 0;
+	/* NOTE(review): overrides the worker count computed from the
+	 * command line with a hard-coded 12 — presumably tuned for a
+	 * 12-worker test machine; confirm before reusing. */
+	nworkers1 = 12;
+	p1.workers = (int*)malloc(nworkers1*sizeof(int));
+
+	/* for(i = 0; i < gpu; i++) */
+	/* 	p1.workers[k++] = i; */
+
+	/* for(i = gpu; i < gpu + gpu1; i++) */
+	/* 	p1.workers[k++] = i; */
+
+
+	/* for(i = n_all_gpus; i < n_all_gpus + cpu1; i++) */
+	/* 	p1.workers[k++] = i; */
+
+
+	for(i = 0; i < 12; i++)
+		p1.workers[i] = i; 
+
+	p1.ctx = starpu_create_sched_ctx("heft", p1.workers, nworkers1, "sched_ctx1");
+	starpu_set_perf_counters(p1.ctx, perf_counters);
+	p2.the_other_ctx = (int)p1.ctx;
+	p1.nworkers = nworkers1;
+	sched_ctx_hypervisor_register_ctx(p1.ctx, 0.0);
+	
+	/* sched_ctx_hypervisor_ioctl(p1.ctx, */
+	/* 			   HYPERVISOR_MAX_IDLE, p1.workers, p1.nworkers, 5000.0, */
+	/* 			   HYPERVISOR_MAX_IDLE, p1.workers, gpu+gpu1, 100000.0, */
+	/* 			   HYPERVISOR_EMPTY_CTX_MAX_IDLE, p1.workers, p1.nworkers, 500000.0, */
+	/* 			   HYPERVISOR_GRANULARITY, 2, */
+	/* 			   HYPERVISOR_MIN_TASKS, 1000, */
+	/* 			   HYPERVISOR_NEW_WORKERS_MAX_IDLE, 100000.0, */
+	/* 			   HYPERVISOR_MIN_WORKERS, 6, */
+	/* 			   HYPERVISOR_MAX_WORKERS, 12, */
+	/* 			   NULL); */
+
+	/* Resizing bounds for the first context: between 6 and 12 workers. */
+	sched_ctx_hypervisor_ioctl(p1.ctx,
+				   HYPERVISOR_GRANULARITY, 2,
+				   HYPERVISOR_MIN_TASKS, 1000,
+				   HYPERVISOR_MIN_WORKERS, 6,
+				   HYPERVISOR_MAX_WORKERS, 12,
+				   NULL);
+
+	k = 0;
+	p2.workers = (int*)malloc(nworkers2*sizeof(int));
+
+	/* for(i = 0; i < gpu; i++) */
+	/* 	p2.workers[k++] = i; */
+
+	/* for(i = gpu + gpu1; i < gpu + gpu1 + gpu2; i++) */
+	/* 	p2.workers[k++] = i; */
+
+	/* for(i = n_all_gpus  + cpu1; i < n_all_gpus + cpu1 + cpu2; i++) */
+	/* 	p2.workers[k++] = i; */
+
+	/* The second context starts with no workers at all. */
+	p2.ctx = starpu_create_sched_ctx("heft", p2.workers, 0, "sched_ctx2");
+	starpu_set_perf_counters(p2.ctx, perf_counters);
+	p1.the_other_ctx = (int)p2.ctx;
+	p2.nworkers = 0;
+	sched_ctx_hypervisor_register_ctx(p2.ctx, 0.0);
+	
+	/* sched_ctx_hypervisor_ioctl(p2.ctx, */
+	/* 			   HYPERVISOR_MAX_IDLE, p2.workers, p2.nworkers, 2000.0, */
+	/* 			   HYPERVISOR_MAX_IDLE, p2.workers, gpu+gpu2, 5000.0, */
+	/* 			   HYPERVISOR_EMPTY_CTX_MAX_IDLE, p1.workers, p1.nworkers, 500000.0, */
+	/* 			   HYPERVISOR_GRANULARITY, 2, */
+	/* 			   HYPERVISOR_MIN_TASKS, 500, */
+	/* 			   HYPERVISOR_NEW_WORKERS_MAX_IDLE, 1000.0, */
+	/* 			   HYPERVISOR_MIN_WORKERS, 4, */
+	/* 			   HYPERVISOR_MAX_WORKERS, 8, */
+	/* 			   NULL); */
+
+	/* Resizing bounds for the second context: between 0 and 6 workers. */
+	sched_ctx_hypervisor_ioctl(p2.ctx,
+				   HYPERVISOR_GRANULARITY, 2,
+				   HYPERVISOR_MIN_TASKS, 500,
+				   HYPERVISOR_MIN_WORKERS, 0,
+				   HYPERVISOR_MAX_WORKERS, 6,
+				   NULL);
+
+}
+
+/* Callback invoked by the benchmarks around their phases: 'event' is
+ * START_BENCH or END_BENCH and 'task_tag' identifies the submission
+ * point at which a reconfiguration would take effect.
+ *
+ * The hand-tuned resizing experiments that used to live here (batches
+ * of sched_ctx_hypervisor_ioctl() calls shrinking one context and
+ * growing the other) were entirely commented out, so the function was
+ * already a no-op; the dead code has been removed.  Reintroduce
+ * sched_ctx_hypervisor_ioctl()/sched_ctx_hypervisor_resize() calls here
+ * to drive context resizing from application events. */
+void set_hypervisor_conf(int event, int task_tag)
+{
+	(void)event;
+	(void)task_tag;
+}
+
+/* Release the per-context worker arrays and shut the hypervisor down. */
+void end_contexts()
+{
+	free(p1.workers);
+	free(p2.workers);
+	sched_ctx_hypervisor_shutdown();
+}
+
+/* Initialize defaults then override them from "-flag value" pairs on the
+ * command line (-size1/2, -nblocks1/2, -cpu1/2, -gpu, -gpu1/2). */
+void parse_args_ctx(int argc, char **argv)
+{
+	init();
+	int i;
+	for (i = 1; i < argc; i++) {
+		if (strcmp(argv[i], "-size1") == 0) {
+			char *argptr;
+			size1 = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-nblocks1") == 0) {
+			char *argptr;
+			nblocks1 = strtol(argv[++i], &argptr, 10);
+		}
+		
+		if (strcmp(argv[i], "-size2") == 0) {
+			char *argptr;
+			size2 = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-nblocks2") == 0) {
+			char *argptr;
+			nblocks2 = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-cpu1") == 0) {
+			char *argptr;
+			cpu1 = strtol(argv[++i], &argptr, 10);
+		}    
+
+		if (strcmp(argv[i], "-cpu2") == 0) {
+			char *argptr;
+			cpu2 = strtol(argv[++i], &argptr, 10);
+		}    
+
+		if (strcmp(argv[i], "-gpu") == 0) {
+			char *argptr;
+			gpu = strtol(argv[++i], &argptr, 10);
+		}    
+
+		if (strcmp(argv[i], "-gpu1") == 0) {
+			char *argptr;
+			gpu1 = strtol(argv[++i], &argptr, 10);
+		}    
+
+		if (strcmp(argv[i], "-gpu2") == 0) {
+			char *argptr;
+			gpu2 = strtol(argv[++i], &argptr, 10);
+		}    
+	}
+}
+

+ 32 - 0
sched_ctx_hypervisor/examples/sched_ctx_utils/sched_ctx_utils.h

@@ -0,0 +1,32 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012  INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <limits.h>
+#include <string.h>
+#include <math.h>
+#include <sys/time.h>
+#include <stdlib.h>
+
+/* Benchmark phase identifiers. */
+#define START_BENCH 0
+#define END_BENCH 1
+
+/* Parse benchmark options and initialise the context descriptors. */
+void parse_args_ctx(int argc, char **argv);
+/* Record a benchmark result (implementation in sched_ctx_utils.c). */
+void update_sched_ctx_timing_results(double gflops, double timing);
+/* Build the scheduling contexts that will run the given benchmark. */
+void construct_contexts(void (*bench)(float *mat, unsigned size, unsigned nblocks));
+/* Free per-context resources and shut the hypervisor down. */
+void end_contexts(void);
+/* Run both benchmarks, only the first one, or only the second one. */
+void start_2benchs(void (*bench)(float *mat, unsigned size, unsigned nblocks));
+void start_1stbench(void (*bench)(float *mat, unsigned size, unsigned nblocks));
+void start_2ndbench(void (*bench)(float *mat, unsigned size, unsigned nblocks));

+ 167 - 0
sched_ctx_hypervisor/include/sched_ctx_hypervisor.h

@@ -0,0 +1,167 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012  INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef STARPU_SCHED_CTX_HYPERVISOR_H
+#define STARPU_SCHED_CTX_HYPERVISOR_H
+
+#include <starpu.h>
+#include <pthread.h>
+
+/* ioctl properties: tags identifying the variadic arguments passed to
+ * sched_ctx_hypervisor_ioctl() (see its uses in the examples). */
+#define HYPERVISOR_MAX_IDLE -1
+#define HYPERVISOR_MIN_WORKING -2
+#define HYPERVISOR_PRIORITY -3
+#define HYPERVISOR_MIN_WORKERS -4
+#define HYPERVISOR_MAX_WORKERS -5
+#define HYPERVISOR_GRANULARITY -6
+#define HYPERVISOR_FIXED_WORKERS -7
+#define HYPERVISOR_MIN_TASKS -8
+#define HYPERVISOR_NEW_WORKERS_MAX_IDLE -9
+#define HYPERVISOR_TIME_TO_APPLY -10
+#define HYPERVISOR_EMPTY_CTX_MAX_IDLE -11
+#define HYPERVISOR_NULL -12
+
+/* NOTE(review): a non-extern object declaration in a header emits a
+ * (tentative) definition in every translation unit including it; this
+ * links only thanks to common symbols.  Consider declaring it `extern`
+ * here and defining it once in a .c file. */
+pthread_mutex_t act_hypervisor_mutex;
+
+/* Default idle/working-time limits, in the same unit as the values fed
+ * through HYPERVISOR_MAX_IDLE (presumably microseconds -- verify). */
+#define MAX_IDLE_TIME 5000000000
+#define MIN_WORKING_TIME 500
+
+/* Per-context configuration steering the resizing policies.  All
+ * per-worker arrays are indexed by StarPU worker id. */
+struct policy_config {
+	/* underneath this limit we cannot resize */
+	int min_nworkers;
+
+	/* above this limit we cannot resize */
+	int max_nworkers;
+	
+	/*resize granularity */
+	int granularity;
+
+	/* priority for a worker to stay in this context */
+	/* the smaller the priority the faster it will be moved */
+	/* to another context */
+	int priority[STARPU_NMAXWORKERS];
+
+	/* above this limit the priority of the worker is reduced */
+	double max_idle[STARPU_NMAXWORKERS];
+
+	/* underneath this limit the priority of the worker is reduced */
+	double min_working[STARPU_NMAXWORKERS];
+
+	/* workers that will not move */
+	int fixed_workers[STARPU_NMAXWORKERS];
+
+	/* max idle for the workers that will be added during the resizing process*/
+	double new_workers_max_idle;
+
+	/* above this idle limit we allow removing all workers from the
+	   context (leaving it empty) */
+	double empty_ctx_max_idle[STARPU_NMAXWORKERS];
+};
+
+
+/* Bookkeeping of an in-flight worker move: the listed workers migrate
+ * towards receiver_sched_ctx and acknowledge individually (the exact
+ * protocol lives in the hypervisor core, not shown here). */
+struct resize_ack{
+	/* destination context of the move */
+	int receiver_sched_ctx;
+	/* ids of the nmoved_workers workers being moved */
+	int *moved_workers;
+	int nmoved_workers;
+	/* per-moved-worker acknowledgement flags */
+	int *acked_workers;
+};
+
+/* Run-time state the hypervisor keeps for each registered context.
+ * All per-worker arrays are indexed by StarPU worker id. */
+struct sched_ctx_wrapper {
+	/* id of the monitored context */
+	unsigned sched_ctx;
+	/* resizing thresholds of this context */
+	struct policy_config *config;
+	/* idle time accumulated by each worker (compared against
+	   config->max_idle by the idle policy) */
+	double current_idle_time[STARPU_NMAXWORKERS];
+	/* presumably marks workers scheduled for removal -- not used in
+	   the code visible here */
+	int worker_to_be_removed[STARPU_NMAXWORKERS];
+	/* per-worker counters of pushed/poped tasks */
+	int pushed_tasks[STARPU_NMAXWORKERS];
+	int poped_tasks[STARPU_NMAXWORKERS];
+	/* total flops the context has to execute */
+	double total_flops;
+	/* per-worker flops executed overall / since the last reset */
+	double total_elapsed_flops[STARPU_NMAXWORKERS];
+	double elapsed_flops[STARPU_NMAXWORKERS];
+	/* flops submitted so far / still to be executed */
+	double submitted_flops;
+	double remaining_flops;
+	/* reference timestamp for the observed flop rate (see
+	   _get_exp_end() in gflops_rate_policy.c) */
+	double start_time;
+	/* in-flight worker-move bookkeeping */
+	struct resize_ack resize_ack;
+	pthread_mutex_t mutex;
+};
+
+/* Forward declaration of an internal data structure
+ * FIXME: Remove when no longer exposed.  */
+struct resize_request_entry;
+
+/* A resizing policy: a set of optional callbacks the hypervisor invokes
+ * on scheduling events.  Hooks a policy does not implement are NULL
+ * (see the concrete policies under hypervisor_policies/). */
+struct hypervisor_policy {
+	/* printable policy name, used for selection */
+	const char* name;
+	/* presumably non-zero for user-supplied policies (all built-in
+	   policies shown set it to 0) -- verify */
+	unsigned custom;
+	/* initial sizing of a set of contexts over a set of workers */
+	void (*size_ctxs)(int *sched_ctxs, int nsched_ctxs , int *workers, int nworkers);
+	void (*handle_idle_cycle)(unsigned sched_ctx, int worker);
+	void (*handle_pushed_task)(unsigned sched_ctx, int worker);
+	void (*handle_poped_task)(unsigned sched_ctx, int worker);
+	void (*handle_idle_end)(unsigned sched_ctx, int worker);
+
+	void (*handle_post_exec_hook)(unsigned sched_ctx, int task_tag);
+
+	/* NOTE(review): implementations (e.g. lp2_policy.c) declare the
+	   second parameter as uint32_t; harmless where unsigned is 32-bit
+	   but the types should be unified. */
+	void (*handle_submitted_job)(struct starpu_task *task, unsigned footprint);
+};
+
+
+/* Start the hypervisor with the given resizing policy; returns the
+ * performance counters StarPU must call back into. */
+struct starpu_performance_counters* sched_ctx_hypervisor_init(struct hypervisor_policy* policy);
+
+void sched_ctx_hypervisor_shutdown(void);
+
+/* Make a context known to the hypervisor / remove it again. */
+void sched_ctx_hypervisor_register_ctx(unsigned sched_ctx, double total_flops);
+
+void sched_ctx_hypervisor_unregister_ctx(unsigned sched_ctx);
+
+void sched_ctx_hypervisor_resize(unsigned sched_ctx, int task_tag);
+
+/* fixed parameter-name typo: was "receier_sched_ctx" */
+void sched_ctx_hypervisor_move_workers(unsigned sender_sched_ctx, unsigned receiver_sched_ctx, int *workers_to_move, unsigned nworkers_to_move, unsigned now);
+
+void sched_ctx_hypervisor_stop_resize(unsigned sched_ctx);
+
+void sched_ctx_hypervisor_start_resize(unsigned sched_ctx);
+
+/* Variadic configuration: pairs/triples tagged with the HYPERVISOR_*
+ * constants above, terminated by NULL. */
+void sched_ctx_hypervisor_ioctl(unsigned sched_ctx, ...);
+
+void sched_ctx_hypervisor_set_config(unsigned sched_ctx, void *config);
+
+struct policy_config* sched_ctx_hypervisor_get_config(unsigned sched_ctx);
+
+int* sched_ctx_hypervisor_get_sched_ctxs();
+
+int sched_ctx_hypervisor_get_nsched_ctxs();
+
+int get_nworkers_ctx(unsigned sched_ctx, enum starpu_archtype arch);
+
+struct sched_ctx_wrapper* sched_ctx_hypervisor_get_wrapper(unsigned sched_ctx);
+
+double sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(struct sched_ctx_wrapper* sc_w);
+
+double sched_ctx_hypervisor_get_total_elapsed_flops_per_sched_ctx(struct sched_ctx_wrapper* sc_w);
+
+const char* sched_ctx_hypervisor_get_policy();
+
+void sched_ctx_hypervisor_add_workers_to_sched_ctx(int* workers_to_add, unsigned nworkers_to_add, unsigned sched_ctx);
+
+void sched_ctx_hypervisor_remove_workers_from_sched_ctx(int* workers_to_remove, unsigned nworkers_to_remove, unsigned sched_ctx, unsigned now);
+
+void sched_ctx_hypervisor_size_ctxs(int *sched_ctxs, int nsched_ctxs, int *workers, int nworkers);
+
+/* Deferred-sizing request management (see lp2_policy.c:size_if_required). */
+unsigned sched_ctx_hypervisor_get_size_req(int **sched_ctxs, int* nsched_ctxs, int **workers, int *nworkers);
+
+void sched_ctx_hypervisor_save_size_req(int *sched_ctxs, int nsched_ctxs, int *workers, int nworkers);
+
+void sched_ctx_hypervisor_free_size_req(void);
+
+unsigned sched_ctx_hypervisor_can_resize(unsigned sched_ctx);
+
+#endif

+ 39 - 0
sched_ctx_hypervisor/src/Makefile.am

@@ -0,0 +1,39 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2011, 2012  INRIA
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+# Build flags: pick up the CUDA/OpenCL flags detected by configure.
+AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
+# NOTE(review): overriding LIBS globally links libstarpu into everything
+# built from this directory; libstarpu is also added through _LIBADD
+# below -- confirm both are intended.
+LIBS = $(top_builddir)/src/libstarpu-@STARPU_EFFECTIVE_VERSION@.la
+
+AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_builddir)/include/starpu/$(STARPU_EFFECTIVE_VERSION)/ -I$(top_builddir)/src/ -I$(top_srcdir)/src/ -I$(top_srcdir)/sched_ctx_hypervisor/include/
+
+AM_LDFLAGS = $(STARPU_CUDA_LDFLAGS) $(STARPU_OPENCL_LDFLAGS)
+
+# The hypervisor is built and installed as a libtool library.
+lib_LTLIBRARIES = libsched_ctx_hypervisor.la
+
+libsched_ctx_hypervisor_la_LIBADD = $(top_builddir)/src/libstarpu-@STARPU_EFFECTIVE_VERSION@.la
+
+# Core sources plus one file per resizing policy.
+libsched_ctx_hypervisor_la_SOURCES = 			\
+	sched_ctx_hypervisor.c				\
+	sched_ctx_config.c				\
+	hypervisor_policies/policy_tools.c		\
+	hypervisor_policies/lp_tools.c			\
+	hypervisor_policies/idle_policy.c		\
+	hypervisor_policies/app_driven_policy.c		\
+	hypervisor_policies/gflops_rate_policy.c	\
+	hypervisor_policies/lp_policy.c			\
+	hypervisor_policies/lp2_policy.c		
+
+noinst_HEADERS = sched_ctx_hypervisor_intern.h		\
+	hypervisor_policies/policy_tools.h		\
+	hypervisor_policies/lp_tools.h

+ 36 - 0
sched_ctx_hypervisor/src/hypervisor_policies/app_driven_policy.c

@@ -0,0 +1,36 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012  INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "policy_tools.h"
+
+#include <sched_ctx_hypervisor_intern.h>
+
+/* post_exec_hook callback: unconditionally resize sched_ctx towards a
+ * receiver chosen by the common policy tools (second argument 1 =
+ * forced).  task_tag is not used here. */
+static void app_driven_handle_post_exec_hook(unsigned sched_ctx, int task_tag)
+{
+	_resize_to_unknown_receiver(sched_ctx, 1);
+}
+
+/* Application-driven policy: only the post-exec hook is implemented, so
+ * resizing happens solely on explicit application requests. */
+struct hypervisor_policy app_driven_policy = {
+	.size_ctxs = NULL,
+	.handle_poped_task = NULL,
+	.handle_pushed_task = NULL,
+	.handle_idle_cycle = NULL,
+	.handle_idle_end = NULL,
+	.handle_post_exec_hook = app_driven_handle_post_exec_hook,
+	.handle_submitted_job = NULL,
+	.custom = 0,
+	.name = "app_driven"
+};

+ 307 - 0
sched_ctx_hypervisor/src/hypervisor_policies/gflops_rate_policy.c

@@ -0,0 +1,307 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012  INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "policy_tools.h"
+
+/* Sum the total_elapsed_flops counters of every worker slot of the
+ * wrapper associated with sched_ctx. */
+static double _get_total_elapsed_flops_per_sched_ctx(unsigned sched_ctx)
+{
+	struct sched_ctx_wrapper *wrapper = sched_ctx_hypervisor_get_wrapper(sched_ctx);
+	double sum = 0.0;
+	int worker = 0;
+	while (worker < STARPU_NMAXWORKERS)
+	{
+		sum += wrapper->total_elapsed_flops[worker];
+		worker++;
+	}
+	return sum;
+}
+
+/* Estimate the absolute time at which sched_ctx will have executed all
+ * of its remaining flops, extrapolating the flop rate observed since
+ * start_time.  Returns -1.0 while less than one flop has elapsed
+ * (no estimate possible yet). */
+double _get_exp_end(unsigned sched_ctx)
+{
+	struct sched_ctx_wrapper *sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctx);
+	double elapsed_flops = sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(sc_w);
+
+	if( elapsed_flops >= 1.0)
+	{
+		double curr_time = starpu_timing_now();
+		double elapsed_time = curr_time - sc_w->start_time;
+		/* remaining time = elapsed_time * remaining_flops / elapsed_flops */
+		double exp_end = (elapsed_time * sc_w->remaining_flops /  elapsed_flops) + curr_time;
+		return exp_end;
+	}
+	return -1.0;
+}
+
+/* computes the instructions left to be executed out of the total instructions to execute */
+/* computes the instructions left to be executed out of the total instructions to execute */
+/* Returns the fraction (0.0 .. 1.0) of total_flops not yet executed;
+ * 0.0 once the context has executed at least everything it was assigned.
+ * (The equality test below is subsumed by the > test; kept as written.) */
+double _get_flops_left_pct(unsigned sched_ctx)
+{
+	struct sched_ctx_wrapper *wrapper = sched_ctx_hypervisor_get_wrapper(sched_ctx);
+	double total_elapsed_flops = _get_total_elapsed_flops_per_sched_ctx(sched_ctx);
+	if(wrapper->total_flops == total_elapsed_flops || total_elapsed_flops > wrapper->total_flops)
+		return 0.0;
+       
+	return (wrapper->total_flops - total_elapsed_flops)/wrapper->total_flops;
+}
+
+/* select the workers needed to be moved in order to force the sender and the receiver context to finish simultaneously */
+static int* _get_workers_to_move(unsigned sender_sched_ctx, unsigned receiver_sched_ctx, int *nworkers)
+{
+	struct sched_ctx_wrapper* sender_sc_w = sched_ctx_hypervisor_get_wrapper(sender_sched_ctx);
+	struct sched_ctx_wrapper* receiver_sc_w = sched_ctx_hypervisor_get_wrapper(receiver_sched_ctx);
+        int *workers = NULL;
+        double v_receiver = _get_ctx_velocity(receiver_sc_w);
+        double receiver_remainig_flops = receiver_sc_w->remaining_flops;
+        double sender_exp_end = _get_exp_end(sender_sched_ctx);
+        double sender_v_cpu = _get_velocity_per_worker_type(sender_sc_w, STARPU_CPU_WORKER);
+        double v_for_rctx = (receiver_remainig_flops/(sender_exp_end - starpu_timing_now())) - v_receiver;
+
+        int nworkers_needed = v_for_rctx/sender_v_cpu;
+/*      printf("%d->%d: v_rec %lf v %lf v_cpu %lf w_needed %d \n", sender_sched_ctx, receiver_sched_ctx, */
+/*             v_receiver, v_for_rctx, sender_v_cpu, nworkers_needed); */
+        if(nworkers_needed > 0)
+        {
+                struct policy_config *sender_config = sched_ctx_hypervisor_get_config(sender_sched_ctx);
+                unsigned potential_moving_cpus = _get_potential_nworkers(sender_config, sender_sched_ctx, STARPU_CPU_WORKER);
+                unsigned potential_moving_gpus = _get_potential_nworkers(sender_config, sender_sched_ctx, STARPU_CUDA_WORKER);
+                unsigned sender_nworkers = starpu_get_nworkers_of_sched_ctx(sender_sched_ctx);
+                struct policy_config *config = sched_ctx_hypervisor_get_config(receiver_sched_ctx);
+                unsigned nworkers_ctx = starpu_get_nworkers_of_sched_ctx(receiver_sched_ctx);
+
+                if(nworkers_needed < (potential_moving_cpus + 5 * potential_moving_gpus))
+                {
+                        if((sender_nworkers - nworkers_needed) >= sender_config->min_nworkers)
+                        {
+                                if((nworkers_ctx + nworkers_needed) > config->max_nworkers)
+                                        nworkers_needed = nworkers_ctx > config->max_nworkers ? 0 : (config->max_nworkers - nworkers_ctx);
+
+                                if(nworkers_needed > 0)
+                                {
+                                        int ngpus = nworkers_needed / 5;
+                                        int *gpus;
+                                        gpus = _get_first_workers(sender_sched_ctx, &ngpus, STARPU_CUDA_WORKER);
+                                        int ncpus = nworkers_needed - ngpus;
+                                        int *cpus;
+                                        cpus = _get_first_workers(sender_sched_ctx, &ncpus, STARPU_CPU_WORKER);
+                                        workers = (int*)malloc(nworkers_needed*sizeof(int));
+                                        int i;
+					printf("%d: gpus: ", nworkers_needed);
+                                        for(i = 0; i < ngpus; i++)
+					{
+                                                workers[(*nworkers)++] = gpus[i];
+						printf("%d ", gpus[i]);
+					}
+					printf(" cpus:");
+                                        for(i = 0; i < ncpus; i++)
+					{
+                                                workers[(*nworkers)++] = cpus[i];
+						printf("%d ", cpus[i]);
+					}
+					printf("\n");
+                                        free(gpus);
+                                        free(cpus);
+                                }
+                        }
+                }
+		else
+                {
+			/*if the needed number of workers is to big we only move the number of workers 
+			  corresponding to the granularity set by the user */
+                        int nworkers_to_move = _get_nworkers_to_move(sender_sched_ctx);
+			
+                        if(sender_nworkers - nworkers_to_move >= sender_config->min_nworkers)
+                        {
+                                unsigned nshared_workers = starpu_get_nshared_workers(sender_sched_ctx, receiver_sched_ctx);
+                                if((nworkers_ctx + nworkers_to_move - nshared_workers) > config->max_nworkers)
+                                        nworkers_to_move = nworkers_ctx > config->max_nworkers ? 0 : (config->max_nworkers - nworkers_ctx + nshared_workers);
+
+                                if(nworkers_to_move > 0)
+                                {
+                                        workers = _get_first_workers(sender_sched_ctx, &nworkers_to_move, STARPU_ANY_WORKER);
+                                        *nworkers = nworkers_to_move;
+                                }
+                        }
+                }
+        }
+        return workers;
+}
+
+/* Move workers from sender to receiver so both contexts finish at the
+ * same time.  With force_resize set, block on the hypervisor mutex;
+ * otherwise trylock and give up if it is busy.  Returns 1 when the lock
+ * was obtained (whether or not workers actually moved), 0 otherwise. */
+static unsigned _gflops_rate_resize(unsigned sender_sched_ctx, unsigned receiver_sched_ctx, unsigned force_resize)
+{
+        int ret = 1;
+        if(force_resize)
+                pthread_mutex_lock(&act_hypervisor_mutex);
+        else
+                ret = pthread_mutex_trylock(&act_hypervisor_mutex);
+        if(ret != EBUSY)
+        {
+                int nworkers_to_move = 0;
+                int *workers_to_move =  _get_workers_to_move(sender_sched_ctx, receiver_sched_ctx, &nworkers_to_move);
+		if(nworkers_to_move > 0)
+                {
+                        sched_ctx_hypervisor_move_workers(sender_sched_ctx, receiver_sched_ctx, workers_to_move, nworkers_to_move, 0);
+
+                        /* moved workers inherit the receiver's "new worker"
+                           idle limit unless an explicit one was configured */
+                        struct policy_config *new_config = sched_ctx_hypervisor_get_config(receiver_sched_ctx);
+                        int i;
+                        for(i = 0; i < nworkers_to_move; i++)
+                                new_config->max_idle[workers_to_move[i]] = new_config->max_idle[workers_to_move[i]] !=MAX_IDLE_TIME ? new_config->max_idle[workers_to_move[i]] :  new_config->new_workers_max_idle;
+
+                        free(workers_to_move);
+                }
+                pthread_mutex_unlock(&act_hypervisor_mutex);
+                return 1;
+        }
+        return 0;
+
+}
+
+/* Return the context expected to finish first (smallest expected end
+ * time), or -1 when no context has a valid estimate yet. */
+static int _find_fastest_sched_ctx()
+{
+	int *ctxs = sched_ctx_hypervisor_get_sched_ctxs();
+	int nctxs = sched_ctx_hypervisor_get_nsched_ctxs();
+
+	double best_end = _get_exp_end(ctxs[0]);
+	int best_ctx = (best_end == -1.0) ? -1 : ctxs[0];
+	int i;
+
+	for(i = 1; i < nctxs; i++)
+	{
+		double end = _get_exp_end(ctxs[i]);
+		if(end == -1.0)
+			continue; /* no estimate yet: cannot be the fastest */
+		if(best_end == -1.0 || end < best_end)
+		{
+			best_end = end;
+			best_ctx = ctxs[i];
+		}
+	}
+
+	return best_ctx;
+}
+
+/* Return the context with the largest expected end time.  A context
+ * without any estimate yet (-1.0) is returned immediately. */
+static int _find_slowest_sched_ctx()
+{
+	int *sched_ctxs = sched_ctx_hypervisor_get_sched_ctxs();
+	int nsched_ctxs = sched_ctx_hypervisor_get_nsched_ctxs();
+
+	int slowest_sched_ctx = -1;
+	double curr_exp_end = 0.0;
+	double last_exp_end = -1.0;
+	int i;
+	for(i = 0; i < nsched_ctxs; i++)
+	{
+		curr_exp_end = _get_exp_end(sched_ctxs[i]);
+		/*if it hasn't started because of no resources give it priority */
+		if(curr_exp_end == -1.0)
+			return sched_ctxs[i];
+		if( curr_exp_end > last_exp_end)
+		{
+			slowest_sched_ctx = sched_ctxs[i];
+			last_exp_end = curr_exp_end;
+		}
+	}
+
+	return slowest_sched_ctx;
+
+}
+
+/* Like _find_slowest_sched_ctx() but never returns sched_ctx itself:
+ * used to pick a beneficiary when sched_ctx gives its workers away. */
+static int _find_slowest_available_sched_ctx(unsigned sched_ctx)
+{
+	int *sched_ctxs = sched_ctx_hypervisor_get_sched_ctxs();
+	int nsched_ctxs = sched_ctx_hypervisor_get_nsched_ctxs();
+
+	int slowest_sched_ctx = -1;
+	double curr_exp_end = 0.0;
+	double last_exp_end = -1.0;
+	int i;
+	for(i = 0; i < nsched_ctxs; i++)
+	{
+		if(sched_ctxs[i] != sched_ctx)
+		{
+			curr_exp_end = _get_exp_end(sched_ctxs[i]);
+			/*if it hasn't started because of no resources give it priority */
+			if(curr_exp_end == -1.0)
+				return sched_ctxs[i];
+			if(last_exp_end < curr_exp_end)
+			{
+				slowest_sched_ctx = sched_ctxs[i];
+				last_exp_end = curr_exp_end;
+			}
+		}
+	}
+
+	return slowest_sched_ctx;
+
+}
+
+/* Rebalancing step run on every poped task: (1) a context that executed
+ * all of its flops gives its workers to the slowest still-running
+ * context; (2) when the fastest context is expected to finish 50%+
+ * earlier than the slowest one, move workers from fast to slow.
+ * (The original code also called _get_exp_end(sched_ctx) here and
+ * discarded the result; that side-effect-free call was removed.) */
+static void gflops_rate_resize(unsigned sched_ctx)
+{
+	double flops_left_pct = _get_flops_left_pct(sched_ctx);
+
+	/* if the context finished all the instructions it had to execute 
+	 we move all the resources to the slowest context */
+	if(flops_left_pct == 0.0f)
+	{
+		int slowest_sched_ctx = _find_slowest_available_sched_ctx(sched_ctx);
+		if(slowest_sched_ctx != -1)
+		{
+			double slowest_flops_left_pct = _get_flops_left_pct(slowest_sched_ctx);
+			if(slowest_flops_left_pct != 0.0f)
+			{
+				/* shrink the finished context to zero workers */
+				struct policy_config* config = sched_ctx_hypervisor_get_config(sched_ctx);
+				config->min_nworkers = 0;
+				config->max_nworkers = 0;
+				printf("ctx %d finished & gives away the res to %d; slow_left %lf\n", sched_ctx, slowest_sched_ctx, slowest_flops_left_pct);
+				_resize(sched_ctx, slowest_sched_ctx, 1, 1);
+				sched_ctx_hypervisor_stop_resize(slowest_sched_ctx);
+			}
+		}
+	}
+
+	int fastest_sched_ctx = _find_fastest_sched_ctx();
+	int slowest_sched_ctx = _find_slowest_sched_ctx();
+
+	if(fastest_sched_ctx != -1 && slowest_sched_ctx != -1 && fastest_sched_ctx != slowest_sched_ctx)
+	{
+		double fastest_exp_end = _get_exp_end(fastest_sched_ctx);
+		double slowest_exp_end = _get_exp_end(slowest_sched_ctx);
+
+		if((slowest_exp_end == -1.0 && fastest_exp_end != -1.0) || ((fastest_exp_end + (fastest_exp_end*0.5)) < slowest_exp_end ))
+		{
+			double fast_flops_left_pct = _get_flops_left_pct(fastest_sched_ctx);
+			if(fast_flops_left_pct < 0.8)
+			{
+				/* only act once the slow context has executed
+				   enough (>10%) for its velocity to be meaningful */
+				struct sched_ctx_wrapper *sc_w = sched_ctx_hypervisor_get_wrapper(slowest_sched_ctx);
+				double elapsed_flops = sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(sc_w);
+				if((elapsed_flops/sc_w->total_flops) > 0.1)
+					_gflops_rate_resize(fastest_sched_ctx, slowest_sched_ctx, 0);
+			}
+		}
+	}
+}
+
+/* poped-task hook: re-evaluate the worker distribution every time a
+ * worker picks up a task. */
+void gflops_rate_handle_poped_task(unsigned sched_ctx, int worker)
+{
+	gflops_rate_resize(sched_ctx);
+}
+
+/* Policy driven by the observed gflops rate of each context; only the
+ * poped-task event is used. */
+struct hypervisor_policy gflops_rate_policy = {
+	.size_ctxs = NULL,
+	.handle_poped_task = gflops_rate_handle_poped_task,
+	.handle_pushed_task = NULL,
+	.handle_idle_cycle = NULL,
+	.handle_idle_end = NULL,
+	.handle_post_exec_hook = NULL,
+	.handle_submitted_job = NULL,
+	.custom = 0,
+	.name = "gflops_rate"
+};

+ 54 - 0
sched_ctx_hypervisor/src/hypervisor_policies/idle_policy.c

@@ -0,0 +1,54 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012  INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "policy_tools.h"
+
+/* Return 1 when `worker` is also registered in at least one hypervised
+ * context other than sched_ctx, 0 otherwise. */
+unsigned worker_belong_to_other_sched_ctx(unsigned sched_ctx, int worker)
+{
+	int *ctxs = sched_ctx_hypervisor_get_sched_ctxs();
+	int nctxs = sched_ctx_hypervisor_get_nsched_ctxs();
+	int i;
+
+	for(i = 0; i < nctxs; i++)
+	{
+		if(ctxs[i] == (int)sched_ctx)
+			continue; /* skip the context we are asking about */
+		if(starpu_worker_belongs_to_sched_ctx(worker, ctxs[i]))
+			return 1;
+	}
+	return 0;
+}
+
+/* idle-cycle hook: once a worker stayed idle in sched_ctx longer than
+ * its configured max_idle threshold, either drop it from this context
+ * (when it already serves another one) or trigger a non-forced resize
+ * towards a receiver chosen by the policy tools. */
+void idle_handle_idle_cycle(unsigned sched_ctx, int worker)
+{
+	struct sched_ctx_wrapper* sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctx);
+	struct policy_config *config = sc_w->config;
+	if(config != NULL &&  sc_w->current_idle_time[worker] > config->max_idle[worker])
+	{
+		if(worker_belong_to_other_sched_ctx(sched_ctx, worker))
+			sched_ctx_hypervisor_remove_workers_from_sched_ctx(&worker, 1, sched_ctx, 1);
+		else
+			_resize_to_unknown_receiver(sched_ctx, 0);
+	}
+}
+
+/* Policy reacting only to worker idleness. */
+struct hypervisor_policy idle_policy = {
+	.size_ctxs = NULL,
+	.handle_poped_task = NULL,
+	.handle_pushed_task = NULL,
+	.handle_idle_cycle = idle_handle_idle_cycle,
+	.handle_idle_end = NULL,
+	.handle_post_exec_hook = NULL,
+	.handle_submitted_job = NULL,
+	.custom = 0,
+	.name = "idle"
+};

+ 595 - 0
sched_ctx_hypervisor/src/hypervisor_policies/lp2_policy.c

@@ -0,0 +1,595 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012  INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "lp_tools.h"
+#include <math.h>
+
+/* One entry per distinct (codelet, footprint, context) kind of task
+ * seen so far (see lp2_handle_submitted_job()). */
+static struct bound_task_pool *task_pools = NULL;
+
+/* Serialises updates of task_pools. */
+static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
+static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double tmax, double w_in_s[ns][nw], int *in_sched_ctxs, int *workers);
+static double _find_tmax(double t1, double t2);
+/* Binary-search (dichotomy) the smallest makespan tmax for which the LP
+ * system solved by _glp_resolve() is feasible.  On success, tasks[][]
+ * and w_in_s[][] hold the distribution of task kinds over workers and
+ * of workers over contexts.  Returns 1 when at least one feasible
+ * solution was found, 0 otherwise. */
+static unsigned _compute_task_distribution_over_ctxs(int ns, int nw, int nt, double w_in_s[ns][nw], double tasks[nw][nt], int *sched_ctxs, int *workers)
+{	
+	/* scratch copies: a candidate solution is committed to the output
+	   arrays only when _glp_resolve() reports it feasible */
+	double draft_tasks[nw][nt];
+	double draft_w_in_s[ns][nw];
+	
+	int w,t, s;
+	for(w = 0; w < nw; w++)
+		for(t = 0; t < nt; t++)
+		{
+			tasks[w][t] = 0.0;
+			draft_tasks[w][t] = 0.0;
+		}
+	
+	for(s = 0; s < ns; s++)
+		for(w = 0; w < nw; w++)
+		{
+			w_in_s[s][w] = 0.0;
+			draft_w_in_s[s][w] = 0.0;
+		}
+
+	/* smallest possible tmax, difficult to obtain as we 
+	   compute the nr of flops and not the tasks */
+	double smallest_tmax = _lp_get_tmax(nw, workers);
+	double tmax = smallest_tmax * ns;
+	
+	double res = 1.0;
+	unsigned has_sol = 0;
+	double tmin = 0.0;
+	double old_tmax = 0.0;
+	unsigned found_sol = 0;
+
+	struct timeval start_time;
+	struct timeval end_time;
+	int nd = 0; /* number of dichotomy iterations (for the timing print) */
+	gettimeofday(&start_time, NULL);
+
+	/* we fix tmax and we do not treat it as an unknown
+	   we just vary by dichotomy its values*/
+	while(tmax > 1.0)
+	{
+		/* find solution and save the values in draft tables
+		   only if there is a solution for the system we save them
+		   in the proper table */
+		res = _glp_resolve(ns, nw, nt, draft_tasks, tmax, draft_w_in_s, sched_ctxs, workers);
+		if(res != 0.0)
+		{
+			for(w = 0; w < nw; w++)
+				for(t = 0; t < nt; t++)
+					tasks[w][t] = draft_tasks[w][t];
+			for(s = 0; s < ns; s++)
+				for(w = 0; w < nw; w++)
+					w_in_s[s][w] = draft_w_in_s[s][w];
+			has_sol = 1;
+			found_sol = 1;
+		}
+		else
+			has_sol = 0;
+		
+		/* if we have a solution with this tmax try a smaller value
+		   bigger than the old min */
+		if(has_sol)
+		{
+			/* stop once the bracket is narrower than 0.5 */
+			if(old_tmax != 0.0 && (old_tmax - tmax) < 0.5)
+				break;
+			old_tmax = tmax;
+		}
+		else /*else try a bigger one but smaller than the old tmax */
+		{
+			tmin = tmax;
+			if(old_tmax != 0.0)
+				tmax = old_tmax;
+		}
+		if(tmin == tmax) break;
+		tmax = _find_tmax(tmin, tmax);
+		
+		if(tmax < smallest_tmax)
+		{
+			tmax = old_tmax;
+			tmin = smallest_tmax;
+			tmax = _find_tmax(tmin, tmax);
+		}
+		nd++;
+	}
+	gettimeofday(&end_time, NULL);
+
+	long diff_s = end_time.tv_sec  - start_time.tv_sec;
+	long diff_us = end_time.tv_usec  - start_time.tv_usec;
+	
+	float timing = (float)(diff_s*1000000 + diff_us)/1000;
+
+//        fprintf(stdout, "nd = %d total time: %f ms \n", nd, timing);
+
+	return found_sol;
+}
+
+/* Apply the w_in_s[][] assignment computed by the LP: for each context,
+ * add the workers assigned to it (threshold 0.5 for CPUs, 0.3 for other
+ * workers) and, except on the first sizing, remove the others -- but
+ * only when every worker to be removed has a destination context. */
+static void _redistribute_resources_in_ctxs(int ns, int nw, int nt, double w_in_s[ns][nw], unsigned first_time, int *in_sched_ctxs, int *workers)
+{
+	int *sched_ctxs = in_sched_ctxs == NULL ? sched_ctx_hypervisor_get_sched_ctxs() : in_sched_ctxs;
+	int s, s2, w;
+
+	for(s = 0; s < ns; s++)
+	{
+		int workers_to_add[nw], workers_to_remove[nw];
+		/* destination_ctx[w][s2]: 1 when removed worker w is assigned
+		   to context s2, 0 when not, -1 when not applicable */
+		int destination_ctx[nw][ns];
+
+		for(w = 0; w < nw; w++)
+		{
+			workers_to_add[w] = -1;
+			workers_to_remove[w] = -1;
+			for(s2 = 0; s2 < ns; s2++)
+				destination_ctx[w][s2] = -1;
+		}
+
+		int nadd = 0, nremove = 0;
+
+		for(w = 0; w < nw; w++)
+		{
+			enum starpu_perf_archtype arch = workers == NULL ? starpu_worker_get_type(w) :
+				starpu_worker_get_type(workers[w]);
+
+			if(arch == STARPU_CPU_WORKER)
+			{
+				if(w_in_s[s][w] >= 0.5)
+				{
+					workers_to_add[nadd++] = workers == NULL ? w : workers[w];
+				}
+				else
+				{
+					workers_to_remove[nremove++] = workers == NULL ? w : workers[w];
+					for(s2 = 0; s2 < ns; s2++)
+						if(s2 != s && w_in_s[s2][w] >= 0.5)
+							destination_ctx[w][s2] = 1;
+						else
+							destination_ctx[w][s2] = 0;	
+				}
+			}
+			else
+			{
+				/* non-CPU workers use a lower acceptance threshold */
+				if(w_in_s[s][w] >= 0.3)
+				{
+					workers_to_add[nadd++] = workers == NULL ? w : workers[w];
+				}
+				else
+				{
+					workers_to_remove[nremove++] = workers == NULL ? w : workers[w];
+					for(s2 = 0; s2 < ns; s2++)
+						if(s2 != s && w_in_s[s2][w] >= 0.3)
+							destination_ctx[w][s2] = 1;
+						else
+							destination_ctx[w][s2] = 0;
+				}
+			}
+	
+		}
+
+		sched_ctx_hypervisor_add_workers_to_sched_ctx(workers_to_add, nadd, sched_ctxs[s]);
+		struct policy_config *new_config = sched_ctx_hypervisor_get_config(sched_ctxs[s]);
+		int i;
+		for(i = 0; i < nadd; i++)
+			new_config->max_idle[workers_to_add[i]] = new_config->max_idle[workers_to_add[i]] != MAX_IDLE_TIME ? new_config->max_idle[workers_to_add[i]] :  new_config->new_workers_max_idle;
+		
+		if(!first_time)
+		{
+			/* do not remove workers if they can't go anywhere */
+			int w2;
+			/* NOTE(review): declared unsigned yet assigned -1 below;
+			   the later test only compares against 0, so it works,
+			   but a signed type would be clearer. */
+			unsigned found_one_dest[nremove];
+			unsigned all_have_dest = 1;
+			for(w2 = 0; w2 < nremove; w2++)
+				found_one_dest[w2] = 0;
+
+			for(w2 = 0; w2 < nremove; w2++)
+				for(s2 = 0; s2 < ns; s2++)
+				{
+					/* if the worker has to be removed we should find a destination
+					   otherwise we are not interested */
+					if(destination_ctx[w2][s2] == -1)
+						found_one_dest[w2] = -1;
+					if(destination_ctx[w2][s2] == 1)// && sched_ctx_hypervisor_can_resize(sched_ctxs[s2]))
+					{
+						found_one_dest[w2] = 1;
+						break;
+					}
+				}
+			for(w2 = 0; w2 < nremove; w2++)
+			{
+				if(found_one_dest[w2] == 0)
+				{
+					all_have_dest = 0;
+					break;
+				}
+			}
+			if(all_have_dest)
+				sched_ctx_hypervisor_remove_workers_from_sched_ctx(workers_to_remove, nremove, sched_ctxs[s], 0);
+		}
+	}
+
+}
+
+/* Compute and apply an initial worker distribution over the given
+ * contexts (or all registered contexts when sched_ctxs is NULL). */
+static void _size_ctxs(int *sched_ctxs, int nsched_ctxs , int *workers, int nworkers)
+{
+	int ns = sched_ctxs == NULL ? sched_ctx_hypervisor_get_nsched_ctxs() : nsched_ctxs;
+	int nw = workers == NULL ? starpu_worker_get_count() : nworkers; /* Number of different workers */
+	int nt = 0; /* Number of different kinds of tasks */
+	struct bound_task_pool * tp;
+	for (tp = task_pools; tp; tp = tp->next)
+		nt++;
+	
+	double w_in_s[ns][nw];
+	/* _compute_task_distribution_over_ctxs() writes into its `tasks`
+	 * argument unconditionally, so hand it a real scratch array instead
+	 * of NULL (which was dereferenced as soon as nt > 0).  The second
+	 * dimension is forced >= 1 to avoid a zero-sized VLA. */
+	double tasks[nw][nt > 0 ? nt : 1];
+	
+	unsigned found_sol = _compute_task_distribution_over_ctxs(ns, nw, nt, w_in_s, tasks, sched_ctxs, workers);
+	/* if we did find at least one solution redistribute the resources */
+	if(found_sol)
+		_redistribute_resources_in_ctxs(ns, nw, nt, w_in_s, 1, sched_ctxs, workers);
+}
+
+/* If a sizing request was posted (via sched_ctx_hypervisor_save_size_req),
+ * apply it -- but only once every involved context has submitted all of
+ * the flops it announced, so the task pool is complete. */
+static void size_if_required()
+{
+	int nsched_ctxs, nworkers;
+	int *sched_ctxs, *workers;
+	unsigned has_req = sched_ctx_hypervisor_get_size_req(&sched_ctxs, &nsched_ctxs, &workers, &nworkers);	
+
+	if(has_req)
+	{
+		struct sched_ctx_wrapper* sc_w = NULL;
+		unsigned ready_to_size = 1;
+		int s;
+		pthread_mutex_lock(&act_hypervisor_mutex);
+		for(s = 0; s < nsched_ctxs; s++)
+		{
+			sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctxs[s]);
+			if(sc_w->submitted_flops < sc_w->total_flops)
+				ready_to_size = 0;
+		}
+
+		if(ready_to_size)
+			_size_ctxs(sched_ctxs, nsched_ctxs, workers, nworkers);
+		pthread_mutex_unlock(&act_hypervisor_mutex);
+	}
+}
+
+static void lp2_handle_submitted_job(struct starpu_task *task, uint32_t footprint)
+{
+	/* count the tasks of the same type */
+	pthread_mutex_lock(&mutex);
+	struct bound_task_pool *tp = NULL;
+
+	for (tp = task_pools; tp; tp = tp->next)
+	{
+		if (tp->cl == task->cl && tp->footprint == footprint && tp->sched_ctx_id == task->sched_ctx)
+			break;
+	}
+
+	if (!tp)
+	{
+		tp = (struct bound_task_pool *) malloc(sizeof(struct bound_task_pool));
+		tp->cl = task->cl;
+		tp->footprint = footprint;
+		tp->sched_ctx_id = task->sched_ctx;
+		tp->n = 0;
+		tp->next = task_pools;
+		task_pools = tp;
+	}
+
+	/* One more task of this kind */
+	tp->n++;
+	pthread_mutex_unlock(&mutex);
+
+	size_if_required();
+}
+
+static void _starpu_get_tasks_times(int nw, int nt, double times[nw][nt], int *workers)
+{
+        struct bound_task_pool *tp;
+        int w, t;
+        for (w = 0; w < nw; w++)
+        {
+                for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
+                {
+                        enum starpu_perf_archtype arch = workers == NULL ? starpu_worker_get_perf_archtype(w) :
+				starpu_worker_get_perf_archtype(workers[w]);
+                        double length = starpu_history_based_job_expected_perf(tp->cl->model, arch, tp->footprint);
+
+                        if (isnan(length))
+                                times[w][t] = NAN;
+                       else
+                                times[w][t] = length / 1000.;	
+                }
+        }
+}
+
+/*                                                                                                                                                                                                                  
+ * GNU Linear Programming Kit backend                                                                                                                                                                               
+ */
+#ifdef HAVE_GLPK_H
+#include <glpk.h>
+static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double tmax, double w_in_s[ns][nw], int *in_sched_ctxs, int *workers)
+{
+	struct bound_task_pool * tp;
+	int t, w, s;
+	glp_prob *lp;
+
+	lp = glp_create_prob();
+	glp_set_prob_name(lp, "StarPU theoretical bound");
+	glp_set_obj_dir(lp, GLP_MAX);
+	glp_set_obj_name(lp, "total execution time");
+
+	{
+		double times[nw][nt];
+		int ne = nt * nw /* worker execution time */
+			+ nw * ns
+			+ nw * (nt + ns)
+			+ 1; /* glp dumbness */
+		int n = 1;
+		int ia[ne], ja[ne];
+		double ar[ne];
+
+		_starpu_get_tasks_times(nw, nt, times, workers);
+
+		/* Variables: number of tasks i assigned to worker j, and tmax */
+		glp_add_cols(lp, nw*nt+ns*nw);
+#define colnum(w, t) ((t)*nw+(w)+1)
+		for(s = 0; s < ns; s++)
+			for(w = 0; w < nw; w++)
+				glp_set_obj_coef(lp, nw*nt+s*nw+w+1, 1.);
+
+		for (w = 0; w < nw; w++)
+			for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
+			{
+				char name[32];
+				snprintf(name, sizeof(name), "w%dt%dn", w, t);
+				glp_set_col_name(lp, colnum(w, t), name);
+				glp_set_col_bnds(lp, colnum(w, t), GLP_LO, 0., 0.);
+			}
+		for(s = 0; s < ns; s++)
+			for(w = 0; w < nw; w++)
+			{
+				char name[32];
+				snprintf(name, sizeof(name), "w%ds%dn", w, s);
+				glp_set_col_name(lp, nw*nt+s*nw+w+1, name);	
+				glp_set_col_bnds(lp, nw*nt+s*nw+w+1, GLP_DB, 0.0, 1.0);
+			}
+
+		int *sched_ctxs = in_sched_ctxs == NULL ? sched_ctx_hypervisor_get_sched_ctxs() : in_sched_ctxs;
+
+		int curr_row_idx = 0;
+		/* Total worker execution time */
+		glp_add_rows(lp, nw*ns);
+		for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
+		{
+			int someone = 0;
+			for (w = 0; w < nw; w++)
+				if (!isnan(times[w][t]))
+					someone = 1;
+			if (!someone)
+			{
+				/* This task does not have any performance model at all, abort */
+				glp_delete_prob(lp);
+				return 0.0;
+			}
+		}
+		/*sum(t[t][w]*n[t][w]) < x[s][w]*tmax */
+		for(s = 0; s < ns; s++)
+		{
+			for (w = 0; w < nw; w++)
+			{
+				char name[32], title[64];
+				starpu_worker_get_name(w, name, sizeof(name));
+				snprintf(title, sizeof(title), "worker %s", name);
+				glp_set_row_name(lp, curr_row_idx+s*nw+w+1, title);
+				for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
+				{
+					if(tp->sched_ctx_id == sched_ctxs[s])
+					{
+						ia[n] = curr_row_idx+s*nw+w+1;
+						ja[n] = colnum(w, t);
+						if (isnan(times[w][t]))
+							ar[n] = 1000000000.;
+						else
+							ar[n] = times[w][t];
+						n++;
+					}
+				}
+				/* x[s][w] = 1 | 0 */
+				ia[n] = curr_row_idx+s*nw+w+1;
+				ja[n] = nw*nt+s*nw+w+1;
+				ar[n] = (-1) * tmax;
+				n++;
+				glp_set_row_bnds(lp, curr_row_idx+s*nw+w+1, GLP_UP, 0.0, 0.0);
+			}
+		}
+
+		curr_row_idx += nw*ns;
+
+		/* Total task completion */
+		glp_add_rows(lp, nt);
+		for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
+		{
+			char name[32], title[64];
+			starpu_worker_get_name(w, name, sizeof(name));
+			snprintf(title, sizeof(title), "task %s key %x", tp->cl->name, (unsigned) tp->footprint);
+			glp_set_row_name(lp, curr_row_idx+t+1, title);
+			for (w = 0; w < nw; w++)
+			{
+				ia[n] = curr_row_idx+t+1;
+				ja[n] = colnum(w, t);
+				ar[n] = 1;
+				n++;
+			}
+			glp_set_row_bnds(lp, curr_row_idx+t+1, GLP_FX, tp->n, tp->n);
+		}
+
+		curr_row_idx += nt;
+
+		/* sum(x[s][i]) = 1 */
+		glp_add_rows(lp, nw);
+		for (w = 0; w < nw; w++)
+		{
+			char name[32], title[64];
+			starpu_worker_get_name(w, name, sizeof(name));
+			snprintf(title, sizeof(title), "w%x", w);
+			glp_set_row_name(lp, curr_row_idx+w+1, title);
+			for(s = 0; s < ns; s++)
+			{
+				ia[n] = curr_row_idx+w+1;
+				ja[n] = nw*nt+s*nw+w+1;
+				ar[n] = 1;
+				n++;
+			}
+
+			glp_set_row_bnds(lp, curr_row_idx+w+1, GLP_FX, 1.0, 1.0);
+		}
+		if(n != ne)
+			printf("ns= %d nw = %d nt = %d n = %d ne = %d\n", ns, nw, nt, n, ne);
+		STARPU_ASSERT(n == ne);
+
+		glp_load_matrix(lp, ne-1, ia, ja, ar);
+	}
+
+	glp_smcp parm;
+	glp_init_smcp(&parm);
+	parm.msg_lev = GLP_MSG_OFF;
+	int ret = glp_simplex(lp, &parm);
+	if (ret)
+	{
+		glp_delete_prob(lp);
+		lp = NULL;
+		return 0.0;
+	}
+
+	int stat = glp_get_prim_stat(lp);
+	/* if we don't have a solution return */
+	if(stat == GLP_NOFEAS)
+	{
+		glp_delete_prob(lp);
+		lp = NULL;
+		return 0.0;
+	}
+
+	double res = glp_get_obj_val(lp);
+	for (w = 0; w < nw; w++)
+		for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
+			tasks[w][t] = glp_get_col_prim(lp, colnum(w, t));
+
+	for(s = 0; s < ns; s++)
+		for(w = 0; w < nw; w++)
+			w_in_s[s][w] = glp_get_col_prim(lp, nw*nt+s*nw+w+1);
+
+	glp_delete_prob(lp);
+	return res;
+}
+
+
/* Midpoint of the interval [t1, t2]; used to bisect on the makespan tmax. */
static double _find_tmax(double t1, double t2)
{
	double half_gap = (t2 - t1) / 2;
	return t1 + half_gap;
}
+
+
+static void lp2_handle_poped_task(unsigned sched_ctx, int worker)
+{
+	struct sched_ctx_wrapper* sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctx);
+	
+	int ret = pthread_mutex_trylock(&act_hypervisor_mutex);
+	if(ret != EBUSY)
+	{
+		if(sc_w->submitted_flops < sc_w->total_flops)
+		{
+			pthread_mutex_unlock(&act_hypervisor_mutex);
+			return;
+		}
+
+		if(_velocity_gap_btw_ctxs())
+		{
+			int ns = sched_ctx_hypervisor_get_nsched_ctxs();
+			int nw = starpu_worker_get_count(); /* Number of different workers */
+			int nt = 0; /* Number of different kinds of tasks */
+			struct bound_task_pool * tp;
+			for (tp = task_pools; tp; tp = tp->next)
+				nt++;
+			
+			double w_in_s[ns][nw];
+			double tasks_per_worker[nw][nt];
+
+			unsigned found_sol = _compute_task_distribution_over_ctxs(ns, nw, nt, w_in_s, tasks_per_worker, NULL, NULL);
+			/* if we did find at least one solution redistribute the resources */
+			if(found_sol)
+			{
+				int w, s;
+				double nworkers[ns][2];
+				int nworkers_rounded[ns][2];
+				for(s = 0; s < ns; s++)
+				{
+					nworkers[s][0] = 0.0;
+					nworkers[s][1] = 0.0;
+					nworkers_rounded[s][0] = 0;
+					nworkers_rounded[s][1] = 0;
+
+				}
+
+				for(s = 0; s < ns; s++)
+				{
+					for(w = 0; w < nw; w++)
+					{
+						enum starpu_perf_archtype arch = starpu_worker_get_type(w);
+						
+						if(arch == STARPU_CUDA_WORKER)
+						{
+							nworkers[s][0] += w_in_s[s][w];
+							if(w_in_s[s][w] >= 0.3)
+								nworkers_rounded[s][0]++;
+						}
+						else
+						{
+							nworkers[s][1] += w_in_s[s][w];
+							if(w_in_s[s][w] > 0.3)
+								nworkers_rounded[s][1]++;
+						}
+					}
+				}
+/* 				for(s = 0; s < ns; s++) */
+/* 					printf("%d: cpus = %lf gpus = %lf cpus_round = %d gpus_round = %d\n", s, nworkers[s][1], nworkers[s][0], */
+/* 					       nworkers_rounded[s][1], nworkers_rounded[s][0]); */
+
+				_lp_redistribute_resources_in_ctxs(ns, 2, nworkers_rounded, nworkers);
+
+			}
+		}
+		pthread_mutex_unlock(&act_hypervisor_mutex);
+	}		
+}
+
+
/* Do not size immediately: record the request, which is honoured later by
   size_if_required() once every context has submitted all of its flops. */
static void lp2_size_ctxs(int *sched_ctxs, int nsched_ctxs , int *workers, int nworkers)
{
	sched_ctx_hypervisor_save_size_req(sched_ctxs, nsched_ctxs, workers, nworkers);
}
+
/* "lp2" resizing policy, built on the GLPK linear program above: tasks are
   pooled per (codelet, footprint, context) at submission time, contexts are
   sized once the whole workload is in, and workers are redistributed when
   tasks are poped.  Only available when GLPK is present (HAVE_GLPK_H). */
struct hypervisor_policy lp2_policy = {
	.size_ctxs = lp2_size_ctxs,                       /* records a deferred sizing request */
	.handle_poped_task = lp2_handle_poped_task,       /* re-solves the LP and moves workers */
	.handle_pushed_task = NULL,
	.handle_idle_cycle = NULL,
	.handle_idle_end = NULL,
	.handle_post_exec_hook = NULL,
	.handle_submitted_job = lp2_handle_submitted_job, /* pools tasks per kind */
	.custom = 0,
	.name = "lp2"
};
+	
+#endif /* HAVE_GLPK_H */
+

+ 101 - 0
sched_ctx_hypervisor/src/hypervisor_policies/lp_policy.c

@@ -0,0 +1,101 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012  INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "lp_tools.h"
+
+
+static void lp_handle_poped_task(unsigned sched_ctx, int worker)
+{
+	if(_velocity_gap_btw_ctxs())
+	{
+		int nsched_ctxs = sched_ctx_hypervisor_get_nsched_ctxs();
+		
+		double nworkers[nsched_ctxs][2];
+
+		int ret = pthread_mutex_trylock(&act_hypervisor_mutex);
+		if(ret != EBUSY)
+		{ 
+			int total_nw[2];
+			_get_total_nw(NULL, -1, 2, total_nw);
+
+
+			struct timeval start_time;
+			struct timeval end_time;
+			gettimeofday(&start_time, NULL);
+
+			double vmax = _lp_get_nworkers_per_ctx(nsched_ctxs, 2, nworkers, total_nw);
+			gettimeofday(&end_time, NULL);
+
+			long diff_s = end_time.tv_sec  - start_time.tv_sec;
+			long diff_us = end_time.tv_usec  - start_time.tv_usec;
+
+			float timing = (float)(diff_s*1000000 + diff_us)/1000;
+
+			if(vmax != 0.0)
+			{
+				int nworkers_rounded[nsched_ctxs][2];
+				_lp_round_double_to_int(nsched_ctxs, 2, nworkers, nworkers_rounded);				
+				_lp_redistribute_resources_in_ctxs(nsched_ctxs, 2, nworkers_rounded, nworkers);
+			}
+			pthread_mutex_unlock(&act_hypervisor_mutex);
+		}
+	}		
+}
+static void lp_size_ctxs(int *sched_ctxs, int ns, int *workers, int nworkers)
+{	
+	int nsched_ctxs = sched_ctxs == NULL ? sched_ctx_hypervisor_get_nsched_ctxs() : ns;
+	double nworkers_per_type[nsched_ctxs][2];
+	int total_nw[2];
+	_get_total_nw(workers, nworkers, 2, total_nw);
+
+	pthread_mutex_lock(&act_hypervisor_mutex);
+	double vmax = _lp_get_nworkers_per_ctx(nsched_ctxs, 2, nworkers_per_type, total_nw);
+	if(vmax != 0.0)
+	{
+		printf("********size\n");
+/* 		for( i = 0; i < nsched_ctxs; i++) */
+/* 		{ */
+/* 			printf("ctx %d/worker type %d: n = %lf \n", i, 0, res[i][0]); */
+/* 			printf("ctx %d/worker type %d: n = %lf \n", i, 1, res[i][1]); */
+/* 		} */
+		int nworkers_per_type_rounded[nsched_ctxs][2];
+		_lp_round_double_to_int(nsched_ctxs, 2, nworkers_per_type, nworkers_per_type_rounded);
+/*       		for( i = 0; i < nsched_ctxs; i++) */
+/* 		{ */
+/* 			printf("ctx %d/worker type %d: n = %d \n", i, 0, res_rounded[i][0]); */
+/* 			printf("ctx %d/worker type %d: n = %d \n", i, 1, res_rounded[i][1]); */
+/* 		} */
+		
+		_lp_distribute_resources_in_ctxs(sched_ctxs, nsched_ctxs, 2, nworkers_per_type_rounded, nworkers_per_type, workers, nworkers);
+	}
+	pthread_mutex_unlock(&act_hypervisor_mutex);
+}
+
+#ifdef HAVE_GLPK_H
/* "lp" resizing policy: purely reactive — on each task pop, if the velocity
   gap between contexts is significant, the LP is re-solved and workers are
   moved.  Only available when GLPK is present (HAVE_GLPK_H). */
struct hypervisor_policy lp_policy = {
	.size_ctxs = lp_size_ctxs,               /* immediate LP-based initial sizing */
	.handle_poped_task = lp_handle_poped_task, /* LP-based redistribution on pop */
	.handle_pushed_task = NULL,
	.handle_idle_cycle = NULL,
	.handle_idle_end = NULL,
	.handle_post_exec_hook = NULL,
	.handle_submitted_job = NULL,
	.custom = 0,
	.name = "lp"
};
+	
+#endif /* HAVE_GLPK_H */
+

+ 420 - 0
sched_ctx_hypervisor/src/hypervisor_policies/lp_tools.c

@@ -0,0 +1,420 @@
+#include <math.h>
+#include "lp_tools.h"
+
+#ifdef HAVE_GLPK_H
+
/* Solve with GLPK for the (fractional) number of workers of each type that
   every context should own, maximising vmax = 1/tmax, i.e. minimising the
   overall makespan.
     v[s][w]     - velocity of worker type w in context s
     flops[s]    - remaining flops of context s (Gflops)
     res[s][w]   - output: workers of type w granted to context s
     total_nw[w] - total number of workers of type w in the system
   Returns the objective value vmax. */
static double _glp_get_nworkers_per_ctx(int ns, int nw, double v[ns][nw], double flops[ns], double res[ns][nw], int  total_nw[nw])
{
	int s, w;
	glp_prob *lp;
	
	/* number of constraint-matrix entries loaded below */
	int ne =
		(ns*nw+1)*(ns+nw)
		+ 1; /* glp dumbness */
	int n = 1;
	int ia[ne], ja[ne];
	double ar[ne];

	lp = glp_create_prob();

	glp_set_prob_name(lp, "sample");
	glp_set_obj_dir(lp, GLP_MAX);
        glp_set_obj_name(lp, "max speed");

	/* we add nw*ns columns one for each type of worker in each context 
	   and another column corresponding to the 1/tmax bound (bc 1/tmax is a variable too)*/
	glp_add_cols(lp, nw*ns+1);

	for(s = 0; s < ns; s++)
	{
		for(w = 0; w < nw; w++)
		{
			char name[32];
			snprintf(name, sizeof(name), "worker%dctx%d", w, s);
			glp_set_col_name(lp, n, name);
			/* each context keeps at least 0.3 of each worker type */
			glp_set_col_bnds(lp, n, GLP_LO, 0.3, 0.0);
			n++;
		}
	}

	/*1/tmax should belong to the interval [0.0;1.0]*/
	glp_set_col_name(lp, n, "vmax");
	glp_set_col_bnds(lp, n, GLP_DB, 0.0, 1.0);
	/* Z = 1/tmax -> 1/tmax structural variable, nCPUs & nGPUs in ctx are auxiliar variables */
	glp_set_obj_coef(lp, n, 1.0);

	n = 1;
	/* one row corresponds to one ctx: sum_w v[s][w]*n[s][w] >= flops[s]/tmax */
	glp_add_rows(lp, ns);

	for(s = 0; s < ns; s++)
	{
		char name[32];
		snprintf(name, sizeof(name), "ctx%d", s);
		glp_set_row_name(lp, s+1, name);
		glp_set_row_bnds(lp, s+1, GLP_LO, 0., 0.);

		for(w = 0; w < nw; w++)
		{
			int s2;
			/* the full dense row is loaded: velocity coefficients for this
			   context's own columns, zeros for the other contexts' columns */
			for(s2 = 0; s2 < ns; s2++)
			{
				if(s2 == s)
				{
					ia[n] = s+1;
					ja[n] = w + nw*s2 + 1;
					ar[n] = v[s][w];
				}
				else
				{
					ia[n] = s+1;
					ja[n] = w + nw*s2 + 1;
					ar[n] = 0.0;
				}
				n++;
			}
		}
		/* 1/tmax */
		ia[n] = s+1;
		ja[n] = ns*nw+1;
		ar[n] = (-1) * flops[s];
		n++;
	}
	
	/* one more row per worker type: the fractions granted to all contexts
	   must add up to the total number of workers of that type */
	glp_add_rows(lp, nw);

	for(w = 0; w < nw; w++)
	{
		char name[32];
		snprintf(name, sizeof(name), "w%d", w);
		glp_set_row_name(lp, ns+w+1, name);
		for(s = 0; s < ns; s++)
		{
			int w2;
			/* dense row again: 1.0 on this worker type's columns, 0.0 elsewhere */
			for(w2 = 0; w2 < nw; w2++)
			{
				if(w2 == w)
				{
					ia[n] = ns+w+1;
					ja[n] = w2+s*nw + 1;
					ar[n] = 1.0;
				}
				else
				{
					ia[n] = ns+w+1;
					ja[n] = w2+s*nw + 1;
					ar[n] = 0.0;
				}
				n++;
			}
		}
		/* 1/tmax does not appear in the per-type totals */
		ia[n] = ns+w+1;
		ja[n] = ns*nw+1;
		ar[n] = 0.0;
		n++;

		/* type 0 (GPUs): fractions must sum to total_nw[0] */
		if(w == 0)
			glp_set_row_bnds(lp, ns+w+1, GLP_FX, total_nw[0], total_nw[0]);

		/* type 1 (CPUs): fractions must sum to total_nw[1] */
		if(w == 1) 
			glp_set_row_bnds(lp, ns+w+1, GLP_FX, total_nw[1], total_nw[1]);
	}

	STARPU_ASSERT(n == ne);

	glp_load_matrix(lp, ne-1, ia, ja, ar);

	glp_smcp parm;
	glp_init_smcp(&parm);
	parm.msg_lev = GLP_MSG_OFF;
	/* NOTE(review): the return value of glp_simplex() is not checked here,
	   unlike in _glp_resolve(); a failed solve would make glp_get_obj_val()
	   and glp_get_col_prim() meaningless — confirm */
	glp_simplex(lp, &parm);
	
	double vmax = glp_get_obj_val(lp);

	n = 1;
	for(s = 0; s < ns; s++)
	{
		for(w = 0; w < nw; w++)
		{
			res[s][w] = glp_get_col_prim(lp, n);
			n++;
		}
	}

	glp_delete_prob(lp);
	return vmax;
}
+
+#endif //HAVE_GLPK_H
+
/*
 * Compute in res[s][w] how many workers of each type every context needs so
 * that the system reaches the smallest makespan.  Returns tmax (= 1/vmax),
 * or 0.0 when GLPK is unavailable or the LP produced no solution.
 *
 * Fix: the original returned 1/vmax unconditionally; when the solver
 * yields vmax == 0.0 this divided by zero (infinity), which callers
 * testing for 0.0 ("no solution") would misinterpret.
 */
double _lp_get_nworkers_per_ctx(int nsched_ctxs, int ntypes_of_workers, double res[nsched_ctxs][ntypes_of_workers], int total_nw[ntypes_of_workers])
{
	int *sched_ctxs = sched_ctx_hypervisor_get_sched_ctxs();
#ifdef HAVE_GLPK_H
	double v[nsched_ctxs][ntypes_of_workers];
	double flops[nsched_ctxs];
#endif
	int i = 0;
	struct sched_ctx_wrapper* sc_w;
	for(i = 0; i < nsched_ctxs; i++)
	{
		sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctxs[i]);
#ifdef HAVE_GLPK_H
		/* FIXME: hard-coded velocities kept from the original code; the
		   measured per-type velocities are the commented-out calls below */
		v[i][0] = 200.0;//_get_velocity_per_worker_type(sc_w, STARPU_CUDA_WORKER);
		v[i][1] = 20.0;//_get_velocity_per_worker_type(sc_w, STARPU_CPU_WORKER);
		flops[i] = sc_w->remaining_flops/1000000000; /* in gflops */
#endif
	}

#ifdef HAVE_GLPK_H
	double vmax = _glp_get_nworkers_per_ctx(nsched_ctxs, ntypes_of_workers, v, flops, res, total_nw);
	/* guard against division by zero when the LP had no solution */
	return vmax == 0.0 ? 0.0 : 1/vmax;
#else
	return 0.0;
#endif
}
+
/*
 * Estimated minimal makespan of the system (scaled by 1000, presumably a
 * unit conversion — TODO confirm against callers) for the given candidate
 * worker set (NULL workers means all nw workers).
 *
 * Fix: pass ntypes_of_workers to _get_total_nw() instead of repeating the
 * literal 2, keeping the two in sync if the number of types ever changes.
 */
double _lp_get_tmax(int nw, int *workers)
{
	int ntypes_of_workers = 2; /* CUDA + CPU */
	int total_nw[ntypes_of_workers];
	_get_total_nw(workers, nw, ntypes_of_workers, total_nw);

	int nsched_ctxs = sched_ctx_hypervisor_get_nsched_ctxs();

	/* the per-context allocation is computed but only tmax is returned */
	double res[nsched_ctxs][ntypes_of_workers];
	return _lp_get_nworkers_per_ctx(nsched_ctxs, ntypes_of_workers, res, total_nw) * 1000;
}
+
+void _lp_round_double_to_int(int ns, int nw, double res[ns][nw], int res_rounded[ns][nw])
+{
+	int s, w;
+	double left_res[nw];
+	for(w = 0; w < nw; w++)
+		left_res[nw] = 0.0;
+	for(s = 0; s < ns; s++)
+	{
+		for(w = 0; w < nw; w++)
+		{
+			int x = floor(res[s][w]);
+			double x_double = (double)x;
+			double diff = res[s][w] - x_double;
+			
+			if(diff != 0.0)
+			{
+				if(diff > 0.5)
+				{
+					if(left_res[w] != 0.0)
+					{
+						if((diff + left_res[w]) > 0.5)
+						{
+							res_rounded[s][w] = x + 1;
+							left_res[w] = (-1.0) * (x_double + 1.0 - (res[s][w] + left_res[w]));
+						}
+						else
+						{
+							res_rounded[s][w] = x;
+							left_res[w] = (-1.0) * (diff + left_res[w]);
+						}
+					}
+					else
+					{
+						res_rounded[s][w] = x + 1;
+						left_res[w] = (-1.0) * (x_double + 1.0 - res[s][w]);
+					}
+
+				}
+				else
+				{
+					if((diff + left_res[w]) > 0.5)
+					{
+						res_rounded[s][w] = x + 1;
+						left_res[w] = (-1.0) * (x_double + 1.0 - (res[s][w] + left_res[w]));
+					}
+					else
+					{
+						res_rounded[s][w] = x;
+						left_res[w] = diff;
+					}
+				}
+			}
+		}
+	}		
+}
+
/* Move workers between the existing contexts so that each context s ends up
   with (approximately) res[s][w] workers of type w (res_rounded holds the
   integer-rounded targets).  Type index 0 is CUDA, 1 is CPU: CPUs are
   adjusted by rounded counts, GPUs by the fractional values.
   NOTE(review): 'arch' is only assigned for w == 0 and w == 1; callers in
   this file always pass nw == 2 — confirm no caller ever uses more types. */
void _lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw], double res[ns][nw])
{
	int *sched_ctxs = sched_ctx_hypervisor_get_sched_ctxs();
	int s, s2, w;
	for(s = 0; s < ns; s++)
	{
		for(w = 0; w < nw; w++)
		{
			enum starpu_archtype arch;
			if(w == 0) arch = STARPU_CUDA_WORKER;
			if(w == 1) arch = STARPU_CPU_WORKER;

			/* workers this context should give away entirely ... */
			int workers_move[STARPU_NMAXWORKERS];
			int nw_move = 0;

			/* ... and workers it may additionally share with another ctx */
			int workers_add[STARPU_NMAXWORKERS];
			int nw_add = 0;

			if(w == 1)
			{
				/* CPUs: compare the integer-rounded target */
				int nworkers_ctx = get_nworkers_ctx(sched_ctxs[s], arch);
				if(nworkers_ctx > res_rounded[s][w])
				{
					int nworkers_to_move = nworkers_ctx - res_rounded[s][w];
					int *workers_to_move = _get_first_workers(sched_ctxs[s], &nworkers_to_move, arch);
					int i;
					for(i = 0; i < nworkers_to_move; i++)
						workers_move[nw_move++] = workers_to_move[i];
					free(workers_to_move);
				}
			}
			else
			{
				/* GPUs: work on the fractional target */
				double nworkers_ctx = get_nworkers_ctx(sched_ctxs[s], arch) * 1.0;
				if(nworkers_ctx > res[s][w])
				{
					double nworkers_to_move = nworkers_ctx - res[s][w];
					int x = floor(nworkers_to_move);
					double x_double = (double)x;
					double diff = nworkers_to_move - x_double;
					if(diff == 0.0)
					{
						int *workers_to_move = _get_first_workers(sched_ctxs[s], &x, arch);
						if(x > 0)
						{
							int i;
							for(i = 0; i < x; i++)
								workers_move[nw_move++] = workers_to_move[i];
							
						}
						free(workers_to_move);
					}
					else
					{
						/* fractional surplus: the last worker is either fully
						   moved (diff > 0.8) or only shared (diff > 0.3) */
						x+=1;
						int *workers_to_move = _get_first_workers(sched_ctxs[s], &x, arch);
						if(x > 0)
						{
							int i;
							for(i = 0; i < x-1; i++)
								workers_move[nw_move++] = workers_to_move[i];
							
							if(diff > 0.8)
								workers_move[nw_move++] = workers_to_move[x-1];
							else
								if(diff > 0.3)
									workers_add[nw_add++] = workers_to_move[x-1];
							
						}
						free(workers_to_move);
					}
				}
			}
			
			/* find a context in deficit for this worker type and hand the
			   surplus over to it */
			for(s2 = 0; s2 < ns; s2++)
			{
				if(sched_ctxs[s2] != sched_ctxs[s])
				{
					double nworkers_ctx2 = get_nworkers_ctx(sched_ctxs[s2], arch) * 1.0;
					if((res[s2][w] - nworkers_ctx2) >= 0.0 && nw_move > 0)
					{
						sched_ctx_hypervisor_move_workers(sched_ctxs[s], sched_ctxs[s2], workers_move, nw_move, 0);
						nw_move = 0;
						break;
					}
					if((res[s2][w] - nworkers_ctx2) >= 0.0 &&  (res[s2][w] - nworkers_ctx2) <= (double)nw_add && nw_add > 0)
					{
						sched_ctx_hypervisor_add_workers_to_sched_ctx(workers_add, nw_add, sched_ctxs[s2]);
						nw_add = 0;
						break;
					}

				}
			}
			/* nobody wanted the surplus workers: just drop them from s */
			if(nw_move > 0)
				sched_ctx_hypervisor_remove_workers_from_sched_ctx(workers_move, nw_move, sched_ctxs[s], 0);
		}
	}
}
+
/* First distribution of the workers over the contexts: give each context s
   res_rounded[s][w] workers of type w (CPUs, w == 1) or res[s][w] workers
   (GPUs, w == 0, fractional — the last one is only added when the fraction
   is >= 0.3).  NULL sched_ctxs / workers means all of them.
   NOTE(review): _get_first_workers_in_list() is declared with an
   'unsigned *nworkers' parameter but is passed int* here — confirm the
   signed/unsigned mismatch is benign on the target platforms. */
void _lp_distribute_resources_in_ctxs(int* sched_ctxs, int ns, int nw, int res_rounded[ns][nw], double res[ns][nw], int *workers, int nworkers)
{
	int current_nworkers = workers == NULL ? starpu_worker_get_count() : nworkers;
	int *current_sched_ctxs = sched_ctxs == NULL ? sched_ctx_hypervisor_get_sched_ctxs() : sched_ctxs;

	int s, w;
	for(s = 0; s < ns; s++)
	{
		for(w = 0; w < nw; w++)
		{
			enum starpu_archtype arch;
			if(w == 0) arch = STARPU_CUDA_WORKER;
			if(w == 1) arch = STARPU_CPU_WORKER;

			if(w == 1)
			{
				/* CPUs: add the integer-rounded number of workers */
				int nworkers_to_add = res_rounded[s][w];
				int *workers_to_add = _get_first_workers_in_list(workers, current_nworkers, &nworkers_to_add, arch);

				if(nworkers_to_add > 0)
				{
					sched_ctx_hypervisor_add_workers_to_sched_ctx(workers_to_add, nworkers_to_add, current_sched_ctxs[s]);
					sched_ctx_hypervisor_start_resize(current_sched_ctxs[s]);
					/* newly arrived workers get the dedicated idle threshold */
					struct policy_config *new_config = sched_ctx_hypervisor_get_config(current_sched_ctxs[s]);
					int i;
					for(i = 0; i < nworkers_to_add; i++)
						new_config->max_idle[workers_to_add[i]] = new_config->max_idle[workers_to_add[i]] != MAX_IDLE_TIME ? new_config->max_idle[workers_to_add[i]] :  new_config->new_workers_max_idle;
				}
				free(workers_to_add);
			}
			else
			{
				/* GPUs: honour the fractional target */
				double nworkers_to_add = res[s][w];
				int x = floor(nworkers_to_add);
				double x_double = (double)x;
				double diff = nworkers_to_add - x_double;
				if(diff == 0.0)
				{
					int *workers_to_add = _get_first_workers_in_list(workers, current_nworkers, &x, arch);
					if(x > 0)
					{
						sched_ctx_hypervisor_add_workers_to_sched_ctx(workers_to_add, x, current_sched_ctxs[s]);
						sched_ctx_hypervisor_start_resize(current_sched_ctxs[s]);						
					}
					free(workers_to_add);
				}
				else
				{
					/* the fractional last worker is only granted when the
					   fraction is at least 0.3 */
					x+=1;
					int *workers_to_add = _get_first_workers_in_list(workers, current_nworkers, &x, arch);
					if(x > 0)
					{
						if(diff >= 0.3)
							sched_ctx_hypervisor_add_workers_to_sched_ctx(workers_to_add, x, current_sched_ctxs[s]);
						else
							sched_ctx_hypervisor_add_workers_to_sched_ctx(workers_to_add, x-1, current_sched_ctxs[s]);
						sched_ctx_hypervisor_start_resize(current_sched_ctxs[s]);
					}
					free(workers_to_add);			
				}
			}
			
		}
		sched_ctx_hypervisor_stop_resize(current_sched_ctxs[s]);
	}
}

+ 22 - 0
sched_ctx_hypervisor/src/hypervisor_policies/lp_tools.h

@@ -0,0 +1,22 @@
#include "policy_tools.h"
/*
 * GNU Linear Programming Kit backend
 */
#ifdef HAVE_GLPK_H
#include <glpk.h>
#endif //HAVE_GLPK_H

/* returns tmax, and computes in res the number of workers needed by each context such that the system ends up with the smallest tmax */
double _lp_get_nworkers_per_ctx(int nsched_ctxs, int ntypes_of_workers, double res[nsched_ctxs][ntypes_of_workers], int total_nw[ntypes_of_workers]);

/* returns the tmax of the system */
double _lp_get_tmax(int nw, int *workers);

/* the linear program determines a rational number of resources for each ctx; we round them depending on the type of resource */
void _lp_round_double_to_int(int ns, int nw, double res[ns][nw], int res_rounded[ns][nw]);

/* redistribute the resources between contexts by assigning the first x available resources to each one */
void _lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw], double res[ns][nw]);

/* make the first distribution of resources between contexts by assigning the first x available resources to each one */
void _lp_distribute_resources_in_ctxs(int* sched_ctxs, int ns, int nw, int res_rounded[ns][nw], double res[ns][nw], int *workers, int nworkers);

+ 401 - 0
sched_ctx_hypervisor/src/hypervisor_policies/policy_tools.c

@@ -0,0 +1,401 @@
+/* #include <sched_ctx_hypervisor.h> */
+/* #include <pthread.h> */
+
+#include "policy_tools.h"
+
+static int _compute_priority(unsigned sched_ctx)
+{
+	struct policy_config *config = sched_ctx_hypervisor_get_config(sched_ctx);
+
+	int total_priority = 0;
+
+	struct worker_collection *workers = starpu_get_worker_collection_of_sched_ctx(sched_ctx);
+	int worker;
+
+	if(workers->init_cursor)
+		workers->init_cursor(workers);
+
+	while(workers->has_next(workers))
+	{
+		worker = workers->get_next(workers);
+		total_priority += config->priority[worker];
+	}
+
+	if(workers->init_cursor)
+		workers->deinit_cursor(workers);
+	return total_priority;
+}
+
+/* find the context with the slowest priority */
+unsigned _find_poor_sched_ctx(unsigned req_sched_ctx, int nworkers_to_move)
+{
+	int i;
+	int highest_priority = -1;
+	int current_priority = 0;
+	unsigned sched_ctx = STARPU_NMAX_SCHED_CTXS;
+	int *sched_ctxs = sched_ctx_hypervisor_get_sched_ctxs();
+	int nsched_ctxs = sched_ctx_hypervisor_get_nsched_ctxs();
+
+
+	struct policy_config *config = NULL;
+
+	for(i = 0; i < nsched_ctxs; i++)
+	{
+		if(sched_ctxs[i] != STARPU_NMAX_SCHED_CTXS && sched_ctxs[i] != req_sched_ctx)
+		{
+			unsigned nworkers = starpu_get_nworkers_of_sched_ctx(sched_ctxs[i]);
+			config  = sched_ctx_hypervisor_get_config(sched_ctxs[i]);
+			if((nworkers + nworkers_to_move) <= config->max_nworkers)
+			{
+				current_priority = _compute_priority(sched_ctxs[i]);
+				if (highest_priority < current_priority)
+				{
+					highest_priority = current_priority;
+					sched_ctx = sched_ctxs[i];
+				}
+			}
+		}
+	}
+	
+	return sched_ctx;
+}
+
+int* _get_first_workers_in_list(int *workers, int nall_workers,  unsigned *nworkers, enum starpu_archtype arch)
+{
+	int *curr_workers = (int*)malloc((*nworkers)*sizeof(int));
+	
+	int w, worker;
+	int nfound_workers = 0;
+	for(w = 0; w < nall_workers; w++)
+	{
+		worker = workers == NULL ? w : workers[w];
+		enum starpu_archtype curr_arch = starpu_worker_get_type(worker);
+		if(arch == STARPU_ANY_WORKER || curr_arch == arch)
+		{
+			curr_workers[nfound_workers++] = worker;
+		}
+		if(nfound_workers == *nworkers)
+			break;
+	}
+	if(nfound_workers < *nworkers)
+		*nworkers = nfound_workers;
+	return curr_workers;
+}
+
+/* get first nworkers with the highest idle time in the context */
/* get first nworkers with the highest idle time in the context: selection
   prefers non-fixed workers, then lowest priority, then (at equal
   priority) the largest current idle time.  *nworkers is reduced to the
   number actually found; the caller frees the returned array.
   NOTE(review): the cursor is initialised only once but the inner
   has_next/get_next loop runs once per output slot — whether later slots
   see any workers depends on the collection's cursor semantics (does
   has_next() wrap around after exhaustion?) — confirm against the
   worker_collection implementation. */
int* _get_first_workers(unsigned sched_ctx, int *nworkers, enum starpu_archtype arch)
{
	struct sched_ctx_wrapper* sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctx);
	struct policy_config *config = sched_ctx_hypervisor_get_config(sched_ctx);

	int *curr_workers = (int*)malloc((*nworkers) * sizeof(int));
	int i;
	/* -1 marks "slot not filled yet" */
	for(i = 0; i < *nworkers; i++)
		curr_workers[i] = -1;

	struct worker_collection *workers = starpu_get_worker_collection_of_sched_ctx(sched_ctx);
	int index;
	int worker;
	int considered = 0;

	if(workers->init_cursor)
		workers->init_cursor(workers);

	for(index = 0; index < *nworkers; index++)
	{
		while(workers->has_next(workers))
		{
			considered = 0;
			worker = workers->get_next(workers);
			enum starpu_archtype curr_arch = starpu_worker_get_type(worker);
			if(arch == STARPU_ANY_WORKER || curr_arch == arch)
			{

				/* fixed workers may never leave their context */
				if(!config->fixed_workers[worker])
				{
					/* skip workers already selected for earlier slots */
					for(i = 0; i < index; i++)
					{
						if(curr_workers[i] == worker)
						{
							considered = 1;
							break;
						}
					}
					
					if(!considered)
					{
						/* the first iteration*/
						if(curr_workers[index] < 0)
						curr_workers[index] = worker;
						/* small priority worker is the first to leave the ctx*/
						else if(config->priority[worker] <
							config->priority[curr_workers[index]])
						curr_workers[index] = worker;
						/* if we don't consider priorities check for the workers
						   with the biggest idle time */
						else if(config->priority[worker] ==
							config->priority[curr_workers[index]])
						{
							double worker_idle_time = sc_w->current_idle_time[worker];
							double curr_worker_idle_time = sc_w->current_idle_time[curr_workers[index]];
							if(worker_idle_time > curr_worker_idle_time)
								curr_workers[index] = worker;
						}
					}
				}
			}
		}
			
		/* no candidate found for this slot: report how many we got */
		if(curr_workers[index] < 0)
		{
			*nworkers = index;
			break;
		}
	}

	if(workers->init_cursor)
		workers->deinit_cursor(workers);

	return curr_workers;
}
+
+/* get the number of workers in the context that are allowed to be moved (that are not fixed) */
+unsigned _get_potential_nworkers(struct policy_config *config, unsigned sched_ctx, enum starpu_archtype arch)
+{
+	struct worker_collection *workers = starpu_get_worker_collection_of_sched_ctx(sched_ctx);
+
+	unsigned potential_workers = 0;
+	int worker;
+
+	if(workers->init_cursor)
+		workers->init_cursor(workers);
+	while(workers->has_next(workers))
+	{
+		worker = workers->get_next(workers);
+		enum starpu_archtype curr_arch = starpu_worker_get_type(worker);
+                if(arch == STARPU_ANY_WORKER || curr_arch == arch)
+                {
+			if(!config->fixed_workers[worker])
+				potential_workers++;
+		}
+	}
+	if(workers->init_cursor)
+		workers->deinit_cursor(workers);
+	
+	return potential_workers;
+}
+
+/* compute the number of workers that should be moved depending:
+   - on the min/max number of workers in a context imposed by the user, 
+   - on the resource granularity imposed by the user for the resizing process*/
/* compute the number of workers that should be moved depending:
   - on the min/max number of workers in a context imposed by the user, 
   - on the resource granularity imposed by the user for the resizing process
   NOTE(review): the arithmetic below mixes int and unsigned config fields;
   e.g. (potential_moving_workers - config->granularity) can wrap around
   when granularity exceeds the potential count — confirm the config
   invariants rule this out. */
int _get_nworkers_to_move(unsigned req_sched_ctx)
{
       	struct policy_config *config = sched_ctx_hypervisor_get_config(req_sched_ctx);
	unsigned nworkers = starpu_get_nworkers_of_sched_ctx(req_sched_ctx);
	unsigned nworkers_to_move = 0;
	
	unsigned potential_moving_workers = _get_potential_nworkers(config, req_sched_ctx, STARPU_ANY_WORKER);
	if(potential_moving_workers > 0)
	{
		if(potential_moving_workers <= config->min_nworkers)
			/* if we have to give more than min better give it all */ 
			/* => empty ctx will block until having the required workers */
			nworkers_to_move = potential_moving_workers; 
		else if(potential_moving_workers > config->max_nworkers)
		{
			if((potential_moving_workers - config->granularity) > config->max_nworkers)
//				nworkers_to_move = config->granularity;
				nworkers_to_move = potential_moving_workers;
			else
				nworkers_to_move = potential_moving_workers - config->max_nworkers;
 
		}
		else if(potential_moving_workers > config->granularity)
		{
			/* move one granularity step, but never drop below min_nworkers */
			if((nworkers - config->granularity) > config->min_nworkers)	
				nworkers_to_move = config->granularity;
			else
				nworkers_to_move = potential_moving_workers - config->min_nworkers;
		}
		else
		{
			/* fewer movable workers than one granularity step: move as many
			   as min_nworkers allows, counting the fixed ones */
			int nfixed_workers = nworkers - potential_moving_workers;
			if(nfixed_workers >= config->min_nworkers)
				nworkers_to_move = potential_moving_workers;
			else
				nworkers_to_move = potential_moving_workers - (config->min_nworkers - nfixed_workers);	
		}

		/* never let the context keep more than its configured maximum */
		if((nworkers - nworkers_to_move) > config->max_nworkers)
			nworkers_to_move = nworkers - config->max_nworkers;
	}
	return nworkers_to_move;
}
+
+/* Try to move workers from sender_sched_ctx to receiver_sched_ctx.  When force_resize is
+ * set, block on the hypervisor mutex; otherwise give up silently if it is busy.  When
+ * receiver_sched_ctx == STARPU_NMAX_SCHED_CTXS a receiver is chosen automatically.
+ * Returns 1 when the resize was attempted (lock acquired), 0 otherwise. */
+unsigned _resize(unsigned sender_sched_ctx, unsigned receiver_sched_ctx, unsigned force_resize, unsigned now)
+{
+	int ret = 1;
+	if(force_resize)
+		pthread_mutex_lock(&act_hypervisor_mutex);
+	else
+		ret = pthread_mutex_trylock(&act_hypervisor_mutex);
+	/* ret stays 1 (!= EBUSY) on the blocking path, so we always enter when forcing */
+	if(ret != EBUSY)
+	{					
+		int nworkers_to_move = _get_nworkers_to_move(sender_sched_ctx);
+		if(nworkers_to_move > 0)
+		{
+			unsigned poor_sched_ctx = STARPU_NMAX_SCHED_CTXS;
+			if(receiver_sched_ctx == STARPU_NMAX_SCHED_CTXS)
+			{
+				poor_sched_ctx = _find_poor_sched_ctx(sender_sched_ctx, nworkers_to_move);
+			}
+			else
+			{
+				/* explicit receiver: clamp the transfer so the receiver does not
+				   exceed its max_nworkers (workers shared by both ctxs don't count) */
+				poor_sched_ctx = receiver_sched_ctx;
+				struct policy_config *config = sched_ctx_hypervisor_get_config(poor_sched_ctx);
+				unsigned nworkers = starpu_get_nworkers_of_sched_ctx(poor_sched_ctx);
+				unsigned nshared_workers = starpu_get_nshared_workers(sender_sched_ctx, poor_sched_ctx);
+				if((nworkers+nworkers_to_move-nshared_workers) > config->max_nworkers)
+					nworkers_to_move = nworkers > config->max_nworkers ? 0 : (config->max_nworkers - nworkers+nshared_workers);
+				if(nworkers_to_move == 0) poor_sched_ctx = STARPU_NMAX_SCHED_CTXS;
+			}
+			if(poor_sched_ctx != STARPU_NMAX_SCHED_CTXS)
+			{						
+				int *workers_to_move = _get_first_workers(sender_sched_ctx, &nworkers_to_move, STARPU_ANY_WORKER);
+				sched_ctx_hypervisor_move_workers(sender_sched_ctx, poor_sched_ctx, workers_to_move, nworkers_to_move, now);
+				
+				/* moved workers inherit the receiver's "new worker" idle bound
+				   unless a specific max_idle was already configured for them */
+				struct policy_config *new_config = sched_ctx_hypervisor_get_config(poor_sched_ctx);
+				int i;
+				for(i = 0; i < nworkers_to_move; i++)
+					new_config->max_idle[workers_to_move[i]] = new_config->max_idle[workers_to_move[i]] !=MAX_IDLE_TIME ? new_config->max_idle[workers_to_move[i]] :  new_config->new_workers_max_idle;
+				
+				free(workers_to_move);
+			}
+		}	
+		pthread_mutex_unlock(&act_hypervisor_mutex);
+		return 1;
+	}
+	return 0;
+
+}
+
+
+/* Non-blocking resize of sender_sched_ctx toward an automatically chosen receiver. */
+unsigned _resize_to_unknown_receiver(unsigned sender_sched_ctx, unsigned now)
+{
+	return _resize(sender_sched_ctx, STARPU_NMAX_SCHED_CTXS, 0, now);
+}
+
+/* Sum the elapsed flops of the context's workers of architecture req_arch.
+ * *npus is incremented once per matching worker (caller must initialise it). */
+static double _get_elapsed_flops(struct sched_ctx_wrapper* sc_w, int *npus, enum starpu_archtype req_arch)
+{
+	double ret_val = 0.0;
+	struct worker_collection *workers = starpu_get_worker_collection_of_sched_ctx(sc_w->sched_ctx);
+        int worker;
+
+	if(workers->init_cursor)
+                workers->init_cursor(workers);
+
+        while(workers->has_next(workers))
+	{
+                worker = workers->get_next(workers);
+                enum starpu_archtype arch = starpu_worker_get_type(worker);
+                if(arch == req_arch)
+                {
+			ret_val += sc_w->elapsed_flops[worker];
+			(*npus)++;
+                }
+        }
+
+	/* NOTE(review): tests init_cursor but calls deinit_cursor (pattern repeated file-wide) */
+	if(workers->init_cursor)
+		workers->deinit_cursor(workers);
+
+	return ret_val;
+}
+
+/* Velocity (elapsed_flops / elapsed_time) of a context, or 0.0 while not enough of the
+ * context's total flops have elapsed yet.  The sampling threshold is scaled by the number
+ * of workers; the first sample uses HYPERVISOR_START_REDIM_SAMPLE, later ones
+ * HYPERVISOR_REDIM_SAMPLE. */
+double _get_ctx_velocity(struct sched_ctx_wrapper* sc_w)
+{
+        double elapsed_flops = sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(sc_w);
+	double total_elapsed_flops = sched_ctx_hypervisor_get_total_elapsed_flops_per_sched_ctx(sc_w);
+	double prc = elapsed_flops/sc_w->total_flops;
+	unsigned nworkers = starpu_get_nworkers_of_sched_ctx(sc_w->sched_ctx);
+	double redim_sample = elapsed_flops == total_elapsed_flops ? HYPERVISOR_START_REDIM_SAMPLE*nworkers : HYPERVISOR_REDIM_SAMPLE*nworkers;  
+	if(prc >= redim_sample)
+        {
+                double curr_time = starpu_timing_now();
+                double elapsed_time = curr_time - sc_w->start_time;
+                return elapsed_flops/elapsed_time;
+        }
+	return 0.0;
+}
+
+/* compute an average value of the cpu velocity */
+/* Average per-worker velocity for workers of `arch` in the context; returns -1.0 when no
+ * flops have been recorded yet for that architecture. */
+double _get_velocity_per_worker_type(struct sched_ctx_wrapper* sc_w, enum starpu_archtype arch)
+{
+        int npus = 0;
+        double elapsed_flops = _get_elapsed_flops(sc_w, &npus, arch);
+
+        if( elapsed_flops != 0.0)
+        {
+                double curr_time = starpu_timing_now();
+                double elapsed_time = curr_time - sc_w->start_time;
+                return (elapsed_flops/elapsed_time) / npus;
+        }
+
+        return -1.0;
+}
+
+
+/* check if there is a big velocity gap between the contexts */
+/* Returns 1 when some running context is more than 1.5x faster/slower than another, or
+ * when a peer context has no measurable velocity yet (treated as a gap); 0 otherwise. */
+int _velocity_gap_btw_ctxs()
+{
+	int *sched_ctxs = sched_ctx_hypervisor_get_sched_ctxs();
+	int nsched_ctxs = sched_ctx_hypervisor_get_nsched_ctxs();
+	int i = 0, j = 0;
+	struct sched_ctx_wrapper* sc_w;
+	struct sched_ctx_wrapper* other_sc_w;
+	
+	for(i = 0; i < nsched_ctxs; i++)
+	{
+		sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctxs[i]);
+		double ctx_v = _get_ctx_velocity(sc_w);
+		if(ctx_v != 0.0)
+		{
+			for(j = 0; j < nsched_ctxs; j++)
+			{
+				if(sched_ctxs[i] != sched_ctxs[j])
+				{
+					other_sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctxs[j]);
+					double other_ctx_v = _get_ctx_velocity(other_sc_w);
+					if(other_ctx_v != 0.0)
+					{
+						/* ratio of the faster over the slower context */
+						double gap = ctx_v < other_ctx_v ? other_ctx_v / ctx_v : ctx_v / other_ctx_v ;
+						if(gap > 1.5)
+							return 1;
+					} 
+					else
+						return 1;						
+				}
+			}
+		}
+
+	}
+	return 0;
+}
+
+
+/* Count workers per coarse type into total_nw: index 1 = CPU workers, index 0 = all other
+ * architectures.  With workers == NULL all workers of the machine are counted, otherwise
+ * only the `nworkers` ids in `workers`.  Assumes ntypes_of_workers >= 2. */
+void _get_total_nw(int *workers, int nworkers, int ntypes_of_workers, int total_nw[ntypes_of_workers])
+{
+	int current_nworkers = workers == NULL ? starpu_worker_get_count() : nworkers;
+	int w;
+	for(w = 0; w < ntypes_of_workers; w++)
+		total_nw[w] = 0;
+
+	for(w = 0; w < current_nworkers; w++)
+	{
+		enum starpu_archtype arch = workers == NULL ? starpu_worker_get_type(w) :
+			starpu_worker_get_type(workers[w]);
+		if(arch == STARPU_CPU_WORKER)
+			total_nw[1]++;
+		else
+			total_nw[0]++;
+	}
+}

+ 41 - 0
sched_ctx_hypervisor/src/hypervisor_policies/policy_tools.h

@@ -0,0 +1,41 @@
+#include <sched_ctx_hypervisor.h>
+#include <pthread.h>
+
+#define HYPERVISOR_REDIM_SAMPLE 0.01
+#define HYPERVISOR_START_REDIM_SAMPLE 0.005
+
+/* One entry of a linked list aggregating the tasks submitted so far, grouped by
+   (codelet, footprint, context); `n` counts the tasks of that kind. */
+struct bound_task_pool
+{
+	/* Which codelet has been executed */
+	struct starpu_codelet *cl;
+	/* Task footprint key */
+	uint32_t footprint;
+	/* Context the task belongs to */
+	unsigned sched_ctx_id;
+	/* Number of tasks of this kind */
+	unsigned long n;
+	/* Other task kinds */
+	struct bound_task_pool *next;
+};
+
+unsigned _find_poor_sched_ctx(unsigned req_sched_ctx, int nworkers_to_move);
+
+int* _get_first_workers(unsigned sched_ctx, int *nworkers, enum starpu_archtype arch);
+
+int* _get_first_workers_in_list(int *workers, int nall_workers,  unsigned *nworkers, enum starpu_archtype arch);
+
+unsigned _get_potential_nworkers(struct policy_config *config, unsigned sched_ctx, enum starpu_archtype arch);
+
+int _get_nworkers_to_move(unsigned req_sched_ctx);
+
+unsigned _resize(unsigned sender_sched_ctx, unsigned receiver_sched_ctx, unsigned force_resize, unsigned now);
+
+unsigned _resize_to_unknown_receiver(unsigned sender_sched_ctx, unsigned now);
+
+double _get_ctx_velocity(struct sched_ctx_wrapper* sc_w);
+
+double _get_velocity_per_worker_type(struct sched_ctx_wrapper* sc_w, enum starpu_archtype arch);
+
+int _velocity_gap_btw_ctxs(void);
+
+void _get_total_nw(int *workers, int nworkers, int ntypes_of_workers, int total_nw[ntypes_of_workers]);

+ 506 - 0
sched_ctx_hypervisor/src/hypervisor_policies/simple_policy.c

@@ -0,0 +1,506 @@
+#include <sched_ctx_hypervisor.h>
+#include <pthread.h>
+
+/* Sum of the per-worker priorities configured for the workers of a context. */
+static int _compute_priority(unsigned sched_ctx)
+{
+	struct policy_config *config = sched_ctx_hypervisor_get_config(sched_ctx);
+
+	int total_priority = 0;
+
+	struct worker_collection *workers = starpu_get_worker_collection_of_sched_ctx(sched_ctx);
+	int worker;
+
+	if(workers->init_cursor)
+		workers->init_cursor(workers);
+
+	while(workers->has_next(workers))
+	{
+		worker = workers->get_next(workers);
+		total_priority += config->priority[worker];
+	}
+
+	/* NOTE(review): guard tests init_cursor but calls deinit_cursor */
+	if(workers->init_cursor)
+		workers->deinit_cursor(workers);
+	return total_priority;
+}
+
+/* Pick the context (other than req_sched_ctx) with the highest total priority that can
+ * still absorb nworkers_to_move workers without exceeding its max_nworkers.
+ * Returns STARPU_NMAX_SCHED_CTXS when no context qualifies. */
+static unsigned _find_poor_sched_ctx(unsigned req_sched_ctx, int nworkers_to_move)
+{
+	int i;
+	int highest_priority = -1;
+	int current_priority = 0;
+	unsigned sched_ctx = STARPU_NMAX_SCHED_CTXS;
+	int *sched_ctxs = sched_ctx_hypervisor_get_sched_ctxs();
+	int nsched_ctxs = sched_ctx_hypervisor_get_nsched_ctxs();
+
+
+	struct policy_config *config = NULL;
+
+	for(i = 0; i < nsched_ctxs; i++)
+	{
+		if(sched_ctxs[i] != STARPU_NMAX_SCHED_CTXS && sched_ctxs[i] != req_sched_ctx)
+		{
+			unsigned nworkers = starpu_get_nworkers_of_sched_ctx(sched_ctxs[i]);
+			config  = sched_ctx_hypervisor_get_config(sched_ctxs[i]);
+			if((nworkers + nworkers_to_move) <= config->max_nworkers)
+			{
+				current_priority = _compute_priority(sched_ctxs[i]);
+				if (highest_priority < current_priority)
+				{
+					highest_priority = current_priority;
+					sched_ctx = sched_ctxs[i];
+				}
+			}
+		}
+	}
+	
+	return sched_ctx;
+}
+
+/* Select up to *nworkers workers of the context to be moved out, preferring (per slot)
+ * non-fixed workers with the lowest priority, then with the biggest idle time on a tie.
+ * arch == 0 means "any architecture" (NOTE(review): magic sentinel -- the policy_tools.c
+ * twin of this function uses STARPU_ANY_WORKER instead; confirm they are the same value).
+ * On return *nworkers is lowered to the number actually found; caller frees the array. */
+int* _get_first_workers(unsigned sched_ctx, unsigned *nworkers, enum starpu_archtype arch)
+{
+	struct policy_config *config = sched_ctx_hypervisor_get_config(sched_ctx);
+
+	int *curr_workers = (int*)malloc((*nworkers) * sizeof(int));
+	int i;
+	for(i = 0; i < *nworkers; i++)
+		curr_workers[i] = -1;
+
+	struct worker_collection *workers = starpu_get_worker_collection_of_sched_ctx(sched_ctx);
+	int index;
+	int worker;
+	int considered = 0;
+
+	if(workers->init_cursor)
+		workers->init_cursor(workers);
+
+	/* NOTE(review): the cursor is not re-initialised between slot iterations; this relies
+	   on has_next()/get_next() wrapping around -- verify the collection's semantics */
+	for(index = 0; index < *nworkers; index++)
+	{
+		while(workers->has_next(workers))
+		{
+			considered = 0;
+			worker = workers->get_next(workers);
+			enum starpu_archtype curr_arch = starpu_worker_get_type(worker);
+			if(arch == 0 || curr_arch == arch)
+			{
+
+				if(!config->fixed_workers[worker])
+				{
+					/* skip workers already selected in earlier slots */
+					for(i = 0; i < index; i++)
+					{
+						if(curr_workers[i] == worker)
+						{
+							considered = 1;
+							break;
+						}
+					}
+					
+					if(!considered)
+					{
+						/* the first iteration*/
+						if(curr_workers[index] < 0)
+						curr_workers[index] = worker;
+						/* small priority worker is the first to leave the ctx*/
+						else if(config->priority[worker] <
+							config->priority[curr_workers[index]])
+						curr_workers[index] = worker;
+						/* if we don't consider priorities check for the workers
+						   with the biggest idle time */
+						else if(config->priority[worker] ==
+							config->priority[curr_workers[index]])
+						{
+							double worker_idle_time = sched_ctx_hypervisor_get_idle_time(sched_ctx, worker);
+							double curr_worker_idle_time = sched_ctx_hypervisor_get_idle_time(sched_ctx, curr_workers[index]);
+							if(worker_idle_time > curr_worker_idle_time)
+								curr_workers[index] = worker;
+						}
+					}
+				}
+			}
+		}
+			
+		/* no candidate found for this slot: report how many we actually got */
+		if(curr_workers[index] < 0)
+		{
+			*nworkers = index;
+			break;
+		}
+	}
+
+	if(workers->init_cursor)
+		workers->deinit_cursor(workers);
+
+	return curr_workers;
+}
+
+/* Number of non-fixed workers of `arch` in the context (arch == 0 means any arch).
+ * Static duplicate of _get_potential_nworkers() in policy_tools.c. */
+static unsigned _get_potential_nworkers(struct policy_config *config, unsigned sched_ctx, enum starpu_archtype arch)
+{
+	struct worker_collection *workers = starpu_get_worker_collection_of_sched_ctx(sched_ctx);
+
+	unsigned potential_workers = 0;
+	int worker;
+
+	if(workers->init_cursor)
+		workers->init_cursor(workers);
+	while(workers->has_next(workers))
+	{
+		worker = workers->get_next(workers);
+		enum starpu_archtype curr_arch = starpu_worker_get_type(worker);
+                if(arch == 0 || curr_arch == arch)
+                {
+			if(!config->fixed_workers[worker])
+				potential_workers++;
+		}
+	}
+	/* NOTE(review): guard tests init_cursor but calls deinit_cursor */
+	if(workers->init_cursor)
+		workers->deinit_cursor(workers);
+	
+	return potential_workers;
+}
+
+/* How many workers the requesting context should give away, honouring the configured
+ * min/max worker counts and the resize granularity.  Static duplicate of
+ * _get_nworkers_to_move() in policy_tools.c (which moves more workers in one branch). */
+static unsigned _get_nworkers_to_move(unsigned req_sched_ctx)
+{
+       	struct policy_config *config = sched_ctx_hypervisor_get_config(req_sched_ctx);
+	unsigned nworkers = starpu_get_nworkers_of_sched_ctx(req_sched_ctx);
+	unsigned nworkers_to_move = 0;
+	
+	unsigned potential_moving_workers = _get_potential_nworkers(config, req_sched_ctx, 0);
+	if(potential_moving_workers > 0)
+	{
+		if(potential_moving_workers <= config->min_nworkers)
+			/* if we have to give more than min better give it all */ 
+			/* => empty ctx will block until having the required workers */
+			
+			nworkers_to_move = potential_moving_workers; 
+		else if(potential_moving_workers > config->max_nworkers)
+		{
+			if((potential_moving_workers - config->granularity) > config->max_nworkers)
+				nworkers_to_move = config->granularity;
+			else
+				nworkers_to_move = potential_moving_workers - config->max_nworkers;
+ 
+		}
+		else if(potential_moving_workers > config->granularity)
+		{
+			if((nworkers - config->granularity) > config->min_nworkers)	
+				nworkers_to_move = config->granularity;
+			else
+				nworkers_to_move = potential_moving_workers - config->min_nworkers;
+		}
+		else
+		{
+			int nfixed_workers = nworkers - potential_moving_workers;
+			if(nfixed_workers >= config->min_nworkers)
+				nworkers_to_move = potential_moving_workers;
+			else
+				nworkers_to_move = potential_moving_workers - (config->min_nworkers - nfixed_workers);	
+		}
+
+		/* NOTE(review): unsigned subtraction -- wraps if nworkers_to_move > nworkers */
+		if((nworkers - nworkers_to_move) > config->max_nworkers)
+			nworkers_to_move = nworkers - config->max_nworkers;
+	}
+	return nworkers_to_move;
+}
+
+/* Move workers from sender to receiver (receiver chosen automatically when
+ * receiver_sched_ctx == STARPU_NMAX_SCHED_CTXS).  force_resize blocks on the hypervisor
+ * mutex; otherwise the resize is skipped if the mutex is busy.  Returns 1 when the lock
+ * was acquired (resize attempted), 0 otherwise. */
+static unsigned _simple_resize(unsigned sender_sched_ctx, unsigned receiver_sched_ctx, unsigned force_resize)
+{
+	int ret = 1;
+	if(force_resize)
+		pthread_mutex_lock(&act_hypervisor_mutex);
+	else
+		ret = pthread_mutex_trylock(&act_hypervisor_mutex);
+	if(ret != EBUSY)
+	{					
+		unsigned nworkers_to_move = _get_nworkers_to_move(sender_sched_ctx);
+
+		if(nworkers_to_move > 0)
+		{
+			unsigned poor_sched_ctx = STARPU_NMAX_SCHED_CTXS;
+			if(receiver_sched_ctx == STARPU_NMAX_SCHED_CTXS)
+				poor_sched_ctx = _find_poor_sched_ctx(sender_sched_ctx, nworkers_to_move);
+			else
+			{
+				/* clamp the transfer to the receiver's max_nworkers, ignoring
+				   workers the two contexts already share */
+				poor_sched_ctx = receiver_sched_ctx;
+				struct policy_config *config = sched_ctx_hypervisor_get_config(poor_sched_ctx);
+				unsigned nworkers = starpu_get_nworkers_of_sched_ctx(poor_sched_ctx);
+				unsigned nshared_workers = starpu_get_nshared_workers(sender_sched_ctx, poor_sched_ctx);
+				if((nworkers+nworkers_to_move-nshared_workers) > config->max_nworkers)
+					nworkers_to_move = nworkers > config->max_nworkers ? 0 : (config->max_nworkers - nworkers+nshared_workers);
+				if(nworkers_to_move == 0) poor_sched_ctx = STARPU_NMAX_SCHED_CTXS;
+			}
+
+
+			if(poor_sched_ctx != STARPU_NMAX_SCHED_CTXS)
+			{						
+				int *workers_to_move = _get_first_workers(sender_sched_ctx, &nworkers_to_move, 0);
+				sched_ctx_hypervisor_move_workers(sender_sched_ctx, poor_sched_ctx, workers_to_move, nworkers_to_move);
+				
+				/* moved workers get the receiver's "new worker" idle bound unless
+				   a specific max_idle is already configured for them */
+				struct policy_config *new_config = sched_ctx_hypervisor_get_config(poor_sched_ctx);
+				int i;
+				for(i = 0; i < nworkers_to_move; i++)
+					new_config->max_idle[workers_to_move[i]] = new_config->max_idle[workers_to_move[i]] !=MAX_IDLE_TIME ? new_config->max_idle[workers_to_move[i]] :  new_config->new_workers_max_idle;
+				
+				free(workers_to_move);
+			}
+		}	
+		pthread_mutex_unlock(&act_hypervisor_mutex);
+		return 1;
+	}
+	return 0;
+
+}
+
+/* Choose which of the sender's workers to hand to the receiver so that the receiver can
+ * finish its remaining flops by the sender's expected end time.  A GPU is weighted as
+ * 5 CPUs when estimating how many workers are needed.  *nworkers must be 0 on entry and
+ * receives the number of ids in the returned array (NULL when nothing should move). */
+static int* _get_workers_to_move(unsigned sender_sched_ctx, unsigned receiver_sched_ctx, int *nworkers)
+{
+        int *workers = NULL;
+        double v_receiver = sched_ctx_hypervisor_get_ctx_velocity(receiver_sched_ctx);
+        double receiver_remainig_flops = sched_ctx_hypervisor_get_flops_left(receiver_sched_ctx);
+        double sender_exp_end = sched_ctx_hypervisor_get_exp_end(sender_sched_ctx);
+        double sender_v_cpu = sched_ctx_hypervisor_get_cpu_velocity(sender_sched_ctx);
+//      double v_gcpu = sched_ctx_hypervisor_get_gpu_velocity(sender_sched_ctx);                                                                                                                                                                                                                                                                                                                                                                                                              
+
+        /* extra velocity the receiver needs to finish by the sender's expected end */
+        double v_for_rctx = (receiver_remainig_flops/(sender_exp_end - starpu_timing_now())) - v_receiver;
+//      v_for_rctx /= 2;                                                                                                                                                                                                                                                                                                                                                                                                                                                                      
+
+        int nworkers_needed = v_for_rctx/sender_v_cpu;
+/*      printf("%d->%d: v_rec %lf v %lf v_cpu %lf w_needed %d \n", sender_sched_ctx, receiver_sched_ctx, */
+/*             v_receiver, v_for_rctx, sender_v_cpu, nworkers_needed); */
+        if(nworkers_needed > 0)
+        {
+                struct policy_config *sender_config = sched_ctx_hypervisor_get_config(sender_sched_ctx);
+                unsigned potential_moving_cpus = _get_potential_nworkers(sender_config, sender_sched_ctx, STARPU_CPU_WORKER);
+                unsigned potential_moving_gpus = _get_potential_nworkers(sender_config, sender_sched_ctx, STARPU_CUDA_WORKER);
+                unsigned sender_nworkers = starpu_get_nworkers_of_sched_ctx(sender_sched_ctx);
+                struct policy_config *config = sched_ctx_hypervisor_get_config(receiver_sched_ctx);
+                unsigned nworkers_ctx = starpu_get_nworkers_of_sched_ctx(receiver_sched_ctx);
+
+                /* enough movable capacity (GPU counted as 5 CPUs): move exactly what's needed */
+                if(nworkers_needed < (potential_moving_cpus + 5 * potential_moving_gpus))
+                {
+                        if((sender_nworkers - nworkers_needed) >= sender_config->min_nworkers)
+                        {
+                                if((nworkers_ctx + nworkers_needed) > config->max_nworkers)
+                                        nworkers_needed = nworkers_ctx > config->max_nworkers ? 0 : (config->max_nworkers - nworkers_ctx);
+
+                                if(nworkers_needed > 0)
+                                {
+                                        /* take GPUs first (1 GPU covers 5 units of need) */
+                                        int ngpus = nworkers_needed / 5;
+                                        int *gpus;
+                                        gpus = _get_first_workers(sender_sched_ctx, &ngpus, STARPU_CUDA_WORKER);
+                                        int ncpus = nworkers_needed - ngpus;
+                                        int *cpus;
+                                        cpus = _get_first_workers(sender_sched_ctx, &ncpus, STARPU_CPU_WORKER);
+                                        workers = (int*)malloc(nworkers_needed*sizeof(int));
+                                        int i;
+                                        for(i = 0; i < ngpus; i++)
+                                                workers[(*nworkers)++] = gpus[i];
+
+                                        for(i = 0; i < ncpus; i++)
+                                                workers[(*nworkers)++] = cpus[i];
+
+                                        free(gpus);
+                                        free(cpus);
+                                }
+                        }
+                }
+		else
+                {
+                        /* not enough capacity: fall back to the generic granularity-based count */
+                        int nworkers_to_move = _get_nworkers_to_move(sender_sched_ctx);
+
+                        if(sender_nworkers - nworkers_to_move >= sender_config->min_nworkers)
+                        {
+                                unsigned nshared_workers = starpu_get_nshared_workers(sender_sched_ctx, receiver_sched_ctx);
+                                if((nworkers_ctx + nworkers_to_move - nshared_workers) > config->max_nworkers)
+                                        nworkers_to_move = nworkers_ctx > config->max_nworkers ? 0 : (config->max_nworkers - nworkers_ctx + nshared_workers);
+
+                                if(nworkers_to_move > 0)
+                                {
+                                        workers = _get_first_workers(sender_sched_ctx, &nworkers_to_move, 0);
+                                        *nworkers = nworkers_to_move;
+                                }
+                        }
+                }
+        }
+        return workers;
+}
+
+/* Velocity-driven variant of _simple_resize(): uses _get_workers_to_move() to decide
+ * which workers the receiver needs.  Same locking contract: returns 1 when the
+ * hypervisor mutex was acquired, 0 when a non-forced attempt found it busy. */
+static unsigned _simple_resize2(unsigned sender_sched_ctx, unsigned receiver_sched_ctx, unsigned force_resize)
+{
+        int ret = 1;
+        if(force_resize)
+                pthread_mutex_lock(&act_hypervisor_mutex);
+        else
+                ret = pthread_mutex_trylock(&act_hypervisor_mutex);
+        if(ret != EBUSY)
+        {
+                int nworkers_to_move = 0;
+                int *workers_to_move =  _get_workers_to_move(sender_sched_ctx, receiver_sched_ctx, &nworkers_to_move);
+		if(nworkers_to_move > 0)
+                {
+                        sched_ctx_hypervisor_move_workers(sender_sched_ctx, receiver_sched_ctx, workers_to_move, nworkers_to_move);
+
+                        /* moved workers get the receiver's "new worker" idle bound unless
+                           a specific max_idle is already configured for them */
+                        struct policy_config *new_config = sched_ctx_hypervisor_get_config(receiver_sched_ctx);
+                        int i;
+                        for(i = 0; i < nworkers_to_move; i++)
+                                new_config->max_idle[workers_to_move[i]] = new_config->max_idle[workers_to_move[i]] !=MAX_IDLE_TIME ? new_config->max_idle[workers_to_move[i]] :  new_config->new_workers_max_idle;
+
+                        free(workers_to_move);
+                }
+                pthread_mutex_unlock(&act_hypervisor_mutex);
+                return 1;
+        }
+        return 0;
+
+}
+
+/* Policy entry point: forcibly shrink sender_sched_ctx toward an auto-chosen receiver. */
+static unsigned simple_resize(unsigned sender_sched_ctx)
+{
+	return _simple_resize(sender_sched_ctx, STARPU_NMAX_SCHED_CTXS, 1);
+}
+
+/* Idle-time callback: trigger a resize of the context once a worker has been idle
+ * longer than its configured max_idle bound. */
+static void simple_manage_idle_time(unsigned req_sched_ctx, int worker, double idle_time)
+{
+       	struct policy_config *config = sched_ctx_hypervisor_get_config(req_sched_ctx);
+
+	if(config != NULL && idle_time > config->max_idle[worker])
+		simple_resize(req_sched_ctx);
+	return;
+}
+
+/* Context with the smallest (earliest) expected end time; contexts with no estimate
+ * (exp_end == -1.0) are ignored.  Returns -1 only if the first context has no estimate
+ * and no later context beats it. */
+int _find_fastest_sched_ctx()
+{
+	int *sched_ctxs = sched_ctx_hypervisor_get_sched_ctxs();
+	int nsched_ctxs = sched_ctx_hypervisor_get_nsched_ctxs();
+
+	double first_exp_end = sched_ctx_hypervisor_get_exp_end(sched_ctxs[0]);
+	int fastest_sched_ctx = first_exp_end == -1.0  ? -1 : sched_ctxs[0];
+	double curr_exp_end = 0.0;
+	int i;
+	for(i = 1; i < nsched_ctxs; i++)
+	{
+		curr_exp_end = sched_ctx_hypervisor_get_exp_end(sched_ctxs[i]);
+		if(first_exp_end > curr_exp_end && curr_exp_end != -1.0)
+		{
+			first_exp_end = curr_exp_end;
+			fastest_sched_ctx = sched_ctxs[i];
+		}
+	}
+
+	return fastest_sched_ctx;
+
+}
+
+/* Context with the largest expected end time; a context with no estimate yet
+ * (exp_end == -1.0) is returned immediately.  Returns -1 when there are no contexts. */
+int _find_slowest_sched_ctx()
+{
+	int *sched_ctxs = sched_ctx_hypervisor_get_sched_ctxs();
+	int nsched_ctxs = sched_ctx_hypervisor_get_nsched_ctxs();
+
+	int slowest_sched_ctx = -1;
+	double curr_exp_end = 0.0;
+	double last_exp_end = -1.0;
+	int i;
+	for(i = 0; i < nsched_ctxs; i++)
+	{
+		curr_exp_end = sched_ctx_hypervisor_get_exp_end(sched_ctxs[i]);
+		/* if it hasn't started because it has no resources, give it priority */
+		if(curr_exp_end == -1.0)
+			return sched_ctxs[i];
+		if(last_exp_end < curr_exp_end)
+		{
+			slowest_sched_ctx = sched_ctxs[i];
+			last_exp_end = curr_exp_end;
+		}
+	}
+
+	return slowest_sched_ctx;
+
+}
+
+/* Same as _find_slowest_sched_ctx() but excluding the given context. */
+int _find_slowest_available_sched_ctx(unsigned sched_ctx)
+{
+	int *sched_ctxs = sched_ctx_hypervisor_get_sched_ctxs();
+	int nsched_ctxs = sched_ctx_hypervisor_get_nsched_ctxs();
+
+	int slowest_sched_ctx = -1;
+	double curr_exp_end = 0.0;
+	double last_exp_end = -1.0;
+	int i;
+	for(i = 0; i < nsched_ctxs; i++)
+	{
+		if(sched_ctxs[i] != sched_ctx)
+		{
+			curr_exp_end = sched_ctx_hypervisor_get_exp_end(sched_ctxs[i]);
+			/* if it hasn't started because it has no resources, give it priority */
+			if(curr_exp_end == -1.0)
+				return sched_ctxs[i];
+			if(last_exp_end < curr_exp_end)
+			{
+				slowest_sched_ctx = sched_ctxs[i];
+				last_exp_end = curr_exp_end;
+			}
+		}
+	}
+
+	return slowest_sched_ctx;
+
+}
+
+/* Gflops-rate callback.  Two rebalancing steps:
+ * 1. a finished context (no flops left) donates all its workers to the slowest
+ *    still-running context and its own bounds are zeroed;
+ * 2. if the fastest context is expected to finish 1.5x sooner than the slowest one,
+ *    workers flow from fastest to slowest (non-forced, skipped when fastest has < 20%
+ *    of its flops left... i.e. flops_left_pct >= 0.8 means it has barely started). */
+static void simple_manage_gflops_rate(unsigned sched_ctx)
+{
+	double exp_end = sched_ctx_hypervisor_get_exp_end(sched_ctx);
+	double flops_left_pct = sched_ctx_hypervisor_get_flops_left_pct(sched_ctx);
+
+	if(flops_left_pct == 0.0f)
+	{
+		int slowest_sched_ctx = _find_slowest_available_sched_ctx(sched_ctx);
+		if(slowest_sched_ctx != -1)
+		{
+			double slowest_flops_left_pct = sched_ctx_hypervisor_get_flops_left_pct(slowest_sched_ctx);
+			/* NOTE(review): debug printf left in a policy callback -- consider removing */
+			printf("ctx %d finished & gives away the res to %d; slow_left %lf\n", sched_ctx, slowest_sched_ctx, slowest_flops_left_pct);
+			if(slowest_flops_left_pct != 0.0f)
+			{
+				struct policy_config* config = sched_ctx_hypervisor_get_config(sched_ctx);
+				config->min_nworkers = 0;
+				config->max_nworkers = 0;
+				_simple_resize(sched_ctx, slowest_sched_ctx, 1);
+				sched_ctx_hypervisor_stop_resize(slowest_sched_ctx);
+			}
+		}
+	}
+
+	int fastest_sched_ctx = _find_fastest_sched_ctx();
+	int slowest_sched_ctx = _find_slowest_sched_ctx();
+	if(fastest_sched_ctx != -1 && slowest_sched_ctx != -1 && fastest_sched_ctx != slowest_sched_ctx)
+	{
+		double fastest_exp_end = sched_ctx_hypervisor_get_exp_end(fastest_sched_ctx);
+		double slowest_exp_end = sched_ctx_hypervisor_get_exp_end(slowest_sched_ctx);
+		double fastest_bef_res_exp_end = sched_ctx_hypervisor_get_bef_res_exp_end(fastest_sched_ctx);
+		double slowest_bef_res_exp_end = sched_ctx_hypervisor_get_bef_res_exp_end(slowest_sched_ctx);
+//					       (fastest_bef_res_exp_end < slowest_bef_res_exp_end || 
+//						fastest_bef_res_exp_end == 0.0 || slowest_bef_res_exp_end == 0)))
+		
+		if((slowest_exp_end == -1.0 && fastest_exp_end != -1.0) || ((fastest_exp_end + (fastest_exp_end*0.5)) < slowest_exp_end ))
+		{
+			double fast_flops_left_pct = sched_ctx_hypervisor_get_flops_left_pct(fastest_sched_ctx);
+			if(fast_flops_left_pct < 0.8)
+				_simple_resize(fastest_sched_ctx, slowest_sched_ctx, 0);
+		}
+	}
+}
+
+
+/* Predefined policy descriptors exported to sched_ctx_hypervisor.c.
+ * NOTE(review): all three currently install the *same* callbacks and none initialises a
+ * .name field (which sched_ctx_hypervisor.c reads) -- confirm this is intentional. */
+struct hypervisor_policy idle_policy = {
+	.manage_idle_time = simple_manage_idle_time,
+	.manage_gflops_rate = simple_manage_gflops_rate,
+	.resize = simple_resize,
+};
+
+struct hypervisor_policy app_driven_policy = {
+	.manage_idle_time = simple_manage_idle_time,
+	.manage_gflops_rate = simple_manage_gflops_rate,
+	.resize = simple_resize,
+};
+
+struct hypervisor_policy gflops_rate_policy = {
+	.manage_idle_time = simple_manage_idle_time,
+	.manage_gflops_rate = simple_manage_gflops_rate,
+	.resize = simple_resize,
+};

+ 249 - 0
sched_ctx_hypervisor/src/sched_ctx_config.c

@@ -0,0 +1,249 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012  INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <sched_ctx_hypervisor_intern.h>
+
+/* Allocate a policy_config with every field set to the "unset" sentinel (-1 / -1.0),
+ * so _update_config() can tell which fields a later config actually specifies.
+ * NOTE(review): malloc result is not checked, and granularity is (redundantly)
+ * reassigned on every loop iteration -- it is not a per-worker field. */
+static struct policy_config* _create_config(void)
+{
+	struct policy_config *config = (struct policy_config *)malloc(sizeof(struct policy_config));
+	config->min_nworkers = -1;
+	config->max_nworkers = -1;	
+	config->new_workers_max_idle = -1.0;
+
+	int i;
+	for(i = 0; i < STARPU_NMAXWORKERS; i++)
+	{
+		config->granularity = -1;
+		config->priority[i] = -1;
+		config->fixed_workers[i] = -1;
+		config->max_idle[i] = -1.0;
+		config->empty_ctx_max_idle[i] = -1.0;
+		config->min_working[i] = -1.0;
+	}
+	
+	return config;
+}
+
+/* Merge `new` into `old` field by field: a value of -1 / -1.0 in `new` means "keep the
+ * old value"; anything else overrides it. */
+static void _update_config(struct policy_config *old, struct policy_config* new)
+{
+	old->min_nworkers = new->min_nworkers != -1 ? new->min_nworkers : old->min_nworkers ;
+	old->max_nworkers = new->max_nworkers != -1 ? new->max_nworkers : old->max_nworkers ;
+	old->new_workers_max_idle = new->new_workers_max_idle != -1.0 ? new->new_workers_max_idle : old->new_workers_max_idle;
+	old->granularity = new->granularity != -1 ? new->granularity : old->granularity;
+
+	int i;
+	for(i = 0; i < STARPU_NMAXWORKERS; i++)
+	{
+		old->priority[i] = new->priority[i] != -1 ? new->priority[i] : old->priority[i];
+		old->fixed_workers[i] = new->fixed_workers[i] != -1 ? new->fixed_workers[i] : old->fixed_workers[i];
+		old->max_idle[i] = new->max_idle[i] != -1.0 ? new->max_idle[i] : old->max_idle[i];
+		old->empty_ctx_max_idle[i] = new->empty_ctx_max_idle[i] != -1.0 ? new->empty_ctx_max_idle[i] : old->empty_ctx_max_idle[i];
+		old->min_working[i] = new->min_working[i] != -1.0 ? new->min_working[i] : old->min_working[i];
+	}
+}
+
+/* Install or merge a configuration for a context.  If the context already has a config
+ * and `config` is non-NULL, unset fields are merged via _update_config(); otherwise the
+ * pointer is stored as-is (NULL clears it).
+ * NOTE(review): in the merge path the caller's `config` is not freed here -- confirm
+ * ownership with the callers (see sched_ctx_hypervisor_ioctl). */
+void sched_ctx_hypervisor_set_config(unsigned sched_ctx, void *config)
+{
+	if(hypervisor.sched_ctx_w[sched_ctx].config != NULL && config != NULL)
+	{
+		_update_config(hypervisor.sched_ctx_w[sched_ctx].config, config);
+	}
+	else
+		hypervisor.sched_ctx_w[sched_ctx].config = config;
+	
+	return;
+}
+
+/* Install a default configuration for a freshly registered context: no worker limits,
+ * default idle/working time bounds, granularity 1, no fixed workers. */
+void _add_config(unsigned sched_ctx)
+{
+	struct policy_config *config = _create_config();
+	config->min_nworkers = 0;
+	config->max_nworkers = STARPU_NMAXWORKERS;	
+	config->new_workers_max_idle = MAX_IDLE_TIME;
+
+	int i;
+	for(i = 0; i < STARPU_NMAXWORKERS; i++)
+	{
+		config->granularity = 1;
+		config->priority[i] = 0;
+		config->fixed_workers[i] = 0;
+		config->max_idle[i] = MAX_IDLE_TIME;
+		config->empty_ctx_max_idle[i] = MAX_IDLE_TIME;
+		config->min_working[i] = MIN_WORKING_TIME;
+	}
+
+	sched_ctx_hypervisor_set_config(sched_ctx, config);
+}
+
+/* Detach the context's configuration (sets the pointer to NULL).
+ * NOTE(review): the previous config is not freed here -- TODO confirm who owns it. */
+void _remove_config(unsigned sched_ctx)
+{
+	sched_ctx_hypervisor_set_config(sched_ctx, NULL);
+}
+
+/* Accessor for the context's current policy configuration (may be NULL). */
+struct policy_config* sched_ctx_hypervisor_get_config(unsigned sched_ctx)
+{
+	return hypervisor.sched_ctx_w[sched_ctx].config;
+}
+
+/* Parse the HYPERVISOR_* varargs into a policy_config.  When `later` is set a fresh
+ * config is built and returned (to be applied at a task tag); otherwise the context's
+ * live config is updated in place and NULL is returned.
+ * NOTE(review): va_end() is called here on the caller's va_list -- the caller must not
+ * reuse the list afterwards (sched_ctx_hypervisor_ioctl re-va_start()s before calling). */
+static struct policy_config* _ioctl(unsigned sched_ctx, va_list varg_list, unsigned later)
+{
+	struct policy_config *config = NULL;
+
+	if(later)
+		config = _create_config();
+	else
+		config = sched_ctx_hypervisor_get_config(sched_ctx);
+
+	assert(config != NULL);
+
+	int arg_type;
+	int i;
+	int *workerids;
+	int nworkers;
+
+	while ((arg_type = va_arg(varg_list, int)) != HYPERVISOR_NULL) 
+	{
+		switch(arg_type)
+		{
+		case HYPERVISOR_MAX_IDLE:
+			workerids = va_arg(varg_list, int*);
+			nworkers = va_arg(varg_list, int);
+			double max_idle = va_arg(varg_list, double);
+			for(i = 0; i < nworkers; i++)
+				config->max_idle[workerids[i]] = max_idle;
+
+			break;
+
+		case HYPERVISOR_EMPTY_CTX_MAX_IDLE:
+			workerids = va_arg(varg_list, int*);
+			nworkers = va_arg(varg_list, int);
+			double empty_ctx_max_idle = va_arg(varg_list, double);
+			
+			for(i = 0; i < nworkers; i++)
+				config->empty_ctx_max_idle[workerids[i]] = empty_ctx_max_idle;
+
+			break;
+
+		case HYPERVISOR_MIN_WORKING:
+			workerids = va_arg(varg_list, int*);
+			nworkers = va_arg(varg_list, int);
+			double min_working = va_arg(varg_list, double);
+
+			for(i = 0; i < nworkers; i++)
+				config->min_working[workerids[i]] = min_working;
+
+			break;
+
+		case HYPERVISOR_PRIORITY:
+			workerids = va_arg(varg_list, int*);
+			nworkers = va_arg(varg_list, int);
+			int priority = va_arg(varg_list, int);
+	
+			for(i = 0; i < nworkers; i++)
+				config->priority[workerids[i]] = priority;
+			break;
+
+		case HYPERVISOR_MIN_WORKERS:
+			config->min_nworkers = va_arg(varg_list, unsigned);
+			break;
+
+		case HYPERVISOR_MAX_WORKERS:
+			config->max_nworkers = va_arg(varg_list, unsigned);
+			break;
+
+		case HYPERVISOR_GRANULARITY:
+			config->granularity = va_arg(varg_list, unsigned);
+			break;
+
+		case HYPERVISOR_FIXED_WORKERS:
+			workerids = va_arg(varg_list, int*);
+			nworkers = va_arg(varg_list, int);
+
+			for(i = 0; i < nworkers; i++)
+				config->fixed_workers[workerids[i]] = 1;
+			break;
+
+		case HYPERVISOR_NEW_WORKERS_MAX_IDLE:
+			config->new_workers_max_idle = va_arg(varg_list, double);
+			break;
+
+/* not important for the strategy, needed just to skip these args while iterating over the list */			
+		case HYPERVISOR_TIME_TO_APPLY:
+			va_arg(varg_list, int);
+			break;
+
+		case HYPERVISOR_MIN_TASKS:
+			va_arg(varg_list, int);
+			break;
+
+		}
+	}
+
+	va_end(varg_list);
+
+	return later ? config : NULL;
+}
+
+
+/* Public ioctl-style configuration entry point.  Two passes over the varargs:
+ * pass 1 extracts HYPERVISOR_TIME_TO_APPLY (a task tag at which to apply the config
+ * later) and HYPERVISOR_MIN_TASKS; pass 2 (_ioctl) builds/updates the config.  When a
+ * task tag was given, the built config is stashed in the per-context hash table keyed
+ * by task_tag instead of being applied immediately. */
+void sched_ctx_hypervisor_ioctl(unsigned sched_ctx, ...)
+{
+	va_list varg_list;
+	va_start(varg_list, sched_ctx);
+
+	int arg_type;
+	int stop = 0;
+	int task_tag = -1;
+
+	while ((arg_type = va_arg(varg_list, int)) != HYPERVISOR_NULL) 
+	{
+		switch(arg_type)
+		{
+		case HYPERVISOR_TIME_TO_APPLY:
+			task_tag = va_arg(varg_list, int);
+			stop = 1;
+			break;
+
+		case HYPERVISOR_MIN_TASKS:
+			hypervisor.min_tasks = va_arg(varg_list, int);
+			hypervisor.check_min_tasks[sched_ctx] = 1;
+			break;
+
+		}
+		if(stop) break;
+	}
+
+	va_end(varg_list);
+	/* restart the list for the full parse done by _ioctl (which va_end()s it) */
+	va_start(varg_list, sched_ctx);
+
+	/* if config not null => save hypervisor configuration and consider it later */
+	struct policy_config *config = _ioctl(sched_ctx, varg_list, (task_tag > 0));
+	if(config != NULL)
+	{
+		struct configuration_entry *entry;
+
+		entry = malloc(sizeof *entry);
+		STARPU_ASSERT(entry != NULL);
+
+		entry->task_tag = task_tag;
+		entry->configuration = config;
+
+		pthread_mutex_lock(&hypervisor.conf_mut[sched_ctx]);
+		HASH_ADD_INT(hypervisor.configurations[sched_ctx], task_tag, entry);
+		pthread_mutex_unlock(&hypervisor.conf_mut[sched_ctx]);
+	}
+
+	return;
+}

+ 826 - 0
sched_ctx_hypervisor/src/sched_ctx_hypervisor.c

@@ -0,0 +1,826 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012  INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <sched_ctx_hypervisor_intern.h>
+#include <common/uthash.h>
+
+/* Set when the application explicitly stopped/started resizing; cleared once
+   the hypervisor takes the decision back (see _ack_resize_completed). */
+unsigned imposed_resize = 0;
+/* Callback table handed to StarPU at init time; StarPU calls these to feed
+   scheduling events back to the hypervisor. */
+struct starpu_performance_counters* perf_counters = NULL;
+
+static void notify_idle_cycle(unsigned sched_ctx, int worker, double idle_time);
+static void notify_pushed_task(unsigned sched_ctx, int worker);
+static void notify_poped_task(unsigned sched_ctx, int worker, double flops);
+static void notify_post_exec_hook(unsigned sched_ctx, int taskid);
+static void notify_idle_end(unsigned sched_ctx, int  worker);
+/* NOTE(review): declared here with 'unsigned footprint' but defined below with
+   'uint32_t footprint' -- harmonize the two declarations. */
+static void notify_submitted_job(struct starpu_task *task, unsigned footprint);
+
+/* Resizing policies linked into the library; the LP-based ones require GLPK. */
+extern struct hypervisor_policy idle_policy;
+extern struct hypervisor_policy app_driven_policy;
+extern struct hypervisor_policy gflops_rate_policy;
+#ifdef HAVE_GLPK_H
+extern struct hypervisor_policy lp_policy;
+extern struct hypervisor_policy lp2_policy;
+#endif
+
+
+/* Table searched by _find_hypervisor_policy_from_name(). */
+static struct hypervisor_policy *predefined_policies[] = {
+        &idle_policy,
+		&app_driven_policy,
+#ifdef HAVE_GLPK_H
+		&lp_policy,
+		&lp2_policy,
+#endif
+		&gflops_rate_policy
+};
+
+/* Install 'policy' as the active resizing policy by copying its name and
+   event-handler function pointers into the global hypervisor state.  Handlers
+   left NULL by the policy are simply never invoked by the notify_* hooks. */
+static void _load_hypervisor_policy(struct hypervisor_policy *policy)
+{
+	STARPU_ASSERT(policy);
+
+	hypervisor.policy.name = policy->name;
+	hypervisor.policy.size_ctxs = policy->size_ctxs;
+	hypervisor.policy.handle_poped_task = policy->handle_poped_task;
+	hypervisor.policy.handle_pushed_task = policy->handle_pushed_task;
+	hypervisor.policy.handle_idle_cycle = policy->handle_idle_cycle;
+	hypervisor.policy.handle_idle_end = policy->handle_idle_end;
+	hypervisor.policy.handle_post_exec_hook = policy->handle_post_exec_hook;
+	hypervisor.policy.handle_submitted_job = policy->handle_submitted_job;
+}
+
+
+/* Look 'policy_name' up in the predefined_policies table.
+   Returns the matching policy, or NULL (with a warning on stderr) when the
+   name is NULL or unknown. */
+static struct hypervisor_policy *_find_hypervisor_policy_from_name(const char *policy_name)
+{
+
+	if (!policy_name)
+		return NULL;
+	
+	unsigned i;
+	for (i = 0; i < sizeof(predefined_policies)/sizeof(predefined_policies[0]); i++)
+	{
+		struct hypervisor_policy *p;
+		p = predefined_policies[i];
+		if (p->name)
+		{
+			if (strcmp(policy_name, p->name) == 0) {
+				/* we found a policy with the requested name */
+				return p;
+			}
+		}
+	}
+	fprintf(stderr, "Warning: hypervisor policy \"%s\" was not found, try \"help\" to get a list\n", policy_name);
+	
+	/* nothing was found */
+	return NULL;
+}
+
+/* Resolve which policy to load, in decreasing priority:
+   1. a custom policy supplied by the application (returned as-is);
+   2. a predefined policy named by the application;
+   3. a predefined policy named via the HYPERVISOR_POLICY environment variable;
+   4. the idle policy as the default fallback. */
+static struct hypervisor_policy *_select_hypervisor_policy(struct hypervisor_policy* hypervisor_policy)
+{
+	struct hypervisor_policy *selected_policy = NULL;
+	
+	if(hypervisor_policy && hypervisor_policy->custom)
+		return hypervisor_policy;
+	
+	/* we look if the application specified the name of a policy to load */
+	const char *policy_name;
+	if (hypervisor_policy && hypervisor_policy->name)
+	{
+		policy_name = hypervisor_policy->name;
+	}
+	else 
+	{
+		policy_name = getenv("HYPERVISOR_POLICY");
+	}
+	
+	if (policy_name)
+		selected_policy = _find_hypervisor_policy_from_name(policy_name);
+	
+	/* Perhaps there was no policy that matched the name */
+	if (selected_policy)
+		return selected_policy;
+	
+	/* If no policy was specified, we use the idle policy as a default */
+	
+	return &idle_policy;
+}
+
+
+/* Initializes the hypervisor state and the performance counters that StarPU
+   will use to retrieve hints for resizing.  Resets the per-context and
+   per-worker bookkeeping, loads the selected policy, allocates the callback
+   table and registers the hypervisor with StarPU.  Returns the callback table
+   (freed by sched_ctx_hypervisor_shutdown). */
+struct starpu_performance_counters* sched_ctx_hypervisor_init(struct hypervisor_policy *hypervisor_policy)
+{
+	hypervisor.min_tasks = 0;
+	hypervisor.nsched_ctxs = 0;
+	pthread_mutex_init(&act_hypervisor_mutex, NULL);
+	
+	int i;
+	for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
+	{
+		/* STARPU_NMAX_SCHED_CTXS is used as the "empty slot" marker */
+		hypervisor.resize[i] = 0;
+		hypervisor.configurations[i] = NULL;
+		hypervisor.sr = NULL;
+		hypervisor.check_min_tasks[i] = 1;
+		hypervisor.sched_ctxs[i] = STARPU_NMAX_SCHED_CTXS;
+		hypervisor.sched_ctx_w[i].sched_ctx = STARPU_NMAX_SCHED_CTXS;
+		hypervisor.sched_ctx_w[i].config = NULL;
+		hypervisor.sched_ctx_w[i].total_flops = 0.0;
+		hypervisor.sched_ctx_w[i].submitted_flops = 0.0;
+		hypervisor.sched_ctx_w[i].remaining_flops = 0.0;
+		hypervisor.sched_ctx_w[i].start_time = 0.0;
+		hypervisor.sched_ctx_w[i].resize_ack.receiver_sched_ctx = -1;
+		hypervisor.sched_ctx_w[i].resize_ack.moved_workers = NULL;
+		hypervisor.sched_ctx_w[i].resize_ack.nmoved_workers = 0;
+		hypervisor.sched_ctx_w[i].resize_ack.acked_workers = NULL;
+		pthread_mutex_init(&hypervisor.sched_ctx_w[i].mutex, NULL);
+		int j;
+		for(j = 0; j < STARPU_NMAXWORKERS; j++)
+		{
+			hypervisor.sched_ctx_w[i].current_idle_time[j] = 0.0;
+			hypervisor.sched_ctx_w[i].pushed_tasks[j] = 0;
+			hypervisor.sched_ctx_w[i].poped_tasks[j] = 0;
+			hypervisor.sched_ctx_w[i].elapsed_flops[j] = 0.0;
+			hypervisor.sched_ctx_w[i].total_elapsed_flops[j] = 0.0;
+			hypervisor.sched_ctx_w[i].worker_to_be_removed[j] = 0;
+		}
+	}
+
+	struct hypervisor_policy *selected_hypervisor_policy = _select_hypervisor_policy(hypervisor_policy);
+	_load_hypervisor_policy(selected_hypervisor_policy);
+
+	/* hook table returned to (and later invoked by) StarPU */
+	perf_counters = (struct starpu_performance_counters*)malloc(sizeof(struct starpu_performance_counters));
+	perf_counters->notify_idle_cycle = notify_idle_cycle;
+	perf_counters->notify_pushed_task = notify_pushed_task;
+	perf_counters->notify_poped_task = notify_poped_task;
+	perf_counters->notify_post_exec_hook = notify_post_exec_hook;
+	perf_counters->notify_idle_end = notify_idle_end;
+	perf_counters->notify_submitted_job = notify_submitted_job;
+
+	starpu_notify_hypervisor_exists();
+
+	return perf_counters;
+}
+
+/* Returns the name of the currently loaded resizing policy. */
+const char* sched_ctx_hypervisor_get_policy()
+{
+	return hypervisor.policy.name;
+}
+
+/* the user can forbid the resizing process; 'imposed_resize' records that
+   this is an explicit application decision (see notify_pushed_task and
+   _ack_resize_completed which honour it) */
+void sched_ctx_hypervisor_stop_resize(unsigned sched_ctx)
+{
+	imposed_resize = 1;
+	hypervisor.resize[sched_ctx] = 0;
+}
+
+/* the user can restart the resizing process; counterpart of
+   sched_ctx_hypervisor_stop_resize() */
+void sched_ctx_hypervisor_start_resize(unsigned sched_ctx)
+{
+	imposed_resize = 1;
+	hypervisor.resize[sched_ctx] = 1;
+}
+
+/* Tear the hypervisor down: unregister every remaining context, disconnect
+   the StarPU callbacks and release the callback table and global mutex. */
+void sched_ctx_hypervisor_shutdown(void)
+{
+	/* NOTE(review): leftover debug trace -- consider removing */
+	printf("shutdown\n");
+	int i;
+	for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
+	{
+                if(hypervisor.sched_ctxs[i] != STARPU_NMAX_SCHED_CTXS && hypervisor.nsched_ctxs > 0)
+		{
+			sched_ctx_hypervisor_stop_resize(hypervisor.sched_ctxs[i]);
+			sched_ctx_hypervisor_unregister_ctx(hypervisor.sched_ctxs[i]);
+			pthread_mutex_destroy(&hypervisor.sched_ctx_w[i].mutex);
+		}
+	}
+	/* detach the hooks before freeing the table; NOTE(review):
+	   notify_submitted_job is not cleared here, unlike the others */
+	perf_counters->notify_idle_cycle = NULL;
+	perf_counters->notify_pushed_task = NULL;
+	perf_counters->notify_poped_task = NULL;
+	perf_counters->notify_post_exec_hook = NULL;
+	perf_counters->notify_idle_end = NULL;
+
+	free(perf_counters);
+	perf_counters = NULL;
+
+	pthread_mutex_destroy(&act_hypervisor_mutex);
+}
+
+/* the hypervisor is in charge only of the contexts registered to it;
+   'total_flops' is the amount of work the context is expected to execute,
+   used by flops-based policies to track remaining work.  Resizing starts
+   disabled except for the app_driven policy. */
+void sched_ctx_hypervisor_register_ctx(unsigned sched_ctx, double total_flops)
+{
+	pthread_mutex_lock(&act_hypervisor_mutex);
+	hypervisor.configurations[sched_ctx] = NULL;
+	hypervisor.resize_requests[sched_ctx] = NULL;
+	pthread_mutex_init(&hypervisor.conf_mut[sched_ctx], NULL);
+	pthread_mutex_init(&hypervisor.resize_mut[sched_ctx], NULL);
+
+	_add_config(sched_ctx);
+	hypervisor.sched_ctx_w[sched_ctx].sched_ctx = sched_ctx;
+	hypervisor.sched_ctxs[hypervisor.nsched_ctxs++] = sched_ctx;
+
+	hypervisor.sched_ctx_w[sched_ctx].total_flops = total_flops;
+	hypervisor.sched_ctx_w[sched_ctx].remaining_flops = total_flops;
+	if(strcmp(hypervisor.policy.name, "app_driven") == 0)
+		hypervisor.resize[sched_ctx] = 1;
+	pthread_mutex_unlock(&act_hypervisor_mutex);
+}
+
+/* Returns the index of the first empty slot (marked STARPU_NMAX_SCHED_CTXS)
+   in 'sched_ctxs', or STARPU_NMAX_SCHED_CTXS when the array is full. */
+static int _get_first_free_sched_ctx(int *sched_ctxs, unsigned nsched_ctxs)
+{
+	int i;
+	for(i = 0; i < nsched_ctxs; i++)
+		if(sched_ctxs[i] == STARPU_NMAX_SCHED_CTXS)
+			return i;
+	
+	return STARPU_NMAX_SCHED_CTXS;
+}
+
+/* rearrange the array of sched_ctxs in order not to have {MAXVAL, MAXVAL, 5, MAXVAL, 7}
+   and have instead {5, 7, MAXVAL, MAXVAL, MAXVAL}
+   it is easier afterwards to iterate over the array
+   (compacts valid entries towards the front, each one moving to the first
+   free slot, preserving relative order)
+*/
+static void _rearange_sched_ctxs(int *sched_ctxs, int old_nsched_ctxs)
+{
+	int first_free_id = STARPU_NMAX_SCHED_CTXS;
+	int i;
+	for(i = 0; i < old_nsched_ctxs; i++)
+	{
+		if(sched_ctxs[i] != STARPU_NMAX_SCHED_CTXS)
+		{
+			first_free_id = _get_first_free_sched_ctx(sched_ctxs, old_nsched_ctxs);
+			if(first_free_id != STARPU_NMAX_SCHED_CTXS)
+			{
+				sched_ctxs[first_free_id] = sched_ctxs[i];
+				sched_ctxs[i] = STARPU_NMAX_SCHED_CTXS;
+			}
+		}
+	}
+}
+
+/* unregistered contexts will no longer be resized: remove 'sched_ctx' from
+   the registered set, compact the array and free its configuration and
+   per-context mutexes.  When only one context remains, resizing is stopped
+   for it (there is nobody left to trade workers with). */
+void sched_ctx_hypervisor_unregister_ctx(unsigned sched_ctx)
+{
+	pthread_mutex_lock(&act_hypervisor_mutex);
+	unsigned i;
+	for(i = 0; i < hypervisor.nsched_ctxs; i++)
+	{
+		if(hypervisor.sched_ctxs[i] == sched_ctx)
+		{
+			hypervisor.sched_ctxs[i] = STARPU_NMAX_SCHED_CTXS;
+			break;
+		}
+	}
+
+	_rearange_sched_ctxs(hypervisor.sched_ctxs, hypervisor.nsched_ctxs);
+	hypervisor.nsched_ctxs--;
+	hypervisor.sched_ctx_w[sched_ctx].sched_ctx = STARPU_NMAX_SCHED_CTXS;
+	_remove_config(sched_ctx);
+	
+/* 	free(hypervisor.configurations[sched_ctx]); */
+/* 	free(hypervisor.resize_requests[sched_ctx]); */
+	pthread_mutex_destroy(&hypervisor.conf_mut[sched_ctx]);
+	pthread_mutex_destroy(&hypervisor.resize_mut[sched_ctx]);
+	if(hypervisor.nsched_ctxs == 1)
+		sched_ctx_hypervisor_stop_resize(hypervisor.sched_ctxs[0]);
+
+	pthread_mutex_unlock(&act_hypervisor_mutex);
+}
+
+/* Sum the per-worker task counters of a STARPU_NMAXWORKERS-sized array. */
+static int get_ntasks( int *tasks)
+{
+	int ntasks = 0;
+	int j;
+	for(j = 0; j < STARPU_NMAXWORKERS; j++)
+	{
+		ntasks += tasks[j];
+	}
+	return ntasks;
+}
+
+
+/* Filter 'workers' (length 'nworkers') keeping only CPU workers; the result
+   is written to 'cpus' and its length to '*ncpus'.  'cpus' must be at least
+   'nworkers' entries long. */
+static void _get_cpus(int *workers, int nworkers, int *cpus, int *ncpus)
+{
+	int i, worker;
+	*ncpus = 0;
+
+	for(i = 0; i < nworkers; i++)
+	{
+		worker = workers[i];
+		enum starpu_archtype arch = starpu_worker_get_type(worker);
+		if(arch == STARPU_CPU_WORKER)
+			cpus[(*ncpus)++] = worker;
+	}
+}
+
+/* Count the workers of 'sched_ctx' whose architecture matches 'arch'
+   (STARPU_ANY_WORKER counts them all).
+   NOTE(review): init_cursor is called but there is no matching deinit_cursor
+   here -- confirm whether the worker collection requires one. */
+int get_nworkers_ctx(unsigned sched_ctx, enum starpu_archtype arch)
+{
+	int nworkers_ctx = 0;
+	struct worker_collection *workers = starpu_get_worker_collection_of_sched_ctx(sched_ctx);
+	int worker;
+
+	if(workers->init_cursor)
+		workers->init_cursor(workers);
+
+	while(workers->has_next(workers))
+	{
+		worker = workers->get_next(workers);
+		enum starpu_archtype curr_arch = starpu_worker_get_type(worker);
+		if(curr_arch == arch || arch == STARPU_ANY_WORKER)
+			nworkers_ctx++;
+	}
+	return nworkers_ctx;
+}
+
+/* actually move the workers: the cpus are moved, gpus are only shared  */
+/* forbids another resize request before this one is taken into account:
+   with 'now' == 0 the workers are only added to the receiver and a
+   resize_ack is recorded on the sender; removal from the sender happens in
+   _ack_resize_completed() once every moved worker has acknowledged */
+void sched_ctx_hypervisor_move_workers(unsigned sender_sched_ctx, unsigned receiver_sched_ctx, int* workers_to_move, unsigned nworkers_to_move, unsigned now)
+{
+	if(nworkers_to_move > 0 && hypervisor.resize[sender_sched_ctx])// && hypervisor.resize[receiver_sched_ctx])
+	{
+/* 		int j; */
+/* 		printf("resize ctx %d with", sender_sched_ctx); */
+/* 		for(j = 0; j < nworkers_to_move; j++) */
+/* 			printf(" %d", workers_to_move[j]); */
+/* 		printf("\n"); */
+		
+		/* NOTE(review): 'cpus' is computed but the early-removal path using
+		   it is commented out below, so it is currently leaked/unused */
+		int *cpus = (int*) malloc(nworkers_to_move * sizeof(int));
+		int ncpus;
+		
+		_get_cpus(workers_to_move, nworkers_to_move, cpus, &ncpus);
+		
+//		if(ncpus != 0)
+//			starpu_remove_workers_from_sched_ctx(cpus, ncpus, sender_sched_ctx);
+
+		starpu_add_workers_to_sched_ctx(workers_to_move, nworkers_to_move, receiver_sched_ctx);
+
+		if(now)
+		{
+/* 			int j; */
+/* 			printf("remove from ctx %d:", sender_sched_ctx); */
+/* 			for(j = 0; j < nworkers_to_move; j++) */
+/* 				printf(" %d", workers_to_move[j]); */
+/* 			printf("\n"); */
+			
+			starpu_remove_workers_from_sched_ctx(workers_to_move, nworkers_to_move, sender_sched_ctx);
+		}
+		else
+		{
+			/* record the pending move; trylock: if the sender is busy being
+			   acked we simply skip recording (no ack will be awaited) */
+			int ret = pthread_mutex_trylock(&hypervisor.sched_ctx_w[sender_sched_ctx].mutex);	
+			if(ret != EBUSY)
+			{
+				hypervisor.sched_ctx_w[sender_sched_ctx].resize_ack.receiver_sched_ctx = receiver_sched_ctx;
+				hypervisor.sched_ctx_w[sender_sched_ctx].resize_ack.moved_workers = (int*)malloc(nworkers_to_move * sizeof(int));
+				hypervisor.sched_ctx_w[sender_sched_ctx].resize_ack.nmoved_workers = nworkers_to_move;
+				hypervisor.sched_ctx_w[sender_sched_ctx].resize_ack.acked_workers = (int*)malloc(nworkers_to_move * sizeof(int));
+				
+				
+				int i;
+				for(i = 0; i < nworkers_to_move; i++)
+				{
+					hypervisor.sched_ctx_w[sender_sched_ctx].current_idle_time[workers_to_move[i]] = 0.0;
+					hypervisor.sched_ctx_w[sender_sched_ctx].resize_ack.moved_workers[i] = workers_to_move[i];	
+					hypervisor.sched_ctx_w[sender_sched_ctx].resize_ack.acked_workers[i] = 0;	
+				}
+				
+				hypervisor.resize[sender_sched_ctx] = 0;
+				
+				pthread_mutex_unlock(&hypervisor.sched_ctx_w[sender_sched_ctx].mutex);
+			}
+		}
+		/* newly arrived workers inherit the receiver's new_workers_max_idle
+		   unless a specific max_idle was already configured for them */
+		struct policy_config *new_config = sched_ctx_hypervisor_get_config(receiver_sched_ctx);
+		int i;
+		for(i = 0; i < nworkers_to_move; i++)
+			new_config->max_idle[workers_to_move[i]] = new_config->max_idle[workers_to_move[i]] !=MAX_IDLE_TIME ? new_config->max_idle[workers_to_move[i]] :  new_config->new_workers_max_idle;
+		
+	}
+	return;
+}
+
+/* Add 'workers_to_add' to 'sched_ctx' (no sender context involved), and give
+   the new workers the context's new_workers_max_idle threshold unless a
+   specific max_idle is already configured for them.  No-op when the context
+   is not allowed to resize. */
+void sched_ctx_hypervisor_add_workers_to_sched_ctx(int* workers_to_add, unsigned nworkers_to_add, unsigned sched_ctx)
+{
+	if(nworkers_to_add > 0 && hypervisor.resize[sched_ctx])
+	{
+/* 		int j; */
+/* 		printf("add to ctx %d:", sched_ctx); */
+/* 		for(j = 0; j < nworkers_to_add; j++) */
+/* 			printf(" %d", workers_to_add[j]); */
+/* 		printf("\n"); */
+		starpu_add_workers_to_sched_ctx(workers_to_add, nworkers_to_add, sched_ctx);
+		struct policy_config *new_config = sched_ctx_hypervisor_get_config(sched_ctx);
+		int i;
+		for(i = 0; i < nworkers_to_add; i++)
+			new_config->max_idle[workers_to_add[i]] = new_config->max_idle[workers_to_add[i]] != MAX_IDLE_TIME ? new_config->max_idle[workers_to_add[i]] :  new_config->new_workers_max_idle;
+		
+	}
+	return;
+}
+
+/* Returns non-zero when 'sched_ctx' is currently allowed to be resized. */
+unsigned sched_ctx_hypervisor_can_resize(unsigned sched_ctx)
+{
+	return hypervisor.resize[sched_ctx];
+}
+
+/* Remove 'workers_to_remove' from 'sched_ctx'.  With 'now' != 0 the removal
+   is immediate; otherwise a resize_ack (receiver_sched_ctx == -1, meaning
+   "remove only") is recorded and the actual removal is deferred to
+   _ack_resize_completed() once every concerned worker has acknowledged. */
+void sched_ctx_hypervisor_remove_workers_from_sched_ctx(int* workers_to_remove, unsigned nworkers_to_remove, unsigned sched_ctx, unsigned now)
+{
+	if(nworkers_to_remove > 0 && hypervisor.resize[sched_ctx])
+	{
+		int nworkers=0;
+		int workers[nworkers_to_remove];
+	
+		if(now)
+		{
+/* 				int j; */
+/* 				printf("remove from ctx %d:", sched_ctx); */
+/* 				for(j = 0; j < nworkers_to_remove; j++) */
+/* 					printf(" %d", workers_to_remove[j]); */
+/* 				printf("\n"); */
+				
+				starpu_remove_workers_from_sched_ctx(workers_to_remove, nworkers_to_remove, sched_ctx);
+		}
+		else
+		{
+			/* trylock: skip recording if the context is busy being acked */
+			int ret = pthread_mutex_trylock(&hypervisor.sched_ctx_w[sched_ctx].mutex);	
+			if(ret != EBUSY)
+			{
+				
+				/* only workers actually belonging to the context are tracked */
+				int i;
+				for(i = 0; i < nworkers_to_remove; i++)
+					if(starpu_worker_belongs_to_sched_ctx(workers_to_remove[i], sched_ctx))
+						workers[nworkers++] = workers_to_remove[i];
+				
+				hypervisor.sched_ctx_w[sched_ctx].resize_ack.receiver_sched_ctx = -1;
+				hypervisor.sched_ctx_w[sched_ctx].resize_ack.moved_workers = (int*)malloc(nworkers_to_remove * sizeof(int));
+				hypervisor.sched_ctx_w[sched_ctx].resize_ack.nmoved_workers = nworkers;
+				hypervisor.sched_ctx_w[sched_ctx].resize_ack.acked_workers = (int*)malloc(nworkers_to_remove * sizeof(int));
+				
+				
+				for(i = 0; i < nworkers; i++)
+				{
+					hypervisor.sched_ctx_w[sched_ctx].current_idle_time[workers[i]] = 0.0;
+					hypervisor.sched_ctx_w[sched_ctx].resize_ack.moved_workers[i] = workers[i];	
+					hypervisor.sched_ctx_w[sched_ctx].resize_ack.acked_workers[i] = 0;	
+				}
+
+				hypervisor.resize[sched_ctx] = 0;
+				pthread_mutex_unlock(&hypervisor.sched_ctx_w[sched_ctx].mutex);
+			}
+		}
+ 	}
+	return;
+}
+
+/* Set every per-worker elapsed-flops counter of 'sched_ctx' to 'val'
+   (used with 0.0 to restart measurement after a resize). */
+static void _set_elapsed_flops_per_sched_ctx(unsigned sched_ctx, double val)
+{
+	int i;
+	for(i = 0; i < STARPU_NMAXWORKERS; i++)
+		hypervisor.sched_ctx_w[sched_ctx].elapsed_flops[i] = val;
+}
+
+/* Sum of the per-worker flops executed since the last reset
+   (_set_elapsed_flops_per_sched_ctx). */
+double sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(struct sched_ctx_wrapper* sc_w)
+{
+	double ret_val = 0.0;
+	int i;
+	for(i = 0; i < STARPU_NMAXWORKERS; i++)
+		ret_val += sc_w->elapsed_flops[i];
+	return ret_val;
+}
+
+/* Sum of the per-worker flops executed since the context was registered
+   (never reset, unlike elapsed_flops). */
+double sched_ctx_hypervisor_get_total_elapsed_flops_per_sched_ctx(struct sched_ctx_wrapper* sc_w)
+{
+	double ret_val = 0.0;
+	int i;
+	for(i = 0; i < STARPU_NMAXWORKERS; i++)
+		ret_val += sc_w->total_elapsed_flops[i];
+	return ret_val;
+}
+
+/* Acknowledge, on behalf of 'worker' (now active in 'sched_ctx'), a pending
+   deferred resize recorded by sched_ctx_hypervisor_move_workers() or
+   sched_ctx_hypervisor_remove_workers_from_sched_ctx().  Searches every
+   context for a resize_ack that concerns this worker; once all moved workers
+   have acknowledged, the workers are actually removed from the sender,
+   flops bookkeeping is restarted and resizing is re-enabled.
+   Returns 1 when no ack was pending or the resize just completed, 0 otherwise. */
+static unsigned _ack_resize_completed(unsigned sched_ctx, int worker)
+{
+	if(worker != -1 && !starpu_worker_belongs_to_sched_ctx(worker, sched_ctx))
+		return 0;
+
+	struct resize_ack *resize_ack = NULL;
+	unsigned sender_sched_ctx = STARPU_NMAX_SCHED_CTXS;
+
+	/* find the context (if any) waiting for this worker's ack: either a
+	   "remove only" request (receiver == -1) in another context the worker
+	   still belongs to, or a move whose receiver is 'sched_ctx' */
+	int i;
+	for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
+	{
+		if(hypervisor.sched_ctxs[i] != STARPU_NMAX_SCHED_CTXS)
+		{
+			struct sched_ctx_wrapper *sc_w = &hypervisor.sched_ctx_w[hypervisor.sched_ctxs[i]];
+			pthread_mutex_lock(&sc_w->mutex);
+			unsigned only_remove = 0;
+			if(sc_w->resize_ack.receiver_sched_ctx == -1 && hypervisor.sched_ctxs[i] != sched_ctx && 
+			   sc_w->resize_ack.nmoved_workers > 0 && starpu_worker_belongs_to_sched_ctx(worker, hypervisor.sched_ctxs[i]))
+			{
+				int j;
+				for(j = 0; j < sc_w->resize_ack.nmoved_workers; j++)
+					if(sc_w->resize_ack.moved_workers[j] == worker)
+					{
+						only_remove = 1;
+						break;
+					}
+			}
+			if(only_remove || 
+			   (sc_w->resize_ack.receiver_sched_ctx != -1 && sc_w->resize_ack.receiver_sched_ctx == sched_ctx))
+			{
+				resize_ack = &sc_w->resize_ack;
+				sender_sched_ctx = hypervisor.sched_ctxs[i];
+				pthread_mutex_unlock(&sc_w->mutex);
+				break;
+			}
+			pthread_mutex_unlock(&sc_w->mutex);
+		}
+	}
+
+	/* if there is no ctx waiting for its ack return 1*/
+	if(resize_ack == NULL)
+		return 1;
+	
+	int ret = pthread_mutex_trylock(&hypervisor.sched_ctx_w[sender_sched_ctx].mutex);
+	if(ret != EBUSY)
+	{
+		int *moved_workers = resize_ack->moved_workers;
+		int nmoved_workers = resize_ack->nmoved_workers;
+		int *acked_workers = resize_ack->acked_workers;
+
+		/* mark this worker's ack */
+		if(worker != -1)
+		{
+			for(i = 0; i < nmoved_workers; i++)
+			{
+				int moved_worker = moved_workers[i];
+				if(moved_worker == worker && acked_workers[i] == 0)
+				{
+					acked_workers[i] = 1;
+				}
+			}
+		}
+			
+		int nacked_workers = 0;
+		for(i = 0; i < nmoved_workers; i++)
+		{
+			nacked_workers += (acked_workers[i] == 1);
+		}
+		
+		unsigned resize_completed = (nacked_workers == nmoved_workers);
+		int receiver_sched_ctx = sched_ctx;
+		if(resize_completed)
+		{
+			/* if the permission to resize is not allowed by the user don't do it
+			   whatever the application says */
+			if(!((hypervisor.resize[sender_sched_ctx] == 0 || hypervisor.resize[receiver_sched_ctx] == 0) && imposed_resize))
+			{				
+/* 				int j; */
+/* 				printf("remove from ctx %d:", sender_sched_ctx); */
+/* 				for(j = 0; j < nmoved_workers; j++) */
+/* 					printf(" %d", moved_workers[j]); */
+/* 				printf("\n"); */
+				
+				starpu_remove_workers_from_sched_ctx(moved_workers, nmoved_workers, sender_sched_ctx);
+				
+				/* info concerning only the gflops_rate strategy */
+				struct sched_ctx_wrapper *sender_sc_w = &hypervisor.sched_ctx_w[sender_sched_ctx];
+				struct sched_ctx_wrapper *receiver_sc_w = &hypervisor.sched_ctx_w[receiver_sched_ctx];
+				
+				double start_time =  starpu_timing_now();
+				sender_sc_w->start_time = start_time;
+				sender_sc_w->remaining_flops = sender_sc_w->remaining_flops - sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(sender_sc_w);
+				_set_elapsed_flops_per_sched_ctx(sender_sched_ctx, 0.0);
+				
+				receiver_sc_w->start_time = start_time;
+				receiver_sc_w->remaining_flops = receiver_sc_w->remaining_flops - sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(receiver_sc_w);
+				_set_elapsed_flops_per_sched_ctx(receiver_sched_ctx, 0.0);
+				
+				hypervisor.resize[sender_sched_ctx] = 1;
+//				hypervisor.resize[receiver_sched_ctx] = 1;
+				/* if the user allowed resizing leave the decisions to the application */
+				if(imposed_resize)  imposed_resize = 0;
+				
+				resize_ack->receiver_sched_ctx = -1;
+				resize_ack->nmoved_workers = 0;
+				free(resize_ack->moved_workers);
+				free(resize_ack->acked_workers);
+
+			}
+			pthread_mutex_unlock(&hypervisor.sched_ctx_w[sender_sched_ctx].mutex);
+			return resize_completed;
+		}
+		pthread_mutex_unlock(&hypervisor.sched_ctx_w[sender_sched_ctx].mutex);
+	}
+	return 0;
+}
+
+/* Enqueue a resize request for 'sched_ctx', to be executed when the
+ * 'task_tag' tasks of 'sched_ctx' complete (checked by
+ * notify_post_exec_hook, which then runs the policy's post-exec handler). */
+void sched_ctx_hypervisor_resize(unsigned sched_ctx, int task_tag)
+{
+	struct resize_request_entry *entry;
+
+	entry = malloc(sizeof *entry);
+	STARPU_ASSERT(entry != NULL);
+
+	entry->sched_ctx = sched_ctx;
+	entry->task_tag = task_tag;
+
+	pthread_mutex_lock(&hypervisor.resize_mut[sched_ctx]);
+	HASH_ADD_INT(hypervisor.resize_requests[sched_ctx], task_tag, entry);
+	pthread_mutex_unlock(&hypervisor.resize_mut[sched_ctx]);
+}
+
+/* notifies the hypervisor that the worker is no longer idle and a new task
+   was pushed on its queue: resets the worker's accumulated idle time and
+   forwards the event to the policy (if it installed a handler) */
+static void notify_idle_end(unsigned sched_ctx, int worker)
+{
+	if(hypervisor.resize[sched_ctx])
+		hypervisor.sched_ctx_w[sched_ctx].current_idle_time[worker] = 0.0;
+	
+	if(hypervisor.policy.handle_idle_end)
+		hypervisor.policy.handle_idle_end(sched_ctx, worker);
+		
+}
+
+/* notifies the hypervisor that the worker spent another cycle in idle time:
+   accumulates 'idle_time' for the worker and lets the policy react
+   (only while resizing is enabled for the context) */
+static void notify_idle_cycle(unsigned sched_ctx, int worker, double idle_time)
+{
+	if(hypervisor.resize[sched_ctx])
+	{
+		struct sched_ctx_wrapper *sc_w = &hypervisor.sched_ctx_w[sched_ctx];
+		sc_w->current_idle_time[worker] += idle_time;
+		if(hypervisor.policy.handle_idle_cycle)
+		{
+			hypervisor.policy.handle_idle_cycle(sched_ctx, worker);
+		}
+	}		
+	return;
+}
+
+/* notifies the hypervisor that a new task was pushed on the queue of the
+   worker: records the context start time on the first push, and enables
+   resizing once 'min_tasks' tasks have been pushed (unless the application
+   explicitly forbade it via sched_ctx_hypervisor_stop_resize) */
+static void notify_pushed_task(unsigned sched_ctx, int worker)
+{	
+	hypervisor.sched_ctx_w[sched_ctx].pushed_tasks[worker]++;
+	if(hypervisor.sched_ctx_w[sched_ctx].total_flops != 0.0 && hypervisor.sched_ctx_w[sched_ctx].start_time == 0.0)
+		hypervisor.sched_ctx_w[sched_ctx].start_time = starpu_timing_now();
+	
+	int ntasks = get_ntasks(hypervisor.sched_ctx_w[sched_ctx].pushed_tasks);
+	
+	if((hypervisor.min_tasks == 0 || (!(hypervisor.resize[sched_ctx] == 0 && imposed_resize) && ntasks == hypervisor.min_tasks)) && hypervisor.check_min_tasks[sched_ctx])
+	{
+		hypervisor.resize[sched_ctx] = 1;
+		if(imposed_resize) imposed_resize = 0;
+		hypervisor.check_min_tasks[sched_ctx] = 0;
+	}
+
+	if(hypervisor.policy.handle_pushed_task)
+		hypervisor.policy.handle_pushed_task(sched_ctx, worker);
+}
+
+/* notifies the hypervisor that a task was popped from the queue of the
+   worker: updates the flops accounting, lets the policy react, and tries to
+   acknowledge any pending deferred resize involving this worker */
+static void notify_poped_task(unsigned sched_ctx, int worker, double elapsed_flops)
+{
+	hypervisor.sched_ctx_w[sched_ctx].poped_tasks[worker]++;
+	hypervisor.sched_ctx_w[sched_ctx].elapsed_flops[worker] += elapsed_flops;
+	hypervisor.sched_ctx_w[sched_ctx].total_elapsed_flops[worker] += elapsed_flops;
+	hypervisor.sched_ctx_w[sched_ctx].remaining_flops -= elapsed_flops; //sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(&hypervisor.sched_ctx_w[sched_ctx]);
+
+	if(hypervisor.resize[sched_ctx])
+	{
+		if(hypervisor.policy.handle_poped_task)
+			hypervisor.policy.handle_poped_task(sched_ctx, worker);
+	}
+	_ack_resize_completed(sched_ctx, worker);
+}
+
+/* notifies the hypervisor that a tagged task has just been executed:
+   applies any configuration deferred to this tag (saved by
+   sched_ctx_hypervisor_ioctl) in every registered context, then runs the
+   policy's post-exec handler if a resize request (saved by
+   sched_ctx_hypervisor_resize) was pending on this tag */
+static void notify_post_exec_hook(unsigned sched_ctx, int task_tag)
+{
+	STARPU_ASSERT(task_tag > 0);
+
+	unsigned conf_sched_ctx;
+	int i;
+	/* snapshot the number of contexts under the global lock */
+	pthread_mutex_lock(&act_hypervisor_mutex);
+	unsigned ns = hypervisor.nsched_ctxs;
+	pthread_mutex_unlock(&act_hypervisor_mutex);
+	
+	for(i = 0; i < ns; i++)
+	{
+		struct configuration_entry *entry;
+
+		conf_sched_ctx = hypervisor.sched_ctxs[i];
+		pthread_mutex_lock(&hypervisor.conf_mut[conf_sched_ctx]);
+
+		HASH_FIND_INT(hypervisor.configurations[conf_sched_ctx], &task_tag, entry);
+
+		if (entry != NULL)
+		{
+			struct policy_config *config = entry->configuration;
+
+			sched_ctx_hypervisor_set_config(conf_sched_ctx, config);
+			HASH_DEL(hypervisor.configurations[conf_sched_ctx], entry);
+			/* NOTE(review): 'config' is freed but 'entry' itself is not --
+			   confirm whether this leaks the hash entry */
+			free(config);
+		}
+		pthread_mutex_unlock(&hypervisor.conf_mut[conf_sched_ctx]);
+	}	
+		
+	if(hypervisor.resize[sched_ctx])
+	{
+		pthread_mutex_lock(&hypervisor.resize_mut[sched_ctx]);
+
+		if(hypervisor.policy.handle_post_exec_hook)
+		{
+			/* Check whether 'task_tag' is in the 'resize_requests' set.  */
+			struct resize_request_entry *entry;
+			HASH_FIND_INT(hypervisor.resize_requests[sched_ctx], &task_tag, entry);
+			if (entry != NULL)
+			{
+				hypervisor.policy.handle_post_exec_hook(sched_ctx,
+									task_tag);
+				HASH_DEL(hypervisor.resize_requests[sched_ctx], entry);
+				free(entry);
+			}
+
+		}
+		pthread_mutex_unlock(&hypervisor.resize_mut[sched_ctx]);
+	}
+	return;
+}
+
+/* notifies the hypervisor that a job was submitted to the task's context:
+   accumulates its flops under the global lock and forwards the event (with
+   the job's footprint) to the policy */
+static void notify_submitted_job(struct starpu_task *task, uint32_t footprint)
+{
+	pthread_mutex_lock(&act_hypervisor_mutex);
+	hypervisor.sched_ctx_w[task->sched_ctx].submitted_flops += task->flops;
+	pthread_mutex_unlock(&act_hypervisor_mutex);
+
+	if(hypervisor.policy.handle_submitted_job)
+		hypervisor.policy.handle_submitted_job(task, footprint);
+}
+
+/* Ask the policy to (re)distribute 'workers' among 'sched_ctxs'; passing
+   sched_ctxs == NULL means "all registered contexts".  Resizing is enabled
+   for every targeted context before the policy runs. */
+void sched_ctx_hypervisor_size_ctxs(int *sched_ctxs, int nsched_ctxs, int *workers, int nworkers)
+{
+	pthread_mutex_lock(&act_hypervisor_mutex);
+	int curr_nsched_ctxs = sched_ctxs == NULL ? hypervisor.nsched_ctxs : nsched_ctxs;
+	int *curr_sched_ctxs = sched_ctxs == NULL ? hypervisor.sched_ctxs : sched_ctxs;
+	pthread_mutex_unlock(&act_hypervisor_mutex);
+	int s;
+	for(s = 0; s < curr_nsched_ctxs; s++)
+		hypervisor.resize[curr_sched_ctxs[s]] = 1;
+
+	if(hypervisor.policy.size_ctxs)
+		hypervisor.policy.size_ctxs(curr_sched_ctxs, curr_nsched_ctxs, workers, nworkers);
+}
+
+/* Returns the hypervisor's bookkeeping wrapper for 'sched_ctx'. */
+struct sched_ctx_wrapper* sched_ctx_hypervisor_get_wrapper(unsigned sched_ctx)
+{
+	return &hypervisor.sched_ctx_w[sched_ctx];
+}
+
+/* Returns the (compacted) array of registered context ids; valid entries are
+   the first sched_ctx_hypervisor_get_nsched_ctxs() ones. */
+int* sched_ctx_hypervisor_get_sched_ctxs()
+{
+	return hypervisor.sched_ctxs;
+}
+
+/* Returns the number of contexts currently registered with the hypervisor. */
+int sched_ctx_hypervisor_get_nsched_ctxs()
+{
+	int ns;
+	ns = hypervisor.nsched_ctxs;
+	return ns;
+}
+
+/* Record a pending sizing request (arrays are referenced, not copied);
+   retrieved with sched_ctx_hypervisor_get_size_req and released with
+   sched_ctx_hypervisor_free_size_req. */
+void sched_ctx_hypervisor_save_size_req(int *sched_ctxs, int nsched_ctxs, int *workers, int nworkers)
+{
+	hypervisor.sr = (struct size_request*)malloc(sizeof(struct size_request));
+	hypervisor.sr->sched_ctxs = sched_ctxs;
+	hypervisor.sr->nsched_ctxs = nsched_ctxs;
+	hypervisor.sr->workers = workers;
+	hypervisor.sr->nworkers = nworkers;
+}
+
+/* Fetch the pending sizing request, if any, into the output parameters.
+   Returns 1 when a request was pending, 0 otherwise (outputs untouched). */
+unsigned sched_ctx_hypervisor_get_size_req(int **sched_ctxs, int* nsched_ctxs, int **workers, int *nworkers)
+{
+	if(hypervisor.sr != NULL)
+	{
+		*sched_ctxs = hypervisor.sr->sched_ctxs;
+		*nsched_ctxs = hypervisor.sr->nsched_ctxs;
+		*workers = hypervisor.sr->workers;
+		*nworkers = hypervisor.sr->nworkers;
+		return 1;
+	}
+	return 0;
+}
+
+/* Release the pending sizing request container (the arrays it references
+   belong to the caller of sched_ctx_hypervisor_save_size_req). */
+void sched_ctx_hypervisor_free_size_req(void)
+{
+	if(hypervisor.sr != NULL)
+	{
+		free(hypervisor.sr);
+		hypervisor.sr = NULL;
+	}
+}

+ 81 - 0
sched_ctx_hypervisor/src/sched_ctx_hypervisor_intern.h

@@ -0,0 +1,81 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012  INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <sched_ctx_hypervisor.h>
+#include <common/uthash.h>
+/* A deferred request to size a set of contexts over a set of workers
+   (see sched_ctx_hypervisor_save_size_req / _get_size_req / _free_size_req). */
+struct size_request {
+	int *workers;
+	int nworkers;
+	int *sched_ctxs;
+	int nsched_ctxs;
+};
+
+
+/* Entry in the resize request hash table.  */
+struct resize_request_entry {
+	/* Key: the tag of tasks concerned by this resize request.  */
+	uint32_t task_tag;
+
+	/* Value: identifier of the scheduling context needing to be resized.
+	 * The value doesn't matter since the hash table is used only to test
+	 * membership of a task tag.  */
+	unsigned sched_ctx;
+
+	/* Bookkeeping.  */
+	UT_hash_handle hh;
+};
+
+/* Entry in the deferred-configuration hash table filled by
+   sched_ctx_hypervisor_ioctl() and consumed by notify_post_exec_hook(). */
+struct configuration_entry {
+	/* Key: the tag of tasks concerned by this configuration.  */
+	uint32_t task_tag;
+
+	/* Value: configuration of the scheduling context.  */
+	struct policy_config *configuration;
+
+	/* Bookkeeping.  */
+	UT_hash_handle hh;
+};
+
+/* Global state of the hypervisor: per-context wrappers, the compacted list
+   of registered contexts, the active policy and the deferred work tables. */
+struct sched_ctx_hypervisor {
+	struct sched_ctx_wrapper sched_ctx_w[STARPU_NMAX_SCHED_CTXS];
+	int sched_ctxs[STARPU_NMAX_SCHED_CTXS];
+	unsigned nsched_ctxs;
+	unsigned resize[STARPU_NMAX_SCHED_CTXS];
+	int min_tasks;
+	struct hypervisor_policy policy;
+
+	struct configuration_entry *configurations[STARPU_NMAX_SCHED_CTXS];
+
+	/* Set of pending resize requests for any context/tag pair.  */
+	struct resize_request_entry *resize_requests[STARPU_NMAX_SCHED_CTXS];
+
+	pthread_mutex_t conf_mut[STARPU_NMAX_SCHED_CTXS];
+	pthread_mutex_t resize_mut[STARPU_NMAX_SCHED_CTXS];
+	struct size_request *sr;
+	int check_min_tasks[STARPU_NMAX_SCHED_CTXS];
+};
+
+struct sched_ctx_hypervisor_adjustment {
+	int workerids[STARPU_NMAXWORKERS];
+	int nworkers;
+};
+
+/* NOTE(review): a non-extern object definition in a header creates one
+   tentative definition per including translation unit -- consider
+   'extern' here plus a single definition in a .c file. */
+struct sched_ctx_hypervisor hypervisor;
+
+
+void _add_config(unsigned sched_ctx);
+
+void _remove_config(unsigned sched_ctx);

+ 7 - 1
src/Makefile.am

@@ -2,6 +2,7 @@
 #
 # Copyright (C) 2009-2012  Université de Bordeaux 1
 # Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+# Copyright (C) 2011  INRIA
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -62,6 +63,7 @@ noinst_HEADERS = 						\
 	core/dependencies/implicit_data_deps.h			\
 	core/progress_hook.h                                    \
 	core/sched_policy.h					\
+	core/sched_ctx.h					\
 	core/perfmodel/perfmodel.h				\
 	core/perfmodel/regression.h				\
 	core/jobs.h						\
@@ -101,6 +103,7 @@ noinst_HEADERS = 						\
 	common/utils.h						\
 	common/barrier.h					\
 	common/uthash.h						\
+	common/barrier_counter.h				\
 	drivers/driver_common/driver_common.h			\
 	drivers/cpu/driver_cpu.h				\
 	drivers/gordon/driver_gordon.h				\
@@ -122,6 +125,7 @@ noinst_HEADERS = 						\
 
 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES = 						\
 	common/barrier.c					\
+	common/barrier_counter.c				\
 	common/hash.c 						\
 	common/rwlock.c						\
 	common/starpu_spinlock.c				\
@@ -150,6 +154,7 @@ libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES = 						\
 	core/perfmodel/regression.c				\
 	core/sched_policy.c					\
 	core/simgrid.c						\
+	core/sched_ctx.c					\
 	core/priorities.c					\
 	core/parallel_task.c					\
 	sched_policies/eager_central_policy.c			\
@@ -215,7 +220,8 @@ libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES = 						\
 	top/starpu_top.c					\
 	top/starpu_top_task.c					\
 	top/starpu_top_message_queue.c				\
-	top/starpu_top_connection.c
+	top/starpu_top_connection.c                          	\
+	worker_collection/worker_list.c
 
 if STARPU_USE_CPU
 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/cpu/driver_cpu.c

+ 96 - 0
src/common/barrier_counter.c

@@ -0,0 +1,96 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <common/barrier_counter.h>
+
+int _starpu_barrier_counter_init(struct _starpu_barrier_counter *barrier_c, int count)
+{
+	_starpu_barrier_init(&barrier_c->barrier, count);
+	_STARPU_PTHREAD_COND_INIT(&barrier_c->cond2, NULL);
+	return 0;
+}
+
+int _starpu_barrier_counter_destroy(struct _starpu_barrier_counter *barrier_c)
+{
+	_starpu_barrier_destroy(&barrier_c->barrier);
+	_STARPU_PTHREAD_COND_DESTROY(&barrier_c->cond2);
+	return 0;
+}
+
+
+int _starpu_barrier_counter_wait_for_empty_counter(struct _starpu_barrier_counter *barrier_c)
+{
+	struct _starpu_barrier *barrier = &barrier_c->barrier;
+	_STARPU_PTHREAD_MUTEX_LOCK(&barrier->mutex);
+
+	while (barrier->reached_start > 0)
+		_STARPU_PTHREAD_COND_WAIT(&barrier->cond, &barrier->mutex);
+
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&barrier->mutex);
+	return 0;
+}
+
+int _starpu_barrier_counter_wait_for_full_counter(struct _starpu_barrier_counter *barrier_c)
+{
+	struct _starpu_barrier *barrier = &barrier_c->barrier;
+	_STARPU_PTHREAD_MUTEX_LOCK(&barrier->mutex);
+
+	while (barrier->reached_start < barrier->count)
+		_STARPU_PTHREAD_COND_WAIT(&barrier_c->cond2, &barrier->mutex);
+
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&barrier->mutex);
+	return 0;
+}
+
+int _starpu_barrier_counter_decrement_until_empty_counter(struct _starpu_barrier_counter *barrier_c)
+{
+	struct _starpu_barrier *barrier = &barrier_c->barrier;
+	int ret = 0;
+	_STARPU_PTHREAD_MUTEX_LOCK(&barrier->mutex);
+
+	if (--barrier->reached_start == 0)
+	{
+		ret = 1;
+		_STARPU_PTHREAD_COND_BROADCAST(&barrier->cond);
+	}
+
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&barrier->mutex);
+	return ret;
+}
+
+int _starpu_barrier_counter_increment_until_full_counter(struct _starpu_barrier_counter *barrier_c)
+{
+	struct _starpu_barrier *barrier = &barrier_c->barrier;
+	_STARPU_PTHREAD_MUTEX_LOCK(&barrier->mutex);
+	
+	if(++barrier->reached_start == barrier->count)
+		_STARPU_PTHREAD_COND_BROADCAST(&barrier_c->cond2);
+
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&barrier->mutex);
+	return 0;
+}
+
+int _starpu_barrier_counter_increment(struct _starpu_barrier_counter *barrier_c)
+{
+	struct _starpu_barrier *barrier = &barrier_c->barrier;
+	_STARPU_PTHREAD_MUTEX_LOCK(&barrier->mutex);
+
+	barrier->reached_start++;
+	
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&barrier->mutex);
+	return 0;
+}
+

+ 37 - 0
src/common/barrier_counter.h

@@ -0,0 +1,37 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <common/utils.h>
+#include <common/barrier.h>
+
+struct _starpu_barrier_counter {
+	struct _starpu_barrier barrier;
+	pthread_cond_t cond2;
+};
+
+int _starpu_barrier_counter_init(struct _starpu_barrier_counter *barrier_c, int count);
+
+int _starpu_barrier_counter_destroy(struct _starpu_barrier_counter *barrier_c);
+
+int _starpu_barrier_counter_wait_for_empty_counter(struct _starpu_barrier_counter *barrier_c);
+
+int _starpu_barrier_counter_wait_for_full_counter(struct _starpu_barrier_counter *barrier_c);
+
+int _starpu_barrier_counter_decrement_until_empty_counter(struct _starpu_barrier_counter *barrier_c);
+
+int _starpu_barrier_counter_increment_until_full_counter(struct _starpu_barrier_counter *barrier_c);
+
+int _starpu_barrier_counter_increment(struct _starpu_barrier_counter *barrier_c);
+

+ 2 - 2
src/common/fxt.h

@@ -206,10 +206,10 @@ do {									\
 	if (model_name)                                                 \
 	{								\
 		/* we include the symbol name */			\
-		_STARPU_FUT_DO_PROBE3STR(_STARPU_FUT_START_CODELET_BODY, (job), _starpu_gettid(), 1, model_name); \
+		_STARPU_FUT_DO_PROBE4STR(_STARPU_FUT_START_CODELET_BODY, (job), ((job)->task)->sched_ctx, _starpu_gettid(), 1, model_name); \
 	}								\
 	else {                                                          \
-		FUT_DO_PROBE3(_STARPU_FUT_START_CODELET_BODY, (job), _starpu_gettid(), 0); \
+		FUT_DO_PROBE4(_STARPU_FUT_START_CODELET_BODY, (job), ((job)->task)->sched_ctx, _starpu_gettid(), 0); \
 	}								\
 } while(0);
 

+ 4 - 4
src/core/dependencies/implicit_data_deps.c

@@ -328,7 +328,7 @@ void _starpu_detect_implicit_data_deps(struct starpu_task *task)
 		new_task = _starpu_detect_implicit_data_deps_with_handle(task, task, handle, mode);
 		_STARPU_PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
 		if (new_task) {
-			int ret = starpu_task_submit(new_task);
+			int ret = _starpu_task_submit_internally(new_task);
 			STARPU_ASSERT(!ret);
 		}
 	}
@@ -516,7 +516,7 @@ void _starpu_unlock_post_sync_tasks(starpu_data_handle_t handle)
 			/* There is no need to depend on that task now, since it was already unlocked */
 			_starpu_release_data_enforce_sequential_consistency(link->task, handle);
 
-			int ret = starpu_task_submit(link->task);
+			int ret = _starpu_task_submit_internally(link->task);
 			STARPU_ASSERT(!ret);
 			struct _starpu_task_wrapper_list *tmp = link;
 			link = link->next;
@@ -548,12 +548,12 @@ int _starpu_data_wait_until_available(starpu_data_handle_t handle, enum starpu_a
 		_STARPU_PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
 
 		if (new_task) {
-			int ret = starpu_task_submit(new_task);
+			int ret = _starpu_task_submit_internally(new_task);
 			STARPU_ASSERT(!ret);
 		}
 
 		/* TODO detect if this is superflous */
-		int ret = starpu_task_submit(sync_task);
+		int ret = _starpu_task_submit_internally(sync_task);
 		STARPU_ASSERT(!ret);
 		ret = starpu_task_wait(sync_task);
 		STARPU_ASSERT(ret == 0);

+ 23 - 13
src/core/jobs.c

@@ -3,6 +3,7 @@
  * Copyright (C) 2009-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
+ * Copyright (C) 2011  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -128,8 +129,9 @@ void _starpu_wait_job(struct _starpu_job *j)
 
 void _starpu_handle_job_termination(struct _starpu_job *j)
 {
+	int workerid = starpu_worker_get_id();
 	struct starpu_task *task = j->task;
-
+	unsigned sched_ctx = task->sched_ctx;
 	_STARPU_PTHREAD_MUTEX_LOCK(&j->sync_mutex);
 
 	task->status = STARPU_TASK_FINISHED;
@@ -201,8 +203,14 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 	/* If the job was executed on a combined worker there is no need for the
 	 * scheduler to process it : the task structure doesn't contain any valuable
 	 * data as it's not linked to an actual worker */
-	if (j->task_size == 1)
+	/* control task should not execute post_exec_hook */
+	if(j->task_size == 1 && task->cl != NULL && !task->control_task)
+	{
 		_starpu_sched_post_exec_hook(task);
+#ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
+	  starpu_call_poped_task_cb(workerid, task->sched_ctx, task->flops);
+#endif //STARPU_USE_SCHED_CTX_HYPERVISOR
+	}
 
 	_STARPU_TRACE_TASK_DONE(j);
 
@@ -254,6 +262,8 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 	}
 	_starpu_decrement_nsubmitted_tasks();
 	_starpu_decrement_nready_tasks();
+
+	_starpu_decrement_nsubmitted_tasks_of_sched_ctx(sched_ctx);
 }
 
 /* This function is called when a new task is submitted to StarPU
@@ -336,29 +346,29 @@ unsigned _starpu_enforce_deps_and_schedule(struct _starpu_job *j)
 	if (_starpu_not_all_tag_deps_are_fulfilled(j))
 	{
 		_STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
-                _STARPU_LOG_OUT_TAG("not_all_tag_deps_are_fulfilled");
+		_STARPU_LOG_OUT_TAG("not_all_tag_deps_are_fulfilled");
 		return 0;
-        }
-
+	}
+	
 	/* enfore task dependencies */
 	if (_starpu_not_all_task_deps_are_fulfilled(j))
 	{
 		_STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
-                _STARPU_LOG_OUT_TAG("not_all_task_deps_are_fulfilled");
+		_STARPU_LOG_OUT_TAG("not_all_task_deps_are_fulfilled");
 		return 0;
-        }
+	}
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
 
 	/* enforce data dependencies */
 	if (_starpu_submit_job_enforce_data_deps(j))
 	{
-                _STARPU_LOG_OUT_TAG("enforce_data_deps");
+		_STARPU_LOG_OUT_TAG("enforce_data_deps");
 		return 0;
-        }
+	}
 
 	ret = _starpu_push_task(j);
 
-        _STARPU_LOG_OUT();
+	_STARPU_LOG_OUT();
 	return ret;
 }
 
@@ -402,15 +412,15 @@ int _starpu_push_local_task(struct _starpu_worker *worker, struct starpu_task *t
 	if (STARPU_UNLIKELY(!(worker->worker_mask & task->cl->where)))
 		return -ENODEV;
 
-	_STARPU_PTHREAD_MUTEX_LOCK(worker->sched_mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&worker->sched_mutex);
 
 	if (back)
 		starpu_task_list_push_back(&worker->local_tasks, task);
 	else
 		starpu_task_list_push_front(&worker->local_tasks, task);
 
-	_STARPU_PTHREAD_COND_BROADCAST(worker->sched_cond);
-	_STARPU_PTHREAD_MUTEX_UNLOCK(worker->sched_mutex);
+	_STARPU_PTHREAD_COND_BROADCAST(&worker->sched_cond);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&worker->sched_mutex);
 
 	return 0;
 }

+ 10 - 7
src/core/perfmodel/perfmodel.c

@@ -355,13 +355,16 @@ double starpu_task_bundle_expected_length(starpu_task_bundle_t bundle, enum star
 
 	while (entry)
 	{
-		double task_length = starpu_task_expected_length(entry->task, arch, nimpl);
-
-		/* In case the task is not calibrated, we consider the task
-		 * ends immediately. */
-		if (task_length > 0.0)
-			expected_length += task_length;
-
+		if(!entry->task->scheduled)
+		{
+			double task_length = starpu_task_expected_length(entry->task, arch, nimpl);
+			
+			/* In case the task is not calibrated, we consider the task
+			 * ends immediately. */
+			if (task_length > 0.0)
+				expected_length += task_length;
+		}
+			
 		entry = entry->next;
 	}
 

+ 10 - 0
src/core/perfmodel/perfmodel_history.c

@@ -1139,6 +1139,16 @@ double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, e
 	return exp;
 }
 
+double starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, uint32_t footprint)
+{
+	struct _starpu_job j =
+		{
+			.footprint = footprint,
+			.footprint_is_computed = 1,
+		};
+	return _starpu_history_based_job_expected_perf(model, arch, &j, j.nimpl);
+}
+
 void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfmodel *model, enum starpu_perf_archtype arch, unsigned cpuid STARPU_ATTRIBUTE_UNUSED, double measured, unsigned nimpl)
 {
 	if (model)

+ 994 - 0
src/core/sched_ctx.c

@@ -0,0 +1,994 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <core/sched_policy.h>
+#include <core/sched_ctx.h>
+#include <common/utils.h>
+
+extern struct worker_collection worker_list;
+static _starpu_pthread_mutex_t sched_ctx_manag = PTHREAD_MUTEX_INITIALIZER;
+static _starpu_pthread_mutex_t finished_submit_mutex = PTHREAD_MUTEX_INITIALIZER;
+struct starpu_task stop_submission_task = STARPU_TASK_INITIALIZER;
+pthread_key_t sched_ctx_key;
+unsigned with_hypervisor = 0;
+double max_time_worker_on_ctx = -1.0;
+
+static unsigned _starpu_get_first_free_sched_ctx(struct _starpu_machine_config *config);
+static unsigned _starpu_worker_get_first_free_sched_ctx(struct _starpu_worker *worker);
+
+static unsigned _starpu_worker_get_sched_ctx_id(struct _starpu_worker *worker, unsigned sched_ctx_id);
+
+static void change_worker_sched_ctx(unsigned sched_ctx_id)
+{
+	int workerid = starpu_worker_get_id();
+	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
+
+	int worker_sched_ctx_id = _starpu_worker_get_sched_ctx_id(worker, sched_ctx_id);
+	/* if the worker is not in the ctx's list it means the update concerns the addition of ctxs */
+	if(worker_sched_ctx_id == STARPU_NMAX_SCHED_CTXS)
+	{
+		worker_sched_ctx_id = _starpu_worker_get_first_free_sched_ctx(worker);
+		struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+		/* add context to worker */
+		worker->sched_ctx[worker_sched_ctx_id] = sched_ctx;
+		worker->nsched_ctxs++;	
+		worker->active_ctx = sched_ctx_id;
+	}
+	else 
+	{
+		/* remove context from worker */
+		if(worker->sched_ctx[worker_sched_ctx_id]->sched_policy)
+			worker->sched_ctx[worker_sched_ctx_id]->sched_policy->remove_workers(sched_ctx_id, &worker->workerid, 1);
+		worker->sched_ctx[worker_sched_ctx_id] = NULL;
+		worker->nsched_ctxs--;
+		starpu_set_turn_to_other_ctx(worker->workerid, sched_ctx_id);
+	}
+}
+
+static void update_workers_func(void *buffers[] __attribute__ ((unused)), void *_args)
+{
+	unsigned sched_ctx_id = (uintptr_t)_args;
+	change_worker_sched_ctx(sched_ctx_id);
+}
+
+struct starpu_codelet sched_ctx_info_cl = {
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
+	.cuda_func = update_workers_func,
+	.cpu_func = update_workers_func,
+	.opencl_func = update_workers_func,
+	.nbuffers = 0
+};
+
+static void _starpu_update_workers(int *workerids, int nworkers, int sched_ctx_id)
+{
+	int i;
+	struct _starpu_worker *worker[nworkers];
+ 	struct _starpu_worker *curr_worker = _starpu_get_local_worker_key();
+
+	for(i = 0; i < nworkers; i++)
+	{
+		worker[i] = _starpu_get_worker_struct(workerids[i]);
+
+		/* if the current thread requires the resize there is no need
+		   to send itself a message in order to change its
+		   sched_ctx info */
+		if(curr_worker && curr_worker == worker[i])
+			change_worker_sched_ctx(sched_ctx_id);
+		else
+		{			
+			worker[i]->tasks[sched_ctx_id] = starpu_task_create();
+			worker[i]->tasks[sched_ctx_id]->cl = &sched_ctx_info_cl;
+			worker[i]->tasks[sched_ctx_id]->cl_arg = (void*)(uintptr_t)sched_ctx_id;
+			worker[i]->tasks[sched_ctx_id]->execute_on_a_specific_worker = 1;
+			worker[i]->tasks[sched_ctx_id]->workerid = workerids[i];
+			worker[i]->tasks[sched_ctx_id]->destroy = 1;
+			worker[i]->tasks[sched_ctx_id]->control_task = 1;
+			int worker_sched_ctx_id = _starpu_worker_get_sched_ctx_id(worker[i], sched_ctx_id);
+			/* if the ctx is not in the worker's list it means the update concerns the addition of ctxs */
+			if(worker_sched_ctx_id == STARPU_NMAX_SCHED_CTXS)
+				worker[i]->tasks[sched_ctx_id]->priority = 1;
+
+			_starpu_exclude_task_from_dag(worker[i]->tasks[sched_ctx_id]);
+
+			_starpu_task_submit_internally(worker[i]->tasks[sched_ctx_id]);
+		}		
+	}
+}
+
+void starpu_stop_task_submission()
+{
+	_starpu_exclude_task_from_dag(&stop_submission_task);
+	_starpu_task_submit_internally(&stop_submission_task);
+}
+
+static void _starpu_add_workers_to_sched_ctx(struct _starpu_sched_ctx *sched_ctx, int *workerids, int nworkers, 
+				       int *added_workers, int *n_added_workers)
+{
+	struct worker_collection *workers = sched_ctx->workers;
+	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
+
+	int nworkers_to_add = nworkers == -1 ? (int)config->topology.nworkers : nworkers;
+	int workers_to_add[nworkers_to_add];
+
+	int i = 0;
+	for(i = 0; i < nworkers_to_add; i++)
+	{
+		/* added_workers is NULL for the call of this func at the creation of the context*/
+		/* if the function is called at the creation of the context there is no need to do this check */
+		if(added_workers)
+		{
+			int worker = workers->add(workers, (workerids == NULL ? i : workerids[i]));
+			if(worker >= 0)
+			{
+				added_workers[(*n_added_workers)++] = worker;		
+			}
+		}
+		else
+		{
+			int worker = (workerids == NULL ? i : workerids[i]); 
+			workers->add(workers, worker);
+			workers_to_add[i] = worker;
+		}
+	}
+
+	if(added_workers)
+	{
+		if(*n_added_workers > 0)
+			sched_ctx->sched_policy->add_workers(sched_ctx->id, added_workers, *n_added_workers);	
+	}
+	else
+		sched_ctx->sched_policy->add_workers(sched_ctx->id, workers_to_add, nworkers_to_add);		
+
+	return;
+}
+
+static void _starpu_remove_workers_from_sched_ctx(struct _starpu_sched_ctx *sched_ctx, int *workerids, 
+						  int nworkers, int *removed_workers, int *n_removed_workers)
+{
+	struct worker_collection *workers = sched_ctx->workers;
+
+	int i = 0;
+
+
+	if(nworkers == -1)
+	{
+		int nrem_workers = 0;
+		int rem_workers[STARPU_NMAXWORKERS];
+
+		if(workers->init_cursor)
+			workers->init_cursor(workers);
+
+		int worker = -1;
+		while(workers->has_next(workers))
+		{
+			worker = workers->get_next(workers);
+			if(!starpu_worker_is_combined_worker(worker))
+				rem_workers[nrem_workers++] = worker;
+		}
+
+		if(workers->init_cursor)
+			workers->deinit_cursor(workers);
+
+		if(nrem_workers > 0)
+			sched_ctx->sched_policy->remove_workers(sched_ctx->id, rem_workers, nrem_workers);
+		return;
+	}
+
+	for(i = 0; i < nworkers; i++)
+	{
+		if(workers->nworkers > 0)
+		{
+			int worker = workers->remove(workers, workerids[i]);
+			if(worker >= 0)
+				removed_workers[(*n_removed_workers)++] = worker;
+		}
+		if(*n_removed_workers)
+			sched_ctx->sched_policy->remove_workers(sched_ctx->id, removed_workers, *n_removed_workers);
+	}
+
+	return;
+}
+
+
+struct _starpu_sched_ctx*  _starpu_create_sched_ctx(const char *policy_name, int *workerids, 
+				  int nworkers_ctx, unsigned is_initial_sched,
+				  const char *sched_name)
+{
+	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
+
+	_STARPU_PTHREAD_MUTEX_LOCK(&sched_ctx_manag);
+	STARPU_ASSERT(config->topology.nsched_ctxs < STARPU_NMAX_SCHED_CTXS);
+
+	unsigned id = _starpu_get_first_free_sched_ctx(config);
+
+	struct _starpu_sched_ctx *sched_ctx = &config->sched_ctxs[id];
+	sched_ctx->id = id;
+
+	config->topology.nsched_ctxs++;	
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&sched_ctx_manag);
+
+	int nworkers = config->topology.nworkers;
+	
+	STARPU_ASSERT(nworkers_ctx <= nworkers);
+  
+	_STARPU_PTHREAD_MUTEX_INIT(&sched_ctx->changing_ctx_mutex, NULL);
+	_STARPU_PTHREAD_MUTEX_INIT(&sched_ctx->empty_ctx_mutex, NULL);
+
+	starpu_task_list_init(&sched_ctx->empty_ctx_tasks);
+
+	sched_ctx->sched_policy = (struct starpu_sched_policy*)malloc(sizeof(struct starpu_sched_policy));
+	sched_ctx->is_initial_sched = is_initial_sched;
+	sched_ctx->name = sched_name;
+	sched_ctx->inheritor = STARPU_NMAX_SCHED_CTXS;
+	sched_ctx->finished_submit = 0;
+
+	_starpu_barrier_counter_init(&sched_ctx->tasks_barrier, 0);
+
+	/* initialise all sync structures because the number of workers can change */
+	sched_ctx->sched_mutex = (_starpu_pthread_mutex_t**)malloc(STARPU_NMAXWORKERS * sizeof(_starpu_pthread_mutex_t*));
+	sched_ctx->sched_cond = (_starpu_pthread_cond_t**)malloc(STARPU_NMAXWORKERS * sizeof(_starpu_pthread_cond_t*));
+
+	
+	/* init the strategy structs and the worker_collection of the resources of the context */
+	_starpu_init_sched_policy(config, sched_ctx, policy_name);
+
+	/* construct the collection of workers(list/tree/etc.) */
+	sched_ctx->workers->workerids = sched_ctx->workers->init(sched_ctx->workers);
+	sched_ctx->workers->nworkers = 0;
+
+	/* after having a worker_collection of the resources, add them */
+	_starpu_add_workers_to_sched_ctx(sched_ctx, workerids, nworkers_ctx, NULL, NULL);
+
+
+	/* if we create the initial big sched ctx we can update workers' status here
+	   because they haven't been launched yet */
+	if(is_initial_sched)
+	{
+		int i;
+		for(i = 0; i < nworkers; i++)
+		{
+			struct _starpu_worker *worker = _starpu_get_worker_struct(i);
+			worker->sched_ctx[_starpu_worker_get_first_free_sched_ctx(worker)] = sched_ctx;
+			worker->nsched_ctxs++;
+		}
+	}
+
+	int w;
+	for(w = 0; w < STARPU_NMAXWORKERS; w++)
+	{
+		sched_ctx->pop_counter[w] = 0;
+	}
+	
+	return sched_ctx;
+}
+
+static void _get_workers(int min, int max, int *workers, int *nw, enum starpu_archtype arch, unsigned allow_overlap)
+{
+	int pus[max];
+	int npus = 0; 
+	int i;
+	int n = 0;
+		
+	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
+	if(config->topology.nsched_ctxs == 1)
+	{
+		/*we have all available resources */
+		npus = starpu_worker_get_nids_by_type(arch, pus, max);
+/*TODO: hierarchical ctxs: get max good workers: close one to another */
+		for(i = 0; i < npus; i++)
+			workers[(*nw)++] = pus[i];
+	}
+	else
+	{
+		unsigned enough_ressources = 0;
+		npus = starpu_worker_get_nids_ctx_free_by_type(arch, pus, max);
+       
+		for(i = 0; i < npus; i++)
+			workers[(*nw)++] = pus[i];
+		
+		if(npus == max)
+			/*we have enough available resources */
+			enough_ressources = 1;
+
+		if(!enough_ressources && npus >= min)
+			/*we have enough available resources */
+			enough_ressources = 1;
+
+		if(!enough_ressources)
+		{
+			/* try to get resources from ctxs that have more than the minimum number of workers they need */
+			int s;
+			for(s = 1; s < STARPU_NMAX_SCHED_CTXS; s++)
+			{
+				if(config->sched_ctxs[s].id != STARPU_NMAX_SCHED_CTXS)
+				{
+					int _npus = 0;
+					int _pus[STARPU_NMAXWORKERS];
+					_npus = starpu_get_workers_of_sched_ctx(config->sched_ctxs[s].id, _pus, arch);
+					int ctx_min = arch == STARPU_CPU_WORKER ? config->sched_ctxs[s].min_ncpus : config->sched_ctxs[s].min_ngpus;
+					if(_npus > ctx_min)
+					{
+						if(npus < min)
+						{
+							n = (_npus - ctx_min) > (min - npus) ? min - npus : (_npus - ctx_min);
+							npus += n;
+						}
+/*TODO: hierarchical ctxs: get n good workers: close to the other ones I already assigned to the ctx */
+						for(i = 0; i < n; i++)
+							workers[(*nw)++] = _pus[i];
+						starpu_remove_workers_from_sched_ctx(_pus, n, config->sched_ctxs[s].id);
+					}
+				}
+			}
+
+			if(npus >= min)
+				enough_ressources = 1;
+		}
+		
+		if(!enough_ressources)
+		{
+			/* if there are not enough available workers to satisfy the minimum required,
+			 give them workers proportionally to their requirements */
+			int global_npus = starpu_worker_get_count_by_type(arch);
+			
+			int req_npus = 0;
+
+			int s;
+			for(s = 1; s < STARPU_NMAX_SCHED_CTXS; s++)
+				if(config->sched_ctxs[s].id != STARPU_NMAX_SCHED_CTXS)
+					req_npus += arch == STARPU_CPU_WORKER ? config->sched_ctxs[s].min_ncpus : config->sched_ctxs[s].min_ngpus;
+
+			req_npus += min;
+			
+			for(s = 1; s < STARPU_NMAX_SCHED_CTXS; s++)
+			{
+				if(config->sched_ctxs[s].id != STARPU_NMAX_SCHED_CTXS)
+				{
+					int ctx_min = arch == STARPU_CPU_WORKER ? config->sched_ctxs[s].min_ncpus : config->sched_ctxs[s].min_ngpus;
+					double needed_npus = ((double)ctx_min * (double)global_npus) / (double)req_npus;
+
+					int _npus = 0;
+					int _pus[STARPU_NMAXWORKERS];
+				
+					_npus = starpu_get_workers_of_sched_ctx(config->sched_ctxs[s].id, _pus, arch);					
+					if(needed_npus < (double)_npus)
+					{
+						double npus_to_rem = (double)_npus - needed_npus;
+						int x = floor(npus_to_rem);
+						double x_double = (double)x;
+						double diff = npus_to_rem - x_double;
+						int npus_to_remove = diff >= 0.5 ? x+1 : x;
+
+						int pus_to_remove[npus_to_remove];
+						int c = 0;
+						
+/*TODO: hierarchical ctxs: get npus_to_remove good workers: close to the other ones I already assigned to the ctx */
+						for(i = _npus-1; i >= (_npus - npus_to_remove); i--)
+						{
+							workers[(*nw)++] = _pus[i];
+							pus_to_remove[c++] = _pus[i];
+						}
+						if(!allow_overlap)
+							starpu_remove_workers_from_sched_ctx(pus_to_remove, npus_to_remove, config->sched_ctxs[s].id);
+					}
+
+				}
+			}
+		}
+	}
+}
+
+unsigned starpu_create_sched_ctx_inside_interval(const char *policy_name, const char *sched_name, 
+						 int min_ncpus, int max_ncpus, int min_ngpus, int max_ngpus,
+						 unsigned allow_overlap)
+{
+	struct _starpu_sched_ctx *sched_ctx = NULL;
+	int workers[max_ncpus + max_ngpus];
+	int nw = 0;
+	_STARPU_PTHREAD_MUTEX_LOCK(&sched_ctx_manag);
+	_get_workers(min_ncpus, max_ncpus, workers, &nw, STARPU_CPU_WORKER, allow_overlap);
+	_get_workers(min_ngpus, max_ngpus, workers, &nw, STARPU_CUDA_WORKER, allow_overlap);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&sched_ctx_manag);
+	int i;
+	printf("%d: ", nw);
+	for(i = 0; i < nw; i++)
+		printf("%d ", workers[i]);
+	printf("\n");
+	sched_ctx = _starpu_create_sched_ctx(policy_name, workers, nw, 0, sched_name);
+	sched_ctx->min_ncpus = min_ncpus;
+	sched_ctx->max_ncpus = max_ncpus;
+	sched_ctx->min_ngpus = min_ngpus;
+	sched_ctx->max_ngpus = max_ngpus;
+	
+	_starpu_update_workers(sched_ctx->workers->workerids, sched_ctx->workers->nworkers, sched_ctx->id);
+#ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
+	sched_ctx->perf_counters = NULL;
+#endif
+	return sched_ctx->id;
+	
+}
+unsigned starpu_create_sched_ctx(const char *policy_name, int *workerids, 
+				 int nworkers, const char *sched_name)
+{
+	struct _starpu_sched_ctx *sched_ctx = NULL;
+	sched_ctx = _starpu_create_sched_ctx(policy_name, workerids, nworkers, 0, sched_name);
+
+	_starpu_update_workers(sched_ctx->workers->workerids, sched_ctx->workers->nworkers, sched_ctx->id);
+#ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
+	sched_ctx->perf_counters = NULL;
+#endif
+	return sched_ctx->id;
+}
+
+#ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
+void starpu_set_perf_counters(unsigned sched_ctx_id, struct starpu_performance_counters *perf_counters)
+{
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	sched_ctx->perf_counters = perf_counters;
+	return;
+}
+#endif
+
+/* free all structures for the context */
+static void _starpu_delete_sched_ctx(struct _starpu_sched_ctx *sched_ctx)
+{
+	_starpu_deinit_sched_policy(sched_ctx);		
+	free(sched_ctx->sched_policy);
+	free(sched_ctx->sched_mutex);
+	free(sched_ctx->sched_cond);
+
+	sched_ctx->sched_policy = NULL;
+	sched_ctx->sched_mutex = NULL;
+	sched_ctx->sched_cond = NULL;
+
+	_STARPU_PTHREAD_MUTEX_DESTROY(&sched_ctx->changing_ctx_mutex);
+	_STARPU_PTHREAD_MUTEX_DESTROY(&sched_ctx->empty_ctx_mutex);
+
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
+	_STARPU_PTHREAD_MUTEX_LOCK(&sched_ctx_manag);
+	config->topology.nsched_ctxs--;
+	sched_ctx->id = STARPU_NMAX_SCHED_CTXS;
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&sched_ctx_manag);
+}
+
+void starpu_delete_sched_ctx(unsigned sched_ctx_id, unsigned inheritor_sched_ctx_id)
+{
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	struct _starpu_sched_ctx *inheritor_sched_ctx = _starpu_get_sched_ctx_struct(inheritor_sched_ctx_id);
+
+	_STARPU_PTHREAD_MUTEX_LOCK(&sched_ctx->changing_ctx_mutex);
+	_starpu_update_workers(sched_ctx->workers->workerids, sched_ctx->workers->nworkers, sched_ctx->id);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&sched_ctx->changing_ctx_mutex);
+
+	/* if both of them have all the resources it is pointless */
+	/* to try to transfer resources from one ctx to the other */
+	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
+	unsigned nworkers = config->topology.nworkers;
+
+	if(!(sched_ctx->workers->nworkers == nworkers && sched_ctx->workers->nworkers == inheritor_sched_ctx->workers->nworkers) && sched_ctx->workers->nworkers > 0 && inheritor_sched_ctx_id != STARPU_NMAX_SCHED_CTXS)
+	{
+		starpu_add_workers_to_sched_ctx(sched_ctx->workers->workerids, sched_ctx->workers->nworkers, inheritor_sched_ctx_id);
+	}
+
+	if(!_starpu_wait_for_all_tasks_of_sched_ctx(sched_ctx_id) && !_starpu_wait_for_all_tasks_of_sched_ctx(0))
+	{
+		_starpu_delete_sched_ctx(sched_ctx);
+	}
+	return;	
+}
+
+/* called after the workers are terminated so we don't have anything else to do but free the memory*/
+void _starpu_delete_all_sched_ctxs()
+{
+	unsigned i;
+	for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
+	{
+		struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(i);
+		if(sched_ctx->id != STARPU_NMAX_SCHED_CTXS)
+		{
+			_starpu_remove_workers_from_sched_ctx(sched_ctx, NULL, -1, NULL, NULL);
+			_starpu_barrier_counter_destroy(&sched_ctx->tasks_barrier);
+			_starpu_delete_sched_ctx(sched_ctx);
+		}
+	}
+	return;
+}
+
+static void _starpu_check_workers(int *workerids, int nworkers)
+{
+        struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
+        int nworkers_conf = config->topology.nworkers;
+
+	int i;
+	for(i = 0; i < nworkers; i++)
+	{
+		/* take care the user does not ask for a resource that does not exist */
+		STARPU_ASSERT(workerids[i] >= 0 &&  workerids[i] <= nworkers_conf);
+	}		
+}
+
+void _starpu_fetch_tasks_from_empty_ctx_list(struct _starpu_sched_ctx *sched_ctx)
+{
+	unsigned unlocked = 0;
+	_STARPU_PTHREAD_MUTEX_LOCK(&sched_ctx->empty_ctx_mutex);
+	while(!starpu_task_list_empty(&sched_ctx->empty_ctx_tasks))
+	{
+		if(unlocked)
+			_STARPU_PTHREAD_MUTEX_LOCK(&sched_ctx->empty_ctx_mutex);
+		struct starpu_task *old_task = starpu_task_list_pop_back(&sched_ctx->empty_ctx_tasks);
+		unlocked = 1;
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&sched_ctx->empty_ctx_mutex);
+		
+		if(old_task == &stop_submission_task)
+			break;
+
+		struct _starpu_job *old_j = _starpu_get_job_associated_to_task(old_task);
+		int ret = _starpu_push_task(old_j);
+		/* if we should stop popping from empty ctx tasks */
+		if(ret == -1) break;
+	}
+	if(!unlocked)
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&sched_ctx->empty_ctx_mutex);
+	return;
+
+}
+void starpu_add_workers_to_sched_ctx(int *workers_to_add, int nworkers_to_add, unsigned sched_ctx_id)
+{
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	int added_workers[nworkers_to_add];
+	int n_added_workers = 0;
+
+	STARPU_ASSERT(workers_to_add != NULL && nworkers_to_add > 0);
+	_starpu_check_workers(workers_to_add, nworkers_to_add);
+
+	_STARPU_PTHREAD_MUTEX_LOCK(&sched_ctx->changing_ctx_mutex);
+	_starpu_add_workers_to_sched_ctx(sched_ctx, workers_to_add, nworkers_to_add, added_workers, &n_added_workers);
+
+	if(n_added_workers > 0)
+	{
+		_starpu_update_workers(added_workers, n_added_workers, sched_ctx->id);
+	}
+
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&sched_ctx->changing_ctx_mutex);
+	
+	_starpu_fetch_tasks_from_empty_ctx_list(sched_ctx);
+
+	return;
+}
+
+/* Detach the given workers from the context sched_ctx_id, under
+ * changing_ctx_mutex.  Only the workers actually removed (reported by the
+ * internal helper) are notified via _starpu_update_workers(). */
+void starpu_remove_workers_from_sched_ctx(int *workers_to_remove, int nworkers_to_remove, unsigned sched_ctx_id)
+{
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	/* sized by the current worker count of the ctx, an upper bound on
+	 * how many can be removed */
+	int removed_workers[sched_ctx->workers->nworkers];
+	int n_removed_workers = 0;
+
+	_starpu_check_workers(workers_to_remove, nworkers_to_remove);
+
+	_STARPU_PTHREAD_MUTEX_LOCK(&sched_ctx->changing_ctx_mutex);
+	_starpu_remove_workers_from_sched_ctx(sched_ctx, workers_to_remove, nworkers_to_remove, removed_workers, &n_removed_workers);
+
+	if(n_removed_workers > 0)
+		_starpu_update_workers(removed_workers, n_removed_workers, sched_ctx->id);
+
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&sched_ctx->changing_ctx_mutex);	       
+	return;
+}
+
+/* unused sched_ctx have the id STARPU_NMAX_SCHED_CTXS */
+/* One-time initialisation: create the TLS key holding the current ctx id,
+ * mark every slot of config->sched_ctxs as free (id == STARPU_NMAX_SCHED_CTXS)
+ * and read the optional STARPU_MAX_TIME_ON_CTX environment variable used for
+ * time-sharing workers between overlapping contexts. */
+void _starpu_init_all_sched_ctxs(struct _starpu_machine_config *config)
+{
+	pthread_key_create(&sched_ctx_key, NULL);
+
+	unsigned i;
+	for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
+		config->sched_ctxs[i].id = STARPU_NMAX_SCHED_CTXS;
+
+	char* max_time_on_ctx = getenv("STARPU_MAX_TIME_ON_CTX");
+	if (max_time_on_ctx != NULL)
+		max_time_worker_on_ctx = atof(max_time_on_ctx);
+
+	return;
+}
+
+/* unused sched_ctx pointers of a worker are NULL */
+/* Allocate and NULL-initialise the per-worker table of contexts the worker
+ * belongs to (one slot per possible ctx; unused slots stay NULL). */
+void _starpu_init_sched_ctx_for_worker(unsigned workerid)
+{
+	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
+	worker->sched_ctx = (struct _starpu_sched_ctx**)malloc(STARPU_NMAX_SCHED_CTXS * sizeof(struct _starpu_sched_ctx*));
+	unsigned i;
+	for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
+		worker->sched_ctx[i] = NULL;
+
+	return;
+}
+
+/* sched_ctx slots aren't necessarily contiguous: */
+/* e.g. when we remove one, its place becomes free */
+/* and is reused when a new one is added */
+/* Return the index of the first free slot in the global ctx table
+ * (a slot is free when its id equals STARPU_NMAX_SCHED_CTXS).
+ * Asserts if the table is full. */
+static unsigned _starpu_get_first_free_sched_ctx(struct _starpu_machine_config *config)
+{
+	unsigned i;
+	for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
+		if(config->sched_ctxs[i].id == STARPU_NMAX_SCHED_CTXS)
+			return i;
+
+	STARPU_ASSERT(0);
+	return STARPU_NMAX_SCHED_CTXS;
+}
+
+/* Return the index of the first unused (NULL) slot in the worker's own
+ * ctx table.  Asserts if the worker already belongs to the maximum
+ * number of contexts. */
+static unsigned _starpu_worker_get_first_free_sched_ctx(struct _starpu_worker *worker)
+{
+	unsigned i;
+	for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
+		if(worker->sched_ctx[i] == NULL)
+			return i;
+	STARPU_ASSERT(0);
+	return STARPU_NMAX_SCHED_CTXS;
+}
+
+/* Find the index, in the worker's ctx table, of the ctx with id
+ * sched_ctx_id.  If it is not found, return instead the slot of a ctx
+ * already flagged for deletion (id == STARPU_NMAX_SCHED_CTXS) so the
+ * caller can reuse it, or STARPU_NMAX_SCHED_CTXS when neither exists. */
+static unsigned _starpu_worker_get_sched_ctx_id(struct _starpu_worker *worker, unsigned sched_ctx_id)
+{
+	unsigned to_be_deleted = STARPU_NMAX_SCHED_CTXS;
+	unsigned i;
+	for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
+	{
+		if(worker->sched_ctx[i] != NULL)
+		{
+			if(worker->sched_ctx[i]->id == sched_ctx_id)
+				return i;
+			else if(worker->sched_ctx[i]->id == STARPU_NMAX_SCHED_CTXS)
+				to_be_deleted = i;
+		}
+	}
+
+	return to_be_deleted;
+}
+
+/* Block until every task already submitted to the ctx has completed.
+ * Returns -EDEADLK when called from a worker thread that must not block. */
+int _starpu_wait_for_all_tasks_of_sched_ctx(unsigned sched_ctx_id)
+{
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	
+	if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls()))
+	  return -EDEADLK;
+	
+	return _starpu_barrier_counter_wait_for_empty_counter(&sched_ctx->tasks_barrier);
+}
+
+/* Account one finished task.  When the counter reaches zero, the
+ * application has declared submission finished (finished_submit) and an
+ * inheritor ctx is set, the ctx deletes itself and hands its resources
+ * over to the inheritor. */
+void _starpu_decrement_nsubmitted_tasks_of_sched_ctx(unsigned sched_ctx_id)
+{
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	int finished = _starpu_barrier_counter_decrement_until_empty_counter(&sched_ctx->tasks_barrier);
+	if(finished && sched_ctx->inheritor != STARPU_NMAX_SCHED_CTXS)
+	{
+		_STARPU_PTHREAD_MUTEX_LOCK(&finished_submit_mutex);
+		if(sched_ctx->finished_submit)
+		{
+			/* unlock before deleting: deletion may take other locks */
+			_STARPU_PTHREAD_MUTEX_UNLOCK(&finished_submit_mutex);
+			starpu_delete_sched_ctx(sched_ctx_id, sched_ctx->inheritor);
+			return;
+		}
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&finished_submit_mutex);
+	}
+	return;
+}
+
+/* Account one newly submitted task (counterpart of the decrement above). */
+void _starpu_increment_nsubmitted_tasks_of_sched_ctx(unsigned sched_ctx_id)
+{
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	_starpu_barrier_counter_increment(&sched_ctx->tasks_barrier);
+}
+
+/* Store a pointer to the calling thread's current ctx id in TLS.
+ * NB: the caller keeps ownership of *sched_ctx; it must outlive its use. */
+void starpu_set_sched_ctx(unsigned *sched_ctx)
+{
+	pthread_setspecific(sched_ctx_key, (void*)sched_ctx);
+}
+
+/* Return the calling thread's current ctx id, or STARPU_NMAX_SCHED_CTXS
+ * when none was set. */
+unsigned starpu_get_sched_ctx()
+{
+	unsigned *sched_ctx = (unsigned*)pthread_getspecific(sched_ctx_key);
+	if(sched_ctx == NULL)
+		return STARPU_NMAX_SCHED_CTXS;
+	STARPU_ASSERT(*sched_ctx < STARPU_NMAX_SCHED_CTXS);
+	return *sched_ctx;
+}
+
+/* Record that a resizing hypervisor is attached to this process. */
+void starpu_notify_hypervisor_exists()
+{
+	with_hypervisor = 1;
+}
+
+/* Return non-zero iff starpu_notify_hypervisor_exists() was called. */
+unsigned starpu_check_if_hypervisor_exists()
+{
+	return with_hypervisor;
+}
+
+/* Number of contexts currently alive (maintained in the topology). */
+unsigned _starpu_get_nsched_ctxs()
+{
+	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
+	return config->topology.nsched_ctxs;
+}
+
+/* Attach scheduler-policy private data to a ctx (opaque to the core). */
+void starpu_set_sched_ctx_policy_data(unsigned sched_ctx_id, void* policy_data)
+{
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	sched_ctx->policy_data = policy_data;
+}
+
+/* Retrieve the policy private data previously attached to a ctx. */
+void* starpu_get_sched_ctx_policy_data(unsigned sched_ctx_id)
+{
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	return sched_ctx->policy_data;
+}
+
+/* Return the per-worker scheduling mutex of the ctx, or NULL when the
+ * mutex table has not been set up (yet). */
+_starpu_pthread_mutex_t *_starpu_get_sched_mutex(struct _starpu_sched_ctx *sched_ctx, int workerid)
+{
+	if(sched_ctx->sched_mutex)
+		return sched_ctx->sched_mutex[workerid];
+	else
+		return NULL;
+}
+
+/* Record the mutex/cond pair a policy uses to synchronise with workerid
+ * inside this ctx.  Silently does nothing when the ctx tables are absent. */
+void starpu_worker_set_sched_condition(unsigned sched_ctx_id, int workerid, _starpu_pthread_mutex_t *sched_mutex, _starpu_pthread_cond_t *sched_cond)
+{
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	if(sched_ctx->sched_mutex && sched_ctx->sched_cond)
+	{
+		sched_ctx->sched_mutex[workerid] = sched_mutex;
+		sched_ctx->sched_cond[workerid] = sched_cond;
+	}
+}
+
+/* Fetch the mutex/cond pair for workerid in this ctx, lazily falling back
+ * to the worker's own pair when none was set yet. */
+void starpu_worker_get_sched_condition(unsigned sched_ctx_id, int workerid, _starpu_pthread_mutex_t **sched_mutex, _starpu_pthread_cond_t **sched_cond)
+{
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	*sched_mutex = sched_ctx->sched_mutex[workerid];
+	*sched_cond = sched_ctx->sched_cond[workerid];
+
+	/* the tasks changing the ctx were not executed in order, so the pair
+	 * may not have been registered yet: use the worker's own mutex/cond
+	 * and record them for the next calls */
+	if(!*sched_mutex)
+	{
+		struct _starpu_worker *workerarg = _starpu_get_worker_struct(workerid);
+		*sched_mutex = &workerarg->sched_mutex;
+		*sched_cond = &workerarg->sched_cond;
+		starpu_worker_set_sched_condition(sched_ctx_id, workerid, *sched_mutex, *sched_cond);
+	}
+
+}
+
+/* Allocate and initialise a dedicated mutex/cond pair for workerid in this
+ * ctx (counterpart: starpu_worker_deinit_sched_condition). */
+void starpu_worker_init_sched_condition(unsigned sched_ctx_id, int workerid)
+{
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	sched_ctx->sched_mutex[workerid] = (_starpu_pthread_mutex_t*)malloc(sizeof(_starpu_pthread_mutex_t));
+	sched_ctx->sched_cond[workerid] = (_starpu_pthread_cond_t*)malloc(sizeof(_starpu_pthread_cond_t));
+	_STARPU_PTHREAD_MUTEX_INIT(sched_ctx->sched_mutex[workerid], NULL);
+	_STARPU_PTHREAD_COND_INIT(sched_ctx->sched_cond[workerid], NULL);
+}
+
+/* Destroy and free the mutex/cond pair allocated by
+ * starpu_worker_init_sched_condition().  NOTE(review): the table slots are
+ * not reset to NULL afterwards — confirm callers never read them again. */
+void starpu_worker_deinit_sched_condition(unsigned sched_ctx_id, int workerid)
+{
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	_STARPU_PTHREAD_MUTEX_DESTROY(sched_ctx->sched_mutex[workerid]);
+	_STARPU_PTHREAD_COND_DESTROY(sched_ctx->sched_cond[workerid]);
+	free(sched_ctx->sched_mutex[workerid]);
+	free(sched_ctx->sched_cond[workerid]);
+}
+
+/* Allocate the ctx's worker collection and wire its operations according
+ * to the requested implementation (only WORKER_LIST is available here). */
+struct worker_collection* starpu_create_worker_collection_for_sched_ctx(unsigned sched_ctx_id, int worker_collection_type)
+{
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	sched_ctx->workers = (struct worker_collection*)malloc(sizeof(struct worker_collection));
+
+	switch(worker_collection_type)
+	{
+	case WORKER_LIST:
+		sched_ctx->workers->has_next = worker_list.has_next;
+		sched_ctx->workers->get_next = worker_list.get_next;
+		sched_ctx->workers->add = worker_list.add;
+		sched_ctx->workers->remove = worker_list.remove;
+		sched_ctx->workers->init = worker_list.init;
+		sched_ctx->workers->deinit = worker_list.deinit;
+		sched_ctx->workers->init_cursor = worker_list.init_cursor;
+		sched_ctx->workers->deinit_cursor = worker_list.deinit_cursor;
+		sched_ctx->workers->type = WORKER_LIST; 
+		break;
+	}
+
+	return sched_ctx->workers;
+}
+
+/* Release the ctx's worker collection (deinit the implementation, then
+ * free the container allocated above). */
+void starpu_delete_worker_collection_for_sched_ctx(unsigned sched_ctx_id)
+{
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	sched_ctx->workers->deinit(sched_ctx->workers);
+
+	free(sched_ctx->workers);
+}
+
+/* Accessor for the ctx's worker collection. */
+struct worker_collection* starpu_get_worker_collection_of_sched_ctx(unsigned sched_ctx_id)
+{
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	return sched_ctx->workers;
+}
+
+/* Fill pus[] with the ids of the ctx's workers of the given architecture
+ * and return their number.  No lock is taken: the collection may change
+ * concurrently (see the matching note in sched_ctx.h). */
+int starpu_get_workers_of_sched_ctx(unsigned sched_ctx_id, int *pus, enum starpu_archtype arch)
+{
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	
+	struct worker_collection *workers = sched_ctx->workers;
+	int worker;
+
+	int npus = 0;
+	
+	if(workers->init_cursor)
+		workers->init_cursor(workers);
+	
+	while(workers->has_next(workers))
+	{
+		worker = workers->get_next(workers);
+		enum starpu_archtype curr_arch = starpu_worker_get_type(worker);
+		if(curr_arch == arch)
+			pus[npus++] = worker;
+	}
+	
+	/* NOTE(review): the guard tests init_cursor but calls deinit_cursor —
+	 * presumably both are set together; confirm for future collection types */
+	if(workers->init_cursor)
+		workers->deinit_cursor(workers);
+	return npus;
+}
+
+/* Expose the mutex serializing worker-set changes of the ctx, so external
+ * code can synchronise with add/remove operations. */
+_starpu_pthread_mutex_t* starpu_get_changing_ctx_mutex(unsigned sched_ctx_id)
+{
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	return &sched_ctx->changing_ctx_mutex;
+}
+
+/* Number of workers currently in the ctx, or 0 when the ctx is unknown. */
+unsigned starpu_get_nworkers_of_sched_ctx(unsigned sched_ctx_id)
+{
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	if(sched_ctx != NULL)
+		return sched_ctx->workers->nworkers;
+	else 
+		return 0;
+
+}
+
+/* Count the workers that belong to both contexts.
+ * Bug fix: the second collection's cursor was initialised only once, so
+ * after the first outer iteration the inner loop was already exhausted and
+ * shared workers were under-counted; the cursor is now restarted for every
+ * worker of the first collection. */
+unsigned starpu_get_nshared_workers(unsigned sched_ctx_id, unsigned sched_ctx_id2)
+{
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	struct _starpu_sched_ctx *sched_ctx2 = _starpu_get_sched_ctx_struct(sched_ctx_id2);
+
+	struct worker_collection *workers = sched_ctx->workers;
+	struct worker_collection *workers2 = sched_ctx2->workers;
+	int worker, worker2;
+	int shared_workers = 0;
+
+	if(workers->init_cursor)
+		workers->init_cursor(workers);
+
+	while(workers->has_next(workers))
+	{
+		worker = workers->get_next(workers);
+
+		/* restart the scan of the second collection for each worker
+		 * of the first one */
+		if(workers2->init_cursor)
+			workers2->init_cursor(workers2);
+
+		while(workers2->has_next(workers2))
+		{
+			worker2 = workers2->get_next(workers2);
+			if(worker == worker2)
+				shared_workers++;
+		}
+
+		if(workers2->init_cursor)
+			workers2->deinit_cursor(workers2);
+	}
+
+	if(workers->init_cursor)
+		workers->deinit_cursor(workers);
+
+	return shared_workers;
+}
+
+/* Return 1 iff the worker's ctx table contains a ctx with the given id. */
+unsigned starpu_worker_belongs_to_sched_ctx(int workerid, unsigned sched_ctx_id)
+{
+	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
+	unsigned i;
+	for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
+	{
+		if(worker->sched_ctx[i] && worker->sched_ctx[i]->id == sched_ctx_id)
+			return 1;
+	}
+	return 0;
+}
+
+/* Return 1 iff the worker is shared by more than one ctx. */
+unsigned starpu_are_overlapping_ctxs_on_worker(int workerid)
+{
+	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
+	return worker->nsched_ctxs > 1;
+}
+
+/* Time-sharing predicate: when STARPU_MAX_TIME_ON_CTX is unset
+ * (max_time_worker_on_ctx == -1.0) every ctx may always use the worker;
+ * otherwise only the ctx currently marked active on the worker may. */
+unsigned starpu_is_ctxs_turn(int workerid, unsigned sched_ctx_id)
+{
+	if(max_time_worker_on_ctx == -1.0) return 1;
+
+	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
+	return worker->active_ctx == sched_ctx_id;
+}
+
+/* Hand the worker over to another ctx it belongs to (skipping the initial
+ * ctx 0, deleted ctxs, and the current one), then drain that ctx's parked
+ * tasks. */
+void starpu_set_turn_to_other_ctx(int workerid, unsigned sched_ctx_id)
+{
+	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
+
+	struct _starpu_sched_ctx *other_sched_ctx = NULL;
+	struct _starpu_sched_ctx *active_sched_ctx = NULL;
+	int i;
+	for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
+	{
+		other_sched_ctx = worker->sched_ctx[i];
+		if(other_sched_ctx != NULL && other_sched_ctx->id != STARPU_NMAX_SCHED_CTXS && 
+		   other_sched_ctx->id != 0 && other_sched_ctx->id != sched_ctx_id)
+		{
+			worker->active_ctx = other_sched_ctx->id;
+			active_sched_ctx = other_sched_ctx;
+			break;
+		}
+	}		
+
+	/* NOTE(review): if no other ctx was found but active_ctx already
+	 * differed from sched_ctx_id, active_sched_ctx is NULL here — confirm
+	 * _starpu_fetch_tasks_from_empty_ctx_list tolerates a NULL ctx */
+	if(worker->active_ctx != sched_ctx_id)
+	{
+		_starpu_fetch_tasks_from_empty_ctx_list(active_sched_ctx);
+	}
+}
+
+/* Maximum time a worker may spend on one ctx before switching (from the
+ * STARPU_MAX_TIME_ON_CTX environment variable; -1.0 means unlimited). */
+double starpu_get_max_time_worker_on_ctx(void)
+{
+	return max_time_worker_on_ctx;	
+}
+
+/* Designate the ctx that will receive this ctx's resources when it is
+ * deleted (see _starpu_decrement_nsubmitted_tasks_of_sched_ctx). */
+void starpu_sched_ctx_set_inheritor(unsigned sched_ctx_id, unsigned inheritor)
+{
+	STARPU_ASSERT(inheritor < STARPU_NMAX_SCHED_CTXS);
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	sched_ctx->inheritor = inheritor;
+	return;
+}
+
+/* Declare that the application will submit no more tasks to this ctx,
+ * allowing it to self-delete once its task counter drops to zero. */
+void starpu_sched_ctx_finished_submit(unsigned sched_ctx_id)
+{
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	_STARPU_PTHREAD_MUTEX_LOCK(&finished_submit_mutex);
+	sched_ctx->finished_submit = 1;
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&finished_submit_mutex);
+	return;
+}
+
+#ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
+
+/* Forward a "task popped" event to the hypervisor's performance counters;
+ * the initial ctx (id 0) and unregistered ctxs are ignored. */
+void starpu_call_poped_task_cb(int workerid, unsigned sched_ctx_id, double flops)
+{
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	if(sched_ctx != NULL && sched_ctx_id != 0 && sched_ctx_id != STARPU_NMAX_SCHED_CTXS
+	   && sched_ctx->perf_counters != NULL)
+		sched_ctx->perf_counters->notify_poped_task(sched_ctx_id, workerid, flops);
+}
+
+/* Forward a "task pushed" event to the hypervisor's performance counters,
+ * with the same filtering as above. */
+void starpu_call_pushed_task_cb(int workerid, unsigned sched_ctx_id)
+{
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+
+	if(sched_ctx != NULL && sched_ctx_id != 0 && sched_ctx_id != STARPU_NMAX_SCHED_CTXS
+	   && sched_ctx->perf_counters != NULL)
+		sched_ctx->perf_counters->notify_pushed_task(sched_ctx_id, workerid);
+}
+#endif //STARPU_USE_SCHED_CTX_HYPERVISOR

+ 139 - 0
src/core/sched_ctx.h

@@ -0,0 +1,139 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __SCHED_CONTEXT_H__
+#define __SCHED_CONTEXT_H__
+
+#include <starpu.h>
+#include <starpu_sched_ctx.h>
+#include <starpu_scheduler.h>
+#include <common/config.h>
+#include <common/barrier_counter.h>
+#include <profiling/profiling.h>
+
+#define NO_RESIZE -1
+#define REQ_RESIZE 0
+#define DO_RESIZE 1
+
+
+/* Internal descriptor of one scheduling context. */
+struct _starpu_sched_ctx {
+	/* id of the context used in user mode */
+	unsigned id;
+
+	/* name of context */
+	const char *name;
+
+	/* policy of the context */
+	struct starpu_sched_policy *sched_policy;
+
+	/* data necessary for the policy */
+	void *policy_data;
+
+	struct worker_collection *workers;
+	
+	/* mutex serializing changes to the set of workers of the ctx */
+	_starpu_pthread_mutex_t changing_ctx_mutex;
+
+	/* we keep an initial sched which we never delete */
+	unsigned is_initial_sched; 
+
+	/* wait for the tasks submitted to the context to be executed */
+	struct _starpu_barrier_counter tasks_barrier;
+
+	/* table of sched cond corresponding to each worker in this ctx */
+	_starpu_pthread_cond_t **sched_cond;
+
+	/* table of sched mutex corresponding to each worker in this ctx */
+	_starpu_pthread_mutex_t **sched_mutex;
+
+	/* cond to block push when there are no workers in the ctx */
+	_starpu_pthread_cond_t no_workers_cond;
+
+	/* mutex to block push when there are no workers in the ctx */
+	_starpu_pthread_mutex_t no_workers_mutex;
+
+	/* ready tasks that couldn't be pushed because the ctx has no workers */
+	struct starpu_task_list empty_ctx_tasks;
+
+	/* mutex protecting the empty_ctx_tasks list */
+	_starpu_pthread_mutex_t empty_ctx_mutex; 
+
+	/* min CPUs to execute */
+	int min_ncpus;
+
+	/* max CPUs to execute */
+	int max_ncpus;
+
+	/* min GPUs to execute */
+	int min_ngpus;
+
+	/* max GPUs to execute */
+	int max_ngpus;
+	
+	/* needed for overlapping contexts to help the workers
+	   determine which is the next context to pop tasks from */
+	unsigned pop_counter[STARPU_NMAXWORKERS];
+
+	/* in case we delete the context, leave resources to the inheritor */
+	unsigned inheritor;
+
+	/* indicates whether the application finished submitting tasks
+	   to this context */
+	unsigned finished_submit;
+
+#ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
+	/* a structure containing a series of performance counters determining the resize procedure */
+	struct starpu_performance_counters *perf_counters;
+#endif //STARPU_USE_SCHED_CTX_HYPERVISOR
+};
+
+struct _starpu_machine_config;
+
+/* init the sched_ctx_id of all contexts */
+void _starpu_init_all_sched_ctxs(struct _starpu_machine_config *config);
+
+/* init the list of contexts of the worker */
+void _starpu_init_sched_ctx_for_worker(unsigned workerid);
+
+/* allocate all structures belonging to a context */
+struct _starpu_sched_ctx*  _starpu_create_sched_ctx(const char *policy_name, int *workerid, int nworkerids, unsigned is_init_sched, const char *sched_name);
+
+/* delete all sched_ctx */
+void _starpu_delete_all_sched_ctxs();
+
+/* This function waits until all the tasks that were already submitted to a specific
+ * context have been executed. */
+int _starpu_wait_for_all_tasks_of_sched_ctx(unsigned sched_ctx_id);
+
+/* In order to implement starpu_wait_for_all_tasks_of_ctx, we keep track of the number of
+ * tasks currently submitted to the context */
+void _starpu_decrement_nsubmitted_tasks_of_sched_ctx(unsigned sched_ctx_id);
+void _starpu_increment_nsubmitted_tasks_of_sched_ctx(unsigned sched_ctx_id);
+
+/* Return the corresponding index of the workerid in the ctx table */
+int _starpu_get_index_in_ctx_of_workerid(unsigned sched_ctx, unsigned workerid);
+
+/* Get the total number of sched_ctxs created till now */
+unsigned _starpu_get_nsched_ctxs();
+
+/* Get the mutex corresponding to the global workerid */
+_starpu_pthread_mutex_t *_starpu_get_sched_mutex(struct _starpu_sched_ctx *sched_ctx, int worker);
+
+/* Get workers belonging to a certain context, it returns the number of workers 
+ take care: no mutex taken, the list of workers might not be updated */
+int starpu_get_workers_of_sched_ctx(unsigned sched_ctx_id, int *pus, enum starpu_archtype arch);
+
+#endif // __SCHED_CONTEXT_H__

+ 240 - 46
src/core/sched_policy.c

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2010-2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -25,8 +26,6 @@
 #include <common/barrier.h>
 #include <core/debug.h>
 
-static struct starpu_sched_policy policy;
-
 static int use_prefetch = 0;
 
 int starpu_get_prefetch_flag(void)
@@ -48,16 +47,16 @@ static struct starpu_sched_policy *predefined_policies[] =
 	&_starpu_sched_pgreedy_policy
 };
 
-struct starpu_sched_policy *_starpu_get_sched_policy(void)
+struct starpu_sched_policy *_starpu_get_sched_policy(struct _starpu_sched_ctx *sched_ctx)
 {
-	return &policy;
+	return sched_ctx->sched_policy;
 }
 
 /*
  *	Methods to initialize the scheduling policy
  */
 
-static void load_sched_policy(struct starpu_sched_policy *sched_policy)
+static void load_sched_policy(struct starpu_sched_policy *sched_policy, struct _starpu_sched_ctx *sched_ctx)
 {
 	STARPU_ASSERT(sched_policy);
 
@@ -72,7 +71,8 @@ static void load_sched_policy(struct starpu_sched_policy *sched_policy)
 	}
 #endif
 
-	memcpy(&policy, sched_policy, sizeof(policy));
+	struct starpu_sched_policy *policy = sched_ctx->sched_policy;
+	memcpy(policy, sched_policy, sizeof(*policy));
 }
 
 static struct starpu_sched_policy *find_sched_policy_from_name(const char *policy_name)
@@ -124,17 +124,26 @@ static void display_sched_help_message(void)
 	 }
 }
 
-static struct starpu_sched_policy *select_sched_policy(struct _starpu_machine_config *config)
+static struct starpu_sched_policy *select_sched_policy(struct _starpu_machine_config *config, const char *required_policy)
 {
 	struct starpu_sched_policy *selected_policy = NULL;
+	struct starpu_conf *user_conf = config->conf;
+
+	if(required_policy)
+		selected_policy = find_sched_policy_from_name(required_policy);
 
 	/* First, we check whether the application explicitely gave a scheduling policy or not */
-	if (config->conf->sched_policy)
-		return config->conf->sched_policy;
+	if (!selected_policy && user_conf && (user_conf->sched_policy))
+		return user_conf->sched_policy;
 
 	/* Otherwise, we look if the application specified the name of a policy to load */
-	if (config->conf->sched_policy_name)
-		selected_policy = find_sched_policy_from_name(config->conf->sched_policy_name);
+	const char *sched_pol_name;
+	sched_pol_name = getenv("STARPU_SCHED");
+	if (sched_pol_name == NULL && user_conf && user_conf->sched_policy_name)
+		sched_pol_name = user_conf->sched_policy_name;
+
+	if (!selected_policy && sched_pol_name)
+		selected_policy = find_sched_policy_from_name(sched_pol_name);
 
 	/* Perhaps there was no policy that matched the name */
 	if (selected_policy)
@@ -144,7 +153,7 @@ static struct starpu_sched_policy *select_sched_policy(struct _starpu_machine_co
 	return &_starpu_sched_eager_policy;
 }
 
-void _starpu_init_sched_policy(struct _starpu_machine_config *config)
+void _starpu_init_sched_policy(struct _starpu_machine_config *config, struct _starpu_sched_ctx *sched_ctx, const char *required_policy)
 {
 	/* Perhaps we have to display some help */
 	display_sched_help_message();
@@ -158,17 +167,18 @@ void _starpu_init_sched_policy(struct _starpu_machine_config *config)
 	_starpu_set_calibrate_flag(config->conf->calibrate);
 
 	struct starpu_sched_policy *selected_policy;
-	selected_policy = select_sched_policy(config);
+	selected_policy = select_sched_policy(config, required_policy);
 
-	load_sched_policy(selected_policy);
+	load_sched_policy(selected_policy, sched_ctx);
 
-	policy.init_sched(&config->topology, &policy);
+	sched_ctx->sched_policy->init_sched(sched_ctx->id);
 }
 
-void _starpu_deinit_sched_policy(struct _starpu_machine_config *config)
+void _starpu_deinit_sched_policy(struct _starpu_sched_ctx *sched_ctx)
 {
-	if (policy.deinit_sched)
-		policy.deinit_sched(&config->topology, &policy);
+	struct starpu_sched_policy *policy = sched_ctx->sched_policy;
+	if (policy->deinit_sched)
+		policy->deinit_sched(sched_ctx->id);
 }
 
 /* Enqueue a task into the list of tasks explicitely attached to a worker. In
@@ -199,8 +209,22 @@ static int _starpu_push_task_on_specific_worker(struct starpu_task *task, int wo
 	if (use_prefetch)
 		starpu_prefetch_task_input_on_node(task, memory_node);
 
-	if (policy.push_task_notify)
-		policy.push_task_notify(task, workerid);
+	/* if we push a task on a specific worker, notify all the sched_ctxs the worker belongs to */
+	unsigned i;
+	struct _starpu_sched_ctx *sched_ctx;
+	for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
+	{
+		sched_ctx = worker->sched_ctx[i];
+		if (sched_ctx != NULL && sched_ctx->sched_policy != NULL && sched_ctx->sched_policy->push_task_notify)
+		{
+			sched_ctx->sched_policy->push_task_notify(task, workerid);
+		}
+
+	}
+
+#ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
+	starpu_call_pushed_task_cb(workerid, task->sched_ctx);
+#endif //STARPU_USE_SCHED_CTX_HYPERVISOR
 
 	if (is_basic_worker)
 	{
@@ -228,7 +252,10 @@ static int _starpu_push_task_on_specific_worker(struct starpu_task *task, int wo
 			for (i = 0; i < task->cl->nbuffers; i++)
 				task->handles[i]->mf_node = node;
 		}
-		return _starpu_push_local_task(worker, task, 0);
+		if(task->priority > 0)
+			return _starpu_push_local_task(worker, task, 1);
+		else
+			return _starpu_push_local_task(worker, task, 0);
 	}
 	else
 	{
@@ -237,7 +264,6 @@ static int _starpu_push_task_on_specific_worker(struct starpu_task *task, int wo
 		int *combined_workerid = combined_worker->combined_workerid;
 
 		int ret = 0;
-		int i;
 
 		struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
 		j->task_size = worker_size;
@@ -247,6 +273,7 @@ static int _starpu_push_task_on_specific_worker(struct starpu_task *task, int wo
 		_STARPU_PTHREAD_BARRIER_INIT(&j->before_work_barrier, NULL, worker_size);
 		_STARPU_PTHREAD_BARRIER_INIT(&j->after_work_barrier, NULL, worker_size);
 
+		int i;
 		for (i = 0; i < worker_size; i++)
 		{
 			struct starpu_task *alias = _starpu_create_task_alias(task);
@@ -259,11 +286,60 @@ static int _starpu_push_task_on_specific_worker(struct starpu_task *task, int wo
 	}
 }
 
+/* Count the ctx's workers that can both execute the task and currently
+ * hold the ctx's turn (time-sharing); used to decide whether the ctx must
+ * be treated as empty when pushing. */
+static int _starpu_nworkers_able_to_execute_task(struct starpu_task *task, struct _starpu_sched_ctx *sched_ctx)
+{
+	int worker = -1, nworkers = 0;
+	struct worker_collection *workers = sched_ctx->workers;
+	if(workers->init_cursor)
+		workers->init_cursor(workers);
+
+	while(workers->has_next(workers))
+	{
+		worker = workers->get_next(workers);
+		if (starpu_worker_can_execute_task(worker, task, 0) && starpu_is_ctxs_turn(worker, sched_ctx->id))
+			nworkers++;
+	}
+
+	/* NOTE(review): guard tests init_cursor but calls deinit_cursor —
+	 * presumably both are set together */
+	if(workers->init_cursor)
+		workers->deinit_cursor(workers);
+	return nworkers;
+}
+
 /* the generic interface that call the proper underlying implementation */
+
 int _starpu_push_task(struct _starpu_job *j)
 {
 	struct starpu_task *task = j->task;
-        _STARPU_LOG_IN();
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(task->sched_ctx);
+	unsigned nworkers = 0;
+
+	if(!sched_ctx->is_initial_sched)
+	{
+		/*if there are workers in the ctx that are not able to execute tasks
+		  we consider the ctx empty */
+		nworkers = _starpu_nworkers_able_to_execute_task(task, sched_ctx);
+
+		if(nworkers == 0)
+		{
+			if(task->already_pushed)
+			{
+				_STARPU_PTHREAD_MUTEX_LOCK(&sched_ctx->empty_ctx_mutex);
+				starpu_task_list_push_back(&sched_ctx->empty_ctx_tasks, task);
+				_STARPU_PTHREAD_MUTEX_UNLOCK(&sched_ctx->empty_ctx_mutex);
+				return -1;
+			}
+			else
+			{
+				_STARPU_PTHREAD_MUTEX_LOCK(&sched_ctx->empty_ctx_mutex);
+				task->already_pushed = 1;
+				starpu_task_list_push_front(&sched_ctx->empty_ctx_tasks, task);
+				_STARPU_PTHREAD_MUTEX_UNLOCK(&sched_ctx->empty_ctx_mutex);
+				return 0;
+			}
+		}
+	}
+
+	_STARPU_LOG_IN();
 
 	_starpu_increment_nready_tasks();
 	task->status = STARPU_TASK_READY;
@@ -281,25 +357,31 @@ int _starpu_push_task(struct _starpu_job *j)
 	if (task->cl == NULL)
 	{
 		_starpu_handle_job_termination(j);
-                _STARPU_LOG_OUT_TAG("handle_job_termination");
+		_STARPU_LOG_OUT_TAG("handle_job_termination");
 		return 0;
 	}
 
-        int ret;
+	int ret;
 	if (STARPU_UNLIKELY(task->execute_on_a_specific_worker))
 	{
 		ret = _starpu_push_task_on_specific_worker(task, task->workerid);
 	}
 	else
 	{
-		STARPU_ASSERT(policy.push_task);
-		ret = policy.push_task(task);
+		STARPU_ASSERT(sched_ctx->sched_policy->push_task);
+		ret = sched_ctx->sched_policy->push_task(task);
+		if(ret == -1)
+		{
+			fprintf(stderr, "repush task \n");
+			_starpu_decrement_nready_tasks();
+			ret = _starpu_push_task(j);
+		}
 	}
 
 	_starpu_profiling_set_task_push_end_time(task);
 
-        _STARPU_LOG_OUT();
-        return ret;
+	_STARPU_LOG_OUT();
+	return ret;
 }
 
 /*
@@ -382,6 +464,38 @@ struct starpu_task *_starpu_create_conversion_task(starpu_data_handle_t handle,
 	return conversion_task;
 }
 
+/* Pick the next ctx the worker should pop tasks from: the ctx with the
+ * smallest pop_counter for this worker that is still below nsched_ctxs.
+ * When every ctx has had its share, reset all counters and recurse once
+ * to restart the round-robin cycle. */
+struct _starpu_sched_ctx* _get_next_sched_ctx_to_pop_into(struct _starpu_worker *worker)
+{
+	struct _starpu_sched_ctx *sched_ctx, *good_sched_ctx = NULL;
+	unsigned smallest_counter =  worker->nsched_ctxs;
+	unsigned i;
+	for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
+	{
+		sched_ctx = worker->sched_ctx[i];
+
+		if(sched_ctx != NULL && sched_ctx->id != STARPU_NMAX_SCHED_CTXS &&
+		   sched_ctx->pop_counter[worker->workerid] < worker->nsched_ctxs &&
+		   smallest_counter > sched_ctx->pop_counter[worker->workerid])
+		{
+			good_sched_ctx = sched_ctx;
+			smallest_counter = sched_ctx->pop_counter[worker->workerid];
+		}
+	}
+
+	if(good_sched_ctx == NULL)
+	{
+		/* all counters saturated: reset them and retry */
+		for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
+		{
+			sched_ctx = worker->sched_ctx[i];
+			if(sched_ctx != NULL && sched_ctx->id != STARPU_NMAX_SCHED_CTXS)
+				sched_ctx->pop_counter[worker->workerid] = 0;
+		}
+
+		return _get_next_sched_ctx_to_pop_into(worker);
+	}
+	return good_sched_ctx;
+}
+
 struct starpu_task *_starpu_pop_task(struct _starpu_worker *worker)
 {
 	struct starpu_task *task;
@@ -396,16 +510,82 @@ struct starpu_task *_starpu_pop_task(struct _starpu_worker *worker)
 		_starpu_clock_gettime(&pop_start_time);
 
 pick:
+	_STARPU_PTHREAD_MUTEX_LOCK(&worker->sched_mutex);
 	/* perhaps there is some local task to be executed first */
 	task = _starpu_pop_local_task(worker);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&worker->sched_mutex);
+
+
+	/* get tasks from the stacks of the strategy */
+	if(!task)
+	{
+		struct _starpu_sched_ctx *sched_ctx;
+		_starpu_pthread_mutex_t *sched_ctx_mutex;
+
+		int been_here[STARPU_NMAX_SCHED_CTXS];
+		int i;
+		for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
+			been_here[i] = 0;
+
+		while(!task)
+		{
+			if(worker->nsched_ctxs == 1)
+				sched_ctx = _starpu_get_initial_sched_ctx();
+			else
+				sched_ctx = _get_next_sched_ctx_to_pop_into(worker);
+			if(sched_ctx != NULL && sched_ctx->id != STARPU_NMAX_SCHED_CTXS)
+			{
+				sched_ctx_mutex = _starpu_get_sched_mutex(sched_ctx, worker->workerid);
+				if(sched_ctx_mutex != NULL)
+				{
+					_STARPU_PTHREAD_MUTEX_LOCK(sched_ctx_mutex);
+
+					if (sched_ctx->sched_policy && sched_ctx->sched_policy->pop_task)
+						task = sched_ctx->sched_policy->pop_task(sched_ctx->id);
+
+					_STARPU_PTHREAD_MUTEX_UNLOCK(sched_ctx_mutex);
+
+				}
+			}
+
+			if((!task && sched_ctx->pop_counter[worker->workerid] == 0 && been_here[sched_ctx->id]) || worker->nsched_ctxs == 1)
+				break;
+
+
+			been_here[sched_ctx->id] = 1;
+
+			sched_ctx->pop_counter[worker->workerid]++;
+
+		}
+
+	  }
+
+#ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
+	struct _starpu_sched_ctx *sched_ctx = NULL;
+	struct starpu_performance_counters *perf_counters = NULL;
+	int j;
+	for(j = 0; j < STARPU_NMAX_SCHED_CTXS; j++)
+	{
+		sched_ctx = worker->sched_ctx[j];
+		if(sched_ctx != NULL && sched_ctx->id != 0)
+		{
+			perf_counters = sched_ctx->perf_counters;
+			if(perf_counters != NULL && perf_counters->notify_idle_cycle && perf_counters->notify_idle_end)
+			{
+				if(!task)
+					perf_counters->notify_idle_cycle(sched_ctx->id, worker->workerid, 1.0);
+				else
+					perf_counters->notify_idle_end(sched_ctx->id, worker->workerid);
+			}
+		}
+	}
+#endif //STARPU_USE_SCHED_CTX_HYPERVISOR
 
-	if (!task && policy.pop_task)
-		task = policy.pop_task();
 
 	if (!task)
-		return NULL;
+		goto profiling;
 
-	/* Make sure we do not bother with all the multiformat-specific code if 
+	/* Make sure we do not bother with all the multiformat-specific code if
 	 * it is not necessary. */
 	if (!_starpu_task_uses_multiformat_handles(task))
 		goto profiling;
@@ -423,7 +603,7 @@ pick:
 	node = starpu_worker_get_memory_node(worker_id);
 
 	/*
-	 * We do have a task that uses multiformat handles. Let's create the 
+	 * We do have a task that uses multiformat handles. Let's create the
 	 * required conversion tasks.
 	 */
 	unsigned i;
@@ -451,7 +631,7 @@ pick:
 	goto pick;
 
 profiling:
-	if (profiling)
+	if (profiling && task)
 	{
 		struct starpu_task_profiling_info *profiling_info;
 		profiling_info = task->profiling_info;
@@ -470,31 +650,42 @@ profiling:
 	return task;
 }
 
-struct starpu_task *_starpu_pop_every_task(void)
+struct starpu_task *_starpu_pop_every_task(struct _starpu_sched_ctx *sched_ctx)
 {
-	STARPU_ASSERT(policy.pop_every_task);
+	STARPU_ASSERT(sched_ctx->sched_policy->pop_every_task);
 
 	/* TODO set profiling info */
-	return policy.pop_every_task();
+	if(sched_ctx->sched_policy->pop_every_task)
+		return sched_ctx->sched_policy->pop_every_task(sched_ctx->id);
+	return NULL;
 }
 
 void _starpu_sched_pre_exec_hook(struct starpu_task *task)
 {
-	if (policy.pre_exec_hook)
-		policy.pre_exec_hook(task);
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(task->sched_ctx);
+	if (sched_ctx->sched_policy->pre_exec_hook)
+		sched_ctx->sched_policy->pre_exec_hook(task);
 }
 
 void _starpu_sched_post_exec_hook(struct starpu_task *task)
 {
-	if (policy.post_exec_hook)
-		policy.post_exec_hook(task);
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(task->sched_ctx);
+
+#ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
+	if(task->hypervisor_tag > 0 && sched_ctx != NULL &&
+	   sched_ctx->id != 0 && sched_ctx->perf_counters != NULL)
+		sched_ctx->perf_counters->notify_post_exec_hook(sched_ctx->id, task->hypervisor_tag);
+#endif //STARPU_USE_SCHED_CTX_HYPERVISOR
+
+	if (sched_ctx->sched_policy->post_exec_hook)
+		sched_ctx->sched_policy->post_exec_hook(task);
 }
 
 void _starpu_wait_on_sched_event(void)
 {
 	struct _starpu_worker *worker = _starpu_get_local_worker_key();
 
-	_STARPU_PTHREAD_MUTEX_LOCK(worker->sched_mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&worker->sched_mutex);
 
 	_starpu_handle_all_pending_node_data_requests(worker->memory_node);
 
@@ -506,7 +697,7 @@ void _starpu_wait_on_sched_event(void)
 #endif
 	}
 
-	_STARPU_PTHREAD_MUTEX_UNLOCK(worker->sched_mutex);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&worker->sched_mutex);
 }
 
 /* The scheduling policy may put tasks directly into a worker's local queue so
@@ -518,6 +709,9 @@ int starpu_push_local_task(int workerid, struct starpu_task *task, int back)
 {
 	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
 
-	return _starpu_push_local_task(worker, task, back);
-}
+	int ret =  _starpu_push_local_task(worker, task, back);
 
+	task->scheduled = 1;
+
+	return ret;
+}

+ 9 - 6
src/core/sched_policy.h

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2012  Université de Bordeaux 1
+ * Copyright (C) 2011  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -19,20 +20,22 @@
 
 #include <starpu.h>
 #include <core/workers.h>
-
+#include <core/sched_ctx.h>
 #include <starpu_scheduler.h>
 
-struct _starpu_machine_config;
-struct starpu_sched_policy *_starpu_get_sched_policy(void);
+struct starpu_machine_config;
+struct starpu_sched_policy *_starpu_get_sched_policy( struct _starpu_sched_ctx *sched_ctx);
+
+void _starpu_init_sched_policy(struct _starpu_machine_config *config, 
+			       struct _starpu_sched_ctx *sched_ctx, const char *required_policy);
 
-void _starpu_init_sched_policy(struct _starpu_machine_config *config);
-void _starpu_deinit_sched_policy(struct _starpu_machine_config *config);
+void _starpu_deinit_sched_policy(struct _starpu_sched_ctx *sched_ctx);
 
 int _starpu_push_task(struct _starpu_job *task);
 /* pop a task that can be executed on the worker */
 struct starpu_task *_starpu_pop_task(struct _starpu_worker *worker);
 /* pop every task that can be executed on the worker */
-struct starpu_task *_starpu_pop_every_task(void);
+struct starpu_task *_starpu_pop_every_task(struct _starpu_sched_ctx *sched_ctx);
 void _starpu_sched_post_exec_hook(struct starpu_task *task);
 
 void _starpu_wait_on_sched_event(void);

+ 100 - 25
src/core/task.c

@@ -3,6 +3,7 @@
  * Copyright (C) 2009-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
+ * Copyright (C) 2011  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -19,6 +20,7 @@
 #include <starpu.h>
 #include <starpu_profiling.h>
 #include <core/workers.h>
+#include <core/sched_ctx.h>
 #include <core/jobs.h>
 #include <core/task.h>
 #include <core/task_bundle.h>
@@ -70,6 +72,9 @@ void starpu_task_init(struct starpu_task *task)
 	task->predicted_transfer = NAN;
 
 	task->magic = 42;
+	task->sched_ctx = _starpu_get_initial_sched_ctx()->id;
+	
+	task->flops = 0.0;
 }
 
 /* Free all the ressources allocated for a task, without deallocating the task
@@ -200,16 +205,28 @@ int _starpu_submit_job(struct _starpu_job *j)
 
 	struct starpu_task *task = j->task;
 
-        _STARPU_LOG_IN();
+	_STARPU_LOG_IN();
 	/* notify bound computation of a new task */
 	_starpu_bound_record(j);
 
 	_starpu_increment_nsubmitted_tasks();
+	_starpu_increment_nsubmitted_tasks_of_sched_ctx(j->task->sched_ctx);
+	
+#ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(j->task->sched_ctx);
+	if(sched_ctx != NULL && j->task->sched_ctx != 0 && j->task->sched_ctx != STARPU_NMAX_SCHED_CTXS
+	   && sched_ctx->perf_counters != NULL)
+	{
+		_starpu_compute_buffers_footprint(j->task->cl->model, STARPU_CPU_DEFAULT, 0, j);
+		sched_ctx->perf_counters->notify_submitted_job(j->task, j->footprint);
+	}
+#endif
 
 	/* We retain handle reference count */
 	if (task->cl) {
 		unsigned i;
-		for (i=0; i<task->cl->nbuffers; i++) {
+		for (i=0; i<task->cl->nbuffers; i++) 
+		{
 			starpu_data_handle_t handle = task->handles[i];
 			_starpu_spin_lock(&handle->header_lock);
 			handle->busy_count++;
@@ -229,8 +246,8 @@ int _starpu_submit_job(struct _starpu_job *j)
 
 	int ret = _starpu_enforce_deps_and_schedule(j);
 
-        _STARPU_LOG_OUT();
-        return ret;
+	_STARPU_LOG_OUT();
+	return ret;
 }
 
 void _starpu_codelet_check_deprecated_fields(struct starpu_codelet *cl)
@@ -346,21 +363,28 @@ int starpu_task_submit(struct starpu_task *task)
 {
 	STARPU_ASSERT(task);
 	STARPU_ASSERT(task->magic == 42);
+	unsigned nsched_ctxs = _starpu_get_nsched_ctxs();
+	unsigned set_sched_ctx = STARPU_NMAX_SCHED_CTXS;
+	
+	if(task->sched_ctx == 0 && nsched_ctxs != 1 && !task->control_task)
+		set_sched_ctx = starpu_get_sched_ctx();
+	if(set_sched_ctx != STARPU_NMAX_SCHED_CTXS)
+		task->sched_ctx = set_sched_ctx;
 
 	int ret;
 	unsigned is_sync = task->synchronous;
 	starpu_task_bundle_t bundle = task->bundle;
-        _STARPU_LOG_IN();
+	_STARPU_LOG_IN();
 
 	if (is_sync)
 	{
 		/* Perhaps it is not possible to submit a synchronous
 		 * (blocking) task */
-                if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls()))
+		if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls()))
 		{
-                        _STARPU_LOG_OUT_TAG("EDEADLK");
+			_STARPU_LOG_OUT_TAG("EDEADLK");
 			return -EDEADLK;
-                }
+		}
 
 		task->detach = 0;
 	}
@@ -393,26 +417,45 @@ int starpu_task_submit(struct starpu_task *task)
 		/* Check the type of worker(s) required by the task exist */
 		if (!_starpu_worker_exists(task))
 		{
-                        _STARPU_LOG_OUT_TAG("ENODEV");
+			_STARPU_LOG_OUT_TAG("ENODEV");
 			return -ENODEV;
-                }
+		}
 
 		/* In case we require that a task should be explicitely
 		 * executed on a specific worker, we make sure that the worker
 		 * is able to execute this task.  */
 		if (task->execute_on_a_specific_worker && !starpu_combined_worker_can_execute_task(task->workerid, task, 0))
 		{
-                        _STARPU_LOG_OUT_TAG("ENODEV");
+			_STARPU_LOG_OUT_TAG("ENODEV");
 			return -ENODEV;
-                }
+		}
 
 		_starpu_detect_implicit_data_deps(task);
 
-		if (task->cl->model && task->cl->model->symbol)
-			_starpu_load_perfmodel(task->cl->model);
 
-		if (task->cl->power_model && task->cl->power_model->symbol)
-			_starpu_load_perfmodel(task->cl->power_model);
+		if(task->bundle)
+		{
+			struct _starpu_task_bundle_entry *entry;
+			entry = task->bundle->list;
+			while(entry)
+			{
+				if (entry->task->cl->model && task->cl->model->symbol)
+					_starpu_load_perfmodel(entry->task->cl->model);
+				
+				if (entry->task->cl->power_model && task->cl->power_model->symbol)
+					_starpu_load_perfmodel(entry->task->cl->power_model);
+
+				entry = entry->next;
+			}
+		}
+		else
+		{
+			if (task->cl->model && task->cl->model->symbol)
+				_starpu_load_perfmodel(task->cl->model);
+			
+			if (task->cl->power_model && task->cl->power_model->symbol)
+				_starpu_load_perfmodel(task->cl->power_model);
+		}
 	}
 
 	if (bundle)
@@ -467,6 +510,19 @@ int starpu_task_submit(struct starpu_task *task)
 	return ret;
 }
 
+int _starpu_task_submit_internally(struct starpu_task *task)
+{
+	task->control_task = 1;
+	return starpu_task_submit(task);
+}
+
+/* application should submit new tasks to StarPU through this function */
+int starpu_task_submit_to_ctx(struct starpu_task *task, unsigned sched_ctx_id)
+{
+	task->sched_ctx = sched_ctx_id;
+	return starpu_task_submit(task);
+}
+
 /* The StarPU core can submit tasks directly to the scheduler or a worker,
  * skipping dependencies completely (when it knows what it is doing).  */
 int _starpu_task_submit_nodeps(struct starpu_task *task)
@@ -485,7 +541,7 @@ int _starpu_task_submit_nodeps(struct starpu_task *task)
 
 	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
 	_starpu_increment_nsubmitted_tasks();
-
+	_starpu_increment_nsubmitted_tasks_of_sched_ctx(j->task->sched_ctx);
 	_STARPU_PTHREAD_MUTEX_LOCK(&j->sync_mutex);
 
 	j->submitted = 1;
@@ -536,6 +592,7 @@ int _starpu_task_submit_conversion_task(struct starpu_task *task,
 
 	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
 	_starpu_increment_nsubmitted_tasks();
+	_starpu_increment_nsubmitted_tasks_of_sched_ctx(j->task->sched_ctx);
 	_STARPU_PTHREAD_MUTEX_LOCK(&j->sync_mutex);
 	j->submitted = 1;
 	_starpu_increment_nready_tasks();
@@ -603,25 +660,43 @@ void starpu_display_codelet_stats(struct starpu_codelet *cl)
  */
 int starpu_task_wait_for_all(void)
 {
-	if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls()))
-		return -EDEADLK;
+	unsigned nsched_ctxs = _starpu_get_nsched_ctxs();
+	unsigned sched_ctx = nsched_ctxs == 1 ? 0 : starpu_get_sched_ctx();
 
-	_STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
+	/* if there is no indication about which context to wait,
+	   we wait for all tasks submitted to starpu */
+	if(sched_ctx == STARPU_NMAX_SCHED_CTXS)
+	{
+		if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls()))
+			return -EDEADLK;
 
-	_STARPU_TRACE_TASK_WAIT_FOR_ALL;
+		_STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
 
-	while (nsubmitted > 0)
-		_STARPU_PTHREAD_COND_WAIT(&submitted_cond, &submitted_mutex);
+		_STARPU_TRACE_TASK_WAIT_FOR_ALL;
 
-	_STARPU_PTHREAD_MUTEX_UNLOCK(&submitted_mutex);
+		while (nsubmitted > 0)
+			_STARPU_PTHREAD_COND_WAIT(&submitted_cond, &submitted_mutex);
+
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&submitted_mutex);
 
 #ifdef HAVE_AYUDAME_H
+		if (AYU_event) AYU_event(AYU_BARRIER, 0, NULL);
+#endif
+	}
+	else
+		_starpu_wait_for_all_tasks_of_sched_ctx(sched_ctx);
+	return 0;
+}
+
+int starpu_task_wait_for_all_in_ctx(unsigned sched_ctx)
+{
+	_starpu_wait_for_all_tasks_of_sched_ctx(sched_ctx);
+#ifdef HAVE_AYUDAME_H
 	if (AYU_event) AYU_event(AYU_BARRIER, 0, NULL);
 #endif
 
 	return 0;
 }
-
 /*
  * We wait until there is no ready task any more (i.e. StarPU will not be able
  * to progress any more).

+ 4 - 0
src/core/task.h

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2009-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011 INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -53,6 +54,9 @@ struct _starpu_job *_starpu_get_job_associated_to_task(struct starpu_task *task)
 
 struct starpu_task *_starpu_create_task_alias(struct starpu_task *task);
 
+/* Submits starpu internal tasks to the initial context */
+int _starpu_task_submit_internally(struct starpu_task *task);
+
 int _starpu_handle_needs_conversion_task(starpu_data_handle_t handle,
 					 unsigned int node);
 

+ 7 - 0
src/core/topology.c

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2009-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012 Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -449,6 +450,8 @@ _starpu_init_machine_config (struct _starpu_machine_config *config)
 
 	topology->nworkers = 0;
 	topology->ncombinedworkers = 0;
+	topology->nsched_ctxs = 0;
+
 #ifdef STARPU_USE_OPENCL
 	_starpu_opencl_init();
 #endif
@@ -505,6 +508,7 @@ _starpu_init_machine_config (struct _starpu_machine_config *config)
 		config->workers[worker_idx].devid = devid;
 		config->workers[worker_idx].perf_arch = arch;
 		config->workers[worker_idx].worker_mask = STARPU_CUDA;
+		_starpu_init_sched_ctx_for_worker(config->workers[topology->nworkers + cudagpu].workerid);
 		config->worker_mask |= STARPU_CUDA;
 
 		struct handle_entry *entry;
@@ -580,6 +584,7 @@ _starpu_init_machine_config (struct _starpu_machine_config *config)
 		config->workers[worker_idx].devid = devid;
 		config->workers[worker_idx].perf_arch = arch;
 		config->workers[worker_idx].worker_mask = STARPU_OPENCL;
+		_starpu_init_sched_ctx_for_worker(config->workers[topology->nworkers + openclgpu].workerid);
 		config->worker_mask |= STARPU_OPENCL;
 	}
 
@@ -619,6 +624,7 @@ _starpu_init_machine_config (struct _starpu_machine_config *config)
 		config->workers[worker_idx].id = spu;
 		config->workers[worker_idx].worker_is_running = 0;
 		config->workers[worker_idx].worker_mask = STARPU_GORDON;
+		_starpu_init_sched_ctx_for_worker(config->workers[topology->nworkers + spu].workerid);
 		config->worker_mask |= STARPU_GORDON;
 	}
 
@@ -664,6 +670,7 @@ _starpu_init_machine_config (struct _starpu_machine_config *config)
 		config->workers[worker_idx].devid = cpu;
 		config->workers[worker_idx].worker_mask = STARPU_CPU;
 		config->worker_mask |= STARPU_CPU;
+		_starpu_init_sched_ctx_for_worker(config->workers[topology->nworkers + cpu].workerid);
 	}
 
 	topology->nworkers += topology->ncpus;

+ 126 - 18
src/core/workers.c

@@ -4,6 +4,7 @@
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011  Institut National de Recherche en Informatique et Automatique
  * Copyright (C) 2011  Télécom-SudParis
+ * Copyright (C) 2011-2012  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -246,8 +247,8 @@ static struct _starpu_worker_set gordon_worker_set;
 
 static void _starpu_init_worker_queue(struct _starpu_worker *workerarg)
 {
-	_starpu_pthread_cond_t *cond = workerarg->sched_cond;
-	_starpu_pthread_mutex_t *mutex = workerarg->sched_mutex;
+	_starpu_pthread_cond_t *cond = &workerarg->sched_cond;
+	_starpu_pthread_mutex_t *mutex = &workerarg->sched_mutex;
 
 	unsigned memory_node = workerarg->memory_node;
 
@@ -360,14 +361,23 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 
 		workerarg->config = config;
 
+		_starpu_barrier_counter_init(&workerarg->tasks_barrier, 0);
+
 		_STARPU_PTHREAD_MUTEX_INIT(&workerarg->mutex, NULL);
 		_STARPU_PTHREAD_COND_INIT(&workerarg->ready_cond, NULL);
 
 		workerarg->worker_size = 1;
 		workerarg->combined_workerid = workerarg->workerid;
 		workerarg->current_rank = 0;
+		workerarg->has_prev_init = 0;
+		/* mutex + cond only for the local list */
+		/* we have a single local list */
+		/* afterwards there would be a mutex + cond for the list of each strategy */
 		workerarg->run_by_starpu = 1;
 
+		_STARPU_PTHREAD_MUTEX_INIT(&workerarg->sched_mutex, NULL);
+		_STARPU_PTHREAD_COND_INIT(&workerarg->sched_cond, NULL);
+
 		/* if some codelet's termination cannot be handled directly :
 		 * for instance in the Gordon driver, Gordon tasks' callbacks
 		 * may be executed by another thread than that of the Gordon
@@ -720,6 +730,13 @@ int starpu_init(struct starpu_conf *user_conf)
 		AYU_event(AYU_PREINIT, 0, (void*) &ayu_rt);
 	}
 #endif
+	_starpu_open_debug_logfile();
+
+	_starpu_data_interface_init();
+
+	_starpu_timing_init();
+
+	_starpu_profiling_init();
 
 	/* store the pointer to the user explicit configuration during the
 	 * initialization */
@@ -739,8 +756,12 @@ int starpu_init(struct starpu_conf *user_conf)
 	     config.conf = user_conf;
 	     config.default_conf = 0;
 	}
+
+	_starpu_load_bus_performance_files();
+
 	_starpu_conf_check_environment(config.conf);
 
+	_starpu_init_all_sched_ctxs(&config);
 	_starpu_init_progression_hooks();
 
 	_starpu_init_tags();
@@ -749,16 +770,6 @@ int starpu_init(struct starpu_conf *user_conf)
 	_starpu_start_fxt_profiling();
 #endif
 
-	_starpu_open_debug_logfile();
-
-	_starpu_data_interface_init();
-
-	_starpu_timing_init();
-
-	_starpu_profiling_init();
-
-	_starpu_load_bus_performance_files();
-
 	ret = _starpu_build_topology(&config);
 	if (ret)
 	{
@@ -775,8 +786,10 @@ int starpu_init(struct starpu_conf *user_conf)
 	 * threads */
 	_starpu_initialize_current_task_key();
 
-	/* initialize the scheduling policy */
-	_starpu_init_sched_policy(&config);
+	if(user_conf == NULL)
+		_starpu_create_sched_ctx(NULL, NULL, -1, 1, "init");
+	else
+		_starpu_create_sched_ctx(user_conf->sched_policy_name, NULL, -1, 1, "init");
 
 	_starpu_initialize_registered_performance_models();
 
@@ -793,6 +806,10 @@ int starpu_init(struct starpu_conf *user_conf)
 	return 0;
 }
 
+void starpu_profiling_init()
+{
+	_starpu_profiling_init();
+}
 /*
  * Handle runtime termination
  */
@@ -901,6 +918,16 @@ static void _starpu_kill_all_workers(struct _starpu_machine_config *config)
 	starpu_wake_all_blocked_workers();
 }
 
+void starpu_display_stats()
+{
+	const char *stats;
+	if ((stats = getenv("STARPU_BUS_STATS")) && atoi(stats))
+		starpu_bus_profiling_helper_display_summary();
+
+	if ((stats = getenv("STARPU_WORKER_STATS")) && atoi(stats))
+		starpu_worker_profiling_helper_display_summary();
+}
+
 void starpu_shutdown(void)
 {
 	_STARPU_PTHREAD_MUTEX_LOCK(&init_mutex);
@@ -948,7 +975,7 @@ void starpu_shutdown(void)
 	/* wait for their termination */
 	_starpu_terminate_workers(&config);
 
-	_starpu_deinit_sched_policy(&config);
+	_starpu_delete_all_sched_ctxs();
 
 	_starpu_destroy_topology(&config);
 
@@ -1129,6 +1156,17 @@ struct _starpu_worker *_starpu_get_worker_struct(unsigned id)
 	return &config.workers[id];
 }
 
+unsigned starpu_worker_is_combined_worker(int id)
+{
+	return id >= (int)config.topology.nworkers;
+}
+
+struct _starpu_sched_ctx *_starpu_get_sched_ctx_struct(unsigned id)
+{
+        STARPU_ASSERT(id <= STARPU_NMAX_SCHED_CTXS);
+	return &config.sched_ctxs[id];
+}
+
 struct _starpu_combined_worker *_starpu_get_combined_worker_struct(unsigned id)
 {
 	unsigned basic_worker_count = starpu_worker_get_count();
@@ -1184,10 +1222,80 @@ void _starpu_worker_set_status(int workerid, enum _starpu_worker_status status)
 	config.workers[workerid].status = status;
 }
 
-void starpu_worker_set_sched_condition(int workerid, _starpu_pthread_cond_t *sched_cond, _starpu_pthread_mutex_t *sched_mutex)
+int starpu_worker_get_nids_by_type(enum starpu_archtype type, int *workerids, int maxsize)
+{
+	unsigned nworkers = starpu_worker_get_count();
+
+	int cnt = 0;
+
+	unsigned id;
+	for (id = 0; id < nworkers; id++)
+	{
+		if (starpu_worker_get_type(id) == type)
+		{
+			/* Perhaps the array is too small ? */
+			if (cnt >= maxsize)
+				return cnt;
+
+			workerids[cnt++] = id;
+		}
+	}
+
+	return cnt;
+}
+
+int starpu_worker_get_nids_ctx_free_by_type(enum starpu_archtype type, int *workerids, int maxsize)
+{
+	unsigned nworkers = starpu_worker_get_count();
+
+	int cnt = 0;
+
+	unsigned id, worker;
+	unsigned found = 0;
+	for (id = 0; id < nworkers; id++)
+	{
+		found = 0;
+		if (starpu_worker_get_type(id) == type)
+		{
+			/* Perhaps the array is too small ? */
+			if (cnt >= maxsize)
+				return cnt;
+			int s;
+			for(s = 1; s < STARPU_NMAX_SCHED_CTXS; s++)
+			{
+				if(config.sched_ctxs[s].id != STARPU_NMAX_SCHED_CTXS)
+				{
+					struct worker_collection *workers = config.sched_ctxs[s].workers;
+					if(workers->init_cursor)
+						workers->init_cursor(workers);
+					
+					while(workers->has_next(workers))
+					{
+						worker = workers->get_next(workers);
+						if(worker == id)
+						{
+							found = 1;
+							break;
+						}
+					}
+					
+					if(workers->init_cursor)
+						workers->deinit_cursor(workers);
+					if(found) break;
+				}
+			}
+			if(!found)
+				workerids[cnt++] = id;
+		}
+	}
+
+	return cnt;
+}
+
+
+struct _starpu_sched_ctx* _starpu_get_initial_sched_ctx(void)
 {
-	config.workers[workerid].sched_cond = sched_cond;
-	config.workers[workerid].sched_mutex = sched_mutex;
+	return &config.sched_ctxs[0];
 }
 
 int

+ 29 - 6
src/core/workers.h

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2009-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -29,7 +30,7 @@
 #include <core/sched_policy.h>
 #include <core/topology.h>
 #include <core/errorcheck.h>
-
+#include <core/sched_ctx.h>
 
 #ifdef STARPU_HAVE_HWLOC
 #include <hwloc.h>
@@ -69,8 +70,8 @@ struct _starpu_worker
 	int worker_size; /* size of the worker in case we use a combined worker */
         _starpu_pthread_cond_t ready_cond; /* indicate when the worker is ready */
 	unsigned memory_node; /* which memory node is the worker associated with ? */
-	_starpu_pthread_cond_t *sched_cond; /* condition variable used when the worker waits for tasks. */
-	_starpu_pthread_mutex_t *sched_mutex; /* mutex protecting sched_cond */
+	_starpu_pthread_cond_t sched_cond; /* condition variable used when the worker waits for tasks. */
+	_starpu_pthread_mutex_t sched_mutex; /* mutex protecting sched_cond */
 	struct starpu_task_list local_tasks; /* this queue contains tasks that have been explicitely submitted to that queue */
 	struct starpu_task *current_task; /* task currently executed by this worker */
 	struct _starpu_worker_set *set; /* in case this worker belongs to a set */
@@ -82,6 +83,17 @@ struct _starpu_worker
 	char short_name[10];
 	unsigned run_by_starpu; /* Is this run by StarPU or directly by the application ? */
 
+	struct _starpu_sched_ctx **sched_ctx;
+	unsigned nsched_ctxs; /* the no of contexts a worker belongs to*/
+	struct _starpu_barrier_counter tasks_barrier; /* wait for the tasks submitted */
+	struct starpu_task *tasks[STARPU_NMAX_SCHED_CTXS];
+       
+	unsigned has_prev_init; /* had already been inited in another ctx */
+
+	/* indicated in each ctx the workers can execute tasks on,
+	 used for overlapping ctx in order to determine on which 
+	ctx the worker is allowed to pop */
+	unsigned active_ctx;
 #ifdef __GLIBC__
 	cpu_set_t initial_cpu_set;
 	cpu_set_t current_cpu_set;
@@ -162,6 +174,9 @@ struct _starpu_machine_config
 	/* this flag is set until the runtime is stopped */
 	unsigned running;
 
+	/* all the sched ctx of the current instance of starpu */
+	struct _starpu_sched_ctx sched_ctxs[STARPU_NMAX_SCHED_CTXS];
+
 	/* this flag is set until the application is finished submitting tasks */
 	unsigned submitting;
 };
@@ -209,6 +224,10 @@ struct _starpu_worker *_starpu_get_local_worker_key(void);
  * specified worker. */
 struct _starpu_worker *_starpu_get_worker_struct(unsigned id);
 
+/* Returns the starpu_sched_ctx structure that descriebes the state of the 
+ * specified ctx */
+struct _starpu_sched_ctx *_starpu_get_sched_ctx_struct(unsigned id);
+
 struct _starpu_combined_worker *_starpu_get_combined_worker_struct(unsigned id);
 
 int _starpu_is_initialized(void);
@@ -227,8 +246,12 @@ void _starpu_worker_set_status(int workerid, enum _starpu_worker_status status);
 /* TODO move */
 unsigned _starpu_execute_registered_progression_hooks(void);
 
-#if defined(_MSC_VER) || defined(STARPU_SIMGRID)
-void starpu_worker_set_sched_condition(int workerid, _starpu_pthread_cond_t *sched_cond, _starpu_pthread_mutex_t *sched_mutex);
-#endif
+/* We keep an initial sched ctx which might be used in case no other ctx is available */
+struct _starpu_sched_ctx* _starpu_get_initial_sched_ctx(void);
+
+int starpu_worker_get_nids_by_type(enum starpu_archtype type, int *workerids, int maxsize);
 
+/* returns workers not belonging to any context, be careful no mutex is used, 
+   the list might not be updated */
+int starpu_worker_get_nids_ctx_free_by_type(enum starpu_archtype type, int *workerids, int maxsize);
 #endif // __WORKERS_H__

+ 2 - 1
src/datawizard/filters.c

@@ -20,6 +20,7 @@
 #include <datawizard/filters.h>
 #include <datawizard/footprint.h>
 #include <datawizard/interfaces/data_interface.h>
+#include <core/task.h>
 
 static void starpu_data_create_children(starpu_data_handle_t handle, unsigned nchildren, struct starpu_data_filter *f);
 
@@ -302,7 +303,7 @@ void starpu_data_unpartition(starpu_data_handle_t root_handle, uint32_t gatherin
 			task->handles[0] = child_handle;
 			task->cl = &cl;
 			task->synchronous = 1;
-			if (starpu_task_submit(task) != 0)
+			if (_starpu_task_submit_internally(task) != 0)
 				_STARPU_ERROR("Could not submit the conversion task while unpartitionning\n");
 		}
 

+ 3 - 3
src/datawizard/malloc.c

@@ -106,7 +106,7 @@ int starpu_malloc(void **A, size_t dim)
 
 		_starpu_exclude_task_from_dag(task);
 
-		push_res = starpu_task_submit(task);
+		push_res = _starpu_task_submit_internally(task);
 		STARPU_ASSERT(push_res != -ENODEV);
 #endif
 #endif
@@ -132,7 +132,7 @@ int starpu_malloc(void **A, size_t dim)
 //
 //		_starpu_exclude_task_from_dag(task);
 //
-//		push_res = starpu_task_submit(task);
+//		push_res = _starpu_task_submit_internally(task);
 //		STARPU_ASSERT(push_res != -ENODEV);
 //#endif
 //        }
@@ -219,7 +219,7 @@ int starpu_free(void *A)
 
 		_starpu_exclude_task_from_dag(task);
 
-		push_res = starpu_task_submit(task);
+		push_res = _starpu_task_submit_internally(task);
 		STARPU_ASSERT(push_res != -ENODEV);
 	}
 #endif

+ 3 - 3
src/datawizard/reduction.c

@@ -254,7 +254,7 @@ void _starpu_data_end_reduction_mode(starpu_data_handle_t handle)
 		unsigned i;
 		for (i = 0; i < redux_task_idx; i++)
 		{
-			int ret = starpu_task_submit(redux_tasks[i]);
+			int ret = _starpu_task_submit_internally(redux_tasks[i]);
 			STARPU_ASSERT(ret == 0);
 		}
 #else
@@ -276,7 +276,7 @@ void _starpu_data_end_reduction_mode(starpu_data_handle_t handle)
 			redux_task->cl->modes[0] = STARPU_W;
 			redux_task->handles[0] = handle;
 
-			int ret = starpu_task_submit(redux_task);
+			int ret = _starpu_task_submit_internally(redux_task);
 			STARPU_ASSERT(!ret);
 		}
 
@@ -305,7 +305,7 @@ void _starpu_data_end_reduction_mode(starpu_data_handle_t handle)
 			redux_task->handles[0] = handle;
 			redux_task->handles[1] = replicate_array[replicate];
 
-			int ret = starpu_task_submit(redux_task);
+			int ret = _starpu_task_submit_internally(redux_task);
 			STARPU_ASSERT(!ret);
 		}
 #endif

+ 5 - 5
src/datawizard/user_interactions.c

@@ -157,12 +157,12 @@ int starpu_data_acquire_on_node_cb(starpu_data_handle_t handle, unsigned node,
 		_STARPU_PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
 
 		if (new_task) {
-			int ret = starpu_task_submit(new_task);
+			int ret = _starpu_task_submit_internally(new_task);
 			STARPU_ASSERT(!ret);
 		}
 
 		/* TODO detect if this is superflous */
-		int ret = starpu_task_submit(wrapper->pre_sync_task);
+		int ret = _starpu_task_submit_internally(wrapper->pre_sync_task);
 		STARPU_ASSERT(!ret);
 	}
 	else
@@ -229,7 +229,7 @@ int starpu_data_acquire_on_node(starpu_data_handle_t handle, unsigned node, enum
 		handle->mf_node = 0;
 		_starpu_spin_unlock(&handle->header_lock);
 		task->synchronous = 1;
-		ret = starpu_task_submit(task);
+		ret = _starpu_task_submit_internally(task);
 		STARPU_ASSERT(!ret);
 	}
 
@@ -265,13 +265,13 @@ int starpu_data_acquire_on_node(starpu_data_handle_t handle, unsigned node, enum
 		new_task = _starpu_detect_implicit_data_deps_with_handle(wrapper.pre_sync_task, wrapper.post_sync_task, handle, mode);
 		_STARPU_PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
 		if (new_task) {
-			int ret = starpu_task_submit(new_task);
+			int ret = _starpu_task_submit_internally(new_task);
 			STARPU_ASSERT(!ret);
 		}
 
 		/* TODO detect if this is superflous */
 		wrapper.pre_sync_task->synchronous = 1;
-		int ret = starpu_task_submit(wrapper.pre_sync_task);
+		int ret = _starpu_task_submit_internally(wrapper.pre_sync_task);
 		STARPU_ASSERT(!ret);
 	}
 	else

+ 45 - 5
src/debug/traces/starpu_fxt.c

@@ -333,21 +333,36 @@ static void create_paje_state_if_not_found(char *name, struct starpu_fxt_options
 
 	/* create the Paje state */
 	if (out_paje_file)
-	fprintf(out_paje_file, "6       %s       S       %s \"%f %f %f\" \n", name, name, red, green, blue);
+	{
+		fprintf(out_paje_file, "6       %s       S       %s \"%f %f %f\" \n", name, name, red, green, blue);
+		fprintf(out_paje_file, "6       %s       Ctx1       %s \"255.0 255.0 0.0\" \n", name, name);
+		fprintf(out_paje_file, "6       %s       Ctx2       %s \".0 255.0 .0\" \n", name, name);
+		fprintf(out_paje_file, "6       %s       Ctx3       %s \"75.0 .0 130.0\" \n", name, name);
+		fprintf(out_paje_file, "6       %s       Ctx4       %s \".0 245.0 255.0\" \n", name, name);
+		fprintf(out_paje_file, "6       %s       Ctx5       %s \".0 .0 .0\" \n", name, name);
+		fprintf(out_paje_file, "6       %s       Ctx6       %s \".0 .0 128.0\" \n", name, name);
+		fprintf(out_paje_file, "6       %s       Ctx7       %s \"105.0 105.0 105.0\" \n", name, name);
+		fprintf(out_paje_file, "6       %s       Ctx8       %s \"255.0 .0 255.0\" \n", name, name);
+		fprintf(out_paje_file, "6       %s       Ctx9       %s \".0 .0 1.0\" \n", name, name);
+		fprintf(out_paje_file, "6       %s       Ctx10       %s \"154.0 205.0 50.0\" \n", name, name);
+
+	}
+		
 }
 
 
 static void handle_start_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
 {
 	int worker;
-	worker = find_worker_id(ev->param[1]);
+	worker = find_worker_id(ev->param[2]);
 
+	unsigned sched_ctx = ev->param[1];
 	if (worker < 0) return;
 
 	char *prefix = options->file_prefix;
 
-	unsigned long has_name = ev->param[2];
-	char *name = has_name?(char *)&ev->param[3]:"unknown";
+	unsigned long has_name = ev->param[3];
+	char *name = has_name?(char *)&ev->param[4]:"unknown";
 
 	snprintf(last_codelet_symbol[worker], 128, "%s", name);
 
@@ -357,7 +372,32 @@ static void handle_start_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_op
 	create_paje_state_if_not_found(name, options);
 
 	if (out_paje_file)
-	fprintf(out_paje_file, "10       %f	S      %s%"PRIu64"      %s\n", start_codelet_time, prefix, ev->param[1], name);
+	{
+	  	  fprintf(out_paje_file, "10       %f	S      %s%"PRIu64"      %s\n", start_codelet_time, prefix, ev->param[2], name);
+
+		if(sched_ctx == 1)
+		  fprintf(out_paje_file, "10       %f	Ctx1      %s%"PRIu64"      %s\n", start_codelet_time, prefix, ev->param[2], name);
+		else if(sched_ctx == 2)
+		  fprintf(out_paje_file, "10       %f	Ctx2      %s%"PRIu64"      %s\n", start_codelet_time, prefix, ev->param[2], name);
+		else if(sched_ctx == 3)
+		  fprintf(out_paje_file, "10       %f	Ctx3      %s%"PRIu64"      %s\n", start_codelet_time, prefix, ev->param[2], name);
+		else if(sched_ctx == 4)
+		  fprintf(out_paje_file, "10       %f	Ctx4      %s%"PRIu64"      %s\n", start_codelet_time, prefix, ev->param[2], name);
+		else if(sched_ctx == 5)
+		  fprintf(out_paje_file, "10       %f	Ctx5      %s%"PRIu64"      %s\n", start_codelet_time, prefix, ev->param[2], name);
+		else if(sched_ctx == 6)
+		  fprintf(out_paje_file, "10       %f	Ctx6      %s%"PRIu64"      %s\n", start_codelet_time, prefix, ev->param[2], name);
+		else if(sched_ctx == 7)
+		  fprintf(out_paje_file, "10       %f	Ctx7      %s%"PRIu64"      %s\n", start_codelet_time, prefix, ev->param[2], name);
+		else if(sched_ctx == 8)
+		  fprintf(out_paje_file, "10       %f	Ctx8      %s%"PRIu64"      %s\n", start_codelet_time, prefix, ev->param[2], name);
+		else if(sched_ctx == 9)
+		  fprintf(out_paje_file, "10       %f	Ctx9      %s%"PRIu64"      %s\n", start_codelet_time, prefix, ev->param[2], name);
+		else if(sched_ctx == 10)
+		  fprintf(out_paje_file, "10       %f	Ctx10      %s%"PRIu64"      %s\n", start_codelet_time, prefix, ev->param[2], name);
+
+	}
+	
 }
 
 static long dumped_codelets_count;

+ 90 - 0
src/debug/traces/starpu_paje.c

@@ -132,6 +132,16 @@ void _starpu_fxt_write_paje_header(FILE *file)
 	1       Sc       P       \"Scheduler State\"                        \n \
 	2       event   T       \"event type\"				\n \
 	3       S       T       \"Thread State\"                        \n \
+	3       Ctx1      T     \"InCtx1\"         		\n \
+	3       Ctx2      T     \"InCtx2\"         		\n \
+	3       Ctx3      T     \"InCtx3\"         		\n \
+	3       Ctx4      T     \"InCtx4\"         		\n \
+	3       Ctx5      T     \"InCtx5\"         		\n \
+	3       Ctx6      T     \"InCtx6\"         		\n \
+	3       Ctx7      T     \"InCtx7\"         		\n \
+	3       Ctx8      T     \"InCtx8\"         		\n \
+	3       Ctx9      T     \"InCtx9\"         		\n \
+	3       Ctx10     T     \"InCtx10\"         		\n \
 	3       MS       Mn       \"Memory Node State\"                        \n \
 	4       ntask    Sc       \"Number of tasks\"                        \n \
 	4       bw      Mn       \"Bandwidth\"                        \n \
@@ -143,6 +153,86 @@ void _starpu_fxt_write_paje_header(FILE *file)
 	6       B       S       Blocked         \".9 .1 .0\"		\n \
 	6       Sl       S      Sleeping         \".9 .1 .0\"		\n \
 	6       P       S       Progressing         \".4 .1 .6\"		\n \
+	6       I       Ctx1      Initializing       \"0.0 .7 1.0\"            \n \
+	6       D       Ctx1      Deinitializing       \"0.0 .1 .7\"            \n \
+	6       Fi       Ctx1      FetchingInput       \"1.0 .1 1.0\"            \n \
+	6       Po       Ctx1      PushingOutput       \"0.1 1.0 1.0\"            \n \
+	6       C       Ctx1       Callback       \".0 .3 .8\"            \n \
+	6       B       Ctx1       Blocked         \".9 .1 .0\"		\n \
+	6       Sl       Ctx1      Sleeping         \".9 .1 .0\"		\n \
+	6       P       Ctx1       Progressing         \".4 .1 .6\"		\n \
+	6       I       Ctx2      Initializing       \"0.0 .7 1.0\"            \n \
+	6       D       Ctx2      Deinitializing       \"0.0 .1 .7\"            \n \
+	6       Fi       Ctx2      FetchingInput       \"1.0 .1 1.0\"            \n \
+	6       Po       Ctx2      PushingOutput       \"0.1 1.0 1.0\"            \n \
+	6       C       Ctx2       Callback       \".0 .3 .8\"            \n \
+	6       B       Ctx2       Blocked         \".9 .1 .0\"		\n \
+	6       Sl       Ctx2      Sleeping         \".9 .1 .0\"		\n \
+	6       P       Ctx2       Progressing         \".4 .1 .6\"		\n \
+	6       I       Ctx3      Initializing       \"0.0 .7 1.0\"            \n \
+	6       D       Ctx3      Deinitializing       \"0.0 .1 .7\"            \n \
+	6       Fi       Ctx3      FetchingInput       \"1.0 .1 1.0\"            \n \
+	6       Po       Ctx3      PushingOutput       \"0.1 1.0 1.0\"            \n \
+	6       C       Ctx3       Callback       \".0 .3 .8\"            \n \
+	6       B       Ctx3       Blocked         \".9 .1 .0\"		\n \
+	6       Sl       Ctx3      Sleeping         \".9 .1 .0\"		\n \
+	6       P       Ctx3       Progressing         \".4 .1 .6\"		\n \
+	6       I       Ctx4      Initializing       \"0.0 .7 1.0\"            \n \
+	6       D       Ctx4      Deinitializing       \"0.0 .1 .7\"            \n \
+	6       Fi       Ctx4      FetchingInput       \"1.0 .1 1.0\"            \n \
+	6       Po       Ctx4      PushingOutput       \"0.1 1.0 1.0\"            \n \
+	6       C       Ctx4       Callback       \".0 .3 .8\"            \n \
+	6       B       Ctx4       Blocked         \".9 .1 .0\"		\n \
+	6       Sl       Ctx4      Sleeping         \".9 .1 .0\"		\n \
+	6       P       Ctx4       Progressing         \".4 .1 .6\"		\n \
+	6       I       Ctx5      Initializing       \"0.0 .7 1.0\"            \n \
+	6       D       Ctx5      Deinitializing       \"0.0 .1 .7\"            \n \
+	6       Fi       Ctx5      FetchingInput       \"1.0 .1 1.0\"            \n \
+	6       Po       Ctx5      PushingOutput       \"0.1 1.0 1.0\"            \n \
+	6       C       Ctx5       Callback       \".0 .3 .8\"            \n \
+	6       B       Ctx5       Blocked         \".9 .1 .0\"		\n \
+	6       Sl       Ctx5      Sleeping         \".9 .1 .0\"		\n \
+	6       P       Ctx5       Progressing         \".4 .1 .6\"		\n \
+	6       I       Ctx6      Initializing       \"0.0 .7 1.0\"            \n \
+	6       D       Ctx6      Deinitializing       \"0.0 .1 .7\"            \n \
+	6       Fi       Ctx6      FetchingInput       \"1.0 .1 1.0\"            \n \
+	6       Po       Ctx6      PushingOutput       \"0.1 1.0 1.0\"            \n \
+	6       C       Ctx6       Callback       \".0 .3 .8\"            \n \
+	6       B       Ctx6       Blocked         \".9 .1 .0\"		\n \
+	6       Sl       Ctx6      Sleeping         \".9 .1 .0\"		\n \
+	6       P       Ctx6       Progressing         \".4 .1 .6\"		\n \
+	6       I       Ctx7      Initializing       \"0.0 .7 1.0\"            \n \
+	6       D       Ctx7      Deinitializing       \"0.0 .1 .7\"            \n \
+	6       Fi       Ctx7      FetchingInput       \"1.0 .1 1.0\"            \n \
+	6       Po       Ctx7      PushingOutput       \"0.1 1.0 1.0\"            \n \
+	6       C       Ctx7       Callback       \".0 .3 .8\"            \n \
+	6       B       Ctx7       Blocked         \".9 .1 .0\"		\n \
+	6       Sl       Ctx7      Sleeping         \".9 .1 .0\"		\n \
+	6       P       Ctx7       Progressing         \".4 .1 .6\"		\n \
+	6       I       Ctx8      Initializing       \"0.0 .7 1.0\"            \n \
+	6       D       Ctx8      Deinitializing       \"0.0 .1 .7\"            \n \
+	6       Fi       Ctx8      FetchingInput       \"1.0 .1 1.0\"            \n \
+	6       Po       Ctx8      PushingOutput       \"0.1 1.0 1.0\"            \n \
+	6       C       Ctx8       Callback       \".0 .3 .8\"            \n \
+	6       B       Ctx8       Blocked         \".9 .1 .0\"		\n \
+	6       Sl       Ctx8      Sleeping         \".9 .1 .0\"		\n \
+	6       P       Ctx8       Progressing         \".4 .1 .6\"		\n \
+	6       I       Ctx9      Initializing       \"0.0 .7 1.0\"            \n \
+	6       D       Ctx9      Deinitializing       \"0.0 .1 .7\"            \n \
+	6       Fi       Ctx9      FetchingInput       \"1.0 .1 1.0\"            \n \
+	6       Po       Ctx9      PushingOutput       \"0.1 1.0 1.0\"            \n \
+	6       C       Ctx9       Callback       \".0 .3 .8\"            \n \
+	6       B       Ctx9       Blocked         \".9 .1 .0\"		\n \
+	6       Sl       Ctx9      Sleeping         \".9 .1 .0\"		\n \
+	6       P       Ctx9       Progressing         \".4 .1 .6\"		\n \
+	6       I       Ctx10      Initializing       \"0.0 .7 1.0\"            \n \
+	6       D       Ctx10      Deinitializing       \"0.0 .1 .7\"            \n \
+	6       Fi       Ctx10      FetchingInput       \"1.0 .1 1.0\"            \n \
+	6       Po       Ctx10      PushingOutput       \"0.1 1.0 1.0\"            \n \
+	6       C       Ctx10       Callback       \".0 .3 .8\"            \n \
+	6       B       Ctx10       Blocked         \".9 .1 .0\"		\n \
+	6       Sl       Ctx10      Sleeping         \".9 .1 .0\"		\n \
+	6       P       Ctx10       Progressing         \".4 .1 .6\"		\n \
 	6       A       MS      Allocating         \".4 .1 .0\"		\n \
 	6       Ar       MS      AllocatingReuse       \".1 .1 .8\"		\n \
 	6       R       MS      Reclaiming         \".0 .1 .4\"		\n \

+ 11 - 10
src/drivers/driver_common/driver_common.c

@@ -156,16 +156,19 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int wor
 {
 	struct starpu_task *task;
 
-	/* Note: we need to keep the sched condition mutex all along the path
-	 * from popping a task from the scheduler to blocking. Otherwise the
-	 * driver may go block just after the scheduler got a new task to be
-	 * executed, and thus hanging. */
-	_STARPU_PTHREAD_MUTEX_LOCK(args->sched_mutex);
-
 	task = _starpu_pop_task(args);
 
 	if (task == NULL)
 	{
+		/*TODO: check this out after the merge */
+
+
+		/* Note: we need to keep the sched condition mutex all along the path
+		 * from popping a task from the scheduler to blocking. Otherwise the
+		 * driver may go block just after the scheduler got a new task to be
+		 * executed, and thus hanging. */
+		_STARPU_PTHREAD_MUTEX_LOCK(&args->sched_mutex);
+
 		if (_starpu_worker_get_status(workerid) != STATUS_SLEEPING)
 		{
 			_STARPU_TRACE_WORKER_SLEEP_START
@@ -174,15 +177,13 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int wor
 		}
 
 		if (_starpu_worker_can_block(memnode))
-			_STARPU_PTHREAD_COND_WAIT(args->sched_cond, args->sched_mutex);
+			_STARPU_PTHREAD_COND_WAIT(&args->sched_cond, &args->sched_mutex);
 
-		_STARPU_PTHREAD_MUTEX_UNLOCK(args->sched_mutex);
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&args->sched_mutex);
 
 		return NULL;
 	}
 
-	_STARPU_PTHREAD_MUTEX_UNLOCK(args->sched_mutex);
-
 	if (_starpu_worker_get_status(workerid) == STATUS_SLEEPING)
 	{
 		_STARPU_TRACE_WORKER_SLEEP_END

+ 310 - 185
src/sched_policies/deque_modeling_policy_data_aware.c

@@ -3,7 +3,7 @@
  * Copyright (C) 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
- * Copyright (C) 2012  inria
+ * Copyright (C) 2011-2012  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -41,11 +41,17 @@
 #define DBL_MAX __DBL_MAX__
 #endif
 
-static unsigned nworkers;
-static struct _starpu_fifo_taskq *queue_array[STARPU_NMAXWORKERS];
+typedef struct {
+	double alpha;
+	double beta;
+	double _gamma;
+	double idle_power;
 
-static _starpu_pthread_cond_t sched_cond[STARPU_NMAXWORKERS];
-static _starpu_pthread_mutex_t sched_mutex[STARPU_NMAXWORKERS];
+	struct _starpu_fifo_taskq **queue_array;
+
+	long int total_task_cnt;
+	long int ready_task_cnt;
+} dmda_data;
 
 static double alpha = _STARPU_DEFAULT_ALPHA;
 static double beta = _STARPU_DEFAULT_BETA;
@@ -63,11 +69,6 @@ static const float idle_power_minimum=0;
 static const float idle_power_maximum=10000.0;
 #endif /* !STARPU_USE_TOP */
 
-#ifdef STARPU_VERBOSE
-static long int total_task_cnt = 0;
-static long int ready_task_cnt = 0;
-#endif
-
 static int count_non_ready_buffers(struct starpu_task *task, uint32_t node)
 {
 	int cnt = 0;
@@ -150,12 +151,14 @@ static struct starpu_task *_starpu_fifo_pop_first_ready_task(struct _starpu_fifo
 	return task;
 }
 
-static struct starpu_task *dmda_pop_ready_task(void)
+static struct starpu_task *dmda_pop_ready_task(unsigned sched_ctx_id)
 {
+	dmda_data *dt = (dmda_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
+
 	struct starpu_task *task;
 
 	int workerid = starpu_worker_get_id();
-	struct _starpu_fifo_taskq *fifo = queue_array[workerid];
+	struct _starpu_fifo_taskq *fifo = dt->queue_array[workerid];
 
 	unsigned node = starpu_worker_get_memory_node(workerid);
 
@@ -173,22 +176,24 @@ static struct starpu_task *dmda_pop_ready_task(void)
 		{
 			int non_ready = count_non_ready_buffers(task, node);
 			if (non_ready == 0)
-				ready_task_cnt++;
+				dt->ready_task_cnt++;
 		}
 
-		total_task_cnt++;
+		dt->total_task_cnt++;
 #endif
 	}
 
 	return task;
 }
 
-static struct starpu_task *dmda_pop_task(void)
+static struct starpu_task *dmda_pop_task(unsigned sched_ctx_id)
 {
+	dmda_data *dt = (dmda_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
+
 	struct starpu_task *task;
 
 	int workerid = starpu_worker_get_id();
-	struct _starpu_fifo_taskq *fifo = queue_array[workerid];
+	struct _starpu_fifo_taskq *fifo = dt->queue_array[workerid];
 
 	task = _starpu_fifo_pop_local_task(fifo);
 	if (task)
@@ -204,27 +209,29 @@ static struct starpu_task *dmda_pop_task(void)
 		{
 			int non_ready = count_non_ready_buffers(task, starpu_worker_get_memory_node(workerid));
 			if (non_ready == 0)
-				ready_task_cnt++;
+				dt->ready_task_cnt++;
 		}
 
-		total_task_cnt++;
+		dt->total_task_cnt++;
 #endif
 	}
 
 	return task;
 }
 
-
-
-static struct starpu_task *dmda_pop_every_task(void)
+static struct starpu_task *dmda_pop_every_task(unsigned sched_ctx_id)
 {
+	dmda_data *dt = (dmda_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
+
 	struct starpu_task *new_list;
 
 	int workerid = starpu_worker_get_id();
+	struct _starpu_fifo_taskq *fifo = dt->queue_array[workerid];
 
-	struct _starpu_fifo_taskq *fifo = queue_array[workerid];
-
-	new_list = _starpu_fifo_pop_every_task(fifo, &sched_mutex[workerid], workerid);
+	pthread_mutex_t *sched_mutex;
+	pthread_cond_t *sched_cond;
+	starpu_worker_get_sched_condition(sched_ctx_id, workerid, &sched_mutex, &sched_cond);
+	new_list = _starpu_fifo_pop_every_task(fifo, sched_mutex, workerid);
 
 	while (new_list)
 	{
@@ -240,21 +247,27 @@ static struct starpu_task *dmda_pop_every_task(void)
 	return new_list;
 }
 
-
-
-
 static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 				    double predicted, double predicted_transfer,
-				    int prio)
+				    int prio, unsigned sched_ctx_id)
 {
+	dmda_data *dt = (dmda_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
 	/* make sure someone could execute that task ! */
 	STARPU_ASSERT(best_workerid != -1);
 
-	struct _starpu_fifo_taskq *fifo = queue_array[best_workerid];
+	struct _starpu_fifo_taskq *fifo = dt->queue_array[best_workerid];
 
-	_STARPU_PTHREAD_MUTEX_LOCK(&sched_mutex[best_workerid]);
+	pthread_mutex_t *sched_mutex;
+	pthread_cond_t *sched_cond;
+	starpu_worker_get_sched_condition(sched_ctx_id, best_workerid, &sched_mutex, &sched_cond);
 
-	/* Sometimes workers didn't take the tasks as early as we expected */
+#ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
+	starpu_call_pushed_task_cb(best_workerid, sched_ctx_id);
+#endif //STARPU_USE_SCHED_CTX_HYPERVISOR
+
+	_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
+
+/* Sometimes workers didn't take the tasks as early as we expected */
 	fifo->exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
 	fifo->exp_end = fifo->exp_start + fifo->exp_len;
 
@@ -277,9 +290,9 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 
 	fifo->exp_end += predicted_transfer;
 	fifo->exp_len += predicted_transfer;
-
-	_STARPU_PTHREAD_MUTEX_UNLOCK(&sched_mutex[best_workerid]);
-
+	
+	_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
+	
 	task->predicted = predicted;
 	task->predicted_transfer = predicted_transfer;
 
@@ -303,20 +316,20 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 	}
 #endif
 	if (prio)
-		return _starpu_fifo_push_sorted_task(queue_array[best_workerid],
-			&sched_mutex[best_workerid], &sched_cond[best_workerid], task);
+		return _starpu_fifo_push_sorted_task(dt->queue_array[best_workerid],
+			sched_mutex, sched_cond, task);
 	else
-		return _starpu_fifo_push_task(queue_array[best_workerid],
-			&sched_mutex[best_workerid], &sched_cond[best_workerid], task);
+		return _starpu_fifo_push_task(dt->queue_array[best_workerid],
+			sched_mutex, sched_cond, task);
 }
 
 /* TODO: factorize with dmda!! */
-static int _dm_push_task(struct starpu_task *task, unsigned prio)
+static int _dm_push_task(struct starpu_task *task, unsigned prio, unsigned sched_ctx_id)
 {
-	/* find the queue */
-	unsigned worker;
+	dmda_data *dt = (dmda_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
+	unsigned worker, worker_ctx = 0;
 	int best = -1;
-
+	
 	double best_exp_end = 0.0;
 	double model_best = 0.0;
 	double transfer_model_best = 0.0;
@@ -324,41 +337,51 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio)
 	int ntasks_best = -1;
 	double ntasks_best_end = 0.0;
 	int calibrating = 0;
-
+	
 	/* A priori, we know all estimations */
 	int unknown = 0;
-
+	
 	unsigned best_impl = 0;
 	unsigned nimpl;
-
-	for (worker = 0; worker < nworkers; worker++)
+	struct worker_collection *workers = starpu_get_worker_collection_of_sched_ctx(sched_ctx_id);
+	
+	if(workers->init_cursor)
+		workers->init_cursor(workers);
+	
+	while(workers->has_next(workers))
 	{
-		struct _starpu_fifo_taskq *fifo  = queue_array[worker];
+		worker = workers->get_next(workers);
+		struct _starpu_fifo_taskq *fifo  = dt->queue_array[worker];
 		unsigned memory_node = starpu_worker_get_memory_node(worker);
 		enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
-
+		
 		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
 		{
 			if (!starpu_worker_can_execute_task(worker, task, nimpl))
 			{
 				/* no one on that queue may execute this task */
+				//			worker_ctx++;
 				continue;
 			}
-
+			
 			double exp_end;
-
+			pthread_mutex_t *sched_mutex;
+			pthread_cond_t *sched_cond;
+			starpu_worker_get_sched_condition(sched_ctx_id, worker, &sched_mutex, &sched_cond);
+			
 			/* Sometimes workers didn't take the tasks as early as we expected */
-			_STARPU_PTHREAD_MUTEX_LOCK(&sched_mutex[worker]);
+			_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
 			fifo->exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
 			fifo->exp_end = fifo->exp_start + fifo->exp_len;
-			_STARPU_PTHREAD_MUTEX_UNLOCK(&sched_mutex[worker]);
-
+			_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
+			
+			
 			double local_length = starpu_task_expected_length(task, perf_arch, nimpl);
 			double local_penalty = starpu_task_expected_data_transfer_time(memory_node, task);
 			double ntasks_end = fifo->ntasks / starpu_worker_get_relative_speedup(perf_arch);
-
+			
 			//_STARPU_DEBUG("Scheduler dm: task length (%lf) worker (%u) kernel (%u) \n", local_length,worker,nimpl);
-
+			
 			if (ntasks_best == -1
 			    || (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
 			    || (!calibrating && isnan(local_length)) /* Not calibrating but this worker is being calibrated */
@@ -369,23 +392,23 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio)
 				ntasks_best = worker;
 				best_impl = nimpl;
 			}
-
+			
 			if (isnan(local_length))
 				/* we are calibrating, we want to speed-up calibration time
 				 * so we privilege non-calibrated tasks (but still
 				 * greedily distribute them to avoid dumb schedules) */
 				calibrating = 1;
-
+			
 			if (isnan(local_length) || _STARPU_IS_ZERO(local_length))
 				/* there is no prediction available for that task
 				 * with that arch yet, so switch to a greedy strategy */
 				unknown = 1;
-
+			
 			if (unknown)
 				continue;
 
 			exp_end = fifo->exp_start + fifo->exp_len + local_length;
-
+			
 			if (best == -1 || exp_end < best_exp_end)
 			{
 				/* a better solution was found */
@@ -396,6 +419,7 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio)
 				best_impl = nimpl;
 			}
 		}
+		worker_ctx++;
 	}
 
 	if (unknown)
@@ -404,24 +428,27 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio)
 		model_best = 0.0;
 		transfer_model_best = 0.0;
 	}
-
+	
 	//_STARPU_DEBUG("Scheduler dm: kernel (%u)\n", best_impl);
-
-	 _starpu_get_job_associated_to_task(task)->nimpl = best_impl;
-
+	
+	if(workers->init_cursor)
+		workers->deinit_cursor(workers);
+	
+	_starpu_get_job_associated_to_task(task)->nimpl = best_impl;
+	
 	/* we should now have the best worker in variable "best" */
 	return push_task_on_best_worker(task, best,
-					model_best, transfer_model_best, prio);
+									model_best, transfer_model_best, prio, sched_ctx_id);
 }
 
 static void compute_all_performance_predictions(struct starpu_task *task,
-					double local_task_length[STARPU_NMAXWORKERS][STARPU_MAXIMPLEMENTATIONS],
-					double exp_end[STARPU_NMAXWORKERS][STARPU_MAXIMPLEMENTATIONS],
-					double *max_exp_endp,
-					double *best_exp_endp,
-					double local_data_penalty[STARPU_NMAXWORKERS][STARPU_MAXIMPLEMENTATIONS],
-					double local_power[STARPU_NMAXWORKERS][STARPU_MAXIMPLEMENTATIONS],
-					int *forced_worker, int *forced_impl)
+												double local_task_length[STARPU_NMAXWORKERS][STARPU_MAXIMPLEMENTATIONS],
+												double exp_end[STARPU_NMAXWORKERS][STARPU_MAXIMPLEMENTATIONS],
+												double *max_exp_endp,
+												double *best_exp_endp,
+												double local_data_penalty[STARPU_NMAXWORKERS][STARPU_MAXIMPLEMENTATIONS],
+												double local_power[STARPU_NMAXWORKERS][STARPU_MAXIMPLEMENTATIONS],
+												int *forced_worker, int *forced_impl, unsigned sched_ctx_id)
 {
 	int calibrating = 0;
 	double max_exp_end = DBL_MIN;
@@ -432,20 +459,23 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 
 	/* A priori, we know all estimations */
 	int unknown = 0;
-	unsigned worker;
+	unsigned worker, worker_ctx = 0;
 
 	unsigned nimpl;
 
 	starpu_task_bundle_t bundle = task->bundle;
-
-	for (worker = 0; worker < nworkers; worker++)
+	dmda_data *dt = (dmda_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
+	struct worker_collection *workers = starpu_get_worker_collection_of_sched_ctx(sched_ctx_id);
+		
+	while(workers->has_next(workers))
 	{
-		struct _starpu_fifo_taskq *fifo = queue_array[worker];
+		worker = workers->get_next(workers);
+		struct _starpu_fifo_taskq *fifo = dt->queue_array[worker];
 		enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
 		unsigned memory_node = starpu_worker_get_memory_node(worker);
 
-		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
-		{
+		for(nimpl  = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
+	 	{
 			if (!starpu_worker_can_execute_task(worker, task, nimpl))
 			{
 				/* no one on that queue may execute this task */
@@ -453,38 +483,41 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 			}
 
 			/* Sometimes workers didn't take the tasks as early as we expected */
-			_STARPU_PTHREAD_MUTEX_LOCK(&sched_mutex[worker]);
+			pthread_mutex_t *sched_mutex;
+			pthread_cond_t *sched_cond;
+			starpu_worker_get_sched_condition(sched_ctx_id, worker, &sched_mutex, &sched_cond);
+			_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
 			fifo->exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
-			exp_end[worker][nimpl] = fifo->exp_start + fifo->exp_len;
-			_STARPU_PTHREAD_MUTEX_UNLOCK(&sched_mutex[worker]);
-			if (exp_end[worker][nimpl] > max_exp_end)
-				max_exp_end = exp_end[worker][nimpl];
+			_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
+			exp_end[worker_ctx][nimpl] = fifo->exp_start + fifo->exp_len;
+			if (exp_end[worker_ctx][nimpl] > max_exp_end)
+				max_exp_end = exp_end[worker_ctx][nimpl];
 
 			//_STARPU_DEBUG("Scheduler dmda: task length (%lf) worker (%u) kernel (%u) \n", local_task_length[worker][nimpl],worker,nimpl);
 
 			if (bundle)
 			{
 				/* TODO : conversion time */
-				local_task_length[worker][nimpl] = starpu_task_bundle_expected_length(bundle, perf_arch, nimpl);
-				local_data_penalty[worker][nimpl] = starpu_task_bundle_expected_data_transfer_time(bundle, memory_node);
-				local_power[worker][nimpl] = starpu_task_bundle_expected_power(bundle, perf_arch,nimpl);
+				local_task_length[worker_ctx][nimpl] = starpu_task_bundle_expected_length(bundle, perf_arch, nimpl);
+				local_data_penalty[worker_ctx][nimpl] = starpu_task_bundle_expected_data_transfer_time(bundle, memory_node);
+				local_power[worker_ctx][nimpl] = starpu_task_bundle_expected_power(bundle, perf_arch,nimpl);
 			}
 			else
 			{
-				local_task_length[worker][nimpl] = starpu_task_expected_length(task, perf_arch, nimpl);
-				local_data_penalty[worker][nimpl] = starpu_task_expected_data_transfer_time(memory_node, task);
-				local_power[worker][nimpl] = starpu_task_expected_power(task, perf_arch,nimpl);
+				local_task_length[worker_ctx][nimpl] = starpu_task_expected_length(task, perf_arch, nimpl);
+				local_data_penalty[worker_ctx][nimpl] = starpu_task_expected_data_transfer_time(memory_node, task);
+				local_power[worker_ctx][nimpl] = starpu_task_expected_power(task, perf_arch,nimpl);
 				double conversion_time = starpu_task_expected_conversion_time(task, perf_arch, nimpl);
 				if (conversion_time > 0.0)
-					local_task_length[worker][nimpl] += conversion_time;
+					local_task_length[worker_ctx][nimpl] += conversion_time;
 			}
 
 			double ntasks_end = fifo->ntasks / starpu_worker_get_relative_speedup(perf_arch);
 
 			if (ntasks_best == -1
 			    || (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better worker */
-			    || (!calibrating && isnan(local_task_length[worker][nimpl])) /* Not calibrating but this worker is being calibrated */
-			    || (calibrating && isnan(local_task_length[worker][nimpl]) && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
+			    || (!calibrating && isnan(local_task_length[worker_ctx][nimpl])) /* Not calibrating but this worker is being calibrated */
+			    || (calibrating && isnan(local_task_length[worker_ctx][nimpl]) && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
 				)
 			{
 				ntasks_best_end = ntasks_end;
@@ -492,34 +525,35 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 				nimpl_best = nimpl;
 			}
 
-			if (isnan(local_task_length[worker][nimpl]))
+			if (isnan(local_task_length[worker_ctx][nimpl]))
 				/* we are calibrating, we want to speed-up calibration time
 				 * so we privilege non-calibrated tasks (but still
 				 * greedily distribute them to avoid dumb schedules) */
 				calibrating = 1;
-
-			if (isnan(local_task_length[worker][nimpl])
-				|| _STARPU_IS_ZERO(local_task_length[worker][nimpl]))
+			
+			if (isnan(local_task_length[worker_ctx][nimpl])
+					|| _STARPU_IS_ZERO(local_task_length[worker_ctx][nimpl]))
 				/* there is no prediction available for that task
 				 * with that arch (yet or at all), so switch to a greedy strategy */
 				unknown = 1;
 
 			if (unknown)
 				continue;
-
-			exp_end[worker][nimpl] = fifo->exp_start + fifo->exp_len + local_task_length[worker][nimpl];
-
-			if (exp_end[worker][nimpl] < best_exp_end)
+			
+			exp_end[worker_ctx][nimpl] = fifo->exp_start + fifo->exp_len + local_task_length[worker_ctx][nimpl];
+			
+			if (exp_end[worker_ctx][nimpl] < best_exp_end)
 			{
 				/* a better solution was found */
-				best_exp_end = exp_end[worker][nimpl];
+				best_exp_end = exp_end[worker_ctx][nimpl];
 				nimpl_best = nimpl;
 			}
-
-			if (isnan(local_power[worker][nimpl]))
-				local_power[worker][nimpl] = 0.;
-
+			
+			if (isnan(local_power[worker_ctx][nimpl]))
+				local_power[worker_ctx][nimpl] = 0.;
+			
 		}
+		worker_ctx++;
 	}
 
 	*forced_worker = unknown?ntasks_best:-1;
@@ -529,11 +563,11 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 	*max_exp_endp = max_exp_end;
 }
 
-static int _dmda_push_task(struct starpu_task *task, unsigned prio)
+static int _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned sched_ctx_id)
 {
 	/* find the queue */
-	unsigned worker;
-	int best = -1;
+	unsigned worker, worker_ctx = 0;
+	int best = -1, best_in_ctx = -1;
 	int selected_impl = 0;
 	double model_best = 0.0;
 	double transfer_model_best = 0.0;
@@ -543,32 +577,39 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio)
 	int forced_best = -1;
 	int forced_impl = -1;
 
-	double local_task_length[nworkers][STARPU_MAXIMPLEMENTATIONS];
-	double local_data_penalty[nworkers][STARPU_MAXIMPLEMENTATIONS];
-	double local_power[nworkers][STARPU_MAXIMPLEMENTATIONS];
-	double exp_end[nworkers][STARPU_MAXIMPLEMENTATIONS];
+	dmda_data *dt = (dmda_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
+	struct worker_collection *workers = starpu_get_worker_collection_of_sched_ctx(sched_ctx_id);
+	unsigned nworkers_ctx = workers->nworkers;
+	double local_task_length[STARPU_NMAXWORKERS][STARPU_MAXIMPLEMENTATIONS];
+	double local_data_penalty[STARPU_NMAXWORKERS][STARPU_MAXIMPLEMENTATIONS];
+	double local_power[STARPU_NMAXWORKERS][STARPU_MAXIMPLEMENTATIONS];
+	double exp_end[STARPU_NMAXWORKERS][STARPU_MAXIMPLEMENTATIONS];
 	double max_exp_end = 0.0;
 	double best_exp_end;
 
-	double fitness[nworkers][STARPU_MAXIMPLEMENTATIONS];
+	double fitness[nworkers_ctx][STARPU_MAXIMPLEMENTATIONS];
+
+	if(workers->init_cursor)
+		workers->init_cursor(workers);
 
 	compute_all_performance_predictions(task,
-		local_task_length,
-		exp_end,
-		&max_exp_end,
-		&best_exp_end,
-		local_data_penalty,
-		local_power,
-		&forced_best,
-		&forced_impl);
+										local_task_length,
+										exp_end,
+										&max_exp_end,
+										&best_exp_end,
+										local_data_penalty,
+										local_power,
+										&forced_best,
+										&forced_impl, sched_ctx_id);
 
 	double best_fitness = -1;
 
 	unsigned nimpl;
 	if (forced_best == -1)
 	{
-		for (worker = 0; worker < nworkers; worker++)
+		while(workers->has_next(workers))
 		{
+			worker = workers->get_next(workers);
 			for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
 			{
 				if (!starpu_worker_can_execute_task(worker, task, nimpl))
@@ -576,29 +617,32 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio)
 					/* no one on that queue may execute this task */
 					continue;
 				}
-
-				fitness[worker][nimpl] = alpha*(exp_end[worker][nimpl] - best_exp_end)
-					+ beta*(local_data_penalty[worker][nimpl])
-					+ _gamma*(local_power[worker][nimpl]);
-
-				if (exp_end[worker][nimpl] > max_exp_end)
+				
+				
+				fitness[worker_ctx][nimpl] = dt->alpha*(exp_end[worker_ctx][nimpl] - best_exp_end) 
+					+ dt->beta*(local_data_penalty[worker_ctx][nimpl])
+					+ dt->_gamma*(local_power[worker_ctx][nimpl]);
+				
+				if (exp_end[worker_ctx][nimpl] > max_exp_end)
 				{
 					/* This placement will make the computation
 					 * longer, take into account the idle
 					 * consumption of other cpus */
-					fitness[worker][nimpl] += _gamma * idle_power * (exp_end[worker][nimpl] - max_exp_end) / 1000000.0;
+					fitness[worker_ctx][nimpl] += dt->_gamma * dt->idle_power * (exp_end[worker_ctx][nimpl] - max_exp_end) / 1000000.0;
 				}
-
-				if (best == -1 || fitness[worker][nimpl] < best_fitness)
+				
+				if (best == -1 || fitness[worker_ctx][nimpl] < best_fitness)
 				{
 					/* we found a better solution */
-					best_fitness = fitness[worker][nimpl];
+					best_fitness = fitness[worker_ctx][nimpl];
 					best = worker;
+					best_in_ctx = worker_ctx;
 					selected_impl = nimpl;
 
 					//_STARPU_DEBUG("best fitness (worker %d) %e = alpha*(%e) + beta(%e) +gamma(%e)\n", worker, best_fitness, exp_end[worker][nimpl] - best_exp_end, local_data_penalty[worker][nimpl], local_power[worker][nimpl]);
 				}
 			}
+			worker_ctx++;
 		}
 	}
 
@@ -615,66 +659,148 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio)
 	}
 	else if (task->bundle)
 	{
-		enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(best);
+		enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(best_in_ctx);
 		unsigned memory_node = starpu_worker_get_memory_node(best);
 		model_best = starpu_task_expected_length(task, perf_arch, selected_impl);
 		transfer_model_best = starpu_task_expected_data_transfer_time(memory_node, task);
 	}
 	else
 	{
-		model_best = local_task_length[best][selected_impl];
-		transfer_model_best = local_data_penalty[best][selected_impl];
+		model_best = local_task_length[best_in_ctx][selected_impl];
+		transfer_model_best = local_data_penalty[best_in_ctx][selected_impl];
 	}
-
+	
 	if (task->bundle)
 		starpu_task_bundle_remove(task->bundle, task);
-
+        if(workers->init_cursor)
+                workers->deinit_cursor(workers);
 
 	//_STARPU_DEBUG("Scheduler dmda: kernel (%u)\n", best_impl);
 	 _starpu_get_job_associated_to_task(task)->nimpl = selected_impl;
 
 	/* we should now have the best worker in variable "best" */
-	return push_task_on_best_worker(task, best,
-					model_best, transfer_model_best, prio);
+	return push_task_on_best_worker(task, best, model_best, transfer_model_best, prio, sched_ctx_id);
 }
 
 static int dmda_push_sorted_task(struct starpu_task *task)
 {
-	return _dmda_push_task(task, 1);
+	unsigned sched_ctx_id = task->sched_ctx;
+        pthread_mutex_t *changing_ctx_mutex = starpu_get_changing_ctx_mutex(sched_ctx_id);
+        unsigned nworkers;
+        int ret_val = -1;
+
+	_STARPU_PTHREAD_MUTEX_LOCK(changing_ctx_mutex);
+	nworkers = starpu_get_nworkers_of_sched_ctx(sched_ctx_id);
+	if(nworkers == 0)
+	{
+		_STARPU_PTHREAD_MUTEX_UNLOCK(changing_ctx_mutex);
+		return ret_val;
+	}
+
+	ret_val = _dmda_push_task(task, 1, sched_ctx_id);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(changing_ctx_mutex);
+	return ret_val;
+
 }
 
 static int dm_push_task(struct starpu_task *task)
 {
-	return _dm_push_task(task, 0);
+	unsigned sched_ctx_id = task->sched_ctx;
+        pthread_mutex_t *changing_ctx_mutex = starpu_get_changing_ctx_mutex(sched_ctx_id);
+        unsigned nworkers;
+        int ret_val = -1;
+
+	_STARPU_PTHREAD_MUTEX_LOCK(changing_ctx_mutex);
+	nworkers = starpu_get_nworkers_of_sched_ctx(sched_ctx_id);
+	if(nworkers == 0)
+	{
+		_STARPU_PTHREAD_MUTEX_UNLOCK(changing_ctx_mutex);
+		return ret_val;
+	}
+
+	ret_val = _dm_push_task(task, 0, sched_ctx_id);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(changing_ctx_mutex);
+	return ret_val;
 }
 
 static int dmda_push_task(struct starpu_task *task)
 {
-	return _dmda_push_task(task, 0);
+	unsigned sched_ctx_id = task->sched_ctx;
+        pthread_mutex_t *changing_ctx_mutex = starpu_get_changing_ctx_mutex(sched_ctx_id);
+        unsigned nworkers;
+        int ret_val = -1;
+
+	_STARPU_PTHREAD_MUTEX_LOCK(changing_ctx_mutex);
+	nworkers = starpu_get_nworkers_of_sched_ctx(sched_ctx_id);
+	if(nworkers == 0)
+	{
+		_STARPU_PTHREAD_MUTEX_UNLOCK(changing_ctx_mutex);
+		return ret_val;
+	}
+
+	STARPU_ASSERT(task);
+	ret_val = _dmda_push_task(task, 0, sched_ctx_id);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(changing_ctx_mutex);
+	return ret_val;
 }
 
-static void initialize_dmda_policy(struct starpu_machine_topology *topology,
-				   struct starpu_sched_policy *policy)
+static void dmda_add_workers(unsigned sched_ctx_id, int *workerids, unsigned nworkers) 
 {
-	(void) policy;
+	dmda_data *dt = (dmda_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
 
-	nworkers = topology->nworkers;
+	int workerid;
+	unsigned i;
+	for (i = 0; i < nworkers; i++)
+	{
+		workerid = workerids[i];
+		dt->queue_array[workerid] = _starpu_create_fifo();
+		starpu_worker_init_sched_condition(sched_ctx_id, workerid);
+	}
+}
+
+static void dmda_remove_workers(unsigned sched_ctx_id, int *workerids, unsigned nworkers)
+{
+	dmda_data *dt = (dmda_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
+
+	int workerid;
+	unsigned i;
+	for (i = 0; i < nworkers; i++)
+	{
+		workerid = workerids[i];
+		_starpu_destroy_fifo(dt->queue_array[workerid]);
+		starpu_worker_deinit_sched_condition(sched_ctx_id, workerid);
+	}
+}
+
+static void initialize_dmda_policy(unsigned sched_ctx_id) 
+{
+	starpu_create_worker_collection_for_sched_ctx(sched_ctx_id, WORKER_LIST);
+
+	dmda_data *dt = (dmda_data*)malloc(sizeof(dmda_data));
+	dt->alpha = _STARPU_DEFAULT_ALPHA;
+	dt->beta = _STARPU_DEFAULT_BETA;
+	dt->_gamma = _STARPU_DEFAULT_GAMMA;
+	dt->idle_power = 0.0;
+
+	starpu_set_sched_ctx_policy_data(sched_ctx_id, (void*)dt);
+
+	dt->queue_array = (struct _starpu_fifo_taskq**)malloc(STARPU_NMAXWORKERS*sizeof(struct _starpu_fifo_taskq*));
 
 	const char *strval_alpha = getenv("STARPU_SCHED_ALPHA");
 	if (strval_alpha)
-		alpha = atof(strval_alpha);
+		dt->alpha = atof(strval_alpha);
 
 	const char *strval_beta = getenv("STARPU_SCHED_BETA");
 	if (strval_beta)
-		beta = atof(strval_beta);
+		dt->beta = atof(strval_beta);
 
 	const char *strval_gamma = getenv("STARPU_SCHED_GAMMA");
 	if (strval_gamma)
-		_gamma = atof(strval_gamma);
+		dt->_gamma = atof(strval_gamma);	
 
 	const char *strval_idle_power = getenv("STARPU_IDLE_POWER");
 	if (strval_idle_power)
-		idle_power = atof(strval_idle_power);
+		dt->idle_power = atof(strval_idle_power);
 
 #ifdef STARPU_USE_TOP
 	starpu_top_register_parameter_float("DMDA_ALPHA", &alpha,
@@ -686,43 +812,25 @@ static void initialize_dmda_policy(struct starpu_machine_topology *topology,
 	starpu_top_register_parameter_float("DMDA_IDLE_POWER", &idle_power,
 		idle_power_minimum, idle_power_maximum, param_modified);
 #endif /* !STARPU_USE_TOP */
-
-	unsigned workerid;
-	for (workerid = 0; workerid < nworkers; workerid++)
-	{
-		queue_array[workerid] = _starpu_create_fifo();
-
-		_STARPU_PTHREAD_MUTEX_INIT(&sched_mutex[workerid], NULL);
-		_STARPU_PTHREAD_COND_INIT(&sched_cond[workerid], NULL);
-
-		starpu_worker_set_sched_condition(workerid, &sched_cond[workerid], &sched_mutex[workerid]);
-	}
 }
 
-static void initialize_dmda_sorted_policy(struct starpu_machine_topology *topology,
-					struct starpu_sched_policy *policy)
+static void initialize_dmda_sorted_policy(unsigned sched_ctx_id)
 {
-	initialize_dmda_policy(topology, policy);
+	initialize_dmda_policy(sched_ctx_id);
 
 	/* The application may use any integer */
 	starpu_sched_set_min_priority(INT_MIN);
 	starpu_sched_set_max_priority(INT_MAX);
 }
 
-static void deinitialize_dmda_policy(struct starpu_machine_topology *topology,
-				     struct starpu_sched_policy *policy)
+static void deinitialize_dmda_policy(unsigned sched_ctx_id) 
 {
-	(void) policy;
-
-	unsigned workerid;
-	for (workerid = 0; workerid < topology->nworkers; workerid++)
-	{
-		_starpu_destroy_fifo(queue_array[workerid]);
-		_STARPU_PTHREAD_MUTEX_DESTROY(&sched_mutex[workerid]);
-		_STARPU_PTHREAD_COND_DESTROY(&sched_cond[workerid]);
-	}
+	dmda_data *dt = (dmda_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
+	free(dt->queue_array);
+	free(dt);
+	starpu_delete_worker_collection_for_sched_ctx(sched_ctx_id);
 
-	_STARPU_DEBUG("total_task_cnt %ld ready_task_cnt %ld -> %f\n", total_task_cnt, ready_task_cnt, (100.0f*ready_task_cnt)/total_task_cnt);
+	_STARPU_DEBUG("total_task_cnt %ld ready_task_cnt %ld -> %f\n", dt->total_task_cnt, dt->ready_task_cnt, (100.0f*dt->ready_task_cnt)/dt->total_task_cnt);
 }
 
 /* dmda_pre_exec_hook is called right after the data transfer is done and right
@@ -730,24 +838,30 @@ static void deinitialize_dmda_policy(struct starpu_machine_topology *topology,
  * value of the expected start, end, length, etc... */
 static void dmda_pre_exec_hook(struct starpu_task *task)
 {
+	unsigned sched_ctx_id = task->sched_ctx;
 	int workerid = starpu_worker_get_id();
-	struct _starpu_fifo_taskq *fifo = queue_array[workerid];
+	dmda_data *dt = (dmda_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
+	struct _starpu_fifo_taskq *fifo = dt->queue_array[workerid];
 	double model = task->predicted;
 	double transfer_model = task->predicted_transfer;
 
-	/* Once the task is executing, we can update the predicted amount of
- 	 * work. */
-	_STARPU_PTHREAD_MUTEX_LOCK(&sched_mutex[workerid]);
+	pthread_mutex_t *sched_mutex;
+	pthread_cond_t *sched_cond;
+	starpu_worker_get_sched_condition(sched_ctx_id, workerid, &sched_mutex, &sched_cond);
+	/* Once the task is executing, we can update the predicted amount
+	 * of work. */
+	_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
 	fifo->exp_len-= transfer_model;
 	fifo->exp_start = starpu_timing_now() + model;
 	fifo->exp_end= fifo->exp_start + fifo->exp_len;
-	_STARPU_PTHREAD_MUTEX_UNLOCK(&sched_mutex[workerid]);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
 }
 
 static void dmda_push_task_notify(struct starpu_task *task, int workerid)
 {
-	struct _starpu_fifo_taskq *fifo = queue_array[workerid];
-
+	unsigned sched_ctx_id = task->sched_ctx;
+	dmda_data *dt = (dmda_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
+	struct _starpu_fifo_taskq *fifo = dt->queue_array[workerid];
 	/* Compute the expected penality */
 	enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(workerid);
 	unsigned memory_node = starpu_worker_get_memory_node(workerid);
@@ -756,10 +870,13 @@ static void dmda_push_task_notify(struct starpu_task *task, int workerid)
 			_starpu_get_job_associated_to_task(task)->nimpl);
 
 	double predicted_transfer = starpu_task_expected_data_transfer_time(memory_node, task);
+	pthread_mutex_t *sched_mutex;
+	pthread_cond_t *sched_cond;
+	starpu_worker_get_sched_condition(sched_ctx_id, workerid, &sched_mutex, &sched_cond);
 
-	/* Update the predictions */
-	_STARPU_PTHREAD_MUTEX_LOCK(&sched_mutex[workerid]);
 
+	/* Update the predictions */
+	_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
 	/* Sometimes workers didn't take the tasks as early as we expected */
 	fifo->exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
 	fifo->exp_end = fifo->exp_start + fifo->exp_len;
@@ -794,7 +911,7 @@ static void dmda_push_task_notify(struct starpu_task *task, int workerid)
 
 	fifo->ntasks++;
 
-	_STARPU_PTHREAD_MUTEX_UNLOCK(&sched_mutex[workerid]);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
 }
 
 /* TODO: use post_exec_hook to fix the expected start */
@@ -802,6 +919,8 @@ struct starpu_sched_policy _starpu_sched_dm_policy =
 {
 	.init_sched = initialize_dmda_policy,
 	.deinit_sched = deinitialize_dmda_policy,
+	.add_workers = dmda_add_workers ,
+	.remove_workers = dmda_remove_workers,
 	.push_task = dm_push_task,
 	.pop_task = dmda_pop_task,
 	.pre_exec_hook = NULL,
@@ -815,6 +934,8 @@ struct starpu_sched_policy _starpu_sched_dmda_policy =
 {
 	.init_sched = initialize_dmda_policy,
 	.deinit_sched = deinitialize_dmda_policy,
+	.add_workers = dmda_add_workers ,
+	.remove_workers = dmda_remove_workers,
 	.push_task = dmda_push_task,
 	.push_task_notify = dmda_push_task_notify,
 	.pop_task = dmda_pop_task,
@@ -829,6 +950,8 @@ struct starpu_sched_policy _starpu_sched_dmda_sorted_policy =
 {
 	.init_sched = initialize_dmda_sorted_policy,
 	.deinit_sched = deinitialize_dmda_policy,
+	.add_workers = dmda_add_workers ,
+	.remove_workers = dmda_remove_workers,
 	.push_task = dmda_push_sorted_task,
 	.push_task_notify = dmda_push_task_notify,
 	.pop_task = dmda_pop_ready_task,
@@ -843,6 +966,8 @@ struct starpu_sched_policy _starpu_sched_dmda_ready_policy =
 {
 	.init_sched = initialize_dmda_policy,
 	.deinit_sched = deinitialize_dmda_policy,
+	.add_workers = dmda_add_workers ,
+	.remove_workers = dmda_remove_workers,
 	.push_task = dmda_push_task,
 	.push_task_notify = dmda_push_task_notify,
 	.pop_task = dmda_pop_ready_task,

+ 173 - 535
src/sched_policies/detect_combined_workers.c

@@ -25,605 +25,243 @@
 #ifdef STARPU_HAVE_HWLOC
 #include <hwloc.h>
 
-#if 0
-/* struct _starpu_tree
- * ==================
- * Purpose
- * =======
- * Structure representing a tree (which can be a sub-tree itself) whose root is an hwloc
- * object and storing every workers it contained in every sub-trees by recursion.
- *
- * Fields
- * ======
- * obj			A hwloc object which can be a root or a leaf, it may be a numa node, a cache memory or a CPU, etc...
- *
- * nb_workers		Number of CPU workers which can be found by recursion in all the sub-trees beneath this one
- 			or in this very object.
- *
- * workers		CPU-workers found by recursion in all the sub-trees and in this very one, represented as leaves in hwloc.
- */
-
-struct _starpu_tree
-{
-    hwloc_obj_t obj;
-    unsigned nb_workers;
-    int *workers;
-};
-
-/* gather_trees
- * ============
- * Purpose
- * =======
- * Gather all the workers of every source tree in one target tree.
- * We assume the target array of workers is big enough to contain all the workers.
- *
- * Arguments
- * =========
- * target_tree		(input, output)
- *			Pointer to the tree which will contain all the workers of every source.
- *
- * source_trees		(input)
- *			Array of trees we want to combine in a unique tree.
- *
- * nb_source_trees	(input)
- *			Number of trees we want to combine (size of the array).
- */
-
-static void gather_trees(struct _starpu_tree *target_tree, struct _starpu_tree *source_trees, unsigned nb_source_trees)
-{
-    unsigned tree_id, worker_id, index = 0;
-    for(tree_id = 0; tree_id < nb_source_trees; ++tree_id)
-	for(worker_id = 0; worker_id < source_trees[tree_id].nb_workers; ++worker_id)
-	    target_tree->workers[index++] = source_trees[tree_id].workers[worker_id];
-}
-
-/* assign_multiple_trees
- * ========================
- * Purpose
- * =======
- * Assign every tree which is large enough (greater than min_size) and merge small ones.
- * If there is no tree large enough to be assigned any more, we return.
- *
- * Return value
- * ============
- * The number of workers assigned during the function.
- *
- * Arguments
- * =========
- * trees		(input, output)
- *			In entry, array of trees to assign. In the end at most one tree still contains workers.
- *
- * nb_trees		(input)
- *			The number of trees (size of the array).
- *
- * min_size		(input)
- *			Minimum size of a combined worker.
- *
- * max_size		(input)
- *			Maximum size of a combined worker.
- */
-
-static unsigned assign_multiple_trees(struct _starpu_tree *trees, unsigned nb_trees, unsigned int min_size, unsigned int max_size)
+static void find_workers(hwloc_obj_t obj, int cpu_workers[STARPU_NMAXWORKERS], unsigned *n)
 {
-    unsigned short complete = 0;
-    unsigned tree_id, tree_id2, nb_workers_tree, nb_workers_tree2, worker_id, nb_workers_total = 0, nb_workers_assigned = 0;
-
-    for(tree_id = 0; tree_id < nb_trees; ++tree_id)
-	nb_workers_total += trees[tree_id].nb_workers;;
-
-    while(!complete)
-    {
-	complete = 1;
-
-	/* First we manage to assign every subtree large enough to be assigned alone */
-	for(tree_id = 0; tree_id < nb_trees; ++tree_id)
-	{
-	    if(trees[tree_id].nb_workers== 0) // An already assigned subtree
-		continue;
-
-	    nb_workers_tree = trees[tree_id].nb_workers;
-
-	    /* We shouldn't assign a small tree if we could assign the whole trees instead */
-	    if(nb_workers_tree >= min_size && nb_workers_total > max_size)
-	    {
-		int ret = starpu_combined_worker_assign_workerid(nb_workers_tree, trees[tree_id].workers);
-		STARPU_ASSERT(ret >= 0);
-		nb_workers_assigned += nb_workers_tree;
-		nb_workers_total -= nb_workers_tree;
-		trees[tree_id].nb_workers = 0;
-	    }
-	}
-
-	/* Then we merge too small subtrees into not too large ones
-	 * if we manage to merge some subtrees we turn the flag
-	 * complete to 0 thus we know he have to start again to assign
-	 * just merged subtrees */
-	for(tree_id = 0; tree_id < nb_trees; ++tree_id)
-	{
-	    if(trees[tree_id].nb_workers == 0) // An already assigned subtree
-		continue;
-
-	    nb_workers_tree = trees[tree_id].nb_workers;
-
-	    /* We go through the array to find another subtree we can merge with this one */
-	    for(tree_id2 = 0; tree_id2 < nb_trees; ++tree_id2)
-	    {
-		if(trees[tree_id2].nb_workers == 0 || tree_id == tree_id2) // An already assigned subtree or the same
-		    continue;
-
-		nb_workers_tree2 = trees[tree_id2].nb_workers;
-
-		/*  We can merge the two subtrees, let's do it */
-		if(nb_workers_tree + nb_workers_tree2 <= max_size)
+		if (!obj->userdata)
+				/* Not something we run something on, don't care */
+				return;
+		if (obj->userdata == (void*) -1)
 		{
-		    for(worker_id = 0; worker_id < nb_workers_tree2; ++worker_id)
-			trees[tree_id].workers[nb_workers_tree + worker_id] = trees[tree_id2].workers[worker_id];
-
-		    trees[tree_id].nb_workers += nb_workers_tree2;
-		    trees[tree_id2].nb_workers = 0;
-
-		    /* We just merged two subtrees, we need to restart again and try to assign it */
-		    complete = 0;
-		    break;
+				/* Intra node, recurse */
+				unsigned i;
+				for (i = 0; i < obj->arity; i++)
+						find_workers(obj->children[i], cpu_workers, n);
+				return;
+		}
+		
+		/* Got to a PU leaf */
+		struct _starpu_worker *worker = obj->userdata;
+		/* is it a CPU worker? */
+		if (worker->perf_arch == STARPU_CPU_DEFAULT)
+		{
+				_STARPU_DEBUG("worker %d is part of it\n", worker->workerid);
+				/* Add it to the combined worker */
+				cpu_workers[(*n)++] = worker->workerid;
 		}
-	    }
-
-	    if(!complete)
-		break;
-	}
-    }
-
-    return nb_workers_assigned;
-}
-
-/* find_and_assign_combinations_with_hwloc_recursive
- * =================================================
- * Purpose
- * =======
- * Go through the tree given as parameter and try to assign them. Workers it didn't succeed to
- * assign are given back to the calling function to be assigned using data from other subtrees if so.
- *
- * Return value
- * ============
- * The number of workers left to be assigned.
- *
- * Arguments
- * =========
- * tree			(input, output)
- *			Tree structure containing the root to process in entry.
- *			When the function returns it also contains the number of workers left
- *			to be assigned and these very workers in the array previously allocated.
- *
- * min_size		(input)
- *			Minimum size of a combined worker.
- *
- * max_size		(input)
- *			Maximum size of a combined worker.
- */
-
-static unsigned find_and_assign_combinations_with_hwloc_recursive(struct _starpu_tree *tree, unsigned int min_size, unsigned int max_size)
-{
-    unsigned subtree_id, nb_workers = 0;
-
-    hwloc_obj_t obj = tree->obj;
-    int *workers = tree->workers;
-
-    struct _starpu_machine_config *config = _starpu_get_machine_config();
-
-    /* Is this a leaf ? (eg. a PU for hwloc) */
-    if (!hwloc_compare_types(config->cpu_depth, obj->depth))
-    {
-	struct _starpu_worker *worker = obj->userdata;
-
-	/* If this is a CPU worker add it at the beginning
-	 * of the array , write 1 in the field nb_workers and
-	 * return the number of CPU workers found : 1 in this case. */
-	if (worker && worker->arch == STARPU_CPU_WORKER)
-	{
-	    workers[0] = worker->workerid;
-	    tree->nb_workers = 1;
-	    return 1;
-	}
-
-	tree->nb_workers = 0;
-	return 0;
-    }
-
-
-    /* If there is only one child, we go to the next level right away */
-    if (obj->arity == 1)
-    {
-	struct _starpu_tree subtree = *tree;
-	subtree.obj = obj->children[0];
-	nb_workers = find_and_assign_combinations_with_hwloc_recursive(&subtree, min_size, max_size);
-	tree->nb_workers = nb_workers;
-	return nb_workers;
-    }
-
-    /* We recursively go to the leaves of the tree to find subtrees which have the biggest number of
-     * CPU leaves that fits between min and max. */
-
-    /* We allocate an array of tree structures which will contain the current node's subtrees data */
-    struct _starpu_tree *subtrees = (struct _starpu_tree *) malloc(obj->arity * sizeof(struct _starpu_tree));
-
-    /* We allocate the array containing the workers of each subtree and initialize the fields left */
-    for(subtree_id = 0; subtree_id < obj->arity; ++subtree_id)
-    {
-	struct _starpu_tree *subtree = subtrees + subtree_id;
-
-	subtree->obj = obj->children[subtree_id];
-	subtree->nb_workers = 0;
-	subtree->workers = (int *) malloc(config->topology.nhwcpus * sizeof(int));
-    }
-
-    /* We recursively go through every subtree and get all the workers which are not assigned yet */
-    for(subtree_id = 0; subtree_id < obj->arity; ++subtree_id)
-	nb_workers += find_and_assign_combinations_with_hwloc_recursive(subtrees + subtree_id, min_size, max_size);
-
-    if(nb_workers > max_size)
-    {
-	/* We withdraw the number of workers just assigned from the total number of workers */
-	nb_workers -= assign_multiple_trees(subtrees, obj->arity, min_size, max_size);
-
-	/* Some workers are not assigned yet : we gather them in the array
-	 * which is returned to the father which will handle them later */
-	if(nb_workers)
-	    gather_trees(tree, subtrees, obj->arity);
-    }
-    else if(nb_workers < max_size)
-    {
-	gather_trees(tree, subtrees, obj->arity);
-    }
-    else // nb_workers == max_size
-    {
-	gather_trees(tree, subtrees, obj->arity);
-
-	int ret = starpu_combined_worker_assign_workerid(nb_workers, workers);
-	STARPU_ASSERT(ret >= 0);
-	nb_workers = 0;
-    }
-
-    for(subtree_id = 0; subtree_id < obj->arity; ++subtree_id)
-	free(subtrees[subtree_id].workers);
-    free(subtrees);
-
-    tree->nb_workers = nb_workers;
-    return nb_workers;
-}
-
-/* get_min_max_sizes
- * =================================================
- * Purpose
- * =======
- * First, try to get the value from the STARPU_MIN_WORKERSIZE and STARPU_MAX_WORKERSIZE
- * environment variables.
- * If both of them were not set, then we try do get some efficient values following the rule beneath :
- *
- * 				-->   exact 	-->  MIN_SIZE = S-1 <--> MAX_SIZE = S+1
- * S = square_root(nb_cpus)
- *				-->   decimal 	-->  MIN_SIZE = truncation(S) <--> MAX_SIZE = rounding_up(S)
- *
- * If only one of both was not set then we set it with a value relative to the other, for example :
- *
- *		 	MIN_SIZE = MAX_SIZE - 1 or MAX_SIZE = MIN_SIZE + 1
- *
- * Arguments
- * =========
- * min_size		(output)
- *			Pointer to the minimum size of a combined worker, whether set with
- *			value given by the user or processed from the number of cpus.
- *
- * max_size		(output)
- *			Pointer to the maximum size of a combined worker, whether set with
- *			value given by the user or processed from the number of cpus.
- *
- * topology		(input)
- *			Topology of the machine : used to know the number of cpus.
- */
-
-static void get_min_max_sizes(unsigned int *min_size, unsigned int *max_size, struct starpu_machine_topology *topology)
-{
-    int _min_size, _max_size;
-
-    _min_size = starpu_get_env_number("STARPU_MIN_WORKERSIZE");
-    _max_size = starpu_get_env_number("STARPU_MAX_WORKERSIZE");
-
-    /* If the user didn't set both the environment variables,
-     * we need to find a minimum and a maximum size ourselves */
-    if(_min_size <= -1 || _max_size <= -1)
-    {
-
-	int nb_cpus = topology->nhwcpus;
-	int sqrt_nb_cpus = (int)sqrt((double)nb_cpus);
-	int exact = (sqrt_nb_cpus * sqrt_nb_cpus == nb_cpus);
-
-	    if(_min_size == -1)
-	    {
-		if(_max_size > -1)
-		    _min_size = _max_size - 1;
-		else
-		    _min_size = exact ? sqrt_nb_cpus - 1 : sqrt_nb_cpus;
-	    }
-
-	if(_max_size == -1)
-	{
-	    if(_min_size > -1)
-		_max_size = _min_size + 1;
-	    else
-		_max_size = sqrt_nb_cpus + 1;
-	}
-    }
-
-    *min_size = _min_size;
-    *max_size = _max_size;
-
-    return;
-}
-
-/* find_and_assign_combinations_with_hwloc
- * =======================================
- * Purpose
- * =======
- * Launches find_and_assign_combinations_with_hwloc_recursive function on the root
- * of the hwloc tree to gather and assign combined cpu workers in an efficient manner.
- * When find_and_assign_combinations_with_hwloc_recursive returns, if there are still
- * some workers, we assign them no matter the number for there is no way to respect
- * the wanted sizes anymore.
- *
- * Arguments
- * =========
- * topology		(input)
- *			Topology of the machine : used to know the number of cpus and
- *			to get the hwloc tree.
- */
-
-static void find_and_assign_combinations_with_hwloc(struct starpu_machine_topology *topology)
-{
-    unsigned nb_workers;
-    unsigned int min_size, max_size;
-
-    get_min_max_sizes(&min_size, &max_size, topology);
-
-    STARPU_ASSERT(min_size <= max_size);
-
-    struct _starpu_tree tree;
-
-    /* Of course we start from the root */
-    tree.obj = hwloc_get_obj_by_depth(topology->hwtopology, HWLOC_OBJ_SYSTEM, 0);
-    tree.nb_workers = 0;
-    tree.workers = (int *) malloc(topology->nhwcpus * sizeof(int));
-
-    /* We recursively go from the root to the leaves of the tree to find
-     * subtrees that only have CPUs as leaves. */
-    nb_workers = find_and_assign_combinations_with_hwloc_recursive(&tree, min_size, max_size);
-
-    /* There are still some workers left, since the only possibility is that
-     * the number of workers left is less than the minimum worker size we assign them all */
-    if(nb_workers > 0)
-    {
-	/* find_and_assign_combinations_with_hwloc_recursive shouldn't return
-	 * while there are enough workers to assign regarding the min_size value */
-	STARPU_ASSERT(nb_workers <= max_size);
-
-	int ret = starpu_combined_worker_assign_workerid(nb_workers, tree.workers);
-	STARPU_ASSERT(ret >= 0);
-    }
-
-    free(tree.workers);
-}
-#endif
-
-static void find_workers(hwloc_obj_t obj, int cpu_workers[STARPU_NMAXWORKERS], unsigned *n)
-{
-    if (!obj->userdata)
-	/* Not something we run something on, don't care */
-	return;
-    if (obj->userdata == (void*) -1)
-    {
-	/* Intra node, recurse */
-	unsigned i;
-	for (i = 0; i < obj->arity; i++)
-	    find_workers(obj->children[i], cpu_workers, n);
-	return;
-    }
-
-    /* Got to a PU leaf */
-    struct _starpu_worker *worker = obj->userdata;
-    /* is it a CPU worker? */
-    if (worker->perf_arch == STARPU_CPU_DEFAULT)
-    {
-	_STARPU_DEBUG("worker %d is part of it\n", worker->workerid);
-	/* Add it to the combined worker */
-	cpu_workers[(*n)++] = worker->workerid;
-    }
 }
 
-static void synthesize_intermediate_workers(struct starpu_machine_topology *topology, hwloc_obj_t *children, unsigned arity, unsigned n, unsigned synthesize_arity)
+static void synthesize_intermediate_workers(hwloc_obj_t *children, unsigned arity, unsigned n, unsigned synthesize_arity)
 {
-    unsigned nworkers, i, j;
-    unsigned chunk_size = (n + synthesize_arity-1) / synthesize_arity;
-    unsigned chunk_start;
-    int cpu_workers[STARPU_NMAXWORKERS];
-    int ret;
-
-    if (n <= synthesize_arity)
-	/* Not too many children, do not synthesize */
-	return;
-
-    _STARPU_DEBUG("%u children > %u, synthesizing intermediate combined workers of size %u\n", n, synthesize_arity, chunk_size);
-
-    n = 0;
-    j = 0;
-    nworkers = 0;
-    chunk_start = 0;
-    for (i = 0 ; i < arity; i++)
-    {
-	if (children[i]->userdata) {
-	    n++;
-	    _STARPU_DEBUG("child %u\n", i);
-	    find_workers(children[i], cpu_workers, &nworkers);
-	    j++;
-	}
-	/* Completed a chunk, or last bit (but not if it's just 1 subobject) */
-	if (j == chunk_size || (i == arity-1 && j > 1)) {
-	    _STARPU_DEBUG("Adding it\n");
-	    ret = starpu_combined_worker_assign_workerid(nworkers, cpu_workers);
-	    STARPU_ASSERT(ret >= 0);
-	    /* Recurse there */
-	    synthesize_intermediate_workers(topology, children+chunk_start, i - chunk_start, n, synthesize_arity);
-	    /* And restart another one */
-	    n = 0;
-	    j = 0;
-	    nworkers = 0;
-	    chunk_start = i+1;
-	}
-    }
+		unsigned nworkers, i, j;
+		unsigned chunk_size = (n + synthesize_arity-1) / synthesize_arity;
+		unsigned chunk_start;
+		int cpu_workers[STARPU_NMAXWORKERS];
+		int ret;
+		
+		if (n <= synthesize_arity)
+				/* Not too many children, do not synthesize */
+				return;
+
+		_STARPU_DEBUG("%u children > %u, synthesizing intermediate combined workers of size %u\n", n, synthesize_arity, chunk_size);
+
+		n = 0;
+		j = 0;
+		nworkers = 0;
+		chunk_start = 0;
+		for (i = 0 ; i < arity; i++)
+		{
+				if (children[i]->userdata) 
+				{
+						n++;
+						_STARPU_DEBUG("child %u\n", i);
+						find_workers(children[i], cpu_workers, &nworkers);
+						j++;
+				}
+				/* Completed a chunk, or last bit (but not if it's just 1 subobject) */
+				if (j == chunk_size || (i == arity-1 && j > 1)) 
+				{
+						_STARPU_DEBUG("Adding it\n");
+						ret = starpu_combined_worker_assign_workerid(nworkers, cpu_workers);
+						STARPU_ASSERT(ret >= 0);
+						/* Recurse there */
+						synthesize_intermediate_workers(children+chunk_start, i - chunk_start, n, synthesize_arity);
+						/* And restart another one */
+						n = 0;
+						j = 0;
+						nworkers = 0;
+						chunk_start = i+1;
+				}
+		}
 }
 
-static void find_and_assign_combinations(struct starpu_machine_topology *topology, hwloc_obj_t obj, unsigned synthesize_arity)
+static void find_and_assign_combinations(hwloc_obj_t obj, unsigned synthesize_arity)
 {
     char name[64];
     unsigned i, n, nworkers;
     int cpu_workers[STARPU_NMAXWORKERS];
 
-    int ret;
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
+	struct starpu_machine_topology *topology = &config->topology;
 
     hwloc_obj_snprintf(name, sizeof(name), topology->hwtopology, obj, "#", 0);
     _STARPU_DEBUG("Looking at %s\n", name);
 
     for (n = 0, i = 0; i < obj->arity; i++)
-	if (obj->children[i]->userdata)
-	    /* it has a CPU worker */
-	    n++;
-
+			if (obj->children[i]->userdata)
+					/* it has a CPU worker */
+					n++;
+	
     if (n == 1) {
-	/* If there is only one child, we go to the next level right away */
-	find_and_assign_combinations(topology, obj->children[0], synthesize_arity);
-	return;
+			/* If there is only one child, we go to the next level right away */
+			find_and_assign_combinations(obj->children[0], synthesize_arity);
+			return;
     }
-
+	
     /* Add this object */
     nworkers = 0;
     find_workers(obj, cpu_workers, &nworkers);
-
+	
     if (nworkers > 1)
     {
-	_STARPU_DEBUG("Adding it\n");
-	ret = starpu_combined_worker_assign_workerid(nworkers, cpu_workers);
-	STARPU_ASSERT(ret >= 0);
+			_STARPU_DEBUG("Adding it\n");
+			unsigned sched_ctx_id  = starpu_get_sched_ctx();
+			if(sched_ctx_id == STARPU_NMAX_SCHED_CTXS)
+					sched_ctx_id = 0; 
+			
+			struct worker_collection* workers = starpu_get_worker_collection_of_sched_ctx(sched_ctx_id);
+
+			int newworkerid = starpu_combined_worker_assign_workerid(nworkers, cpu_workers);
+			STARPU_ASSERT(newworkerid >= 0);
+			workers->add(workers,newworkerid);
     }
-
+	
     /* Add artificial intermediate objects recursively */
-    synthesize_intermediate_workers(topology, obj->children, obj->arity, n, synthesize_arity);
-
+    synthesize_intermediate_workers(obj->children, obj->arity, n, synthesize_arity);
+	
     /* And recurse */
     for (i = 0; i < obj->arity; i++)
-	if (obj->children[i]->userdata == (void*) -1)
-	    find_and_assign_combinations(topology, obj->children[i], synthesize_arity);
+			if (obj->children[i]->userdata == (void*) -1)
+					find_and_assign_combinations(obj->children[i], synthesize_arity);
 }
 
-static void find_and_assign_combinations_with_hwloc(struct starpu_machine_topology *topology)
+static void find_and_assign_combinations_with_hwloc(int *workerids, int nworkers)
 {
-    unsigned i;
-    struct _starpu_machine_config *config = _starpu_get_machine_config();
-    int synthesize_arity = starpu_get_env_number("STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER");
-
-    if (synthesize_arity == -1)
-	synthesize_arity = 2;
-
-    /* First, mark nodes which contain CPU workers, simply by setting their userdata field */
-    for (i = 0; i < topology->nworkers; i++)
-    {
-	struct _starpu_worker *worker = &config->workers[i];
-	if (worker->perf_arch == STARPU_CPU_DEFAULT)
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
+	struct starpu_machine_topology *topology = &config->topology;
+	int synthesize_arity = starpu_get_env_number("STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER");
+	
+	if (synthesize_arity == -1)
+		synthesize_arity = 2;
+	
+	/* First, mark nodes which contain CPU workers, simply by setting their userdata field */
+	int i;
+	for (i = 0; i < nworkers; i++)
 	{
-	    hwloc_obj_t obj = hwloc_get_obj_by_depth(topology->hwtopology, config->cpu_depth, worker->bindid);
-	    STARPU_ASSERT(obj->userdata == worker);
-	    obj = obj->parent;
-	    while (obj) {
-		obj->userdata = (void*) -1;
-		obj = obj->parent;
-	    }
+		struct _starpu_worker *worker = _starpu_get_worker_struct(workerids[i]);
+		if (worker->perf_arch == STARPU_CPU_DEFAULT)
+		{
+			hwloc_obj_t obj = hwloc_get_obj_by_depth(topology->hwtopology, config->cpu_depth, worker->bindid);
+			STARPU_ASSERT(obj->userdata == worker);
+			obj = obj->parent;
+			while (obj) {
+				obj->userdata = (void*) -1;
+				obj = obj->parent;
+			}
+		}
 	}
-    }
-    find_and_assign_combinations(topology, hwloc_get_root_obj(topology->hwtopology), synthesize_arity);
+	find_and_assign_combinations(hwloc_get_root_obj(topology->hwtopology), synthesize_arity);
 }
 
 #else /* STARPU_HAVE_HWLOC */
 
-static void find_and_assign_combinations_without_hwloc(struct starpu_machine_topology *topology)
+static void find_and_assign_combinations_without_hwloc(int *workerids, int nworkers)
 {
-    struct _starpu_machine_config *config = _starpu_get_machine_config();
-
+    unsigned sched_ctx_id  = starpu_get_sched_ctx();
+    if(sched_ctx_id == STARPU_NMAX_SCHED_CTXS)
+	    sched_ctx_id = 0; 
+	
+    struct worker_collection* workers = starpu_get_worker_collection_of_sched_ctx(sched_ctx_id);
+	
+	
     /* We put the id of all CPU workers in this array */
     int cpu_workers[STARPU_NMAXWORKERS];
     unsigned ncpus = 0;
-
+	
+    struct _starpu_worker *worker;
     unsigned i;
-    for (i = 0; i < topology->nworkers; i++)
+    for (i = 0; i < nworkers; i++)
     {
-	if (config->workers[i].perf_arch == STARPU_CPU_DEFAULT)
-	    cpu_workers[ncpus++] = i;
+	    worker = _starpu_get_worker_struct(workerids[i]);
+		
+	    if (worker->perf_arch == STARPU_CPU_DEFAULT)
+		    cpu_workers[ncpus++] = i;
     }
-
+	
     unsigned size;
     for (size = 2; size <= ncpus; size *= 2)
     {
-	unsigned first_cpu;
-	for (first_cpu = 0; first_cpu < ncpus; first_cpu += size)
-	{
-	    if (first_cpu + size <= ncpus)
-	    {
-		int workerids[size];
-
-		for (i = 0; i < size; i++)
-		    workerids[i] = cpu_workers[first_cpu + i];
-
-		/* We register this combination */
-		int ret;
-		ret = starpu_combined_worker_assign_workerid(size, workerids);
-		STARPU_ASSERT(ret >= 0);
-	    }
-	}
+		unsigned first_cpu;
+		for (first_cpu = 0; first_cpu < ncpus; first_cpu += size)
+		{
+			if (first_cpu + size <= ncpus)
+			{
+				int found_workerids[size];
+				
+				for (i = 0; i < size; i++)
+					found_workerids[i] = cpu_workers[first_cpu + i];
+				
+				/* We register this combination */
+				int newworkerid;
+				newworkerid = starpu_combined_worker_assign_workerid(size, found_workerids);
+				STARPU_ASSERT(newworkerid >= 0);
+				workers->add(workers, newworkerid);
+			}
+		}
     }
 }
 
 #endif /* STARPU_HAVE_HWLOC */
 
-static void combine_all_cpu_workers(struct starpu_machine_topology *topology)
-{
-    struct _starpu_machine_config *config = _starpu_get_machine_config();
-
-    int cpu_workers[STARPU_NMAXWORKERS];
-    unsigned ncpus = 0;
 
-    unsigned i;
-    for (i = 0; i < topology->nworkers; i++)
-    {
-	if (config->workers[i].perf_arch == STARPU_CPU_DEFAULT)
-	    cpu_workers[ncpus++] = i;
-    }
-
-    for (i = ncpus; i >= 1; i--)
-    {
-	int ret;
-	ret = starpu_combined_worker_assign_workerid(i, cpu_workers);
-	STARPU_ASSERT(ret >= 0);
-    }
+static void combine_all_cpu_workers(int *workerids, int nworkers)
+{
+	unsigned sched_ctx_id  = starpu_get_sched_ctx();
+	if(sched_ctx_id == STARPU_NMAX_SCHED_CTXS)
+		sched_ctx_id = 0;
+	struct worker_collection* workers = starpu_get_worker_collection_of_sched_ctx(sched_ctx_id);
+	int cpu_workers[STARPU_NMAXWORKERS];
+	int ncpus = 0;
+	struct _starpu_worker *worker;
+	int i;
+	for (i = 0; i < nworkers; i++)
+	{
+		worker = _starpu_get_worker_struct(workerids[i]);
+		
+		if (worker->perf_arch == STARPU_CPU_DEFAULT)
+			cpu_workers[ncpus++] = workerids[i];
+	}
+	
+	for (i = 1; i <= ncpus; i++)
+	{
+		int newworkerid;
+		newworkerid = starpu_combined_worker_assign_workerid(i, cpu_workers);
+		STARPU_ASSERT(newworkerid >= 0);
+		workers->add(workers, newworkerid);
+	}
 }
 
-void _starpu_sched_find_worker_combinations(struct starpu_machine_topology *topology)
+void _starpu_sched_find_worker_combinations(int *workerids, int nworkers)
 {
     struct _starpu_machine_config *config = _starpu_get_machine_config();
 
     if (config->conf->single_combined_worker > 0)
-	combine_all_cpu_workers(topology);
+	    combine_all_cpu_workers(workerids, nworkers);
     else
     {
 #ifdef STARPU_HAVE_HWLOC
-	find_and_assign_combinations_with_hwloc(topology);
+	    find_and_assign_combinations_with_hwloc(workerids, nworkers);
 #else
-	find_and_assign_combinations_without_hwloc(topology);
+	    find_and_assign_combinations_without_hwloc(workerids, nworkers);
 #endif
     }
 }

+ 1 - 1
src/sched_policies/detect_combined_workers.h

@@ -17,5 +17,5 @@
 #include <starpu.h>
 
 /* Initialize combined workers */
-void _starpu_sched_find_worker_combinations(struct starpu_machine_topology *topology);
+void _starpu_sched_find_worker_combinations(int *workerids, int nworkers);
 

+ 76 - 21
src/sched_policies/eager_central_policy.c

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2010-2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -23,56 +24,110 @@
 #include <core/workers.h>
 #include <sched_policies/fifo_queues.h>
 
-/* the former is the actual queue, the latter some container */
-static struct _starpu_fifo_taskq *fifo;
+typedef struct {
+	struct _starpu_fifo_taskq *fifo;
+	pthread_mutex_t sched_mutex;
+	pthread_cond_t sched_cond;
+} eager_center_policy_data;
 
-static _starpu_pthread_cond_t sched_cond;
-static _starpu_pthread_mutex_t sched_mutex;
+static void eager_add_workers(unsigned sched_ctx_id, int *workerids, unsigned nworkers) 
+{
+	eager_center_policy_data *data = (eager_center_policy_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
+	unsigned i;
+	int workerid;
+	for (i = 0; i < nworkers; i++)
+	{
+		workerid = workerids[i];
+		starpu_worker_set_sched_condition(sched_ctx_id, workerid, &data->sched_mutex, &data->sched_cond);
+	}
+}
+
+static void eager_remove_workers(unsigned sched_ctx_id, int *workerids, unsigned nworkers)
+{
+	unsigned i;
+	int workerid;
+	for (i = 0; i < nworkers; i++)
+	{
+		workerid = workerids[i];
+		starpu_worker_set_sched_condition(sched_ctx_id, workerid, NULL, NULL);
+	}
+}
 
-static void initialize_eager_center_policy(struct starpu_machine_topology *topology,
-		   __attribute__ ((unused)) struct starpu_sched_policy *_policy)
+static void initialize_eager_center_policy(unsigned sched_ctx_id) 
 {
+	starpu_create_worker_collection_for_sched_ctx(sched_ctx_id, WORKER_LIST);
+
+	eager_center_policy_data *data = (eager_center_policy_data*)malloc(sizeof(eager_center_policy_data));
+
 	_STARPU_DISP("Warning: you are running the default eager scheduler, which is not very smart. Make sure to read the StarPU documentation about adding performance models in order to be able to use the dmda scheduler instead.\n");
 
 	/* there is only a single queue in that trivial design */
-	fifo = _starpu_create_fifo();
+	data->fifo =  _starpu_create_fifo();
 
-	_STARPU_PTHREAD_MUTEX_INIT(&sched_mutex, NULL);
-	_STARPU_PTHREAD_COND_INIT(&sched_cond, NULL);
+	_STARPU_PTHREAD_MUTEX_INIT(&data->sched_mutex, NULL);
+	_STARPU_PTHREAD_COND_INIT(&data->sched_cond, NULL);
 
-	unsigned workerid;
-	for (workerid = 0; workerid < topology->nworkers; workerid++)
-		starpu_worker_set_sched_condition(workerid, &sched_cond, &sched_mutex);
+	starpu_set_sched_ctx_policy_data(sched_ctx_id, (void*)data);
 }
 
-static void deinitialize_eager_center_policy(__attribute__ ((unused)) struct starpu_machine_topology *topology,
-		   __attribute__ ((unused)) struct starpu_sched_policy *_policy)
+static void deinitialize_eager_center_policy(unsigned sched_ctx_id) 
 {
-	//STARPU_ASSERT(_starpu_fifo_empty(fifo));
+	/* TODO check that there is no task left in the queue */
+
+	eager_center_policy_data *data = (eager_center_policy_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
 
 	/* deallocate the job queue */
-	_starpu_destroy_fifo(fifo);
+	_starpu_destroy_fifo(data->fifo);
+
+	_STARPU_PTHREAD_MUTEX_DESTROY(&data->sched_mutex);
+	_STARPU_PTHREAD_COND_DESTROY(&data->sched_cond);
+	
+	starpu_delete_worker_collection_for_sched_ctx(sched_ctx_id);
+
+	free(data);	
 }
 
 static int push_task_eager_policy(struct starpu_task *task)
 {
-	return _starpu_fifo_push_task(fifo, &sched_mutex, &sched_cond, task);
+	unsigned sched_ctx_id = task->sched_ctx;
+	eager_center_policy_data *data = (eager_center_policy_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
+	pthread_mutex_t *changing_ctx_mutex = starpu_get_changing_ctx_mutex(sched_ctx_id);
+	unsigned nworkers;
+	int ret_val = -1;
+	
+	_STARPU_PTHREAD_MUTEX_LOCK(changing_ctx_mutex);
+	nworkers = starpu_get_nworkers_of_sched_ctx(sched_ctx_id);
+	if(nworkers == 0)
+	{
+		_STARPU_PTHREAD_MUTEX_UNLOCK(changing_ctx_mutex);
+		return ret_val;
+	}
+
+	ret_val = _starpu_fifo_push_task(data->fifo, &data->sched_mutex, &data->sched_cond, task);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(changing_ctx_mutex);
+	return ret_val;
 }
 
-static struct starpu_task *pop_every_task_eager_policy(void)
+static struct starpu_task *pop_every_task_eager_policy(unsigned sched_ctx_id)
 {
-	return _starpu_fifo_pop_every_task(fifo, &sched_mutex, starpu_worker_get_id());
+	eager_center_policy_data *data = (eager_center_policy_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
+	return _starpu_fifo_pop_every_task(data->fifo, &data->sched_mutex, starpu_worker_get_id());
 }
 
-static struct starpu_task *pop_task_eager_policy(void)
+static struct starpu_task *pop_task_eager_policy(unsigned sched_ctx_id)
 {
-	return _starpu_fifo_pop_task(fifo, starpu_worker_get_id());
+	unsigned workerid = starpu_worker_get_id();
+	eager_center_policy_data *data = (eager_center_policy_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
+	
+	return _starpu_fifo_pop_task(data->fifo, workerid);
 }
 
 struct starpu_sched_policy _starpu_sched_eager_policy =
 {
 	.init_sched = initialize_eager_center_policy,
 	.deinit_sched = deinitialize_eager_center_policy,
+	.add_workers = eager_add_workers,
+	.remove_workers = eager_remove_workers,
 	.push_task = push_task_eager_policy,
 	.pop_task = pop_task_eager_policy,
 	.pre_exec_hook = NULL,

+ 82 - 25
src/sched_policies/eager_central_priority_policy.c

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -41,13 +42,11 @@ struct _starpu_priority_taskq
 	unsigned total_ntasks;
 };
 
-/* the former is the actual queue, the latter some container */
-static struct _starpu_priority_taskq *taskq;
-
-/* keep track of the total number of tasks to be scheduled to avoid infinite
- * polling when there are really few tasks in the overall queue */
-static _starpu_pthread_cond_t global_sched_cond;
-static _starpu_pthread_mutex_t global_sched_mutex;
+typedef struct eager_central_prio_data{
+	struct _starpu_priority_taskq *taskq;
+	_starpu_pthread_mutex_t sched_mutex;
+	_starpu_pthread_cond_t sched_cond;
+} eager_central_prio_data;
 
 /*
  * Centralized queue with priorities
@@ -75,37 +74,88 @@ static void _starpu_destroy_priority_taskq(struct _starpu_priority_taskq *priori
 	free(priority_queue);
 }
 
-static void initialize_eager_center_priority_policy(struct starpu_machine_topology *topology,
-			__attribute__ ((unused))	struct starpu_sched_policy *_policy)
+static void eager_priority_add_workers(unsigned sched_ctx_id, int *workerids, unsigned nworkers) 
+{
+	eager_central_prio_data *data = (eager_central_prio_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
+
+	unsigned i;
+	int workerid;
+	for (i = 0; i < nworkers; i++)
+	{
+		workerid = workerids[i];
+		starpu_worker_set_sched_condition(sched_ctx_id, workerid, &data->sched_mutex, &data->sched_cond);
+	}
+}
+
+static void eager_priority_remove_workers(unsigned sched_ctx_id, int *workerids, unsigned nworkers)
+{
+	unsigned i;
+	int workerid;
+	for (i = 0; i < nworkers; i++)
+	{
+		workerid = workerids[i];
+		starpu_worker_set_sched_condition(sched_ctx_id, workerid, NULL, NULL);
+	}	
+}
+
+static void initialize_eager_center_priority_policy(unsigned sched_ctx_id) 
 {
+	starpu_create_worker_collection_for_sched_ctx(sched_ctx_id, WORKER_LIST);
+	eager_central_prio_data *data = (eager_central_prio_data*)malloc(sizeof(eager_central_prio_data));
+
 	/* In this policy, we support more than two levels of priority. */
 	starpu_sched_set_min_priority(MIN_LEVEL);
 	starpu_sched_set_max_priority(MAX_LEVEL);
 
 	/* only a single queue (even though there are several internaly) */
-	taskq = _starpu_create_priority_taskq();
+	data->taskq = _starpu_create_priority_taskq();
+	starpu_set_sched_ctx_policy_data(sched_ctx_id, (void*)data);
 
-	_STARPU_PTHREAD_MUTEX_INIT(&global_sched_mutex, NULL);
-	_STARPU_PTHREAD_COND_INIT(&global_sched_cond, NULL);
+	_STARPU_PTHREAD_MUTEX_INIT(&data->sched_mutex, NULL);
+	_STARPU_PTHREAD_COND_INIT(&data->sched_cond, NULL);
 
-	unsigned workerid;
-	for (workerid = 0; workerid < topology->nworkers; workerid++)
-		starpu_worker_set_sched_condition(workerid, &global_sched_cond, &global_sched_mutex);
 }
 
-static void deinitialize_eager_center_priority_policy(struct starpu_machine_topology *topology __attribute__ ((unused)),
-		   __attribute__ ((unused)) struct starpu_sched_policy *_policy)
+static void deinitialize_eager_center_priority_policy(unsigned sched_ctx_id) 
 {
 	/* TODO check that there is no task left in the queue */
+	eager_central_prio_data *data = (eager_central_prio_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
 
 	/* deallocate the task queue */
-	_starpu_destroy_priority_taskq(taskq);
+	_starpu_destroy_priority_taskq(data->taskq);
+
+	_STARPU_PTHREAD_MUTEX_DESTROY(&data->sched_mutex);
+        _STARPU_PTHREAD_COND_DESTROY(&data->sched_cond);
+
+	starpu_delete_worker_collection_for_sched_ctx(sched_ctx_id);
+        free(data);
+	
 }
 
 static int _starpu_priority_push_task(struct starpu_task *task)
 {
+	unsigned sched_ctx_id = task->sched_ctx;
+	eager_central_prio_data *data = (eager_central_prio_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
+
+	struct _starpu_priority_taskq *taskq = data->taskq;
+
+	/* if the context has no workers return */
+	pthread_mutex_t *changing_ctx_mutex = starpu_get_changing_ctx_mutex(sched_ctx_id);
+        unsigned nworkers;
+        int ret_val = -1;
+
+        _STARPU_PTHREAD_MUTEX_LOCK(changing_ctx_mutex);
+        nworkers = starpu_get_nworkers_of_sched_ctx(sched_ctx_id);
+        if(nworkers == 0)
+        {
+                _STARPU_PTHREAD_MUTEX_UNLOCK(changing_ctx_mutex);
+                return ret_val;
+        }
+
+
+	/*if there are no tasks block */
 	/* wake people waiting for a task */
-	_STARPU_PTHREAD_MUTEX_LOCK(&global_sched_mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&data->sched_mutex);
 
 	_STARPU_TRACE_JOB_PUSH(task, 1);
 
@@ -115,18 +165,23 @@ static int _starpu_priority_push_task(struct starpu_task *task)
 	taskq->ntasks[priolevel]++;
 	taskq->total_ntasks++;
 
-	_STARPU_PTHREAD_COND_SIGNAL(&global_sched_cond);
-	_STARPU_PTHREAD_MUTEX_UNLOCK(&global_sched_mutex);
+	_STARPU_PTHREAD_COND_SIGNAL(&data->sched_cond);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&data->sched_mutex);
 
-	return 0;
+        _STARPU_PTHREAD_MUTEX_UNLOCK(changing_ctx_mutex);
+        return 0;
 }
 
-static struct starpu_task *_starpu_priority_pop_task(void)
+static struct starpu_task *_starpu_priority_pop_task(unsigned sched_ctx_id)
 {
 	struct starpu_task *chosen_task = NULL, *task;
 	unsigned workerid = starpu_worker_get_id();
 	int skipped = 0;
 
+	eager_central_prio_data *data = (eager_central_prio_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
+	
+	struct _starpu_priority_taskq *taskq = data->taskq;
+
 	/* block until some event happens */
 
 	if ((taskq->total_ntasks == 0) && _starpu_machine_is_running())
@@ -134,7 +189,7 @@ static struct starpu_task *_starpu_priority_pop_task(void)
 #ifdef STARPU_NON_BLOCKING_DRIVERS
 		return NULL;
 #else
-		_STARPU_PTHREAD_COND_WAIT(&global_sched_cond, &global_sched_mutex);
+		_STARPU_PTHREAD_COND_WAIT(&data->sched_cond, &data->sched_mutex);
 #endif
 	}
 
@@ -170,7 +225,7 @@ static struct starpu_task *_starpu_priority_pop_task(void)
 
 	if (!chosen_task && skipped)
 		/* Notify another worker to do that task */
-		_STARPU_PTHREAD_COND_SIGNAL(&global_sched_cond);
+		_STARPU_PTHREAD_COND_SIGNAL(&data->sched_cond);
 
 	return chosen_task;
 }
@@ -179,6 +234,8 @@ struct starpu_sched_policy _starpu_sched_prio_policy =
 {
 	.init_sched = initialize_eager_center_priority_policy,
 	.deinit_sched = deinitialize_eager_center_priority_policy,
+        .add_workers = eager_priority_add_workers,
+        .remove_workers = eager_priority_remove_workers,
 	/* we always use priorities in that policy */
 	.push_task = _starpu_priority_push_task,
 	.pop_task = _starpu_priority_pop_task,

+ 641 - 0
src/sched_policies/heft.c

@@ -0,0 +1,641 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2011-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Télécom-SudParis
+ * Copyright (C) 2011  INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/* Distributed queues using performance modeling to assign tasks */
+
+#include <float.h>
+
+#include <core/workers.h>
+#include <core/perfmodel/perfmodel.h>
+#include <core/task_bundle.h>
+#include <core/workers.h>
+#include <starpu_parameters.h>
+#include <starpu_task_bundle.h>
+#include <starpu_top.h>
+#include <core/jobs.h>
+#include <top/starpu_top_core.h>
+#include <sched_policies/fifo_queues.h>
+#include <core/debug.h>
+
+#ifndef DBL_MIN
+#define DBL_MIN __DBL_MIN__
+#endif
+
+#ifndef DBL_MAX
+#define DBL_MAX __DBL_MAX__
+#endif
+
+
+static double current_time[STARPU_NMAXWORKERS][STARPU_NMAX_SCHED_CTXS];
+
+typedef struct {
+	double alpha;
+	double beta;
+	double _gamma;
+	double idle_power;
+	struct _starpu_fifo_taskq **queue_array;
+} heft_data;
+
+const float alpha_minimum=0;
+const float alpha_maximum=10.0;
+const float beta_minimum=0;
+const float beta_maximum=10.0;
+const float gamma_minimum=0;
+const float gamma_maximum=10000.0;
+const float idle_power_minimum=0;
+const float idle_power_maximum=10000.0;
+
+static void param_modified(struct starpu_top_param* d)
+{
+	//just to show parameter modification
+	fprintf(stderr,"%s has been modified : %f !\n", d->name, *(double*)d->value);
+}
+
+
+static void heft_add_workers(unsigned sched_ctx_id, int *workerids, unsigned nworkers)
+{
+	heft_data *hd = (heft_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
+
+	int workerid;
+	unsigned i;
+	for (i = 0; i < nworkers; i++)
+	{
+		workerid = workerids[i];
+		hd->queue_array[workerid] = _starpu_create_fifo();
+		starpu_worker_init_sched_condition(sched_ctx_id, workerid);
+
+		current_time[workerid][sched_ctx_id] = 0.0;
+	}
+}
+
+static void heft_remove_workers(unsigned sched_ctx_id, int *workerids, unsigned nworkers)
+{
+	heft_data *hd = (heft_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
+
+	int workerid;
+	unsigned i;
+	for (i = 0; i < nworkers; i++)
+	{
+		workerid = workerids[i];
+		_starpu_destroy_fifo(hd->queue_array[workerid]);
+		starpu_worker_deinit_sched_condition(sched_ctx_id, workerid);
+		current_time[workerid][sched_ctx_id] = 0.0;
+	}
+}
+
+static void heft_init(unsigned sched_ctx_id)
+{
+	starpu_create_worker_collection_for_sched_ctx(sched_ctx_id, WORKER_LIST);
+
+	heft_data *hd = (heft_data*)malloc(sizeof(heft_data));
+	hd->alpha = _STARPU_DEFAULT_ALPHA;
+	hd->beta = _STARPU_DEFAULT_BETA;
+	hd->_gamma = _STARPU_DEFAULT_GAMMA;
+	hd->idle_power = 0.0;
+	
+	starpu_set_sched_ctx_policy_data(sched_ctx_id, (void*)hd);
+
+	hd->queue_array = (struct _starpu_fifo_taskq**)malloc(STARPU_NMAXWORKERS*sizeof(struct _starpu_fifo_taskq*));
+
+	const char *strval_alpha = getenv("STARPU_SCHED_ALPHA");
+	if (strval_alpha)
+		hd->alpha = atof(strval_alpha);
+
+	const char *strval_beta = getenv("STARPU_SCHED_BETA");
+	if (strval_beta)
+		hd->beta = atof(strval_beta);
+
+	const char *strval_gamma = getenv("STARPU_SCHED_GAMMA");
+	if (strval_gamma)
+		hd->_gamma = atof(strval_gamma);
+
+	const char *strval_idle_power = getenv("STARPU_IDLE_POWER");
+	if (strval_idle_power)
+		hd->idle_power = atof(strval_idle_power);
+
+	starpu_top_register_parameter_float("HEFT_ALPHA", &hd->alpha, alpha_minimum,alpha_maximum,param_modified);
+	starpu_top_register_parameter_float("HEFT_BETA", &hd->beta, beta_minimum,beta_maximum,param_modified);
+	starpu_top_register_parameter_float("HEFT_GAMMA", &hd->_gamma, gamma_minimum,gamma_maximum,param_modified);
+	starpu_top_register_parameter_float("HEFT_IDLE_POWER", &hd->idle_power, idle_power_minimum,idle_power_maximum,param_modified);
+}
+
+
+/* heft_pre_exec_hook is called right after the data transfer is done and right before
+ * the computation to begin, it is useful to update more precisely the value
+ * of the expected start, end, length, etc... */
+static void heft_pre_exec_hook(struct starpu_task *task)
+{
+	unsigned sched_ctx_id = task->sched_ctx;
+	int workerid = starpu_worker_get_id();
+	heft_data *hd = (heft_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
+	struct _starpu_fifo_taskq *fifo = hd->queue_array[workerid];
+	double model = task->predicted;
+	double transfer_model = task->predicted_transfer;
+
+	pthread_mutex_t *sched_mutex;
+	pthread_cond_t *sched_cond;
+	starpu_worker_get_sched_condition(sched_ctx_id, workerid, &sched_mutex, &sched_cond);
+	/* Once the task is executing, we can update the predicted amount
+	 * of work. */
+	_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
+	fifo->exp_len-= transfer_model;
+	fifo->exp_start = starpu_timing_now() + model;
+	fifo->exp_end= fifo->exp_start + fifo->exp_len;
+	_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
+}
+
+static void heft_push_task_notify(struct starpu_task *task, int workerid)
+{
+	unsigned sched_ctx_id = task->sched_ctx;
+	heft_data *hd = (heft_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
+	struct _starpu_fifo_taskq *fifo = hd->queue_array[workerid];
+	/* Compute the expected penality */
+	enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(workerid);
+	unsigned memory_node = starpu_worker_get_memory_node(workerid);
+
+	double predicted = starpu_task_expected_length(task, perf_arch,
+			_starpu_get_job_associated_to_task(task)->nimpl);
+
+	double predicted_transfer = starpu_task_expected_data_transfer_time(memory_node, task);
+	pthread_mutex_t *sched_mutex;
+	pthread_cond_t *sched_cond;
+	starpu_worker_get_sched_condition(sched_ctx_id, workerid, &sched_mutex, &sched_cond);
+
+
+	/* Update the predictions */
+	_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
+	/* Sometimes workers didn't take the tasks as early as we expected */
+	fifo->exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
+	fifo->exp_end = fifo->exp_start + fifo->exp_len;
+
+	/* If there is no prediction available, we consider the task has a null length */
+	if (!isnan(predicted))
+	{
+		task->predicted = predicted;
+		fifo->exp_end += predicted;
+		fifo->exp_len += predicted;
+	}
+
+	/* If there is no prediction available, we consider the task has a null length */
+	if (!isnan(predicted_transfer))
+	{
+		if (starpu_timing_now() + predicted_transfer < fifo->exp_end)
+		{
+			/* We may hope that the transfer will be finished by
+			 * the start of the task. */
+			predicted_transfer = 0;
+		}
+		else
+		{
+			/* The transfer will not be finished by then, take the
+			 * remainder into account */
+			predicted_transfer = (starpu_timing_now() + predicted_transfer) - fifo->exp_end;
+		}
+		task->predicted_transfer = predicted_transfer;
+		fifo->exp_end += predicted_transfer;
+		fifo->exp_len += predicted_transfer;
+	}
+
+	fifo->ntasks++;
+
+	_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
+}
+
+static int push_task_on_best_worker(struct starpu_task *task, int best_workerid, double predicted, double predicted_transfer, unsigned sched_ctx_id)
+ {
+	/* make sure someone coule execute that task ! */
+	STARPU_ASSERT(best_workerid != -1);
+
+	heft_data *hd = (heft_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
+	struct _starpu_fifo_taskq *fifo = hd->queue_array[best_workerid];
+
+	pthread_mutex_t *sched_mutex;
+	pthread_cond_t *sched_cond;
+	starpu_worker_get_sched_condition(sched_ctx_id, best_workerid, &sched_mutex, &sched_cond);
+
+#ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
+	starpu_call_pushed_task_cb(best_workerid, sched_ctx_id);
+#endif //STARPU_USE_SCHED_CTX_HYPERVISOR
+
+	_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
+
+	/* Sometimes workers didn't take the tasks as early as we expected */
+	fifo->exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
+	fifo->exp_end = fifo->exp_start + fifo->exp_len;
+
+	fifo->exp_end += predicted;
+	fifo->exp_len += predicted;
+
+	if (starpu_timing_now() + predicted_transfer < fifo->exp_end)
+	{
+		/* We may hope that the transfer will be finished by
+		 * the start of the task. */
+		predicted_transfer = 0;
+	}
+	else
+	{
+		/* The transfer will not be finished by then, take the
+		 * remainder into account */
+		predicted_transfer = (starpu_timing_now() + predicted_transfer) - fifo->exp_end;
+	}
+	fifo->exp_end += predicted_transfer;
+	fifo->exp_len += predicted_transfer;
+
+	_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
+
+	task->predicted = predicted;
+	task->predicted_transfer = predicted_transfer;
+
+	if (_starpu_top_status_get())
+		_starpu_top_task_prevision(task, best_workerid,
+					(unsigned long long)(fifo->exp_end-predicted)/1000,
+					(unsigned long long)fifo->exp_end/1000);
+
+	if (starpu_get_prefetch_flag())
+	{
+		unsigned memory_node = starpu_worker_get_memory_node(best_workerid);
+		starpu_prefetch_task_input_on_node(task, memory_node);
+	}
+
+
+	double max_time_on_ctx = starpu_get_max_time_worker_on_ctx();
+	if(max_time_on_ctx != -1.0 && starpu_are_overlapping_ctxs_on_worker(best_workerid) && starpu_is_ctxs_turn(best_workerid, sched_ctx_id))
+	{
+		current_time[best_workerid][sched_ctx_id] += predicted;
+		
+		if(current_time[best_workerid][sched_ctx_id] >= max_time_on_ctx)
+		{
+			current_time[best_workerid][sched_ctx_id] = 0.0;
+			starpu_set_turn_to_other_ctx(best_workerid, sched_ctx_id);
+		}
+	}
+
+#ifdef HAVE_AYUDAME_H
+	if (AYU_event) {
+		int id = best_workerid;
+		AYU_event(AYU_ADDTASKTOQUEUE, _starpu_get_job_associated_to_task(task)->job_id, &id);
+	}
+#endif
+	return _starpu_fifo_push_task(hd->queue_array[best_workerid],
+				      sched_mutex, sched_cond, task);
+}
+
+/* TODO: Correct the bugs in the scheduling !!! */
+/* TODO: factorize with dmda!! */
+static void compute_all_performance_predictions(struct starpu_task *task,
+						double local_task_length[STARPU_NMAXWORKERS][STARPU_MAXIMPLEMENTATIONS], 
+						double exp_end[STARPU_NMAXWORKERS][STARPU_MAXIMPLEMENTATIONS],
+						double *max_exp_endp, double *best_exp_endp,
+						double local_data_penalty[STARPU_NMAXWORKERS][STARPU_MAXIMPLEMENTATIONS],
+						double local_power[STARPU_NMAXWORKERS][STARPU_MAXIMPLEMENTATIONS], 
+						int *forced_worker, int *forced_impl, unsigned sched_ctx_id)
+{
+	int calibrating = 0;
+	double max_exp_end = DBL_MIN;
+	double best_exp_end = DBL_MAX;
+	int ntasks_best = -1;
+	int nimpl_best = 0;
+	double ntasks_best_end = 0.0;
+
+	/* A priori, we know all estimations */
+	int unknown = 0;
+	int worker, worker_ctx = 0;
+	unsigned nimpl;
+
+	heft_data *hd = (heft_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
+
+	starpu_task_bundle_t bundle = task->bundle;
+	struct worker_collection *workers = starpu_get_worker_collection_of_sched_ctx(sched_ctx_id);
+
+	while(workers->has_next(workers))
+	{
+		worker = workers->get_next(workers);
+		if(starpu_is_ctxs_turn(worker, sched_ctx_id) || sched_ctx_id == 0)
+		{
+			for (nimpl = 0; nimpl <STARPU_MAXIMPLEMENTATIONS; nimpl++) 
+			{
+				if (!starpu_worker_can_execute_task(worker, task, nimpl))
+				{
+					/* no one on that queue may execute this task */
+//				worker_ctx++;
+					continue;
+				}
+		
+				/* Sometimes workers didn't take the tasks as early as we expected */
+				struct _starpu_fifo_taskq *fifo = hd->queue_array[worker];
+				pthread_mutex_t *sched_mutex;
+				pthread_cond_t *sched_cond;
+				starpu_worker_get_sched_condition(sched_ctx_id, worker, &sched_mutex, &sched_cond);
+				_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
+				fifo->exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
+				exp_end[worker_ctx][nimpl] = fifo->exp_start + fifo->exp_len;
+				if (exp_end[worker_ctx][nimpl] > max_exp_end)
+					max_exp_end = exp_end[worker_ctx][nimpl];
+				_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
+				
+				enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
+				unsigned memory_node = starpu_worker_get_memory_node(worker);
+				
+				if (bundle)
+				{
+					/* TODO : conversion time */
+					local_task_length[worker_ctx][nimpl] = starpu_task_bundle_expected_length(bundle, perf_arch, nimpl);
+					local_data_penalty[worker_ctx][nimpl] = starpu_task_bundle_expected_data_transfer_time(bundle, memory_node);
+					local_power[worker_ctx][nimpl] = starpu_task_bundle_expected_power(bundle, perf_arch, nimpl);
+					//_STARPU_DEBUG("Scheduler heft bundle: task length (%lf) local power (%lf) worker (%u) kernel (%u) \n", local_task_length[worker_ctx],local_power[worker_ctx],worker,nimpl);
+				}
+				else 
+				{
+					local_task_length[worker_ctx][nimpl] = starpu_task_expected_length(task, perf_arch, nimpl);
+					local_data_penalty[worker_ctx][nimpl] = starpu_task_expected_data_transfer_time(memory_node, task);
+					local_power[worker_ctx][nimpl] = starpu_task_expected_power(task, perf_arch, nimpl);
+					double conversion_time = starpu_task_expected_conversion_time(task, perf_arch, nimpl);
+					if (conversion_time > 0.0)
+						local_task_length[worker_ctx][nimpl] += conversion_time;
+
+					//_STARPU_DEBUG("Scheduler heft bundle: task length (%lf) local power (%lf) worker (%u) kernel (%u) \n", local_task_length[worker_ctx],local_power[worker_ctx],worker,nimpl);
+				}
+				double ntasks_end = fifo->ntasks / starpu_worker_get_relative_speedup(perf_arch);
+
+/* 				printf("**********%d/%d: len = %lf penalty = %lf \n", worker, worker_ctx,  */
+/* 				       local_task_length[worker_ctx][nimpl], local_data_penalty[worker_ctx][nimpl]); */
+				
+				if (ntasks_best == -1
+				    || (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better worker */
+				    || (!calibrating && isnan(local_task_length[worker_ctx][nimpl])) /* Not calibrating but this worker is being calibrated */
+				    || (calibrating && isnan(local_task_length[worker_ctx][nimpl]) && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
+					)
+				{
+					ntasks_best_end = ntasks_end;
+					ntasks_best = worker;
+					nimpl_best = nimpl;
+				}
+				
+				if (isnan(local_task_length[worker_ctx][nimpl]))
+					/* we are calibrating, we want to speed-up calibration time
+					 * so we privilege non-calibrated tasks (but still
+					 * greedily distribute them to avoid dumb schedules) */
+					calibrating = 1;
+				
+				if (isnan(local_task_length[worker_ctx][nimpl])
+				    || _STARPU_IS_ZERO(local_task_length[worker_ctx][nimpl]))
+					/* there is no prediction available for that task
+					 * with that arch (yet or at all), so switch to a greedy strategy */
+					unknown = 1;
+				
+				if (unknown)
+					continue;
+				
+				exp_end[worker_ctx][nimpl] = fifo->exp_start + fifo->exp_len + local_task_length[worker_ctx][nimpl];
+			
+				if (exp_end[worker_ctx][nimpl] < best_exp_end)
+				{
+					/* a better solution was found */
+					best_exp_end = exp_end[worker_ctx][nimpl];
+					nimpl_best = nimpl;
+				}
+				
+				if (isnan(local_power[worker_ctx][nimpl]))
+					local_power[worker_ctx][nimpl] = 0.;
+				
+			}
+		}
+		worker_ctx++;
+	}
+
+	*forced_worker = unknown?ntasks_best:-1;
+	*forced_impl = unknown?nimpl_best:-1;
+
+	*best_exp_endp = best_exp_end;
+	*max_exp_endp = max_exp_end;
+}
+
+/* TODO: factorize with dmda */
+/* Schedule one task within a scheduling context: evaluate the expected end
+ * of the task for every (worker, implementation) pair of the context, then
+ * push it to the pair minimizing the HEFT fitness (a weighted trade-off
+ * between termination time, data-transfer penalty and power consumption).
+ * Returns the result of push_task_on_best_worker(), or -1 when no worker of
+ * the context can currently execute the task. */
+static int _heft_push_task(struct starpu_task *task, unsigned prio, unsigned sched_ctx_id)
+{
+	heft_data *hd = (heft_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
+	int worker, worker_ctx = 0;
+	unsigned nimpl;
+	int best = -1, best_in_ctx = -1;
+	int selected_impl= -1;
+
+	/* this flag is set if the corresponding worker is selected because
+	   there is no performance prediction available yet */
+	int forced_worker;
+	int forced_impl;
+	struct worker_collection *workers = starpu_get_worker_collection_of_sched_ctx(sched_ctx_id);
+
+	unsigned nworkers_ctx = workers->nworkers;
+	/* Indexed by position inside the context (worker_ctx), not by worker id. */
+	double local_task_length[STARPU_NMAXWORKERS][STARPU_MAXIMPLEMENTATIONS];
+	double local_data_penalty[STARPU_NMAXWORKERS][STARPU_MAXIMPLEMENTATIONS];
+	double local_power[STARPU_NMAXWORKERS][STARPU_MAXIMPLEMENTATIONS];
+	double exp_end[STARPU_NMAXWORKERS][STARPU_MAXIMPLEMENTATIONS];
+	double max_exp_end = 0.0;
+
+	double best_exp_end;
+
+	/*
+	 *	Compute the expected end of the task on the various workers,
+	 *	and detect if there is some calibration that needs to be done.
+	 */
+
+	starpu_task_bundle_t bundle = task->bundle;
+
+	if(workers->init_cursor)
+		workers->init_cursor(workers);
+
+	compute_all_performance_predictions(task, local_task_length, exp_end,
+					&max_exp_end, &best_exp_end,
+					local_data_penalty,
+					local_power, &forced_worker, &forced_impl,
+					sched_ctx_id);
+
+	/* If there is no prediction available for that task with that arch we
+	 * want to speed-up calibration time so we force this measurement */
+	if (forced_worker != -1)
+	{
+		_starpu_get_job_associated_to_task(task)->nimpl = forced_impl;
+
+		if (task->bundle)
+			starpu_task_bundle_remove(task->bundle, task);
+
+		/* BUG FIX: release the cursor before the early return, it was
+		 * leaked on this path. */
+		if (workers->deinit_cursor)
+			workers->deinit_cursor(workers);
+
+		return push_task_on_best_worker(task, forced_worker, 0.0, 0.0, sched_ctx_id);
+	}
+
+	/*
+	 *	Determine which worker optimizes the fitness metric which is a
+	 *	trade-off between load-balancing, data locality, and energy
+	 *	consumption.
+	 */
+	
+	double fitness[nworkers_ctx][STARPU_MAXIMPLEMENTATIONS];
+	double best_fitness = -1;
+
+	while(workers->has_next(workers))
+	{
+		worker = workers->get_next(workers);
+		if(starpu_is_ctxs_turn(worker, sched_ctx_id) || sched_ctx_id == 0)
+		{
+			for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
+			{
+				if (!starpu_worker_can_execute_task(worker, task, nimpl))
+				{
+					/* no one on that queue may execute this task */
+					continue;
+				}
+
+				/* alpha: termination time, beta: data transfer,
+				 * gamma: power consumption. */
+				fitness[worker_ctx][nimpl] = hd->alpha*(exp_end[worker_ctx][nimpl] - best_exp_end) 
+					+ hd->beta*(local_data_penalty[worker_ctx][nimpl])
+					+ hd->_gamma*(local_power[worker_ctx][nimpl]);
+				
+				if (exp_end[worker_ctx][nimpl] > max_exp_end)
+					/* This placement will make the computation
+					 * longer, take into account the idle
+					 * consumption of other cpus */
+					fitness[worker_ctx][nimpl] += hd->_gamma * hd->idle_power * (exp_end[worker_ctx][nimpl] - max_exp_end) / 1000000.0;
+			
+				if (best == -1 || fitness[worker_ctx][nimpl] < best_fitness)
+				{
+					/* we found a better solution */
+					best_fitness = fitness[worker_ctx][nimpl];
+					best = worker;
+					best_in_ctx = worker_ctx;
+					selected_impl = nimpl;
+				}
+			}
+		}
+		worker_ctx++;
+	}
+
+	if(best == -1)
+	{
+		/* BUG FIX: no candidate could take the task; release the
+		 * cursor before bailing out, it was leaked on this path. */
+		if (workers->deinit_cursor)
+			workers->deinit_cursor(workers);
+		return -1;
+	}
+
+	/* By now, we must have found a solution */
+	STARPU_ASSERT(best != -1);
+
+	/* we should now have the best worker in variable "best" */
+	double model_best, transfer_model_best;
+
+	if (bundle)
+	{
+		/* If we have a task bundle, we have computed the expected
+		 * length for the entire bundle, but not for the task alone. */
+		enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(best);
+		unsigned memory_node = starpu_worker_get_memory_node(best);
+		model_best = starpu_task_expected_length(task, perf_arch, selected_impl);
+		transfer_model_best = starpu_task_expected_data_transfer_time(memory_node, task);
+
+		/* Remove the task from the bundle since we have made a
+		 * decision for it, and that other tasks should not consider it
+		 * anymore. */
+		starpu_task_bundle_remove(bundle, task);
+	}
+	else 
+	{
+		model_best = local_task_length[best_in_ctx][selected_impl];
+		transfer_model_best = local_data_penalty[best_in_ctx][selected_impl];
+	}
+
+	/* BUG FIX: test deinit_cursor before calling it (the original code
+	 * tested init_cursor, dereferencing a possibly-NULL deinit_cursor). */
+	if(workers->deinit_cursor)
+		workers->deinit_cursor(workers);
+
+	_starpu_get_job_associated_to_task(task)->nimpl = selected_impl;
+
+	return push_task_on_best_worker(task, best, model_best, transfer_model_best, sched_ctx_id);
+}
+
+/* Entry point of the policy's push hook.  Takes the context's resizing
+ * mutex so workers cannot be added/removed while we schedule, bails out
+ * with -1 when the context currently owns no worker, and otherwise defers
+ * to _heft_push_task() with the priority flag derived from the task. */
+static int heft_push_task(struct starpu_task *task)
+{
+	unsigned sched_ctx_id = task->sched_ctx;
+	pthread_mutex_t *changing_ctx_mutex = starpu_get_changing_ctx_mutex(sched_ctx_id);
+	unsigned prio = (task->priority > 0) ? 1 : 0;
+	int ret_val = -1;
+
+	/* Serialize against concurrent context resizing. */
+	_STARPU_PTHREAD_MUTEX_LOCK(changing_ctx_mutex);
+	if (starpu_get_nworkers_of_sched_ctx(sched_ctx_id) != 0)
+		ret_val = _heft_push_task(task, prio, sched_ctx_id);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(changing_ctx_mutex);
+
+	return ret_val;
+}
+
+/* Pop hook: called by a worker thread to fetch its next task from its
+ * per-worker FIFO, updating the queue's expected start/length/end so
+ * pushers keep an accurate picture of this worker's remaining load. */
+static struct starpu_task *heft_pop_task(unsigned sched_ctx_id)
+{
+	struct starpu_task *task;
+
+	int workerid = starpu_worker_get_id();
+	heft_data *hd = (heft_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
+	struct _starpu_fifo_taskq *fifo = hd->queue_array[workerid];
+
+	task = _starpu_fifo_pop_local_task(fifo);
+	if (task)
+	{
+		double model = task->predicted; /* execution time predicted at push time */
+
+		fifo->exp_len -= model;
+		fifo->exp_start = starpu_timing_now() + model; /* NOTE(review): sets start past this task's completion; parallel_heft's hook uses starpu_timing_now() alone — confirm which convention is intended */
+		fifo->exp_end = fifo->exp_start + fifo->exp_len;
+	}
+	return task;
+}
+
+/* Release the per-context resources set up by the policy's init hook:
+ * the heft_data structure and the worker collection of the context. */
+static void heft_deinit(unsigned sched_ctx_id)
+{
+	heft_data *data = (heft_data *)starpu_get_sched_ctx_policy_data(sched_ctx_id);
+
+	/* The policy data was heap-allocated at init time. */
+	free(data);
+
+	/* Drop the worker collection attached to this context. */
+	starpu_delete_worker_collection_for_sched_ctx(sched_ctx_id);
+}
+
+/* Policy descriptor registered with the StarPU core; wires the HEFT
+ * hooks defined above into the generic scheduling interface. */
+struct starpu_sched_policy _starpu_sched_heft_policy =
+{
+	.init_sched = heft_init,
+	.deinit_sched = heft_deinit,
+	.push_task = heft_push_task,
+	.push_task_notify = heft_push_task_notify,
+	.pop_task = heft_pop_task,
+	.pop_every_task = NULL,
+	.pre_exec_hook = heft_pre_exec_hook,
+	.post_exec_hook = NULL,
+	.add_workers = heft_add_workers, /* whitespace fix: stray tab before the comma removed */
+	.remove_workers = heft_remove_workers,
+	.policy_name = "heft",
+	.policy_description = "Heterogeneous Earliest Finish Task"
+};

+ 117 - 60
src/sched_policies/parallel_greedy.c

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2011  Université de Bordeaux 1
  * Copyright (C) 2011  Télécom-SudParis
+ * Copyright (C) 2011  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -20,52 +21,55 @@
 #include <common/barrier.h>
 #include <sched_policies/detect_combined_workers.h>
 
-/* the former is the actual queue, the latter some container */
-static struct _starpu_fifo_taskq *fifo;
-static struct _starpu_fifo_taskq *local_fifo[STARPU_NMAXWORKERS];
+typedef struct pgreedy_data {
+	struct _starpu_fifo_taskq *fifo;
+	struct _starpu_fifo_taskq *local_fifo[STARPU_NMAXWORKERS];
 
-static int master_id[STARPU_NMAXWORKERS];
+	int master_id[STARPU_NMAXWORKERS];
 
-static _starpu_pthread_cond_t sched_cond;
-static _starpu_pthread_mutex_t sched_mutex;
+	_starpu_pthread_cond_t sched_cond;
+	_starpu_pthread_mutex_t sched_mutex;
 
-static _starpu_pthread_cond_t master_sched_cond[STARPU_NMAXWORKERS];
-static _starpu_pthread_mutex_t master_sched_mutex[STARPU_NMAXWORKERS];
+	_starpu_pthread_cond_t master_sched_cond[STARPU_NMAXWORKERS];
+	_starpu_pthread_mutex_t master_sched_mutex[STARPU_NMAXWORKERS];
+} pgreedy_data;
 
 /* XXX instead of 10, we should use some "MAX combination .."*/
 static int possible_combinations_cnt[STARPU_NMAXWORKERS];
 static int possible_combinations[STARPU_NMAXWORKERS][10];
 static int possible_combinations_size[STARPU_NMAXWORKERS][10];
 
-static void initialize_pgreedy_policy(struct starpu_machine_topology *topology,
-		   __attribute__ ((unused)) struct starpu_sched_policy *_policy)
-{
-	/* masters pick tasks from that queue */
-	fifo = _starpu_create_fifo();
 
-	_starpu_sched_find_worker_combinations(topology);
+/*!!!!!!! It doesn't work with several contexts because the combined workers are constructed
+  from the workers available to the program, and not to the context !!!!!!!!!!!!!!!!!!!!!!!
+ */
+
+static void pgreedy_add_workers(unsigned sched_ctx_id, int *workerids, unsigned nworkers)
+{
+	struct pgreedy_data *data = (struct pgreedy_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
 
-	unsigned workerid;
-	unsigned ncombinedworkers, nworkers;
+	_starpu_sched_find_worker_combinations(workerids, nworkers);
 
-	nworkers = topology->nworkers;
+	unsigned workerid, i;
+	unsigned ncombinedworkers;
+	
 	ncombinedworkers = starpu_combined_worker_get_count();
 
 	/* Find the master of each worker. We first assign the worker as its
 	 * own master, and then iterate over the different worker combinations
 	 * to find the biggest combination containing this worker. */
-
-	for (workerid = 0; workerid < nworkers; workerid++)
+	for(i = 0; i < nworkers; i++)
 	{
+		workerid = workerids[i];
+		
 		int cnt = possible_combinations_cnt[workerid]++;
 		possible_combinations[workerid][cnt] = workerid;
 		possible_combinations_size[workerid][cnt] = 1;
-
-		master_id[workerid] = workerid;
+		
+		data->master_id[workerid] = workerid;
 	}
-
-	unsigned i;
-
+	
+	
 	for (i = 0; i < ncombinedworkers; i++)
 	{
 		workerid = nworkers + i;
@@ -80,8 +84,8 @@ static void initialize_pgreedy_policy(struct starpu_machine_topology *topology,
 		int j;
 		for (j = 0; j < size; j++)
 		{
-			if (master_id[workers[j]] > master)
-				master_id[workers[j]] = master;
+			if (data->master_id[workers[j]] > master)
+				data->master_id[workers[j]] = master;
 
 			int cnt = possible_combinations_cnt[workers[j]]++;
 			possible_combinations[workers[j]][cnt] = workerid;
@@ -89,74 +93,125 @@ static void initialize_pgreedy_policy(struct starpu_machine_topology *topology,
 		}
 	}
 
-	_STARPU_PTHREAD_MUTEX_INIT(&sched_mutex, NULL);
-	_STARPU_PTHREAD_COND_INIT(&sched_cond, NULL);
-
-	for (workerid = 0; workerid < nworkers; workerid++)
-	{
-		_STARPU_PTHREAD_MUTEX_INIT(&master_sched_mutex[workerid], NULL);
-		_STARPU_PTHREAD_COND_INIT(&master_sched_cond[workerid], NULL);
+	for(i = 0; i < nworkers; i++)
+        {
+		workerid = workerids[i];
+		_STARPU_PTHREAD_MUTEX_INIT(&data->master_sched_mutex[workerid], NULL);
+		_STARPU_PTHREAD_COND_INIT(&data->master_sched_cond[workerid], NULL);
 	}
 
-	for (workerid = 0; workerid < nworkers; workerid++)
-	{
+	for(i = 0; i < nworkers; i++)
+        {
+		workerid = workerids[i];
+
 		/* slaves pick up tasks from their local queue, their master
 		 * will put tasks directly in that local list when a parallel
 		 * tasks comes. */
-		local_fifo[workerid] = _starpu_create_fifo();
+		data->local_fifo[workerid] = _starpu_create_fifo();
 
-		unsigned master = master_id[workerid];
+		unsigned master = data->master_id[workerid];
 
 		/* All masters use the same condition/mutex */
 		if (master == workerid)
-		{
-			starpu_worker_set_sched_condition(workerid,
-				&sched_cond, &sched_mutex);
-		}
+			starpu_worker_set_sched_condition(sched_ctx_id, workerid, &data->sched_mutex, &data->sched_cond);
 		else
-		{
-			starpu_worker_set_sched_condition(workerid,
-				&master_sched_cond[master],
-				&master_sched_mutex[master]);
-		}
+			starpu_worker_set_sched_condition(sched_ctx_id, workerid, &data->master_sched_mutex[master], &data->master_sched_cond[master]);
 	}
 
 #if 0
-	for (workerid = 0; workerid < nworkers; workerid++)
-	{
+	for(i = 0; i < nworkers; i++)
+        {
+		workerid = workerids[i];
+
 		fprintf(stderr, "MASTER of %d = %d\n", workerid, master_id[workerid]);
 	}
 #endif
 }
 
-static void deinitialize_pgreedy_policy(__attribute__ ((unused)) struct starpu_machine_topology *topology,
-		   __attribute__ ((unused)) struct starpu_sched_policy *_policy)
+static void pgreedy_remove_workers(unsigned sched_ctx_id, int *workerids, unsigned nworkers) /* undo pgreedy_add_workers() for the listed workers */
+{
+	struct pgreedy_data *data = (struct pgreedy_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
+	int workerid;
+	unsigned i;
+	for(i = 0; i < nworkers; i++)
+        {
+		workerid = workerids[i];
+		_starpu_destroy_fifo(data->local_fifo[workerid]); /* slave queue created in pgreedy_add_workers() */
+		starpu_worker_set_sched_condition(sched_ctx_id, workerid, NULL, NULL); /* detach the worker from the policy's mutex/cond */
+		_STARPU_PTHREAD_MUTEX_DESTROY(&data->master_sched_mutex[workerid]); /* NOTE(review): also destroyed for slaves, which were bound to the master's pair — confirm these were initialized for every worker */
+		_STARPU_PTHREAD_COND_DESTROY(&data->master_sched_cond[workerid]);
+	}
+}
+
+static void initialize_pgreedy_policy(unsigned sched_ctx_id) /* per-context setup: allocate policy data and the shared master queue */
+{
+	starpu_create_worker_collection_for_sched_ctx(sched_ctx_id, WORKER_LIST);
+
+	struct pgreedy_data *data = (struct pgreedy_data*)malloc(sizeof(pgreedy_data)); /* NOTE(review): malloc result unchecked */
+	/* masters pick tasks from that queue */
+	data->fifo = _starpu_create_fifo();
+
+	_STARPU_PTHREAD_MUTEX_INIT(&data->sched_mutex, NULL); /* shared by all masters of the context */
+	_STARPU_PTHREAD_COND_INIT(&data->sched_cond, NULL);
+
+	starpu_set_sched_ctx_policy_data(sched_ctx_id, (void*)data); /* retrieved later via starpu_get_sched_ctx_policy_data() */
+}
+
+static void deinitialize_pgreedy_policy(unsigned sched_ctx_id) 
 {
 	/* TODO check that there is no task left in the queue */
+	struct pgreedy_data *data = (struct pgreedy_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
 
 	/* deallocate the job queue */
-	_starpu_destroy_fifo(fifo);
+	_starpu_destroy_fifo(data->fifo);
+
+	_STARPU_PTHREAD_MUTEX_DESTROY(&data->sched_mutex);
+	_STARPU_PTHREAD_COND_DESTROY(&data->sched_cond);
+
+	starpu_delete_worker_collection_for_sched_ctx(sched_ctx_id);
+
+	free(data);	
 }
 
 static int push_task_pgreedy_policy(struct starpu_task *task)
 {
-	return _starpu_fifo_push_task(fifo, &sched_mutex, &sched_cond, task);
+	unsigned sched_ctx_id = task->sched_ctx;
+	pthread_mutex_t *changing_ctx_mutex = starpu_get_changing_ctx_mutex(sched_ctx_id);
+	unsigned nworkers;
+	int ret_val = -1;
+
+	/* if the context has no workers return */
+	_STARPU_PTHREAD_MUTEX_LOCK(changing_ctx_mutex);
+	nworkers = starpu_get_nworkers_of_sched_ctx(sched_ctx_id);
+
+   	if(nworkers == 0)
+	{
+   		_STARPU_PTHREAD_MUTEX_UNLOCK(changing_ctx_mutex);
+		return ret_val;
+	}
+	struct pgreedy_data *data = (struct pgreedy_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
+	ret_val = _starpu_fifo_push_task(data->fifo, &data->sched_mutex, &data->sched_cond, task);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(changing_ctx_mutex);
+	
+	return ret_val;
 }
 
-static struct starpu_task *pop_task_pgreedy_policy(void)
+static struct starpu_task *pop_task_pgreedy_policy(unsigned sched_ctx_id)
 {
+	struct pgreedy_data *data = (struct pgreedy_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
+
 	int workerid = starpu_worker_get_id();
 
 	/* If this is not a CPU, then the worker simply grabs tasks from the fifo */
 	if (starpu_worker_get_type(workerid) != STARPU_CPU_WORKER)
-		return  _starpu_fifo_pop_task(fifo, workerid);
+		return  _starpu_fifo_pop_task(data->fifo, workerid);
 
-	int master = master_id[workerid];
+	int master = data->master_id[workerid];
 
 	if (master == workerid)
 	{
 		/* The worker is a master */
-		struct starpu_task *task = _starpu_fifo_pop_task(fifo, workerid);
+		struct starpu_task *task = _starpu_fifo_pop_task(data->fifo, workerid);
 
 		if (!task)
 			return NULL;
@@ -218,9 +273,9 @@ static struct starpu_task *pop_task_pgreedy_policy(void)
 				struct starpu_task *alias = _starpu_create_task_alias(task);
 				int local_worker = combined_workerid[i];
 
-				_starpu_fifo_push_task(local_fifo[local_worker],
-					&master_sched_mutex[master],
-					&master_sched_cond[master], alias);
+				_starpu_fifo_push_task(data->local_fifo[local_worker],
+						       &data->master_sched_mutex[master], 
+						       &data->master_sched_cond[master], alias);
 			}
 
 			/* The master also manipulated an alias */
@@ -231,7 +286,7 @@ static struct starpu_task *pop_task_pgreedy_policy(void)
 	else
 	{
 		/* The worker is a slave */
-		return _starpu_fifo_pop_task(local_fifo[workerid], workerid);
+		return _starpu_fifo_pop_task(data->local_fifo[workerid], workerid);
 	}
 }
 
@@ -239,6 +294,8 @@ struct starpu_sched_policy _starpu_sched_pgreedy_policy =
 {
 	.init_sched = initialize_pgreedy_policy,
 	.deinit_sched = deinitialize_pgreedy_policy,
+	.add_workers = pgreedy_add_workers,
+	.remove_workers = pgreedy_remove_workers,
 	.push_task = push_task_pgreedy_policy,
 	.pop_task = pop_task_pgreedy_policy,
 	.pre_exec_hook = NULL,

+ 241 - 136
src/sched_policies/parallel_heft.c

@@ -34,27 +34,30 @@
 #define DBL_MAX __DBL_MAX__
 #endif
 
-static unsigned nworkers, ncombinedworkers;
+//static unsigned ncombinedworkers;
 //static enum starpu_perf_archtype applicable_perf_archtypes[STARPU_NARCH_VARIATIONS];
 //static unsigned napplicable_perf_archtypes = 0;
 
-static _starpu_pthread_cond_t sched_cond[STARPU_NMAXWORKERS];
-static _starpu_pthread_mutex_t sched_mutex[STARPU_NMAXWORKERS];
-
+typedef struct {
+	double alpha;
+	double beta;
+	double _gamma;
+	double idle_power;
 /* When we push a task on a combined worker we need all the cpu workers it contains
  * to be locked at once */
-static _starpu_pthread_mutex_t global_push_mutex;
-
-static double alpha = _STARPU_DEFAULT_ALPHA;
-static double beta = _STARPU_DEFAULT_BETA;
-static double _gamma = _STARPU_DEFAULT_GAMMA;
-static double idle_power = 0.0;
+	pthread_mutex_t global_push_mutex;
+} pheft_data;
 
 static double worker_exp_start[STARPU_NMAXWORKERS];
 static double worker_exp_end[STARPU_NMAXWORKERS];
 static double worker_exp_len[STARPU_NMAXWORKERS];
 static int ntasks[STARPU_NMAXWORKERS];
 
+
+/*!!!!!!! It doesn't work with several contexts because the combined workers are constructed         
+  from the workers available to the program, and not to the context !!!!!!!!!!!!!!!!!!!!!!!          
+*/
+
 static void parallel_heft_pre_exec_hook(struct starpu_task *task)
 {
 	if (!task->cl || task->execute_on_a_specific_worker)
@@ -62,30 +65,34 @@ static void parallel_heft_pre_exec_hook(struct starpu_task *task)
 
 	int workerid = starpu_worker_get_id();
 	double model = task->predicted;
+	unsigned sched_ctx_id = task->sched_ctx;
 	double transfer_model = task->predicted_transfer;
 
 	if (isnan(model))
 		model = 0.0;
 
+	pthread_mutex_t *sched_mutex;
+	pthread_cond_t *sched_cond;
+	starpu_worker_get_sched_condition(sched_ctx_id, workerid, &sched_mutex, &sched_cond);
+
 	/* Once we have executed the task, we can update the predicted amount
 	 * of work. */
-	_STARPU_PTHREAD_MUTEX_LOCK(&sched_mutex[workerid]);
+	_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
 	worker_exp_len[workerid] -= model + transfer_model;
-	worker_exp_start[workerid] = starpu_timing_now() + model;
+	worker_exp_start[workerid] = starpu_timing_now();
 	worker_exp_end[workerid] = worker_exp_start[workerid] + worker_exp_len[workerid];
 	ntasks[workerid]--;
-	_STARPU_PTHREAD_MUTEX_UNLOCK(&sched_mutex[workerid]);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
 }
 
-static int push_task_on_best_worker(struct starpu_task *task, int best_workerid, double exp_end_predicted, int prio)
+static int push_task_on_best_worker(struct starpu_task *task, int best_workerid, double exp_end_predicted, int prio, unsigned sched_ctx_id)
 {
 	/* make sure someone coule execute that task ! */
 	STARPU_ASSERT(best_workerid != -1);
+	
+	pheft_data *hd = (pheft_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
 
 	/* Is this a basic worker or a combined worker ? */
-	int nbasic_workers = (int)starpu_worker_get_count();
-	int is_basic_worker = (best_workerid < nbasic_workers);
-
 	unsigned memory_node;
 	memory_node = starpu_worker_get_memory_node(best_workerid);
 
@@ -94,28 +101,30 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 
 	int ret = 0;
 
-	if (is_basic_worker)
+	if (!starpu_worker_is_combined_worker(best_workerid))
 	{
 		task->predicted = exp_end_predicted - worker_exp_end[best_workerid];
 		/* TODO */
 		task->predicted_transfer = 0;
+		pthread_mutex_t *sched_mutex;
+		pthread_cond_t *sched_cond;
+		starpu_worker_get_sched_condition(sched_ctx_id, best_workerid, &sched_mutex, &sched_cond);
 
-		_STARPU_PTHREAD_MUTEX_LOCK(&sched_mutex[best_workerid]);
-
-		worker_exp_start[best_workerid] = STARPU_MAX(worker_exp_start[best_workerid], starpu_timing_now());
+		_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
 		worker_exp_len[best_workerid] += task->predicted;
 		worker_exp_end[best_workerid] = exp_end_predicted;
-		ntasks[best_workerid]++;
+		worker_exp_start[best_workerid] = exp_end_predicted - worker_exp_len[best_workerid];
 
-		_STARPU_PTHREAD_MUTEX_UNLOCK(&sched_mutex[best_workerid]);
+		ntasks[best_workerid]++;
+		_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
 
 		/* We don't want it to interlace its task with a combined
 		 * worker's one */
-		_STARPU_PTHREAD_MUTEX_LOCK(&global_push_mutex);
+		_STARPU_PTHREAD_MUTEX_LOCK(&hd->global_push_mutex);
 
 		ret = starpu_push_local_task(best_workerid, task, prio);
 
-		_STARPU_PTHREAD_MUTEX_UNLOCK(&global_push_mutex);
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&hd->global_push_mutex);
 	}
 	else
 	{
@@ -140,7 +149,7 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 		_STARPU_PTHREAD_BARRIER_INIT(&j->after_work_barrier, NULL, worker_size);
 
 		/* All cpu workers must be locked at once */
-		_STARPU_PTHREAD_MUTEX_LOCK(&global_push_mutex);
+		_STARPU_PTHREAD_MUTEX_LOCK(&hd->global_push_mutex);
 
 		int i;
 		for (i = 0; i < worker_size; i++)
@@ -151,20 +160,21 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 			alias->predicted = exp_end_predicted - worker_exp_end[local_worker];
 			/* TODO */
 			alias->predicted_transfer = 0;
-
-			_STARPU_PTHREAD_MUTEX_LOCK(&sched_mutex[local_worker]);
-
-			worker_exp_start[local_worker] = STARPU_MAX(worker_exp_start[local_worker], starpu_timing_now());
+			pthread_mutex_t *sched_mutex;
+			pthread_cond_t *sched_cond;
+			starpu_worker_get_sched_condition(sched_ctx_id, local_worker, &sched_mutex, &sched_cond);
+			_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
 			worker_exp_len[local_worker] += alias->predicted;
 			worker_exp_end[local_worker] = exp_end_predicted;
-			ntasks[local_worker]++;
+			worker_exp_start[local_worker] = exp_end_predicted - worker_exp_len[local_worker];
 
-			_STARPU_PTHREAD_MUTEX_UNLOCK(&sched_mutex[local_worker]);
+			ntasks[local_worker]++;
+			_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
 
 			ret |= starpu_push_local_task(local_worker, alias, prio);
 		}
 
-		_STARPU_PTHREAD_MUTEX_UNLOCK(&global_push_mutex);
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&hd->global_push_mutex);
 
 		//TODO : free task
 
@@ -175,7 +185,7 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 
 static double compute_expected_end(int workerid, double length)
 {
-	if (workerid < (int)nworkers)
+	if (!starpu_worker_is_combined_worker(workerid))
 	{
 		/* This is a basic worker */
 		return worker_exp_start[workerid] + worker_exp_len[workerid] + length;
@@ -205,7 +215,7 @@ static double compute_expected_end(int workerid, double length)
 static double compute_ntasks_end(int workerid)
 {
 	enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(workerid);
-	if (workerid < (int)nworkers)
+	if (!starpu_worker_is_combined_worker(workerid))
 	{
 		/* This is a basic worker */
 		return ntasks[workerid] / starpu_worker_get_relative_speedup(perf_arch);
@@ -230,90 +240,110 @@ static double compute_ntasks_end(int workerid)
 	}
 }
 
-static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio)
+static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio, unsigned sched_ctx_id)
 {
-	unsigned worker;
-	int best = -1;
+	pheft_data *hd = (pheft_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
+
+	struct worker_collection *workers = starpu_get_worker_collection_of_sched_ctx(sched_ctx_id);
+	unsigned nworkers_ctx = workers->nworkers;
 
+	unsigned worker, worker_ctx = 0;
+	int best = -1, best_id_ctx = -1;
+	
 	/* this flag is set if the corresponding worker is selected because
 	   there is no performance prediction available yet */
-	int forced_best = -1, forced_nimpl = -1;
+	int forced_best = -1, forced_best_ctx = -1, forced_nimpl = -1;
 
-	double local_task_length[nworkers+ncombinedworkers][STARPU_MAXIMPLEMENTATIONS];
-	double local_data_penalty[nworkers+ncombinedworkers][STARPU_MAXIMPLEMENTATIONS];
-	double local_power[nworkers+ncombinedworkers][STARPU_MAXIMPLEMENTATIONS];
-	double local_exp_end[nworkers+ncombinedworkers][STARPU_MAXIMPLEMENTATIONS];
-	double fitness[nworkers+ncombinedworkers][STARPU_MAXIMPLEMENTATIONS];
+	double local_task_length[nworkers_ctx][STARPU_MAXIMPLEMENTATIONS];
+	double local_data_penalty[nworkers_ctx][STARPU_MAXIMPLEMENTATIONS];
+	double local_power[nworkers_ctx][STARPU_MAXIMPLEMENTATIONS];
+	double local_exp_end[nworkers_ctx][STARPU_MAXIMPLEMENTATIONS];
+	double fitness[nworkers_ctx][STARPU_MAXIMPLEMENTATIONS];
 
 	double max_exp_end = 0.0;
 
-	int skip_worker[nworkers+ncombinedworkers][STARPU_MAXIMPLEMENTATIONS];
+	int skip_worker[nworkers_ctx][STARPU_MAXIMPLEMENTATIONS];
 
 	double best_exp_end = DBL_MAX;
 	//double penality_best = 0.0;
 
-	int ntasks_best = -1, nimpl_best = -1;
+	int ntasks_best = -1, ntasks_best_ctx = -1, nimpl_best = -1;
 	double ntasks_best_end = 0.0;
 	int calibrating = 0;
 
 	/* A priori, we know all estimations */
 	int unknown = 0;
+	if(workers->init_cursor)
+                workers->init_cursor(workers);
 
-	for (worker = 0; worker < nworkers; worker++)
-	{
-		/* Sometimes workers didn't take the tasks as early as we expected */
-		_STARPU_PTHREAD_MUTEX_LOCK(&sched_mutex[worker]);
-		worker_exp_start[worker] = STARPU_MAX(worker_exp_start[worker], starpu_timing_now());
-		worker_exp_end[worker] = worker_exp_start[worker] + worker_exp_len[worker];
-		if (worker_exp_end[worker] > max_exp_end)
-			max_exp_end = worker_exp_end[worker];
-		_STARPU_PTHREAD_MUTEX_UNLOCK(&sched_mutex[worker]);
+	while(workers->has_next(workers))
+        {
+                worker = workers->get_next(workers);
+
+		if(!starpu_worker_is_combined_worker(worker))
+		{
+			pthread_mutex_t *sched_mutex;
+			pthread_cond_t *sched_cond;
+			starpu_worker_get_sched_condition(sched_ctx_id, worker, &sched_mutex, &sched_cond);
+			/* Sometimes workers didn't take the tasks as early as we expected */
+			_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
+			worker_exp_start[worker] = STARPU_MAX(worker_exp_start[worker], starpu_timing_now());
+			worker_exp_end[worker] = worker_exp_start[worker] + worker_exp_len[worker];
+			if (worker_exp_end[worker] > max_exp_end)
+				max_exp_end = worker_exp_end[worker];
+			_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
+		}
 	}
 
 	unsigned nimpl;
-	for (worker = 0; worker < (nworkers+ncombinedworkers); worker++)
+	worker_ctx = 0;
+	while(workers->has_next(workers))
 	{
+                worker = workers->get_next(workers);
+
 		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
 		{
 			if (!starpu_combined_worker_can_execute_task(worker, task, nimpl))
 			{
 				/* no one on that queue may execute this task */
-				skip_worker[worker][nimpl] = 1;
+				skip_worker[worker_ctx][nimpl] = 1;
 				continue;
 			}
 			else
 			{
-				skip_worker[worker][nimpl] = 0;
+				skip_worker[worker_ctx][nimpl] = 0;
 			}
 
+       
 			enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
 
-			local_task_length[worker][nimpl] = starpu_task_expected_length(task, perf_arch,nimpl);
+			local_task_length[worker_ctx][nimpl] = starpu_task_expected_length(task, perf_arch,nimpl);
 
 			unsigned memory_node = starpu_worker_get_memory_node(worker);
-			local_data_penalty[worker][nimpl] = starpu_task_expected_data_transfer_time(memory_node, task);
+			local_data_penalty[worker_ctx][nimpl] = starpu_task_expected_data_transfer_time(memory_node, task);
 
 			double ntasks_end = compute_ntasks_end(worker);
 
 			if (ntasks_best == -1
 			    || (!calibrating && ntasks_end < ntasks_best_end) /* Not calibrating, take better task */
-			    || (!calibrating && isnan(local_task_length[worker][nimpl])) /* Not calibrating but this worker is being calibrated */
-			    || (calibrating && isnan(local_task_length[worker][nimpl]) && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
+			    || (!calibrating && isnan(local_task_length[worker_ctx][nimpl])) /* Not calibrating but this worker is being calibrated */
+			    || (calibrating && isnan(local_task_length[worker_ctx][nimpl]) && ntasks_end < ntasks_best_end) /* Calibrating, compete this worker with other non-calibrated */
 					)
 			{
 				ntasks_best_end = ntasks_end;
 				ntasks_best = worker;
+				ntasks_best_ctx = worker_ctx;
 				nimpl_best = nimpl;
 			}
 
-			if (isnan(local_task_length[worker][nimpl]))
+			if (isnan(local_task_length[worker_ctx][nimpl]))
 				/* we are calibrating, we want to speed-up calibration time
 				 * so we privilege non-calibrated tasks (but still
 				 * greedily distribute them to avoid dumb schedules) */
 				calibrating = 1;
 
-			if (isnan(local_task_length[worker][nimpl])
-					|| _STARPU_IS_ZERO(local_task_length[worker][nimpl]))
+			if (isnan(local_task_length[worker_ctx][nimpl])
+					|| _STARPU_IS_ZERO(local_task_length[worker_ctx][nimpl]))
 				/* there is no prediction available for that task
 				 * with that arch yet, so switch to a greedy strategy */
 				unknown = 1;
@@ -321,70 +351,78 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio)
 			if (unknown)
 				continue;
 
-			local_exp_end[worker][nimpl] = compute_expected_end(worker, local_task_length[worker][nimpl]);
+			local_exp_end[worker_ctx][nimpl] = compute_expected_end(worker, local_task_length[worker_ctx][nimpl]);
 
-			//fprintf(stderr, "WORKER %d -> length %e end %e\n", worker, local_task_length[worker][nimpl], local_exp_end[worker][nimpl]);
+			//fprintf(stderr, "WORKER %d -> length %e end %e\n", worker, local_task_length[worker_ctx][nimpl], local_exp_end[worker][nimpl]);
 
-			if (local_exp_end[worker][nimpl] < best_exp_end)
+			if (local_exp_end[worker_ctx][nimpl] < best_exp_end)
 			{
 				/* a better solution was found */
-				best_exp_end = local_exp_end[worker][nimpl];
+				best_exp_end = local_exp_end[worker_ctx][nimpl];
 				nimpl_best = nimpl;
 			}
 
 
-			local_power[worker][nimpl] = starpu_task_expected_power(task, perf_arch,nimpl);
-			//_STARPU_DEBUG("Scheduler parallel heft: task length (%lf) local power (%lf) worker (%u) kernel (%u) \n", local_task_length[worker][nimpl],local_power[worker][nimpl],worker,nimpl);
+			local_power[worker_ctx][nimpl] = starpu_task_expected_power(task, perf_arch,nimpl);
+			//_STARPU_DEBUG("Scheduler parallel heft: task length (%lf) local power (%lf) worker (%u) kernel (%u) \n", local_task_length[worker],local_power[worker],worker,nimpl);
 
-			if (isnan(local_power[worker][nimpl]))
-				local_power[worker][nimpl] = 0.;
+			if (isnan(local_power[worker_ctx][nimpl]))
+				local_power[worker_ctx][nimpl] = 0.;
 
-		} //end for
+		}
+		worker_ctx++;
 	}
 
 	if (unknown) {
 		forced_best = ntasks_best;
+		forced_best_ctx = ntasks_best_ctx;
 		forced_nimpl = nimpl_best;
 	}
 
 	double best_fitness = -1;
 
-
 	if (forced_best == -1)
 	{
-		for (worker = 0; worker < nworkers+ncombinedworkers; worker++)
+		worker_ctx = 0;
+		while(workers->has_next(workers))
 		{
+			worker = workers->get_next(workers);
+			
 			for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
 			{
-				if (skip_worker[worker][nimpl])
+				if (skip_worker[worker_ctx][nimpl])
 				{
 					/* no one on that queue may execute this task */
 					continue;
 				}
 
-				fitness[worker][nimpl] = alpha*(local_exp_end[worker][nimpl] - best_exp_end)
-						+ beta*(local_data_penalty[worker][nimpl])
-						+ _gamma*(local_power[worker][nimpl]);
+				fitness[worker_ctx][nimpl] = hd->alpha*(local_exp_end[worker_ctx][nimpl] - best_exp_end) 
+						+ hd->beta*(local_data_penalty[worker_ctx][nimpl])
+						+ hd->_gamma*(local_power[worker_ctx][nimpl]);
 
-				if (local_exp_end[worker][nimpl] > max_exp_end)
+				if (local_exp_end[worker_ctx][nimpl] > max_exp_end)
 					/* This placement will make the computation
 					 * longer, take into account the idle
 					 * consumption of other cpus */
-					fitness[worker][nimpl] += _gamma * idle_power * (local_exp_end[worker][nimpl] - max_exp_end) / 1000000.0;
+					fitness[worker_ctx][nimpl] += hd->_gamma * hd->idle_power * (local_exp_end[worker_ctx][nimpl] - max_exp_end) / 1000000.0;
 
-				if (best == -1 || fitness[worker][nimpl] < best_fitness)
+				if (best == -1 || fitness[worker_ctx][nimpl] < best_fitness)
 				{
 					/* we found a better solution */
-					best_fitness = fitness[worker][nimpl];
+					best_fitness = fitness[worker_ctx][nimpl];
 					best = worker;
+					best_id_ctx = worker_ctx;
 					nimpl_best = nimpl;
 				}
 
 			//	fprintf(stderr, "FITNESS worker %d -> %e local_exp_end %e - local_data_penalty %e\n", worker, fitness[worker][nimpl], local_exp_end[worker][nimpl] - best_exp_end, local_data_penalty[worker][nimpl]);
 			}
+			worker_ctx++;
 		}
 	}
 
        if(workers->init_cursor)
                workers->deinit_cursor(workers);
+
 	STARPU_ASSERT(forced_best != -1 || best != -1);
 
 	if (forced_best != -1)
@@ -393,83 +431,98 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio)
 		 * with that arch we want to speed-up calibration time
 		 * so we force this measurement */
 		best = forced_best;
+		best_id_ctx = forced_best_ctx;
 		nimpl_best = forced_nimpl;
 		//penality_best = 0.0;
 		best_exp_end = compute_expected_end(best, 0);
 	}
 	else
 	{
-		//penality_best = local_data_penalty[best][nimpl_best];
-		best_exp_end = local_exp_end[best][nimpl_best];
+		//penality_best = local_data_penalty[best_id_ctx][nimpl_best];
+		best_exp_end = local_exp_end[best_id_ctx][nimpl_best];
 	}
 
-
 	//_STARPU_DEBUG("Scheduler parallel heft: kernel (%u)\n", nimpl_best);
 	_starpu_get_job_associated_to_task(task)->nimpl = nimpl_best;
 	/* we should now have the best worker in variable "best" */
-	return push_task_on_best_worker(task, best, best_exp_end, prio);
+	return push_task_on_best_worker(task, best, best_exp_end, prio, sched_ctx_id);
 }
 
 static int parallel_heft_push_task(struct starpu_task *task)
 {
+	unsigned sched_ctx_id = task->sched_ctx;
+	pthread_mutex_t *changing_ctx_mutex = starpu_get_changing_ctx_mutex(sched_ctx_id);
+	unsigned nworkers;
+	int ret_val = -1;
+
 	if (task->priority == STARPU_MAX_PRIO)
-		return _parallel_heft_push_task(task, 1);
+	{  
+		_STARPU_PTHREAD_MUTEX_LOCK(changing_ctx_mutex);
+                nworkers = starpu_get_nworkers_of_sched_ctx(sched_ctx_id);
+                if(nworkers == 0)
+                {
+                        _STARPU_PTHREAD_MUTEX_UNLOCK(changing_ctx_mutex);
+                        return ret_val;
+                }
+		
+		ret_val = _parallel_heft_push_task(task, 1, sched_ctx_id);
+		_STARPU_PTHREAD_MUTEX_UNLOCK(changing_ctx_mutex);
+                return ret_val;
+        }
+
+
+	_STARPU_PTHREAD_MUTEX_LOCK(changing_ctx_mutex);
+	nworkers = starpu_get_nworkers_of_sched_ctx(sched_ctx_id);
+        if(nworkers == 0)
+	{
+		_STARPU_PTHREAD_MUTEX_UNLOCK(changing_ctx_mutex);
+                return ret_val;
+        }
 
-	return _parallel_heft_push_task(task, 0);
+        ret_val = _parallel_heft_push_task(task, 0, sched_ctx_id);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(changing_ctx_mutex);
+	return ret_val;
 }
 
-static void initialize_parallel_heft_policy(struct starpu_machine_topology *topology,
-	 __attribute__ ((unused)) struct starpu_sched_policy *_policy)
+static void parallel_heft_add_workers(unsigned sched_ctx_id, int *workerids, unsigned nworkers)
 {
-	nworkers = topology->nworkers;
-
-	const char *strval_alpha = getenv("STARPU_SCHED_ALPHA");
-	if (strval_alpha)
-		alpha = atof(strval_alpha);
-
-	const char *strval_beta = getenv("STARPU_SCHED_BETA");
-	if (strval_beta)
-		beta = atof(strval_beta);
-
-	const char *strval_gamma = getenv("STARPU_SCHED_GAMMA");
-	if (strval_gamma)
-		_gamma = atof(strval_gamma);
-
-	const char *strval_idle_power = getenv("STARPU_IDLE_POWER");
-	if (strval_idle_power)
-		idle_power = atof(strval_idle_power);
-
-	_starpu_sched_find_worker_combinations(topology);
-
-	ncombinedworkers = topology->ncombinedworkers;
-
-	unsigned workerid;
-	for (workerid = 0; workerid < nworkers; workerid++)
+	int workerid;
+	unsigned i;
+	for (i = 0; i < nworkers; i++)
 	{
-		worker_exp_start[workerid] = starpu_timing_now();
-		worker_exp_len[workerid] = 0.0;
-		worker_exp_end[workerid] = worker_exp_start[workerid];
-		ntasks[workerid] = 0;
-
-		_STARPU_PTHREAD_MUTEX_INIT(&sched_mutex[workerid], NULL);
-		_STARPU_PTHREAD_COND_INIT(&sched_cond[workerid], NULL);
+		workerid = workerids[i];
+		struct _starpu_worker *workerarg = _starpu_get_worker_struct(workerid);
+		/* init these structures only once for each worker */
+		if(!workerarg->has_prev_init)
+		{
+			worker_exp_start[workerid] = starpu_timing_now();
+			worker_exp_len[workerid] = 0.0;
+			worker_exp_end[workerid] = worker_exp_start[workerid]; 
+			ntasks[workerid] = 0;
+			workerarg->has_prev_init = 1;
+		}
 
-		starpu_worker_set_sched_condition(workerid, &sched_cond[workerid], &sched_mutex[workerid]);
+		starpu_worker_init_sched_condition(sched_ctx_id, workerid);
 	}
+	_starpu_sched_find_worker_combinations(workerids, nworkers);
 
-	/* We pre-compute an array of all the perfmodel archs that are applicable */
-	unsigned total_worker_count = nworkers + ncombinedworkers;
+// start_unclear_part: not very clear where this is used
+/* 	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config(); */
+/* 	ncombinedworkers = config->topology.ncombinedworkers; */
 
-	unsigned used_perf_archtypes[STARPU_NARCH_VARIATIONS];
-	memset(used_perf_archtypes, 0, sizeof(used_perf_archtypes));
+/* 	/\* We pre-compute an array of all the perfmodel archs that are applicable *\/ */
+/* 	unsigned total_worker_count = nworkers + ncombinedworkers; */
 
-	for (workerid = 0; workerid < total_worker_count; workerid++)
-	{
-		enum starpu_perf_archtype perf_archtype = starpu_worker_get_perf_archtype(workerid);
-		used_perf_archtypes[perf_archtype] = 1;
-	}
+/* 	unsigned used_perf_archtypes[STARPU_NARCH_VARIATIONS]; */
+/* 	memset(used_perf_archtypes, 0, sizeof(used_perf_archtypes)); */
+
+/* 	for (workerid = 0; workerid < total_worker_count; workerid++) */
+/* 	{ */
+/* 		enum starpu_perf_archtype perf_archtype = starpu_worker_get_perf_archtype(workerid); */
+/* 		used_perf_archtypes[perf_archtype] = 1; */
+/* 	} */
 
-	_STARPU_PTHREAD_MUTEX_INIT(&global_push_mutex, NULL);
+// end_unclear_part
 
 //	napplicable_perf_archtypes = 0;
 
@@ -479,14 +532,66 @@ static void initialize_parallel_heft_policy(struct starpu_machine_topology *topo
 //		if (used_perf_archtypes[arch])
 //			applicable_perf_archtypes[napplicable_perf_archtypes++] = arch;
 //	}
+
+}
+
+static void parallel_heft_remove_workers(unsigned sched_ctx_id, int *workerids, unsigned nworkers)
+{
+	unsigned i;
+	int worker;
+	for(i = 0; i < nworkers; i++)
+	{
+		worker = workerids[i];
+		starpu_worker_deinit_sched_condition(sched_ctx_id, worker);
+	}
+}
+static void initialize_parallel_heft_policy(unsigned sched_ctx_id) 
+{	
+	starpu_create_worker_collection_for_sched_ctx(sched_ctx_id, WORKER_LIST);
+	pheft_data *hd = (pheft_data*)malloc(sizeof(pheft_data));
+	hd->alpha = _STARPU_DEFAULT_ALPHA;
+	hd->beta = _STARPU_DEFAULT_BETA;
+	hd->_gamma = _STARPU_DEFAULT_GAMMA;
+	hd->idle_power = 0.0;
+	
+	starpu_set_sched_ctx_policy_data(sched_ctx_id, (void*)hd);
+
+	const char *strval_alpha = getenv("STARPU_SCHED_ALPHA");
+	if (strval_alpha)
+		hd->alpha = atof(strval_alpha);
+
+	const char *strval_beta = getenv("STARPU_SCHED_BETA");
+	if (strval_beta)
+		hd->beta = atof(strval_beta);
+
+	const char *strval_gamma = getenv("STARPU_SCHED_GAMMA");
+	if (strval_gamma)
+		hd->_gamma = atof(strval_gamma);
+
+	const char *strval_idle_power = getenv("STARPU_IDLE_POWER");
+	if (strval_idle_power)
+		hd->idle_power = atof(strval_idle_power);
+	
+	_STARPU_PTHREAD_MUTEX_INIT(&hd->global_push_mutex, NULL);
+
+}
+
+static void parallel_heft_deinit(unsigned sched_ctx_id) 
+{
+	pheft_data *hd = (pheft_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
+	starpu_delete_worker_collection_for_sched_ctx(sched_ctx_id);
+	_STARPU_PTHREAD_MUTEX_DESTROY(&hd->global_push_mutex);
+	free(hd);
 }
 
 /* TODO: use post_exec_hook to fix the expected start */
 struct starpu_sched_policy _starpu_sched_parallel_heft_policy =
 {
 	.init_sched = initialize_parallel_heft_policy,
-	.deinit_sched = NULL,
-	.push_task = parallel_heft_push_task,
+	.deinit_sched = parallel_heft_deinit,
+	.add_workers = parallel_heft_add_workers,
+	.remove_workers = parallel_heft_remove_workers,
+	.push_task = parallel_heft_push_task, 
 	.pop_task = NULL,
 	.pre_exec_hook = parallel_heft_pre_exec_hook,
 	.post_exec_hook = NULL,

+ 67 - 24
src/sched_policies/random_policy.c

@@ -19,25 +19,27 @@
 
 #include <starpu_rand.h>
 #include <core/workers.h>
+#include <core/sched_ctx.h>
 #include <sched_policies/fifo_queues.h>
-#include <core/debug.h>
-
-static unsigned nworkers;
-
-static _starpu_pthread_cond_t sched_cond[STARPU_NMAXWORKERS];
-static _starpu_pthread_mutex_t sched_mutex[STARPU_NMAXWORKERS];
 
 static int _random_push_task(struct starpu_task *task, unsigned prio)
 {
 	/* find the queue */
-	unsigned worker;
 
 	unsigned selected = 0;
 
 	double alpha_sum = 0.0;
 
-	for (worker = 0; worker < nworkers; worker++)
+	unsigned sched_ctx_id = task->sched_ctx;
+	struct worker_collection *workers = starpu_get_worker_collection_of_sched_ctx(sched_ctx_id);
+        int worker;
+        if(workers->init_cursor)
+                workers->init_cursor(workers);
+
+        while(workers->has_next(workers))
 	{
+                worker = workers->get_next(workers);
+
 		enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
 		alpha_sum += starpu_worker_get_relative_speedup(perf_arch);
 	}
@@ -46,8 +48,10 @@ static int _random_push_task(struct starpu_task *task, unsigned prio)
 //	_STARPU_DEBUG("my rand is %e\n", random);
 
 	double alpha = 0.0;
-	for (worker = 0; worker < nworkers; worker++)
-	{
+	while(workers->has_next(workers))
+        {
+                worker = workers->get_next(workers);
+
 		enum starpu_perf_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
 		double worker_alpha = starpu_worker_get_relative_speedup(perf_arch);
 
@@ -67,36 +71,75 @@ static int _random_push_task(struct starpu_task *task, unsigned prio)
 		AYU_event(AYU_ADDTASKTOQUEUE, _starpu_get_job_associated_to_task(task)->job_id, &id);
 	}
 #endif
+	if(workers->init_cursor)
+                workers->deinit_cursor(workers);
+
 	/* we should now have the best worker in variable "selected" */
-	return starpu_push_local_task(selected, task, prio);
+	int n = starpu_push_local_task(selected, task, prio);
+	return n;
 }
 
 static int random_push_task(struct starpu_task *task)
 {
-	return _random_push_task(task, !!task->priority);
+	unsigned sched_ctx_id = task->sched_ctx;
+	pthread_mutex_t *changing_ctx_mutex = starpu_get_changing_ctx_mutex(sched_ctx_id);
+	unsigned nworkers;
+        int ret_val = -1;
+
+        _STARPU_PTHREAD_MUTEX_LOCK(changing_ctx_mutex);
+	nworkers = starpu_get_nworkers_of_sched_ctx(sched_ctx_id);
+        if(nworkers == 0)
+        {
+		_STARPU_PTHREAD_MUTEX_UNLOCK(changing_ctx_mutex);
+                return ret_val;
+        }
+
+        ret_val = _random_push_task(task, !!task->priority);
+        _STARPU_PTHREAD_MUTEX_UNLOCK(changing_ctx_mutex);
+        return ret_val;
 }
 
-static void initialize_random_policy(struct starpu_machine_topology *topology,
-				     __attribute__ ((unused)) struct starpu_sched_policy *_policy)
+static void random_add_workers(unsigned sched_ctx_id, int *workerids, unsigned nworkers) 
 {
-	starpu_srand48(time(NULL));
-
-	nworkers = topology->nworkers;
-
-	unsigned workerid;
-	for (workerid = 0; workerid < nworkers; workerid++)
+	unsigned i;
+	int workerid;
+	for (i = 0; i < nworkers; i++)
 	{
-		_STARPU_PTHREAD_MUTEX_INIT(&sched_mutex[workerid], NULL);
-		_STARPU_PTHREAD_COND_INIT(&sched_cond[workerid], NULL);
+		workerid = workerids[i];
+		struct _starpu_worker *workerarg = _starpu_get_worker_struct(workerid);
+		starpu_worker_set_sched_condition(sched_ctx_id, workerid, &workerarg->sched_mutex, &workerarg->sched_cond);
+	}
+}
 
-		starpu_worker_set_sched_condition(workerid, &sched_cond[workerid], &sched_mutex[workerid]);
+static void random_remove_workers(unsigned sched_ctx_id, int *workerids, unsigned nworkers)
+{
+	unsigned i;
+	int workerid;
+	for (i = 0; i < nworkers; i++)
+	{
+		workerid = workerids[i];
+		starpu_worker_set_sched_condition(sched_ctx_id, workerid, NULL, NULL);
 	}
+
+}
+
+static void initialize_random_policy(unsigned sched_ctx_id) 
+{
+	starpu_create_worker_collection_for_sched_ctx(sched_ctx_id, WORKER_LIST);
+	starpu_srand48(time(NULL));
+}
+
+static void deinitialize_random_policy(unsigned sched_ctx_id) 
+{
+	starpu_delete_worker_collection_for_sched_ctx(sched_ctx_id);
 }
 
 struct starpu_sched_policy _starpu_sched_random_policy =
 {
 	.init_sched = initialize_random_policy,
-	.deinit_sched = NULL,
+	.add_workers = random_add_workers,
+	.remove_workers = random_remove_workers,
+	.deinit_sched = deinitialize_random_policy,
 	.push_task = random_push_task,
 	.pop_task = NULL,
 	.pre_exec_hook = NULL,

+ 158 - 98
src/sched_policies/work_stealing_policy.c

@@ -1,8 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
- * Copyright (C) 2012	Inria
+ * Copyright (C) 2011, 2012  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -24,19 +24,18 @@
 #include <sched_policies/deque_queues.h>
 #include <core/debug.h>
 
-static unsigned nworkers;
-static unsigned last_pop_worker;
-static unsigned last_push_worker;
-static struct _starpu_deque_jobq *queue_array[STARPU_NMAXWORKERS];
-
-static _starpu_pthread_mutex_t global_sched_mutex;
-static _starpu_pthread_cond_t global_sched_cond;
-
-/**
- * Keep track of the work performed from the beginning of the algorithm to make
- * better decisions about which queue to select when stealing or deferring work
- */
-static int performed_total;
+typedef struct{
+	struct _starpu_deque_jobq **queue_array;
+	unsigned rr_worker;
+	/* keep track of the work performed from the beginning of the algorithm to make
+	 * better decisions about which queue to select when stealing or deferring work
+	 */
+	unsigned performed_total;
+	_starpu_pthread_mutex_t sched_mutex;
+	_starpu_pthread_cond_t sched_cond;
+	unsigned last_pop_worker;
+	unsigned last_push_worker;
+} work_stealing_data;
 
 #ifdef USE_OVERLOAD
 
@@ -55,16 +54,18 @@ static int calibration_value = 0;
  * the worker previously selected doesn't own any task,
  * then we return the first non-empty worker.
  */
-static unsigned select_victim_round_robin(void)
+static unsigned select_victim_round_robin(unsigned sched_ctx_id)
 {
-	unsigned worker = last_pop_worker;
+	work_stealing_data *ws = (work_stealing_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
+	unsigned worker = ws->last_pop_worker;
+	unsigned nworkers = starpu_get_nworkers_of_sched_ctx(sched_ctx_id);
 
 	/* If the worker's queue is empty, let's try
 	 * the next ones */
-	while (!queue_array[worker]->njobs)
+	while (!ws->queue_array[worker]->njobs)
 	{
 		worker = (worker + 1) % nworkers;
-		if (worker == last_pop_worker)
+		if (worker == ws->last_pop_worker)
 		{
 			/* We got back to the first worker,
 			 * don't go in infinite loop */
@@ -72,7 +73,7 @@ static unsigned select_victim_round_robin(void)
 		}
 	}
 
-	last_pop_worker = (worker + 1) % nworkers;
+	ws->last_pop_worker = (worker + 1) % nworkers;
 
 	return worker;
 }
@@ -81,11 +82,13 @@ static unsigned select_victim_round_robin(void)
  * Return a worker to whom add a task.
  * Selecting a worker is done in a round-robin fashion.
  */
-static unsigned select_worker_round_robin(void)
+static unsigned select_worker_round_robin(unsigned sched_ctx_id)
 {
-	unsigned worker = last_push_worker;
+	work_stealing_data *ws = (work_stealing_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
+	unsigned worker = ws->last_push_worker;
+	unsigned nworkers = starpu_get_nworkers_of_sched_ctx(sched_ctx_id);
 
-	last_push_worker = (last_push_worker + 1) % nworkers;
+	ws->last_push_worker = (ws->last_push_worker + 1) % nworkers;
 
 	return worker;
 }
@@ -100,13 +103,14 @@ static unsigned select_worker_round_robin(void)
  * 		a smaller value implies a faster worker with an relatively emptier queue : more suitable to put tasks in
  * 		a bigger value implies a slower worker with an reletively more replete queue : more suitable to steal tasks from
  */
-static float overload_metric(unsigned id)
+static float overload_metric(unsigned sched_ctx_id, unsigned id)
 {
+	work_stealing_data *ws = (work_stealing_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
 	float execution_ratio = 0.0f;
 	float current_ratio = 0.0f;
 
-	int nprocessed = _starpu_get_deque_nprocessed(queue_array[id]);
-	unsigned njobs = _starpu_get_deque_njobs(queue_array[id]);
+	int nprocessed = _starpu_get_deque_nprocessed(ws->queue_array[id]);
+	unsigned njobs = _starpu_get_deque_njobs(ws->queue_array[id]);
 
 	/* Did we get enough information ? */
 	if (performed_total > 0 && nprocessed > 0)
@@ -131,7 +135,7 @@ static float overload_metric(unsigned id)
  * by the tasks are taken into account to select the most suitable
  * worker to steal task from.
  */
-static unsigned select_victim_overload(void)
+static unsigned select_victim_overload(unsigned sched_ctx_id)
 {
 	unsigned worker;
 	float  worker_ratio;
@@ -141,11 +145,17 @@ static unsigned select_victim_overload(void)
 	/* Don't try to play smart until we get
 	 * enough informations. */
 	if (performed_total < calibration_value)
-		return select_victim_round_robin();
+		return select_victim_round_robin(sched_ctx_id);
 
-	for (worker = 0; worker < nworkers; worker++)
-	{
-		worker_ratio = overload_metric(worker);
+	struct worker_collection *workers = starpu_get_worker_collection_of_sched_ctx(sched_ctx_id);
+
+        if(workers->init_cursor)
+                workers->init_cursor(workers);
+
+	while(workers->has_next(workers))
+        {
+                worker = workers->get_next(workers);
+		worker_ratio = overload_metric(sched_ctx_id, worker);
 
 		if (worker_ratio > best_ratio)
 		{
@@ -154,6 +164,9 @@ static unsigned select_victim_overload(void)
 		}
 	}
 
+	if(workers->init_cursor)
+                workers->deinit_cursor(workers);
+
 	return best_worker;
 }
 
@@ -164,7 +177,7 @@ static unsigned select_victim_overload(void)
  * by the tasks are taken into account to select the most suitable
  * worker to add a task to.
  */
-static unsigned select_worker_overload(void)
+static unsigned select_worker_overload(unsigned sched_ctx_id)
 {
 	unsigned worker;
 	float  worker_ratio;
@@ -174,11 +187,18 @@ static unsigned select_worker_overload(void)
 	/* Don't try to play smart until we get
 	 * enough informations. */
 	if (performed_total < calibration_value)
-		return select_worker_round_robin();
+		return select_worker_round_robin(sched_ctx_id);
 
-	for (worker = 0; worker < nworkers; worker++)
-	{
-		worker_ratio = overload_metric(worker);
+	struct worker_collection *workers = starpu_get_worker_collection_of_sched_ctx(sched_ctx_id);
+
+        if(workers->init_cursor)
+                workers->init_cursor(workers);
+
+	while(workers->has_next(workers))
+        {
+                worker = workers->get_next(workers);
+
+		worker_ratio = overload_metric(sched_ctx_id, worker);
 
 		if (worker_ratio < best_ratio)
 		{
@@ -187,6 +207,9 @@ static unsigned select_worker_overload(void)
 		}
 	}
 
+	if(workers->init_cursor)
+                workers->deinit_cursor(workers);
+
 	return best_worker;
 }
 
@@ -198,12 +221,12 @@ static unsigned select_worker_overload(void)
  * This is a phony function used to call the right
  * function depending on the value of USE_OVERLOAD.
  */
-static inline unsigned select_victim(void)
+static inline unsigned select_victim(unsigned sched_ctx_id)
 {
 #ifdef USE_OVERLOAD
-	return select_victim_overload();
+	return select_victim_overload(sched_ctx_id);
 #else
-	return select_victim_round_robin();
+	return select_victim_round_robin(sched_ctx_id);
 #endif /* USE_OVERLOAD */
 }
 
@@ -212,29 +235,23 @@ static inline unsigned select_victim(void)
  * This is a phony function used to call the right
  * function depending on the value of USE_OVERLOAD.
  */
-static inline unsigned select_worker(void)
+static inline unsigned select_worker(unsigned sched_ctx_id)
 {
 #ifdef USE_OVERLOAD
-	return select_worker_overload();
+	return select_worker_overload(sched_ctx_id);
 #else
-	return select_worker_round_robin();
+	return select_worker_round_robin(sched_ctx_id);
 #endif /* USE_OVERLOAD */
 }
 
 
 #ifdef STARPU_DEVEL
-#warning TODO rewrite ... this will not scale at all now ...
-#warning and the overload versions are useless with a global mutex ...
+#warning TODO rewrite ... this will not scale at all now
 #endif
-
-/**
- * Return a task to execute.
- * If possible from the calling worker queue, else
- * stealing from an other.
- * For now mutex must be locked before calling this function.
- */
-static struct starpu_task *ws_pop_task(void)
+static struct starpu_task *ws_pop_task(unsigned sched_ctx_id)
 {
+	work_stealing_data *ws = (work_stealing_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
+
 	struct starpu_task *task;
 	struct _starpu_deque_jobq *q;
 
@@ -242,27 +259,27 @@ static struct starpu_task *ws_pop_task(void)
 
 	STARPU_ASSERT(workerid != -1);
 
-	q = queue_array[workerid];
+	q = ws->queue_array[workerid];
 
 	task = _starpu_deque_pop_task(q, workerid);
 	if (task)
 	{
 		/* there was a local task */
-		performed_total++;
+		ws->performed_total++;
 		q->nprocessed++;
 		q->njobs--;
 		return task;
 	}
 
 	/* we need to steal someone's job */
-	unsigned victim = select_victim();
-	struct _starpu_deque_jobq *victimq = queue_array[victim];
+	unsigned victim = select_victim(sched_ctx_id);
+	struct _starpu_deque_jobq *victimq = ws->queue_array[victim];
 
 	task = _starpu_deque_pop_task(victimq, workerid);
 	if (task)
 	{
 		_STARPU_TRACE_WORK_STEALING(q, workerid);
-		performed_total++;
+		ws->performed_total++;
 
 		/* Beware : we have to increase the number of processed tasks of
 		 * the stealer, not the victim ! */
@@ -273,26 +290,37 @@ static struct starpu_task *ws_pop_task(void)
 	return task;
 }
 
-/**
- * Push a task in the calling worker's queue.
- * If the calling thread is not a worker, push
- * the task in a worker chosen on the fly.
- */
-static int ws_push_task(struct starpu_task *task)
+int ws_push_task(struct starpu_task *task)
 {
+	unsigned sched_ctx_id = task->sched_ctx;
+	work_stealing_data *ws = (work_stealing_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
+
 	struct _starpu_deque_jobq *deque_queue;
 	struct _starpu_job *j = _starpu_get_job_associated_to_task(task); 
 	int workerid = starpu_worker_get_id();
 
-	_STARPU_PTHREAD_MUTEX_LOCK(&global_sched_mutex);
+	pthread_mutex_t *changing_ctx_mutex = starpu_get_changing_ctx_mutex(sched_ctx_id);
+        unsigned nworkers;
+        int ret_val = -1;
+
+	/* if the context has no workers return */
+        _STARPU_PTHREAD_MUTEX_LOCK(changing_ctx_mutex);
+        nworkers = starpu_get_nworkers_of_sched_ctx(sched_ctx_id);
+        if(nworkers == 0)
+        {
+                _STARPU_PTHREAD_MUTEX_UNLOCK(changing_ctx_mutex);
+                return ret_val;
+        }
+
+	_STARPU_PTHREAD_MUTEX_LOCK(&ws->sched_mutex);
 
 	/* If the current thread is not a worker but
 	 * the main thread (-1), we find the better one to
 	 * put task on its queue */
 	if (workerid == -1)
-		workerid = select_worker();
+		workerid = select_worker(sched_ctx_id);
 
-	deque_queue = queue_array[workerid];
+	deque_queue = ws->queue_array[workerid];
 
 	_STARPU_TRACE_JOB_PUSH(task, 0);
 #ifdef HAVE_AYUDAME_H
@@ -304,58 +332,90 @@ static int ws_push_task(struct starpu_task *task)
 	_starpu_job_list_push_back(deque_queue->jobq, j);
 	deque_queue->njobs++;
 
-	_STARPU_PTHREAD_COND_SIGNAL(&global_sched_cond);
-	_STARPU_PTHREAD_MUTEX_UNLOCK(&global_sched_mutex);
+	_STARPU_PTHREAD_COND_SIGNAL(&ws->sched_cond);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&ws->sched_mutex);
+
+        _STARPU_PTHREAD_MUTEX_UNLOCK(changing_ctx_mutex);
 
 	return 0;
 }
 
-/**
- * Initializing the work stealing scheduler.
- */
-static void initialize_ws_policy(struct starpu_machine_topology *topology,
-		__attribute__ ((unused)) struct starpu_sched_policy *_policy)
+static void ws_add_workers(unsigned sched_ctx_id, int *workerids,unsigned nworkers) 
+{
+	work_stealing_data *ws = (work_stealing_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
+
+	unsigned i;
+	int workerid;
+	
+	for (i = 0; i < nworkers; i++)
+	{
+		workerid = workerids[i];
+		ws->queue_array[workerid] = _starpu_create_deque();
+		/**
+		 * The first WS_POP_TASK will increase NPROCESSED though no task was actually performed yet,
+		 * we need to initialize it at -1.
+		 */
+		ws->queue_array[workerid]->nprocessed = -1;
+		ws->queue_array[workerid]->njobs = 0;
+
+		starpu_worker_set_sched_condition(sched_ctx_id, workerid, &ws->sched_mutex, &ws->sched_cond);
+	}
+}
+
+static void ws_remove_workers(unsigned sched_ctx_id, int *workerids, unsigned nworkers)
 {
-	unsigned workerid;
+	work_stealing_data *ws = (work_stealing_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
 
-	nworkers = topology->nworkers;
-	last_pop_worker = 0;
-	last_push_worker = 0;
+	unsigned i;
+	int workerid;
+	
+	for (i = 0; i < nworkers; i++)
+	{
+		workerid = workerids[i];
+		_starpu_destroy_deque(ws->queue_array[workerid]);
+		starpu_worker_set_sched_condition(sched_ctx_id, workerid, NULL, NULL);
+	}
+}
+
+static void initialize_ws_policy(unsigned sched_ctx_id) 
+{
+	starpu_create_worker_collection_for_sched_ctx(sched_ctx_id, WORKER_LIST);
+
+	work_stealing_data *ws = (work_stealing_data*)malloc(sizeof(work_stealing_data));
+	starpu_set_sched_ctx_policy_data(sched_ctx_id, (void*)ws);
+	
+	ws->last_pop_worker = 0;
+	ws->last_push_worker = 0;
 
 	/**
 	 * The first WS_POP_TASK will increase PERFORMED_TOTAL though no task was actually performed yet,
 	 * we need to initialize it at -1.
 	 */
-	performed_total = -1;
+	ws->performed_total = -1;
 
-	_STARPU_PTHREAD_MUTEX_INIT(&global_sched_mutex, NULL);
-	_STARPU_PTHREAD_COND_INIT(&global_sched_cond, NULL);
-
-	for (workerid = 0; workerid < nworkers; workerid++)
-	{
-		queue_array[workerid] = _starpu_create_deque();
+	ws->queue_array = (struct _starpu_deque_jobq**)malloc(STARPU_NMAXWORKERS*sizeof(struct _starpu_deque_jobq*));
 
-		/**
-		 * The first WS_POP_TASK will increase NPROCESSED though no task was actually performed yet,
-		 * we need to initialize it at -1.
-		 */
-		queue_array[workerid]->nprocessed = -1;
-		queue_array[workerid]->njobs = 0;
+	_STARPU_PTHREAD_MUTEX_INIT(&ws->sched_mutex, NULL);
+	_STARPU_PTHREAD_COND_INIT(&ws->sched_cond, NULL);
+}
 
-		starpu_worker_set_sched_condition(workerid, &global_sched_cond, &global_sched_mutex);
+static void deinit_ws_policy(unsigned sched_ctx_id)
+{
+	work_stealing_data *ws = (work_stealing_data*)starpu_get_sched_ctx_policy_data(sched_ctx_id);
 
-#ifdef USE_OVERLOAD
-		enum starpu_perf_archtype perf_arch;
-		perf_arch = starpu_worker_get_perf_archtype(workerid);
-		calibration_value += (unsigned int) starpu_worker_get_relative_speedup(perf_arch);
-#endif /* USE_OVERLOAD */
-	}
+	free(ws->queue_array);
+	_STARPU_PTHREAD_MUTEX_DESTROY(&ws->sched_mutex);
+	_STARPU_PTHREAD_COND_DESTROY(&ws->sched_cond);
+        free(ws);
+        starpu_delete_worker_collection_for_sched_ctx(sched_ctx_id);
 }
 
 struct starpu_sched_policy _starpu_sched_ws_policy =
 {
 	.init_sched = initialize_ws_policy,
-	.deinit_sched = NULL,
+	.deinit_sched = deinit_ws_policy,
+	.add_workers = ws_add_workers,
+	.remove_workers = ws_remove_workers,
 	.push_task = ws_push_task,
 	.pop_task = ws_pop_task,
 	.pre_exec_hook = NULL,

+ 6 - 0
src/top/starpu_top.c

@@ -27,6 +27,7 @@
 #include <pthread.h>
 #include <common/timing.h>
 #include <common/utils.h>
+#include <common/config.h>
 
 extern struct _starpu_top_message_queue*  _starpu_top_mt;
 int _starpu_top = 0;
@@ -105,6 +106,11 @@ static void starpu_top_get_device_type(int id, char* type)
 	case STARPU_GORDON_WORKER:
 		strncpy(type, "GORDON",9);
 		break;
+#ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
+	case STARPU_ANY_WORKER:
+		strncpy(type, "ANY",9);
+		break;
+#endif
 	}
 }
 

+ 1 - 1
src/util/execute_on_all.c

@@ -148,7 +148,7 @@ void starpu_execute_on_each_worker_ex(void (*func)(void *), void *arg, uint32_t
 
 		_starpu_exclude_task_from_dag(tasks[worker]);
 
-		ret = starpu_task_submit(tasks[worker]);
+		ret = _starpu_task_submit_internally(tasks[worker]);
 		if (ret == -ENODEV)
 		{
 			/* if the worker is not able to execute this tasks, we

+ 2 - 1
src/util/starpu_create_sync_task.c

@@ -17,6 +17,7 @@
 
 #include <starpu.h>
 #include <common/config.h>
+#include <core/task.h>
 
 /* This creates (and submits) an empty task that unlocks a tag once all its
  * dependencies are fulfilled. */
@@ -39,6 +40,6 @@ void starpu_create_sync_task(starpu_tag_t sync_tag, unsigned ndeps, starpu_tag_t
 	/* This task does nothing */
 	sync_task->cl = NULL;
 
-	int sync_ret = starpu_task_submit(sync_task);
+	int sync_ret = _starpu_task_submit_internally(sync_task);
 	STARPU_ASSERT(!sync_ret);
 }

+ 1 - 1
src/util/starpu_data_cpy.c

@@ -96,7 +96,7 @@ int _starpu_data_cpy(starpu_data_handle_t dst_handle, starpu_data_handle_t src_h
 
 	task->synchronous = !asynchronous;
 
-	int ret = starpu_task_submit(task);
+	int ret = _starpu_task_submit_internally(task);
 	STARPU_ASSERT(!ret);
 
 	return 0;

+ 14 - 0
src/util/starpu_insert_task_utils.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -280,6 +281,19 @@ int _starpu_insert_task_create_and_submit(char *arg_buffer, size_t arg_buffer_si
 		{
 			(void)va_arg(varg_list, starpu_data_handle_t);
 		}
+
+		else if (arg_type==STARPU_HYPERVISOR_TAG) 
+		{
+			int hypervisor_tag = va_arg(varg_list, int);
+			(*task)->hypervisor_tag = hypervisor_tag;
+		}
+		else if (arg_type==STARPU_HYPERVISOR_FLOPS) 
+		{
+			int flops = va_arg(varg_list, int);
+			(*task)->flops = flops;
+		}
+
+
 		else if (arg_type==STARPU_TAG)
 		{
 			starpu_tag_t tag = va_arg(varg_list, starpu_tag_t);

+ 166 - 0
src/worker_collection/worker_list.c

@@ -0,0 +1,166 @@
+#include <starpu.h>
+#include <pthread.h>
+
+static unsigned list_has_next(struct worker_collection *workers)
+{
+	int nworkers = (int)workers->nworkers;
+
+	int *cursor = (int*)pthread_getspecific(workers->cursor_key);
+
+	unsigned ret = cursor ? *cursor < nworkers : 0;
+
+	if(!ret && cursor) *cursor = 0;
+
+	return ret;
+}
+
+static int list_get_next(struct worker_collection *workers)
+{
+	int *workerids = (int *)workers->workerids;
+	int nworkers = (int)workers->nworkers;
+
+	int *cursor = (int*)pthread_getspecific(workers->cursor_key);
+
+	STARPU_ASSERT(*cursor < nworkers);
+
+	int ret = workerids[(*cursor)++];
+
+	return ret;
+}
+
+static unsigned _worker_belongs_to_ctx(struct worker_collection *workers, int workerid)
+{
+	int *workerids = (int *)workers->workerids;
+	unsigned nworkers = workers->nworkers;
+	
+	unsigned i;
+	for(i = 0; i < nworkers; i++)
+	{
+		if(workerids[i] == workerid)
+			return 1;
+	}
+	return 0;
+}
+
+static int list_add(struct worker_collection *workers, int worker)
+{
+	int *workerids = (int *)workers->workerids;
+	unsigned *nworkers = &workers->nworkers;
+
+	STARPU_ASSERT(*nworkers < STARPU_NMAXWORKERS - 1);
+
+	if(!_worker_belongs_to_ctx(workers, worker))
+	{
+		workerids[(*nworkers)++] = worker;
+		return worker;
+	}
+	else 
+		return -1;
+}
+
+static int _get_first_free_worker(int *workerids, int nworkers)
+{
+	int i;
+	for(i = 0; i < nworkers; i++)
+		if(workerids[i] == -1)
+			return i;
+
+	return -1;
+}
+
+/* rearange array of workerids in order not to have {-1, -1, 5, -1, 7}
+   and have instead {5, 7, -1, -1, -1} 
+   it is easier afterwards to iterate the array
+*/
+static void _rearange_workerids(int *workerids, int old_nworkers)
+{
+	int first_free_id = -1;
+	int i;
+	for(i = 0; i < old_nworkers; i++)
+	{
+		if(workerids[i] != -1)
+		{
+			first_free_id = _get_first_free_worker(workerids, old_nworkers);
+			if(first_free_id != -1)
+			{
+				workerids[first_free_id] = workerids[i];
+				workerids[i] = -1;
+			}
+		}
+	  }
+}
+
+static int list_remove(struct worker_collection *workers, int worker)
+{
+	int *workerids = (int *)workers->workerids;
+	unsigned nworkers = workers->nworkers;
+	
+	int found_worker = -1;
+	unsigned i;
+	for(i = 0; i < nworkers; i++)
+	{
+		if(workerids[i] == worker)
+		{
+			workerids[i] = -1;
+			found_worker = worker;
+			break;
+		}
+	}
+
+	_rearange_workerids(workerids, nworkers);
+	if(found_worker != -1)
+		workers->nworkers--;
+
+	return found_worker;
+}
+
+static void _init_workers(int *workerids)
+{
+	unsigned i;
+	for(i = 0; i < STARPU_NMAXWORKERS; i++)
+		workerids[i] = -1;
+	return;
+}
+
+static void* list_init(struct worker_collection *workers)
+{
+	int *workerids = (int*)malloc(STARPU_NMAXWORKERS * sizeof(int));
+	_init_workers(workerids);
+
+	pthread_key_create(&workers->cursor_key, NULL);
+
+	return (void*)workerids;
+}
+
+static void list_deinit(struct worker_collection *workers)
+{
+	free(workers->workerids);
+	pthread_key_delete(workers->cursor_key);
+}
+
+static void list_init_cursor(struct worker_collection *workers)
+{
+	int *cursor = (int*)malloc(sizeof(int));
+	*cursor = 0;
+	pthread_setspecific(workers->cursor_key, (void*)cursor);
+}
+
+static void list_deinit_cursor(struct worker_collection *workers)
+{
+	int *cursor = (int*)pthread_getspecific(workers->cursor_key);
+	*cursor = 0;
+	free(cursor);
+}
+
+struct worker_collection worker_list = {
+	.has_next = list_has_next,
+	.get_next = list_get_next,
+	.add = list_add,
+	.remove = list_remove,
+	.init = list_init,
+	.deinit = list_deinit,
+	.init_cursor = list_init_cursor,
+	.deinit_cursor = list_deinit_cursor,
+	.type = WORKER_LIST
+};
+

+ 1 - 1
starpu-1.0.pc.in

@@ -29,6 +29,6 @@ Name: starpu
 Description: offers support for heterogeneous multicore architecture
 Version: @PACKAGE_VERSION@
 Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_CPPFLAGS@
-Libs: -L${libdir} -lstarpu-@STARPU_EFFECTIVE_VERSION@ @STARPU_OPENCL_LDFLAGS@ @STARPU_CUDA_LDFLAGS@
+Libs: -L${libdir} -lstarpu-@STARPU_EFFECTIVE_VERSION@ @STARPU_OPENCL_LDFLAGS@ @STARPU_CUDA_LDFLAGS@ @STARPU_SCHED_CTX_HYPERVISOR@
 Libs.private: @LDFLAGS@ @LIBS@ @LIBSTARPU_LDFLAGS@
 Requires: @HWLOC_REQUIRES@

+ 39 - 0
tests/cholesky_ctxs/all_sched.sh

@@ -0,0 +1,39 @@
+#!/bin/bash                                                                     
+
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+# 
+# Copyright (C) 2011  INRIA
+# 
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+# 
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# 
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+#export STARPU_NCUDA=3
+#export STARPU_NCPUS=9
+#export STARPU_DIR=$HOME/sched_ctx/build
+
+#source sched.sh isole 0 0 0 
+#source sched_no_ctxs.sh
+source sched_no_ctxs.sh 1stchole -chole1
+source sched_no_ctxs.sh 2ndchole -chole2
+ 
+source sched_with_ctxs.sh isole 0 0 3 
+source sched_with_ctxs.sh isole 0 1 2
+source sched_with_ctxs.sh isole 0 2 1
+source sched_with_ctxs.sh isole 0 3 0   
+
+source sched_with_ctxs.sh 1gpu 1 0 2
+source sched_with_ctxs.sh 1gpu 1 1 1
+source sched_with_ctxs.sh 1gpu 1 2 0
+
+source sched_with_ctxs.sh 2gpu 2 1 0
+source sched_with_ctxs.sh 2gpu 2 0 1
+
+source sched_with_ctxs.sh 3gpu 3 0 0

+ 108 - 0
tests/cholesky_ctxs/comp.sh

@@ -0,0 +1,108 @@
+#!/bin/bash
+
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+# 
+# Copyright (C) 2011  INRIA
+# 
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+# 
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# 
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+
+infilename=$1
+outfilename=$2
+withctx=$3
+compute_effic=$4
+ninstr=$5
+best_gflops_withoutctxs=$6
+
+rm -rf $outfilename
+
+while read line
+do 
+    results=($line)
+
+    gflops1=0
+    gflops2=0
+
+    t1=0
+    t2=0
+
+    if [ $withctx -eq 1 ]
+    then
+	gpu=${results[0]}
+	gpu1=${results[1]}
+	gpu2=${results[2]}
+	ncpus1=${results[3]}
+	ncpus2=${results[4]}
+	gflops1=${results[5]}
+	gflops2=${results[6]}
+	t1=${results[7]}
+	t2=${results[8]}
+
+	maxtime=$(echo "$t1/$t2"|bc -l)
+	maxtime=${maxtime/.*}
+
+ 	if [ "$maxtime" == "" ]
+	then
+	    maxtime=$t2
+	else
+	    maxtime=$t1
+	fi
+
+	gflops=$(echo "$ninstr/$maxtime"|bc -l)
+	if [ $compute_effic -eq 1 ]
+	then
+	    gflops_norm=$(echo "$gflops/$best_gflops_withoutctxs"|bc -l)
+	    
+	    echo "$gpu $gpu1 $gpu2 $ncpus1 $ncpus2 `printf '%2.2f %2.2f' $gflops $gflops_norm`" >> $outfilename$gpu1$gpu2
+	else
+	    nres=$(echo "$gpu+$gpu1+$gpu2+$ncpus1+$ncpus2"|bc -l)
+	    best_gflops_rate=$(echo "$best_gflops_withoutctxs/$nres"|bc -l)
+
+	    gflop_rate=$(echo "$gflops/$nres"|bc -l)
+	    gflop_norm_rate=$(echo "$gflop_rate/$best_gflops_rate"|bc -l)
+	    
+	    echo "$ncpus1 $ncpus2 `printf '%2.2f %2.2f %2.2f' $gflops $gflop_rate $gflop_norm_rate`" >> $outfilename  
+	fi
+    else
+
+	nres=${results[0]}
+	gflops1=${results[1]}
+	gflops2=${results[2]}
+	t1=${results[3]}
+	t2=${results[4]}
+
+
+	maxtime=$(echo "$t1/$t2"|bc -l)
+	maxtime=${maxtime/.*}
+
+ 	if [ "$maxtime" == "" ]
+	then
+	    maxtime=$t2
+	else
+	    maxtime=$t1
+	fi
+
+	gflops=$(echo "$ninstr/$maxtime"|bc -l)
+
+	if [ $compute_effic -eq 1 ]
+	then
+	    echo "$nres `printf '%2.2f' $gflops`" >> $outfilename
+	else
+	    gflop_rate=$(echo "$gflops/$nres"|bc -l)
+	    echo "$nres `printf '%2.2f %2.2f' $gflops $gflop_rate`" >> $outfilename
+	fi
+	
+    fi
+
+
+done < $infilename
+

+ 57 - 0
tests/cholesky_ctxs/comp_all.sh

@@ -0,0 +1,57 @@
+#!/bin/bash
+
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+# 
+# Copyright (C) 2011  INRIA
+# 
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+# 
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# 
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+source all_sched.sh
+
+rm -rf res_*
+compute_effic=$1
+#for one matrix 20000 x 20000 and one of 10000 x 10000
+ninstr=2999999987712
+prefix=timings-sched
+
+source comp.sh $prefix/cholesky_no_ctxs res_cholesky_no_ctxs 0 $compute_effic $ninstr
+
+bestval_noctx=0
+while read line
+do 
+    results=($line)
+    val=$(echo "${results[1]}"|bc -l)
+    val=${val/.*}
+
+    if [ $val -gt $bestval_noctx ]
+    then
+	bestval_noctx=$(echo "$val"|bc -l)
+    fi
+done < res_cholesky_no_ctxs
+
+echo $bestval_noctx
+
+source comp.sh $prefix/isole res_isole 1 $compute_effic $ninstr $bestval_noctx
+
+#compute efficiency in a heterogeneous system
+#for the homogeneous one we can compute gflops rate per PU
+
+if [ $compute_effic -eq 1 ]
+then
+    source comp.sh $prefix/1gpu res_1gpu 1 $compute_effic $ninstr $bestval_noctx
+    source comp.sh $prefix/2gpu res_2gpu  1 $compute_effic $ninstr $bestval_noctx
+    source comp.sh $prefix/3gpu res_3gpu 1 $compute_effic $ninstr $bestval_noctx
+
+    source gnuplot_efficiency.sh efficiency
+else
+    source gnuplot_gflopsrate.sh gflopsrate
+fi

+ 58 - 0
tests/cholesky_ctxs/evaluate_expression.sh

@@ -0,0 +1,58 @@
+#!/bin/bash
+
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+# 
+# Copyright (C) 2011  INRIA
+# 
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+# 
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# 
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+nsamples=3
+
+BENCH_NAME=$1
+OPTIONS=$2
+filename=$3
+print_options=$4
+
+gflops1_avg=0
+gflops2_avg=0
+
+t1_avg=0
+t2_avg=0
+t_total_avg=0
+
+for s in `seq 1 $nsamples`
+do
+    echo "$ROOTDIR/examples/$BENCH_NAME $OPTIONS"
+    
+    val=`$ROOTDIR/examples/$BENCH_NAME $OPTIONS`
+    
+    echo "$val"
+    
+    results=($val)
+    
+    gflops1_avg=$(echo "$gflops1_avg+${results[0]}"|bc -l)
+    gflops2_avg=$(echo "$gflops2_avg+${results[1]}"|bc -l)
+    t1_avg=$(echo "$t1_avg+${results[2]}"|bc -l)
+    t2_avg=$(echo "$t2_avg+${results[3]}"|bc -l)
+    t_total_avg=$(echo "$t_total_avg+${results[4]}"|bc -l)
+    
+done
+
+gflops1_avg=$(echo "$gflops1_avg / $nsamples"|bc -l)
+gflops2_avg=$(echo "$gflops2_avg / $nsamples"|bc -l)
+t1_avg=$(echo "$t1_avg / $nsamples"|bc -l)
+t2_avg=$(echo "$t2_avg / $nsamples"|bc -l)
+t_total_avg=$(echo "$t_total_avg / $nsamples"|bc -l)
+
+
+echo "$print_options `printf '%2.2f %2.2f %2.2f %2.2f %2.2f' $gflops1_avg $gflops2_avg $t1_avg $t2_avg $t_total_avg`"
+echo "$print_options `printf '%2.2f %2.2f %2.2f %2.2f %2.2f' $gflops1_avg $gflops2_avg $t1_avg $t2_avg $t_total_avg`" >> $filename

File diff suppressed because it is too large
+ 70 - 0
tests/cholesky_ctxs/gnuplot_efficiency.sh


+ 44 - 0
tests/cholesky_ctxs/gnuplot_gflopsrate.sh

@@ -0,0 +1,44 @@
+#!/bin/bash
+
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+# 
+# Copyright (C) 2011  INRIA
+# 
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+# 
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# 
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+filename=$1
+
+gnuplot > /dev/null << EOF                                                
+set terminal postscript
+set output "| ps2pdf - $filename.pdf"
+                                                                   
+set datafile missing 'x'                                                  
+                                                                          
+set pointsize 0.75                                                        
+set title "Taux du debit per core normalise"
+set grid y                                                                
+set grid x                                                                
+set xrange [20:86]
+set yrange [0.6:1.5]
+
+#set logscale x                                                           
+set xtics ("20/76" 20,"30/66" 30,"40/56" 40, "50/46" 50, "60/36" 60, "70/26" 70, "80/16" 80, "86/10" 86)
+set key invert box right
+#set size 0.1
+
+set xlabel "Nombre de cpus dans le premier contexte / Nombre de cpus dans le deuxieme contexte"
+set ylabel "Efficacite per core"     
+
+                                         
+plot "res_isole" using 1:5 title 'Gflop rate per core' with lines lt rgb "blue" lw 2
+                                                                        
+EOF

+ 56 - 0
tests/cholesky_ctxs/sched_no_ctxs.sh

@@ -0,0 +1,56 @@
+#!/bin/bash
+
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+# 
+# Copyright (C) 2011  INRIA
+# 
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+# 
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# 
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+
+DIR=$PWD
+ROOTDIR=$DIR/../..
+TIMINGDIR=$DIR/timings-sched/$1
+mkdir -p $TIMINGDIR
+BENCH_NAME=cholesky/cholesky_implicit
+nsamples=5
+
+filename=$TIMINGDIR/cholesky_no_ctxs
+
+
+nmaxcpus=12
+nmincpus=1
+blocks1=40
+blocks2=40
+
+size1=20000
+size2=10000
+
+
+for j in `seq $nmincpus 1 $nmaxcpus`
+do
+    if [ $j -le 3 ]
+    then
+	export STARPU_NCUDA=$j
+    else
+	export STARPU_NCPUS=$(($j-3))
+    fi
+    
+    OPTIONS="$2 -with_noctxs -nblocks1 $blocks1 -size1 $size1 -nblocks2 $blocks2 -size2 $size2"
+
+    source evaluate_expression.sh "$BENCH_NAME" "$OPTIONS" "$filename" "$j"
+
+done
+    
+
+
+
+

+ 70 - 0
tests/cholesky_ctxs/sched_with_ctxs.sh

@@ -0,0 +1,70 @@
+#!/bin/bash
+
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+# 
+# Copyright (C) 2011  INRIA
+# 
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+# 
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# 
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+
+DIR=$PWD
+ROOTDIR=$DIR/../..
+TIMINGDIR=$DIR/timings-sched/
+mkdir -p $TIMINGDIR
+BENCH_NAME=cholesky/cholesky_implicit
+
+filename=$TIMINGDIR/$1
+
+gpu=$2
+gpu1=$3
+gpu2=$4
+
+nmaxcpus=$STARPU_NCPUS
+echo $nmaxcpus
+
+nmincpus1=1
+nmincpus2=1
+
+if [ $gpu1 -gt 0 ]
+then
+    nmincpus1=0
+fi
+
+if [ $gpu2 -gt 0 ]
+then
+    nmincpus2=0
+fi
+
+
+blocks1=40
+blocks2=40
+
+size1=20000
+size2=10000
+
+for j in `seq $nmincpus1 1 $(($nmaxcpus-1))`
+do
+    if [ $j -gt $(($nmaxcpus-$nmincpus2)) ]
+    then
+	break
+    fi
+
+    ncpus1=$j
+    ncpus2=$(($nmaxcpus-$j))    
+    
+    OPTIONS="-with_ctxs -nblocks1 $blocks1 -size1 $size1 -nblocks2 $blocks2 -size2 $size2 -gpu $gpu -gpu1 $gpu1 -gpu2 $gpu2 -cpu1 $ncpus1 -cpu2 $ncpus2"
+
+    source evaluate_expression.sh "$BENCH_NAME" "$OPTIONS" "$filename" "$gpu $gpu1 $gpu2 $ncpus1 $ncpus2"
+
+done
+
+

+ 0 - 0
tools/dev/experimental/test_return_values.sh


Some files were not shown because too many files changed in this diff