
mic (perfmodel): merge trunk + finalize perfmodel

Thibaud Lambert 12 years ago
commit e2b943451f
63 changed files with 1588 additions and 772 deletions
  1. Makefile.am (+1, -0)
  2. configure.ac (+7, -0)
  3. doc/doxygen/chapters/api/codelet_and_tasks.doxy (+7, -0)
  4. doc/doxygen/chapters/api/data_out_of_core.doxy (+18, -3)
  5. doc/doxygen/chapters/api/scheduling_context_hypervisor.doxy (+99, -52)
  6. doc/doxygen/chapters/api/scheduling_contexts.doxy (+35, -62)
  7. doc/doxygen/chapters/api/scheduling_policy.doxy (+0, -9)
  8. doc/doxygen/chapters/api/workers.doxy (+43, -0)
  9. doc/doxygen/chapters/environment_variables.doxy (+53, -0)
  10. doc/doxygen/chapters/scheduling_context_hypervisor.doxy (+24, -1)
  11. doc/doxygen/chapters/scheduling_contexts.doxy (+6, -5)
  12. include/starpu_disk.h (+1, -0)
  13. include/starpu_fxt.h (+2, -2)
  14. include/starpu_perfmodel.h (+20, -17)
  15. include/starpu_sched_ctx.h (+14, -34)
  16. include/starpu_sched_ctx_hypervisor.h (+50, -0)
  17. include/starpu_scheduler.h (+7, -7)
  18. include/starpu_worker.h (+2, -2)
  19. sc_hypervisor/examples/app_driven_test/app_driven_test.c (+2, -2)
  20. sc_hypervisor/examples/lp_test/lp_resize_test.c (+2, -2)
  21. sc_hypervisor/examples/lp_test/lp_test.c (+2, -2)
  22. sc_hypervisor/examples/sched_ctx_utils/sched_ctx_utils.c (+1, -1)
  23. sc_hypervisor/include/sc_hypervisor.h (+2, -8)
  24. sc_hypervisor/include/sc_hypervisor_config.h (+3, -7)
  25. sc_hypervisor/src/sc_config.c (+0, -13)
  26. sc_hypervisor/src/sc_hypervisor.c (+25, -3)
  27. sc_hypervisor/src/sc_hypervisor_intern.h (+14, -0)
  28. src/Makefile.am (+6, -2)
  29. src/core/combined_workers.c (+7, -14)
  30. src/core/detect_combined_workers.c (+10, -10)
  31. src/core/disk.h (+9, -0)
  32. src/core/disk_ops/disk_leveldb.cpp (+354, -0)
  33. src/core/jobs.h (+1, -1)
  34. src/core/perfmodel/perfmodel.c (+47, -28)
  35. src/core/perfmodel/perfmodel.h (+14, -4)
  36. src/core/perfmodel/perfmodel_history.c (+312, -289)
  37. src/core/perfmodel/perfmodel_print.c (+41, -16)
  38. src/core/sched_ctx.c (+2, -14)
  39. src/core/sched_ctx.h (+1, -0)
  40. src/core/topology.c (+10, -5)
  41. src/core/workers.h (+3, -3)
  42. src/datawizard/copy_driver.h (+10, -0)
  43. src/datawizard/footprint.c (+8, -4)
  44. src/datawizard/footprint.h (+1, -1)
  45. src/datawizard/memory_manager.h (+9, -0)
  46. src/drivers/cpu/driver_cpu.c (+4, -4)
  47. src/drivers/driver_common/driver_common.c (+3, -3)
  48. src/drivers/driver_common/driver_common.h (+2, -2)
  49. src/drivers/mic/driver_mic_common.h (+6, -5)
  50. src/drivers/mp_common/sink_common.c (+0, -5)
  51. src/drivers/mp_common/source_common.c (+2, -2)
  52. src/profiling/bound.c (+41, -15)
  53. src/sched_policies/deque_modeling_policy_data_aware.c (+4, -4)
  54. src/sched_policies/parallel_heft.c (+2, -2)
  55. src/sched_policies/random_policy.c (+1, -1)
  56. src/starpu_parameters.h (+1, -1)
  57. tests/microbenchs/async_tasks_overhead.c (+3, -0)
  58. tests/parallel_tasks/parallel_kernels.c (+1, -1)
  59. tests/perfmodels/feed.c (+13, -4)
  60. tests/perfmodels/valid_model.c (+13, -5)
  61. tests/sched_policies/simple_cpu_gpu_sched.c (+20, -11)
  62. tools/Makefile.am (+1, -1)
  63. tools/starpu_perfmodel_plot.c (+186, -83)

+ 1 - 0
Makefile.am

@@ -75,6 +75,7 @@ versinclude_HEADERS = 				\
 	include/starpu_bound.h			\
 	include/starpu_scheduler.h		\
 	include/starpu_sched_ctx.h		\
+	include/starpu_sched_ctx_hypervisor.h	\
 	include/starpu_top.h			\
 	include/starpu_deprecated_api.h         \
 	include/starpu_hash.h			\

+ 7 - 0
configure.ac

@@ -1578,6 +1578,13 @@ AC_MSG_RESULT($maximplementations)
 AC_DEFINE_UNQUOTED(STARPU_MAXIMPLEMENTATIONS, [$maximplementations],
 		[maximum number of implementations])
 
+AC_LANG_PUSH([C++])
+AC_CHECK_HEADERS([leveldb/db.h], [AC_DEFINE([STARPU_HAVE_LEVELDB], [1], [Define to 1 if you have the <leveldb/db.h> header file.])])
+STARPU_HAVE_LIBRARY(LEVELDB, [leveldb])
+AM_CONDITIONAL(STARPU_HAVE_LEVELDB, test "x$ac_cv_lib_leveldb_main" = "xyes")
+AC_LANG_POP([C++])
+
+
 ###############################################################################
 #                                                                             #
 #                                    MPI                                      #

+ 7 - 0
doc/doxygen/chapters/api/codelet_and_tasks.doxy

@@ -649,6 +649,13 @@ starpu_task_submit() can be called from anywhere, including codelet
 functions and callbacks, provided that the field
 starpu_task::synchronous is set to 0.
 
+\fn int starpu_task_submit_to_ctx(struct starpu_task *task, unsigned sched_ctx_id)
+\ingroup API_Codelet_And_Tasks
+This function submits a task to StarPU to the context <c> sched_ctx_id </c>.
+By default starpu_task_submit submits the task to a global context that is
+created automatically by StarPU.
+
+
 \fn int starpu_task_wait_for_all(void)
 \ingroup API_Codelet_And_Tasks
 This function blocks until all the tasks that were submitted

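The new starpu_task_submit_to_ctx() entry point documented above takes the target scheduling context explicitly. A minimal sketch (not part of this commit's code) of how it might be used, assuming an existing codelet and a context created beforehand with starpu_sched_ctx_create():

#include <starpu.h>

/* Sketch only: submit a task to a specific scheduling context.
 * "cl" and "sched_ctx_id" are assumed to be provided by the caller. */
static int submit_to_context(struct starpu_codelet *cl, unsigned sched_ctx_id)
{
	struct starpu_task *task = starpu_task_create();
	task->cl = cl;
	task->synchronous = 0;	/* required when submitting from callbacks/codelets */
	return starpu_task_submit_to_ctx(task, sched_ctx_id);
}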
+ 18 - 3
doc/doxygen/chapters/api/data_out_of_core.doxy

@@ -11,17 +11,25 @@
 \ingroup API_Out_Of_Core
 This is a set of functions to manipulate datas on disk.
 \var starpu_disk_ops::alloc
+Create a new location for datas
 \var starpu_disk_ops::free
+Free an allocated data
 \var starpu_disk_ops::open
-open an existing file
+Open an existing location of datas
 \var starpu_disk_ops::close
+Close without delete a location of datas
 \var starpu_disk_ops::read
-~= pread
+Read a data
 \var starpu_disk_ops::write
+Write a data
 \var starpu_disk_ops::plug
+Connect a disk memory
 \var starpu_disk_ops::unplug
+Disconnect a disk memory
 \var starpu_disk_ops::copy
+Copy disk to disk
 \var starpu_disk_ops::bandwidth
+Measue the bandwidth and the latency for the disk
 
 \fn int starpu_disk_register(struct starpu_disk_ops *func, void *parameter, size_t size)
 \ingroup API_Out_Of_Core
@@ -43,6 +51,7 @@ Close an existing file memory opened with starpu_disk_open.
 \ingroup API_Out_Of_Core
 This set uses the stdio library (fwrite, fread...) to read/write on disk. <br />
 <strong>Warning: It creates one file per allocation !</strong>  <br />
+It doesn't support asynchronous transfers.
 
 \var starpu_disk_unistd_ops
 \ingroup API_Out_Of_Core
@@ -53,6 +62,12 @@ This set uses the unistd library (write, read...) to read/write on disk. <br />
 \ingroup API_Out_Of_Core
 This set uses the unistd library (write, read...) to read/write on disk with the O_DIRECT flag. <br />
 <strong>Warning: It creates one file per allocation !</strong>  <br />
-Only available on Linux.
+Only available on Linux systems.
+
+\var starpu_disk_leveldb_ops
+\ingroup API_Out_Of_Core
+This set uses the leveldb created by Google <br />
+Show here: https://code.google.com/p/leveldb/ <br />
+It doesn't support asynchronous transfers.
 
 */

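With the leveldb driver declared by this commit, an out-of-core memory node can be registered through starpu_disk_register() like the other drivers. A minimal sketch, assuming the second argument is the on-disk location (as for the stdio/unistd drivers) and the third argument the amount of disk space StarPU may use:

#include <starpu.h>
#include <starpu_disk.h>

/* Sketch only: register a memory node backed by the new leveldb driver.
 * The path and the 1 GiB budget below are illustrative values. */
void register_leveldb_disk(void)
{
	int node = starpu_disk_register(&starpu_disk_leveldb_ops,
	                                (void *) "/tmp/starpu_leveldb",
	                                1024*1024*1024);
	/* 'node' can then be used by StarPU as an eviction target */
	(void) node;
}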
+ 99 - 52
doc/doxygen/chapters/api/scheduling_context_hypervisor.doxy

@@ -2,7 +2,7 @@
 * This file is part of the StarPU Handbook.
 * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
 * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
- * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
+ * Copyright (C) 2011, 2012, 2013 Institut National de Recherche en Informatique et Automatique
 * See the file version.doxy for copying conditions.
 */
 
@@ -15,16 +15,26 @@ This structure contains all the methods that implement a hypervisor resizing pol
         Indicates the name of the policy, if there is not a custom policy, the policy corresponding to this name will be used by the hypervisor
 \var sc_hypervisor_policy::custom
         Indicates whether the policy is custom or not
+\var sc_hypervisor_policy::size_ctxs
+	Distribute workers to contexts even at the begining of the program
+\var sc_hypervisor_policy::resize_ctxs
+	Require explicit resizing
 \var sc_hypervisor_policy::handle_idle_cycle
         It is called whenever the indicated worker executes another idle cycle in sched_ctx
 \var sc_hypervisor_policy::handle_pushed_task
         It is called whenever a task is pushed on the worker’s queue corresponding to the context sched_ctx
 \var sc_hypervisor_policy::handle_poped_task
         It is called whenever a task is poped from the worker’s queue corresponding to the context sched_ctx
+\var sc_hypervisor_policy::handle_poped_task
+	The hypervisor takes a decision when another task was poped from this worker in this ctx
 \var sc_hypervisor_policy::handle_idle_end
         It is called whenever a task is executed on the indicated worker and context after a long period of idle time
 \var sc_hypervisor_policy::handle_post_exec_hook
         It is called whenever a tag task has just been executed. The table of resize requests is provided as well as the tag
+\var sc_hypervisor_policy::handle_submitted_job
+	The hypervisor takes a decision when a job was submitted in this ctx
+\var sc_hypervisor_policy::end_ctx
+	The hypervisor takes a decision when a certain ctx was deleted
 
 \struct sc_hypervisor_policy_config
 \ingroup API_Scheduling_Context_Hypervisor
@@ -41,10 +51,18 @@ can be used to construct new resize strategies.
         Indicates the priority of each worker in the context
 \var sc_hypervisor_policy_config::max_idle
         Indicates the maximum idle time accepted before a resize is triggered
+\var sc_hypervisor_policy_config::min_working
+	Indicates that underneath this limit the priority of the worker is reduced
 \var sc_hypervisor_policy_config::fixed_workers
         Indicates which workers can be moved and which ones are fixed
 \var sc_hypervisor_policy_config:: new_workers_max_idle
         Indicates the maximum idle time accepted before a resize is triggered for the workers that just arrived in the new context
+\var sc_hypervisor_policy_config::empty_ctx_max_idle
+         Indicates that above this context we allow removing all workers
+\var sc_hypervisor_policy_config::ispeed_w_sample
+         Indicates the sample used to compute the instant speed per worker
+\var sc_hypervisor_policy_config::ispeed_ctx_sample
+        Indicates the sample used to compute the instant speed per ctxs
 
 \struct sc_hypervisor_wrapper
 \ingroup API_Scheduling_Context_Hypervisor
@@ -102,9 +120,8 @@ Number of tasks of this kind
 \var sc_hypervisor_policy_task_pool::next
 Other task kinds
 
-@name Managing the hypervisor
+\fn void *sc_hypervisor_init(struct sc_hypervisor_policy *policy)
 \ingroup API_Scheduling_Context_Hypervisor
-
 There is a single hypervisor that is in charge of resizing contexts
 and the resizing strategy is chosen at the initialization of the
 hypervisor. A single resize can be done at a time.
@@ -114,9 +131,7 @@ performance counters to StarPU. By incrementing them, StarPU can help
 the hypervisor in the resizing decision making process. TODO maybe
 they should be hidden to the user
 
-\fn struct starpu_sched_ctx_performance_counters *sc_hypervisor_init(struct sc_hypervisor_policy *policy)
-\ingroup API_Scheduling_Context_Hypervisor
-Initializes the hypervisor to use the strategy provided as parameter
+This function initializes the hypervisor to use the strategy provided as parameter
 and creates the performance counters (see starpu_sched_ctx_performance_counters).
 These performance counters represent actually some callbacks that will
 be used by the contexts to notify the information needed by the
@@ -133,50 +148,88 @@ no synchronization between this function and starpu_shutdown(). Thus,
 this should be called after starpu_shutdown(), because the performance
 counters will still need allocated callback functions.
 
-@name Registering Scheduling Contexts to the hypervisor
-\ingroup API_Scheduling_Context_Hypervisor
-
-Scheduling Contexts that have to be resized by the hypervisor must be
-first registered to the hypervisor. Whenever we want to exclude
-contexts from the resizing process we have to unregister them from the
-hypervisor.
-
 \fn void sc_hypervisor_register_ctx(unsigned sched_ctx, double total_flops)
 \ingroup API_Scheduling_Context_Hypervisor
-Register the context to the hypervisor, and indicate the number of
+Scheduling Contexts that have to be resized by the hypervisor must be
+first registered to the hypervisor. 
+This function registers the context to the hypervisor, and indicate the number of
 flops the context will execute (needed for Gflops rate based strategy
 see \ref ResizingStrategies or any other custom strategy needing it, for
 the others we can pass 0.0)
 
 \fn void sc_hypervisor_unregister_ctx(unsigned sched_ctx)
 \ingroup API_Scheduling_Context_Hypervisor
-Unregister the context from the hypervisor.
+Whenever we want to exclude
+contexts from the resizing process we have to unregister them from the
+hypervisor.
+
+\fn void sc_hypervisor_post_resize_request(unsigned sched_ctx, int task_tag)
+\ingroup API_Scheduling_Context_Hypervisor
+Requires resizing the context \p sched_ctx whenever a task tagged with the id \p task_tag
+finished executing 
 
-@name Users’ Input In The Resizing Process
-\anchor UsersInputInTheResizingProcess
+\fn void sc_hypervisor_resize_ctxs(int *sched_ctxs, int nsched_ctxs , int *workers, int nworkers)
 \ingroup API_Scheduling_Context_Hypervisor
+Requires reconsidering the distribution of ressources over the indicated scheduling contexts 
 
+\fn void sc_hypervisor_stop_resize(unsigned sched_ctx)
+\ingroup API_Scheduling_Context_Hypervisor
 The user can totally forbid the resizing of a certain context or can
 then change his mind and allow it (in this case the resizing is
 managed by the hypervisor, that can forbid it or allow it)
 
-\fn void sc_hypervisor_stop_resize(unsigned sched_ctx)
-\ingroup API_Scheduling_Context_Hypervisor
-Forbid resizing of a context
-
 \fn void sc_hypervisor_start_resize(unsigned sched_ctx)
 \ingroup API_Scheduling_Context_Hypervisor
 Allow resizing of a context. The user can then provide information to
 the hypervisor concerning the conditions of resizing.
 
-\fn void sc_hypervisor_post_resize_request(unsigned sched_ctx, int task_tag)
+\fn char *sc_hypervisor_get_policy();
 \ingroup API_Scheduling_Context_Hypervisor
-Requires resizing the context \p sched_ctx whenever a task tagged with the id \p task_tag
-finished executing 
+Returns the name of the resizing policy the hypervisor uses
 
-\fn void sc_hypervisor_resize_ctxs(int *sched_ctxs, int nsched_ctxs , int *workers, int nworkers)
+\fn void sc_hypervisor_add_workers_to_sched_ctx(int* workers_to_add, unsigned nworkers_to_add, unsigned sched_ctx)
 \ingroup API_Scheduling_Context_Hypervisor
-Requires reconsidering the distribution of ressources over the indicated scheduling contexts 
+Ask the hypervisor to add workers to a sched_ctx 
+
+\fn void sc_hypervisor_remove_workers_from_sched_ctx(int* workers_to_remove, unsigned nworkers_to_remove, unsigned sched_ctx, unsigned now)
+\ingroup API_Scheduling_Context_Hypervisor
+Ask the hypervisor to remove workers from a sched_ctx 
+
+\fn void sc_hypervisor_move_workers(unsigned sender_sched_ctx, unsigned receiver_sched_ctx, int *workers_to_move, unsigned nworkers_to_move, unsigned now)
+\ingroup API_Scheduling_Context_Hypervisor
+Moves workers from one context to another
+   
+\fn void sc_hypervisor_size_ctxs(unsigned *sched_ctxs, int nsched_ctxs, int *workers, int nworkers)
+\ingroup API_Scheduling_Context_Hypervisor
+Ask the hypervisor to chose a distribution of workers in the required contexts
+   
+\fn unsigned sc_hypervisor_get_size_req(unsigned **sched_ctxs, int* nsched_ctxs, int **workers, int *nworkers)
+\ingroup API_Scheduling_Context_Hypervisor
+Check if there are pending demands of resizing
+
+\fn void sc_hypervisor_save_size_req(unsigned *sched_ctxs, int nsched_ctxs, int *workers, int nworkers)
+\ingroup API_Scheduling_Context_Hypervisor
+Save a demand of resizing
+
+\fn void sc_hypervisor_free_size_req(void)
+\ingroup API_Scheduling_Context_Hypervisor
+Clear the list of pending demands of resizing
+
+\fn unsigned sc_hypervisor_can_resize(unsigned sched_ctx)
+\ingroup API_Scheduling_Context_Hypervisor
+Check out if a context can be resized
+
+\fn void sc_hypervisor_set_type_of_task(struct starpu_codelet *cl, unsigned sched_ctx, uint32_t footprint, size_t data_size)
+\ingroup API_Scheduling_Context_Hypervisor
+Indicate the types of tasks a context will execute in order to better decide the sizing of ctxs
+
+\fn void sc_hypervisor_update_diff_total_flops(unsigned sched_ctx, double diff_total_flops)
+\ingroup API_Scheduling_Context_Hypervisor
+Change dynamically the total number of flops of a context, move the deadline of the finishing time of the context
+
+\fn void sc_hypervisor_update_diff_elapsed_flops(unsigned sched_ctx, double diff_task_flops)
+\ingroup API_Scheduling_Context_Hypervisor
+Change dynamically the number of the elapsed flops in a context, modify the past in order to better compute the speed 
 
 
 \fn void sc_hypervisor_ctl(unsigned sched_ctx, ...)
 \ingroup API_Scheduling_Context_Hypervisor
@@ -250,36 +303,33 @@ This macro is used when calling sc_hypervisor_ctl() and must be
 followed by 1 argument (int) indicating the tag an executed task
 should have such that this configuration should be taken into account.
 
-@name Defining a new hypervisor policy
-\ingroup API_Scheduling_Context_Hypervisor
 
-While Scheduling Context Hypervisor Plugin comes with a variety of
-resizing policies (see \ref ResizingStrategies), it may sometimes be
-desirable to implement custom policies to address specific problems.
-The API described below allows users to write their own resizing policy.
+\def SC_HYPERVISOR_ISPEED_W_SAMPLE
+\ingroup API_Scheduling_Context_Hypervisor
+This macro is used when calling sc_hypervisor_ctl() and must be
+followed by 1 argument, a double, that indicates the number of flops
+needed to be executed before computing the speed of a worker
 
-Here an example of how to define a new policy
+\def SC_HYPERVISOR_ISPEED_CTX_SAMPLE
+\ingroup API_Scheduling_Context_Hypervisor
+This macro is used when calling sc_hypervisor_ctl() and must be
+followed by 1 argument, a double, that indicates the number of flops
+needed to be executed before computing the speed of a context
 
-\code{.c}
-struct sc_hypervisor_policy dummy_policy =
-{
-       .handle_poped_task = dummy_handle_poped_task,
-       .handle_pushed_task = dummy_handle_pushed_task,
-       .handle_idle_cycle = dummy_handle_idle_cycle,
-       .handle_idle_end = dummy_handle_idle_end,
-       .handle_post_exec_hook = dummy_handle_post_exec_hook,
-       .custom = 1,
-       .name = "dummy"
-};
-\endcode
 
-\fn void sc_hypervisor_move_workers(unsigned sender_sched_ctx, unsigned receiver_sched_ctx, int *workers_to_move, unsigned nworkers_to_move, unsigned now);
+\def SC_HYPERVISOR_NULL
 \ingroup API_Scheduling_Context_Hypervisor
-    Moves workers from one context to another
+This macro is used when calling sc_hypervisor_ctl() and must be
+followed by 1 arguments
+
 
 \fn struct sc_hypervisor_policy_config *sc_hypervisor_get_config(unsigned sched_ctx);
 \ingroup API_Scheduling_Context_Hypervisor
-    Returns the configuration structure of a context
+Returns the configuration structure of a context
+
+\fn void sc_hypervisor_set_config(unsigned sched_ctx, void *config);
+\ingroup API_Scheduling_Context_Hypervisor
+Set a certain configuration to a contexts
 
 \fn int *sc_hypervisor_get_sched_ctxs();
 \ingroup API_Scheduling_Context_Hypervisor
@@ -297,8 +347,5 @@ struct sc_hypervisor_policy dummy_policy =
 \ingroup API_Scheduling_Context_Hypervisor
     Returns the flops of a context elapsed from the last resize
 
-\fn char *sc_hypervisor_get_policy();
-\ingroup API_Scheduling_Context_Hypervisor
-    Returns the name of the resizing policy the hypervisor uses
 
 */
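To make the registration and resize-request calls above concrete, here is a minimal sketch, assuming the <sc_hypervisor.h> header from the plugin and reusing the dummy_policy structure shown in the hypervisor chapter; the task tag value is illustrative:

#include <sc_hypervisor.h>

/* defined by the application, as in the dummy policy example */
extern struct sc_hypervisor_policy dummy_policy;

void attach_context_to_hypervisor(unsigned sched_ctx)
{
	sc_hypervisor_init(&dummy_policy);                /* choose the resizing strategy   */
	sc_hypervisor_register_ctx(sched_ctx, 0.0);       /* 0.0: no flop estimate provided */
	sc_hypervisor_post_resize_request(sched_ctx, 42); /* resize after task tagged 42    */
}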

+ 35 - 62
doc/doxygen/chapters/api/scheduling_contexts.doxy

@@ -16,47 +16,6 @@ starpu tasks to them and we schedule them with the policy assigned to
 the context. Scheduling contexts can be created, deleted and modified
 dynamically.
 
-\enum starpu_worker_collection_type
-\ingroup API_Scheduling_Contexts
-types of structures the worker collection can implement
-\var starpu_worker_collection_type::STARPU_WORKER_LIST
-\ingroup API_Scheduling_Contexts
-List of workers
-
-\struct starpu_sched_ctx_iterator
-\ingroup API_Scheduling_Contexts
-todo
-\var starpu_sched_ctx_iterator::cursor
-todo
-
-\struct starpu_worker_collection
-\ingroup API_Scheduling_Contexts
-A scheduling context manages a collection of workers that can
-be memorized using different data structures. Thus, a generic
-structure is available in order to simplify the choice of its type.
-Only the list data structure is available but further data
-structures(like tree) implementations are foreseen.
-\var starpu_worker_collection::workerids
-        The workerids managed by the collection
-\var starpu_worker_collection::nworkers
-        The number of workers in the collection
-\var starpu_worker_collection::type
-        The type of structure (currently ::STARPU_WORKER_LIST is the only one available)
-\var starpu_worker_collection::has_next
-        Checks if there is another element in collection
-\var starpu_worker_collection::get_next
-        return the next element in the collection
-\var starpu_worker_collection::add
-        add a new element in the collection
-\var starpu_worker_collection::remove
-        remove an element from the collection
-\var starpu_worker_collection::init
-        Initialize the collection
-\var starpu_worker_collection::deinit
-        Deinitialize the colection
-\var starpu_worker_collection::init_iterator
-        Initialize the cursor if there is one
-
 \struct starpu_sched_ctx_performance_counters
 Performance counters used by the starpu to indicate the
 hypervisor how the application and the resources are executing.
@@ -66,11 +25,16 @@ hypervisor how the application and the resources are executing.
 \var starpu_sched_ctx_performance_counters::notify_idle_end
         Informs the hypervisor that after a period of idle, the worker has just executed a task in the specified context. The idle counter it though reset.
 \var starpu_sched_ctx_performance_counters::notify_pushed_task
-        Notifies the hypervisor a task has been scheduled on the queue of the worker corresponding to the specified context
+        Notifies the hypervisor that a task has been scheduled on the queue of the worker corresponding to the specified context
 \var starpu_sched_ctx_performance_counters::notify_poped_task
-        Informs the hypervisor a task executing a specified number of instructions has been poped from the worker
+        Informs the hypervisor that a task executing a specified number of instructions has been poped from the worker
 \var starpu_sched_ctx_performance_counters::notify_post_exec_hook
-        Notifies the hypervisor a task has just been executed
+        Notifies the hypervisor that a task has just been executed
+\var starpu_sched_ctx_performance_counters::notify_submitted_job
+        Notifies the hypervisor that a task has just been submitted
+\var starpu_sched_ctx_performance_counters::notify_delete_context
+        Notifies the hypervisor that the context was deleted
+
 
 @name Scheduling Contexts Basic API
 \ingroup API_Scheduling_Contexts
@@ -99,11 +63,6 @@ tasks will be submitted to. The return value should be at most
 \ingroup API_Scheduling_Contexts
 Create a context indicating an approximate interval of resources
 
-\fn void starpu_sched_ctx_delete(unsigned sched_ctx_id)
-\ingroup API_Scheduling_Contexts
-Delete scheduling context \p sched_ctx_id and transfer remaining
-workers to the inheritor scheduling context.
-
 \fn void starpu_sched_ctx_add_workers(int *workerids_ctx, int nworkers_ctx, unsigned sched_ctx_id)
 \ingroup API_Scheduling_Contexts
 This function adds dynamically the workers in \p workerids_ctx to the
@@ -116,6 +75,11 @@ This function removes the workers in \p workerids_ctx from the context
 \p sched_ctx_id. The last argument cannot be greater than
 STARPU_NMAX_SCHED_CTXS.
 
+\fn void starpu_sched_ctx_delete(unsigned sched_ctx_id)
+\ingroup API_Scheduling_Contexts
+Delete scheduling context \p sched_ctx_id and transfer remaining
+workers to the inheritor scheduling context.
+
 \fn void starpu_sched_ctx_set_inheritor(unsigned sched_ctx_id, unsigned inheritor)
 \ingroup API_Scheduling_Contexts
 Indicate which context whill inherit the resources of this context
@@ -134,12 +98,18 @@ Return the scheduling context the tasks are currently submitted to
 Stop submitting tasks from the empty context list until the next time
 the context has time to check the empty context list
 
-\fn void starpu_sched_ctx_finished_submit(unsigned sched_ctx_id);
+\fn void starpu_sched_ctx_finished_submit(unsigned sched_ctx_id)
 \ingroup API_Scheduling_Contexts
 Indicate starpu that the application finished submitting to this
 context in order to move the workers to the inheritor as soon as
 possible.
 
+\fn unsigned starpu_sched_ctx_get_workers_list(unsigned sched_ctx_id, int **workerids)
+\ingroup API_Scheduling_Contexts
+Returns the list of workers in the array \p workerids, the returned value is the 
+number of workers. The user should free the \p workerids table after finishing
+using it (it is allocated inside the function with the proper size)
+
 \fn unsigned starpu_sched_ctx_get_nworkers(unsigned sched_ctx_id)
 \ingroup API_Scheduling_Contexts
 Return the number of workers managed by the specified contexts
@@ -169,11 +139,6 @@ Manage sharing of resources between contexts: by default a round_robin
 strategy is executed but the user can interfere to tell which ctx has
 its turn to pop.
 
-\fn double starpu_sched_ctx_get_max_time_worker_on_ctx(void)
-\ingroup API_Scheduling_Contexts
-Time sharing a resources, indicate how long a worker has been active
-in the current sched_ctx.
-
 @name Scheduling Context Priorities
 \ingroup API_Scheduling_Contexts
 
@@ -235,16 +200,10 @@ Delete the worker collection of the specified scheduling context
 \ingroup API_Scheduling_Contexts
 Return the worker collection managed by the indicated context
 
-\fn unsigned starpu_sched_ctx_get_workers_list(unsigned sched_ctx_id, int **workerids)
-\ingroup API_Scheduling_Contexts
-Returns the list of workers in the array \p workerids, the returned value is the 
-number of workers. The user should free the \p workerids table after finishing
-using it (it is allocated inside the function with the proper size)
-
 @name Scheduling Context Link with Hypervisor
 \ingroup API_Scheduling_Contexts
 
-\fn void starpu_sched_ctx_set_perf_counters(unsigned sched_ctx_id, struct starpu_sched_ctx_performance_counters *perf_counters)
+\fn void starpu_sched_ctx_set_perf_counters(unsigned sched_ctx_id, void *perf_counters)
 \ingroup API_Scheduling_Contexts
 Indicates to starpu the pointer to the performance counter
 
@@ -261,4 +220,18 @@ Allow the hypervisor to let starpu know he's initialised
 \ingroup API_Scheduling_Contexts
 Ask starpu if he is informed if the hypervisor is initialised
 
+\fn void starpu_sched_ctx_set_policy_data(unsigned sched_ctx_id, void *policy_data)
+\ingroup API_Scheduling_Contexts
+Allocate the scheduling policy data (private information of the scheduler like queues, variables,
+additional condition variables) the context
+
+\fn void *starpu_sched_ctx_get_policy_data(unsigned sched_ctx_id)
+\ingroup API_Scheduling_Contexts
+Return the scheduling policy data (private information of the scheduler) of the contexts previously 
+assigned to.
+
+\fn void *starpu_sched_ctx_exec_parallel_code(void* (*func)(void*), void *param, unsigned sched_ctx_id)
+\ingroup API_Scheduling_Contexts
+execute any parallel code on the workers of the sched_ctx (workers are blocked)
+
 */
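The ownership rule of starpu_sched_ctx_get_workers_list() (the array is allocated by StarPU and must be freed by the caller) can be illustrated with a short sketch; the helper name print_ctx_workers is hypothetical:

#include <stdio.h>
#include <stdlib.h>
#include <starpu.h>

/* Sketch only: list the workers of a context, then release the array. */
static void print_ctx_workers(unsigned sched_ctx_id)
{
	int *workerids;
	unsigned n = starpu_sched_ctx_get_workers_list(sched_ctx_id, &workerids);
	unsigned i;
	for (i = 0; i < n; i++)
		printf("context %u uses worker %d\n", sched_ctx_id, workerids[i]);
	free(workerids);	/* allocated inside the function with the proper size */
}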

+ 0 - 9
doc/doxygen/chapters/api/scheduling_policy.doxy

@@ -73,15 +73,6 @@ condition variable. For instance, in the case of a scheduling strategy
 with a single task queue, the same condition variable would be used to
 block and wake up all workers.
 
-\fn void starpu_sched_ctx_set_policy_data(unsigned sched_ctx_id, void *policy_data)
-\ingroup API_Scheduling_Policy
-Each scheduling policy uses some specific data (queues, variables,
-additional condition variables). It is memorize through a local
-structure. This function assigns it to a scheduling context.
-
-\fn void *starpu_sched_ctx_get_policy_data(unsigned sched_ctx_id)
-\ingroup API_Scheduling_Policy
-Returns the policy data previously assigned to a context
 
 \fn int starpu_sched_set_min_priority(int min_prio)
 \ingroup API_Scheduling_Policy

+ 43 - 0
doc/doxygen/chapters/api/workers.doxy

@@ -62,6 +62,49 @@ Intel MIC device
 Intel SCC device
 
 
+\struct starpu_worker_collection
+\ingroup API_Workers_Properties
+A scheduling context manages a collection of workers that can
+be memorized using different data structures. Thus, a generic
+structure is available in order to simplify the choice of its type.
+Only the list data structure is available but further data
+structures(like tree) implementations are foreseen.
+\var starpu_worker_collection::workerids
+        The workerids managed by the collection
+\var starpu_worker_collection::nworkers
+        The number of workers in the collection
+\var starpu_worker_collection::type
+        The type of structure (currently ::STARPU_WORKER_LIST is the only one available)
+\var starpu_worker_collection::has_next
+        Checks if there is another element in collection
+\var starpu_worker_collection::get_next
+        return the next element in the collection
+\var starpu_worker_collection::add
+        add a new element in the collection
+\var starpu_worker_collection::remove
+        remove an element from the collection
+\var starpu_worker_collection::init
+        Initialize the collection
+\var starpu_worker_collection::deinit
+        Deinitialize the colection
+\var starpu_worker_collection::init_iterator
+        Initialize the cursor if there is one
+
+\enum starpu_worker_collection_type
+\ingroup API_Workers_Properties
+Types of structures the worker collection can implement
+\var starpu_worker_collection_type::STARPU_WORKER_LIST
+\ingroup API_Workers_Properties
+The collection is an array
+
+\struct starpu_sched_ctx_iterator
+\ingroup API_Workers_Properties
+Structure needed to iterate on the collection
+\var starpu_sched_ctx_iterator::cursor
+The index of the current worker in the collection, needed when iterating on
+the collection.
+
+
 \fn unsigned starpu_worker_get_count(void)
 \ingroup API_Workers_Properties
 This function returns the number of workers (i.e. processing
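A typical traversal of such a collection, for instance the one returned by starpu_sched_ctx_get_worker_collection(), could look like the sketch below; the exact callback signatures are not shown in this diff, so the usual StarPU convention (collection plus iterator passed to each callback) is assumed:

#include <starpu.h>

/* Sketch only: iterate over the workers of a scheduling context. */
static void visit_workers(unsigned sched_ctx_id)
{
	struct starpu_worker_collection *workers =
		starpu_sched_ctx_get_worker_collection(sched_ctx_id);
	struct starpu_sched_ctx_iterator it;

	if (workers->init_iterator)
		workers->init_iterator(workers, &it);
	while (workers->has_next(workers, &it))
	{
		int workerid = workers->get_next(workers, &it);
		/* inspect or schedule on workerid here */
		(void) workerid;
	}
}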

+ 53 - 0
doc/doxygen/chapters/environment_variables.doxy

@@ -550,4 +550,57 @@ end of the execution of an application (\ref DataStatistics).
 
 
 </dl>
 
+\section ConfiguringTheHypervisor Configuring The Hypervisor
+
+<dl>
+
+<dt>SC_HYPERVISOR_POLICY</dt>
+<dd>
+\anchor SC_HYPERVISOR_POLICY
+\addindex __env__SC_HYPERVISOR_POLICY
+Choose between the different resizing policies proposed by StarPU for the hypervisor: 
+idle, app_driven, feft_lp, teft_lp; ispeed_lp, throughput_lp etc.
+
+Use <c>SC_HYPERVISOR_POLICY=help</c> to get the list of available policies for the hypervisor
+</dd>
+
+<dt>SC_HYPERVISOR_TRIGGER_RESIZE</dt>
+<dd>
+\anchor SC_HYPERVISOR_TRIGGER_RESIZE
+\addindex __env__SC_HYPERVISOR_TRIGGER_RESIZE
+Choose how should the hypervisor be triggered: <c>speed</c> if the resizing algorithm should
+be called whenever the speed of the context does not correspond to an optimal precomputed value,
+<c>idle</c> it the resizing algorithm should be called whenever the workers are idle for a period
+longer than the value indicated when configuring the hypervisor.
+</dd>
+
+<dt>SC_HYPERVISOR_START_RESIZE</dt>
+<dd>
+\anchor SC_HYPERVISOR_START_RESIZE
+\addindex __env__SC_HYPERVISOR_START_RESIZE
+Indicate the moment when the resizing should be available. The value correspond to the percentage
+of the total time of execution of the application. The default value is the resizing frame.
+</dd>
+
+<dt>SC_HYPERVISOR_MAX_SPEED_GAP</dt>
+<dd>
+\anchor SC_HYPERVISOR_MAX_SPEED_GAP
+\addindex __env__SC_HYPERVISOR_MAX_SPEED_GAP
+Indicate the ratio of speed difference between contexts that should trigger the hypervisor.
+This situation may occur only when a theoretical speed could not be computed and the hypervisor
+has no value to compare the speed to. Otherwise the resizing of a context is not influenced by the 
+the speed of the other contexts, but only by the the value that a context should have.
+</dd>
+
+<dt>SC_HYPERVISOR_STOP_PRINT</dt>
+<dd>
+\anchor SC_HYPERVISOR_STOP_PRINT
+\addindex __env__SC_HYPERVISOR_STOP_PRINT
+By default the values of the speed of the workers is printed during the execution
+of the application. If the value 1 is given to this environment variable this printing
+is not done.
+
+</dd>
+
+</dl>
 */
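These hypervisor variables are ordinary environment variables, so besides setting them in the shell they can also be exported programmatically before starpu_init(); a minimal sketch with illustrative values:

#include <stdlib.h>
#include <starpu.h>

int main(void)
{
	/* Sketch only: pick the resizing policy and trigger mode before init. */
	setenv("SC_HYPERVISOR_POLICY", "idle", 1);
	setenv("SC_HYPERVISOR_TRIGGER_RESIZE", "speed", 1);

	if (starpu_init(NULL) != 0)
		return 1;
	/* ... create contexts, submit tasks ... */
	starpu_shutdown();
	return 0;
}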

+ 24 - 1
doc/doxygen/chapters/scheduling_context_hypervisor.doxy

@@ -11,7 +11,7 @@
 \section WhatIsTheHypervisor What Is The Hypervisor
 
 StarPU proposes a platform to construct Scheduling Contexts, to
-deleting and modify them dynamically. A parallel kernel, can thus
+delete and modify them dynamically. A parallel kernel, can thus
 be isolated into a scheduling context and interferences between
 several parallel kernels are avoided. If the user knows exactly how
 many workers each scheduling context needs, he can assign them to the
@@ -192,4 +192,27 @@ The <b>Throughput </b> strategy focuses on maximizing the throughput of the reso
 and resizes the contexts such that the machine is running at its maximum efficiency
 (maximum instant speed of the workers).
 
+\section  Defining a new hypervisor policy
+
+While Scheduling Context Hypervisor Plugin comes with a variety of
+resizing policies (see \ref ResizingStrategies), it may sometimes be
+desirable to implement custom policies to address specific problems.
+The API described below allows users to write their own resizing policy.
+
+Here an example of how to define a new policy
+
+\code{.c}
+struct sc_hypervisor_policy dummy_policy =
+{
+       .handle_poped_task = dummy_handle_poped_task,
+       .handle_pushed_task = dummy_handle_pushed_task,
+       .handle_idle_cycle = dummy_handle_idle_cycle,
+       .handle_idle_end = dummy_handle_idle_end,
+       .handle_post_exec_hook = dummy_handle_post_exec_hook,
+       .custom = 1,
+       .name = "dummy"
+};
+\endcode
+
+
 */

+ 6 - 5
doc/doxygen/chapters/scheduling_contexts.doxy

@@ -136,13 +136,14 @@ starpu_sched_ctx_delete(sched_ctx1);
 
 
 \section EmptyingAContext Emptying A Context
 
-A context may not have any resources at the begining or at a certain
+A context may have no resources at the begining or at a certain
 moment of the execution. Task can still be submitted to these contexts
-and they will execute them as soon as they will have resources. A list
+and they will be executed as soon as the contexts will have resources. A list
 of tasks pending to be executed is kept and when workers are added to
-the contexts the tasks are submitted. However, if no resources are
-allocated the program will not terminate. If these tasks have not much
-priority the programmer can forbid the application to submitted them
+the contexts these tasks start being submitted. However, if resources 
+are never allocated to the context the program will not terminate. 
+If these tasks have low
+priority the programmer can forbid the application to submit them
 by calling the function <c>starpu_sched_ctx_stop_task_submission()</c>.
 
 \section ContextsSharingWorkers Contexts Sharing Workers

+ 1 - 0
include/starpu_disk.h

@@ -43,6 +43,7 @@ struct starpu_disk_ops {
 extern struct starpu_disk_ops starpu_disk_stdio_ops;
 extern struct starpu_disk_ops starpu_disk_unistd_ops;
 extern struct starpu_disk_ops starpu_disk_unistd_o_direct_ops;
+extern struct starpu_disk_ops starpu_disk_leveldb_ops;
 
 void starpu_disk_close(unsigned node, void *obj, size_t size);
 

+ 2 - 2
include/starpu_fxt.h

@@ -31,7 +31,7 @@ struct starpu_fxt_codelet_event
 {
 	char symbol[256];
 	int workerid;
-	enum starpu_perfmodel_archtype archtype;
+	struct starpu_perfmodel_arch arch;
 	uint32_t hash;
 	size_t size;
 	float time;
@@ -54,7 +54,7 @@ struct starpu_fxt_options
 	int file_rank;
 
 	char worker_names[STARPU_NMAXWORKERS][256];
-	enum starpu_perfmodel_archtype worker_archtypes[STARPU_NMAXWORKERS];
+	struct starpu_perfmodel_arch worker_archtypes[STARPU_NMAXWORKERS];
 	int nworkers;
 
 	struct starpu_fxt_codelet_event **dumped_codelets;

+ 20 - 17
include/starpu_perfmodel.h

@@ -23,6 +23,7 @@
 #include <stdio.h>
 
 #include <starpu_util.h>
+#include <starpu_worker.h>
 
 #ifdef __cplusplus
 extern "C"
@@ -32,13 +33,14 @@ extern "C"
 struct starpu_task;
 struct starpu_data_descr;
 
-enum starpu_perfmodel_archtype
+#define STARPU_NARCH STARPU_ANY_WORKER
+//char archtype_name[STARPU_NARCH] = {"cpu","cuda","opencl","mic","scc"};
+
+struct starpu_perfmodel_arch
 {
-	STARPU_CPU_DEFAULT = 0,
-	STARPU_CUDA_DEFAULT = STARPU_MAXCPUS,
-	STARPU_OPENCL_DEFAULT = STARPU_CUDA_DEFAULT + STARPU_MAXCUDADEVS,
-	STARPU_MIC_DEFAULT = STARPU_OPENCL_DEFAULT + STARPU_MAXOPENCLDEVS,
-	STARPU_SCC_DEFAULT = STARPU_MIC_DEFAULT + STARPU_MAXMICDEVS //* STARPU_MAXMICCPUS
+	enum starpu_worker_archtype type;
+	int devid;
+	int ncore;
 };
 
 #ifdef __STDC_VERSION__
@@ -60,8 +62,6 @@ _Static_assert(STARPU_MIC_DEFAULT < STARPU_SCC_DEFAULT,
 #  endif
 #endif
 
-#define STARPU_NARCH_VARIATIONS	(STARPU_MIC_DEFAULT + STARPU_MAXMICDEVS)
-
 struct starpu_perfmodel_history_entry
 {
 	double mean;
@@ -113,8 +113,8 @@ struct starpu_perfmodel_history_table;
 struct starpu_perfmodel_per_arch
 {
 	double (*cost_model)(struct starpu_data_descr *t) STARPU_DEPRECATED;
-	double (*cost_function)(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl);
-	size_t (*size_base)(struct starpu_task *, enum starpu_perfmodel_archtype arch, unsigned nimpl);
+	double (*cost_function)(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
+	size_t (*size_base)(struct starpu_task *, struct starpu_perfmodel_arch* arch, unsigned nimpl);
 
 	struct starpu_perfmodel_history_table *history;
 	struct starpu_perfmodel_history_list *list;
@@ -142,7 +142,7 @@ struct starpu_perfmodel
 
 
 	size_t (*size_base)(struct starpu_task *, unsigned nimpl);
 
-	struct starpu_perfmodel_per_arch per_arch[STARPU_NARCH_VARIATIONS][STARPU_MAXIMPLEMENTATIONS];
+	struct starpu_perfmodel_per_arch**** per_arch; /*STARPU_MAXIMPLEMENTATIONS*/
 
 	const char *symbol;
 
@@ -151,20 +151,23 @@ struct starpu_perfmodel
 	starpu_pthread_rwlock_t model_rwlock;
 };
 
-enum starpu_perfmodel_archtype starpu_worker_get_perf_archtype(int workerid);
+void initialize_model(struct starpu_perfmodel *model);
+
+struct starpu_perfmodel_arch* starpu_worker_get_perf_archtype(int workerid);
 
 int starpu_perfmodel_load_symbol(const char *symbol, struct starpu_perfmodel *model);
 int starpu_perfmodel_unload_model(struct starpu_perfmodel *model);
 
-void starpu_perfmodel_debugfilepath(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, char *path, size_t maxlen, unsigned nimpl);
-void starpu_perfmodel_get_arch_name(enum starpu_perfmodel_archtype arch, char *archname, size_t maxlen, unsigned nimpl);
+void starpu_perfmodel_debugfilepath(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, char *path, size_t maxlen, unsigned nimpl);
+char* starpu_perfmodel_get_archtype_name(enum starpu_worker_archtype archtype);
+void starpu_perfmodel_get_arch_name(struct starpu_perfmodel_arch* arch, char *archname, size_t maxlen, unsigned nimpl);
 
-double starpu_permodel_history_based_expected_perf(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, uint32_t footprint);
+double starpu_permodel_history_based_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, uint32_t footprint);
 int starpu_perfmodel_list(FILE *output);
-void starpu_perfmodel_print(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, unsigned nimpl, char *parameter, uint32_t *footprint, FILE *output);
+void starpu_perfmodel_print(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, unsigned nimpl, char *parameter, uint32_t *footprint, FILE *output);
 int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char *parameter, uint32_t *footprint, FILE *output);
 
-void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned cpuid, unsigned nimpl, double measured);
+void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct starpu_task *task, struct starpu_perfmodel_arch * arch, unsigned cpuid, unsigned nimpl, double measured);
 
 void starpu_bus_print_bandwidth(FILE *f);
 void starpu_bus_print_affinity(FILE *f);
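Code that previously received an enum starpu_perfmodel_archtype now gets a pointer to the new struct starpu_perfmodel_arch. A sketch of a per-arch cost function written against the new prototype (the returned values are illustrative; STARPU_CPU_WORKER and STARPU_CUDA_WORKER come from the archtype enum in starpu_worker.h):

#include <starpu.h>

/* Sketch only: expected duration in microseconds for one task instance,
 * dispatched on the worker architecture described by 'arch'. */
static double my_cost_function(struct starpu_task *task,
                               struct starpu_perfmodel_arch *arch,
                               unsigned nimpl)
{
	(void) task; (void) nimpl;
	switch (arch->type)
	{
	case STARPU_CUDA_WORKER:
		return 10.0;                      /* device arch->devid */
	case STARPU_CPU_WORKER:
	default:
		return 100.0 / (arch->ncore + 1); /* faster with more combined cores */
	}
}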

+ 14 - 34
include/starpu_sched_ctx.h

@@ -50,37 +50,6 @@ void starpu_sched_ctx_finished_submit(unsigned sched_ctx_id);
 
 
 unsigned starpu_sched_ctx_get_workers_list(unsigned sched_ctx_id, int **workerids);
 
-struct starpu_sched_ctx_performance_counters
-{
-	void (*notify_idle_cycle)(unsigned sched_ctx_id, int worker, double idle_time);
-	void (*notify_idle_end)(unsigned sched_ctx_id, int worker);
-	void (*notify_pushed_task)(unsigned sched_ctx_id, int worker);
-	void (*notify_poped_task)(unsigned sched_ctx_id, int worker, struct starpu_task *task, size_t data_size, uint32_t footprint);
-	void (*notify_post_exec_hook)(unsigned sched_ctx_id, int taskid);
-	void (*notify_submitted_job)(struct starpu_task *task, uint32_t footprint, size_t data_size);
-	void (*notify_delete_context)(unsigned sched_ctx);
-};
-
-#ifdef STARPU_USE_SC_HYPERVISOR
-void starpu_sched_ctx_set_perf_counters(unsigned sched_ctx_id, struct starpu_sched_ctx_performance_counters *perf_counters);
-void starpu_sched_ctx_call_pushed_task_cb(int workerid, unsigned sched_ctx_id);
-#endif //STARPU_USE_SC_HYPERVISOR
-
-void starpu_sched_ctx_notify_hypervisor_exists(void);
-
-unsigned starpu_sched_ctx_check_if_hypervisor_exists(void);
-
-void starpu_sched_ctx_set_policy_data(unsigned sched_ctx_id, void *policy_data);
-
-void *starpu_sched_ctx_get_policy_data(unsigned sched_ctx_id);
-
-
-struct starpu_worker_collection *starpu_sched_ctx_create_worker_collection(unsigned sched_ctx_id, enum starpu_worker_collection_type type);
-
-void starpu_sched_ctx_delete_worker_collection(unsigned sched_ctx_id);
-
-struct starpu_worker_collection *starpu_sched_ctx_get_worker_collection(unsigned sched_ctx_id);
-
 unsigned starpu_sched_ctx_get_nworkers(unsigned sched_ctx_id);
 
 unsigned starpu_sched_ctx_get_nshared_workers(unsigned sched_ctx_id, unsigned sched_ctx_id2);
@@ -95,8 +64,6 @@ unsigned starpu_sched_ctx_is_ctxs_turn(int workerid, unsigned sched_ctx_id);
 
 
 void starpu_sched_ctx_set_turn_to_other_ctx(int workerid, unsigned sched_ctx_id);
 
-double starpu_sched_ctx_get_max_time_worker_on_ctx(void);
-
 int starpu_sched_get_min_priority(void);
 
 int starpu_sched_get_max_priority(void);
@@ -118,9 +85,22 @@ int starpu_sched_ctx_set_max_priority(unsigned sched_ctx_id, int max_prio);
 
 
 #define STARPU_DEFAULT_PRIO	0
 
-/* execute any parallel code on the workers of the sched_ctx (workers are blocked) */
+struct starpu_worker_collection *starpu_sched_ctx_create_worker_collection(unsigned sched_ctx_id, enum starpu_worker_collection_type type);
+
+void starpu_sched_ctx_delete_worker_collection(unsigned sched_ctx_id);
+
+struct starpu_worker_collection *starpu_sched_ctx_get_worker_collection(unsigned sched_ctx_id);
+
+void starpu_sched_ctx_set_policy_data(unsigned sched_ctx_id, void *policy_data);
+
+void *starpu_sched_ctx_get_policy_data(unsigned sched_ctx_id);
+
 void *starpu_sched_ctx_exec_parallel_code(void* (*func)(void*), void *param, unsigned sched_ctx_id);
 
+#ifdef STARPU_USE_SC_HYPERVISOR
+void starpu_sched_ctx_call_pushed_task_cb(int workerid, unsigned sched_ctx_id);
+#endif //STARPU_USE_SC_HYPERVISOR
+
 #ifdef __cplusplus
 }
 #endif

+ 50 - 0
include/starpu_sched_ctx_hypervisor.h

@@ -0,0 +1,50 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010 - 2012  INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_SCHED_CTX_HYPERVISOR_H__
+#define __STARPU_SCHED_CTX_HYPERVISOR_H__
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+
+
+struct starpu_sched_ctx_performance_counters
+{
+	void (*notify_idle_cycle)(unsigned sched_ctx_id, int worker, double idle_time);
+	void (*notify_idle_end)(unsigned sched_ctx_id, int worker);
+	void (*notify_pushed_task)(unsigned sched_ctx_id, int worker);
+	void (*notify_poped_task)(unsigned sched_ctx_id, int worker, struct starpu_task *task, size_t data_size, uint32_t footprint);
+	void (*notify_post_exec_hook)(unsigned sched_ctx_id, int taskid);
+	void (*notify_submitted_job)(struct starpu_task *task, uint32_t footprint, size_t data_size);
+	void (*notify_delete_context)(unsigned sched_ctx);
+};
+
+#ifdef STARPU_USE_SC_HYPERVISOR
+void starpu_sched_ctx_set_perf_counters(unsigned sched_ctx_id, void* perf_counters);
+#endif //STARPU_USE_SC_HYPERVISOR
+
+void starpu_sched_ctx_notify_hypervisor_exists(void);
+
+unsigned starpu_sched_ctx_check_if_hypervisor_exists(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __STARPU_SCHED_CTX_HYPERVISOR_H__ */
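
A hedged sketch of how a resizing engine might fill these counters and hand them to a context; the callback name and the sched_ctx_id variable are made up for the example, only the struct and starpu_sched_ctx_set_perf_counters() come from this patch:

    /* hypothetical callback: account idle time per worker */
    static void my_idle_cycle(unsigned sched_ctx_id, int worker, double idle_time) { /* ... */ }

    struct starpu_sched_ctx_performance_counters counters = {
        .notify_idle_cycle = my_idle_cycle,
        /* the other hooks may stay NULL in this sketch */
    };
    #ifdef STARPU_USE_SC_HYPERVISOR
    starpu_sched_ctx_set_perf_counters(sched_ctx_id, (void *)&counters);
    #endif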

+ 7 - 7
include/starpu_scheduler.h

@@ -64,17 +64,17 @@ int starpu_combined_worker_can_execute_task(unsigned workerid, struct starpu_tas
 int starpu_get_prefetch_flag(void);
 int starpu_prefetch_task_input_on_node(struct starpu_task *task, unsigned node);
 
-uint32_t starpu_task_footprint(struct starpu_perfmodel *model, struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl);
-double starpu_task_expected_length(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl);
-double starpu_worker_get_relative_speedup(enum starpu_perfmodel_archtype perf_archtype);
+uint32_t starpu_task_footprint(struct starpu_perfmodel *model, struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
+double starpu_task_expected_length(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
+double starpu_worker_get_relative_speedup(struct starpu_perfmodel_arch* perf_arch);
 double starpu_task_expected_data_transfer_time(unsigned memory_node, struct starpu_task *task);
 double starpu_data_expected_transfer_time(starpu_data_handle_t handle, unsigned memory_node, enum starpu_data_access_mode mode);
-double starpu_task_expected_power(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl);
-double starpu_task_expected_conversion_time(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl);
+double starpu_task_expected_power(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
+double starpu_task_expected_conversion_time(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
 
-double starpu_task_bundle_expected_length(starpu_task_bundle_t bundle, enum starpu_perfmodel_archtype arch, unsigned nimpl);
+double starpu_task_bundle_expected_length(starpu_task_bundle_t bundle, struct starpu_perfmodel_arch* arch, unsigned nimpl);
 double starpu_task_bundle_expected_data_transfer_time(starpu_task_bundle_t bundle, unsigned memory_node);
-double starpu_task_bundle_expected_power(starpu_task_bundle_t bundle, enum starpu_perfmodel_archtype arch, unsigned nimpl);
+double starpu_task_bundle_expected_power(starpu_task_bundle_t bundle, struct starpu_perfmodel_arch* arch, unsigned nimpl);
 
 void starpu_sched_ctx_worker_shares_tasks_lists(int workerid, int sched_ctx_id);
 #ifdef __cplusplus
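
Callers now pass the worker's arch descriptor instead of an enum value; a minimal sketch, where workerid, task and nimpl are assumed to come from the scheduling policy:

    struct starpu_perfmodel_arch *arch = starpu_worker_get_perf_archtype(workerid);
    double length  = starpu_task_expected_length(task, arch, nimpl);
    double power   = starpu_task_expected_power(task, arch, nimpl);
    double speedup = starpu_worker_get_relative_speedup(arch);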

+ 2 - 2
include/starpu_worker.h

@@ -33,12 +33,12 @@ extern "C"
 
 enum starpu_worker_archtype
 {
-	STARPU_ANY_WORKER,
 	STARPU_CPU_WORKER,
 	STARPU_CUDA_WORKER,
 	STARPU_OPENCL_WORKER,
 	STARPU_MIC_WORKER,
-	STARPU_SCC_WORKER
+	STARPU_SCC_WORKER,
+	STARPU_ANY_WORKER
 };
 
 struct starpu_sched_ctx_iterator

+ 2 - 2
sc_hypervisor/examples/app_driven_test/app_driven_test.c

@@ -138,8 +138,8 @@ int main()
 
 	/* let starpu know which performance counters should use 
 	   to inform the hypervisor how the application and the resources are executing */
-	starpu_sched_ctx_set_perf_counters(sched_ctx1, (struct starpu_sched_ctx_performance_counters*)perf_counters);
-	starpu_sched_ctx_set_perf_counters(sched_ctx2, (struct starpu_sched_ctx_performance_counters*)perf_counters);
+	starpu_sched_ctx_set_perf_counters(sched_ctx1, perf_counters);
+	starpu_sched_ctx_set_perf_counters(sched_ctx2, perf_counters);
 
 	/* register the contexts that should be managed by the hypervisor
 	   and indicate an approximate amount of workload if known;

+ 2 - 2
sc_hypervisor/examples/lp_test/lp_resize_test.c

@@ -99,8 +99,8 @@ int main()
 
 	/* let starpu know which performance counters should use 
 	   to inform the hypervisor how the application and the resources are executing */
-	starpu_sched_ctx_set_perf_counters(sched_ctx1, (struct starpu_sched_ctx_performance_counters*)perf_counters);
-	starpu_sched_ctx_set_perf_counters(sched_ctx2, (struct starpu_sched_ctx_performance_counters*)perf_counters);
+	starpu_sched_ctx_set_perf_counters(sched_ctx1, perf_counters);
+	starpu_sched_ctx_set_perf_counters(sched_ctx2, perf_counters);
 
 	double flops1 = NTASKS*NINCR*1000000000.0;
 	double flops2 = NTASKS*NINCR*1000000000.0;

+ 2 - 2
sc_hypervisor/examples/lp_test/lp_test.c

@@ -98,8 +98,8 @@ int main()
 
 	/* let starpu know which performance counters should use 
 	   to inform the hypervisor how the application and the resources are executing */
-	starpu_sched_ctx_set_perf_counters(sched_ctx1, (struct starpu_sched_ctx_performance_counters*)perf_counters);
-	starpu_sched_ctx_set_perf_counters(sched_ctx2, (struct starpu_sched_ctx_performance_counters*)perf_counters);
+	starpu_sched_ctx_set_perf_counters(sched_ctx1, perf_counters);
+	starpu_sched_ctx_set_perf_counters(sched_ctx2, perf_counters);
 
 	double flops1 = NTASKS*NINCR*1000000000.0;
 	double flops2 = NTASKS*NINCR*1000000000.0;

+ 1 - 1
sc_hypervisor/examples/sched_ctx_utils/sched_ctx_utils.c

@@ -241,7 +241,7 @@ void construct_contexts(void (*bench)(float*, unsigned, unsigned))
 	struct sc_hypervisor_policy policy;
 	policy.custom = 0;
 	policy.name = "idle";
-	struct starpu_sched_ctx_performance_counters *perf_counters = sc_hypervisor_init(&policy);
+	void *perf_counters = sc_hypervisor_init(&policy);
 	int nworkers1 = cpu1 + gpu + gpu1;
 	int nworkers2 = cpu2 + gpu + gpu2;
 	unsigned n_all_gpus = gpu + gpu1 + gpu2;

+ 2 - 8
sc_hypervisor/include/sc_hypervisor.h

@@ -18,6 +18,7 @@
 #define SC_HYPERVISOR_H
 
 #include <starpu.h>
+#include <starpu_sched_ctx_hypervisor.h>
 #include <sc_hypervisor_config.h>
 #include <sc_hypervisor_monitoring.h>
 #include <math.h>
@@ -30,13 +31,6 @@ extern "C"
 /* synchronise the hypervisor when several workers try to update its information */
 starpu_pthread_mutex_t act_hypervisor_mutex;
 
-
-/* Forward declaration of an internal data structure
- * FIXME: Remove when no longer exposed.  */
-/* the resizing is not done instantly, a request is kept and executed 
-   when available */
-struct resize_request_entry;
-
 /* platform of resizing contexts */
 struct sc_hypervisor_policy
 {
@@ -75,7 +69,7 @@ struct sc_hypervisor_policy
 };
 
 /* start the hypervisor indicating the resizing policy to user */
-struct starpu_sched_ctx_performance_counters *sc_hypervisor_init(struct sc_hypervisor_policy *policy);
+void* sc_hypervisor_init(struct sc_hypervisor_policy *policy);
 
 /* shutdown the hypervisor */
 void sc_hypervisor_shutdown(void);
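
With the opaque return type the caller no longer needs the counters structure definition; the updated examples above reduce to roughly this, where sched_ctx1 is assumed to be an existing context:

    struct sc_hypervisor_policy policy = { .name = "idle", .custom = 0 };
    void *perf_counters = sc_hypervisor_init(&policy);
    starpu_sched_ctx_set_perf_counters(sched_ctx1, perf_counters);
    /* ... submit work ... */
    sc_hypervisor_shutdown();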

+ 3 - 7
sc_hypervisor/include/sc_hypervisor_config.h

@@ -35,10 +35,9 @@ extern "C"
 #define SC_HYPERVISOR_MIN_TASKS -8
 #define SC_HYPERVISOR_NEW_WORKERS_MAX_IDLE -9
 #define SC_HYPERVISOR_TIME_TO_APPLY -10
-#define SC_HYPERVISOR_EMPTY_CTX_MAX_IDLE -11
-#define SC_HYPERVISOR_NULL -12
-#define	SC_HYPERVISOR_ISPEED_W_SAMPLE -13
-#define SC_HYPERVISOR_ISPEED_CTX_SAMPLE -14
+#define SC_HYPERVISOR_NULL -11
+#define	SC_HYPERVISOR_ISPEED_W_SAMPLE -12
+#define SC_HYPERVISOR_ISPEED_CTX_SAMPLE -13
 
 
 #define MAX_IDLE_TIME 5000000000
@@ -72,9 +71,6 @@ struct sc_hypervisor_policy_config
 	/* max idle for the workers that will be added during the resizing process*/
 	double new_workers_max_idle;
 
-	/* above this context we allow removing all workers */
-	double empty_ctx_max_idle[STARPU_NMAXWORKERS];
-
 	/* sample used to compute the instant speed per worker*/
 	double ispeed_w_sample[STARPU_NMAXWORKERS];
 

+ 0 - 13
sc_hypervisor/src/sc_config.c

@@ -31,7 +31,6 @@ static struct sc_hypervisor_policy_config* _create_config(void)
 		config->priority[i] = -1;
 		config->fixed_workers[i] = -1;
 		config->max_idle[i] = -1.0;
-		config->empty_ctx_max_idle[i] = -1.0;
 		config->min_working[i] = -1.0;
 		config->ispeed_w_sample[i] = 0.0;
 	}
@@ -52,7 +51,6 @@ static void _update_config(struct sc_hypervisor_policy_config *old, struct sc_hy
 		old->priority[i] = new->priority[i] != -1 ? new->priority[i] : old->priority[i];
 		old->fixed_workers[i] = new->fixed_workers[i] != -1 ? new->fixed_workers[i] : old->fixed_workers[i];
 		old->max_idle[i] = new->max_idle[i] != -1.0 ? new->max_idle[i] : old->max_idle[i];
-		old->empty_ctx_max_idle[i] = new->empty_ctx_max_idle[i] != -1.0 ? new->empty_ctx_max_idle[i] : old->empty_ctx_max_idle[i];
 		old->min_working[i] = new->min_working[i] != -1.0 ? new->min_working[i] : old->min_working[i];
 	}
 }
@@ -85,7 +83,6 @@ void _add_config(unsigned sched_ctx)
 		config->priority[i] = 0;
 		config->fixed_workers[i] = 0;
 		config->max_idle[i] = MAX_IDLE_TIME;
-		config->empty_ctx_max_idle[i] = MAX_IDLE_TIME;
 		config->min_working[i] = MIN_WORKING_TIME;
 	}
 
@@ -131,16 +128,6 @@ static struct sc_hypervisor_policy_config* _ctl(unsigned sched_ctx, va_list varg
 
 			break;
 
-		case SC_HYPERVISOR_EMPTY_CTX_MAX_IDLE:
-			workerids = va_arg(varg_list, int*);
-			nworkers = va_arg(varg_list, int);
-			double empty_ctx_max_idle = va_arg(varg_list, double);
-
-			for(i = 0; i < nworkers; i++)
-				config->empty_ctx_max_idle[workerids[i]] = empty_ctx_max_idle;
-
-			break;
-
 		case SC_HYPERVISOR_MIN_WORKING:
 			workerids = va_arg(varg_list, int*);
 			nworkers = va_arg(varg_list, int);

+ 25 - 3
sc_hypervisor/src/sc_hypervisor.c

@@ -99,6 +99,25 @@ static struct sc_hypervisor_policy *_find_hypervisor_policy_from_name(const char
 	return NULL;
 }
 
+static void display_sched_help_message(void)
+{
+	const char* policy_name = getenv("SC_HYPERVISOR_POLICY");
+	if (policy_name && (strcmp(policy_name, "help") == 0))
+	{
+		fprintf(stderr, "SC_HYPERVISOR_POLICY can be either of\n");
+		/* display the description of all predefined policies */
+		unsigned i;
+		for (i = 0; i < sizeof(predefined_policies)/sizeof(predefined_policies[0]); i++)
+		{
+			struct sc_hypervisor_policy *p = predefined_policies[i];
+			if (p->name)
+			{
+				fprintf(stderr, "%s\n", p->name);
+			}
+		}
+	}
+}
+
 static struct sc_hypervisor_policy *_select_hypervisor_policy(struct sc_hypervisor_policy* hypervisor_policy)
 {
 	struct sc_hypervisor_policy *selected_policy = NULL;
@@ -131,8 +150,11 @@ static struct sc_hypervisor_policy *_select_hypervisor_policy(struct sc_hypervis
 
 
 /* initializez the performance counters that starpu will use to retrive hints for resizing */
-struct starpu_sched_ctx_performance_counters* sc_hypervisor_init(struct sc_hypervisor_policy *hypervisor_policy)
-{
+void* sc_hypervisor_init(struct sc_hypervisor_policy *hypervisor_policy)
+{	
+/* Perhaps we have to display some help */
+	display_sched_help_message();
+
 	hypervisor.min_tasks = 0;
 	hypervisor.nsched_ctxs = 0;
 	char* vel_gap = getenv("SC_HYPERVISOR_MAX_SPEED_GAP");
@@ -198,7 +220,7 @@ struct starpu_sched_ctx_performance_counters* sc_hypervisor_init(struct sc_hyper
 
 	starpu_sched_ctx_notify_hypervisor_exists();
 
-	return perf_counters;
+	return (void*)perf_counters;
 }
 
 const char* sc_hypervisor_get_policy()
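
The help text added above is only printed when the environment variable is set before sc_hypervisor_init() runs, e.g. (the setenv() call is just one way to illustrate it):

    setenv("SC_HYPERVISOR_POLICY", "help", 1); /* lists the predefined policy names on stderr */
    sc_hypervisor_init(&policy);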

+ 14 - 0
sc_hypervisor/src/sc_hypervisor_intern.h

@@ -45,6 +45,20 @@ struct resize_request_entry
 	UT_hash_handle hh;
 };
 
+/* structure to indicate when the moving of workers was actually done 
+   (moved workers can be seen in the new ctx ) */
+struct resize_ack
+{
+	/* receiver context */
+	int receiver_sched_ctx;
+	/* list of workers required to be moved */
+	int *moved_workers;
+	/* number of workers required to be moved */
+	int nmoved_workers;
+	/* list of workers that actually got in the receiver ctx */
+	int *acked_workers;
+};
+
 struct configuration_entry
 {
 	/* Key: the tag of tasks concerned by this configuration.  */

+ 6 - 2
src/Makefile.am

@@ -52,7 +52,7 @@ lib_LTLIBRARIES = libstarpu-@STARPU_EFFECTIVE_VERSION@.la
 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_CPPFLAGS = -I$(top_srcdir)/include/ $(STARPU_RCCE_CPPFLAGS) -DBUILDING_STARPU
 
 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_CFLAGS = $(GLOBAL_AM_CFLAGS) $(HWLOC_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(STARPU_COI_CPPFLAGS) $(STARPU_SCIF_CPPFLAGS) $(STARPU_RCCE_CFLAGS) $(FXT_CFLAGS)
-libstarpu_@STARPU_EFFECTIVE_VERSION@_la_LIBADD = -lm $(HWLOC_LIBS) $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS) $(STARPU_RCCE_LDFLAGS) $(FXT_LIBS) $(STARPU_GLPK_LDFLAGS)
+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_LIBADD = -lm $(HWLOC_LIBS) $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS) $(STARPU_RCCE_LDFLAGS) $(FXT_LIBS) $(STARPU_GLPK_LDFLAGS) $(STARPU_LEVELDB_LDFLAGS)
 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_LDFLAGS = $(ldflags) $(FXT_LDFLAGS) -no-undefined									\
   -version-info $(libstarpu_so_version)
 
@@ -240,6 +240,11 @@ libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES = 						\
 	top/starpu_top_connection.c                          	\
 	worker_collection/worker_list.c
 
+
+if STARPU_HAVE_LEVELDB
+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += core/disk_ops/disk_leveldb.cpp
+endif
+
 if STARPU_USE_CPU
 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/cpu/driver_cpu.c
 endif
@@ -275,7 +280,6 @@ libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += core/disk_ops/disk_unistd_o_d
 endif
 
 
-
 #########################################
 #										#
 #        Generic MP compilation			#

+ 7 - 14
src/core/combined_workers.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -101,19 +101,12 @@ int starpu_combined_worker_assign_workerid(int nworkers, int workerid_array[])
 
 	combined_worker->worker_size = nworkers;
 
-#ifdef STARPU_USE_MIC
-	if(config->workers[workerid_array[0]].worker_mask == STARPU_MIC)
-	{
-		combined_worker->perf_arch = (enum starpu_perfmodel_archtype) (STARPU_MIC_DEFAULT + config->workers[workerid_array[0]].mp_nodeid /* *STARPU_MAXMICCPUS + nworkers - 1*/);
-		combined_worker->worker_mask = STARPU_MIC;
-	}
-#endif
-	if(config->workers[workerid_array[0]].worker_mask == STARPU_CPU)
-	{
-		combined_worker->perf_arch = (enum starpu_perfmodel_archtype) (STARPU_CPU_DEFAULT + nworkers - 1);
-		combined_worker->worker_mask = STARPU_CPU;
-	}
-#ifdef STARPU_USE_MIC
+	combined_worker->perf_arch.type = config->workers[workerid_array[0]].perf_arch.type;
+	combined_worker->perf_arch.devid = config->workers[workerid_array[0]].perf_arch.devid; 
+	combined_worker->perf_arch.ncore = nworkers - 1;
+	combined_worker->worker_mask = config->workers[workerid_array[0]].worker_mask;
+	
+#ifdef STARPU_USE_MP
 	combined_worker->count = nworkers -1;
 	pthread_mutex_init(&combined_worker->count_mutex,NULL);
 #endif
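
A combined worker is now described by the three fields of struct starpu_perfmodel_arch instead of enum arithmetic; the values below are only an illustration:

    struct starpu_perfmodel_arch arch;
    arch.type  = STARPU_CPU_WORKER; /* kind of the underlying workers */
    arch.devid = 0;                 /* device the workers belong to */
    arch.ncore = nworkers - 1;      /* a combined worker spanning nworkers cores */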

+ 10 - 10
src/core/detect_combined_workers.c

@@ -248,11 +248,11 @@ static void find_and_assign_combinations_without_hwloc(int *workerids, int nwork
 	mic_id = malloc(sizeof(int)*nb_mics);
 	nmics_table = malloc(sizeof(unsigned)*nb_mics);
 	mic_workers = malloc(sizeof(int*)*nb_mics);
-	for(i=0; i<nb_mics; i++)
+	for(j=0; j<nb_mics; j++)
 	{
-		mic_id[i] = -1;
-		nmics_table[i] = 0;
-		mic_workers[i] = malloc(sizeof(int)*STARPU_NMAXWORKERS);
+		mic_id[j] = -1;
+		nmics_table[j] = 0;
+		mic_workers[j] = malloc(sizeof(int)*STARPU_NMAXWORKERS);
 	}
 #endif /* STARPU_USE_MIC */
 
@@ -292,13 +292,13 @@ static void find_and_assign_combinations_without_hwloc(int *workerids, int nwork
 	mic_min = starpu_get_env_number("STARPU_MIN_WORKERSIZE");
 	if (mic_min < 2)
 		mic_min = 2;
-	for(i=0; i<nb_mics; i++)
+	for(j=0; j<nb_mics; j++)
 	{
 		mic_max = starpu_get_env_number("STARPU_MAX_WORKERSIZE");
-		if (mic_max == -1 || mic_max > (int) nmics_table[i])
-			mic_max = nmics_table[i];
-		assign_combinations_without_hwloc(workers,mic_workers[i],nmics_table[i],mic_min,mic_max);
-		free(mic_workers[i]);
+		if (mic_max == -1 || mic_max > (int) nmics_table[j])
+			mic_max = nmics_table[j];
+		assign_combinations_without_hwloc(workers,mic_workers[j],nmics_table[j],mic_min,mic_max);
+		free(mic_workers[j]);
 	}
 	free(mic_id);
 	free(nmics_table);
@@ -325,7 +325,7 @@ static void combine_all_cpu_workers(int *workerids, int nworkers)
 	{
 		worker = _starpu_get_worker_struct(workerids[i]);
 
-		if (worker->perf_arch == STARPU_CPU_DEFAULT)
+		if (worker->arch == STARPU_CPU_WORKER)
 			cpu_workers[ncpus++] = workerids[i];
 	}
 

+ 9 - 0
src/core/disk.h

@@ -23,6 +23,11 @@
 #define STARPU_DISK_ALL 1
 #define STARPU_DISK_NO_RECLAIM 2
 
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
 #include <datawizard/copy_driver.h>
 
 /* interface to manipulate memory disk */
@@ -56,4 +61,8 @@ int _starpu_get_disk_flag(unsigned node);
 
 void _starpu_disk_unregister(void);
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif /* __DISK_H__ */

+ 354 - 0
src/core/disk_ops/disk_leveldb.cpp

@@ -0,0 +1,354 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2013 Corentin Salingue
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <leveldb/db.h>
+#include <leveldb/options.h>
+
+#include <starpu.h>
+#include <core/disk.h>
+#include <core/perfmodel/perfmodel.h>
+#include <datawizard/copy_driver.h>
+#include <datawizard/memory_manager.h>
+
+#define NITER	64
+
+/* ------------------- use leveldb to write on disk -------------------  */
+
+struct starpu_leveldb_obj {
+	char * key;
+	double size;
+	starpu_pthread_mutex_t mutex;
+};
+
+struct starpu_leveldb_base {
+	leveldb::DB* db;
+	/* if StarPU creates the leveldb */
+	bool created;
+};
+
+
+/* allocation memory on disk */
+static void * 
+starpu_leveldb_alloc (void *base, size_t size)
+{
+	struct starpu_leveldb_base * base_tmp = (struct starpu_leveldb_base *) base;
+	struct starpu_leveldb_obj * obj = (struct starpu_leveldb_obj *) malloc(sizeof(struct starpu_leveldb_obj));
+	STARPU_ASSERT(obj != NULL);
+
+        STARPU_PTHREAD_MUTEX_INIT(&obj->mutex, NULL);
+
+	char * key = (char *) malloc(256*sizeof(char));
+	strcpy(key, "STARPU");
+	strcat(key,(char *) obj);
+
+	/* create and add a key with a small memory */
+	leveldb::Status s = base_tmp->db->Put(leveldb::WriteOptions(), key, "a");
+	STARPU_ASSERT(s.ok());
+
+	/* obj->size is the real size in the disk */
+	obj->key = key;
+	obj->size = sizeof(char);
+
+	return (void *) obj;
+}
+
+
+/* free memory on disk */
+static void
+starpu_leveldb_free (void *base , void *obj, size_t size STARPU_ATTRIBUTE_UNUSED)
+{
+	struct starpu_leveldb_obj * tmp = (struct starpu_leveldb_obj *) obj;
+	struct starpu_leveldb_base * base_tmp = (struct starpu_leveldb_base *) base;
+
+	base_tmp->db->Delete(leveldb::WriteOptions(), tmp->key);
+
+	STARPU_PTHREAD_MUTEX_DESTROY(&tmp->mutex);
+
+	free(tmp->key);
+	free(tmp);
+}
+
+
+/* open an existing memory on disk */
+static void * 
+starpu_leveldb_open (void *base, void *pos, size_t size)
+{
+	struct starpu_leveldb_obj * obj = (struct starpu_leveldb_obj *) malloc(sizeof(struct starpu_leveldb_obj));
+	STARPU_ASSERT(obj != NULL);
+
+        STARPU_PTHREAD_MUTEX_INIT(&obj->mutex, NULL);
+
+	char * key = (char *) malloc((strlen((char *) pos)+1)*sizeof(char));
+	strcpy(key, (char *) pos);
+
+	obj->key = key;	
+	obj->size = size;
+
+	return (void *) obj;
+	
+}
+
+
+/* free memory without delete it */
+static void 
+starpu_leveldb_close (void *base STARPU_ATTRIBUTE_UNUSED, void *obj, size_t size STARPU_ATTRIBUTE_UNUSED)
+{
+	struct starpu_leveldb_obj * tmp = (struct starpu_leveldb_obj *) obj;
+
+	STARPU_PTHREAD_MUTEX_DESTROY(&tmp->mutex);
+
+	free(tmp->key);
+	free(tmp);	
+}
+
+
+/* in the leveldb, we are obliged to read and to write the entire data 
+ * so, we have to use buffers to have offset and size options */
+static int 
+starpu_leveldb_read (void *base, void *obj, void *buf, off_t offset, size_t size, void * async_channel STARPU_ATTRIBUTE_UNUSED)
+{
+	struct starpu_leveldb_obj * tmp = (struct starpu_leveldb_obj *) obj;
+	struct starpu_leveldb_base * base_tmp = (struct starpu_leveldb_base *) base;	
+	
+	STARPU_PTHREAD_MUTEX_LOCK(&tmp->mutex);
+
+	/* leveldb need a string to store datas */
+	std::string value;
+	leveldb::Status s = base_tmp->db->Get(leveldb::ReadOptions(), tmp->key, &value);
+	uintptr_t value_read = (uintptr_t)(value.c_str());
+
+	/* use buffer */
+	if(s.ok())
+		memcpy(buf, (void *) (value_read+offset), size);
+	else
+		STARPU_ASSERT(s.ok());
+
+	STARPU_PTHREAD_MUTEX_UNLOCK(&tmp->mutex);
+
+	return 0;
+}
+
+static int
+starpu_leveldb_full_read(unsigned node, void *base, void * obj, void ** ptr, size_t * size)
+{
+        struct starpu_leveldb_obj * tmp = (struct starpu_leveldb_obj *) obj;
+        struct starpu_leveldb_base * base_tmp = (struct starpu_leveldb_base *) base;
+
+	*size = tmp->size;
+	*ptr = (size_t *)malloc(*size);
+	return _starpu_disk_read(node, STARPU_MAIN_RAM, obj, *ptr, 0, *size, NULL);
+}
+
+/* write on the memory disk */
+static int 
+starpu_leveldb_write (void *base, void *obj, const void *buf, off_t offset, size_t size, void * async_channel)
+{
+        struct starpu_leveldb_obj * tmp = (struct starpu_leveldb_obj *) obj;
+        struct starpu_leveldb_base * base_tmp = (struct starpu_leveldb_base *) base;
+
+	STARPU_PTHREAD_MUTEX_LOCK(&tmp->mutex);
+
+	uintptr_t buf_tmp = (uintptr_t) buf;
+	void * buffer = (void *) malloc((tmp->size > size) ? tmp->size : size);
+
+	/* we read the data */
+        std::string value;
+
+        leveldb::Status s = base_tmp->db->Get(leveldb::ReadOptions(), tmp->key, &value);
+        uintptr_t value_read = (uintptr_t)(value.c_str());
+
+        if(s.ok())
+                memcpy(buffer, (void *) value_read, tmp->size);
+        else
+                STARPU_ASSERT(s.ok());
+
+	/* put the new data on their new place */
+	memcpy(buffer, (void *) (buf_tmp+offset), size); 
+
+	/* and write them */
+	s = base_tmp->db->Put(leveldb::WriteOptions(), tmp->key, (char *)buffer);
+	STARPU_ASSERT(s.ok());	
+
+	/* if the new size is higher than the old, we update it - first write after the alloc */
+	tmp->size = (tmp->size > size) ? tmp->size : size;
+	free(buffer);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&tmp->mutex);
+
+	return 0;
+}
+
+static int
+starpu_leveldb_full_write (unsigned node, void * base, void * obj, void * ptr, size_t size)
+{
+	struct starpu_leveldb_obj * tmp = (struct starpu_leveldb_obj *) obj;
+	struct starpu_leveldb_base * base_tmp = (struct starpu_leveldb_base *) base;
+	
+	/* update file size to realise the next good full_read */
+	if(size != tmp->size)
+	{
+		_starpu_memory_manager_deallocate_size(tmp->size, node);
+		if (_starpu_memory_manager_can_allocate_size(size, node))
+			tmp->size = size;
+		else
+			STARPU_ASSERT_MSG(0, "Can't allocate size %u on the disk !", (int) size); 
+	}	
+	leveldb::WriteOptions write_options;
+	write_options.sync = true;
+
+        leveldb::Status s = base_tmp->db->Put(write_options, tmp->key, (char *)ptr);
+	STARPU_ASSERT(s.ok());
+	return 0;
+}
+
+
+/* create a new copy of parameter == base */
+static void * 
+starpu_leveldb_plug (void *parameter)
+{
+	struct starpu_leveldb_base * tmp = (struct starpu_leveldb_base *) malloc(sizeof(struct starpu_leveldb_base));
+	STARPU_ASSERT(tmp != NULL);
+
+	leveldb::Status status;
+	leveldb::DB* db;
+	leveldb::Options options;
+	options.create_if_missing = true;
+	
+	/* try to create the database */
+	options.error_if_exists = true;
+	status = leveldb::DB::Open(options, (char *) parameter, &db);
+	tmp->created = true;
+	
+	/* if it has already been created  before */
+	if (!status.ok())
+	{
+		options.error_if_exists = false;
+		status = leveldb::DB::Open(options, (char *) parameter, &db);
+                STARPU_ASSERT_MSG(status.ok(), "StarPU leveldb plug failed !");
+		tmp->created = false;
+	}
+
+	tmp->db = db;
+	STARPU_ASSERT(status.ok());
+	return (void *) tmp;	
+}
+
+
+/* free memory allocated for the base */
+static void
+starpu_leveldb_unplug (void *base)
+{
+	struct starpu_leveldb_base * base_tmp = (struct starpu_leveldb_base *) base;
+	if(base_tmp->created)
+		delete base_tmp->db;
+	free(base);
+}
+
+
+static int
+get_leveldb_bandwidth_between_disk_and_main_ram(unsigned node)
+{
+
+	unsigned iter;
+	double timing_slowness, timing_latency;
+	struct timeval start;
+	struct timeval end;
+	
+	srand (time (NULL)); 
+	char * buf = (char *) malloc(SIZE_DISK_MIN*sizeof(char));
+	STARPU_ASSERT(buf != NULL);
+	
+	/* allocate memory */
+	void * mem = _starpu_disk_alloc(node, SIZE_DISK_MIN);
+	/* fail to alloc */
+	if (mem == NULL)
+		return 0;
+	struct starpu_leveldb_obj * tmp = (struct starpu_leveldb_obj *) mem;
+
+	/* Measure upload slowness */
+	gettimeofday(&start, NULL);
+	for (iter = 0; iter < NITER; ++iter)
+	{
+		_starpu_disk_write(STARPU_MAIN_RAM, node, mem, buf, 0, SIZE_DISK_MIN, NULL);
+	}
+	gettimeofday(&end, NULL);
+	timing_slowness = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+
+
+	/* free memory */
+	free(buf);
+
+	buf = (char *) malloc(sizeof(char));
+	STARPU_ASSERT(buf != NULL);
+
+	/* Measure latency */
+	gettimeofday(&start, NULL);
+	for (iter = 0; iter < NITER; ++iter)
+	{
+		_starpu_disk_write(STARPU_MAIN_RAM, node, mem, buf, rand() % (SIZE_DISK_MIN -1) , 1, NULL);
+	}
+	gettimeofday(&end, NULL);
+	timing_latency = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+
+	_starpu_disk_free(node, mem, SIZE_DISK_MIN);
+	free(buf);
+
+	_starpu_save_bandwidth_and_latency_disk((NITER/timing_slowness)*1000000, (NITER/timing_slowness)*1000000,
+					       timing_latency/NITER, timing_latency/NITER, node);
+	return 1;
+}
+
+#if __cplusplus >= 201103L
+struct starpu_disk_ops starpu_disk_leveldb_ops = {
+	.alloc = starpu_leveldb_alloc,
+	.free = starpu_leveldb_free,
+	.open = starpu_leveldb_open,
+	.close = starpu_leveldb_close,
+	.read = starpu_leveldb_read,
+	.write = starpu_leveldb_write,
+	.async_write = NULL,
+	.async_read = NULL,
+	.plug = starpu_leveldb_plug,
+	.unplug = starpu_leveldb_unplug,
+	.copy = NULL,
+	.bandwidth = get_leveldb_bandwidth_between_disk_and_main_ram,
+	.wait_request = NULL,
+	.test_request = NULL,
+	.full_read = starpu_leveldb_full_read,
+	.full_write = starpu_leveldb_full_write
+};
+#else
+struct starpu_disk_ops starpu_disk_leveldb_ops = {
+	starpu_leveldb_alloc,
+	starpu_leveldb_free,
+	starpu_leveldb_open,
+	starpu_leveldb_close,
+	starpu_leveldb_read,
+	starpu_leveldb_write,
+	NULL,
+	NULL,
+	starpu_leveldb_plug,
+	starpu_leveldb_unplug,
+	NULL,
+	get_leveldb_bandwidth_between_disk_and_main_ram,
+	NULL,
+	NULL,
+	starpu_leveldb_full_read,
+	starpu_leveldb_full_write
+};
+#endif
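
Assuming the usual disk-registration entry point declared in starpu_disk.h, the new backend would be plugged in roughly as below; the directory and the size are placeholders, not part of the patch:

    /* sketch: expose a leveldb directory as an extra StarPU memory node */
    int node = starpu_disk_register(&starpu_disk_leveldb_ops,
                                    (void *) "/tmp/starpu_leveldb",
                                    200*1024*1024 /* 200 MiB */);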

+ 1 - 1
src/core/jobs.h

@@ -166,7 +166,7 @@ unsigned _starpu_enforce_deps_starting_from_task(struct _starpu_job *j);
 void _starpu_handle_job_termination(struct _starpu_job *j);
 
 /* Get the sum of the size of the data accessed by the job. */
-size_t _starpu_job_get_data_size(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, unsigned nimpl, struct _starpu_job *j);
+size_t _starpu_job_get_data_size(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, unsigned nimpl, struct _starpu_job *j);
 
 /* Get a task from the local pool of tasks that were explicitly attributed to
  * that worker. */

+ 47 - 28
src/core/perfmodel/perfmodel.c

@@ -48,7 +48,7 @@ unsigned _starpu_get_calibrate_flag(void)
 	return calibrate_flag;
 	return calibrate_flag;
 }
 }
 
 
-enum starpu_perfmodel_archtype starpu_worker_get_perf_archtype(int workerid)
+struct starpu_perfmodel_arch* starpu_worker_get_perf_archtype(int workerid)
 {
 {
 	struct _starpu_machine_config *config = _starpu_get_machine_config();
 	struct _starpu_machine_config *config = _starpu_get_machine_config();
 
 
@@ -56,26 +56,26 @@ enum starpu_perfmodel_archtype starpu_worker_get_perf_archtype(int workerid)
 	unsigned nworkers = config->topology.nworkers;
 	unsigned nworkers = config->topology.nworkers;
 
 
 	if (workerid < (int)config->topology.nworkers)
 	if (workerid < (int)config->topology.nworkers)
-		return config->workers[workerid].perf_arch;
+		return &config->workers[workerid].perf_arch;
 
 
 	/* We have a combined worker */
 	/* We have a combined worker */
 	unsigned ncombinedworkers = config->topology.ncombinedworkers;
 	unsigned ncombinedworkers = config->topology.ncombinedworkers;
 	STARPU_ASSERT(workerid < (int)(ncombinedworkers + nworkers));
 	STARPU_ASSERT(workerid < (int)(ncombinedworkers + nworkers));
-	return config->combined_workers[workerid - nworkers].perf_arch;
+	return &config->combined_workers[workerid - nworkers].perf_arch;
 }
 }
 
 
 /*
 /*
  * PER ARCH model
  * PER ARCH model
  */
  */
 
 
-static double per_arch_task_expected_perf(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, struct starpu_task *task, unsigned nimpl)
+static double per_arch_task_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch * arch, struct starpu_task *task, unsigned nimpl)
 {
 {
 	double exp = NAN;
 	double exp = NAN;
-	double (*per_arch_cost_function)(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl);
+	double (*per_arch_cost_function)(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl);
 	double (*per_arch_cost_model)(struct starpu_data_descr *);
 	double (*per_arch_cost_model)(struct starpu_data_descr *);
 
 
-	per_arch_cost_function = model->per_arch[arch][nimpl].cost_function;
-	per_arch_cost_model = model->per_arch[arch][nimpl].cost_model;
+	per_arch_cost_function = model->per_arch[arch->type][arch->devid][arch->ncore][nimpl].cost_function;
+	per_arch_cost_model = model->per_arch[arch->type][arch->devid][arch->ncore][nimpl].cost_model;
 
 
 	if (per_arch_cost_function)
 	if (per_arch_cost_function)
 		exp = per_arch_cost_function(task, arch, nimpl);
 		exp = per_arch_cost_function(task, arch, nimpl);
@@ -89,28 +89,31 @@ static double per_arch_task_expected_perf(struct starpu_perfmodel *model, enum s
  * Common model
  * Common model
  */
  */
 
 
-double starpu_worker_get_relative_speedup(enum starpu_perfmodel_archtype perf_archtype)
+double starpu_worker_get_relative_speedup(struct starpu_perfmodel_arch* perf_arch)
 {
 {
-	if (perf_archtype < STARPU_CUDA_DEFAULT)
+	if (perf_arch->type == STARPU_CPU_WORKER)
 	{
 	{
-		return _STARPU_CPU_ALPHA * (perf_archtype + 1);
+		return _STARPU_CPU_ALPHA * (perf_arch->ncore + 1);
 	}
 	}
-	else if (perf_archtype < STARPU_OPENCL_DEFAULT)
+	else if (perf_arch->type == STARPU_CUDA_WORKER)
 	{
 	{
 		return _STARPU_CUDA_ALPHA;
 		return _STARPU_CUDA_ALPHA;
 	}
 	}
-	else if (perf_archtype < STARPU_NARCH_VARIATIONS)
+	else if (perf_arch->type == STARPU_OPENCL_WORKER)
 	{
 	{
 		return _STARPU_OPENCL_ALPHA;
 		return _STARPU_OPENCL_ALPHA;
 	}
 	}
-
+	else if (perf_arch->type == STARPU_MIC_WORKER)
+	{
+		return _STARPU_MIC_ALPHA;
+	}
 	STARPU_ABORT();
 	STARPU_ABORT();
 
 
 	/* Never reached ! */
 	/* Never reached ! */
 	return NAN;
 	return NAN;
 }
 }
 
 
-static double common_task_expected_perf(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, struct starpu_task *task, unsigned nimpl)
+static double common_task_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, struct starpu_task *task, unsigned nimpl)
 {
 {
 	double exp;
 	double exp;
 	double alpha;
 	double alpha;
@@ -147,6 +150,8 @@ void _starpu_load_perfmodel(struct starpu_perfmodel *model)
 	if (!load_model)
 	if (!load_model)
 		return;
 		return;
 
 
+	initialize_model(model);
+
 	switch (model->type)
 	switch (model->type)
 	{
 	{
 		case STARPU_PER_ARCH:
 		case STARPU_PER_ARCH:
@@ -170,7 +175,7 @@ void _starpu_load_perfmodel(struct starpu_perfmodel *model)
 	model->is_loaded = 1;
 	model->is_loaded = 1;
 }
 }
 
 
-static double starpu_model_expected_perf(struct starpu_task *task, struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch,  unsigned nimpl)
+static double starpu_model_expected_perf(struct starpu_task *task, struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch,  unsigned nimpl)
 {
 {
 	if (model)
 	if (model)
 	{
 	{
@@ -203,19 +208,19 @@ static double starpu_model_expected_perf(struct starpu_task *task, struct starpu
 	return 0.0;
 	return 0.0;
 }
 }
 
 
-double starpu_task_expected_length(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+double starpu_task_expected_length(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 {
 
 
 	return starpu_model_expected_perf(task, task->cl->model, arch, nimpl);
 	return starpu_model_expected_perf(task, task->cl->model, arch, nimpl);
 }
 }
 
 
-double starpu_task_expected_power(struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+double starpu_task_expected_power(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 {
 	return starpu_model_expected_perf(task, task->cl->power_model, arch, nimpl);
 	return starpu_model_expected_perf(task, task->cl->power_model, arch, nimpl);
 }
 }
 
 
 double starpu_task_expected_conversion_time(struct starpu_task *task,
 double starpu_task_expected_conversion_time(struct starpu_task *task,
-					    enum starpu_perfmodel_archtype arch,
+					    struct starpu_perfmodel_arch* arch,
 					    unsigned nimpl)
 					    unsigned nimpl)
 {
 {
 	unsigned i;
 	unsigned i;
@@ -230,14 +235,28 @@ double starpu_task_expected_conversion_time(struct starpu_task *task,
 		handle = STARPU_TASK_GET_HANDLE(task, i);
 		handle = STARPU_TASK_GET_HANDLE(task, i);
 		if (!_starpu_data_is_multiformat_handle(handle))
 		if (!_starpu_data_is_multiformat_handle(handle))
 			continue;
 			continue;
-
-		if (arch < STARPU_CUDA_DEFAULT)
-			node_kind = STARPU_CPU_RAM;
-		else if (arch < STARPU_OPENCL_DEFAULT)
-			node_kind = STARPU_CUDA_RAM;
-		else
-			node_kind = STARPU_OPENCL_RAM;
-
+		
+		switch(arch->type)
+		{
+			case STARPU_CPU_WORKER:
+				node_kind = STARPU_CPU_RAM;
+				break;
+			case STARPU_CUDA_WORKER:
+				node_kind = STARPU_CUDA_RAM;
+				break;
+			case STARPU_OPENCL_WORKER:
+				node_kind = STARPU_OPENCL_RAM;
+				break;
+			case STARPU_MIC_WORKER:
+				node_kind = STARPU_MIC_RAM;
+				break;
+			case STARPU_SCC_WORKER:
+				node_kind = STARPU_SCC_RAM;
+				break;
+			default:
+				STARPU_ABORT();
+				break;
+		}
 		if (!_starpu_handle_needs_conversion_task_for_arch(handle, node_kind))
 		if (!_starpu_handle_needs_conversion_task_for_arch(handle, node_kind))
 			continue;
 			continue;
 
 
@@ -297,7 +316,7 @@ double starpu_task_expected_data_transfer_time(unsigned memory_node, struct star
 }
 }
 
 
 /* Return the expected duration of the entire task bundle in µs */
 /* Return the expected duration of the entire task bundle in µs */
-double starpu_task_bundle_expected_length(starpu_task_bundle_t bundle, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+double starpu_task_bundle_expected_length(starpu_task_bundle_t bundle, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 {
 	double expected_length = 0.0;
 	double expected_length = 0.0;
 
 
@@ -328,7 +347,7 @@ double starpu_task_bundle_expected_length(starpu_task_bundle_t bundle, enum star
 }
 }
 
 
 /* Return the expected power consumption of the entire task bundle in J */
 /* Return the expected power consumption of the entire task bundle in J */
-double starpu_task_bundle_expected_power(starpu_task_bundle_t bundle, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+double starpu_task_bundle_expected_power(starpu_task_bundle_t bundle, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 {
 	double expected_power = 0.0;
 	double expected_power = 0.0;
 
 

+ 14 - 4
src/core/perfmodel/perfmodel.h

@@ -24,6 +24,11 @@
 #include <core/task_bundle.h>
 #include <stdio.h>
 
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
 struct _starpu_perfmodel_list
 {
 	struct _starpu_perfmodel_list *next;
@@ -39,7 +44,7 @@ void _starpu_get_perf_model_dir_codelets(char *path, size_t maxlen);
 void _starpu_get_perf_model_dir_bus(char *path, size_t maxlen);
 void _starpu_get_perf_model_dir_debug(char *path, size_t maxlen);
 
-double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, struct _starpu_job *j, unsigned nimpl);
+double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, struct _starpu_job *j, unsigned nimpl);
 int _starpu_register_model(struct starpu_perfmodel *model);
 void _starpu_load_per_arch_based_model(struct starpu_perfmodel *model);
 void _starpu_load_common_based_model(struct starpu_perfmodel *model);
@@ -49,10 +54,10 @@ void _starpu_initialize_registered_performance_models(void);
 void _starpu_deinitialize_registered_performance_models(void);
 
 double _starpu_regression_based_job_expected_perf(struct starpu_perfmodel *model,
-					enum starpu_perfmodel_archtype arch, struct _starpu_job *j, unsigned nimpl);
+					struct starpu_perfmodel_arch* arch, struct _starpu_job *j, unsigned nimpl);
 double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfmodel *model,
-					enum starpu_perfmodel_archtype arch, struct _starpu_job *j, unsigned nimpl);
-void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch,
+					struct starpu_perfmodel_arch* arch, struct _starpu_job *j, unsigned nimpl);
+void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfmodel *model, struct starpu_perfmodel_arch * arch,
 				unsigned cpuid, double measured, unsigned nimpl);
 
 void _starpu_create_sampling_directory_if_needed(void);
@@ -72,4 +77,9 @@ int *_starpu_get_opencl_affinity_vector(unsigned gpuid);
 
 void _starpu_save_bandwidth_and_latency_disk(double bandwidth_write, double bandwidth_read, 
 					    double latency_write, double latency_read, unsigned node);
+
+#ifdef __cplusplus
+}
+#endif
+
 #endif // __PERFMODEL_H__
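
The per-arch table gains device and core dimensions; a lookup that used to be per_arch[arch][impl] now reads, as in the perfmodel.c hunk above:

    struct starpu_perfmodel_per_arch *pm =
        &model->per_arch[arch->type][arch->devid][arch->ncore][nimpl];
    if (pm->cost_function)
        exp = pm->cost_function(task, arch, nimpl);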

+ 312 - 289
src/core/perfmodel/perfmodel_history.c

@@ -52,13 +52,13 @@ struct starpu_perfmodel_history_table
 static starpu_pthread_rwlock_t registered_models_rwlock;
 static starpu_pthread_rwlock_t registered_models_rwlock;
 static struct _starpu_perfmodel_list *registered_models = NULL;
 static struct _starpu_perfmodel_list *registered_models = NULL;
 
 
-size_t _starpu_job_get_data_size(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, unsigned nimpl, struct _starpu_job *j)
+size_t _starpu_job_get_data_size(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, unsigned nimpl, struct _starpu_job *j)
 {
 {
 	struct starpu_task *task = j->task;
 	struct starpu_task *task = j->task;
 
 
-	if (model && model->per_arch[arch][nimpl].size_base)
+	if (model && model->per_arch[arch->type][arch->devid][arch->ncore][nimpl].size_base)
 	{
 	{
-		return model->per_arch[arch][nimpl].size_base(task, arch, nimpl);
+		return model->per_arch[arch->type][arch->devid][arch->ncore][nimpl].size_base(task, arch, nimpl);
 	}
 	}
 	else if (model && model->size_base)
 	else if (model && model->size_base)
 	{
 	{
@@ -103,11 +103,11 @@ static void insert_history_entry(struct starpu_perfmodel_history_entry *entry, s
 	HASH_ADD_UINT32_T(*history_ptr, footprint, table);
 	HASH_ADD_UINT32_T(*history_ptr, footprint, table);
 }
 }
 
 
-static void dump_reg_model(FILE *f, struct starpu_perfmodel *model, unsigned arch, unsigned nimpl)
+static void dump_reg_model(FILE *f, struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 {
 	struct starpu_perfmodel_per_arch *per_arch_model;
 	struct starpu_perfmodel_per_arch *per_arch_model;
 
 
-	per_arch_model = &model->per_arch[arch][nimpl];
+	per_arch_model = &model->per_arch[arch->type][arch->devid][arch->ncore][nimpl];
 	struct starpu_perfmodel_regression_model *reg_model;
 	struct starpu_perfmodel_regression_model *reg_model;
 	reg_model = &per_arch_model->regression;
 	reg_model = &per_arch_model->regression;
 
 
@@ -238,9 +238,11 @@ static void parse_per_arch_model_file(FILE *f, struct starpu_perfmodel_per_arch
 	int res = fscanf(f, "%u\n", &nentries);
 	int res = fscanf(f, "%u\n", &nentries);
 	STARPU_ASSERT_MSG(res == 1, "Incorrect performance model file");
 	STARPU_ASSERT_MSG(res == 1, "Incorrect performance model file");
 
 
+	_STARPU_DEBUG("nentries:%u\n", nentries);
+
 	scan_reg_model(f, &per_arch_model->regression);
 	scan_reg_model(f, &per_arch_model->regression);
 
 
-	/* parse cpu entries */
+	/* parse entries */
 	unsigned i;
 	unsigned i;
 	for (i = 0; i < nentries; i++)
 	for (i = 0; i < nentries; i++)
 	{
 	{
@@ -261,135 +263,147 @@ static void parse_per_arch_model_file(FILE *f, struct starpu_perfmodel_per_arch
 	}
 	}
 }
 }
 
 
-static void parse_arch(FILE *f, struct starpu_perfmodel *model, unsigned scan_history, unsigned archmin, unsigned archmax, unsigned skiparch)
+
+static void parse_arch(FILE *f, struct starpu_perfmodel *model, unsigned scan_history,struct starpu_perfmodel_arch* arch)
 {
 {
 	struct starpu_perfmodel_per_arch dummy;
 	struct starpu_perfmodel_per_arch dummy;
-	int nimpls, implmax, skipimpl, impl;
-	unsigned ret, arch;
+	unsigned nimpls, implmax, impl, i, ret;
+	_STARPU_DEBUG("Parsing %s_%u_ncore_%u\n", 
+			starpu_perfmodel_get_archtype_name(arch->type),
+			arch->devid,
+			arch->ncore);	
 
 
-	for (arch = archmin; arch < archmax; arch++)
-	{
-		_STARPU_DEBUG("Parsing arch %u\n", arch);
-		_starpu_drop_comments(f);
-		ret = fscanf(f, "%d\n", &nimpls);
-		_STARPU_DEBUG("%d implementations\n", nimpls);
-		STARPU_ASSERT_MSG(ret == 1, "Incorrect performance model file");
-		implmax = STARPU_MIN(nimpls, STARPU_MAXIMPLEMENTATIONS);
-		skipimpl = nimpls - STARPU_MAXIMPLEMENTATIONS;
-		for (impl = 0; impl < implmax; impl++)
-		{
-			parse_per_arch_model_file(f, &model->per_arch[arch][impl], scan_history);
-		}
-		if (skipimpl > 0)
-		{
-			for (impl = 0; impl < skipimpl; impl++)
-			{
-				parse_per_arch_model_file(f, &dummy, 0);
-			}
-		}
-	}
+	/* Parsing number of implementation */
+	_starpu_drop_comments(f);
+	ret = fscanf(f, "%u\n", &nimpls);
+	STARPU_ASSERT_MSG(ret == 1, "Incorrect performance model file");
+
+	/* Parsing each implementation */
+	implmax = STARPU_MIN(nimpls, STARPU_MAXIMPLEMENTATIONS);
+	for (impl = 0; impl < implmax; impl++)
+		parse_per_arch_model_file(f, &model->per_arch[arch->type][arch->devid][arch->ncore][impl], scan_history);
+
+	/* if the number of implementation is greater than STARPU_MAXIMPLEMENTATIONS
+	 * we skip the last implementation */
+	if (impl < nimpls)
+		for (i = impl; impl < nimpls; i++)
+			parse_per_arch_model_file(f, &dummy, 0);
 
 
-	if (skiparch > 0)
-	{
-		_starpu_drop_comments(f);
-		for (arch = 0; arch < skiparch; arch ++)
-		{
-			_STARPU_DEBUG("skipping arch %u\n", arch);
-			ret = fscanf(f, "%d\n", &nimpls);
-			_STARPU_DEBUG("%d implementations\n", nimpls);
-			STARPU_ASSERT_MSG(ret == 1, "Incorrect performance model file");
-			implmax = STARPU_MIN(nimpls, STARPU_MAXIMPLEMENTATIONS);
-			skipimpl = nimpls - STARPU_MAXIMPLEMENTATIONS;
-			for (impl = 0; impl < implmax; impl++)
-			{
-				parse_per_arch_model_file(f, &dummy, 0);
-			}
-			if (skipimpl > 0)
-			{
-				for (impl = 0; impl < skipimpl; impl++)
-				{
-					parse_per_arch_model_file(f, &dummy, 0);
-				}
-			}
-		}
-	}
 }
 }
 
 
-static void parse_model_file(FILE *f, struct starpu_perfmodel *model, unsigned scan_history)
+static void skip_parse_arch(FILE *f, struct starpu_perfmodel_arch* arch)
 {
 {
-	unsigned ret;
-	unsigned archmin = 0;
-	unsigned narchs;
-
-	/* We could probably write a clean loop here, but the code would not
-	 * really be easier to read. */
+	struct starpu_perfmodel_per_arch dummy;
+	unsigned nimpls, impl, ret;
+	_STARPU_DEBUG("Skiping %s_%u_ncore_%u\n", 
+			starpu_perfmodel_get_archtype_name(arch->type),
+			arch->devid,
+			arch->ncore);
 
 
-	/* Parsing CPUs */
+	/* Parsing number of implementation */
 	_starpu_drop_comments(f);
 	_starpu_drop_comments(f);
-	ret = fscanf(f, "%u\n", &narchs);
+	ret = fscanf(f, "%u\n", &nimpls);
 	STARPU_ASSERT_MSG(ret == 1, "Incorrect performance model file");
 	STARPU_ASSERT_MSG(ret == 1, "Incorrect performance model file");
 
 
-	_STARPU_DEBUG("Parsing %u CPUs\n", narchs);
-	if (narchs > 0)
-	{
-		parse_arch(f, model, scan_history,
-			   archmin,
-			   STARPU_MIN(narchs, STARPU_MAXCPUS),
-			   narchs > STARPU_MAXCPUS ? narchs - STARPU_MAXCPUS : 0);
-	}
+	/* Skiping each implementation */
+	for (impl = 0; impl < nimpls; impl++)
+		parse_per_arch_model_file(f, &dummy, 0);
+}
 
 
-	/* Parsing CUDA devs */
+
+static void parse_device(FILE *f, struct starpu_perfmodel *model, unsigned scan_history, enum starpu_worker_archtype archtype, unsigned devid)
+{
+	unsigned maxncore, ncore, i, ret;
+	struct starpu_perfmodel_arch arch;
+	arch.type = archtype;
+	arch.devid = devid;
+	_STARPU_DEBUG("Parsing device %s_%u arch\n",  
+			starpu_perfmodel_get_archtype_name(archtype),
+			devid);
+
+	/* Parsing maximun number of worker for this device */
 	_starpu_drop_comments(f);
 	_starpu_drop_comments(f);
-	ret = fscanf(f, "%u\n", &narchs);
+	ret = fscanf(f, "%u\n", &maxncore);
 	STARPU_ASSERT_MSG(ret == 1, "Incorrect performance model file");
 	STARPU_ASSERT_MSG(ret == 1, "Incorrect performance model file");
-	archmin += STARPU_MAXCPUS;
-	_STARPU_DEBUG("Parsing %u CUDA devices\n", narchs);
-	if (narchs > 0)
+	
+	/* Parsing each arch */
+	for(ncore=0; ncore < maxncore && model->per_arch[archtype][devid][ncore] != NULL; ncore++)
 	{
 	{
-		parse_arch(f, model, scan_history,
-			   archmin,
-			   archmin + STARPU_MIN(narchs, STARPU_MAXCUDADEVS),
-			   narchs > STARPU_MAXCUDADEVS ? narchs - STARPU_MAXCUDADEVS : 0);
+		arch.ncore = ncore;
+		parse_arch(f,model,scan_history,&arch);
 	}
 	}
 
 
-	/* Parsing OpenCL devs */
+	/* if there is less workers on the current device than in the perfmodel_file
+	 * we skip the last workers */
+	if(ncore < maxncore)
+		for(i=ncore; i<maxncore; i++)
+		{
+			arch.ncore = ncore;
+			skip_parse_arch(f,&arch);
+		}
+}
+
+
+static void skip_parse_device(FILE *f, enum starpu_worker_archtype archtype, unsigned devid)
+{
+	unsigned maxncore, ncore, ret;
+	struct starpu_perfmodel_arch arch;
+	arch.type = archtype;
+	arch.devid = devid;
+	_STARPU_DEBUG("Skiping device %s_%u arch\n", 
+			starpu_perfmodel_get_archtype_name(archtype),
+			devid);
+
+	/* Parsing maximun number of worker for this device */
 	_starpu_drop_comments(f);
 	_starpu_drop_comments(f);
-	ret = fscanf(f, "%u\n", &narchs);
+	ret = fscanf(f, "%u\n", &maxncore);
 	STARPU_ASSERT_MSG(ret == 1, "Incorrect performance model file");
 	STARPU_ASSERT_MSG(ret == 1, "Incorrect performance model file");
 
 
-	archmin += STARPU_MAXCUDADEVS;
-	_STARPU_DEBUG("Parsing %u OpenCL devices\n", narchs);
-	if (narchs > 0)
+	/* Skiping each arch */
+	for(ncore=0; ncore < maxncore; ncore++)
 	{
 	{
-		parse_arch(f, model, scan_history,
-			   archmin,
-			   archmin + STARPU_MIN(narchs, STARPU_MAXOPENCLDEVS),
-			   narchs > STARPU_MAXOPENCLDEVS ? narchs - STARPU_MAXOPENCLDEVS : 0);
+		arch.ncore = ncore;
+		skip_parse_arch(f,&arch);
 	}
 	}
+}
+
+static void parse_archtype(FILE *f, struct starpu_perfmodel *model, unsigned scan_history, enum starpu_worker_archtype archtype)
+{
+	unsigned ndevice, devid, i, ret;
+	_STARPU_DEBUG("Parsing %s arch\n", starpu_perfmodel_get_archtype_name(archtype));
 
 
-	/* Parsing MIC devs */
+	/* Parsing number of device for this archtype */
 	_starpu_drop_comments(f);
 	_starpu_drop_comments(f);
-	ret = fscanf(f, "%u\n", &narchs);
-	if (ret == 0)
-		narchs = 0;
+	ret = fscanf(f, "%u\n", &ndevice);
+	STARPU_ASSERT_MSG(ret == 1, "Incorrect performance model file");
+
+	/* Parsing each device for this archtype*/
+	for(devid=0; devid < ndevice && model->per_arch[archtype][devid] != NULL; devid++)
+		parse_device(f,model,scan_history,archtype,devid);
+
+	/* if there is more devices on the current machine than in the perfmodel_file
+	 * we skip the last devices */
+	if(devid < ndevice)
+		for(i=devid; i<ndevice; i++) 
+			skip_parse_device(f,archtype,devid);
 
 
-	archmin += STARPU_MAXOPENCLDEVS;
-	_STARPU_DEBUG("Parsing %u MIC devices\n", narchs);
-	if (narchs > 0)
-	{
-		parse_arch(f, model, scan_history,
-			   archmin,
-			   archmin + STARPU_MIN(narchs, STARPU_MAXMICDEVS),
-			   narchs > STARPU_MAXMICDEVS ? narchs - STARPU_MAXMICDEVS : 0);
-	}
 }
 }
 
 
+static void parse_model_file(FILE *f, struct starpu_perfmodel *model, unsigned scan_history)
+{
+	unsigned archtype;
+	_STARPU_DEBUG("Start parsing\n");
+
+	/* Parsing each kind of archtype */
+	for(archtype=0; archtype<STARPU_NARCH; archtype++)
+		parse_archtype(f, model, scan_history, archtype);
+}
 
 
-static void dump_per_arch_model_file(FILE *f, struct starpu_perfmodel *model, unsigned arch, unsigned nimpl)
+static void dump_per_arch_model_file(FILE *f, struct starpu_perfmodel *model, struct starpu_perfmodel_arch * arch, unsigned nimpl)
 {
 {
 	struct starpu_perfmodel_per_arch *per_arch_model;
 	struct starpu_perfmodel_per_arch *per_arch_model;
 
 
-	per_arch_model = &model->per_arch[arch][nimpl];
+	per_arch_model = &model->per_arch[arch->type][arch->devid][arch->ncore][nimpl];
 	/* count the number of elements in the lists */
 	/* count the number of elements in the lists */
 	struct starpu_perfmodel_history_list *ptr = NULL;
 	struct starpu_perfmodel_history_list *ptr = NULL;
 	unsigned nentries = 0;
 	unsigned nentries = 0;
@@ -407,7 +421,8 @@ static void dump_per_arch_model_file(FILE *f, struct starpu_perfmodel *model, un
 
 
 	/* header */
 	/* header */
 	char archname[32];
 	char archname[32];
-	starpu_perfmodel_get_arch_name((enum starpu_perfmodel_archtype) arch, archname, 32, nimpl);
+	starpu_perfmodel_get_arch_name(arch, archname, 32, nimpl);
+	fprintf(f, "#####\n");
 	fprintf(f, "# Model for %s\n", archname);
 	fprintf(f, "# Model for %s\n", archname);
 	fprintf(f, "# number of entries\n%u\n", nentries);
 	fprintf(f, "# number of entries\n%u\n", nentries);
 
 
@@ -425,13 +440,13 @@ static void dump_per_arch_model_file(FILE *f, struct starpu_perfmodel *model, un
 		}
 		}
 	}
 	}
 
 
-	fprintf(f, "\n##################\n");
+	fprintf(f, "\n");
 }
 }
 
 
-static unsigned get_n_entries(struct starpu_perfmodel *model, unsigned arch, unsigned impl)
+static unsigned get_n_entries(struct starpu_perfmodel *model, struct starpu_perfmodel_arch * arch, unsigned impl)
 {
 {
 	struct starpu_perfmodel_per_arch *per_arch_model;
 	struct starpu_perfmodel_per_arch *per_arch_model;
-	per_arch_model = &model->per_arch[arch][impl];
+	per_arch_model = &model->per_arch[arch->type][arch->devid][arch->ncore][impl];
 	/* count the number of elements in the lists */
 	/* count the number of elements in the lists */
 	struct starpu_perfmodel_history_list *ptr = NULL;
 	struct starpu_perfmodel_history_list *ptr = NULL;
 	unsigned nentries = 0;
 	unsigned nentries = 0;
@@ -451,123 +466,94 @@ static unsigned get_n_entries(struct starpu_perfmodel *model, unsigned arch, uns
 
 
 static void dump_model_file(FILE *f, struct starpu_perfmodel *model)
 static void dump_model_file(FILE *f, struct starpu_perfmodel *model)
 {
 {
-	unsigned narch[4] = { 0, 0, 0, 0};
-	unsigned arch, arch_base = 0, my_narch = 0;
-	unsigned nimpl;
-	unsigned idx = 0;
-
-	/* Finding the number of archs to write for each kind of device */
-	for (arch = 0; arch < STARPU_NARCH_VARIATIONS; arch++)
-	{
-		switch (arch)
-		{
-			case STARPU_CUDA_DEFAULT:
-			case STARPU_OPENCL_DEFAULT:
-			case STARPU_MIC_DEFAULT:
-				arch_base = arch;
-				idx++;
-				break;
-			default:
-				break;
-		}
-
-		if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED)
-		{
-			for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
-				if (get_n_entries(model, arch, nimpl))
-				{
-					narch[idx]=arch-arch_base+1;
-					break;
-				}
-		}
-		else if (model->type == STARPU_REGRESSION_BASED || model->type == STARPU_PER_ARCH || model->type == STARPU_COMMON)
-		{
-			for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
-				if (model->per_arch[arch][nimpl].regression.nsample)
-				{
-					narch[idx]=arch-arch_base+1;
-					break;
-				}
-		}
-		else
-		{
-			STARPU_ASSERT_MSG(0, "Unknown history-based performance model %d", model->type);
-		}
-	}
-
-	/* Writing stuff */
-
+	struct _starpu_machine_config *conf = _starpu_get_machine_config();
 	char *name = "unknown";
 	char *name = "unknown";
-	unsigned substract_to_arch = 0;
-	for (arch = 0; arch < STARPU_NARCH_VARIATIONS; arch++)
-	{
-		unsigned char arch_already_visited = 0;
+	unsigned archtype, ndevice, *ncore, devid, nc, nimpl;
+	struct starpu_perfmodel_arch arch;
 
 
-		switch (arch)
+	for(archtype=0; archtype<STARPU_NARCH; archtype++)
+	{
+		arch.type = archtype;
+		switch (archtype)
 		{
 		{
-			case STARPU_CPU_DEFAULT:
+			case STARPU_CPU_WORKER:
+				ndevice = 1;
+				ncore = &conf->topology.ncpus;
 				name = "CPU";
 				name = "CPU";
-				my_narch = narch[0];
 				break;
 				break;
-			case STARPU_CUDA_DEFAULT:
+			case STARPU_CUDA_WORKER:
+				ndevice = conf->topology.ncudagpus;
+				ncore = NULL;
 				name = "CUDA";
 				name = "CUDA";
-				substract_to_arch = STARPU_MAXCPUS;
-				my_narch = narch[1];
 				break;
 				break;
-			case STARPU_OPENCL_DEFAULT:
+			case STARPU_OPENCL_WORKER:
+				ndevice = conf->topology.nopenclgpus;
+				ncore = NULL;
 				name = "OPENCL";
 				name = "OPENCL";
-				my_narch = narch[2];
 				break;
 				break;
-			case STARPU_MIC_DEFAULT:
+			case STARPU_MIC_WORKER:
+				ndevice = conf->topology.nmicdevices;
+				ncore = conf->topology.nmiccores;
 				name = "MIC";
 				name = "MIC";
-				my_narch = narch[3];
+				break;
+			case STARPU_SCC_WORKER:
+				ndevice = conf->topology.nsccdevices;
+				ncore = NULL;
+				name = "SCC";
 				break;
 				break;
 			default:
 			default:
-				/* The current worker arch was already written,
-				 * we don't need to write it again */
-				arch_already_visited = 1;
+				/* Unknown arch */
+				STARPU_ABORT();
 				break;
 				break;
 		}
 		}
-
-		if (!arch_already_visited)
+			
+		fprintf(f, "####################\n");
+		fprintf(f, "# %ss\n", name);
+		fprintf(f, "# number of %s devices\n", name);
+		fprintf(f, "%u\n", ndevice);
+		
+
+		for(devid=0; devid<ndevice; devid++)
 		{
 		{
-			arch_base = arch;
-			fprintf(f, "##################\n");
-			fprintf(f, "# %ss\n", name);
-			fprintf(f, "# number of %s architectures\n", name);
-			fprintf(f, "%u\n", my_narch);
-		}
+			arch.devid = devid;
+			fprintf(f, "###############\n");
+			fprintf(f, "# %s_%u\n", name, devid); 
+			fprintf(f, "# number of workers on %s_%d device\n", name, devid);
+			if(ncore != NULL)
+				fprintf(f, "%u\n", ncore[devid]);
+			else
+				fprintf(f, "1\n");
+			for(nc=0; model->per_arch[archtype][devid][nc] != NULL; nc++)
+			{
 
 
-		unsigned max_impl = 0;
-		if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED)
-		{
-			for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
-				if (get_n_entries(model, arch, nimpl))
-					max_impl = nimpl + 1;
-		}
-		else if (model->type == STARPU_REGRESSION_BASED || model->type == STARPU_PER_ARCH || model->type == STARPU_COMMON)
-		{
-			for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
-				if (model->per_arch[arch][nimpl].regression.nsample)
-					max_impl = nimpl + 1;
-		}
-		else
-			STARPU_ASSERT_MSG(0, "Unknown history-based performance model %u", arch);
+				arch.ncore = nc;
+				unsigned max_impl = 0;
+				if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED)
+				{
+					for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
+						if (get_n_entries(model, &arch, nimpl))
+							max_impl = nimpl + 1;
+				}
+				else if (model->type == STARPU_REGRESSION_BASED || model->type == STARPU_PER_ARCH || model->type == STARPU_COMMON)
+				{
+					for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
+						if (model->per_arch[archtype][devid][nc][nimpl].regression.nsample)
+							max_impl = nimpl + 1;
+				}
+				else
+					STARPU_ASSERT_MSG(0, "Unknown history-based performance model %u", archtype);
 
 
-		if (arch >= my_narch + arch_base)
-			continue;
 
 
-		fprintf(f, "###########\n");
-		if (substract_to_arch)
-			fprintf(f, "# %s_%u\n", name, arch - substract_to_arch);
-		else
-			/* CPU */
-			fprintf(f, "# %u CPU(s) in parallel\n", arch + 1);
-		fprintf(f, "# number of implementations\n");
-		fprintf(f, "%u\n", max_impl);
-		for (nimpl = 0; nimpl < max_impl; nimpl++)
-		{
-			dump_per_arch_model_file(f, model, arch, nimpl);
+				fprintf(f, "##########\n");
+				fprintf(f, "# %u worker(s) in parallel\n", nc+1);
+
+				fprintf(f, "# number of implementations\n");
+				fprintf(f, "%u\n", max_impl);
+				for (nimpl = 0; nimpl < max_impl; nimpl++)
+				{
+					dump_per_arch_model_file(f, model, &arch, nimpl);
+				}
+			}
 		}
 		}
 	}
 	}
 }
 }
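
For reference, the headers emitted by dump_model_file now nest as follows; this is an illustrative excerpt of the CUDA section for a machine with one CUDA device, with the per-entry lines elided:

####################
# CUDAs
# number of CUDA devices
1
###############
# CUDA_0
# number of workers on CUDA_0 device
1
##########
# 1 worker(s) in parallel
# number of implementations
1
#####
# Model for cuda_0_ncore_0_impl_0
# number of entries
...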
@@ -579,19 +565,48 @@ static void initialize_per_arch_model(struct starpu_perfmodel_per_arch *per_arch
 	per_arch_model->regression.nsample = 0;
 	per_arch_model->regression.nsample = 0;
 	per_arch_model->regression.valid = 0;
 	per_arch_model->regression.valid = 0;
 	per_arch_model->regression.nl_valid = 0;
 	per_arch_model->regression.nl_valid = 0;
+	per_arch_model->size_base = NULL;
 }
 }
 
 
-static void initialize_model(struct starpu_perfmodel *model)
+
+
+static struct starpu_perfmodel_per_arch*** initialize_arch_model(int maxdevid, unsigned* maxncore_table)
 {
 {
-	unsigned arch;
-	unsigned nimpl;
-	for (arch = 0; arch < STARPU_NARCH_VARIATIONS; arch++)
+	int devid, ncore, nimpl;
+	struct starpu_perfmodel_per_arch *** arch_model = malloc(sizeof(*arch_model)*(maxdevid+1));
+	arch_model[maxdevid] = NULL;
+	for(devid=0; devid<maxdevid; devid++)
 	{
 	{
-		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
+		int maxncore;
+		if(maxncore_table != NULL)
+			maxncore = maxncore_table[devid];
+		else
+			maxncore = 1;
+		
+		arch_model[devid] = malloc(sizeof(*arch_model[devid])*(maxncore+1));
+		arch_model[devid][maxncore] = NULL;
+		for(ncore=0; ncore<maxncore; ncore++)
 		{
 		{
-			initialize_per_arch_model(&model->per_arch[arch][nimpl]);
+			arch_model[devid][ncore] = malloc(sizeof(*arch_model[devid][ncore])*STARPU_MAXIMPLEMENTATIONS);
+			for(nimpl=0; nimpl<STARPU_MAXIMPLEMENTATIONS; nimpl++)
+			{
+				initialize_per_arch_model(&arch_model[devid][ncore][nimpl]);
+			}
 		}
 		}
 	}
 	}
+	return arch_model;
+}
+
+void initialize_model(struct starpu_perfmodel *model)
+{
+	struct _starpu_machine_config *conf = _starpu_get_machine_config();
+	model->per_arch = malloc(sizeof(*model->per_arch)*(STARPU_NARCH));
+
+	model->per_arch[STARPU_CPU_WORKER] = initialize_arch_model(1,&conf->topology.ncpus); 
+	model->per_arch[STARPU_CUDA_WORKER] = initialize_arch_model(conf->topology.ncudagpus,NULL); 
+	model->per_arch[STARPU_OPENCL_WORKER] = initialize_arch_model(conf->topology.nopenclgpus,NULL); 
+	model->per_arch[STARPU_MIC_WORKER] = initialize_arch_model(conf->topology.nmicdevices,conf->topology.nmiccores); 
+	model->per_arch[STARPU_SCC_WORKER] = initialize_arch_model(conf->topology.nsccdevices,NULL); 
 }
 }
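
Note: a minimal sketch (not part of the patch; the helper name is made up) of how the NULL-terminated tables built by initialize_arch_model() are meant to be traversed without storing the dimensions, mirroring the loops used in _starpu_register_model and _starpu_deinitialize_performance_model below:

static void visit_model(struct starpu_perfmodel *model)
{
	unsigned archtype, devid, ncore, nimpl;
	for (archtype = 0; archtype < STARPU_NARCH; archtype++)
	{
		if (model->per_arch[archtype] == NULL)
			continue;
		for (devid = 0; model->per_arch[archtype][devid] != NULL; devid++)
			for (ncore = 0; model->per_arch[archtype][devid][ncore] != NULL; ncore++)
				for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
				{
					struct starpu_perfmodel_per_arch *per_arch =
						&model->per_arch[archtype][devid][ncore][nimpl];
					(void) per_arch; /* dump, free, ... */
				}
	}
}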
 
 
 static void get_model_debug_path(struct starpu_perfmodel *model, const char *arch, char *path, size_t maxlen)
 static void get_model_debug_path(struct starpu_perfmodel *model, const char *arch, char *path, size_t maxlen)
@@ -649,13 +664,13 @@ int _starpu_register_model(struct starpu_perfmodel *model)
-	unsigned arch;
-	unsigned nimpl;
+	unsigned arch, devid, ncore, nimpl;
 
 
-	for (arch = 0; arch < STARPU_NARCH_VARIATIONS; arch++)
-	{
-		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
-		{
-			starpu_perfmodel_debugfilepath(model, arch, model->per_arch[arch][nimpl].debug_path, 256, nimpl);
-		}
-	}
+	for (arch = 0; arch < STARPU_NARCH; arch++)
+		for(devid=0; model->per_arch[arch][devid] != NULL; devid++)
+			for(ncore=0; model->per_arch[arch][devid][ncore] != NULL; ncore++)
+				for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
+				{
+					starpu_perfmodel_debugfilepath(model, arch, model->per_arch[arch][devid][ncore][nimpl].debug_path, 256, nimpl);
+				}
 #endif
 #endif
 
 
 	STARPU_PTHREAD_RWLOCK_UNLOCK(&registered_models_rwlock);
 	STARPU_PTHREAD_RWLOCK_UNLOCK(&registered_models_rwlock);
@@ -723,33 +738,44 @@ void _starpu_initialize_registered_performance_models(void)
 
 
 void _starpu_deinitialize_performance_model(struct starpu_perfmodel *model)
 void _starpu_deinitialize_performance_model(struct starpu_perfmodel *model)
 {
 {
-	unsigned arch;
-	unsigned nimpl;
+	unsigned arch, devid, ncore, nimpl;
 
 
-	for (arch = 0; arch < STARPU_NARCH_VARIATIONS; arch++)
+	for (arch = 0; arch < STARPU_NARCH; arch++)
 	{
 	{
-		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
+		if( model->per_arch[arch] != NULL)
 		{
 		{
-			struct starpu_perfmodel_per_arch *archmodel = &model->per_arch[arch][nimpl];
-			struct starpu_perfmodel_history_list *list, *plist;
-			struct starpu_perfmodel_history_table *entry, *tmp;
-
-			HASH_ITER(hh, archmodel->history, entry, tmp)
+			for(devid=0; model->per_arch[arch][devid] != NULL; devid++)
 			{
 			{
-				HASH_DEL(archmodel->history, entry);
-				free(entry);
-			}
-			archmodel->history = NULL;
-
-			list = archmodel->list;
-			while (list)
-			{
-				free(list->entry);
-				plist = list;
-				list = list->next;
-				free(plist);
+				for(ncore=0; model->per_arch[arch][devid][ncore] != NULL; ncore++)
+				{
+					for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
+					{	
+						struct starpu_perfmodel_per_arch *archmodel = &model->per_arch[arch][devid][ncore][nimpl];
+						struct starpu_perfmodel_history_list *list, *plist;
+						struct starpu_perfmodel_history_table *entry, *tmp;
+
+						HASH_ITER(hh, archmodel->history, entry, tmp)
+						{
+							HASH_DEL(archmodel->history, entry);
+							free(entry);
+						}
+						archmodel->history = NULL;
+
+						list = archmodel->list;
+						while (list)
+						{
+							free(list->entry);
+							plist = list;
+							list = list->next;
+							free(plist);
+						}
+						archmodel->list = NULL;
+					}
+					free(model->per_arch[arch][devid][ncore]);
+				}
+				free(model->per_arch[arch][devid]);
 			}
 			}
-			archmodel->list = NULL;
+			free(model->per_arch[arch]);
 		}
 		}
 	}
 	}
 
 
@@ -865,6 +891,7 @@ void _starpu_load_history_based_model(struct starpu_perfmodel *model, unsigned s
 	if (already_loaded)
 	if (already_loaded)
 		return;
 		return;
 
 
+
 	/* The model is still not loaded so we grab the lock in write mode, and
 	/* The model is still not loaded so we grab the lock in write mode, and
 	 * if it's not loaded once we have the lock, we do load it. */
 	 * if it's not loaded once we have the lock, we do load it. */
 
 
@@ -913,6 +940,7 @@ void _starpu_load_history_based_model(struct starpu_perfmodel *model, unsigned s
 			f = fopen(path, "r");
 			f = fopen(path, "r");
 			STARPU_ASSERT(f);
 			STARPU_ASSERT(f);
 
 
+			initialize_model(model);
 			parse_model_file(f, model, scan_history);
 			parse_model_file(f, model, scan_history);
 
 
 			fclose(f);
 			fclose(f);
@@ -1019,48 +1047,42 @@ int starpu_perfmodel_unload_model(struct starpu_perfmodel *model)
 	return 0;
 	return 0;
 }
 }
 
 
-void starpu_perfmodel_get_arch_name(enum starpu_perfmodel_archtype arch, char *archname, size_t maxlen,unsigned nimpl)
+char* starpu_perfmodel_get_archtype_name(enum starpu_worker_archtype archtype)
 {
 {
-	if (arch < STARPU_CUDA_DEFAULT)
+	switch(archtype)
 	{
 	{
-		if (arch == STARPU_CPU_DEFAULT)
-		{
-			/* NB: We could just use cpu_1 as well ... */
-			snprintf(archname, maxlen, "cpu_impl_%u",nimpl);
-		}
-		else
-		{
-			/* For combined CPU workers */
-			int cpu_count = arch - STARPU_CPU_DEFAULT + 1;
-			snprintf(archname, maxlen, "cpu_%d_impl_%u", cpu_count,nimpl);
-		}
-	}
-	else if ((STARPU_CUDA_DEFAULT <= arch)
-		&& (arch < STARPU_CUDA_DEFAULT + STARPU_MAXCUDADEVS))
-	{
-		int devid = arch - STARPU_CUDA_DEFAULT;
-		snprintf(archname, maxlen, "cuda_%d_impl_%u", devid,nimpl);
-	}
-	else if ((STARPU_OPENCL_DEFAULT <= arch)
-		&& (arch < STARPU_OPENCL_DEFAULT + STARPU_MAXOPENCLDEVS))
-	{
-		int devid = arch - STARPU_OPENCL_DEFAULT;
-		snprintf(archname, maxlen, "opencl_%d_impl_%u", devid,nimpl);
-	}
-	else if ((STARPU_MIC_DEFAULT <= arch)
-		&& (arch < STARPU_MIC_DEFAULT + STARPU_MAXMICDEVS))
-	{
-		int devid = arch - STARPU_MIC_DEFAULT;
-		snprintf(archname, maxlen, "mic_%d_impl_%u", devid, nimpl);
-	}
-	else
-	{
-		STARPU_ABORT();
+		case(STARPU_CPU_WORKER):
+			return "cpu";
+			break;
+		case(STARPU_CUDA_WORKER):
+			return "cuda";
+			break;
+		case(STARPU_OPENCL_WORKER):
+			return "opencl";
+			break;
+		case(STARPU_MIC_WORKER):
+			return "mic";
+			break;
+		case(STARPU_SCC_WORKER):
+			return "scc";
+			break;
+		default:
+			STARPU_ABORT();
+			break;
 	}
 	}
 }
 }
 
 
+void starpu_perfmodel_get_arch_name(struct starpu_perfmodel_arch* arch, char *archname, size_t maxlen,unsigned nimpl)
+{
+	snprintf(archname, maxlen, "%s_%d_ncore_%d_impl_%u", 
+			starpu_perfmodel_get_archtype_name(arch->type),
+			arch->devid, 
+			arch->ncore, 
+			nimpl);
+}
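
The names embedded in debug paths and plot files change format accordingly; a quick illustration (the values are made up):

struct starpu_perfmodel_arch arch = { .type = STARPU_CUDA_WORKER, .devid = 1, .ncore = 0 };
char archname[32];
starpu_perfmodel_get_arch_name(&arch, archname, sizeof(archname), 2);
/* archname now holds "cuda_1_ncore_0_impl_2" */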
+
 void starpu_perfmodel_debugfilepath(struct starpu_perfmodel *model,
 void starpu_perfmodel_debugfilepath(struct starpu_perfmodel *model,
-				    enum starpu_perfmodel_archtype arch, char *path, size_t maxlen, unsigned nimpl)
+				    struct starpu_perfmodel_arch* arch, char *path, size_t maxlen, unsigned nimpl)
 {
 {
 	char archname[32];
 	char archname[32];
 	starpu_perfmodel_get_arch_name(arch, archname, 32, nimpl);
 	starpu_perfmodel_get_arch_name(arch, archname, 32, nimpl);
@@ -1070,13 +1092,13 @@ void starpu_perfmodel_debugfilepath(struct starpu_perfmodel *model,
 	get_model_debug_path(model, archname, path, maxlen);
 	get_model_debug_path(model, archname, path, maxlen);
 }
 }
 
 
-double _starpu_regression_based_job_expected_perf(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, struct _starpu_job *j, unsigned nimpl)
+double _starpu_regression_based_job_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, struct _starpu_job *j, unsigned nimpl)
 {
 {
 	double exp = NAN;
 	double exp = NAN;
 	size_t size = _starpu_job_get_data_size(model, arch, nimpl, j);
 	size_t size = _starpu_job_get_data_size(model, arch, nimpl, j);
 	struct starpu_perfmodel_regression_model *regmodel;
 	struct starpu_perfmodel_regression_model *regmodel;
 
 
-	regmodel = &model->per_arch[arch][nimpl].regression;
+	regmodel = &model->per_arch[arch->type][arch->devid][arch->ncore][nimpl].regression;
 
 
 	if (regmodel->valid && size >= regmodel->minx * 0.9 && size <= regmodel->maxx * 1.1)
 	if (regmodel->valid && size >= regmodel->minx * 0.9 && size <= regmodel->maxx * 1.1)
                 exp = regmodel->alpha*pow((double)size, regmodel->beta);
                 exp = regmodel->alpha*pow((double)size, regmodel->beta);
@@ -1084,20 +1106,20 @@ double _starpu_regression_based_job_expected_perf(struct starpu_perfmodel *model
 	return exp;
 	return exp;
 }
 }
 
 
-double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, struct _starpu_job *j,unsigned nimpl)
+double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, struct _starpu_job *j,unsigned nimpl)
 {
 {
 	double exp = NAN;
 	double exp = NAN;
 	size_t size = _starpu_job_get_data_size(model, arch, nimpl, j);
 	size_t size = _starpu_job_get_data_size(model, arch, nimpl, j);
 	struct starpu_perfmodel_regression_model *regmodel;
 	struct starpu_perfmodel_regression_model *regmodel;
 
 
-	regmodel = &model->per_arch[arch][nimpl].regression;
+	regmodel = &model->per_arch[arch->type][arch->devid][arch->ncore][nimpl].regression;
 
 
 	if (regmodel->nl_valid && size >= regmodel->minx * 0.9 && size <= regmodel->maxx * 1.1)
 	if (regmodel->nl_valid && size >= regmodel->minx * 0.9 && size <= regmodel->maxx * 1.1)
 		exp = regmodel->a*pow((double)size, regmodel->b) + regmodel->c;
 		exp = regmodel->a*pow((double)size, regmodel->b) + regmodel->c;
 	else
 	else
 	{
 	{
 		uint32_t key = _starpu_compute_buffers_footprint(model, arch, nimpl, j);
 		uint32_t key = _starpu_compute_buffers_footprint(model, arch, nimpl, j);
-		struct starpu_perfmodel_per_arch *per_arch_model = &model->per_arch[arch][nimpl];
+		struct starpu_perfmodel_per_arch *per_arch_model = &model->per_arch[arch->type][arch->devid][arch->ncore][nimpl];
 		struct starpu_perfmodel_history_table *history;
 		struct starpu_perfmodel_history_table *history;
 		struct starpu_perfmodel_history_table *entry;
 		struct starpu_perfmodel_history_table *entry;
 
 
@@ -1127,7 +1149,7 @@ double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfm
 	return exp;
 	return exp;
 }
 }
 
 
-double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, struct _starpu_job *j,unsigned nimpl)
+double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, struct _starpu_job *j,unsigned nimpl)
 {
 {
 	double exp;
 	double exp;
 	struct starpu_perfmodel_per_arch *per_arch_model;
 	struct starpu_perfmodel_per_arch *per_arch_model;
@@ -1136,7 +1158,7 @@ double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, e
 
 
 	uint32_t key = _starpu_compute_buffers_footprint(model, arch, nimpl, j);
 	uint32_t key = _starpu_compute_buffers_footprint(model, arch, nimpl, j);
 
 
-	per_arch_model = &model->per_arch[arch][nimpl];
+	per_arch_model = &model->per_arch[arch->type][arch->devid][arch->ncore][nimpl];
 
 
 	STARPU_PTHREAD_RWLOCK_RDLOCK(&model->model_rwlock);
 	STARPU_PTHREAD_RWLOCK_RDLOCK(&model->model_rwlock);
 	history = per_arch_model->history;
 	history = per_arch_model->history;
@@ -1171,7 +1193,7 @@ double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, e
 	return exp;
 	return exp;
 }
 }
 
 
-double starpu_permodel_history_based_expected_perf(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, uint32_t footprint)
+double starpu_permodel_history_based_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch * arch, uint32_t footprint)
 {
 {
 	struct _starpu_job j =
 	struct _starpu_job j =
 		{
 		{
@@ -1181,13 +1203,13 @@ double starpu_permodel_history_based_expected_perf(struct starpu_perfmodel *mode
 	return _starpu_history_based_job_expected_perf(model, arch, &j, j.nimpl);
 	return _starpu_history_based_job_expected_perf(model, arch, &j, j.nimpl);
 }
 }
 
 
-void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, unsigned cpuid STARPU_ATTRIBUTE_UNUSED, double measured, unsigned nimpl)
+void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, unsigned cpuid STARPU_ATTRIBUTE_UNUSED, double measured, unsigned nimpl)
 {
 {
 	if (model)
 	if (model)
 	{
 	{
 		STARPU_PTHREAD_RWLOCK_WRLOCK(&model->model_rwlock);
 		STARPU_PTHREAD_RWLOCK_WRLOCK(&model->model_rwlock);
 
 
-		struct starpu_perfmodel_per_arch *per_arch_model = &model->per_arch[arch][nimpl];
+		struct starpu_perfmodel_per_arch *per_arch_model = &model->per_arch[arch->type][arch->devid][arch->ncore][nimpl];
 
 
 		if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED)
 		if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED)
 		{
 		{
@@ -1309,7 +1331,7 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 	}
 	}
 }
 }
 
 
-void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned cpuid, unsigned nimpl, double measured)
+void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct starpu_task *task, struct starpu_perfmodel_arch * arch, unsigned cpuid, unsigned nimpl, double measured)
 {
 {
 	struct _starpu_job *job = _starpu_get_job_associated_to_task(task);
 	struct _starpu_job *job = _starpu_get_job_associated_to_task(task);
 
 
@@ -1319,3 +1341,4 @@ void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct star
 	/* and save perfmodel on termination */
 	/* and save perfmodel on termination */
 	_starpu_set_calibrate_flag(1);
 	_starpu_set_calibrate_flag(1);
 }
 }
+
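
Callers that feed measurements by hand now pass an arch descriptor instead of the old enum value; a usage sketch (the model, task and timing value are placeholders):

struct starpu_perfmodel_arch arch = { .type = STARPU_CPU_WORKER, .devid = 0, .ncore = 0 };
starpu_perfmodel_update_history(&model, task, &arch, 0 /* cpuid */, 0 /* nimpl */, 1234.5 /* measured */);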

+ 41 - 16
src/core/perfmodel/perfmodel_print.c

@@ -61,9 +61,9 @@ void _starpu_perfmodel_print_history_based(struct starpu_perfmodel_per_arch *per
 	}
 	}
 }
 }
 
 
-void starpu_perfmodel_print(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, unsigned nimpl, char *parameter, uint32_t *footprint, FILE *output)
+void starpu_perfmodel_print(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, unsigned nimpl, char *parameter, uint32_t *footprint, FILE *output)
 {
 {
-	struct starpu_perfmodel_per_arch *arch_model = &model->per_arch[arch][nimpl];
+	struct starpu_perfmodel_per_arch *arch_model = &model->per_arch[arch->type][arch->devid][arch->ncore][nimpl];
 	char archname[32];
 	char archname[32];
 
 
 	if (arch_model->regression.nsample || arch_model->regression.valid || arch_model->regression.nl_valid || arch_model->list)
 	if (arch_model->regression.nsample || arch_model->regression.valid || arch_model->regression.nl_valid || arch_model->list)
@@ -171,13 +171,22 @@ int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char
 	if (arch == NULL)
 	if (arch == NULL)
 	{
 	{
 		/* display all architectures */
 		/* display all architectures */
-		unsigned archid;
-		unsigned implid;
-		for (archid = 0; archid < STARPU_NARCH_VARIATIONS; archid++)
+		unsigned archtype, devid, ncore, implid;
+		struct starpu_perfmodel_arch perf_arch;
+		for (archtype = 0; archtype < STARPU_NARCH; archtype++)
 		{
 		{
-			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
-			{ /* Display all codelets on each arch */
-				starpu_perfmodel_print(model, (enum starpu_perfmodel_archtype) archid, implid, parameter, footprint, output);
+			perf_arch.type = archtype;
+			for(devid = 0; model->per_arch[archtype][devid] != NULL; devid++)
+			{
+				perf_arch.devid = devid;
+				for(ncore = 0; model->per_arch[archtype][devid][ncore] != NULL; ncore++)
+				{
+					perf_arch.ncore = ncore;
+					for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
+					{ /* Display all codelets on each arch */
+						starpu_perfmodel_print(model, &perf_arch, implid, parameter, footprint, output);
+					}
+				}
 			}
 			}
 		}
 		}
 	}
 	}
@@ -186,8 +195,12 @@ int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char
 		if (strcmp(arch, "cpu") == 0)
 		if (strcmp(arch, "cpu") == 0)
 		{
 		{
 			unsigned implid;
 			unsigned implid;
+			struct starpu_perfmodel_arch perf_arch;
+			perf_arch.type = STARPU_CPU_WORKER;
+			perf_arch.devid = 0;
+			perf_arch.ncore = 0;
 			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
 			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
-				starpu_perfmodel_print(model, STARPU_CPU_DEFAULT,implid, parameter, footprint, output); /* Display all codelets on cpu */
+				starpu_perfmodel_print(model, &perf_arch,implid, parameter, footprint, output); /* Display all codelets on cpu */
 			return 0;
 			return 0;
 		}
 		}
 
 
@@ -202,23 +215,32 @@ int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char
 			}
 			}
 
 
 			unsigned implid;
 			unsigned implid;
+			struct starpu_perfmodel_arch perf_arch;
+			perf_arch.type = STARPU_CPU_WORKER;
+			perf_arch.devid = 0;
+			perf_arch.ncore = k-1;
 			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
 			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
-				starpu_perfmodel_print(model, (enum starpu_perfmodel_archtype) (STARPU_CPU_DEFAULT + k - 1), implid, parameter, footprint, output);
+				starpu_perfmodel_print(model, &perf_arch, implid, parameter, footprint, output);
 			return 0;
 			return 0;
 		}
 		}
 
 
 		if (strcmp(arch, "cuda") == 0)
 		if (strcmp(arch, "cuda") == 0)
 		{
 		{
-			unsigned archid;
+			unsigned devid;
 			unsigned implid;
 			unsigned implid;
-			for (archid = STARPU_CUDA_DEFAULT; archid < STARPU_CUDA_DEFAULT + STARPU_MAXCUDADEVS; archid++)
+			struct starpu_perfmodel_arch perf_arch;
+			perf_arch.type = STARPU_CUDA_WORKER;
+			perf_arch.ncore = 0;
+
+			for (devid = 0; model->per_arch[STARPU_CUDA_WORKER][devid] != NULL; devid++)
 			{
 			{
+				perf_arch.devid = devid;
 				for (implid = 0; implid <STARPU_MAXIMPLEMENTATIONS; implid ++)
 				for (implid = 0; implid <STARPU_MAXIMPLEMENTATIONS; implid ++)
 				{
 				{
 					char archname[32];
 					char archname[32];
-					starpu_perfmodel_get_arch_name((enum starpu_perfmodel_archtype) archid, archname, 32, implid);
+					starpu_perfmodel_get_arch_name(&perf_arch, archname, 32, implid);
 					fprintf(output, "performance model for %s\n", archname);
 					fprintf(output, "performance model for %s\n", archname);
-					starpu_perfmodel_print(model, (enum starpu_perfmodel_archtype) archid, implid, parameter, footprint, output);
+					starpu_perfmodel_print(model, &perf_arch, implid, parameter, footprint, output);
 				}
 				}
 			}
 			}
 			return 0;
 			return 0;
@@ -230,10 +252,13 @@ int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char
 		nmatched = sscanf(arch, "cuda_%d", &gpuid);
 		nmatched = sscanf(arch, "cuda_%d", &gpuid);
 		if (nmatched == 1)
 		if (nmatched == 1)
 		{
 		{
-			int archid = STARPU_CUDA_DEFAULT+ gpuid;
+			struct starpu_perfmodel_arch perf_arch;
+			perf_arch.type = STARPU_CUDA_WORKER;
+			perf_arch.devid = gpuid;
+			perf_arch.ncore = 0;
 			unsigned implid;
 			unsigned implid;
 			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
 			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
-				starpu_perfmodel_print(model, (enum starpu_perfmodel_archtype) archid, implid, parameter, footprint, output);
+				starpu_perfmodel_print(model, &perf_arch, implid, parameter, footprint, output);
 			return 0;
 			return 0;
 		}
 		}
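
The arch strings accepted here ("cpu", "cpu_4", "cuda", "cuda_0", ...) are unchanged; they are now converted to a struct starpu_perfmodel_arch before printing. A usage sketch, assuming NULL is accepted for parameter and footprint to dump every entry and that the model was loaded beforehand with starpu_perfmodel_load_symbol (the symbol name is a placeholder):

struct starpu_perfmodel model;
starpu_perfmodel_load_symbol("my_codelet", &model);
starpu_perfmodel_print_all(&model, "cuda_0", NULL, NULL, stdout);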
 
 

+ 2 - 14
src/core/sched_ctx.c

@@ -26,7 +26,6 @@ static starpu_pthread_mutex_t finished_submit_mutex = STARPU_PTHREAD_MUTEX_INITI
 struct starpu_task stop_submission_task = STARPU_TASK_INITIALIZER;
 struct starpu_task stop_submission_task = STARPU_TASK_INITIALIZER;
 starpu_pthread_key_t sched_ctx_key;
 starpu_pthread_key_t sched_ctx_key;
 unsigned with_hypervisor = 0;
 unsigned with_hypervisor = 0;
-double max_time_worker_on_ctx = -1.0;
 
 
 static unsigned _starpu_get_first_free_sched_ctx(struct _starpu_machine_config *config);
 static unsigned _starpu_get_first_free_sched_ctx(struct _starpu_machine_config *config);
 
 
@@ -509,10 +508,10 @@ unsigned starpu_sched_ctx_create_with_custom_policy(struct starpu_sched_policy *
 }
 }
 
 
 #ifdef STARPU_USE_SC_HYPERVISOR
 #ifdef STARPU_USE_SC_HYPERVISOR
-void starpu_sched_ctx_set_perf_counters(unsigned sched_ctx_id, struct starpu_sched_ctx_performance_counters *perf_counters)
+void starpu_sched_ctx_set_perf_counters(unsigned sched_ctx_id, void* perf_counters)
 {
 {
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
-	sched_ctx->perf_counters = perf_counters;
+	sched_ctx->perf_counters = (struct starpu_sched_ctx_performance_counters *)perf_counters;
 	return;
 	return;
 }
 }
 #endif
 #endif
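
With the prototype loosened to void*, the hypervisor can hand over its counters without sched_ctx.c needing the full type at that point; a sketch (the counters instance and its callbacks are placeholders):

static struct starpu_sched_ctx_performance_counters counters;
/* ... fill in the notify_* callbacks used by the hypervisor ... */
starpu_sched_ctx_set_perf_counters(sched_ctx_id, (void *)&counters);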
@@ -718,10 +717,6 @@ void _starpu_init_all_sched_ctxs(struct _starpu_machine_config *config)
 	for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
 	for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
 		config->sched_ctxs[i].id = STARPU_NMAX_SCHED_CTXS;
 		config->sched_ctxs[i].id = STARPU_NMAX_SCHED_CTXS;
 
 
-	char* max_time_on_ctx = getenv("STARPU_MAX_TIME_ON_CTX");
-	if (max_time_on_ctx != NULL)
-		max_time_worker_on_ctx = atof(max_time_on_ctx);
-
 	return;
 	return;
 }
 }
 
 
@@ -1022,8 +1017,6 @@ unsigned starpu_sched_ctx_overlapping_ctxs_on_worker(int workerid)
 
 
 unsigned starpu_sched_ctx_is_ctxs_turn(int workerid, unsigned sched_ctx_id)
 unsigned starpu_sched_ctx_is_ctxs_turn(int workerid, unsigned sched_ctx_id)
 {
 {
-	if(max_time_worker_on_ctx == -1.0) return 1;
-
 	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
 	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
 	return worker->active_ctx == sched_ctx_id;
 	return worker->active_ctx == sched_ctx_id;
 }
 }
@@ -1053,11 +1046,6 @@ void starpu_sched_ctx_set_turn_to_other_ctx(int workerid, unsigned sched_ctx_id)
 	}
 	}
 }
 }
 
 
-double starpu_sched_ctx_get_max_time_worker_on_ctx(void)
-{
-	return max_time_worker_on_ctx;
-}
-
 void starpu_sched_ctx_set_inheritor(unsigned sched_ctx_id, unsigned inheritor)
 void starpu_sched_ctx_set_inheritor(unsigned sched_ctx_id, unsigned inheritor)
 {
 {
 	STARPU_ASSERT(inheritor < STARPU_NMAX_SCHED_CTXS);
 	STARPU_ASSERT(inheritor < STARPU_NMAX_SCHED_CTXS);

+ 1 - 0
src/core/sched_ctx.h

@@ -19,6 +19,7 @@
 
 
 #include <starpu.h>
 #include <starpu.h>
 #include <starpu_sched_ctx.h>
 #include <starpu_sched_ctx.h>
+#include <starpu_sched_ctx_hypervisor.h>
 #include <starpu_scheduler.h>
 #include <starpu_scheduler.h>
 #include <common/config.h>
 #include <common/config.h>
 #include <common/barrier_counter.h>
 #include <common/barrier_counter.h>

+ 10 - 5
src/core/topology.c

@@ -633,8 +633,10 @@ _starpu_init_mic_config (struct _starpu_machine_config *config,
 	for (miccore_id = 0; miccore_id < topology->nmiccores[mic_idx]; miccore_id++)
 	for (miccore_id = 0; miccore_id < topology->nmiccores[mic_idx]; miccore_id++)
 	{
 	{
 		int worker_idx = topology->nworkers + miccore_id;
 		int worker_idx = topology->nworkers + miccore_id;
-		enum starpu_perfmodel_archtype arch =
-			(enum starpu_perfmodel_archtype)((int)STARPU_MIC_DEFAULT + mic_idx);
+		struct starpu_perfmodel_arch arch;
+		arch.type = STARPU_MIC_WORKER;
+		arch.devid = mic_idx;
+		arch.ncore = 0; 
 		config->workers[worker_idx].arch = STARPU_MIC_WORKER;
 		config->workers[worker_idx].arch = STARPU_MIC_WORKER;
 		config->workers[worker_idx].perf_arch = arch;
 		config->workers[worker_idx].perf_arch = arch;
 		config->workers[worker_idx].mp_nodeid = mic_idx;
 		config->workers[worker_idx].mp_nodeid = mic_idx;
@@ -971,7 +973,9 @@ _starpu_init_machine_config (struct _starpu_machine_config *config, int no_mp_co
 	{
 	{
 		int worker_idx = topology->nworkers + cpu;
 		int worker_idx = topology->nworkers + cpu;
 		config->workers[worker_idx].arch = STARPU_CPU_WORKER;
 		config->workers[worker_idx].arch = STARPU_CPU_WORKER;
-		config->workers[worker_idx].perf_arch = STARPU_CPU_DEFAULT;
+		config->workers[worker_idx].perf_arch.type = STARPU_CPU_WORKER;
+		config->workers[worker_idx].perf_arch.devid = 0;
+		config->workers[worker_idx].perf_arch.ncore = 0;
 		config->workers[worker_idx].mp_nodeid = -1;
 		config->workers[worker_idx].mp_nodeid = -1;
 		config->workers[worker_idx].devid = cpu;
 		config->workers[worker_idx].devid = cpu;
 		config->workers[worker_idx].worker_mask = STARPU_CPU;
 		config->workers[worker_idx].worker_mask = STARPU_CPU;
@@ -1095,9 +1099,10 @@ _starpu_bind_thread_on_cpus (
 	}
 	}
 #else
 #else
 #ifdef __GLIBC__
 #ifdef __GLIBC__
-	pthread_setaffinity_np(pthread_self(),sizeof(cpu_set_t),&combined_worker->cpu_set);
+	sched_setaffinity(0,sizeof(combined_worker->cpu_set),&combined_worker->cpu_set);
+#else
+#  warning no parallel worker CPU binding support
 #endif
 #endif
-#warning no parallel worker CPU binding support
 #endif
 #endif
 }
 }
 
 

+ 3 - 3
src/core/workers.h

@@ -58,7 +58,7 @@ struct _starpu_worker
         starpu_pthread_mutex_t mutex;
         starpu_pthread_mutex_t mutex;
 	enum starpu_worker_archtype arch; /* what is the type of worker ? */
 	enum starpu_worker_archtype arch; /* what is the type of worker ? */
 	uint32_t worker_mask; /* what is the type of worker ? */
 	uint32_t worker_mask; /* what is the type of worker ? */
-	enum starpu_perfmodel_archtype perf_arch; /* in case there are different models of the same arch */
+	struct starpu_perfmodel_arch perf_arch; /* in case there are different models of the same arch */
 	starpu_pthread_t worker_thread; /* the thread which runs the worker */
 	starpu_pthread_t worker_thread; /* the thread which runs the worker */
 	int mp_nodeid; /* which mp node hold the cpu/gpu/etc (-1 for this
 	int mp_nodeid; /* which mp node hold the cpu/gpu/etc (-1 for this
 			* node) */
 			* node) */
@@ -118,12 +118,12 @@ struct _starpu_worker
 
 
 struct _starpu_combined_worker
 struct _starpu_combined_worker
 {
 {
-	enum starpu_perfmodel_archtype perf_arch; /* in case there are different models of the same arch */
+	struct starpu_perfmodel_arch perf_arch; /* in case there are different models of the same arch */
 	uint32_t worker_mask; /* what is the type of workers ? */
 	uint32_t worker_mask; /* what is the type of workers ? */
 	int worker_size;
 	int worker_size;
 	unsigned memory_node; /* which memory node is associated that worker to ? */
 	unsigned memory_node; /* which memory node is associated that worker to ? */
 	int combined_workerid[STARPU_NMAXWORKERS];
 	int combined_workerid[STARPU_NMAXWORKERS];
-#ifdef STARPU_USE_MIC 
+#ifdef STARPU_USE_MP
 	int count;
 	int count;
 	pthread_mutex_t count_mutex;
 	pthread_mutex_t count_mutex;
 #endif
 #endif

+ 10 - 0
src/datawizard/copy_driver.h

@@ -37,6 +37,11 @@
 #include <starpu_opencl.h>
 #include <starpu_opencl.h>
 #endif
 #endif
 
 
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
 struct _starpu_data_request;
 struct _starpu_data_request;
 struct _starpu_data_replicate;
 struct _starpu_data_replicate;
 
 
@@ -101,4 +106,9 @@ int _starpu_driver_copy_data_1_to_1(starpu_data_handle_t handle,
 
 
 unsigned _starpu_driver_test_request_completion(struct _starpu_async_channel *async_channel);
 unsigned _starpu_driver_test_request_completion(struct _starpu_async_channel *async_channel);
 void _starpu_driver_wait_request_completion(struct _starpu_async_channel *async_channel);
 void _starpu_driver_wait_request_completion(struct _starpu_async_channel *async_channel);
+
+#ifdef __cplusplus
+}
+#endif
+
 #endif // __COPY_DRIVER_H__
 #endif // __COPY_DRIVER_H__

+ 8 - 4
src/datawizard/footprint.c

@@ -19,7 +19,7 @@
 #include <starpu_hash.h>
 #include <starpu_hash.h>
 #include <core/task.h>
 #include <core/task.h>
 
 
-uint32_t _starpu_compute_buffers_footprint(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, unsigned nimpl, struct _starpu_job *j)
+uint32_t _starpu_compute_buffers_footprint(struct starpu_perfmodel *model, struct starpu_perfmodel_arch * arch, unsigned nimpl, struct _starpu_job *j)
 {
 {
 	if (j->footprint_is_computed)
 	if (j->footprint_is_computed)
 		return j->footprint;
 		return j->footprint;
@@ -29,9 +29,13 @@ uint32_t _starpu_compute_buffers_footprint(struct starpu_perfmodel *model, enum
 
 
 	struct starpu_task *task = j->task;
 	struct starpu_task *task = j->task;
 
 
-	if (model && model->per_arch[arch][nimpl].size_base)
+	if (model != NULL && 
+			model->per_arch[arch->type] != NULL &&
+			model->per_arch[arch->type][arch->devid] != NULL &&
+			model->per_arch[arch->type][arch->devid][arch->ncore] != NULL &&
+			model->per_arch[arch->type][arch->devid][arch->ncore][nimpl].size_base)
 	{
 	{
-		size_t size = model->per_arch[arch][nimpl].size_base(task, arch, nimpl);
+		size_t size = model->per_arch[arch->type][arch->devid][arch->ncore][nimpl].size_base(task, arch, nimpl);
 		footprint = starpu_hash_crc32c_be_n(&size, sizeof(size), footprint);
 		footprint = starpu_hash_crc32c_be_n(&size, sizeof(size), footprint);
 	}
 	}
 	else if (model && model->size_base)
 	else if (model && model->size_base)
@@ -68,7 +72,7 @@ uint32_t _starpu_compute_data_footprint(starpu_data_handle_t handle)
 	return starpu_hash_crc32c_be(handle_footprint, interfaceid);
 	return starpu_hash_crc32c_be(handle_footprint, interfaceid);
 }
 }
 
 
-uint32_t starpu_task_footprint(struct starpu_perfmodel *model, struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+uint32_t starpu_task_footprint(struct starpu_perfmodel *model, struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
 {
 {
 	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
 	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
 	return _starpu_compute_buffers_footprint(model, arch, nimpl, j);
 	return _starpu_compute_buffers_footprint(model, arch, nimpl, j);

+ 1 - 1
src/datawizard/footprint.h

@@ -24,7 +24,7 @@
 
 
 /* Compute the footprint that characterizes the job and cache it into the job
 /* Compute the footprint that characterizes the job and cache it into the job
  * structure. */
  * structure. */
-uint32_t _starpu_compute_buffers_footprint(struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, unsigned nimpl, struct _starpu_job *j);
+uint32_t _starpu_compute_buffers_footprint(struct starpu_perfmodel *model, struct starpu_perfmodel_arch * arch, unsigned nimpl, struct _starpu_job *j);
 
 
 /* Compute the footprint that characterizes the layout of the data handle. */
 /* Compute the footprint that characterizes the layout of the data handle. */
 uint32_t _starpu_compute_data_footprint(starpu_data_handle_t handle);
 uint32_t _starpu_compute_data_footprint(starpu_data_handle_t handle);

+ 9 - 0
src/datawizard/memory_manager.h

@@ -19,6 +19,11 @@
 
 
 #include <starpu.h>
 #include <starpu.h>
 
 
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
 /**
 /**
  * Initialises the memory manager
  * Initialises the memory manager
  */
  */
@@ -56,4 +61,8 @@ void _starpu_memory_manager_deallocate_size(size_t size, unsigned node);
 
 
 int _starpu_memory_manager_test_allocate_size_(size_t size, unsigned node);
 int _starpu_memory_manager_test_allocate_size_(size_t size, unsigned node);
 
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif /* __MEMORY_MANAGER_H__ */
 #endif /* __MEMORY_MANAGER_H__ */

+ 4 - 4
src/drivers/cpu/driver_cpu.c

@@ -111,7 +111,7 @@ _starpu_cpu_discover_devices(struct _starpu_machine_config *config)
  * Handle binding CPUs on cores.
  * Handle binding CPUs on cores.
  * In the case of a combined worker WORKER_TASK != J->TASK */
  * In the case of a combined worker WORKER_TASK != J->TASK */
 
 
-static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_task, struct _starpu_worker *cpu_args, int rank, enum starpu_perfmodel_archtype perf_arch)
+static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_task, struct _starpu_worker *cpu_args, int rank, struct starpu_perfmodel_arch* perf_arch)
 {
 {
 	int ret;
 	int ret;
 	int is_parallel_task = (j->task_size > 1);
 	int is_parallel_task = (j->task_size > 1);
@@ -292,7 +292,7 @@ int _starpu_cpu_driver_run_once(struct starpu_driver *d STARPU_ATTRIBUTE_UNUSED)
 	int rank = 0;
 	int rank = 0;
 	int is_parallel_task = (j->task_size > 1);
 	int is_parallel_task = (j->task_size > 1);
 
 
-	enum starpu_perfmodel_archtype perf_arch;
+	struct starpu_perfmodel_arch* perf_arch;
 
 
 	/* Get the rank in case it is a parallel task */
 	/* Get the rank in case it is a parallel task */
 	if (is_parallel_task)
 	if (is_parallel_task)
@@ -307,14 +307,14 @@ int _starpu_cpu_driver_run_once(struct starpu_driver *d STARPU_ATTRIBUTE_UNUSED)
 		cpu_worker->combined_workerid = j->combined_workerid;
 		cpu_worker->combined_workerid = j->combined_workerid;
 		cpu_worker->worker_size = combined_worker->worker_size;
 		cpu_worker->worker_size = combined_worker->worker_size;
 		cpu_worker->current_rank = rank;
 		cpu_worker->current_rank = rank;
-		perf_arch = combined_worker->perf_arch;
+		perf_arch = &combined_worker->perf_arch;
 	}
 	}
 	else
 	else
 	{
 	{
 		cpu_worker->combined_workerid = cpu_worker->workerid;
 		cpu_worker->combined_workerid = cpu_worker->workerid;
 		cpu_worker->worker_size = 1;
 		cpu_worker->worker_size = 1;
 		cpu_worker->current_rank = 0;
 		cpu_worker->current_rank = 0;
-		perf_arch = cpu_worker->perf_arch;
+		perf_arch = &cpu_worker->perf_arch;
 	}
 	}
 
 
 	_starpu_set_current_task(j->task);
 	_starpu_set_current_task(j->task);

+ 3 - 3
src/drivers/driver_common/driver_common.c

@@ -71,7 +71,7 @@ void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j
 	_STARPU_TRACE_START_CODELET_BODY(j);
 	_STARPU_TRACE_START_CODELET_BODY(j);
 }
 }
 
 
-void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j, enum starpu_perfmodel_archtype perf_arch STARPU_ATTRIBUTE_UNUSED, struct timespec *codelet_end, int rank, int profiling)
+void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch STARPU_ATTRIBUTE_UNUSED, struct timespec *codelet_end, int rank, int profiling)
 {
 {
 	struct starpu_task *task = j->task;
 	struct starpu_task *task = j->task;
 	struct starpu_codelet *cl = task->cl;
 	struct starpu_codelet *cl = task->cl;
@@ -100,7 +100,7 @@ void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j,
 	args->status = STATUS_UNKNOWN;
 	args->status = STATUS_UNKNOWN;
 }
 }
 void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_worker *worker_args,
 void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_worker *worker_args,
-					enum starpu_perfmodel_archtype perf_arch,
+					struct starpu_perfmodel_arch* perf_arch,
 					struct timespec *codelet_start, struct timespec *codelet_end, int profiling)
 					struct timespec *codelet_start, struct timespec *codelet_end, int profiling)
 {
 {
 	struct starpu_profiling_task_info *profiling_info = j->task->profiling_info;
 	struct starpu_profiling_task_info *profiling_info = j->task->profiling_info;
@@ -146,7 +146,7 @@ void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_wo
 
 
 	if (profiling_info && profiling_info->power_consumed && cl->power_model && cl->power_model->benchmarking)
 	if (profiling_info && profiling_info->power_consumed && cl->power_model && cl->power_model->benchmarking)
 	{
 	{
-		_starpu_update_perfmodel_history(j, j->task->cl->power_model,  perf_arch, worker_args->devid, profiling_info->power_consumed,j->nimpl);
+		_starpu_update_perfmodel_history(j, j->task->cl->power_model, perf_arch, worker_args->devid, profiling_info->power_consumed,j->nimpl);
 	}
 	}
 }
 }
 
 

+ 2 - 2
src/drivers/driver_common/driver_common.h

@@ -25,10 +25,10 @@
 
 
 void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j,
 void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j,
 			      struct timespec *codelet_start, int rank, int profiling);
 			      struct timespec *codelet_start, int rank, int profiling);
-void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j, enum starpu_perfmodel_archtype perf_arch,
+void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch,
 			    struct timespec *codelet_end, int rank, int profiling);
 			    struct timespec *codelet_end, int rank, int profiling);
 void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_worker *worker_args,
 void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_worker *worker_args,
-					enum starpu_perfmodel_archtype perf_arch,
+					struct starpu_perfmodel_arch* perf_arch,
 					struct timespec *codelet_start, struct timespec *codelet_end, int profiling);
 					struct timespec *codelet_start, struct timespec *codelet_end, int profiling);
 
 
 struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int workerid, unsigned memnode);
 struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int workerid, unsigned memnode);

+ 6 - 5
src/drivers/mic/driver_mic_common.h

@@ -27,16 +27,17 @@
 #define STARPU_TO_MIC_ID(id) ((id) + 1)
 #define STARPU_TO_MIC_ID(id) ((id) + 1)
 
 
 /* TODO: rather allocate ports on the host and pass them as parameters to the device process */
 /* TODO: rather allocate ports on the host and pass them as parameters to the device process */
-#define STARPU_MIC_PORTS_BEGIN 1099
+// We use the last SCIF reserved port and add 1000 to be safe
+#define STARPU_MIC_PORTS_BEGIN SCIF_PORT_RSVD+1000
 
 
 #define STARPU_MIC_SOURCE_PORT_NUMBER STARPU_MIC_PORTS_BEGIN
 #define STARPU_MIC_SOURCE_PORT_NUMBER STARPU_MIC_PORTS_BEGIN
-#define STARPU_MIC_SINK_PORT_NUMBER(id) ((id) + STARPU_MIC_PORTS_BEGIN + 1)
+#define STARPU_MIC_SINK_PORT_NUMBER(id) ((id) + STARPU_MIC_PORTS_BEGIN)
 
 
-#define STARPU_MIC_SOURCE_DT_PORT_NUMBER (STARPU_MAXMICDEVS + STARPU_MIC_PORTS_BEGIN + 1)
-#define STARPU_MIC_SINK_DT_PORT_NUMBER(id) ((id) + STARPU_MAXMICDEVS + STARPU_MIC_PORTS_BEGIN + 2)
+#define STARPU_MIC_SOURCE_DT_PORT_NUMBER (STARPU_MAXMICDEVS + STARPU_MIC_PORTS_BEGIN)
+#define STARPU_MIC_SINK_DT_PORT_NUMBER(id) ((id) + STARPU_MAXMICDEVS + STARPU_MIC_PORTS_BEGIN + 1)
 
 
 #define STARPU_MIC_SINK_SINK_DT_PORT_NUMBER(me, peer_id) \
 #define STARPU_MIC_SINK_SINK_DT_PORT_NUMBER(me, peer_id) \
-((me) * STARPU_MAXMICDEVS + (peer_id) +  2 * STARPU_MAXMICDEVS + STARPU_MIC_PORTS_BEGIN + 2)
+((me) * STARPU_MAXMICDEVS + (peer_id) +  2 * STARPU_MAXMICDEVS + STARPU_MIC_PORTS_BEGIN + 1)
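
A quick sanity check of the new port layout (not part of the patch; SCIF_PORT_RSVD comes from the SCIF headers and its value is platform-defined, so these would live inside a test function):

#include <assert.h>
assert(STARPU_MIC_SOURCE_PORT_NUMBER    == SCIF_PORT_RSVD + 1000);
assert(STARPU_MIC_SINK_PORT_NUMBER(3)   == SCIF_PORT_RSVD + 1000 + 3);
assert(STARPU_MIC_SOURCE_DT_PORT_NUMBER == SCIF_PORT_RSVD + 1000 + STARPU_MAXMICDEVS);
assert(STARPU_MIC_SINK_DT_PORT_NUMBER(0) == SCIF_PORT_RSVD + 1000 + STARPU_MAXMICDEVS + 1);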
 
 
 #define STARPU_MIC_PAGE_SIZE 0x1000
 #define STARPU_MIC_PAGE_SIZE 0x1000
 #define STARPU_MIC_GET_PAGE_SIZE_MULTIPLE(size) \
 #define STARPU_MIC_GET_PAGE_SIZE_MULTIPLE(size) \

+ 0 - 5
src/drivers/mp_common/sink_common.c

@@ -249,8 +249,6 @@ void _starpu_sink_common_worker(void)
 	starpu_pthread_key_t worker_key;
 	starpu_pthread_key_t worker_key;
 	STARPU_PTHREAD_KEY_CREATE(&worker_key, NULL);
 	STARPU_PTHREAD_KEY_CREATE(&worker_key, NULL);
 
 
-
-	struct _starpu_machine_config *config;
 	while (!exit_starpu)
 	while (!exit_starpu)
 	{
 	{
 		/* If we have received a message */
 		/* If we have received a message */
@@ -264,7 +262,6 @@ void _starpu_sink_common_worker(void)
 					exit_starpu = 1;
 					exit_starpu = 1;
 					break;
 					break;
 				case STARPU_EXECUTE:
 				case STARPU_EXECUTE:
-					config = _starpu_get_machine_config();
 					node->execute(node, arg, arg_size);
 					node->execute(node, arg, arg_size);
 					break;
 					break;
 				case STARPU_SINK_NBCORES:
 				case STARPU_SINK_NBCORES:
@@ -314,7 +311,6 @@ void _starpu_sink_common_worker(void)
 			struct mp_message * message = mp_message_list_pop_back(node->message_queue);
 			struct mp_message * message = mp_message_list_pop_back(node->message_queue);
 			STARPU_PTHREAD_MUTEX_UNLOCK(&node->message_queue_mutex);
 			STARPU_PTHREAD_MUTEX_UNLOCK(&node->message_queue_mutex);
 			//_STARPU_DEBUG("telling host that we have finished the task %p sur %d.\n", task->kernel, task->coreid);
 			//_STARPU_DEBUG("telling host that we have finished the task %p sur %d.\n", task->kernel, task->coreid);
-			config = _starpu_get_machine_config();
 			_starpu_mp_common_send_command(node, message->type, 
 			_starpu_mp_common_send_command(node, message->type, 
 					&message->buffer, message->size);
 					&message->buffer, message->size);
 			mp_message_delete(message);
 			mp_message_delete(message);
@@ -378,7 +374,6 @@ static void _starpu_sink_common_erase_barrier(struct _starpu_mp_node * node, str
  */
  */
 static void _starpu_sink_common_append_message(struct _starpu_mp_node *node, struct mp_message * message)
 static void _starpu_sink_common_append_message(struct _starpu_mp_node *node, struct mp_message * message)
 {
 {
-	struct _starpu_machine_config *config = _starpu_get_machine_config();
 	STARPU_PTHREAD_MUTEX_LOCK(&node->message_queue_mutex);
 	STARPU_PTHREAD_MUTEX_LOCK(&node->message_queue_mutex);
 	mp_message_list_push_front(node->message_queue,message);
 	mp_message_list_push_front(node->message_queue,message);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&node->message_queue_mutex);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&node->message_queue_mutex);

+ 2 - 2
src/drivers/mp_common/source_common.c

@@ -36,7 +36,7 @@ static int _starpu_src_common_finalize_job (struct _starpu_job *j, struct _starp
 	uint32_t mask = 0;
 	uint32_t mask = 0;
 	int profiling = starpu_profiling_status_get();
 	int profiling = starpu_profiling_status_get();
 	struct timespec codelet_end;
 	struct timespec codelet_end;
-	_starpu_driver_end_job(worker, j, worker->perf_arch, &codelet_end, 0,
+	_starpu_driver_end_job(worker, j, &worker->perf_arch, &codelet_end, 0,
 			profiling);
 			profiling);
 	
 	
 	int count = worker->current_rank;
 	int count = worker->current_rank;
@@ -57,7 +57,7 @@ static int _starpu_src_common_finalize_job (struct _starpu_job *j, struct _starp
 	if(count == 0)
 	if(count == 0)
 	{
 	{
 
 
-		_starpu_driver_update_job_feedback(j, worker, worker->perf_arch,
+		_starpu_driver_update_job_feedback(j, worker, &worker->perf_arch,
 				&j->cl_start, &codelet_end,
 				&j->cl_start, &codelet_end,
 				profiling);
 				profiling);
 
 

+ 41 - 15
src/profiling/bound.c

@@ -25,6 +25,7 @@
 #include <starpu_config.h>
 #include <starpu_config.h>
 #include <profiling/bound.h>
 #include <profiling/bound.h>
 #include <core/jobs.h>
 #include <core/jobs.h>
+#include <core/workers.h>
 
 
 #ifdef STARPU_HAVE_GLPK_H
 #ifdef STARPU_HAVE_GLPK_H
 #include <glpk.h>
 #include <glpk.h>
@@ -100,7 +101,7 @@ struct bound_task
 	int depsn;
 	int depsn;
 
 
 	/* Estimated duration */
 	/* Estimated duration */
-	double duration[STARPU_NARCH_VARIATIONS];
+	double** duration[STARPU_NARCH];
 
 
 	/* Other tasks */
 	/* Other tasks */
 	struct bound_task *next;
 	struct bound_task *next;
@@ -186,7 +187,31 @@ static int good_job(struct _starpu_job *j)
 		return 0;
 		return 0;
 	return 1;
 	return 1;
 }
 }
+static double** initialize_arch_duration(int maxdevid, unsigned* maxncore_table)
+{
+	int devid, maxncore;
+	double ** arch_model = malloc(sizeof(*arch_model)*(maxdevid+1));
+	arch_model[maxdevid] = NULL;
+	for(devid=0; devid<maxdevid; devid++)
+	{
+		if(maxncore_table != NULL)
+			maxncore = maxncore_table[devid];
+		else
+			maxncore = 1;
+		/* zero-initialise so the _STARPU_IS_ZERO() checks below see unset durations */
+		arch_model[devid] = calloc(maxncore+1, sizeof(*arch_model[devid]));
+	}
+	return arch_model;
+}
 
 
+static void initialize_duration(struct bound_task *task)
+{
+	struct _starpu_machine_config *conf = _starpu_get_machine_config();
+	task->duration[STARPU_CPU_WORKER] = initialize_arch_duration(1,&conf->topology.ncpus); 
+	task->duration[STARPU_CUDA_WORKER] = initialize_arch_duration(conf->topology.ncudagpus,NULL); 
+	task->duration[STARPU_OPENCL_WORKER] = initialize_arch_duration(conf->topology.nopenclgpus,NULL); 
+	task->duration[STARPU_MIC_WORKER] = initialize_arch_duration(conf->topology.nmicdevices,conf->topology.nmiccores); 
+	task->duration[STARPU_SCC_WORKER] = initialize_arch_duration(conf->topology.nsccdevices,NULL); 
+}
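
With durations now stored per (archtype, devid, ncore), the lookups further down in starpu_bound_print_lp go through the worker's perf arch; condensed from the hunks below:

struct starpu_perfmodel_arch *arch = starpu_worker_get_perf_archtype(w);
if (!isnan(t1->duration[arch->type][arch->devid][arch->ncore]))
	/* use the estimate recorded for worker w */ ;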
 /* Create a new task (either because it has just been submitted, or a
 /* Create a new task (either because it has just been submitted, or a
  * dependency was added before submission) */
  * dependency was added before submission) */
 static void new_task(struct _starpu_job *j)
 static void new_task(struct _starpu_job *j)
@@ -202,10 +227,11 @@ static void new_task(struct _starpu_job *j)
 	t->tag_id = j->task->tag_id;
 	t->tag_id = j->task->tag_id;
 	t->use_tag = j->task->use_tag;
 	t->use_tag = j->task->use_tag;
 	t->cl = j->task->cl;
 	t->cl = j->task->cl;
-	t->footprint = _starpu_compute_buffers_footprint(NULL, STARPU_CPU_DEFAULT, 0, j);
+	t->footprint = _starpu_compute_buffers_footprint(NULL, STARPU_CPU_WORKER, 0, j);
 	t->priority = j->task->priority;
 	t->priority = j->task->priority;
 	t->deps = NULL;
 	t->deps = NULL;
 	t->depsn = 0;
 	t->depsn = 0;
+	initialize_duration(t);
 	t->next = tasks;
 	t->next = tasks;
 	j->bound_task = t;
 	j->bound_task = t;
 	tasks = t;
 	tasks = t;
@@ -236,7 +262,7 @@ void _starpu_bound_record(struct _starpu_job *j)
 	{
 	{
 		struct bound_task_pool *tp;
 		struct bound_task_pool *tp;
 
 
-		_starpu_compute_buffers_footprint(NULL, STARPU_CPU_DEFAULT, 0, j);
+		_starpu_compute_buffers_footprint(NULL, STARPU_CPU_WORKER, 0, j);
 
 
 		if (last && last->cl == j->task->cl && last->footprint == j->footprint)
 		if (last && last->cl == j->task->cl && last->footprint == j->footprint)
 			tp = last;
 			tp = last;
@@ -400,7 +426,7 @@ static void _starpu_get_tasks_times(int nw, int nt, double *times)
 				.footprint = tp->footprint,
 				.footprint = tp->footprint,
 				.footprint_is_computed = 1,
 				.footprint_is_computed = 1,
 			};
 			};
-			enum starpu_perfmodel_archtype arch = starpu_worker_get_perf_archtype(w);
+			struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w);
 			double length = _starpu_history_based_job_expected_perf(tp->cl->model, arch, &j, j.nimpl);
 			double length = _starpu_history_based_job_expected_perf(tp->cl->model, arch, &j, j.nimpl);
 			if (isnan(length))
 			if (isnan(length))
 				times[w*nt+t] = NAN;
 				times[w*nt+t] = NAN;
@@ -486,15 +512,15 @@ void starpu_bound_print_lp(FILE *output)
 			};
 			};
 			for (w = 0; w < nw; w++)
 			for (w = 0; w < nw; w++)
 			{
 			{
-				enum starpu_perfmodel_archtype arch = starpu_worker_get_perf_archtype(w);
-				if (_STARPU_IS_ZERO(t1->duration[arch]))
+				struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w);
+				if (_STARPU_IS_ZERO(t1->duration[arch->type][arch->devid][arch->ncore]))
 				{
 				{
 					double length = _starpu_history_based_job_expected_perf(t1->cl->model, arch, &j,j.nimpl);
 					double length = _starpu_history_based_job_expected_perf(t1->cl->model, arch, &j,j.nimpl);
 					if (isnan(length))
 					if (isnan(length))
 						/* Avoid problems with binary coding of doubles */
 						/* Avoid problems with binary coding of doubles */
-						t1->duration[arch] = NAN;
+						t1->duration[arch->type][arch->devid][arch->ncore] = NAN;
 					else
-						t1->duration[arch] = length / 1000.;
+						t1->duration[arch->type][arch->devid][arch->ncore] = length / 1000.;
 				}
 			}
 			nt++;
@@ -519,8 +545,8 @@ void starpu_bound_print_lp(FILE *output)
 		{
 			for (w = 0; w < nw; w++)
 			{
-				enum starpu_perfmodel_archtype arch = starpu_worker_get_perf_archtype(w);
-				if (!isnan(t1->duration[arch]))
+				struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w);
+				if (!isnan(t1->duration[arch->type][arch->devid][arch->ncore]))
 					fprintf(output, " +t%luw%d", t1->id, w);
 			}
 			fprintf(output, " = 1;\n");
@@ -533,9 +559,9 @@ void starpu_bound_print_lp(FILE *output)
 			fprintf(output, "/* %s %x */\tc%lu = s%lu", _starpu_codelet_get_model_name(t1->cl), (unsigned) t1->footprint, t1->id, t1->id);
 			for (w = 0; w < nw; w++)
 			{
-				enum starpu_perfmodel_archtype arch = starpu_worker_get_perf_archtype(w);
-				if (!isnan(t1->duration[arch]))
-					fprintf(output, " + %f t%luw%d", t1->duration[arch], t1->id, w);
+				struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w);
+				if (!isnan(t1->duration[arch->type][arch->devid][arch->ncore]))
+					fprintf(output, " + %f t%luw%d", t1->duration[arch->type][arch->devid][arch->ncore], t1->id, w);
 			}
 			fprintf(output, ";\n");
 		}
@@ -616,8 +642,8 @@ void starpu_bound_print_lp(FILE *output)
 				{
 					for (w = 0; w < nw; w++)
 					{
-						enum starpu_perfmodel_archtype arch = starpu_worker_get_perf_archtype(w);
-						if (!isnan(t1->duration[arch]))
+						struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w);
+						if (!isnan(t1->duration[arch->type][arch->devid][arch->ncore]))
 						{
 							fprintf(output, "s%lu - c%lu >= -3e5 + 1e5 t%luw%d + 1e5 t%luw%d + 1e5 t%luafter%lu;\n",
 									t1->id, t2->id, t1->id, w, t2->id, w, t1->id, t2->id);

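Note on the hunks above: the linear-programming bound no longer keys task durations on a flat archtype enum but on the (type, devid, ncore) triple of struct starpu_perfmodel_arch. The sketch below only illustrates that indexing convention; MAX_DEVS, MAX_CORES and set_duration() are hypothetical names for illustration, while STARPU_NARCH and the arch fields come from the patch itself.

#include <math.h>
#include <starpu.h>

#define MAX_DEVS  8   /* illustrative bounds, not StarPU constants */
#define MAX_CORES 16

/* One duration slot per (worker type, device, combined-core count). */
static double duration[STARPU_NARCH][MAX_DEVS][MAX_CORES];

static void set_duration(struct starpu_perfmodel_arch *arch, double length_us)
{
	/* Same convention as the bound code: NAN when the model has no
	 * calibration data, milliseconds otherwise. */
	duration[arch->type][arch->devid][arch->ncore] =
		isnan(length_us) ? NAN : length_us / 1000.;
}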
+ 4 - 4
src/sched_policies/deque_modeling_policy_data_aware.c

@@ -407,7 +407,7 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio, unsigned sched
 		worker = workers->get_next(workers, &it);
 		struct _starpu_fifo_taskq *fifo  = dt->queue_array[worker];
 		unsigned memory_node = starpu_worker_get_memory_node(worker);
-		enum starpu_perfmodel_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
+		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker);
 
 		/* Sometimes workers didn't take the tasks as early as we expected */
 		double exp_start = STARPU_MAX(fifo->exp_start, starpu_timing_now());
@@ -543,7 +543,7 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 	{
 		worker = workers->get_next(workers, &it);
 		struct _starpu_fifo_taskq *fifo = dt->queue_array[worker];
-		enum starpu_perfmodel_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
+		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker);
 		unsigned memory_node = starpu_worker_get_memory_node(worker);
 
 		/* Sometimes workers didn't take the tasks as early as we expected */
@@ -760,7 +760,7 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 	}
 	else if (task->bundle)
 	{
-		enum starpu_perfmodel_archtype perf_arch = starpu_worker_get_perf_archtype(best_in_ctx);
+		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(best_in_ctx);
 		unsigned memory_node = starpu_worker_get_memory_node(best);
 		model_best = starpu_task_expected_length(task, perf_arch, selected_impl);
 		transfer_model_best = starpu_task_expected_data_transfer_time(memory_node, task);
@@ -931,7 +931,7 @@ static void dmda_push_task_notify(struct starpu_task *task, int workerid, unsign
 	struct _starpu_dmda_data *dt = (struct _starpu_dmda_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
 	struct _starpu_fifo_taskq *fifo = dt->queue_array[workerid];
 	/* Compute the expected penality */
-	enum starpu_perfmodel_archtype perf_arch = starpu_worker_get_perf_archtype(workerid);
+	struct starpu_perfmodel_arch *perf_arch = starpu_worker_get_perf_archtype(workerid);
 	unsigned memory_node = starpu_worker_get_memory_node(workerid);
 
 	double predicted = starpu_task_expected_length(task, perf_arch,

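For reference, a minimal sketch of how a scheduling policy queries the per-worker model architecture after this change. starpu_worker_get_count(), starpu_worker_get_perf_archtype() and starpu_task_expected_length() are the calls used in the hunks above; the printing helper itself is made up for illustration.

#include <stdio.h>
#include <starpu.h>

static void print_expected_lengths(struct starpu_task *task, unsigned nimpl)
{
	unsigned w, nworkers = starpu_worker_get_count();

	for (w = 0; w < nworkers; w++)
	{
		/* The perf archtype is now a pointer to a (type, devid, ncore) descriptor. */
		struct starpu_perfmodel_arch *perf_arch = starpu_worker_get_perf_archtype(w);
		double length = starpu_task_expected_length(task, perf_arch, nimpl);

		fprintf(stderr, "worker %u (type %d, dev %d, ncore %d): %f us\n",
			w, perf_arch->type, perf_arch->devid, perf_arch->ncore, length);
	}
}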
+ 2 - 2
src/sched_policies/parallel_heft.c

@@ -234,7 +234,7 @@ static double compute_expected_end(int workerid, double length)
 
 static double compute_ntasks_end(int workerid)
 {
-	enum starpu_perfmodel_archtype perf_arch = starpu_worker_get_perf_archtype(workerid);
+	struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(workerid);
 	starpu_pthread_mutex_t *sched_mutex;
 	starpu_pthread_cond_t *sched_cond;
 
@@ -355,7 +355,7 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio, uns
 			}
 
 
-			enum starpu_perfmodel_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
+			struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker);
 
 			local_task_length[worker_ctx][nimpl] = starpu_task_expected_length(task, perf_arch,nimpl);
 

+ 1 - 1
src/sched_policies/random_policy.c

@@ -50,7 +50,7 @@ static int _random_push_task(struct starpu_task *task, unsigned prio)
 		{
 			if(starpu_worker_can_execute_task(worker, task, impl))
 			{
-				enum starpu_perfmodel_archtype perf_arch = starpu_worker_get_perf_archtype(worker);
+				struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker);
 				double speedup = starpu_worker_get_relative_speedup(perf_arch);
 				alpha_sum += speedup;
 				speedup_arr[size] = speedup;

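The relative-speedup helper now takes the architecture descriptor directly, as in random_policy.c above; a minimal usage sketch (the wrapper name is made up):

#include <starpu.h>

static double speedup_of_worker(int workerid)
{
	struct starpu_perfmodel_arch *perf_arch = starpu_worker_get_perf_archtype(workerid);
	return starpu_worker_get_relative_speedup(perf_arch);
}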
+ 1 - 1
src/starpu_parameters.h

@@ -29,5 +29,5 @@
 #define _STARPU_CPU_ALPHA	1.0f
 #define _STARPU_CUDA_ALPHA	13.33f
 #define _STARPU_OPENCL_ALPHA	12.22f
-
+#define _STARPU_MIC_ALPHA	11.11f
 #endif /* _STARPU_PARAMETERS_H */

+ 3 - 0
tests/microbenchs/async_tasks_overhead.c

@@ -102,6 +102,9 @@ int main(int argc, char **argv)
 
 	/* Create an array of tasks */
 	struct starpu_task **tasks = (struct starpu_task **) malloc(ntasks*sizeof(struct starpu_task *));
+	
+	int lala=1;
+while(lala);
 
 	for (i = 0; i < ntasks; i++)
 	{

+ 1 - 1
tests/parallel_tasks/parallel_kernels.c

@@ -31,7 +31,7 @@ void codelet_null(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
 	int worker_size = starpu_combined_worker_get_size();
 	STARPU_ASSERT(worker_size > 0);
 	usleep(1000/worker_size);
-#if 0
+#if 1
 	int id = starpu_worker_get_id();
 	int combined_id = starpu_combined_worker_get_id();
 	FPRINTF(stderr, "worker id %d - combined id %d - worker size %d\n", id, combined_id, worker_size);

+ 13 - 4
tests/perfmodels/feed.c

@@ -50,6 +50,9 @@ int main(int argc, char **argv)
 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
+	 if(starpu_worker_get_count_by_type(STARPU_CUDA_WORKER) < 2)
+		 return STARPU_TEST_SKIPPED;
+
 	starpu_task_init(&task);
 	task.cl = &cl;
 
@@ -66,12 +69,18 @@ int main(int argc, char **argv)
 		measured_fast = 0.002+size*0.00000001;
 		measured_slow = 0.001+size*0.0000001;
 
+		struct starpu_perfmodel_arch arch;
+		arch.type = STARPU_CUDA_WORKER;
+		arch.ncore = 0;
 		/* Simulate Fast GPU */
-		starpu_perfmodel_update_history(&model, &task, STARPU_CUDA_DEFAULT, 0, 0, measured_fast);
-		starpu_perfmodel_update_history(&nl_model, &task, STARPU_CUDA_DEFAULT, 0, 0, measured_fast);
+		arch.devid = 0;
+		starpu_perfmodel_update_history(&model, &task, &arch, 0, 0, measured_fast);
+		starpu_perfmodel_update_history(&nl_model, &task, &arch, 0, 0, measured_fast);
+		
 		/* Simulate Slow GPU */
-		starpu_perfmodel_update_history(&model, &task, STARPU_CUDA_DEFAULT + 1, 0, 0, measured_slow);
-		starpu_perfmodel_update_history(&nl_model, &task, STARPU_CUDA_DEFAULT + 1, 0, 0, measured_slow);
+		arch.devid = 1;
+		starpu_perfmodel_update_history(&model, &task, &arch, 0, 0, measured_slow);
+		starpu_perfmodel_update_history(&nl_model, &task, &arch, 0, 0, measured_slow);
 		starpu_task_clean(&task);
 		starpu_data_unregister(handle);
 	}

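A condensed sketch of the calibration-feed pattern used in feed.c above: a measurement is now attached to an explicit struct starpu_perfmodel_arch rather than to a STARPU_CUDA_DEFAULT-style slot. The wrapper function is hypothetical; model and task setup is as in the test.

#include <starpu.h>

static void feed_cuda_sample(struct starpu_perfmodel *model, struct starpu_task *task,
                             int devid, double measured)
{
	struct starpu_perfmodel_arch arch;
	arch.type  = STARPU_CUDA_WORKER;  /* kind of worker that was measured */
	arch.devid = devid;               /* which physical CUDA device */
	arch.ncore = 0;                   /* no combined-core variation */

	/* cpuid = 0, nimpl = 0, as in the test above */
	starpu_perfmodel_update_history(model, task, &arch, 0, 0, measured);
}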
+ 13 - 5
tests/perfmodels/valid_model.c

@@ -60,12 +60,14 @@ static int submit(struct starpu_codelet *codelet, struct starpu_perfmodel *model
 	int ret;
 	int old_nsamples, new_nsamples;
 	struct starpu_conf conf;
-	unsigned archid;
+	unsigned archid, archtype, devid, ncore;
 
 	starpu_conf_init(&conf);
 	conf.sched_policy_name = "eager";
 	conf.calibrate = 1;
 
+	initialize_model(model);
+
 	ret = starpu_init(&conf);
 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
@@ -75,8 +77,11 @@ static int submit(struct starpu_codelet *codelet, struct starpu_perfmodel *model
 	old_nsamples = 0;
 	ret = starpu_perfmodel_load_symbol(codelet->model->symbol, &lmodel);
 	if (ret != 1)
-		for (archid = 0; archid < STARPU_NARCH_VARIATIONS; archid++)
-			old_nsamples += lmodel.per_arch[archid][0].regression.nsample;
+		for (archtype = 0; archtype < STARPU_NARCH; archtype++)
+			if(lmodel.per_arch[archtype] != NULL)
+				for(devid=0; lmodel.per_arch[archtype][devid] != NULL; devid++)
+					for(ncore=0; lmodel.per_arch[archtype][devid][ncore] != NULL; ncore++)
+						old_nsamples += lmodel.per_arch[archtype][devid][ncore][0].regression.nsample;
 
         starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, 100, sizeof(int));
 	for (loop = 0; loop < nloops; loop++)
@@ -97,8 +102,11 @@ static int submit(struct starpu_codelet *codelet, struct starpu_perfmodel *model
 	}
 
 	new_nsamples = 0;
-	for (archid = 0; archid < STARPU_NARCH_VARIATIONS; archid++)
-		new_nsamples += lmodel.per_arch[archid][0].regression.nsample;
+	for (archtype = 0; archtype < STARPU_NARCH; archtype++)
+		if(lmodel.per_arch[archtype] != NULL)
+			for(devid=0; lmodel.per_arch[archtype][devid] != NULL; devid++)
+				for(ncore=0; lmodel.per_arch[archtype][devid][ncore] != NULL; ncore++)
+					new_nsamples += lmodel.per_arch[archtype][devid][ncore][0].regression.nsample;
 
 	ret = starpu_perfmodel_unload_model(&lmodel);
 	if (ret == 1)

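The per_arch table is now a NULL-terminated three-level array (worker type, then device, then combined-core variant), each leaf holding STARPU_MAXIMPLEMENTATIONS entries. A small traversal sketch mirroring the loops added above (the function name is illustrative):

#include <starpu.h>

static int count_regression_samples(struct starpu_perfmodel *model)
{
	unsigned type, devid, ncore;
	int nsamples = 0;

	for (type = 0; type < STARPU_NARCH; type++)
	{
		if (model->per_arch[type] == NULL)
			continue;
		for (devid = 0; model->per_arch[type][devid] != NULL; devid++)
			for (ncore = 0; model->per_arch[type][devid][ncore] != NULL; ncore++)
				nsamples += model->per_arch[type][devid][ncore][0].regression.nsample;
	}
	return nsamples;
}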
+ 20 - 11
tests/sched_policies/simple_cpu_gpu_sched.c

@@ -40,7 +40,7 @@ dummy(void *buffers[], void *args)
  */
 static double
 cpu_task_cpu(struct starpu_task *task,
-	     enum starpu_perfmodel_archtype arch,
+	     struct starpu_perfmodel_arch* arch,
 	     unsigned nimpl)
 {
 	(void) task;
@@ -51,7 +51,7 @@ cpu_task_cpu(struct starpu_task *task,
 
 static double
 cpu_task_gpu(struct starpu_task *task,
-	     enum starpu_perfmodel_archtype arch,
+	     struct starpu_perfmodel_arch* arch,
 	     unsigned nimpl)
 {
 	(void) task;
@@ -63,7 +63,7 @@ cpu_task_gpu(struct starpu_task *task,
 
 static double
 gpu_task_cpu(struct starpu_task *task,
-	     enum starpu_perfmodel_archtype arch,
+	     struct starpu_perfmodel_arch* arch,
 	     unsigned nimpl)
 {
 	(void) task;
@@ -75,7 +75,7 @@ gpu_task_cpu(struct starpu_task *task,
 
 static double
 gpu_task_gpu(struct starpu_task *task,
-	     enum starpu_perfmodel_archtype arch,
+	     struct starpu_perfmodel_arch* arch,
 	     unsigned nimpl)
 {
 	(void) task;
@@ -99,17 +99,26 @@ static struct starpu_perfmodel model_gpu_task =
 static void
 init_perfmodels(void)
 {
-	int i;
-	for (i = STARPU_CPU_DEFAULT; i < STARPU_CUDA_DEFAULT; i++)
+	unsigned devid, ncore;
+
+	for(devid=0; model_cpu_task.per_arch[STARPU_CPU_WORKER][devid] != NULL; devid++)
 	{
-		model_cpu_task.per_arch[i][0].cost_function = cpu_task_cpu;
-		model_gpu_task.per_arch[i][0].cost_function = gpu_task_cpu;
+		for(ncore=0; model_cpu_task.per_arch[STARPU_CPU_WORKER][devid][ncore] != NULL; ncore++)
+		{
+			model_cpu_task.per_arch[STARPU_CPU_WORKER][devid][ncore][0].cost_function = cpu_task_cpu;
+			model_gpu_task.per_arch[STARPU_CPU_WORKER][devid][ncore][0].cost_function = gpu_task_cpu;
+		}
 	}
-	for (i = STARPU_CUDA_DEFAULT; i < STARPU_NARCH_VARIATIONS; i++)
+
+	for(devid=0; model_cpu_task.per_arch[STARPU_CUDA_WORKER][devid] != NULL; devid++)
 	{
-		model_cpu_task.per_arch[i][0].cost_function = cpu_task_gpu;
-		model_gpu_task.per_arch[i][0].cost_function = gpu_task_gpu;
+		for(ncore=0; model_cpu_task.per_arch[STARPU_CUDA_WORKER][devid][ncore] != NULL; ncore++)
+		{
+			model_cpu_task.per_arch[STARPU_CUDA_WORKER][devid][ncore][0].cost_function = cpu_task_gpu;
+			model_gpu_task.per_arch[STARPU_CUDA_WORKER][devid][ncore][0].cost_function = gpu_task_gpu;
+		}
 	}
+
 }
 
 /*

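Per-arch cost functions now receive the architecture descriptor rather than an enum, so they can discriminate on the device as well as on the worker type. A hedged sketch of the new signature (the function and its values are illustrative, not part of the test):

#include <starpu.h>

static double example_cost(struct starpu_task *task,
                           struct starpu_perfmodel_arch *arch,
                           unsigned nimpl)
{
	(void) task;
	(void) nimpl;

	/* e.g. pretend CUDA device 1 is twice as slow as device 0 */
	if (arch->type == STARPU_CUDA_WORKER)
		return arch->devid == 0 ? 1000.0 : 2000.0;

	return 10000.0;  /* CPUs and everything else */
}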
+ 1 - 1
tools/Makefile.am

@@ -16,7 +16,7 @@
 
 SUBDIRS =
 
-AM_CFLAGS = $(HWLOC_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(GLOBAL_AM_CFLAGS)
+AM_CFLAGS = $(HWLOC_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(STARPU_COI_CPPFLAGS) $(GLOBAL_AM_CFLAGS)
 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/tools/ -I$(top_srcdir)/mpi/ -I$(top_builddir)/src -I$(top_srcdir)/src
 AM_LDFLAGS = $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)

+ 186 - 83
tools/starpu_perfmodel_plot.c

@@ -29,6 +29,7 @@
 
 #include <starpu.h>
 #include <core/perfmodel/perfmodel.h> // we need to browse the list associated to history-based models
+#include <core/workers.h>
 
 #ifdef __MINGW32__
 #include <windows.h>
@@ -52,7 +53,7 @@ static struct starpu_fxt_options options;
 #endif
 
 #ifdef STARPU_USE_FXT
-static int archtype_is_found[STARPU_NARCH_VARIATIONS];
+static int **archtype_is_found[STARPU_NARCH];
 
 static char data_file_name[256];
 #endif
@@ -163,7 +164,7 @@ static void parse_args(int argc, char **argv)
 	if (!symbol && !list)
 	{
 		fprintf(stderr, "Incorrect usage, aborting\n");
-                usage(argv);
+                usage();
 		exit(-1);
 	}
 
@@ -181,13 +182,13 @@ static void print_comma(FILE *gnuplot_file, int *first)
 	}
 }
 
-static void display_perf_model(FILE *gnuplot_file, struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch, int *first, unsigned nimpl)
+static void display_perf_model(FILE *gnuplot_file, struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, int *first, unsigned nimpl)
 {
 	char arch_name[256];
 	starpu_perfmodel_get_arch_name(arch, arch_name, 256, nimpl);
 
 	struct starpu_perfmodel_per_arch *arch_model =
-		&model->per_arch[arch][nimpl];
+		&model->per_arch[arch->type][arch->devid][arch->ncore][nimpl];
 
 	if (arch_model->regression.valid || arch_model->regression.nl_valid)
 		fprintf(stderr,"Arch: %s\n", arch_name);
@@ -227,11 +228,10 @@ static void display_perf_model(FILE *gnuplot_file, struct starpu_perfmodel *mode
 	}
 }
 
-static void display_history_based_perf_models(FILE *gnuplot_file, struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch1, enum starpu_perfmodel_archtype arch2, int *first)
+static void display_history_based_perf_models(FILE *gnuplot_file, struct starpu_perfmodel *model, enum starpu_worker_archtype* type, int* devid, int* ncore, int *first)
 {
 	char *command;
 	FILE *datafile;
-	unsigned arch;
 	struct starpu_perfmodel_history_list *ptr;
 	char arch_name[32];
 	int col;
@@ -245,89 +245,169 @@ static void display_history_based_perf_models(FILE *gnuplot_file, struct starpu_
 
 	col = 2;
 	unsigned implid;
-	for (arch = arch1; arch < arch2; arch++)
+
+	unsigned archmin, archmax, devmin, devmax, coremin, coremax;
+	if(type != NULL)
 	{
-		for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
+		archmin = *type;
+		archmax = *type +1;
+		if(devid != NULL)
 		{
-			struct starpu_perfmodel_per_arch *arch_model = &model->per_arch[arch][implid];
-			starpu_perfmodel_get_arch_name((enum starpu_perfmodel_archtype) arch, arch_name, 32, implid);
-
-			//ptrs[arch-arch1][implid] = ptr[arch-arch1][implid] = arch_model->list;
+			devmin = *devid;
+			devmax = *devid +1;
+			if(ncore != NULL)
+			{
+				coremin = *ncore;
+				coremax = *ncore +1;
+			}
+			else
+			{
+				coremin = 0;
+				coremax = 0;
+			}
+		}
+		else
+		{
+			devmin = 0;
+			devmax = 0;
+			coremin = 0;
+			coremax = 0;
+		}
+	}
+	else
+	{
+		archmin = 0;
+		archmax = STARPU_NARCH;
+		devmin = 0;
+		devmax = 0;
+		coremin = 0;
+		coremax = 0;
 
-			if (arch_model->list)
+	}
+	struct starpu_perfmodel_arch arch;
+	unsigned archtype, dev, core;
+	for (archtype = archmin; archtype < archmax; archtype++)
+	{
+		arch.type = archtype;
+		if(model->per_arch[archtype]!=NULL)
+		{
+			for(dev = devmin; model->per_arch[archtype][dev] != NULL && (devmax == 0 || dev < devmax);dev++)
 			{
-				print_comma(gnuplot_file, first);
-				fprintf(gnuplot_file, "\"%s\" using 1:%d:%d with errorlines title \"Average %s\"", avg_file_name, col, col+1, arch_name);
-				col += 2;
+				arch.devid = dev;
+
+				for(core = coremin; model->per_arch[archtype][dev][core] != NULL && (coremax == 0 || core < coremax); core++)
+				{
+					arch.ncore = core;
+					for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
+					{
+						struct starpu_perfmodel_per_arch *arch_model = &model->per_arch[archtype][dev][core][implid];
+						starpu_perfmodel_get_arch_name(&arch, arch_name, 32, implid);
+
+						//ptrs[arch-arch1][implid] = ptr[arch-arch1][implid] = arch_model->list;
+
+						if (arch_model->list)
+						{
+							print_comma(gnuplot_file, first);
+							fprintf(gnuplot_file, "\"%s\" using 1:%d:%d with errorlines title \"Average %s\"", avg_file_name, col, col+1, arch_name);
+							col += 2;
+						}
+					}
+				}
 			}
 		}
 	}
 
+	/* Dump entries in size order */
 	while (1)
 	{
 		last = minimum;
 
 		minimum = ULONG_MAX;
 		/* Get the next minimum */
-		for (arch = arch1; arch < arch2; arch++)
-			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
-			{
-				struct starpu_perfmodel_per_arch *arch_model = &model->per_arch[arch][implid];
-				for (ptr = arch_model->list; ptr; ptr = ptr->next)
-				{
-					unsigned long size = ptr->entry->size;
-					if (size > last && size < minimum)
-						minimum = size;
-				}
-			}
+		for (archtype = archmin; archtype < archmax; archtype++)
+			if(model->per_arch[archtype]!=NULL)
+				for(dev = devmin; model->per_arch[archtype][dev] != NULL && (devmax == 0 || dev < devmax);dev++)
+					for(core = coremin; model->per_arch[archtype][dev][core] != NULL && (coremax == 0 || core < coremax); core++)
+						for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
+						{
+							struct starpu_perfmodel_per_arch *arch_model = &model->per_arch[archtype][dev][core][implid];
+							for (ptr = arch_model->list; ptr; ptr = ptr->next)
+							{
+								unsigned long size = ptr->entry->size;
+								if (size > last && size < minimum)
+									minimum = size;
+							}
+						}
 		if (minimum == ULONG_MAX)
 			break;
 
 		fprintf(stderr, "%lu ", minimum);
 		fprintf(datafile, "%-15lu ", minimum);
-		for (arch = arch1; arch < arch2; arch++)
-		{
-			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
-			{
-				struct starpu_perfmodel_per_arch *arch_model = &model->per_arch[arch][implid];
-				for (ptr = arch_model->list; ptr; ptr = ptr->next)
-				{
-					struct starpu_perfmodel_history_entry *entry = ptr->entry;
-					if (entry->size == minimum)
-					{
-						if (gflops)
-							fprintf(datafile, "\t%-15le\t%-15le", entry->flops / (entry->mean * 1000),
-									entry->flops / ((entry->mean + entry->deviation) * 1000) -
-									entry->flops / (entry->mean * 1000)
-									);
-						else
-							fprintf(datafile, "\t%-15le\t%-15le", 0.001*entry->mean, 0.001*entry->deviation);
-						break;
-					}
-				}
-				if (!ptr && arch_model->list)
-					/* No value for this arch. */
-					fprintf(datafile, "\t\"\"\t\"\"");
-			}
-		}
+		for (archtype = archmin; archtype < archmax; archtype++)
+			if(model->per_arch[archtype]!=NULL)
+				for(dev = devmin; model->per_arch[archtype][dev] != NULL && (devmax == 0 || dev < devmax);dev++)
+					for(core = coremin; model->per_arch[archtype][dev][core] != NULL && (coremax == 0 || core < coremax); core++)
+						for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
+						{
+							struct starpu_perfmodel_per_arch *arch_model = &model->per_arch[archtype][dev][core][implid];
+							for (ptr = arch_model->list; ptr; ptr = ptr->next)
+							{
+								struct starpu_perfmodel_history_entry *entry = ptr->entry;
+								if (entry->size == minimum)
+								{
+									if (gflops)
+										fprintf(datafile, "\t%-15le\t%-15le", entry->flops / (entry->mean * 1000),
+												entry->flops / ((entry->mean + entry->deviation) * 1000) -
+												entry->flops / (entry->mean * 1000)
+										       );
+									else
+										fprintf(datafile, "\t%-15le\t%-15le", 0.001*entry->mean, 0.001*entry->deviation);
+									break;
+								}
+							}
+							if (!ptr && arch_model->list)
+								/* No value for this arch. */
+								fprintf(datafile, "\t\"\"\t\"\"");
+						}
 		fprintf(datafile, "\n");
 	}
 	fprintf(stderr, "\n");
 	fclose(datafile);
 }
 
-static void display_perf_models(FILE *gnuplot_file, struct starpu_perfmodel *model, enum starpu_perfmodel_archtype arch1, enum starpu_perfmodel_archtype arch2, int *first)
+
+static void display_selected_arch_perf_models(FILE *gnuplot_file, struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, int *first)
 {
-	unsigned arch;
 	unsigned implid;
-	for (arch = arch1; arch < arch2; arch++)
+	for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
+		display_perf_model(gnuplot_file, model, arch, first, implid);
+}
+
+static void display_selected_device_perf_models(FILE *gnuplot_file, struct starpu_perfmodel *model, enum starpu_worker_archtype archtype, int devid, int *first)
+{
+	unsigned ncore;
+	struct starpu_perfmodel_arch arch;
+	arch.type = archtype;
+	arch.devid = devid;
+	for(ncore=0; model->per_arch[archtype][devid][ncore] != NULL; ncore++)
 	{
-		for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
-		{
-			display_perf_model(gnuplot_file, model, (enum starpu_perfmodel_archtype) arch, first, implid);
-		}
+		arch.ncore = ncore;
+		display_selected_arch_perf_models(gnuplot_file,model,&arch,first);
 	}
-	display_history_based_perf_models(gnuplot_file, model, arch1, arch2, first);
+}
+
+static void display_selected_archtype_perf_models(FILE *gnuplot_file, struct starpu_perfmodel *model, enum starpu_worker_archtype archtype, int *first)
+{
+	unsigned devid;
+	for(devid=0; model->per_arch[archtype][devid] != NULL; devid++)
+		display_selected_device_perf_models(gnuplot_file,model,archtype,devid,first);
+}
+
+static void display_all_perf_models(FILE *gnuplot_file, struct starpu_perfmodel *model, int *first)
+{
+	unsigned archtype;
+	for(archtype = 0; archtype < STARPU_NARCH; archtype++)
+		display_selected_archtype_perf_models(gnuplot_file,model,archtype,first);
 }
 
 #ifdef STARPU_USE_FXT
@@ -379,55 +459,72 @@ static void display_selected_models(FILE *gnuplot_file, struct starpu_perfmodel
 	int first = 1;
 	fprintf(gnuplot_file, "plot\t");
 
+	struct starpu_perfmodel_arch arch;
+	struct _starpu_machine_config *conf = _starpu_get_machine_config();
+
+
+
 	if (archname == NULL)
 	{
 		/* display all architectures */
-		display_perf_models(gnuplot_file, model, (enum starpu_perfmodel_archtype) 0, (enum starpu_perfmodel_archtype) STARPU_NARCH_VARIATIONS, &first);
+		display_all_perf_models(gnuplot_file, model, &first);
+		display_history_based_perf_models(gnuplot_file, model, NULL, NULL, NULL, &first);
 	}
 	else
 	{
 		if (strcmp(archname, "cpu") == 0)
 		{
-			unsigned impl;
-			for (impl = 0; impl < STARPU_MAXIMPLEMENTATIONS; impl++)
-			{
-				display_perf_model(gnuplot_file, model,
-						   STARPU_CPU_DEFAULT,
-						   &first, impl);
-			}
+			
+			arch.type = STARPU_CPU_WORKER;
+			arch.devid = 1;
+			arch.ncore = 0;
+
+			display_selected_arch_perf_models(gnuplot_file, model, &arch, &first); 
+			display_history_based_perf_models(gnuplot_file, model,  &arch.type, &arch.devid, &arch.ncore, &first);
 			return;
 		}
 
-		int k;
-		if (sscanf(archname, "cpu:%d", &k) == 1)
+		unsigned k;
+		if (sscanf(archname, "cpu:%u", &k) == 1)
 		{
 			/* For combined CPU workers */
-			if ((k < 1) || (k > STARPU_MAXCPUS))
+			if ((k < 1) || (k > conf->topology.ncpus))
 			{
 				fprintf(stderr, "Invalid CPU size\n");
 				exit(-1);
 			}
 
-			display_perf_models(gnuplot_file, model, (enum starpu_perfmodel_archtype) (STARPU_CPU_DEFAULT + k - 1), (enum starpu_perfmodel_archtype) (STARPU_CPU_DEFAULT + k), &first);
+			arch.type = STARPU_CPU_WORKER;
+			arch.devid = 1;
+			arch.ncore = k - 1;
+
+			display_selected_arch_perf_models(gnuplot_file, model, &arch, &first); 
+			display_history_based_perf_models(gnuplot_file, model,  &arch.type, &arch.devid, &arch.ncore, &first);
 			return;
 		}
 
 		if (strcmp(archname, "cuda") == 0)
 		{
-			display_perf_models(gnuplot_file, model, STARPU_CUDA_DEFAULT, (enum starpu_perfmodel_archtype) (STARPU_CUDA_DEFAULT + STARPU_MAXCUDADEVS), &first);
+			unsigned archtype = STARPU_CUDA_WORKER;
+			display_selected_archtype_perf_models(gnuplot_file, model, archtype, &first);
+			display_history_based_perf_models(gnuplot_file, model,  &archtype, NULL, NULL, &first);
 			return;
 		}
 
 		/* There must be a cleaner way ! */
-		int gpuid;
+		unsigned gpuid;
 		int nmatched;
-		nmatched = sscanf(archname, "cuda_%d", &gpuid);
+		nmatched = sscanf(archname, "cuda_%u", &gpuid);
 		if (nmatched == 1)
 		{
-			int archid = STARPU_CUDA_DEFAULT+ gpuid;
-			if (archid < STARPU_OPENCL_DEFAULT)
+			if (gpuid < conf->topology.ncudagpus)
 			{
-				display_perf_models(gnuplot_file, model, (enum starpu_perfmodel_archtype) archid, (enum starpu_perfmodel_archtype) (archid + 1), &first);
+				arch.type = STARPU_CUDA_WORKER;
+				arch.devid = gpuid;
+				arch.ncore = 0;
+
+				display_selected_arch_perf_models(gnuplot_file, model, &arch, &first);
+				display_history_based_perf_models(gnuplot_file, model,  &arch.type, &arch.devid, &arch.ncore, &first);
 				return;
 			}
 			else
@@ -439,18 +536,24 @@ static void display_selected_models(FILE *gnuplot_file, struct starpu_perfmodel
 
 		if (strcmp(archname, "opencl") == 0)
 		{
-			display_perf_models(gnuplot_file, model, STARPU_OPENCL_DEFAULT, (enum starpu_perfmodel_archtype) (STARPU_OPENCL_DEFAULT + STARPU_MAXOPENCLDEVS), &first);
+			unsigned archtype = STARPU_OPENCL_WORKER;
+			display_selected_archtype_perf_models(gnuplot_file, model, archtype, &first);
+			display_history_based_perf_models(gnuplot_file, model,  &archtype, NULL, NULL, &first);
 			return;
 		}
 
 		/* There must be a cleaner way ! */
-		nmatched = sscanf(archname, "opencl_%d", &gpuid);
+		nmatched = sscanf(archname, "opencl_%u", &gpuid);
 		if (nmatched == 1)
 		{
-			int archid = STARPU_OPENCL_DEFAULT+ gpuid;
-			if (archid < STARPU_NARCH_VARIATIONS)
+			if (gpuid < conf->topology.nopenclgpus)
 			{
-				display_perf_models(gnuplot_file, model, (enum starpu_perfmodel_archtype) archid, (enum starpu_perfmodel_archtype) (archid + 1), &first);
+				arch.type = STARPU_OPENCL_WORKER;
+				arch.devid = gpuid;
+				arch.ncore = 0;
+		
+				display_selected_arch_perf_models(gnuplot_file, model, &arch, &first);
+				display_history_based_perf_models(gnuplot_file, model,  &arch.type, &arch.devid, &arch.ncore, &first);
 				return;
 			}
 			else