
merge trunk

Samuel Thibault 12 years ago
Parent
commit
d5d7e412c3
100 changed files with 96383 additions and 357685 deletions
  1. 1 1
      Makefile.am
  2. 104 13
      configure.ac
  3. 7 0
      doc/doxygen/chapters/api/codelet_and_tasks.doxy
  4. 1 1
      doc/doxygen/chapters/api/scheduling_context_hypervisor.doxy
  5. 35 62
      doc/doxygen/chapters/api/scheduling_contexts.doxy
  6. 1 10
      doc/doxygen/chapters/api/scheduling_policy.doxy
  7. 43 0
      doc/doxygen/chapters/api/workers.doxy
  8. 93452 356126
      doc/doxygen/chapters/data_trace.eps
  9. BIN
      doc/doxygen/chapters/data_trace.pdf
  10. BIN
      doc/doxygen/chapters/data_trace.png
  11. 53 0
      doc/doxygen/chapters/environment_variables.doxy
  12. 7 5
      doc/doxygen/chapters/performance_feedback.doxy
  13. 65 15
      doc/doxygen/chapters/scheduling_context_hypervisor.doxy
  14. 49 25
      doc/doxygen/chapters/scheduling_contexts.doxy
  15. 1 1
      examples/Makefile.am
  16. 7 1
      examples/stencil/stencil-tasks.c
  17. 1 0
      include/starpu.h
  18. 1 1
      include/starpu_deprecated_api.h
  19. 1 1
      include/starpu_perfmodel.h
  20. 30 34
      include/starpu_sched_ctx.h
  21. 41 10
      mic-configure
  22. 70 60
      mpi/src/starpu_mpi.c
  23. 0 1
      mpi/src/starpu_mpi_collective.c
  24. 1 1
      sc_hypervisor/examples/lp_test/lp_resize_test.c
  25. 1 1
      sc_hypervisor/examples/lp_test/lp_test.c
  26. 6 0
      sc_hypervisor/include/sc_hypervisor.h
  27. 4 0
      sc_hypervisor/include/sc_hypervisor_lp.h
  28. 1 1
      sc_hypervisor/src/Makefile.am
  29. 1 1
      sc_hypervisor/src/hypervisor_policies/app_driven_policy.c
  30. 4 7
      sc_hypervisor/src/hypervisor_policies/feft_lp_policy.c
  31. 2 1
      sc_hypervisor/src/hypervisor_policies/gflops_rate_policy.c
  32. 5 219
      sc_hypervisor/src/hypervisor_policies/ispeed_lp_policy.c
  33. 3 7
      sc_hypervisor/src/hypervisor_policies/teft_lp_policy.c
  34. 11 11
      sc_hypervisor/src/hypervisor_policies/debit_lp_policy.c
  35. 3 2
      sc_hypervisor/src/policies_utils/dichotomy.c
  36. 216 1
      sc_hypervisor/src/policies_utils/lp_programs.c
  37. 47 26
      sc_hypervisor/src/policies_utils/lp_tools.c
  38. 5 3
      sc_hypervisor/src/policies_utils/policy_tools.c
  39. 9 5
      sc_hypervisor/src/policies_utils/speed.c
  40. 82 19
      sc_hypervisor/src/sc_hypervisor.c
  41. 2 0
      sc_hypervisor/src/sc_hypervisor_intern.h
  42. 4 2
      src/Makefile.am
  43. 4 0
      src/common/barrier_counter.h
  44. 1 1
      src/common/uthash.h
  45. 24 4
      src/core/combined_workers.c
  46. 4 4
      src/core/debug.c
  47. 2 2
      src/core/dependencies/tags.c
  48. 5 1
      src/core/dependencies/task_deps.c
  49. 86 25
      src/core/detect_combined_workers.c
  50. 2 0
      src/core/disk.h
  51. 2 2
      src/core/disk_ops/disk_stdio.c
  52. 2 2
      src/core/disk_ops/unistd/disk_unistd_global.c
  53. 5 2
      src/core/jobs.c
  54. 26 91
      src/core/sched_ctx.c
  55. 1 6
      src/core/sched_ctx.h
  56. 86 0
      src/core/sched_ctx_list.c
  57. 32 0
      src/core/sched_ctx_list.h
  58. 33 55
      src/core/sched_policy.c
  59. 11 10
      src/core/topology.c
  60. 11 9
      src/core/workers.c
  61. 6 2
      src/core/workers.h
  62. 5 2
      src/datawizard/coherency.c
  63. 1 0
      src/datawizard/data_request.c
  64. 0 2
      src/datawizard/filters.c
  65. 9 11
      src/datawizard/reduction.c
  66. 60 13
      src/debug/traces/starpu_fxt.c
  67. 93 21
      src/drivers/driver_common/driver_common.c
  68. 1 1
      src/drivers/driver_common/driver_common.h
  69. 18 4
      src/drivers/mic/driver_mic_common.c
  70. 8 5
      src/drivers/mic/driver_mic_common.h
  71. 88 14
      src/drivers/mic/driver_mic_sink.c
  72. 5 3
      src/drivers/mic/driver_mic_sink.h
  73. 50 215
      src/drivers/mic/driver_mic_source.c
  74. 3 1
      src/drivers/mic/driver_mic_source.h
  75. 155 101
      src/drivers/mp_common/mp_common.c
  76. 88 29
      src/drivers/mp_common/mp_common.h
  77. 487 142
      src/drivers/mp_common/sink_common.c
  78. 9 2
      src/drivers/mp_common/sink_common.h
  79. 432 64
      src/drivers/mp_common/source_common.c
  80. 28 9
      src/drivers/mp_common/source_common.h
  81. 8 0
      src/drivers/scc/driver_scc_common.c
  82. 2 0
      src/drivers/scc/driver_scc_common.h
  83. 34 0
      src/drivers/scc/driver_scc_sink.c
  84. 5 0
      src/drivers/scc/driver_scc_sink.h
  85. 28 121
      src/drivers/scc/driver_scc_source.c
  86. 1 0
      src/drivers/scc/driver_scc_source.h
  87. 3 3
      src/sched_policies/deque_modeling_policy_data_aware.c
  88. 16 18
      src/sched_policies/parallel_eager.c
  89. 1 1
      src/sched_policies/parallel_heft.c
  90. 2 2
      src/sched_policies/random_policy.c
  91. 1 1
      src/sched_policies/work_stealing_policy.c
  92. 7 2
      src/util/starpu_data_cpy.c
  93. 3 3
      tests/Makefile.am
  94. 2 0
      tests/datawizard/acquire_cb_insert.c
  95. 6 2
      tests/datawizard/commute.c
  96. 2 0
      tests/datawizard/data_invalidation.c
  97. 1 1
      tests/datawizard/interfaces/coo/coo_interface.c
  98. 1 1
      tests/disk/disk_copy.c
  99. 0 1
      tests/errorcheck/invalid_blocking_calls.c
  100. 0 0
      tests/loader-cross.sh.in

+ 1 - 1
Makefile.am

@@ -123,7 +123,7 @@ else
 txtdir = ${docdir}
 endif
 txt_DATA = AUTHORS COPYING.LGPL README STARPU-REVISION
-EXTRA_DIST = AUTHORS COPYING.LGPL README STARPU-VERSION STARPU-REVISION build-aux/svn2cl.xsl
+EXTRA_DIST = AUTHORS COPYING.LGPL README STARPU-VERSION STARPU-REVISION build-aux/svn2cl.xsl mic-configure
 
 DISTCLEANFILES = STARPU-REVISION
 

+ 104 - 13
configure.ac

@@ -344,11 +344,6 @@ if test x$enable_cpu = xyes; then
 	AC_DEFINE(STARPU_USE_CPU, [1], [CPU driver is activated])
 fi
 
-# How many parallel worker can we support ?
-nmaxcombinedworkers=`expr 2 \* $maxcpus`
-AC_DEFINE_UNQUOTED(STARPU_NMAX_COMBINEDWORKERS,
-	[$nmaxcombinedworkers], [Maximum number of worker combinations])
-
 ###############################################################################
 #                                                                             #
 #                                 CUDA settings                               #
@@ -969,7 +964,7 @@ AC_DEFINE_UNQUOTED(STARPU_MAXMICDEVS, [$nmaxmicdev],
 AC_MSG_CHECKING(maximum number of MIC threads)
 AC_ARG_ENABLE(maxmicthreads, [AS_HELP_STRING([--enable-maxmicthreads=<number>],
 			[maximum number of MIC threads])],
-			nmaxmicthreads=$enableval, nmaxmicthreads=128)
+			nmaxmicthreads=$enableval, nmaxmicthreads=940)
 AC_MSG_RESULT($nmaxmicthread)
 
 AC_DEFINE_UNQUOTED(STARPU_MAXMICCORES, [$nmaxmicthreads],
@@ -998,7 +993,6 @@ AC_DEFUN([STARPU_CHECK_COI_RUNTIME],
     __coi_dir=$1
     __coi_include_dir=$2
     __coi_lib_dir=$3
-    __coi_lib_name=$4
 
     if test "$__coi_dir" != "no" -a "$__coi_dir" != "" ; then
 	AC_MSG_CHECKING(whether MIC's COI runtime is available in $__coi_dir)
@@ -1027,14 +1021,14 @@ AC_DEFUN([STARPU_CHECK_COI_RUNTIME],
     AC_CHECK_HEADER([source/COIEngine_source.h],[have_valid_coi=yes],[have_valid_coi=no])
 
     if test "$have_valid_coi" = "yes" ; then
-	AC_HAVE_LIBRARY([$__coi_lib_name],[have_valid_coi=yes],[have_valid_coi=no])
+	AC_HAVE_LIBRARY([$4],[have_valid_coi=yes],[have_valid_coi=no])
 
         if test "$have_valid_coi" = "no" ; then
             if test "$3" = "no" -a "$__coi_dir" != "no" ; then
 		# ${__coi_dir}/lib didn't work, let's try with lib64
                 __coi_lib_dir="$__coi_dir/lib64"
 		LDFLAGS="${SAVED_LDFLAGS} -L$__coi_lib_dir"
-	        AC_HAVE_LIBRARY([$__coi_lib_name],[have_valid_coi=yes],[have_valid_coi=no])
+	        AC_HAVE_LIBRARY([$4],[have_valid_coi=yes],[have_valid_coi=no])
             fi
         fi
     fi
@@ -1043,8 +1037,89 @@ AC_DEFUN([STARPU_CHECK_COI_RUNTIME],
         STARPU_COI_CPPFLAGS="-I$__coi_include_dir"
     fi
 
-    if test "$have_valid_coi" = "yes" -a "$__coi_lib_dir" != "no"; then
-        STARPU_COI_LDFLAGS="-L$__coi_lib_dir -l$__coi_lib_name"
+    if test "$have_valid_coi" = "yes" ; then
+        if test "$__coi_lib_dir" != "no"; then
+	    STARPU_COI_LDFLAGS="-L$__coi_lib_dir"
+        fi
+	STARPU_COI_LDFLAGS="${STARPU_COI_LDFLAGS} -l$4"
+    fi
+
+    CPPFLAGS="${SAVED_CPPFLAGS}"
+    LDFLAGS="${SAVED_LDFLAGS}"
+])
+
+AC_ARG_WITH(scif-dir,
+	[AS_HELP_STRING([--with-scif-dir=<path>],
+	[specify the MIC's SCIF installation directory])],
+	[scif_dir="$withval"],
+	[scif_dir=no])
+
+AC_ARG_WITH(scif-include-dir,
+	[AS_HELP_STRING([--with-scif-include-dir=<path>],
+	[specify where the MIC's SCIF headers are installed])],
+	[scif_include_dir="$withval"],
+	[scif_include_dir=no])
+
+AC_ARG_WITH(scif-lib-dir,
+	[AS_HELP_STRING([--with-scif-lib-dir=<path>],
+	[specify where the MIC's SCIF libraries are installed])],
+	[scif_lib_dir="$withval"],
+	[scif_lib_dir=no])
+
+AC_DEFUN([STARPU_CHECK_SCIF_RUNTIME],
+[
+    __scif_dir=$1
+    __scif_include_dir=$2
+    __scif_lib_dir=$3
+
+    if test "$__scif_dir" != "no" -a "$__scif_dir" != "" ; then
+	AC_MSG_CHECKING(whether MIC's SCIF runtime is available in $__scif_dir)
+    else
+	AC_MSG_CHECKING(whether MIC's SCIF runtime is available)
+    fi
+    AC_MSG_RESULT()
+
+    if test "$__scif_include_dir" = "no" -a "$__scif_dir" != "no" ; then
+        __scif_include_dir="${__scif_dir}/include"
+    fi
+    if test "$__scif_lib_dir" = "no" -a "$__scif_dir" != "no" ; then
+        __scif_lib_dir="${__scif_dir}/lib"
+    fi
+
+    SAVED_CPPFLAGS="$CPPFLAGS"
+    SAVED_LDFLAGS="$LDFLAGS"
+
+    if test "$__scif_include_dir" != "no" ; then
+        CPPFLAGS="${CPPFLAGS} -I$__scif_include_dir"
+    fi
+    if test "$__scif_lib_dir" != "no" ; then
+	LDFLAGS="${LDFLAGS} -L$__scif_lib_dir"
+    fi
+
+#    AC_CHECK_HEADER([source/SCIFEngine_source.h],[have_valid_scif=yes],[have_valid_scif=no])
+
+#    if test "$have_valid_scif" = "yes" ; then
+	AC_HAVE_LIBRARY([scif],[have_valid_scif=yes],[have_valid_scif=no])
+
+        if test "$have_valid_scif" = "no" ; then
+            if test "$3" = "no" -a "$__scif_dir" != "no" ; then
+		# ${__scif_dir}/lib didn't work, let's try with lib64
+                __scif_lib_dir="$__scif_dir/lib64"
+		LDFLAGS="${SAVED_LDFLAGS} -L$__scif_lib_dir"
+	        AC_HAVE_LIBRARY([scif],[have_valid_scif=yes],[have_valid_scif=no])
+            fi
+        fi
+#    fi
+
+    if test "$have_valid_scif" = "yes" -a "$__scif_include_dir" != "no"; then
+        STARPU_SCIF_CPPFLAGS="-I$__scif_include_dir"
+    fi
+
+    if test "$have_valid_scif" = "yes" ; then
+        if test "$__scif_lib_dir" != "no"; then
+	    STARPU_SCIF_LDFLAGS="-L$__scif_lib_dir"
+        fi
+	STARPU_SCIF_LDFLAGS="${STARPU_SCIF_LDFLAGS} -lscif"
     fi
 
     CPPFLAGS="${SAVED_CPPFLAGS}"
@@ -1053,20 +1128,27 @@ AC_DEFUN([STARPU_CHECK_COI_RUNTIME],
 
 if test x$enable_mic = xyes ; then
 
-    STARPU_CHECK_COI_RUNTIME($coi_dir, $coi_include_dir, $coi_lib_dir, "coi_host")
+    STARPU_CHECK_COI_RUNTIME($coi_dir, $coi_include_dir, $coi_lib_dir, coi_host)
 
     # Host runtime is not compatible, we are probably cross-compiling
     # Let's have a look for the device runtime which lib has a different name
     if test "$have_valid_coi" = "no" ; then
-	    STARPU_CHECK_COI_RUNTIME($coi_dir, $coi_include_dir, $coi_lib_dir, "coi_device")
+	    STARPU_CHECK_COI_RUNTIME($coi_dir, $coi_include_dir, $coi_lib_dir, coi_device)
     fi
 
+    STARPU_CHECK_SCIF_RUNTIME($scif_dir, $scif_include_dir, $scif_lib_dir)
+
     if test "$have_valid_coi" = "no" ; then
 	AC_MSG_ERROR([cannot find MIC's COI runtime])
     fi
+    if test "$have_valid_scif" = "no" ; then
+	AC_MSG_ERROR([cannot find MIC's SCIF runtime])
+    fi
 
     AC_SUBST(STARPU_COI_CPPFLAGS)
     AC_SUBST(STARPU_COI_LDFLAGS)
+    AC_SUBST(STARPU_SCIF_CPPFLAGS)
+    AC_SUBST(STARPU_SCIF_LDFLAGS)
 fi
 
 ###############################################################################
@@ -1478,6 +1560,15 @@ AC_MSG_CHECKING(Maximum number of workers)
 AC_MSG_RESULT($nmaxworkers)
 AC_DEFINE_UNQUOTED(STARPU_NMAXWORKERS, [$nmaxworkers], [Maximum number of workers])
 
+# Computes the maximum number of combined workers
+nmaxcombinedworkers=`expr $maxcpus + $nmaxmicthreads`
+AC_MSG_CHECKING(Maximum number of workers combinations)
+AC_MSG_RESULT($nmaxcombinedworkers)
+AC_DEFINE_UNQUOTED(STARPU_NMAX_COMBINEDWORKERS,
+	[$nmaxcombinedworkers], [Maximum number of worker combinations])
+
+
+
 # Computes the maximum number of implementations per arch
 AC_MSG_CHECKING(maximum number of implementations)
 AC_ARG_ENABLE(maximplementations, [AS_HELP_STRING([--enable-maximplementations=<number>],

+ 7 - 0
doc/doxygen/chapters/api/codelet_and_tasks.doxy

@@ -649,6 +649,13 @@ starpu_task_submit() can be called from anywhere, including codelet
 functions and callbacks, provided that the field
 starpu_task::synchronous is set to 0.
 
+\fn int starpu_task_submit_to_ctx(struct starpu_task *task, unsigned sched_ctx_id)
+\ingroup API_Codelet_And_Tasks
+This function submits a task to StarPU in the context <c>sched_ctx_id</c>.
+By default, starpu_task_submit() submits the task to a global context that is
+created automatically by StarPU.
+
+
 \fn int starpu_task_wait_for_all(void)
 \ingroup API_Codelet_And_Tasks
 This function blocks until all the tasks that were submitted
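
For illustration, a minimal sketch of calling the function documented above; the codelet cl and the helper name are hypothetical:

#include <starpu.h>

extern struct starpu_codelet cl;   /* hypothetical codelet defined elsewhere */

int submit_in_ctx(unsigned sched_ctx)
{
	struct starpu_task *task = starpu_task_create();
	task->cl = &cl;
	task->synchronous = 0;

	/* submit to the given context rather than to the current one */
	return starpu_task_submit_to_ctx(task, sched_ctx);
}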

+ 1 - 1
doc/doxygen/chapters/api/scheduling_context_hypervisor.doxy

@@ -2,7 +2,7 @@
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
- * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
+ * Copyright (C) 2011, 2012, 2013 Institut National de Recherche en Informatique et Automatique
  * See the file version.doxy for copying conditions.
  */
 

+ 35 - 62
doc/doxygen/chapters/api/scheduling_contexts.doxy

@@ -1,4 +1,4 @@
-*
+/*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
@@ -16,47 +16,6 @@ starpu tasks to them and we schedule them with the policy assigned to
 the context. Scheduling contexts can be created, deleted and modified
 dynamically.
 
-\enum starpu_worker_collection_type
-\ingroup API_Scheduling_Contexts
-types of structures the worker collection can implement
-\var starpu_worker_collection_type::STARPU_WORKER_LIST
-\ingroup API_Scheduling_Contexts
-List of workers
-
-\struct starpu_sched_ctx_iterator
-\ingroup API_Scheduling_Contexts
-todo
-\var starpu_sched_ctx_iterator::cursor
-todo
-
-\struct starpu_worker_collection
-\ingroup API_Scheduling_Contexts
-A scheduling context manages a collection of workers that can
-be memorized using different data structures. Thus, a generic
-structure is available in order to simplify the choice of its type.
-Only the list data structure is available but further data
-structures(like tree) implementations are foreseen.
-\var starpu_worker_collection::workerids
-        The workerids managed by the collection
-\var starpu_worker_collection::nworkers
-        The number of workers in the collection
-\var starpu_worker_collection::type
-        The type of structure (currently ::STARPU_WORKER_LIST is the only one available)
-\var starpu_worker_collection::has_next
-        Checks if there is another element in collection
-\var starpu_worker_collection::get_next
-        return the next element in the collection
-\var starpu_worker_collection::add
-        add a new element in the collection
-\var starpu_worker_collection::remove
-        remove an element from the collection
-\var starpu_worker_collection::init
-        Initialize the collection
-\var starpu_worker_collection::deinit
-        Deinitialize the colection
-\var starpu_worker_collection::init_iterator
-        Initialize the cursor if there is one
-
 \struct starpu_sched_ctx_performance_counters
 Performance counters used by the starpu to indicate the
 hypervisor how the application and the resources are executing.
@@ -66,11 +25,16 @@ hypervisor how the application and the resources are executing.
 \var starpu_sched_ctx_performance_counters::notify_idle_end
         Informs the hypervisor that after a period of idle, the worker has just executed a task in the specified context. The idle counter it though reset.
 \var starpu_sched_ctx_performance_counters::notify_pushed_task
-        Notifies the hypervisor a task has been scheduled on the queue of the worker corresponding to the specified context
+        Notifies the hypervisor that a task has been scheduled on the queue of the worker corresponding to the specified context
 \var starpu_sched_ctx_performance_counters::notify_poped_task
-        Informs the hypervisor a task executing a specified number of instructions has been poped from the worker
+        Informs the hypervisor that a task executing a specified number of instructions has been poped from the worker
 \var starpu_sched_ctx_performance_counters::notify_post_exec_hook
-        Notifies the hypervisor a task has just been executed
+        Notifies the hypervisor that a task has just been executed
+\var starpu_sched_ctx_performance_counters::notify_submitted_job
+        Notifies the hypervisor that a task has just been submitted
+\var starpu_sched_ctx_performance_counters::notify_delete_context
+        Notifies the hypervisor that the context was deleted
+
 
 @name Scheduling Contexts Basic API
 \ingroup API_Scheduling_Contexts
@@ -99,11 +63,6 @@ tasks will be submitted to. The return value should be at most
 \ingroup API_Scheduling_Contexts
 Create a context indicating an approximate interval of resources
 
-\fn void starpu_sched_ctx_delete(unsigned sched_ctx_id)
-\ingroup API_Scheduling_Contexts
-Delete scheduling context \p sched_ctx_id and transfer remaining
-workers to the inheritor scheduling context.
-
 \fn void starpu_sched_ctx_add_workers(int *workerids_ctx, int nworkers_ctx, unsigned sched_ctx_id)
 \ingroup API_Scheduling_Contexts
 This function adds dynamically the workers in \p workerids_ctx to the
@@ -116,6 +75,11 @@ This function removes the workers in \p workerids_ctx from the context
 \p sched_ctx_id. The last argument cannot be greater than
 STARPU_NMAX_SCHED_CTXS.
 
+\fn void starpu_sched_ctx_delete(unsigned sched_ctx_id)
+\ingroup API_Scheduling_Contexts
+Delete scheduling context \p sched_ctx_id and transfer remaining
+workers to the inheritor scheduling context.
+
 \fn void starpu_sched_ctx_set_inheritor(unsigned sched_ctx_id, unsigned inheritor)
 \ingroup API_Scheduling_Contexts
 Indicate which context whill inherit the resources of this context
@@ -134,12 +98,18 @@ Return the scheduling context the tasks are currently submitted to
 Stop submitting tasks from the empty context list until the next time
 the context has time to check the empty context list
 
-\fn void starpu_sched_ctx_finished_submit(unsigned sched_ctx_id);
+\fn void starpu_sched_ctx_finished_submit(unsigned sched_ctx_id)
 \ingroup API_Scheduling_Contexts
 Indicate starpu that the application finished submitting to this
 context in order to move the workers to the inheritor as soon as
 possible.
 
+\fn unsigned starpu_sched_ctx_get_workers_list(unsigned sched_ctx_id, int **workerids)
+\ingroup API_Scheduling_Contexts
+Returns the list of workers in the array \p workerids, the returned value is the 
+number of workers. The user should free the \p workerids table after finishing
+using it (it is allocated inside the function with the proper size)
+
 \fn unsigned starpu_sched_ctx_get_nworkers(unsigned sched_ctx_id)
 \ingroup API_Scheduling_Contexts
 Return the number of workers managed by the specified contexts
@@ -169,11 +139,6 @@ Manage sharing of resources between contexts: by default a round_robin
 strategy is executed but the user can interfere to tell which ctx has
 its turn to pop.
 
-\fn double starpu_sched_ctx_get_max_time_worker_on_ctx(void)
-\ingroup API_Scheduling_Contexts
-Time sharing a resources, indicate how long a worker has been active
-in the current sched_ctx.
-
 @name Scheduling Context Priorities
 \ingroup API_Scheduling_Contexts
 
@@ -235,12 +200,6 @@ Delete the worker collection of the specified scheduling context
 \ingroup API_Scheduling_Contexts
 Return the worker collection managed by the indicated context
 
-\fn unsigned starpu_sched_ctx_get_workers_list(unsigned sched_ctx_id, int **workerids)
-\ingroup API_Scheduling_Contexts
-Returns the list of workers in the array \p workerids, the returned value is the 
-number of workers. The user should free the \p workerids table after finishing
-using it (it is allocated inside the function with the proper size)
-
 @name Scheduling Context Link with Hypervisor
 \ingroup API_Scheduling_Contexts
 
@@ -261,4 +220,18 @@ Allow the hypervisor to let starpu know he's initialised
 \ingroup API_Scheduling_Contexts
 Ask starpu if he is informed if the hypervisor is initialised
 
+\fn void starpu_sched_ctx_set_policy_data(unsigned sched_ctx_id, void *policy_data)
+\ingroup API_Scheduling_Contexts
+Allocate the scheduling policy data (private information of the scheduler like queues, variables,
+additional condition variables) to the context
+
+\fn void *starpu_sched_ctx_get_policy_data(unsigned sched_ctx_id)
+\ingroup API_Scheduling_Contexts
+Return the scheduling policy data (private information of the scheduler) previously
+assigned to the context.
+
+\fn void *starpu_sched_ctx_exec_parallel_code(void* (*func)(void*), void *param, unsigned sched_ctx_id)
+\ingroup API_Scheduling_Contexts
+Execute any parallel code on the workers of the sched_ctx (the workers are blocked)
+
 */
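
A minimal sketch of the starpu_sched_ctx_get_workers_list() usage documented above (the worker array is allocated by the call and freed by the caller); sched_ctx is assumed to be a valid context identifier:

#include <stdio.h>
#include <stdlib.h>
#include <starpu.h>

void print_ctx_workers(unsigned sched_ctx)
{
	int *workerids = NULL;
	unsigned i;
	unsigned nworkers = starpu_sched_ctx_get_workers_list(sched_ctx, &workerids);

	for (i = 0; i < nworkers; i++)
		printf("context %u uses worker %d\n", sched_ctx, workerids[i]);

	/* the array is allocated inside the call, so the caller frees it */
	free(workerids);
}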

+ 1 - 10
doc/doxygen/chapters/api/scheduling_policy.doxy

@@ -27,7 +27,7 @@ starpu_init().
         Insert a task into the scheduler.
 \var starpu_sched_policy::push_task_notify
         Notify the scheduler that a task was pushed on a given worker.
-	This method is called when a task that was explicitely
+	This method is called when a task that was explicitly
 	assigned to a worker becomes ready and is about to be executed
 	by the worker. This method therefore permits to keep the state
 	of the scheduler coherent even when StarPU bypasses the
@@ -73,15 +73,6 @@ condition variable. For instance, in the case of a scheduling strategy
 with a single task queue, the same condition variable would be used to
 block and wake up all workers.
 
-\fn void starpu_sched_ctx_set_policy_data(unsigned sched_ctx_id, void *policy_data)
-\ingroup API_Scheduling_Policy
-Each scheduling policy uses some specific data (queues, variables,
-additional condition variables). It is memorize through a local
-structure. This function assigns it to a scheduling context.
-
-\fn void *starpu_sched_ctx_get_policy_data(unsigned sched_ctx_id)
-\ingroup API_Scheduling_Policy
-Returns the policy data previously assigned to a context
 
 \fn int starpu_sched_set_min_priority(int min_prio)
 \ingroup API_Scheduling_Policy
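
The per-context policy data accessors removed here are now documented with the scheduling contexts API; a minimal sketch of how a scheduling policy might use them, with a hypothetical my_policy_data structure and hook names:

#include <stdlib.h>
#include <starpu.h>

struct my_policy_data		/* hypothetical private scheduler state */
{
	int dummy;		/* e.g. task queues, mutexes, condition variables */
};

static void my_init_sched(unsigned sched_ctx_id)
{
	struct my_policy_data *data = calloc(1, sizeof(*data));
	starpu_sched_ctx_set_policy_data(sched_ctx_id, data);
}

static void my_deinit_sched(unsigned sched_ctx_id)
{
	struct my_policy_data *data = starpu_sched_ctx_get_policy_data(sched_ctx_id);
	free(data);
}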

+ 43 - 0
doc/doxygen/chapters/api/workers.doxy

@@ -62,6 +62,49 @@ Intel MIC device
 Intel SCC device
 
 
+\struct starpu_worker_collection
+\ingroup API_Workers_Properties
+A scheduling context manages a collection of workers that can
+be memorized using different data structures. Thus, a generic
+structure is available in order to simplify the choice of its type.
+Only the list data structure is available but further data
+structures (like trees) are foreseen.
+\var starpu_worker_collection::workerids
+        The workerids managed by the collection
+\var starpu_worker_collection::nworkers
+        The number of workers in the collection
+\var starpu_worker_collection::type
+        The type of structure (currently ::STARPU_WORKER_LIST is the only one available)
+\var starpu_worker_collection::has_next
+        Checks if there is another element in collection
+\var starpu_worker_collection::get_next
+        return the next element in the collection
+\var starpu_worker_collection::add
+        add a new element in the collection
+\var starpu_worker_collection::remove
+        remove an element from the collection
+\var starpu_worker_collection::init
+        Initialize the collection
+\var starpu_worker_collection::deinit
+        Deinitialize the collection
+\var starpu_worker_collection::init_iterator
+        Initialize the cursor if there is one
+
+\enum starpu_worker_collection_type
+\ingroup API_Workers_Properties
+Types of structures the worker collection can implement
+\var starpu_worker_collection_type::STARPU_WORKER_LIST
+\ingroup API_Workers_Properties
+The collection is an array
+
+\struct starpu_sched_ctx_iterator
+\ingroup API_Workers_Properties
+Structure needed to iterate on the collection
+\var starpu_sched_ctx_iterator::cursor
+The index of the current worker in the collection, needed when iterating on
+the collection.
+
+
 \fn unsigned starpu_worker_get_count(void)
 \ingroup API_Workers_Properties
 This function returns the number of workers (i.e. processing
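
A minimal sketch of iterating over a context's worker collection through the members documented above; the exact calling convention of init_iterator/has_next/get_next is an assumption:

#include <starpu.h>

void visit_ctx_workers(unsigned sched_ctx_id)
{
	struct starpu_worker_collection *workers =
		starpu_sched_ctx_get_worker_collection(sched_ctx_id);
	struct starpu_sched_ctx_iterator it;

	if (workers->init_iterator)
		workers->init_iterator(workers, &it);

	while (workers->has_next(workers, &it))
	{
		int workerid = workers->get_next(workers, &it);
		(void) workerid;	/* inspect or use workerid here */
	}
}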

The diff for this file is not shown because it is too large.
+ 93452 - 356126
doc/doxygen/chapters/data_trace.eps


BIN
doc/doxygen/chapters/data_trace.pdf


BIN
doc/doxygen/chapters/data_trace.png


+ 53 - 0
doc/doxygen/chapters/environment_variables.doxy

@@ -550,4 +550,57 @@ end of the execution of an application (\ref DataStatistics).
 
 </dl>
 
+\section ConfiguringTheHypervisor Configuring The Hypervisor
+
+<dl>
+
+<dt>SC_HYPERVISOR_POLICY</dt>
+<dd>
+\anchor SC_HYPERVISOR_POLICY
+\addindex __env__SC_HYPERVISOR_POLICY
+Choose between the different resizing policies proposed by StarPU for the hypervisor:
+idle, app_driven, feft_lp, teft_lp, ispeed_lp, throughput_lp, etc.
+
+Use <c>SC_HYPERVISOR_POLICY=help</c> to get the list of available policies for the hypervisor
+</dd>
+
+<dt>SC_HYPERVISOR_TRIGGER_RESIZE</dt>
+<dd>
+\anchor SC_HYPERVISOR_TRIGGER_RESIZE
+\addindex __env__SC_HYPERVISOR_TRIGGER_RESIZE
+Choose how the hypervisor should be triggered: <c>speed</c> if the resizing algorithm should
+be called whenever the speed of the context does not correspond to an optimal precomputed value,
+<c>idle</c> if the resizing algorithm should be called whenever the workers are idle for a period
+longer than the value indicated when configuring the hypervisor.
+</dd>
+
+<dt>SC_HYPERVISOR_START_RESIZE</dt>
+<dd>
+\anchor SC_HYPERVISOR_START_RESIZE
+\addindex __env__SC_HYPERVISOR_START_RESIZE
+Indicate the moment when the resizing should become available. The value corresponds to the percentage
+of the total execution time of the application. The default value is the resizing frame.
+</dd>
+
+<dt>SC_HYPERVISOR_MAX_SPEED_GAP</dt>
+<dd>
+\anchor SC_HYPERVISOR_MAX_SPEED_GAP
+\addindex __env__SC_HYPERVISOR_MAX_SPEED_GAP
+Indicate the ratio of speed difference between contexts that should trigger the hypervisor.
+This situation may occur only when a theoretical speed could not be computed and the hypervisor
+has no value to compare the speed to. Otherwise the resizing of a context is not influenced by
+the speed of the other contexts, but only by the value that a context should have.
+</dd>
+
+<dt>SC_HYPERVISOR_STOP_PRINT</dt>
+<dd>
+\anchor SC_HYPERVISOR_STOP_PRINT
+\addindex __env__SC_HYPERVISOR_STOP_PRINT
+By default the values of the speed of the workers are printed during the execution
+of the application. If the value 1 is given to this environment variable this printing
+is not done.
+
+</dd>
+
+</dl>
 */
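
These variables are normally exported in the shell before launching the application; a minimal sketch of the equivalent from C, assuming the feft_lp policy and the idle trigger are wanted:

#include <stdlib.h>

void configure_hypervisor_env(void)
{
	/* must be set before StarPU and the hypervisor are initialized */
	setenv("SC_HYPERVISOR_POLICY", "feft_lp", 1);
	setenv("SC_HYPERVISOR_TRIGGER_RESIZE", "idle", 1);
	setenv("SC_HYPERVISOR_STOP_PRINT", "1", 1);
}

As noted above, setting SC_HYPERVISOR_POLICY=help lists the available policies.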

+ 7 - 5
doc/doxygen/chapters/performance_feedback.doxy

@@ -10,7 +10,7 @@
 
 \section UsingTheTemanejoTaskDebugger Using The Temanejo Task Debugger
 
-StarPU can connect to Temanejo (see
+StarPU can connect to Temanejo >= 1.0rc2 (see
 http://www.hlrs.de/temanejo), to permit
 nice visual task debugging. To do so, build Temanejo's <c>libayudame.so</c>,
 install <c>Ayudame.h</c> to e.g. <c>/usr/local/include</c>, apply the
@@ -585,11 +585,13 @@ Synthetic GFlops : 44.21
 \section DataTrace Data trace and tasks length
 It is possible to get statistics about tasks length and data size by using :
 \verbatim
-$starpu_fxt_data_trace filename
+$starpu_fxt_data_trace filename [codelet1 codelet2 ... codeletn]
 \endverbatim
-Where filename is the FxT trace file. This will create 2 files : <c>data_total.txt</c> which
-shows each task length and total data size and <c>data_trace.gp</c> which can be plotted to 
-get a .eps image of these results. On the image, each point represents a task.
+Where filename is the FxT trace file and codeletX the names of the codelets you 
+want to profile (if no names are specified, starpu_fxt_data_trace will use them all). 
+This will create a file, <c>data_trace.gp</c> which
+can be plotted to get a .eps image of these results. On the image, each point represents a 
+task, and each color corresponds to a codelet.
 
 \image html data_trace.png
 \image latex data_trace.eps "" width=\textwidth

+ 65 - 15
doc/doxygen/chapters/scheduling_context_hypervisor.doxy

@@ -10,8 +10,8 @@
 
 \section WhatIsTheHypervisor What Is The Hypervisor
 
-StarPU proposes a platform for constructing Scheduling Contexts, for
-deleting and modifying them dynamically. A parallel kernel, can thus
+StarPU proposes a platform to construct Scheduling Contexts, to
+delete and modify them dynamically. A parallel kernel, can thus
 be isolated into a scheduling context and interferences between
 several parallel kernels are avoided. If the user knows exactly how
 many workers each scheduling context needs, he can assign them to the
@@ -31,11 +31,11 @@ platform for implementing additional custom ones is available.
 
 \section StartTheHypervisor Start the Hypervisor
 
-The Hypervisor must be initialised once at the beging of the
+The Hypervisor must be initialized once at the beginning of the
 application. At this point a resizing policy should be indicated. This
 strategy depends on the information the application is able to provide
 to the hypervisor as well as on the accuracy needed for the resizing
-procedure. For exemple, the application may be able to provide an
+procedure. For example, the application may be able to provide an
 estimation of the workload of the contexts. In this situation the
 hypervisor may decide what resources the contexts need. However, if no
 information is provided the hypervisor evaluates the behavior of the
@@ -46,17 +46,25 @@ The hypervisor resizes only the registered contexts.
 
 The runtime provides the hypervisor with information concerning the
 behavior of the resources and the application. This is done by using
-the performance_counters, some callbacks indicating when the resources
-are idle or not efficient, when the application submits tasks or when
-it becames to slow.
+the <c>performance_counters</c> which represent callbacks indicating 
+when the resources are idle or not efficient, when the application 
+submits tasks or when it becomes too slow.
 
 \section TriggerTheHypervisor Trigger the Hypervisor
 
-The resizing is triggered either when the application requires it or
+The resizing is triggered either when the application requires it 
+(<c> sc_hypervisor_resize_ctxs </c>) or
 when the initials distribution of resources alters the performance of
-the application( the application is to slow or the resource are idle
-for too long time, threashold indicated by the user). When this
-happens different resizing strategy are applied that target minimising
+the application (the application is too slow or the resources are idle
+for too long a time). If the environment
+variable <c>SC_HYPERVISOR_TRIGGER_RESIZE</c> is set to <c>speed</c> 
+the monitored speed of the contexts is compared to a theoretical value
+computed with a linear program, and the resizing is triggered
+whenever the two values do not correspond. Otherwise, if the environment 
+variable is set to <c>idle</c> the hypervisor triggers the resizing algorithm
+whenever the workers are idle for a period longer than the threshold 
+indicated by the programmer. When this
+happens different resizing strategy are applied that target minimizing
 the total execution of the application, the instant speed or the idle
 time of the resources.
 
@@ -100,8 +108,7 @@ sc_hypervisor_ctl(sched_ctx,
 \endcode
 
 
-The <b>Idleness</b> based strategy resizes the scheduling contexts every time one of their workers stays idle
-for a period longer than the one imposed by the user
+The <b>Idleness</b> based strategy moves workers unused in a certain context to another one needing them.
 (see \ref UsersInputInTheResizingProcess "Users’ Input In The Resizing Process")
 
 \code{.c}
@@ -114,7 +121,7 @@ sc_hypervisor_ctl(sched_ctx_id,
 \endcode
 
 The <b>Gflops rate</b> based strategy resizes the scheduling contexts such that they all finish at the same time.
-The speed of each of them is considered and once one of them is significantly slower the resizing process is triggered.
+The speed of each of them is computed and once one of them is significantly slower the resizing process is triggered.
 In order to do these computations the user has to input the total number of instructions needed to be executed by the
 parallel kernels and the number of instruction to be executed by each
 task.
@@ -142,4 +149,47 @@ starpu_insert_task(&codelet,
                     0);
 \endcode
 
-*/
+The <b>Feft</b> strategy uses a linear program to predict the best distribution of resources
+such that the application finishes in a minimum amount of time. As for the <b>Gflops rate</b>
+strategy, the programmer has to indicate the total number of flops to be executed
+when registering the context. This number of flops may be updated dynamically during the execution
+of the application whenever this information is not very accurate from the beginning.
+The function <c>sc_hypervisor_update_diff_total_flop</c> is called in order to add or remove
+a difference to the flops left to be executed.
+Tasks are also provided with the number of flops corresponding to each one of them. During the
+execution of the application the hypervisor monitors the consumed flops and recomputes
+the time left and the number of resources to use. The speed of each type of resource
+is (re)evaluated and inserted in the linear program in order to better adapt to the
+needs of the application.
+
+The <b>Teft</b> strategy uses a linear program too, that considers all the types of tasks
+and the number of each of them and it tries to allocate resources such that the application
+finishes in a minimum amount of time. A previous calibration of StarPU would be useful
+in order to have good predictions of the execution time of each type of task.
+
+The types of tasks may be determined directly by the hypervisor when they are submitted.
+However there are applications that do not expose all the graph of tasks from the beginning.
+In this case, in order to let the hypervisor know about all the tasks, the function
+<c>sc_hypervisor_set_type_of_task</c> will just inform the hypervisor about future tasks
+without submitting them right away.
+
+The <b>Ispeed</b> strategy divides the execution of the application in several frames.
+For each frame the hypervisor computes the speed of the contexts and tries to make them
+run at the same speed. The strategy requires less contribution from the user as
+the hypervisor requires only the size of the frame in terms of flops.
+
+\code{.c}
+int workerids[3] = {1, 3, 10};
+int workerids2[9] = {0, 2, 4, 5, 6, 7, 8, 9, 11};
+sc_hypervisor_ctl(sched_ctx_id,
+                  SC_HYPERVISOR_ISPEED_W_SAMPLE, workerids, 3, 2000000000.0,
+                  SC_HYPERVISOR_ISPEED_W_SAMPLE, workerids2, 9, 200000000000.0,
+                  SC_HYPERVISOR_ISPEED_CTX_SAMPLE, 60000000000.0,
+            NULL);
+\endcode
+
+The <b>Throughput </b> strategy focuses on maximizing the throughput of the resources
+and resizes the contexts such that the machine is running at its maximum efficiency
+(maximum instant speed of the workers).
+
+*/
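
As a complement to the strategies above, a minimal sketch of feeding flops information to the hypervisor; sc_hypervisor_register_ctx() and its (unsigned, double) signature are assumptions here, while STARPU_FLOPS and STARPU_SCHED_CTX are starpu_insert_task fields for the per-task flops and the target context:

#include <starpu.h>
#include <sc_hypervisor.h>

extern struct starpu_codelet cl;	/* hypothetical codelet taking no data handles */

void register_and_submit(unsigned sched_ctx, double total_flops, double task_flops, int ntasks)
{
	int i;

	/* assumed call: tell the hypervisor how much work this context has in total */
	sc_hypervisor_register_ctx(sched_ctx, total_flops);

	for (i = 0; i < ntasks; i++)
		starpu_insert_task(&cl,
				   STARPU_SCHED_CTX, sched_ctx,
				   STARPU_FLOPS, task_flops,
				   0);
}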

+ 49 - 25
doc/doxygen/chapters/scheduling_contexts.doxy

@@ -1,6 +1,6 @@
 /*
  * This file is part of the StarPU Handbook.
- * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+//  * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
  * See the file version.doxy for copying conditions.
@@ -19,34 +19,44 @@ to minimize interferences between the execution of multiple parallel
 kernels, by partitioning the underlying pool of workers using
 contexts.
 
+
 \section CreatingAContext Creating A Context
 
 By default, the application submits tasks to an initial context, which
-disposes of all the computation ressources available to StarPU (all
+disposes of all the computation resources available to StarPU (all
 the workers). If the application programmer plans to launch several
-parallel kernels simultaneusly, by default these kernels will be
+parallel kernels simultaneously, by default these kernels will be
 executed within this initial context, using a single scheduler
 policy(see \ref TaskSchedulingPolicy). Meanwhile, if the application
 programmer is aware of the demands of these kernels and of the
 specificity of the machine used to execute them, the workers can be
 divided between several contexts. These scheduling contexts will
 isolate the execution of each kernel and they will permit the use of a
-scheduling policy proper to each one of them. In order to create the
-contexts, you have to know the indentifiers of the workers running
-within StarPU. By passing a set of workers together with the
-scheduling policy to the function starpu_sched_ctx_create(), you will
-get an identifier of the context created which you will use to
+scheduling policy proper to each one of them. 
+
+Scheduling Contexts may be created in two ways: either the programmer indicates
+the set of workers corresponding to each context (provided he knows the
+identifiers of the workers running within StarPU), or the programmer
+does not provide any worker list and leaves the Hypervisor to assign
+workers to each context according to their needs (\ref SchedulingContextHypervisor).
+
+Both cases require a call to the function <c>starpu_sched_ctx_create</c>, which
+takes as input the worker list (the exact list or a NULL pointer) and the scheduling
+policy. The latter can be a character string corresponding to the name of a StarPU
+predefined policy or the pointer to a custom policy. The function returns
+an identifier of the context created which you will use to
 indicate the context you want to submit the tasks to.
 
+
 \code{.c}
-/* the list of ressources the context will manage */
+/* the list of resources the context will manage */
 int workerids[3] = {1, 3, 10};
 
 /* indicate the scheduling policy to be used within the context, the list of
    workers assigned to it, the number of workers, the name of the context */
 int id_ctx = starpu_sched_ctx_create("dmda", workerids, 3, "my_ctx");
 
-/* let StarPU know that the folowing tasks will be submitted to this context */
+/* let StarPU know that the following tasks will be submitted to this context */
 starpu_sched_ctx_set_task_context(id);
 
 /* submit the task to StarPU */
@@ -77,19 +87,32 @@ starpu_sched_ctx_add_workers(workerids, 3, sched_ctx2);
 starpu_sched_ctx_remove_workers(workerids, 3, sched_ctx1);
 \endcode
 
+\section SubmittingTasksToAContext Submitting Tasks To A Context
+The application may submit tasks to several contexts either
+simultaneously or sequentially. If several threads of submission
+are used, the function <c>starpu_sched_ctx_set_context</c> may be called just
+before <c>starpu_task_submit</c>. Thus StarPU considers that
+the current thread will submit tasks to the corresponding context.
+
+When the application cannot assign a thread of submission to each
+context, the id of the context must be indicated by using the
+function <c>starpu_task_submit_to_ctx</c> or the field <c>STARPU_SCHED_CTX</c>
+for <c>starpu_insert_task</c>.
+
 \section DeletingAContext Deleting A Context
 
 When a context is no longer needed it must be deleted. The application
 can indicate which context should keep the resources of a deleted one.
-All the tasks of the context should be executed before doing this. If
-the application need to avoid a barrier before moving the resources
-from the deleted context to the inheritor one, the application can
-just indicate when the last task was submitted. Thus, when this last
-task was submitted the resources will be move, but the context should
-still be deleted at some point of the application.
+All the tasks of the context should be executed before doing this.
+Thus, the programmer may either use a barrier and then delete the context
+directly, or just indicate
+that no other tasks will be submitted later on to the context (so that when
+the last task is executed its workers will be moved to the inheritor)
+and delete the context at the end of the execution (when a barrier will
+eventually be used).
 
 \code{.c}
-/* when the context 2 will be deleted context 1 will be keep its resources */
+/* when the context 2 is deleted context 1 inherits its resources */
 starpu_sched_ctx_set_inheritor(sched_ctx2, sched_ctx1);
 
 /* submit tasks to context 2 */
@@ -98,7 +121,7 @@ for (i = 0; i < ntasks; i++)
 
 /* indicate that context 2 finished submitting and that */
 /* as soon as the last task of context 2 finished executing */
-/* its workers can be mobed to the inheritor context */
+/* its workers can be moved to the inheritor context */
 starpu_sched_ctx_finished_submit(sched_ctx1);
 
 /* wait for the tasks of both contexts to finish */
@@ -113,14 +136,15 @@ starpu_sched_ctx_delete(sched_ctx1);
 
 \section EmptyingAContext Emptying A Context
 
-A context may not have any resources at the begining or at a certain
+A context may have no resources at the begining or at a certain
 moment of the execution. Task can still be submitted to these contexts
-and they will execute them as soon as they will have resources. A list
+and they will be executed as soon as the contexts will have resources. A list
 of tasks pending to be executed is kept and when workers are added to
-the contexts the tasks are submitted. However, if no resources are
-allocated the program will not terminate. If these tasks have not much
-priority the programmer can forbid the application to submitted them
-by calling the function starpu_sched_ctx_stop_task_submission().
+the contexts these tasks start being submitted. However, if resources 
+are never allocated to the context the program will not terminate. 
+If these tasks have low
+priority the programmer can forbid the application to submit them
+by calling the function <c>starpu_sched_ctx_stop_task_submission()</c>.
 
 \section ContextsSharingWorkers Contexts Sharing Workers
 
@@ -129,7 +153,7 @@ efficiently enough alone on these workers or when the application
 decides to express a hierarchy of contexts. The workers apply an
 alogrithm of ``Round-Robin'' to chose the context on which they will
 ``pop'' next. By using the function
-starpu_sched_ctx_set_turn_to_other_ctx(), the programmer can impose
+<c>starpu_sched_ctx_set_turn_to_other_ctx</c>, the programmer can impose
 the <c>workerid</c> to ``pop'' in the context <c>sched_ctx_id</c>
 next.
 

+ 1 - 1
examples/Makefile.am

@@ -20,7 +20,7 @@ AM_CFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STAR
 AM_CXXFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ $(MAGMA_LIBS) $(HWLOC_LIBS) @LIBS@
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include
-AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(STARPU_COI_LDFLAGS)
+AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)
 
 SUBDIRS = stencil
 

+ 7 - 1
examples/stencil/stencil-tasks.c

@@ -221,7 +221,8 @@ static struct starpu_codelet null =
 	.cpu_funcs_name = {"null_func", NULL},
 	.cuda_funcs = {null_func, NULL},
 	.opencl_funcs = {null_func, NULL},
-	.nbuffers = 2
+	.nbuffers = 2,
+	.name = "start"
 };
 
 void create_start_task(int z, int dir)
@@ -267,11 +268,15 @@ void create_tasks(int rank)
 	}
 
 	for (iter = 0; iter <= niter; iter++)
+	{
 	for (bz = 0; bz < nbz; bz++)
 	{
 		if ((iter > 0) && (get_block_mpi_node(bz) == rank))
 			create_task_update(iter, bz, rank);
 
+	}
+	for (bz = 0; bz < nbz; bz++)
+	{
 		if (iter != niter)
 		{
 			if ((get_block_mpi_node(bz) == rank) || (get_block_mpi_node(bz+1) == rank))
@@ -281,6 +286,7 @@ void create_tasks(int rank)
 				create_task_save(iter, bz, -1, rank);
 		}
 	}
+	}
 }
 
 /*

+ 1 - 0
include/starpu.h

@@ -140,6 +140,7 @@ void starpu_topology_print(FILE *f);
 int starpu_asynchronous_copy_disabled(void);
 int starpu_asynchronous_cuda_copy_disabled(void);
 int starpu_asynchronous_opencl_copy_disabled(void);
+int starpu_asynchronous_mic_copy_disabled(void);
 
 void starpu_display_stats();
 

+ 1 - 1
include/starpu_deprecated_api.h

@@ -24,7 +24,7 @@ extern "C"
 #endif
 
 #if defined(STARPU_USE_DEPRECATED_API) || defined(STARPU_USE_DEPRECATED_ONE_ZERO_API)
-#warning Your application is using former types. You may want to update to use the latest API, by using tools/dev/rename.sh.
+#warning Your application is using deprecated types. You may want to update to use the latest API, by using tools/dev/rename.sh.
 #endif /* defined(STARPU_USE_DEPRECATED_API) || defined(STARPU_USE_DEPRECATED_ONE_ZERO_API) */
 
 #ifdef STARPU_USE_DEPRECATED_ONE_ZERO_API

+ 1 - 1
include/starpu_perfmodel.h

@@ -38,7 +38,7 @@ enum starpu_perfmodel_archtype
 	STARPU_CUDA_DEFAULT = STARPU_MAXCPUS,
 	STARPU_OPENCL_DEFAULT = STARPU_CUDA_DEFAULT + STARPU_MAXCUDADEVS,
 	STARPU_MIC_DEFAULT = STARPU_OPENCL_DEFAULT + STARPU_MAXOPENCLDEVS,
-	STARPU_SCC_DEFAULT = STARPU_MIC_DEFAULT + STARPU_MAXMICDEVS
+	STARPU_SCC_DEFAULT = STARPU_MIC_DEFAULT + STARPU_MAXMICDEVS //* STARPU_MAXMICCPUS
 };
 
 #ifdef __STDC_VERSION__
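
A small illustrative helper, assuming the usual convention that per-device performance model entries are addressed relative to the *_DEFAULT bases laid out in the enum above:

#include <starpu.h>

/* assumed indexing convention: the entry of CUDA device devid follows the
 * STARPU_CUDA_DEFAULT base */
enum starpu_perfmodel_archtype cuda_arch_of(int devid)
{
	return (enum starpu_perfmodel_archtype) (STARPU_CUDA_DEFAULT + devid);
}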

+ 30 - 34
include/starpu_sched_ctx.h

@@ -50,37 +50,6 @@ void starpu_sched_ctx_finished_submit(unsigned sched_ctx_id);
 
 unsigned starpu_sched_ctx_get_workers_list(unsigned sched_ctx_id, int **workerids);
 
-struct starpu_sched_ctx_performance_counters
-{
-	void (*notify_idle_cycle)(unsigned sched_ctx_id, int worker, double idle_time);
-	void (*notify_idle_end)(unsigned sched_ctx_id, int worker);
-	void (*notify_pushed_task)(unsigned sched_ctx_id, int worker);
-	void (*notify_poped_task)(unsigned sched_ctx_id, int worker, struct starpu_task *task, size_t data_size, uint32_t footprint);
-	void (*notify_post_exec_hook)(unsigned sched_ctx_id, int taskid);
-	void (*notify_submitted_job)(struct starpu_task *task, uint32_t footprint, size_t data_size);
-	void (*notify_delete_context)(unsigned sched_ctx);
-};
-
-#ifdef STARPU_USE_SC_HYPERVISOR
-void starpu_sched_ctx_set_perf_counters(unsigned sched_ctx_id, struct starpu_sched_ctx_performance_counters *perf_counters);
-void starpu_sched_ctx_call_pushed_task_cb(int workerid, unsigned sched_ctx_id);
-#endif //STARPU_USE_SC_HYPERVISOR
-
-void starpu_sched_ctx_notify_hypervisor_exists(void);
-
-unsigned starpu_sched_ctx_check_if_hypervisor_exists(void);
-
-void starpu_sched_ctx_set_policy_data(unsigned sched_ctx_id, void *policy_data);
-
-void *starpu_sched_ctx_get_policy_data(unsigned sched_ctx_id);
-
-
-struct starpu_worker_collection *starpu_sched_ctx_create_worker_collection(unsigned sched_ctx_id, enum starpu_worker_collection_type type);
-
-void starpu_sched_ctx_delete_worker_collection(unsigned sched_ctx_id);
-
-struct starpu_worker_collection *starpu_sched_ctx_get_worker_collection(unsigned sched_ctx_id);
-
 unsigned starpu_sched_ctx_get_nworkers(unsigned sched_ctx_id);
 
 unsigned starpu_sched_ctx_get_nshared_workers(unsigned sched_ctx_id, unsigned sched_ctx_id2);
@@ -95,8 +64,6 @@ unsigned starpu_sched_ctx_is_ctxs_turn(int workerid, unsigned sched_ctx_id);
 
 void starpu_sched_ctx_set_turn_to_other_ctx(int workerid, unsigned sched_ctx_id);
 
-double starpu_sched_ctx_get_max_time_worker_on_ctx(void);
-
 int starpu_sched_get_min_priority(void);
 
 int starpu_sched_get_max_priority(void);
@@ -118,7 +85,36 @@ int starpu_sched_ctx_set_max_priority(unsigned sched_ctx_id, int max_prio);
 
 #define STARPU_DEFAULT_PRIO	0
 
-/* execute any parallel code on the workers of the sched_ctx (workers are blocked) */
+struct starpu_worker_collection *starpu_sched_ctx_create_worker_collection(unsigned sched_ctx_id, enum starpu_worker_collection_type type);
+
+void starpu_sched_ctx_delete_worker_collection(unsigned sched_ctx_id);
+
+struct starpu_worker_collection *starpu_sched_ctx_get_worker_collection(unsigned sched_ctx_id);
+
+struct starpu_sched_ctx_performance_counters
+{
+	void (*notify_idle_cycle)(unsigned sched_ctx_id, int worker, double idle_time);
+	void (*notify_idle_end)(unsigned sched_ctx_id, int worker);
+	void (*notify_pushed_task)(unsigned sched_ctx_id, int worker);
+	void (*notify_poped_task)(unsigned sched_ctx_id, int worker, struct starpu_task *task, size_t data_size, uint32_t footprint);
+	void (*notify_post_exec_hook)(unsigned sched_ctx_id, int taskid);
+	void (*notify_submitted_job)(struct starpu_task *task, uint32_t footprint, size_t data_size);
+	void (*notify_delete_context)(unsigned sched_ctx);
+};
+
+#ifdef STARPU_USE_SC_HYPERVISOR
+void starpu_sched_ctx_set_perf_counters(unsigned sched_ctx_id, struct starpu_sched_ctx_performance_counters *perf_counters);
+void starpu_sched_ctx_call_pushed_task_cb(int workerid, unsigned sched_ctx_id);
+#endif //STARPU_USE_SC_HYPERVISOR
+
+void starpu_sched_ctx_notify_hypervisor_exists(void);
+
+unsigned starpu_sched_ctx_check_if_hypervisor_exists(void);
+
+void starpu_sched_ctx_set_policy_data(unsigned sched_ctx_id, void *policy_data);
+
+void *starpu_sched_ctx_get_policy_data(unsigned sched_ctx_id);
+
 void *starpu_sched_ctx_exec_parallel_code(void* (*func)(void*), void *param, unsigned sched_ctx_id);
 
 #ifdef __cplusplus
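
A minimal sketch of the starpu_sched_ctx_exec_parallel_code() prototype declared above; the body of the region is a placeholder:

#include <starpu.h>

static void *parallel_region(void *arg)
{
	int *value = (int *) arg;
	(*value)++;		/* placeholder for e.g. an external parallel kernel */
	return NULL;
}

void run_on_ctx_workers(unsigned sched_ctx_id)
{
	int value = 0;
	/* the workers of sched_ctx_id are blocked while the function runs */
	starpu_sched_ctx_exec_parallel_code(parallel_region, &value, sched_ctx_id);
}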

+ 41 - 10
mic-configure

@@ -1,6 +1,6 @@
 #!/bin/bash
 
-ROOT_DIR=$PWD
+ROOT_DIR=$(dirname $0)
 
 cat > ./mic-config.log << EOF
 This file was created by StarPU mic-configure
@@ -10,41 +10,72 @@ EOF
 
 prefix="/usr/local"
 coi_dir="/opt/intel/mic/coi"
+scif_dir="/opt/intel/mic/scif"
 mic_host="x86_64-k1om-linux"
 
 for arg in $*
 do
-	case $arg in 
+	case $arg in
 		--prefix=*)
 			prefix="${arg#--prefix=}"
 			;;
 		--with-coi-dir=*)
 			coi_dir="${arg#--with-coi-dir=}"
 			;;
+		--with-scif-dir=*)
+			scif_dir="${arg#--with-scif-dir=}"
+			;;
 		--mic-host=*)
 			mic_host="${arg#--mic-host=}"
 			;;
 	esac
-
 done
 
-for arch in mic host
+# Test gcc compiler
+x=$(type -t ${mic_host}-gcc)
+if [ -z "$x" ]
+then
+    # Test icc compiler
+    echo "int main(int argc, char **argv) { return 0; }" > /tmp/icc_$USER_$$.c
+    icc -mmic /tmp/icc_$USER_$$.c > /dev/null 2>/tmp/icc_$USER_$$.err
+    l=$(grep -c "invalid argument" /tmp/icc_$USER_$$.err)
+    if [ "$l" != "0" ]
+    then
+        echo "[error] no compiler found. please add path to either ${mic_host}-gcc or to an enabled mic icc compiler in your PATH"
+	exit 1
+    else
+	compiler="icc"
+    fi
+else
+    compiler="gcc"
+fi
+
+for arch in host mic
 do
 	# We call the configure script from a build directory further in the
 	# arborescence
 
-	command="${ROOT_DIR}/configure"
-	params="--enable-mic --with-coi-dir=$coi_dir --prefix=$prefix/$arch"
+	case $ROOT_DIR in
+		/*) command="${ROOT_DIR}/configure";;
+		*) command="../${ROOT_DIR}/configure";;
+	esac
+
+	if [ $compiler = "icc" -a "$arch" = "mic" ] ; then
+	    export CC="icc -mmic"
+	    export CXX="icc -mmic"
+	fi
+
+	params="--enable-mic --with-coi-dir=$coi_dir --with-scif-dir=$scif_dir --prefix=$prefix/$arch"
 
 	if test x$arch = xmic ; then
 		# TODO: fix hwloc detection to look for another pkg-config place, and not just believe in the host version of hwloc.pc...
-		params="$params --without-hwloc --with-coi-lib-dir=$coi_dir/device-linux-release/lib --host=$mic_host"
+		params="$params --without-hwloc --with-coi-lib-dir=$coi_dir/device-linux-release/lib --with-scif-lib-dir=$scif_dir/device-linux-release/lib --host=$mic_host"
 	else
-		params="$params --with-coi-lib-dir=$coi_dir/host-linux-release/lib"
+		params="$params --with-coi-lib-dir=$coi_dir/host-linux-release/lib --with-scif-lib-dir=$scif_dir/host-linux-release/lib"
 	fi
 
 	# If the build directory doesn't exist yet, create it
-	if [ ! -d "${ROOT_DIR}/build_${arch}" ] ; then
+	if [ ! -d "build_${arch}" ] ; then
 		mkdir "build_${arch}"
 	fi
 
@@ -59,7 +90,7 @@ do
 	then
 		exit $?
 	fi
-	cd "${ROOT_DIR}"
+	cd ..
 done
 
 cat > Makefile << EOF

+ 70 - 60
mpi/src/starpu_mpi.c

@@ -37,7 +37,8 @@ static struct _starpu_mpi_req *_starpu_mpi_isend_common(starpu_data_handle_t dat
 static struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle,
 							int source, int mpi_tag, MPI_Comm comm,
 							unsigned detached, void (*callback)(void *), void *arg,
-							int sequential_consistency);
+							int sequential_consistency, int is_internal_req,
+							ssize_t psize);
 static void _starpu_mpi_handle_detached_request(struct _starpu_mpi_req *req);
 
 /* The list of requests that have been newly submitted by the application */
@@ -76,59 +77,60 @@ struct _starpu_mpi_copy_handle
  /*                                                      */
  /********************************************************/
 
-static struct _starpu_mpi_req *_starpu_mpi_req_hashmap = NULL;
+/** stores application requests for which data have not been received yet */
+static struct _starpu_mpi_req *_starpu_mpi_app_req_hashmap = NULL;
 /** stores data which have been received by MPI but have not been requested by the application */
 static struct _starpu_mpi_copy_handle *_starpu_mpi_copy_handle_hashmap = NULL;
 
-static struct _starpu_mpi_req* find_req(int mpi_tag)
+static struct _starpu_mpi_req* find_app_req(int mpi_tag)
 {
-	struct _starpu_mpi_req* req; // = malloc(sizeof(struct _starpu_mpi_req));
+	struct _starpu_mpi_req* req;
 
-	HASH_FIND_INT(_starpu_mpi_req_hashmap, &mpi_tag, req);
+	HASH_FIND_INT(_starpu_mpi_app_req_hashmap, &mpi_tag, req);
 
 	return req;
 }
 
-static void add_req(struct _starpu_mpi_req *req)
+static void add_app_req(struct _starpu_mpi_req *req)
 {
 	struct _starpu_mpi_req *test_req;
 
-	test_req = find_req(req->mpi_tag);
+	test_req = find_app_req(req->mpi_tag);
 
 	if (test_req == NULL)
 	{
-		HASH_ADD_INT(_starpu_mpi_req_hashmap, mpi_tag, req);
-		_STARPU_MPI_DEBUG(3, "Adding request %p with tag %d in the hashmap. \n", req, req->mpi_tag);
+		HASH_ADD_INT(_starpu_mpi_app_req_hashmap, mpi_tag, req);
+		_STARPU_MPI_DEBUG(3, "Adding request %p with tag %d in the application request hashmap. \n", req, req->mpi_tag);
 	}
 	else
 	{
-		_STARPU_MPI_DEBUG(3, "Error add_req : request %p with tag %d already in the hashmap. \n", req, req->mpi_tag);
+		_STARPU_MPI_DEBUG(3, "[Error] request %p with tag %d already in the application request hashmap. \n", req, req->mpi_tag);
 		int seq_const = starpu_data_get_sequential_consistency_flag(req->data_handle);
 		if (seq_const &&  req->sequential_consistency)
 		{
-			STARPU_ASSERT_MSG(!test_req, "Error add_req : request %p with tag %d wanted to be added to the hashmap, while another request %p with the same tag is already in it. \n Sequential consistency is activated : this is not supported by StarPU.", req, req->mpi_tag, test_req);
+			STARPU_ASSERT_MSG(!test_req, "[Error] request %p with tag %d wanted to be added to the application request hashmap, while another request %p with the same tag is already in it. \n Sequential consistency is activated : this is not supported by StarPU.", req, req->mpi_tag, test_req);
 		}
 		else
 		{
-			STARPU_ASSERT_MSG(!test_req, "Error add_req : request %p with tag %d wanted to be added to the hashmap, while another request %p with the same tag is already in it. \n Sequential consistency isn't activated for this handle : you should want to add dependencies between requests for which the sequential consistency is deactivated.", req, req->mpi_tag, test_req);
+			STARPU_ASSERT_MSG(!test_req, "[Error] request %p with tag %d wanted to be added to the application request hashmap, while another request %p with the same tag is already in it. \n Sequential consistency isn't activated for this handle : you should want to add dependencies between requests for which the sequential consistency is deactivated.", req, req->mpi_tag, test_req);
 		}
 	}
 }
 
-static void delete_req(struct _starpu_mpi_req *req)
+static void delete_app_req(struct _starpu_mpi_req *req)
 {
 	struct _starpu_mpi_req *test_req;
 
-	test_req = find_req(req->mpi_tag);
+	test_req = find_app_req(req->mpi_tag);
 
 	if (test_req != NULL)
 	{
-		HASH_DEL(_starpu_mpi_req_hashmap, req);
-		_STARPU_MPI_DEBUG(3, "Deleting request %p with tag %d from the hashmap. \n", req, req->mpi_tag);
+		HASH_DEL(_starpu_mpi_app_req_hashmap, req);
+		_STARPU_MPI_DEBUG(3, "Deleting application request %p with tag %d from the application request hashmap. \n", req, req->mpi_tag);
 	}
 	else
 	{
-		_STARPU_MPI_DEBUG(3, "Warning delete_req : request %p with tag %d isn't in the hashmap. \n", req, req->mpi_tag);
+		_STARPU_MPI_DEBUG(3, "[Warning] request %p with tag %d is NOT in the application request hashmap. \n", req, req->mpi_tag);
 	}
 }
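For readers unfamiliar with the uthash macros used by these helpers, here is a minimal, self-contained sketch of the pattern behind the application request hashmap (the struct below is a simplified, hypothetical stand-in for struct _starpu_mpi_req that keeps only the key field):

	#include <stdlib.h>
	#include "uthash.h"            /* bundled as src/common/uthash.h in StarPU */

	/* simplified stand-in for struct _starpu_mpi_req */
	struct req
	{
		int mpi_tag;           /* key used by the HASH_*_INT macros */
		UT_hash_handle hh;     /* makes the structure hashable */
	};

	static struct req *app_req_hashmap = NULL;

	static struct req *find_app_req(int mpi_tag)
	{
		struct req *r;
		HASH_FIND_INT(app_req_hashmap, &mpi_tag, r);   /* key is passed by address */
		return r;
	}

	static void add_app_req(struct req *r)
	{
		if (find_app_req(r->mpi_tag) == NULL)
			HASH_ADD_INT(app_req_hashmap, mpi_tag, r); /* second argument is the field name */
	}

	static void delete_app_req(struct req *r)
	{
		if (find_app_req(r->mpi_tag) != NULL)
			HASH_DEL(app_req_hashmap, r);
	}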
 
@@ -219,7 +221,7 @@ static void _starpu_mpi_request_init(struct _starpu_mpi_req *req)
 	req->is_internal_req = 0;
 	req->envelope = NULL;
 	req->sequential_consistency = 1;
- }
+}
 
  /********************************************************/
  /*                                                      */
@@ -232,8 +234,10 @@ static void _starpu_mpi_request_init(struct _starpu_mpi_req *req)
 							       unsigned detached, void (*callback)(void *), void *arg,
 							       enum _starpu_mpi_request_type request_type, void (*func)(struct _starpu_mpi_req *),
 							       enum starpu_data_access_mode mode,
-							       int sequential_consistency)
- {
+							       int sequential_consistency,
+							       int is_internal_req,
+							       ssize_t psize)
+{
 
 	 _STARPU_MPI_LOG_IN();
 	 struct _starpu_mpi_req *req = malloc(sizeof(struct _starpu_mpi_req));
@@ -253,6 +257,8 @@ static void _starpu_mpi_request_init(struct _starpu_mpi_req *req)
 	 req->callback_arg = arg;
 	 req->func = func;
 	 req->sequential_consistency = sequential_consistency;
+	 req->is_internal_req = is_internal_req;
+	 req->count = psize;
 
 	 /* Asynchronously request StarPU to fetch the data in main memory: when
 	  * it is available in main memory, _starpu_mpi_submit_new_mpi_request(req) is called and
@@ -354,7 +360,7 @@ static struct _starpu_mpi_req *_starpu_mpi_isend_common(starpu_data_handle_t dat
 							unsigned detached, void (*callback)(void *), void *arg,
 							int sequential_consistency)
 {
-	return _starpu_mpi_isend_irecv_common(data_handle, dest, mpi_tag, comm, detached, callback, arg, SEND_REQ, _starpu_mpi_isend_size_func, STARPU_R, sequential_consistency);
+	return _starpu_mpi_isend_irecv_common(data_handle, dest, mpi_tag, comm, detached, callback, arg, SEND_REQ, _starpu_mpi_isend_size_func, STARPU_R, sequential_consistency, 0, 0);
 }
 
 int starpu_mpi_isend(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int dest, int mpi_tag, MPI_Comm comm)
@@ -429,9 +435,9 @@ static void _starpu_mpi_irecv_data_func(struct _starpu_mpi_req *req)
 	_STARPU_MPI_LOG_OUT();
 }
 
-static struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, unsigned detached, void (*callback)(void *), void *arg, int sequential_consistency)
+static struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, unsigned detached, void (*callback)(void *), void *arg, int sequential_consistency, int is_internal_req, ssize_t psize)
 {
-	return _starpu_mpi_isend_irecv_common(data_handle, source, mpi_tag, comm, detached, callback, arg, RECV_REQ, _starpu_mpi_irecv_data_func, STARPU_W, sequential_consistency);
+	return _starpu_mpi_isend_irecv_common(data_handle, source, mpi_tag, comm, detached, callback, arg, RECV_REQ, _starpu_mpi_irecv_data_func, STARPU_W, sequential_consistency, is_internal_req, psize);
 }
 
 int starpu_mpi_irecv(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int source, int mpi_tag, MPI_Comm comm)
@@ -447,7 +453,7 @@ int starpu_mpi_irecv(starpu_data_handle_t data_handle, starpu_mpi_req *public_re
 		starpu_data_set_tag(data_handle, mpi_tag);
 
 	struct _starpu_mpi_req *req;
-	req = _starpu_mpi_irecv_common(data_handle, source, mpi_tag, comm, 0, NULL, NULL, 1);
+	req = _starpu_mpi_irecv_common(data_handle, source, mpi_tag, comm, 0, NULL, NULL, 1, 0, 0);
 
 	STARPU_ASSERT_MSG(req, "Invalid return for _starpu_mpi_irecv_common");
 	*public_req = req;
@@ -467,7 +473,7 @@ int starpu_mpi_irecv_detached(starpu_data_handle_t data_handle, int source, int
 	if (tag == -1)
 		starpu_data_set_tag(data_handle, mpi_tag);
 
-	_starpu_mpi_irecv_common(data_handle, source, mpi_tag, comm, 1, callback, arg, 1);
+	_starpu_mpi_irecv_common(data_handle, source, mpi_tag, comm, 1, callback, arg, 1, 0, 0);
 	_STARPU_MPI_LOG_OUT();
 	return 0;
 }
@@ -475,7 +481,8 @@ int starpu_mpi_irecv_detached(starpu_data_handle_t data_handle, int source, int
 int starpu_mpi_irecv_detached_sequential_consistency(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg, int sequential_consistency)
 {
 	_STARPU_MPI_LOG_IN();
-	_starpu_mpi_irecv_common(data_handle, source, mpi_tag, comm, 1, callback, arg, sequential_consistency);
+
+	_starpu_mpi_irecv_common(data_handle, source, mpi_tag, comm, 1, callback, arg, sequential_consistency, 0, 0);
 
 	_STARPU_MPI_LOG_OUT();
 	return 0;
@@ -888,33 +895,12 @@ static void _starpu_mpi_submit_new_mpi_request(void *arg)
 
 	if (req->request_type == RECV_REQ)
 	{
-		/* test whether the receive request has already been submitted internally by StarPU-MPI*/
-		struct _starpu_mpi_copy_handle *chandle = find_chandle(req->mpi_tag);
-
-		/* Case : the request has already been submitted internally by StarPU.
-		 * We'll asynchronously ask a Read permission over the temporary handle, so as when
-		 * the internal receive will be over, the _starpu_mpi_copy_cb function will be called to
-		 * bring the data back to the original data handle associated to the request.*/
-		if (chandle && (req->data_handle != chandle->handle))
-		{
-			_STARPU_MPI_DEBUG(3, "The RECV request %p with tag %d has already been received, copying previously received data into handle's pointer..\n", req, req->mpi_tag);
-
-			req->internal_req = chandle->req;
-
-			struct _starpu_mpi_copy_cb_args *cb_args = malloc(sizeof(struct _starpu_mpi_copy_cb_args));
-			cb_args->data_handle = req->data_handle;
-			cb_args->copy_handle = chandle->handle;
-			cb_args->req = req;
-
-			_STARPU_MPI_DEBUG(3, "Calling data_acquire_cb on starpu_mpi_copy_cb..\n");
-			starpu_data_acquire_cb(chandle->handle,STARPU_R,_starpu_mpi_copy_cb,(void*) cb_args);
-		}
 		/* Case : the request is the internal receive request submitted by StarPU-MPI to receive
 		 * incoming data without a matching pending receive already submitted by the application.
 		 * We immediately allocate the pointer associated to the data_handle, and pushing it into
 		 * the list of new_requests, so as the real MPI request can be submitted before the next
 		 * submission of the envelope-catching request. */
-		else if (chandle && (req->data_handle == chandle->handle))
+		if (req->is_internal_req)
 		{
 			_starpu_mpi_handle_allocate_datatype(req->data_handle, &req->datatype, &req->user_datatype);
 			if (req->user_datatype == 0)
@@ -924,7 +910,7 @@ static void _starpu_mpi_submit_new_mpi_request(void *arg)
 			}
 			else
 			{
-				req->count = chandle->env->psize;
+				STARPU_ASSERT(req->count);
 				req->ptr = malloc(req->count);
 				STARPU_ASSERT_MSG(req->ptr, "cannot allocate message of size %ld\n", req->count);
 			}
@@ -940,11 +926,36 @@ static void _starpu_mpi_submit_new_mpi_request(void *arg)
 			STARPU_PTHREAD_MUTEX_UNLOCK(&req->posted_mutex);
 			STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 		}
-		/* Case : a classic receive request with no send received earlier than expected.
-		 * We just add the pending receive request to the requests' hashmap. */
 		else
 		{
-			add_req(req);
+			/* test whether the receive request has already been submitted internally by StarPU-MPI*/
+			struct _starpu_mpi_copy_handle *chandle = find_chandle(req->mpi_tag);
+
+			/* Case : the request has already been submitted internally by StarPU.
+			 * We'll asynchronously ask a Read permission over the temporary handle, so as when
+			 * the internal receive will be over, the _starpu_mpi_copy_cb function will be called to
+			 * bring the data back to the original data handle associated to the request.*/
+			if (chandle)
+			{
+				_STARPU_MPI_DEBUG(3, "The RECV request %p with tag %d has already been received, copying previously received data into handle's pointer..\n", req, req->mpi_tag);
+				STARPU_ASSERT(req->data_handle != chandle->handle);
+
+				req->internal_req = chandle->req;
+
+				struct _starpu_mpi_copy_cb_args *cb_args = malloc(sizeof(struct _starpu_mpi_copy_cb_args));
+				cb_args->data_handle = req->data_handle;
+				cb_args->copy_handle = chandle->handle;
+				cb_args->req = req;
+
+				_STARPU_MPI_DEBUG(3, "Calling data_acquire_cb on starpu_mpi_copy_cb..\n");
+				starpu_data_acquire_cb(chandle->handle,STARPU_R,_starpu_mpi_copy_cb,(void*) cb_args);
+			}
+			/* Case : a classic receive request with no send received earlier than expected.
+			 * We just add the pending receive request to the requests' hashmap. */
+			else
+			{
+				add_app_req(req);
+			}
 		}
 	}
 	else
@@ -1151,7 +1162,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 	while (running || posted_requests || !(_starpu_mpi_req_list_empty(new_requests)) || !(_starpu_mpi_req_list_empty(detached_requests)))
 	{
 		/* shall we block ? */
-		unsigned block = _starpu_mpi_req_list_empty(new_requests) && (HASH_COUNT(_starpu_mpi_req_hashmap) == 0);
+		unsigned block = _starpu_mpi_req_list_empty(new_requests) && (HASH_COUNT(_starpu_mpi_app_req_hashmap) == 0);
 
 #ifndef STARPU_MPI_ACTIVITY
 		STARPU_PTHREAD_MUTEX_LOCK(&detached_requests_mutex);
@@ -1191,7 +1202,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 		/* If there is no currently submitted header_req submitted to catch envelopes from senders, and there is some pending receive
 		 * requests in our side, we resubmit a header request. */
 		MPI_Request header_req;
-		if ((HASH_COUNT(_starpu_mpi_req_hashmap) > 0) && (header_req_submitted == 0))// && (HASH_COUNT(_starpu_mpi_copy_handle_hashmap) == 0))
+		if ((HASH_COUNT(_starpu_mpi_app_req_hashmap) > 0) && (header_req_submitted == 0))// && (HASH_COUNT(_starpu_mpi_copy_handle_hashmap) == 0))
 		{
 			_STARPU_MPI_DEBUG(3, "Posting a receive to get a data envelop\n");
 			MPI_Irecv(recv_env, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, MPI_ANY_SOURCE, _starpu_mpi_tag, MPI_COMM_WORLD, &header_req);
@@ -1215,9 +1226,9 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
 			if (flag)
 			{
-				_STARPU_MPI_DEBUG(3, "Searching for request with tag %d (size %ld)\n", recv_env->mpi_tag, recv_env->psize);
+				_STARPU_MPI_DEBUG(3, "Searching for application request with tag %d (size %ld)\n", recv_env->mpi_tag, recv_env->psize);
 
-				struct _starpu_mpi_req *found_req = find_req(recv_env->mpi_tag);
+				struct _starpu_mpi_req *found_req = find_app_req(recv_env->mpi_tag);
 
 				/* Case : a data will arrive before the matching receive has been submitted in our side of the application.
 				 * We will allow a temporary handle to store the incoming data, by submitting a starpu_mpi_irecv_detached
@@ -1245,8 +1256,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 					add_chandle(chandle);
 
 					_STARPU_MPI_DEBUG(3, "Posting internal detached irecv on copy_handle with tag %d from src %d ..\n", chandle->mpi_tag, status.MPI_SOURCE);
-					chandle->req = _starpu_mpi_irecv_common(chandle->handle, status.MPI_SOURCE, chandle->mpi_tag, MPI_COMM_WORLD, 1, NULL, NULL, 1);
-					chandle->req->is_internal_req = 1;
+					chandle->req = _starpu_mpi_irecv_common(chandle->handle, status.MPI_SOURCE, chandle->mpi_tag, MPI_COMM_WORLD, 1, NULL, NULL, 1, 1, recv_env->psize);
 
 					// We wait until the request is pushed in the
 					// new_request list, that ensures that the next loop
@@ -1266,7 +1276,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 				{
 					_STARPU_MPI_DEBUG(3, "A matching receive has been found for the incoming data with tag %d\n", recv_env->mpi_tag);
 
-					delete_req(found_req);
+					delete_app_req(found_req);
 
 					_starpu_mpi_handle_allocate_datatype(found_req->data_handle, &found_req->datatype, &found_req->user_datatype);
 					if (found_req->user_datatype == 0)
@@ -1303,7 +1313,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 	STARPU_ASSERT_MSG(_starpu_mpi_req_list_empty(detached_requests), "List of detached requests not empty");
 	STARPU_ASSERT_MSG(_starpu_mpi_req_list_empty(new_requests), "List of new requests not empty");
 	STARPU_ASSERT_MSG(posted_requests == 0, "Number of posted request is not zero");
-	STARPU_ASSERT_MSG(HASH_COUNT(_starpu_mpi_req_hashmap) == 0, "Number of receive requests left is not zero");
+	STARPU_ASSERT_MSG(HASH_COUNT(_starpu_mpi_app_req_hashmap) == 0, "Number of receive requests left is not zero");
 	STARPU_ASSERT_MSG(HASH_COUNT(_starpu_mpi_copy_handle_hashmap) == 0, "Number of copy requests left is not zero");
 	if (argc_argv->initialize_mpi)
 	{

+ 0 - 1
mpi/src/starpu_mpi_collective.c

@@ -57,7 +57,6 @@ int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, i
 		callback_arg->nb = 0;
 		callback_arg->callback = (rank == root) ? scallback : rcallback;
 		callback_arg->arg = (rank == root) ? sarg : rarg;
-		if (callback_arg->callback == NULL)
 
 		for(x = 0; x < count ; x++)
 		{

+ 1 - 1
sc_hypervisor/examples/lp_test/lp_resize_test.c

@@ -73,7 +73,7 @@ void* submit_tasks_thread(void *arg)
 	}
 
 	starpu_task_wait_for_all();
-	return;
+	return NULL;
 }
 
 int main()

+ 1 - 1
sc_hypervisor/examples/lp_test/lp_test.c

@@ -72,7 +72,7 @@ void* submit_tasks_thread(void *arg)
 	}
 
 	starpu_task_wait_for_all();
-	return;
+	return NULL;
 }
 
 int main()

+ 6 - 0
sc_hypervisor/include/sc_hypervisor.h

@@ -128,6 +128,12 @@ unsigned sc_hypervisor_can_resize(unsigned sched_ctx);
 /* indicate the types of tasks a context will execute in order to better decide the sizing of ctxs */
 void sc_hypervisor_set_type_of_task(struct starpu_codelet *cl, unsigned sched_ctx, uint32_t footprint, size_t data_size);
 
+/* dynamically change the total number of flops of a context, i.e. move the deadline by which the context is expected to finish */
+void sc_hypervisor_update_diff_total_flops(unsigned sched_ctx, double diff_total_flops);
+
+/* dynamically change the number of elapsed flops recorded for a context, i.e. correct the history used to compute its speed */
+void sc_hypervisor_update_diff_elapsed_flops(unsigned sched_ctx, double diff_task_flops);
+
 #ifdef __cplusplus
 }
 #endif
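A possible use of these two new entry points (the context identifier and flop counts below are purely illustrative):

	/* the application learns at runtime that context `ctx` has 2e9 extra flops
	 * to execute, and that the task which just completed performed 5e8 flops
	 * more than initially declared */
	unsigned ctx = my_sched_ctx;                       /* hypothetical context id */
	sc_hypervisor_update_diff_total_flops(ctx, 2e9);   /* pushes the finishing deadline back */
	sc_hypervisor_update_diff_elapsed_flops(ctx, 5e8); /* corrects the recorded progress */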

+ 4 - 0
sc_hypervisor/include/sc_hypervisor_lp.h

@@ -72,6 +72,10 @@ double sc_hypervisor_lp_simulate_distrib_tasks(int ns, int nw, int nt, double w_
 					       double times[nw][nt], unsigned is_integer, double tmax, unsigned *in_sched_ctxs,
 					       struct sc_hypervisor_policy_task_pool *tmp_task_pools);
 
+/* linear program that simulates a distribution of flops over the workers on a particular sample of the execution
+   of the application such that the entire sample finishes in a minimum amount of time */
+double sc_hypervisor_lp_simulate_distrib_flops_on_sample(int ns, int nw, double final_w_in_s[ns][nw], unsigned is_integer, double tmax, 
+							 double **speed, double flops[ns], double **final_flops_on_w);
 #endif // STARPU_HAVE_GLPK_H
 
 #ifdef __cplusplus

+ 1 - 1
sc_hypervisor/src/Makefile.am

@@ -37,7 +37,7 @@ libsc_hypervisor_la_SOURCES = 				\
 	hypervisor_policies/teft_lp_policy.c		\
 	hypervisor_policies/ispeed_policy.c		\
 	hypervisor_policies/ispeed_lp_policy.c		\
-	hypervisor_policies/debit_lp_policy.c
+	hypervisor_policies/throughput_lp_policy.c
 
 noinst_HEADERS = sc_hypervisor_intern.h		
 

+ 1 - 1
sc_hypervisor/src/hypervisor_policies/app_driven_policy.c

@@ -15,7 +15,7 @@
  */
 #include <sc_hypervisor_policy.h>
 
-static void app_driven_handle_post_exec_hook(unsigned sched_ctx, int task_tag)
+static void app_driven_handle_post_exec_hook(unsigned sched_ctx, __attribute__((unused)) int task_tag)
 {
 	sc_hypervisor_policy_resize_to_unknown_receiver(sched_ctx, 1);
 }

+ 4 - 7
sc_hypervisor/src/hypervisor_policies/feft_lp_policy.c

@@ -46,13 +46,14 @@ static void _try_resizing(unsigned *sched_ctxs, int nsched_ctxs, int *workers, i
 	long diff_s = end_time.tv_sec  - start_time.tv_sec;
 	long diff_us = end_time.tv_usec  - start_time.tv_usec;
 	
-	float timing = (float)(diff_s*1000000 + diff_us)/1000;
+	__attribute__((unused))	float timing = (float)(diff_s*1000000 + diff_us)/1000;
 	
 	if(vmax != 0.0)
 	{
 		int nworkers_per_ctx_rounded[nsched_ctxs][nw];
 		sc_hypervisor_lp_round_double_to_int(ns, nw, nworkers_per_ctx, nworkers_per_ctx_rounded);
-		sc_hypervisor_lp_redistribute_resources_in_ctxs(ns, nw, nworkers_per_ctx_rounded, nworkers_per_ctx, curr_sched_ctxs, tw);
+//		sc_hypervisor_lp_redistribute_resources_in_ctxs(ns, nw, nworkers_per_ctx_rounded, nworkers_per_ctx, curr_sched_ctxs, tw);
+		sc_hypervisor_lp_distribute_resources_in_ctxs(curr_sched_ctxs, ns, nw, nworkers_per_ctx_rounded, nworkers_per_ctx, workers, curr_nworkers, tw);
 	}
 }
 
@@ -142,12 +143,8 @@ static void feft_lp_handle_idle_cycle(unsigned sched_ctx, int worker)
 		unsigned criteria = sc_hypervisor_get_resize_criteria();
 		if(criteria != SC_NOTHING && criteria == SC_IDLE)
 		{
-			
 			if(sc_hypervisor_check_idle(sched_ctx, worker))
-			{
 				_try_resizing(NULL, -1, NULL, -1);
-//				sc_hypervisor_move_workers(sched_ctx, 3 - sched_ctx, &worker, 1, 1);
-			}
 		}
 		starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
 	}
@@ -182,7 +179,7 @@ struct sc_hypervisor_policy feft_lp_policy = {
 	.resize_ctxs = feft_lp_resize_ctxs,
 	.handle_poped_task = feft_lp_handle_poped_task,
 	.handle_pushed_task = NULL,
-	.handle_idle_cycle = feft_lp_handle_idle_cycle, //NULL,
+	.handle_idle_cycle = feft_lp_handle_idle_cycle,
 	.handle_idle_end = NULL,
 	.handle_post_exec_hook = NULL,
 	.handle_submitted_job = NULL,

+ 2 - 1
sc_hypervisor/src/hypervisor_policies/gflops_rate_policy.c

@@ -289,7 +289,8 @@ static void gflops_rate_resize(unsigned sched_ctx)
 	}
 }
 
-static void gflops_rate_handle_poped_task(unsigned sched_ctx, int worker)
+static void gflops_rate_handle_poped_task(unsigned sched_ctx, __attribute__((unused)) int worker, 
+					  __attribute__((unused))struct starpu_task *task, __attribute__((unused))uint32_t footprint)
 {
 	gflops_rate_resize(sched_ctx);
 }

+ 5 - 219
sc_hypervisor/src/hypervisor_policies/ispeed_lp_policy.c

@@ -28,13 +28,9 @@ struct ispeed_lp_data
 	int *workers;
 };
 
-/*
- * GNU Linear Programming Kit backend
- */
 #ifdef STARPU_HAVE_GLPK_H
-#include <glpk.h>
-static double _glp_resolve (int ns, int nw, double final_w_in_s[ns][nw],
-			    unsigned is_integer, double tmax, void *specific_data)
+static double _compute_workers_distrib(int ns, int nw, double final_w_in_s[ns][nw],
+					unsigned is_integer, double tmax, void *specific_data)
 {
 	struct ispeed_lp_data *sd = (struct ispeed_lp_data *)specific_data;
 
@@ -43,220 +39,11 @@ static double _glp_resolve (int ns, int nw, double final_w_in_s[ns][nw],
 	
 	double **final_flops_on_w = sd->flops_on_w;
 	
-	double w_in_s[ns][nw];
-	double flops_on_w[ns][nw];
-
-	int w, s;
-	glp_prob *lp;
-
-//	printf("try with tmax %lf\n", tmax);
-	lp = glp_create_prob();
-	glp_set_prob_name(lp, "StarPU theoretical bound");
-	glp_set_obj_dir(lp, GLP_MAX);
-	glp_set_obj_name(lp, "total execution time");
-
-	{
-		int ne = 5 * ns * nw /* worker execution time */
-			+ 1; /* glp dumbness */
-		int n = 1;
-		int ia[ne], ja[ne];
-		double ar[ne];
-
-
-		/* Variables: number of flops assigned to worker w in context s, and 
-		 the acknwoledgment that the worker w belongs to the context s */
-		glp_add_cols(lp, 2*nw*ns);
-#define colnum(w, s) ((s)*nw+(w)+1)
-		for(s = 0; s < ns; s++)
-			for(w = 0; w < nw; w++)
-				glp_set_obj_coef(lp, nw*ns+colnum(w,s), 1.);
-		
-		for(s = 0; s < ns; s++)
-			for(w = 0; w < nw; w++)
-			{
-				char name[32];
-				snprintf(name, sizeof(name), "flopsw%ds%dn", w, s);
-				glp_set_col_name(lp, colnum(w,s), name);
-				glp_set_col_bnds(lp, colnum(w,s), GLP_LO, 0., 0.);
-
-				snprintf(name, sizeof(name), "w%ds%dn", w, s);
-				glp_set_col_name(lp, nw*ns+colnum(w,s), name);
-				if (is_integer)
-				{
-                                        glp_set_col_kind(lp, nw*ns+colnum(w, s), GLP_IV);
-					glp_set_col_bnds(lp, nw*ns+colnum(w,s), GLP_DB, 0, 1);
-				}
-				else
-					glp_set_col_bnds(lp, nw*ns+colnum(w,s), GLP_DB, 0.0, 1.0);
-
-			}
-
-
-		int curr_row_idx = 0;
-		/* Total worker execution time */
-		glp_add_rows(lp, nw*ns);
-
-		/*nflops[s][w]/v[s][w] < x[s][w]*tmax */
-		for(s = 0; s < ns; s++)
-		{
-			for (w = 0; w < nw; w++)
-			{
-				char name[32], title[64];
-				starpu_worker_get_name(w, name, sizeof(name));
-				snprintf(title, sizeof(title), "worker %s", name);
-				glp_set_row_name(lp, curr_row_idx+s*nw+w+1, title);
-
-				/* nflosp[s][w] */
-				ia[n] = curr_row_idx+s*nw+w+1;
-				ja[n] = colnum(w, s);
-				ar[n] = 1 / speed[s][w];
-
-				n++;
-				
-				/* x[s][w] = 1 | 0 */
-				ia[n] = curr_row_idx+s*nw+w+1;
-				ja[n] = nw*ns+colnum(w,s);
-				ar[n] = (-1) * tmax;
-				n++;
-				glp_set_row_bnds(lp, curr_row_idx+s*nw+w+1, GLP_UP, 0.0, 0.0);
-			}
-		}
-
-		curr_row_idx += nw*ns;
-
-		/* sum(flops[s][w]) = flops[s] */
-		glp_add_rows(lp, ns);
-		for (s = 0; s < ns; s++)
-		{
-			char name[32], title[64];
-			starpu_worker_get_name(w, name, sizeof(name));
-			snprintf(title, sizeof(title), "flops %lf ctx%d", flops[s], s);
-			glp_set_row_name(lp, curr_row_idx+s+1, title);
-			for (w = 0; w < nw; w++)
-			{
-				ia[n] = curr_row_idx+s+1;
-				ja[n] = colnum(w, s);
-				ar[n] = 1;
-				n++;
-			}
-			glp_set_row_bnds(lp, curr_row_idx+s+1, GLP_FX, flops[s], flops[s]);
-		}
-
-		curr_row_idx += ns;
-
-		/* sum(x[s][w]) = 1 */
-		glp_add_rows(lp, nw);
-		for (w = 0; w < nw; w++)
-		{
-			char name[32], title[64];
-			starpu_worker_get_name(w, name, sizeof(name));
-			snprintf(title, sizeof(title), "w%x", w);
-			glp_set_row_name(lp, curr_row_idx+w+1, title);
-			for(s = 0; s < ns; s++)
-			{
-				ia[n] = curr_row_idx+w+1;
-				ja[n] = nw*ns+colnum(w,s);
-				ar[n] = 1;
-				n++;
-			}
-			if(is_integer)				
-				glp_set_row_bnds(lp, curr_row_idx+w+1, GLP_FX, 1, 1);
-			else
-				glp_set_row_bnds(lp, curr_row_idx+w+1, GLP_FX, 1.0, 1.0);
-		}
-
-		curr_row_idx += nw;
-
-		/* sum(nflops[s][w]) > 0*/
-		glp_add_rows(lp, nw);
-		for (w = 0; w < nw; w++)
-		{
-			char name[32], title[64];
-			starpu_worker_get_name(w, name, sizeof(name));
-			snprintf(title, sizeof(title), "flopsw%x", w);
-			glp_set_row_name(lp, curr_row_idx+w+1, title);
-			for(s = 0; s < ns; s++)
-			{
-				ia[n] = curr_row_idx+w+1;
-				ja[n] = colnum(w,s);
-				ar[n] = 1;
-				n++;
-			}
-
-			glp_set_row_bnds(lp, curr_row_idx+w+1, GLP_LO, 0.1, 0.);
-		}
-
-		if(n != ne)
-			printf("ns= %d nw = %d n = %d ne = %d\n", ns, nw, n, ne);
-		STARPU_ASSERT(n == ne);
-
-		glp_load_matrix(lp, ne-1, ia, ja, ar);
-	}
-
-	glp_smcp parm;
-	glp_init_smcp(&parm);
-	parm.msg_lev = GLP_MSG_OFF;
-	int ret = glp_simplex(lp, &parm);
-	if (ret)
-	{
-		glp_delete_prob(lp);
-		lp = NULL;
-		return 0.0;
-	}
-
-        if (is_integer)
-        {
-                glp_iocp iocp;
-                glp_init_iocp(&iocp);
-                iocp.msg_lev = GLP_MSG_OFF;
-                glp_intopt(lp, &iocp);
-		int stat = glp_mip_status(lp);
-		/* if we don't have a solution return */
-		if(stat == GLP_NOFEAS)
-		{
-			glp_delete_prob(lp);
-			lp = NULL;
-			return 0.0;
-		}
-        }
-
-	int stat = glp_get_prim_stat(lp);
-	/* if we don't have a solution return */
-	if(stat == GLP_NOFEAS)
-	{
-		glp_delete_prob(lp);
-		lp = NULL;
-		return 0.0;
-	}
-
-	double res = glp_get_obj_val(lp);
-
-	for(s = 0; s < ns; s++)
-		for(w = 0; w < nw; w++)
-		{
-			flops_on_w[s][w] = glp_get_col_prim(lp, colnum(w, s));
-			if (is_integer)
-				w_in_s[s][w] = (double)glp_mip_col_val(lp, nw*ns+colnum(w, s));
-			else
-				w_in_s[s][w] = glp_get_col_prim(lp, nw*ns+colnum(w,s));
-//			printf("w_in_s[s%d][w%d] = %lf flops[s%d][w%d] = %lf \n", s, w, w_in_s[s][w], s, w, flops_on_w[s][w]);
-		}
-
-	glp_delete_prob(lp);
-	for(s = 0; s < ns; s++)
-		for(w = 0; w < nw; w++)
-		{
-			final_w_in_s[s][w] = w_in_s[s][w];
-			final_flops_on_w[s][w] = flops_on_w[s][w];
-		}
-
-	return res;
+	return sc_hypervisor_lp_simulate_distrib_flops_on_sample(ns, nw, final_w_in_s, is_integer, tmax, speed, flops, final_flops_on_w);
 }
 
 static unsigned _compute_flops_distribution_over_ctxs(int ns, int nw, double w_in_s[ns][nw], double **flops_on_w, unsigned *sched_ctxs, int *workers)
 {
-//	double flops[ns];
-//	double speed[ns][nw];
 	double *flops = (double*)malloc(ns*sizeof(double));
 	double **speed = (double **)malloc(ns*sizeof(double*));
 	int i;
@@ -312,7 +99,7 @@ static unsigned _compute_flops_distribution_over_ctxs(int ns, int nw, double w_i
         specific_data.workers = workers;
 
         unsigned found_sol = sc_hypervisor_lp_execute_dichotomy(ns, nw, w_in_s, 1, (void*)&specific_data, 
-								tmin, tmax, smallest_tmax, _glp_resolve);
+								tmin, tmax, smallest_tmax, _compute_workers_distrib);
 
 	for(i = 0; i < ns; i++)
 		free(speed[i]);
@@ -416,7 +203,6 @@ static void ispeed_lp_handle_idle_cycle(unsigned sched_ctx, int worker)
 			if(sc_hypervisor_check_idle(sched_ctx, worker))
                         {
                                 _try_resizing(NULL, -1, NULL, -1);
-//                              sc_hypervisor_move_workers(sched_ctx, 3 - sched_ctx, &worker, 1, 1);                                                                                                                
                         }
                 }
                 starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
@@ -433,7 +219,7 @@ static void ispeed_lp_resize_ctxs(unsigned *sched_ctxs, int nsched_ctxs , int *w
 	}
 }
 
-static void ispeed_lp_end_ctx(unsigned sched_ctx)
+static void ispeed_lp_end_ctx(__attribute__((unused))unsigned sched_ctx)
 {
 /* 	struct sc_hypervisor_wrapper* sc_w = sc_hypervisor_get_wrapper(sched_ctx); */
 /* 	int worker; */

+ 3 - 7
sc_hypervisor/src/hypervisor_policies/teft_lp_policy.c

@@ -85,7 +85,6 @@ static void _size_ctxs(unsigned *sched_ctxs, int nsched_ctxs , int *workers, int
 		nt++;
 
 	double w_in_s[ns][nw];
-//	double tasks[nw][nt];
 	double **tasks=(double**)malloc(nw*sizeof(double*));
 	int i;
 	for(i = 0; i < nw; i++)
@@ -175,7 +174,7 @@ static void _try_resizing(unsigned *sched_ctxs, int nsched_ctxs , int *workers,
 
 	int nt = 0; /* Number of different kinds of tasks */
 	
-//			starpu_pthread_mutex_lock(&mutex);
+//	starpu_pthread_mutex_lock(&mutex);
 	
 	/* we don't take the mutex bc a correct value of the number of tasks is
 	   not required but we do a copy in order to be sure
@@ -190,7 +189,6 @@ static void _try_resizing(unsigned *sched_ctxs, int nsched_ctxs , int *workers,
 	
 	
 	double w_in_s[ns][nw];
-//			double tasks_per_worker[nw][nt];
 	double **tasks_per_worker=(double**)malloc(nw*sizeof(double*));
 	int i;
 	for(i = 0; i < nw; i++)
@@ -208,13 +206,13 @@ static void _try_resizing(unsigned *sched_ctxs, int nsched_ctxs , int *workers,
 	   compute the nr of flops and not the tasks */
         /*lp computes it in s but it's converted to ms just before return */
 	double possible_tmax = sc_hypervisor_lp_get_tmax(nw, NULL);
-	double smallest_tmax = 0.0;//possible_tmax / 3;
+	double smallest_tmax = 0.0;
 	double tmax = possible_tmax * ns;
 	double tmin = smallest_tmax;
 
 	unsigned found_sol = sc_hypervisor_lp_execute_dichotomy(ns, nw, w_in_s, 1, (void*)&specific_data, 
 								tmin, tmax, smallest_tmax, _compute_workers_distrib);
-//			starpu_pthread_mutex_unlock(&mutex);
+//	starpu_pthread_mutex_unlock(&mutex);
 	
 	/* if we did find at least one solution redistribute the resources */
 	if(found_sol)
@@ -282,7 +280,6 @@ static void teft_lp_handle_idle_cycle(unsigned sched_ctx, int worker)
 			return;
 		}
 
-
 		unsigned criteria = sc_hypervisor_get_resize_criteria();
 		if(criteria != SC_NOTHING && criteria == SC_IDLE)
 		{
@@ -290,7 +287,6 @@ static void teft_lp_handle_idle_cycle(unsigned sched_ctx, int worker)
 			if(sc_hypervisor_check_idle(sched_ctx, worker))
 			{
 				_try_resizing(NULL, -1, NULL, -1);
-//				sc_hypervisor_move_workers(sched_ctx, 3 - sched_ctx, &worker, 1, 1);
 			}
 		}
 		starpu_pthread_mutex_unlock(&act_hypervisor_mutex);

+ 11 - 11
sc_hypervisor/src/hypervisor_policies/debit_lp_policy.c

@@ -56,7 +56,7 @@ static unsigned _compute_max_speed(int ns, int nw, double w_in_s[ns][nw], unsign
 	long diff_s = end_time.tv_sec  - start_time.tv_sec;
 	long diff_us = end_time.tv_usec  - start_time.tv_usec;
 
-	float timing = (float)(diff_s*1000000 + diff_us)/1000;
+	__attribute__((unused)) float timing = (float)(diff_s*1000000 + diff_us)/1000;
 
 	if(res > 0.0)
 		return 1;
@@ -282,7 +282,7 @@ static void _try_resizing(unsigned *sched_ctxs, int nsched_ctxs , int *workers,
 	}
 }
 
-static void debit_lp_handle_poped_task(__attribute__((unused))unsigned sched_ctx, __attribute__((unused))int worker, 
+static void throughput_lp_handle_poped_task(__attribute__((unused))unsigned sched_ctx, __attribute__((unused))int worker, 
 				       __attribute__((unused))struct starpu_task *task, __attribute__((unused))uint32_t footprint)
 {
 	int ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
@@ -300,7 +300,7 @@ static void debit_lp_handle_poped_task(__attribute__((unused))unsigned sched_ctx
 	}
 }
 
-static void debit_lp_handle_idle_cycle(unsigned sched_ctx, int worker)
+static void throughput_lp_handle_idle_cycle(unsigned sched_ctx, int worker)
 {
 	int ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
         if(ret != EBUSY)
@@ -319,7 +319,7 @@ static void debit_lp_handle_idle_cycle(unsigned sched_ctx, int worker)
         }
 }
 
-static void debit_lp_resize_ctxs(unsigned *sched_ctxs, int nsched_ctxs , int *workers, int nworkers)
+static void throughput_lp_resize_ctxs(unsigned *sched_ctxs, int nsched_ctxs , int *workers, int nworkers)
 {
 	int ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
 	if(ret != EBUSY)
@@ -329,7 +329,7 @@ static void debit_lp_resize_ctxs(unsigned *sched_ctxs, int nsched_ctxs , int *wo
 	}
 }
 
-static void debit_lp_end_ctx(__attribute__((unused))unsigned sched_ctx)
+static void throughput_lp_end_ctx(__attribute__((unused))unsigned sched_ctx)
 {
 /* 	struct sc_hypervisor_wrapper* sc_w = sc_hypervisor_get_wrapper(sched_ctx); */
 /* 	int worker; */
@@ -339,18 +339,18 @@ static void debit_lp_end_ctx(__attribute__((unused))unsigned sched_ctx)
 	return;
 }
 
-struct sc_hypervisor_policy debit_lp_policy = {
+struct sc_hypervisor_policy throughput_lp_policy = {
 	.size_ctxs = NULL,
-	.resize_ctxs = debit_lp_resize_ctxs,
-	.handle_poped_task = debit_lp_handle_poped_task,
+	.resize_ctxs = throughput_lp_resize_ctxs,
+	.handle_poped_task = throughput_lp_handle_poped_task,
 	.handle_pushed_task = NULL,
-	.handle_idle_cycle = debit_lp_handle_idle_cycle,
+	.handle_idle_cycle = throughput_lp_handle_idle_cycle,
 	.handle_idle_end = NULL,
 	.handle_post_exec_hook = NULL,
 	.handle_submitted_job = NULL,
-	.end_ctx = debit_lp_end_ctx,
+	.end_ctx = throughput_lp_end_ctx,
 	.custom = 0,
-	.name = "debit_lp"
+	.name = "throughput_lp"
 };
 
 #endif /* STARPU_HAVE_GLPK_H */
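Note that since the policy's .name field changes along with the symbols, applications that select this policy by name (for instance through the SC_HYPERVISOR_POLICY environment variable) must now request "throughput_lp" instead of "debit_lp".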

+ 3 - 2
sc_hypervisor/src/policies_utils/dichotomy.c

@@ -19,6 +19,8 @@
 #include <math.h>
 #include <sys/time.h>
 
+/* calls lp_estimated_distrib_func repeatedly over the interval [tmin, tmax] (bisection) until it finds the lowest
+   value of tmax for which the linear program still has a solution */
 unsigned sc_hypervisor_lp_execute_dichotomy(int ns, int nw, double w_in_s[ns][nw], unsigned solve_lp_integer, void *specific_data,
 					    double tmin, double tmax, double smallest_tmax,
 					    double (*lp_estimated_distrib_func)(int ns, int nw, double draft_w_in_s[ns][nw], 
@@ -80,9 +82,8 @@ unsigned sc_hypervisor_lp_execute_dichotomy(int ns, int nw, double w_in_s[ns][nw
 	long diff_s = end_time.tv_sec  - start_time.tv_sec;
 	long diff_us = end_time.tv_usec  - start_time.tv_usec;
 
-	float timing = (float)(diff_s*1000000 + diff_us)/1000;
+	__attribute__((unused)) float timing = (float)(diff_s*1000000 + diff_us)/1000;
 
-//        fprintf(stdout, "nd = %d total time: %f ms \n", nd, timing);
 	return found_sol;
 }
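The idea boils down to the following bisection sketch (illustrative only, not the actual implementation):

	/* find the smallest tmax in [tmin, tmax] for which the linear program
	 * still has a solution, assuming the initial tmax is feasible */
	static double smallest_feasible_tmax(double tmin, double tmax, double eps,
					     unsigned (*has_solution)(double t))
	{
		double best = tmax;
		while (tmax - tmin > eps)
		{
			double t = (tmin + tmax) / 2.0;
			if (has_solution(t))
			{
				best = t;   /* feasible: try to finish even earlier */
				tmax = t;
			}
			else
				tmin = t;   /* infeasible: the sample needs more time */
		}
		return best;
	}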
 

+ 216 - 1
sc_hypervisor/src/policies_utils/lp_programs.c

@@ -292,7 +292,8 @@ double sc_hypervisor_lp_simulate_distrib_flops(int ns, int nw, double v[ns][nw],
 
 	/*1/tmax should belong to the interval [0.0;1.0]*/
 	glp_set_col_name(lp, n, "vmax");
-	glp_set_col_bnds(lp, n, GLP_DB, 0.0, 1.0);
+//	glp_set_col_bnds(lp, n, GLP_DB, 0.0, 1.0);
+	glp_set_col_bnds(lp, n, GLP_LO, 0.0, 0.0);
 	/* Z = 1/tmax -> 1/tmax structural variable, nCPUs & nGPUs in ctx are auxiliar variables */
 	glp_set_obj_coef(lp, n, 1.0);
 
@@ -429,6 +430,7 @@ double sc_hypervisor_lp_simulate_distrib_flops(int ns, int nw, double v[ns][nw],
 
 	double vmax = glp_get_obj_val(lp);
 
+//	printf("vmax = %lf \n", vmax);
 	n = 1;
 	for(s = 0; s < ns; s++)
 	{
@@ -447,4 +449,217 @@ double sc_hypervisor_lp_simulate_distrib_flops(int ns, int nw, double v[ns][nw],
 	return vmax;
 }
 
+double sc_hypervisor_lp_simulate_distrib_flops_on_sample(int ns, int nw, double final_w_in_s[ns][nw], unsigned is_integer, double tmax, 
+							 double **speed, double flops[ns], double **final_flops_on_w)
+{
+	double w_in_s[ns][nw];
+	double flops_on_w[ns][nw];
+
+	int w, s;
+	glp_prob *lp;
+
+//	printf("try with tmax %lf\n", tmax);
+	lp = glp_create_prob();
+	glp_set_prob_name(lp, "StarPU theoretical bound");
+	glp_set_obj_dir(lp, GLP_MAX);
+	glp_set_obj_name(lp, "total execution time");
+
+	{
+		int ne = 5 * ns * nw /* worker execution time */
+			+ 1; /* glp dumbness */
+		int n = 1;
+		int ia[ne], ja[ne];
+		double ar[ne];
+
+
+		/* Variables: number of flops assigned to worker w in context s, and 
+		 the acknowledgment that the worker w belongs to the context s */
+		glp_add_cols(lp, 2*nw*ns);
+#define colnum_sample(w, s) ((s)*nw+(w)+1)
+		for(s = 0; s < ns; s++)
+			for(w = 0; w < nw; w++)
+				glp_set_obj_coef(lp, nw*ns+colnum_sample(w,s), 1.);
+		
+		for(s = 0; s < ns; s++)
+			for(w = 0; w < nw; w++)
+			{
+				char name[32];
+				snprintf(name, sizeof(name), "flopsw%ds%dn", w, s);
+				glp_set_col_name(lp, colnum_sample(w,s), name);
+				glp_set_col_bnds(lp, colnum_sample(w,s), GLP_LO, 0., 0.);
+
+				snprintf(name, sizeof(name), "w%ds%dn", w, s);
+				glp_set_col_name(lp, nw*ns+colnum_sample(w,s), name);
+				if (is_integer)
+				{
+                                        glp_set_col_kind(lp, nw*ns+colnum_sample(w, s), GLP_IV);
+					glp_set_col_bnds(lp, nw*ns+colnum_sample(w,s), GLP_DB, 0, 1);
+				}
+				else
+					glp_set_col_bnds(lp, nw*ns+colnum_sample(w,s), GLP_DB, 0.0, 1.0);
+
+			}
+
+
+		int curr_row_idx = 0;
+		/* Total worker execution time */
+		glp_add_rows(lp, nw*ns);
+
+		/*nflops[s][w]/v[s][w] < x[s][w]*tmax */
+		for(s = 0; s < ns; s++)
+		{
+			for (w = 0; w < nw; w++)
+			{
+				char name[32], title[64];
+				starpu_worker_get_name(w, name, sizeof(name));
+				snprintf(title, sizeof(title), "worker %s", name);
+				glp_set_row_name(lp, curr_row_idx+s*nw+w+1, title);
+
+				/* nflops[s][w] */
+				ia[n] = curr_row_idx+s*nw+w+1;
+				ja[n] = colnum_sample(w, s);
+				ar[n] = 1 / speed[s][w];
+
+				n++;
+				
+				/* x[s][w] = 1 | 0 */
+				ia[n] = curr_row_idx+s*nw+w+1;
+				ja[n] = nw*ns+colnum_sample(w,s);
+				ar[n] = (-1) * tmax;
+				n++;
+				glp_set_row_bnds(lp, curr_row_idx+s*nw+w+1, GLP_UP, 0.0, 0.0);
+			}
+		}
+
+		curr_row_idx += nw*ns;
+
+		/* sum(flops[s][w]) = flops[s] */
+		glp_add_rows(lp, ns);
+		for (s = 0; s < ns; s++)
+		{
+			char name[32], title[64];
+			starpu_worker_get_name(w, name, sizeof(name));
+			snprintf(title, sizeof(title), "flops %lf ctx%d", flops[s], s);
+			glp_set_row_name(lp, curr_row_idx+s+1, title);
+			for (w = 0; w < nw; w++)
+			{
+				ia[n] = curr_row_idx+s+1;
+				ja[n] = colnum_sample(w, s);
+				ar[n] = 1;
+				n++;
+			}
+			glp_set_row_bnds(lp, curr_row_idx+s+1, GLP_FX, flops[s], flops[s]);
+		}
+
+		curr_row_idx += ns;
+
+		/* sum(x[s][w]) = 1 */
+		glp_add_rows(lp, nw);
+		for (w = 0; w < nw; w++)
+		{
+			char name[32], title[64];
+			starpu_worker_get_name(w, name, sizeof(name));
+			snprintf(title, sizeof(title), "w%x", w);
+			glp_set_row_name(lp, curr_row_idx+w+1, title);
+			for(s = 0; s < ns; s++)
+			{
+				ia[n] = curr_row_idx+w+1;
+				ja[n] = nw*ns+colnum_sample(w,s);
+				ar[n] = 1;
+				n++;
+			}
+			if(is_integer)				
+				glp_set_row_bnds(lp, curr_row_idx+w+1, GLP_FX, 1, 1);
+			else
+				glp_set_row_bnds(lp, curr_row_idx+w+1, GLP_FX, 1.0, 1.0);
+		}
+
+		curr_row_idx += nw;
+
+		/* sum(nflops[s][w]) > 0*/
+		glp_add_rows(lp, nw);
+		for (w = 0; w < nw; w++)
+		{
+			char name[32], title[64];
+			starpu_worker_get_name(w, name, sizeof(name));
+			snprintf(title, sizeof(title), "flopsw%x", w);
+			glp_set_row_name(lp, curr_row_idx+w+1, title);
+			for(s = 0; s < ns; s++)
+			{
+				ia[n] = curr_row_idx+w+1;
+				ja[n] = colnum_sample(w,s);
+				ar[n] = 1;
+				n++;
+			}
+
+			glp_set_row_bnds(lp, curr_row_idx+w+1, GLP_LO, 0.1, 0.);
+		}
+
+		if(n != ne)
+			printf("ns= %d nw = %d n = %d ne = %d\n", ns, nw, n, ne);
+		STARPU_ASSERT(n == ne);
+
+		glp_load_matrix(lp, ne-1, ia, ja, ar);
+	}
+
+	glp_smcp parm;
+	glp_init_smcp(&parm);
+	parm.msg_lev = GLP_MSG_OFF;
+	int ret = glp_simplex(lp, &parm);
+	if (ret)
+	{
+		glp_delete_prob(lp);
+		lp = NULL;
+		return 0.0;
+	}
+
+        if (is_integer)
+        {
+                glp_iocp iocp;
+                glp_init_iocp(&iocp);
+                iocp.msg_lev = GLP_MSG_OFF;
+                glp_intopt(lp, &iocp);
+		int stat = glp_mip_status(lp);
+		/* if we don't have a solution return */
+		if(stat == GLP_NOFEAS)
+		{
+			glp_delete_prob(lp);
+			lp = NULL;
+			return 0.0;
+		}
+        }
+
+	int stat = glp_get_prim_stat(lp);
+	/* if we don't have a solution return */
+	if(stat == GLP_NOFEAS)
+	{
+		glp_delete_prob(lp);
+		lp = NULL;
+		return 0.0;
+	}
+
+	double res = glp_get_obj_val(lp);
+
+	for(s = 0; s < ns; s++)
+		for(w = 0; w < nw; w++)
+		{
+			flops_on_w[s][w] = glp_get_col_prim(lp, colnum_sample(w, s));
+			if (is_integer)
+				w_in_s[s][w] = (double)glp_mip_col_val(lp, nw*ns+colnum_sample(w, s));
+			else
+				w_in_s[s][w] = glp_get_col_prim(lp, nw*ns+colnum_sample(w,s));
+//			printf("w_in_s[s%d][w%d] = %lf flops[s%d][w%d] = %lf \n", s, w, w_in_s[s][w], s, w, flops_on_w[s][w]);
+		}
+
+	glp_delete_prob(lp);
+	for(s = 0; s < ns; s++)
+		for(w = 0; w < nw; w++)
+		{
+			final_w_in_s[s][w] = w_in_s[s][w];
+			final_flops_on_w[s][w] = flops_on_w[s][w];
+		}
+
+	return res;
+
+}
 #endif // STARPU_HAVE_GLPK_H
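A hedged sketch of how a policy may call this new helper; the numbers of contexts and workers, the speeds and the flop counts below are made up, whereas the real caller (ispeed_lp_policy.c above) fills them from the monitored contexts:

	#include <stdlib.h>

	int ns = 2, nw = 3, s, w;
	double w_in_s[ns][nw];                       /* output: worker-to-context assignment */
	double flops[ns];                            /* gflops still to execute per context */
	double **speed = malloc(ns * sizeof(double *));
	double **flops_on_w = malloc(ns * sizeof(double *));
	for (s = 0; s < ns; s++)
	{
		speed[s] = malloc(nw * sizeof(double));
		flops_on_w[s] = malloc(nw * sizeof(double));
		flops[s] = 100.0;
		for (w = 0; w < nw; w++)
			speed[s][w] = 5.0;           /* gflops/s of worker w inside context s */
	}
	/* ask for an integer worker assignment with a 20 s deadline for the sample */
	double res = sc_hypervisor_lp_simulate_distrib_flops_on_sample(ns, nw, w_in_s,
								       1, 20.0, speed,
								       flops, flops_on_w);
	/* res == 0.0 means no feasible distribution exists for this tmax;
	 * the dichotomy then retries with a larger value */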

+ 47 - 26
sc_hypervisor/src/policies_utils/lp_tools.c

@@ -20,11 +20,6 @@
 #include "sc_hypervisor_intern.h"
 #include <starpu_config.h>
 
-#ifdef STARPU_HAVE_GLPK_H
-
-
-#endif //STARPU_HAVE_GLPK_H
-
 double sc_hypervisor_lp_get_nworkers_per_ctx(int nsched_ctxs, int ntypes_of_workers, double res[nsched_ctxs][ntypes_of_workers], 
 					     int total_nw[ntypes_of_workers], struct types_of_workers *tw)
 {
@@ -39,23 +34,11 @@ double sc_hypervisor_lp_get_nworkers_per_ctx(int nsched_ctxs, int ntypes_of_work
 	for(i = 0; i < nsched_ctxs; i++)
 	{
 		sc_w = sc_hypervisor_get_wrapper(sched_ctxs[i]);
-/* #ifdef STARPU_USE_CUDA */
-/* 		int ncuda = starpu_worker_get_count_by_type(STARPU_CUDA_WORKER); */
-/* 		if(ncuda != 0) */
-/* 		{ */
-/* 			v[i][0] = sc_hypervisor_get_speed(sc_w, STARPU_CUDA_WORKER); */
-/* 			v[i][1] = sc_hypervisor_get_speed(sc_w, STARPU_CPU_WORKER); */
-/* 		} */
-/* 		else */
-/* 			v[i][0] = sc_hypervisor_get_speed(sc_w, STARPU_CPU_WORKER); */
-/* #else */
-/* 		v[i][0] = sc_hypervisor_get_speed(sc_w, STARPU_CPU_WORKER); */
-/* #endif // STARPU_USE_CUDA */
 		int w;
 		for(w = 0; w < nw; w++)
 			v[i][w] = sc_hypervisor_get_speed(sc_w, sc_hypervisor_get_arch_for_index(w, tw)); 
 		
-		flops[i] = sc_w->remaining_flops < 0.0 ? 0.0 : sc_w->remaining_flops/1000000000; //sc_w->total_flops/1000000000; /* in gflops*/
+		flops[i] = sc_w->remaining_flops < 0.0 ? 0.0 : sc_w->remaining_flops/1000000000; /* in gflops*/
 //		printf("%d: flops %lf\n", sched_ctxs[i], flops[i]);
 	}
 
@@ -171,12 +154,19 @@ void _lp_find_workers_to_give_away(int nw, int ns, unsigned sched_ctx, int sched
 			if(nworkers_ctx > res_rounded[sched_ctx_idx][w])
 			{
 				int nworkers_to_move = nworkers_ctx - res_rounded[sched_ctx_idx][w];
-				if(target_res == 0.0 && nworkers_to_move > 0)
-					nworkers_to_move--;
 				int *workers_to_move = sc_hypervisor_get_idlest_workers(sched_ctx, &nworkers_to_move, arch);
 				int i;
-				for(i = 0; i < nworkers_to_move; i++)
-					tmp_workers_move[w][tmp_nw_move[w]++] = workers_to_move[i];
+				if(target_res == 0.0 && nworkers_to_move > 0)
+				{
+					tmp_workers_add[w][tmp_nw_add[w]++] = workers_to_move[0];
+					for(i = 1; i < nworkers_to_move; i++)
+						tmp_workers_move[w][tmp_nw_move[w]++] = workers_to_move[i];
+				}
+				else
+				{
+					for(i = 0; i < nworkers_to_move; i++)
+						tmp_workers_move[w][tmp_nw_move[w]++] = workers_to_move[i];
+				}
 				free(workers_to_move);
 			}
 		}
@@ -240,7 +230,7 @@ void _lp_find_workers_to_accept(int nw, int ns, unsigned sched_ctx, int sched_ct
 		
 		int nw_ctx2 = sc_hypervisor_get_nworkers_ctx(sched_ctx, arch);
 		int nw_needed = res_rounded[sched_ctx_idx][w] - nw_ctx2;
-		
+
 		if( nw_needed > 0 && tmp_nw_move[w] > 0)
 		{
 			*nw_move += nw_needed >= tmp_nw_move[w] ? tmp_nw_move[w] : nw_needed;
@@ -263,7 +253,7 @@ void _lp_find_workers_to_accept(int nw, int ns, unsigned sched_ctx, int sched_ct
 		int x = floor(needed);
 		double x_double = (double)x;
 		double diff = needed - x_double;
-		if(diff > 0.3 && tmp_nw_add[w] > 0)
+		if((diff > 0.3 || needed > 0.3) && tmp_nw_add[w] > 0)
 		{
 			*nw_add = tmp_nw_add[w];
 			int i = 0;
@@ -335,7 +325,6 @@ void sc_hypervisor_lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rou
 		_lp_find_workers_to_give_away(nw, ns, sched_ctxs[s], s, 
 					      tmp_nw_move, tmp_workers_move, 
 					      tmp_nw_add, tmp_workers_add, res_rounded, res, tw);
-
 		for(s2 = 0; s2 < ns; s2++)
 		{
 			if(sched_ctxs[s2] != sched_ctxs[s])
@@ -381,6 +370,34 @@ void sc_hypervisor_lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rou
 			sc_hypervisor_remove_workers_from_sched_ctx(workers_move, nw_move, sched_ctxs[s], 0);
 	}
 }
+int _lp_get_unwanted_workers(int *workers_add, int nw_add, unsigned sched_ctx, int *workers_remove)
+{
+	int nw_remove = 0;
+	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx);
+	int worker;
+
+	struct starpu_sched_ctx_iterator it;
+	if(workers->init_iterator)
+		workers->init_iterator(workers, &it);
+
+	while(workers->has_next(workers, &it))
+	{
+		worker = workers->get_next(workers, &it);
+		int i;
+		unsigned found = 0;
+		for(i = 0; i < nw_add; i++)
+		{
+			if(worker == workers_add[i])
+			{
+				found = 1;
+				break;
+			}
+		}
+		if(!found)
+			workers_remove[nw_remove++] = worker;
+	}
+	return nw_remove;
+}
 
 void sc_hypervisor_lp_distribute_resources_in_ctxs(unsigned* sched_ctxs, int ns, int nw, int res_rounded[ns][nw], double res[ns][nw], int *workers, int nworkers, struct types_of_workers *tw)
 {
@@ -406,8 +423,9 @@ void sc_hypervisor_lp_distribute_resources_in_ctxs(unsigned* sched_ctxs, int ns,
 				if(target_res == 0.0)
 				{
 					nworkers_to_add=1;
-					start[w]--;
+					int old_start = start[w];
 					int *workers_to_add = sc_hypervisor_get_idlest_workers_in_list(&start[w], workers, nworkers, &nworkers_to_add, arch);
+					start[w] = old_start;
 					int i;
 					for(i = 0; i < nworkers_to_add; i++)
 						workers_add[nw_add++] = workers_to_add[i];
@@ -455,6 +473,9 @@ void sc_hypervisor_lp_distribute_resources_in_ctxs(unsigned* sched_ctxs, int ns,
 		if(nw_add > 0)
 		{
 			sc_hypervisor_add_workers_to_sched_ctx(workers_add, nw_add, sched_ctxs[s]);
+			int workers_remove[STARPU_NMAXWORKERS];
+			int nw_remove = _lp_get_unwanted_workers(workers_add, nw_add, sched_ctxs[s], workers_remove);
+			sc_hypervisor_remove_workers_from_sched_ctx(workers_remove, nw_remove, sched_ctxs[s], 0);
 			sc_hypervisor_start_resize(sched_ctxs[s]);
 		}
 

+ 5 - 3
sc_hypervisor/src/policies_utils/policy_tools.c

@@ -352,7 +352,7 @@ double sc_hypervisor_get_fastest_ctx_exec_time(void)
 
 void sc_hypervisor_group_workers_by_type(struct types_of_workers *tw, int *total_nw)
 {
-	int w;
+	unsigned w;
 	for(w = 0; w < tw->nw; w++)
 		total_nw[w] = 0;
 
@@ -382,8 +382,9 @@ enum starpu_worker_archtype sc_hypervisor_get_arch_for_index(unsigned w, struct
 	else
 		if(tw->ncuda != 0)
 			return STARPU_CUDA_WORKER;
-}
 
+	return STARPU_CPU_WORKER;
+}
 
 unsigned sc_hypervisor_get_index_for_arch(enum starpu_worker_archtype arch, struct types_of_workers *tw)
 {
@@ -403,6 +404,7 @@ unsigned sc_hypervisor_get_index_for_arch(enum starpu_worker_archtype arch, stru
 				return 0;
 		}
 	}
+	return 0;
 }
 
 void sc_hypervisor_get_tasks_times(int nw, int nt, double times[nw][nt], int *workers, unsigned size_ctxs, struct sc_hypervisor_policy_task_pool *task_pools)
@@ -521,7 +523,7 @@ unsigned sc_hypervisor_check_speed_gap_btw_ctxs(void)
 				{
 					v[w] = sc_hypervisor_get_speed(sc_w, sc_hypervisor_get_arch_for_index(w, tw));
 					
-					optimal_v[i] += nworkers_per_ctx[i][w];
+					optimal_v[i] += nworkers_per_ctx[i][w]*v[w];
 				}
 				_set_optimal_v(i, optimal_v[i]);
 			}
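A quick numeric check of the corrected line: with 2 CPU workers at 5 GFlop/s and 1 CUDA worker at 100 GFlop/s allocated to a context, the optimal speed now evaluates to 2*5 + 1*100 = 110 GFlop/s, whereas the previous code merely summed the worker counts and reported 3.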

+ 9 - 5
sc_hypervisor/src/policies_utils/speed.c

@@ -112,7 +112,7 @@ double sc_hypervisor_get_speed_per_worker_type(struct sc_hypervisor_wrapper* sc_
 		double speed = 0.0;
 		unsigned nworkers = 0;
 		double all_workers_flops = 0.0;
-		double all_workers_idle_time = 0.0;
+		double max_workers_idle_time = 0.0;
 		while(workers->has_next(workers, &it))
 		{
 			worker = workers->get_next(workers, &it);
@@ -120,7 +120,8 @@ double sc_hypervisor_get_speed_per_worker_type(struct sc_hypervisor_wrapper* sc_
 			if(arch == req_arch)
 			{
 				all_workers_flops += sc_w->elapsed_flops[worker] / 1000000000.0; /*in gflops */
-				all_workers_idle_time += sc_w->idle_time[worker]; /* in seconds */
+				if(max_workers_idle_time < sc_w->idle_time[worker])
+					max_workers_idle_time = sc_w->idle_time[worker]; /* in seconds */
 				nworkers++;
 			}
 		}			
@@ -131,7 +132,7 @@ double sc_hypervisor_get_speed_per_worker_type(struct sc_hypervisor_wrapper* sc_
 			
 			/* compute speed for the last frame */
 			double elapsed_time = (curr_time - sc_w->start_time) / 1000000.0; /* in seconds */
-			elapsed_time -= all_workers_idle_time;
+			elapsed_time -= max_workers_idle_time;
 			speed = (all_workers_flops / elapsed_time) / nworkers;
 		}
 		else
@@ -165,17 +166,20 @@ double sc_hypervisor_get_ref_speed_per_worker_type(struct sc_hypervisor_wrapper*
 	return -1.0;
 }
 
+/* returns the speed necessary for the linear programs (either the monitored one or a default value) */
 double sc_hypervisor_get_speed(struct sc_hypervisor_wrapper *sc_w, enum starpu_worker_archtype arch)
 {
-
+	/* monitored speed in the last frame */
 	double speed = sc_hypervisor_get_speed_per_worker_type(sc_w, arch);
 	if(speed == -1.0)
 	{
+		/* avg value of the monitored speed over the entire current execution */
 		speed = sc_hypervisor_get_ref_speed_per_worker_type(sc_w, arch);
 	}
 	if(speed == -1.0)
 	{
-		speed = arch == STARPU_CPU_WORKER ? 5.0 : 100.0;
+		/* a default value */
+		speed = arch == STARPU_CPU_WORKER ? SC_HYPERVISOR_DEFAULT_CPU_SPEED : SC_HYPERVISOR_DEFAULT_CUDA_SPEED;
 	}
        
 	return speed;
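To illustrate the switch from summed to maximum idle time above: if four workers of the requested type were idle for 1 s, 2 s, 0.5 s and 3 s during a 10 s frame, the old code subtracted 6.5 s from the elapsed wall-clock time, while the new code subtracts only the largest value, 3 s, which avoids over-estimating the speed (or even producing a negative elapsed time) when several workers are idle simultaneously.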

+ 82 - 19
sc_hypervisor/src/sc_hypervisor.c

@@ -38,7 +38,7 @@ extern struct sc_hypervisor_policy gflops_rate_policy;
 extern struct sc_hypervisor_policy feft_lp_policy;
 extern struct sc_hypervisor_policy teft_lp_policy;
 extern struct sc_hypervisor_policy ispeed_lp_policy;
-extern struct sc_hypervisor_policy debit_lp_policy;
+extern struct sc_hypervisor_policy throughput_lp_policy;
 #endif // STARPU_HAVE_GLPK_
 extern struct sc_hypervisor_policy ispeed_policy;
 
@@ -51,7 +51,7 @@ static struct sc_hypervisor_policy *predefined_policies[] =
 	&feft_lp_policy,
 	&teft_lp_policy,
 	&ispeed_lp_policy,
-	&debit_lp_policy,
+	&throughput_lp_policy,
 #endif // STARPU_HAVE_GLPK_H
 	&gflops_rate_policy,
 	&ispeed_policy
@@ -99,6 +99,25 @@ static struct sc_hypervisor_policy *_find_hypervisor_policy_from_name(const char
 	return NULL;
 }
 
+static void display_sched_help_message(void)
+{
+	const char* policy_name = getenv("SC_HYPERVISOR_POLICY");
+	if (policy_name && (strcmp(policy_name, "help") == 0))
+	{
+		fprintf(stderr, "SC_HYPERVISOR_POLICY can be either of\n");
+		/* display the description of all predefined policies */
+		unsigned i;
+		for (i = 0; i < sizeof(predefined_policies)/sizeof(predefined_policies[0]); i++)
+		{
+			struct sc_hypervisor_policy *p = predefined_policies[i];
+			if (p->name)
+			{
+				fprintf(stderr, "%s\n", p->name);
+			}
+		}
+	}
+}
+
 static struct sc_hypervisor_policy *_select_hypervisor_policy(struct sc_hypervisor_policy* hypervisor_policy)
 {
 	struct sc_hypervisor_policy *selected_policy = NULL;
@@ -132,7 +151,10 @@ static struct sc_hypervisor_policy *_select_hypervisor_policy(struct sc_hypervis
 
 /* initializez the performance counters that starpu will use to retrive hints for resizing */
 struct starpu_sched_ctx_performance_counters* sc_hypervisor_init(struct sc_hypervisor_policy *hypervisor_policy)
-{
+{	
+/* Perhaps we have to display some help */
+	display_sched_help_message();
+
 	hypervisor.min_tasks = 0;
 	hypervisor.nsched_ctxs = 0;
 	char* vel_gap = getenv("SC_HYPERVISOR_MAX_SPEED_GAP");
@@ -247,7 +269,6 @@ static void _print_current_time()
 
 void sc_hypervisor_shutdown(void)
 {
-//	printf("shutdown\n");
 	int i;
 	for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
 	{
@@ -344,8 +365,6 @@ void sc_hypervisor_unregister_ctx(unsigned sched_ctx)
 	hypervisor.sched_ctx_w[sched_ctx].sched_ctx = STARPU_NMAX_SCHED_CTXS;
 	_remove_config(sched_ctx);
 
-/* 	free(hypervisor.configurations[sched_ctx]); */
-/* 	free(hypervisor.resize_requests[sched_ctx]); */
 	starpu_pthread_mutex_destroy(&hypervisor.conf_mut[sched_ctx]);
 	starpu_pthread_mutex_destroy(&hypervisor.resize_mut[sched_ctx]);
 	if(hypervisor.nsched_ctxs == 1)
@@ -440,18 +459,26 @@ static void _reset_idle_time(unsigned sched_ctx)
 
 void _reset_resize_sample_info(unsigned sender_sched_ctx, unsigned receiver_sched_ctx)
 {
-	/* info concerning only the gflops_rate strateg */
-	struct sc_hypervisor_wrapper *sender_sc_w = &hypervisor.sched_ctx_w[sender_sched_ctx];
-	struct sc_hypervisor_wrapper *receiver_sc_w = &hypervisor.sched_ctx_w[receiver_sched_ctx];
-	
 	double start_time =  starpu_timing_now();
-	sender_sc_w->start_time = start_time;
-	_set_elapsed_flops_per_sched_ctx(sender_sched_ctx, 0.0);
-	_reset_idle_time(sender_sched_ctx);
+	if(sender_sched_ctx != STARPU_NMAX_SCHED_CTXS)
+	{
+		/* info concerning only the gflops_rate strategy */
+		struct sc_hypervisor_wrapper *sender_sc_w = &hypervisor.sched_ctx_w[sender_sched_ctx];
+		
+		sender_sc_w->start_time = start_time;
+		_set_elapsed_flops_per_sched_ctx(sender_sched_ctx, 0.0);
+		_reset_idle_time(sender_sched_ctx);
+	}
+
+	if(receiver_sched_ctx != STARPU_NMAX_SCHED_CTXS)
+	{
 
-	receiver_sc_w->start_time = start_time;
-	_set_elapsed_flops_per_sched_ctx(receiver_sched_ctx, 0.0);
-	_reset_idle_time(receiver_sched_ctx);
+		struct sc_hypervisor_wrapper *receiver_sc_w = &hypervisor.sched_ctx_w[receiver_sched_ctx];
+		
+		receiver_sc_w->start_time = start_time;
+		_set_elapsed_flops_per_sched_ctx(receiver_sched_ctx, 0.0);
+		_reset_idle_time(receiver_sched_ctx);
+	}
 }
 
 /* actually move the workers: the cpus are moved, gpus are only shared  */
@@ -530,6 +557,7 @@ void sc_hypervisor_add_workers_to_sched_ctx(int* workers_to_add, unsigned nworke
 		unsigned i;
 		for(i = 0; i < nworkers_to_add; i++)
 			new_config->max_idle[workers_to_add[i]] = new_config->max_idle[workers_to_add[i]] != MAX_IDLE_TIME ? new_config->max_idle[workers_to_add[i]] :  new_config->new_workers_max_idle;
+		_reset_resize_sample_info(STARPU_NMAX_SCHED_CTXS, sched_ctx);
 
 	}
 	return;
@@ -557,6 +585,7 @@ void sc_hypervisor_remove_workers_from_sched_ctx(int* workers_to_remove, unsigne
 			printf("\n");
 			
 			starpu_sched_ctx_remove_workers(workers_to_remove, nworkers_to_remove, sched_ctx);
+			_reset_resize_sample_info(sched_ctx, STARPU_NMAX_SCHED_CTXS);
 		}
 		else
 		{
@@ -601,7 +630,7 @@ static unsigned _ack_resize_completed(unsigned sched_ctx, int worker)
 {
 	if(worker != -1 && !starpu_sched_ctx_contains_worker(worker, sched_ctx))
 		return 0;
-
+	
 	struct sc_hypervisor_resize_ack *resize_ack = NULL;
 	unsigned sender_sched_ctx = STARPU_NMAX_SCHED_CTXS;
 
@@ -621,6 +650,7 @@ static unsigned _ack_resize_completed(unsigned sched_ctx, int worker)
 					if(sc_w->resize_ack.moved_workers[j] == worker)
 					{
 						only_remove = 1;
+						_reset_resize_sample_info(sched_ctx, STARPU_NMAX_SCHED_CTXS);
 						starpu_pthread_mutex_unlock(&sc_w->mutex);
 						break;
 					}
@@ -639,7 +669,9 @@ static unsigned _ack_resize_completed(unsigned sched_ctx, int worker)
 
 	/* if there is no ctx waiting for its ack return 1*/
 	if(resize_ack == NULL)
+	{
 		return 1;
+	}
 
 	int ret = starpu_pthread_mutex_trylock(&hypervisor.sched_ctx_w[sender_sched_ctx].mutex);
 	if(ret != EBUSY)
@@ -796,7 +828,14 @@ static void notify_poped_task(unsigned sched_ctx, int worker, struct starpu_task
 	hypervisor.sched_ctx_w[sched_ctx].elapsed_data[worker] += data_size ;
 	hypervisor.sched_ctx_w[sched_ctx].elapsed_tasks[worker]++ ;
 	hypervisor.sched_ctx_w[sched_ctx].total_elapsed_flops[worker] += task->flops;
-	hypervisor.sched_ctx_w[sched_ctx].remaining_flops -= task->flops; //sc_hypervisor_get_elapsed_flops_per_sched_ctx(&hypervisor.sched_ctx_w[sched_ctx]);
+	starpu_pthread_mutex_lock(&act_hypervisor_mutex);
+	hypervisor.sched_ctx_w[sched_ctx].remaining_flops -= task->flops;
+/* 	if(hypervisor.sched_ctx_w[sched_ctx].remaining_flops < 0.0) */
+/* 		hypervisor.sched_ctx_w[sched_ctx].remaining_flops = 0.0; */
+//	double ctx_elapsed_flops = sc_hypervisor_get_elapsed_flops_per_sched_ctx(&hypervisor.sched_ctx_w[sched_ctx]);
+/* 	printf("*****************STARPU_STARPU_STARPU: decrement %lf flops  remaining flops %lf total flops %lf elapseed flops %lf in ctx %d \n", */
+/* 	       task->flops, hypervisor.sched_ctx_w[sched_ctx].remaining_flops,  hypervisor.sched_ctx_w[sched_ctx].total_flops, ctx_elapsed_flops, sched_ctx); */
+	starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
 
 	if(hypervisor.resize[sched_ctx])
 	{	
@@ -887,7 +926,7 @@ static void notify_delete_context(unsigned sched_ctx)
 void sc_hypervisor_size_ctxs(unsigned *sched_ctxs, int nsched_ctxs, int *workers, int nworkers)
 {
 	starpu_pthread_mutex_lock(&act_hypervisor_mutex);
-	int curr_nsched_ctxs = sched_ctxs == NULL ? hypervisor.nsched_ctxs : nsched_ctxs;
+	unsigned curr_nsched_ctxs = sched_ctxs == NULL ? hypervisor.nsched_ctxs : (unsigned)nsched_ctxs;
 	unsigned *curr_sched_ctxs = sched_ctxs == NULL ? hypervisor.sched_ctxs : sched_ctxs;
 	starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
 	unsigned s;
@@ -982,3 +1021,27 @@ struct types_of_workers* sc_hypervisor_get_types_of_workers(int *workers, unsign
         if(tw->ncuda > 0) tw->nw++;
 	return tw;
 }
+
+void sc_hypervisor_update_diff_total_flops(unsigned sched_ctx, double diff_total_flops)
+{
+//	double diff = total_flops - hypervisor.sched_ctx_w[sched_ctx].total_flops;
+//	printf("*****************STARPU_STARPU_STARPU: update diff flops %lf to ctx %d \n", diff_total_flops, sched_ctx);
+	starpu_pthread_mutex_lock(&act_hypervisor_mutex);
+	hypervisor.sched_ctx_w[sched_ctx].total_flops += diff_total_flops;
+	hypervisor.sched_ctx_w[sched_ctx].remaining_flops += diff_total_flops;	
+/* 	printf("*****************STARPU_STARPU_STARPU: total flops %lf remaining flops %lf in ctx %d \n", */
+/* 	       hypervisor.sched_ctx_w[sched_ctx].total_flops, hypervisor.sched_ctx_w[sched_ctx].remaining_flops, sched_ctx); */
+	starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
+}
+
+void sc_hypervisor_update_diff_elapsed_flops(unsigned sched_ctx, double diff_elapsed_flops)
+{
+	int workerid = starpu_worker_get_id();
+	if(workerid != -1)
+	{
+		starpu_pthread_mutex_lock(&act_hypervisor_mutex);
+		hypervisor.sched_ctx_w[sched_ctx].elapsed_flops[workerid] += diff_elapsed_flops;
+		hypervisor.sched_ctx_w[sched_ctx].total_elapsed_flops[workerid] += diff_elapsed_flops;
+		starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
+	}
+}
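With the new display_sched_help_message() hook, launching an application with SC_HYPERVISOR_POLICY=help makes sc_hypervisor_init() print the name of every predefined hypervisor policy to stderr before the policy is selected.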

+ 2 - 0
sc_hypervisor/src/sc_hypervisor_intern.h

@@ -18,6 +18,8 @@
 #include <common/uthash.h>
 
 #define SC_SPEED_MAX_GAP_DEFAULT 50
+#define SC_HYPERVISOR_DEFAULT_CPU_SPEED 5.0
+#define SC_HYPERVISOR_DEFAULT_CUDA_SPEED 100.0
 
 struct size_request
 {

+ 4 - 2
src/Makefile.am

@@ -51,8 +51,8 @@ lib_LTLIBRARIES = libstarpu-@STARPU_EFFECTIVE_VERSION@.la
 
 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_CPPFLAGS = -I$(top_srcdir)/include/ $(STARPU_RCCE_CPPFLAGS) -DBUILDING_STARPU
 
-libstarpu_@STARPU_EFFECTIVE_VERSION@_la_CFLAGS = $(GLOBAL_AM_CFLAGS) $(HWLOC_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(STARPU_COI_CPPFLAGS) $(STARPU_RCCE_CFLAGS) $(FXT_CFLAGS)
-libstarpu_@STARPU_EFFECTIVE_VERSION@_la_LIBADD = -lm $(HWLOC_LIBS) $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_RCCE_LDFLAGS) $(FXT_LIBS) $(STARPU_GLPK_LDFLAGS) $(STARPU_LEVELDB_LDFLAGS)
+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_CFLAGS = $(GLOBAL_AM_CFLAGS) $(HWLOC_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(STARPU_COI_CPPFLAGS) $(STARPU_SCIF_CPPFLAGS) $(STARPU_RCCE_CFLAGS) $(FXT_CFLAGS)
+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_LIBADD = -lm $(HWLOC_LIBS) $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS) $(STARPU_RCCE_LDFLAGS) $(FXT_LIBS) $(STARPU_GLPK_LDFLAGS) $(STARPU_LEVELDB_LDFLAGS)
 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_LDFLAGS = $(ldflags) $(FXT_LDFLAGS) -no-undefined									\
   -version-info $(libstarpu_so_version)
 
@@ -66,6 +66,7 @@ noinst_HEADERS = 						\
 	core/progress_hook.h                                    \
 	core/sched_policy.h					\
 	core/sched_ctx.h					\
+	core/sched_ctx_list.h					\
 	core/perfmodel/perfmodel.h				\
 	core/perfmodel/regression.h				\
 	core/jobs.h						\
@@ -170,6 +171,7 @@ libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES = 						\
 	core/sched_policy.c					\
 	core/simgrid.c						\
 	core/sched_ctx.c					\
+	core/sched_ctx_list.c					\
 	core/parallel_task.c					\
 	core/detect_combined_workers.c				\
 	sched_policies/eager_central_policy.c			\

+ 4 - 0
src/common/barrier_counter.h

@@ -14,6 +14,9 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
+#ifndef __BARRIER_COUNTER_H__
+#define __BARRIER_COUNTER_H__
+
 #include <common/utils.h>
 #include <common/barrier.h>
 
@@ -37,3 +40,4 @@ int _starpu_barrier_counter_increment_until_full_counter(struct _starpu_barrier_
 
 int _starpu_barrier_counter_increment(struct _starpu_barrier_counter *barrier_c);
 
+#endif

+ 1 - 1
src/common/uthash.h

@@ -229,7 +229,7 @@ do {
 #define HASH_FIND_STR(head,findstr,out)                                          \
     HASH_FIND(hh,head,findstr,strlen(findstr),out)
 #define HASH_ADD_STR(head,strfield,add)                                          \
-    HASH_ADD(hh,head,strfield,strlen(add->strfield),add)
+    HASH_ADD(hh,head,strfield[0],strlen(add->strfield),add)
 #define HASH_FIND_INT(head,findint,out)                                          \
     HASH_FIND(hh,head,findint,sizeof(int),out)
 #define HASH_ADD_INT(head,intfield,add)                                          \

+ 24 - 4
src/core/combined_workers.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -67,9 +67,14 @@ int starpu_combined_worker_assign_workerid(int nworkers, int workerid_array[])
 	{
 		int id = workerid_array[i];
 
+#ifdef STARPU_USE_MIC
+		STARPU_ASSERT(config->workers[id].arch == STARPU_CPU_WORKER || config->workers[id].arch == STARPU_MIC_WORKER);
+		STARPU_ASSERT(config->workers[id].worker_mask == STARPU_CPU || config->workers[id].worker_mask == STARPU_MIC);
+#else/* STARPU_USE_MIC */
 		/* We only combine CPUs */
-		STARPU_ASSERT(config->workers[id].perf_arch == STARPU_CPU_DEFAULT);
+		STARPU_ASSERT(config->workers[id].arch == STARPU_CPU_WORKER);
 		STARPU_ASSERT(config->workers[id].worker_mask == STARPU_CPU);
+#endif /* STARPU_USE_MIC */
 
 		/* We only combine valid "basic" workers */
 		if ((id < 0) || (id >= basic_worker_count))
@@ -95,8 +100,23 @@ int starpu_combined_worker_assign_workerid(int nworkers, int workerid_array[])
 		&config->combined_workers[combined_worker_id];
 
 	combined_worker->worker_size = nworkers;
-	combined_worker->perf_arch = (enum starpu_perfmodel_archtype) (STARPU_CPU_DEFAULT + nworkers - 1);
-	combined_worker->worker_mask = STARPU_CPU;
+
+#ifdef STARPU_USE_MIC
+	if(config->workers[workerid_array[0]].worker_mask == STARPU_MIC)
+	{
+		combined_worker->perf_arch = (enum starpu_perfmodel_archtype) (STARPU_MIC_DEFAULT + config->workers[workerid_array[0]].mp_nodeid /* *STARPU_MAXMICCPUS + nworkers - 1*/);
+		combined_worker->worker_mask = STARPU_MIC;
+	}
+#endif
+	if(config->workers[workerid_array[0]].worker_mask == STARPU_CPU)
+	{
+		combined_worker->perf_arch = (enum starpu_perfmodel_archtype) (STARPU_CPU_DEFAULT + nworkers - 1);
+		combined_worker->worker_mask = STARPU_CPU;
+	}
+#ifdef STARPU_USE_MP
+	combined_worker->count = nworkers -1;
+	pthread_mutex_init(&combined_worker->count_mutex,NULL);
+#endif
 
 	/* We assume that the memory node should either be that of the first
 	 * entry, and it is very likely that every worker in the combination

+ 4 - 4
src/core/debug.c

@@ -82,7 +82,7 @@ int64_t _starpu_ayudame_get_func_id(struct starpu_codelet *cl)
 	unsigned i;
 	const char *name;
 	if (!cl)
-		return -1;
+		return 0;
 	name = _starpu_codelet_get_model_name(cl);
 	STARPU_PTHREAD_MUTEX_LOCK(&ayudame_mutex);
 	for (i=0; i < ncodelets; i++)
@@ -92,7 +92,7 @@ int64_t _starpu_ayudame_get_func_id(struct starpu_codelet *cl)
 				((name && codelets[i].name) && !strcmp(codelets[i].name, name))))
 		{
 			STARPU_PTHREAD_MUTEX_UNLOCK(&ayudame_mutex);
-			return i;
+			return i + 1;
 		}
 	}
 	if (ncodelets == ncodelets_alloc)
@@ -111,8 +111,8 @@ int64_t _starpu_ayudame_get_func_id(struct starpu_codelet *cl)
 		codelets[ncodelets].name = NULL;
 	i = ncodelets++;
 	if (name)
-		AYU_event(AYU_REGISTERFUNCTION, i, (void*) name);
+		AYU_event(AYU_REGISTERFUNCTION, i+1, (void*) name);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&ayudame_mutex);
-	return i;
+	return i + 1;
 }
 #endif
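The Ayudame function ids returned above become 1-based so that 0 can stand for "no codelet", matching the 0 values now used in tags.c and jobs.c below; a short sketch of the convention, with the surrounding variables assumed to be in scope:

/* id 0 now means "no codelet / excluded from the DAG";
 * real codelets are reported as their table index + 1 */
int64_t func_id = exclude_from_dag ? 0 : _starpu_ayudame_get_func_id(task->cl);
int64_t AYU_data[2] = { func_id, task->priority > STARPU_MIN_PRIO };
AYU_event(AYU_ADDTASK, job_id, AYU_data);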

+ 2 - 2
src/core/dependencies/tags.c

@@ -203,7 +203,7 @@ static struct _starpu_tag *_gettag_struct(starpu_tag_t id)
 #ifdef HAVE_AYUDAME_H
 		if (AYU_event)
 		{
-			int64_t AYU_data[2] = {-1, 0};
+			int64_t AYU_data[2] = {0, 0};
 			STARPU_ASSERT(id < AYUDAME_OFFSET);
 			AYU_event(AYU_ADDTASK, id + AYUDAME_OFFSET, AYU_data);
 		}
@@ -244,7 +244,7 @@ void _starpu_tag_set_ready(struct _starpu_tag *tag)
 #ifdef HAVE_AYUDAME_H
 	if (AYU_event)
 	{
-		int id = -1;
+		intptr_t id = 0;
 		AYU_event(AYU_PRERUNTASK, tag->id + AYUDAME_OFFSET, &id);
 		AYU_event(AYU_POSTRUNTASK, tag->id + AYUDAME_OFFSET, NULL);
 	}

+ 5 - 1
src/core/dependencies/task_deps.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -65,10 +65,12 @@ void _starpu_task_declare_deps_array(struct starpu_task *task, unsigned ndeps, s
 
 	job = _starpu_get_job_associated_to_task(task);
 
+	STARPU_PTHREAD_MUTEX_LOCK(&job->sync_mutex);
 	if (check)
 		STARPU_ASSERT_MSG(!job->submitted || !task->destroy || task->detach, "Task dependencies have to be set before submission (submitted %u destroy %d detach %d)", job->submitted, task->destroy, task->detach);
 	else
 		STARPU_ASSERT_MSG(job->terminated <= 1, "Task dependencies have to be set before termination (terminated %u)", job->terminated);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&job->sync_mutex);
 
 	struct _starpu_cg *cg = create_cg_task(ndeps, job);
 
@@ -81,6 +83,7 @@ void _starpu_task_declare_deps_array(struct starpu_task *task, unsigned ndeps, s
 		dep_job = _starpu_get_job_associated_to_task(dep_task);
 
 		STARPU_ASSERT_MSG(dep_job != job, "A task must not depend on itself.");
+		STARPU_PTHREAD_MUTEX_LOCK(&dep_job->sync_mutex);
 		if (check)
 		{
 			STARPU_ASSERT_MSG(!dep_job->submitted || !dep_job->task->destroy || dep_job->task->detach, "Unless it is not to be destroyed automatically, a task dependencies have to be set before submission");
@@ -88,6 +91,7 @@ void _starpu_task_declare_deps_array(struct starpu_task *task, unsigned ndeps, s
 			STARPU_ASSERT_MSG(!dep_job->submitted || !dep_job->task->regenerate, "For regenerated tasks, dependencies have to be set before first submission");
 		} else
 			STARPU_ASSERT_MSG(dep_job->terminated <= 1, "Task dependencies have to be set before termination (terminated %u)", dep_job->terminated);
+		STARPU_PTHREAD_MUTEX_UNLOCK(&dep_job->sync_mutex);
 
 		_STARPU_TRACE_TASK_DEPS(dep_job, job);
 		_starpu_bound_task_dep(job, dep_job);

+ 86 - 25
src/core/detect_combined_workers.c

@@ -191,58 +191,119 @@ static void find_and_assign_combinations_with_hwloc(int *workerids, int nworkers
 
 #else /* STARPU_HAVE_HWLOC */
 
+static void assign_combinations_without_hwloc(struct starpu_worker_collection* worker_collection, int* workers, unsigned n, int min, int max)
+{
+
+	int size,i,count =0;
+	//if the maximum number of workers is already reached
+	if(worker_collection->nworkers >= STARPU_NMAXWORKERS - 1)
+		return;
+
+	for (size = min; size <= max; size *= 2)
+	{
+		unsigned first;
+		for (first = 0; first < n; first += size)
+		{
+			if (first + size <= n)
+			{
+				int found_workerids[size];
+
+				for (i = 0; i < size; i++)
+					found_workerids[i] = workers[first + i];
+
+				/* We register this combination */
+				int newworkerid;
+				newworkerid = starpu_combined_worker_assign_workerid(size, found_workerids);
+				STARPU_ASSERT(newworkerid >= 0);
+				count++;
+				worker_collection->add(worker_collection, newworkerid);
+				//if the maximum number of workers is reached, then return
+				if(worker_collection->nworkers >= STARPU_NMAXWORKERS - 1)
+					return;
+			}
+		}
+	}
+}
+
+
 static void find_and_assign_combinations_without_hwloc(int *workerids, int nworkers)
 {
+	int i;
+	unsigned j;
 	unsigned sched_ctx_id  = starpu_sched_ctx_get_context();
 	if(sched_ctx_id == STARPU_NMAX_SCHED_CTXS)
 		sched_ctx_id = 0;
-	int min;
-	int max;
+	int min, max, mic_min, mic_max;
 
 	struct starpu_worker_collection* workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
 
 	/* We put the id of all CPU workers in this array */
 	int cpu_workers[STARPU_NMAXWORKERS];
 	unsigned ncpus = 0;
+#ifdef STARPU_USE_MIC
+	unsigned nb_mics = _starpu_get_machine_config()->topology.nmicdevices;
+	unsigned * nmics_table;
+	int * mic_id;
+	int ** mic_workers;
+	mic_id = malloc(sizeof(int)*nb_mics);
+	nmics_table = malloc(sizeof(unsigned)*nb_mics);
+	mic_workers = malloc(sizeof(int*)*nb_mics);
+	for(i=0; i<nb_mics; i++)
+	{
+		mic_id[i] = -1;
+		nmics_table[i] = 0;
+		mic_workers[i] = malloc(sizeof(int)*STARPU_NMAXWORKERS);
+	}
+#endif /* STARPU_USE_MIC */
 
 	struct _starpu_worker *worker;
-	int i;
 	for (i = 0; i < nworkers; i++)
 	{
 		worker = _starpu_get_worker_struct(workerids[i]);
-
-		if (worker->perf_arch == STARPU_CPU_DEFAULT)
+		if (worker->arch == STARPU_CPU_WORKER)
 			cpu_workers[ncpus++] = i;
+#ifdef STARPU_USE_MIC
+		else if(worker->arch == STARPU_MIC_WORKER)
+		{
+			for(j=0; mic_id[j] != worker->mp_nodeid && mic_id[j] != -1 && j<nb_mics; j++);
+			if(j<nb_mics)
+			{
+				if(mic_id[j] == -1)
+				{
+					mic_id[j] = worker->mp_nodeid;					
+				}
+				mic_workers[j][nmics_table[j]++] = i;
+			}
+		}
+#endif /* STARPU_USE_MIC */
+
 	}
 
+
 	min = starpu_get_env_number("STARPU_MIN_WORKERSIZE");
 	if (min < 2)
 		min = 2;
 	max = starpu_get_env_number("STARPU_MAX_WORKERSIZE");
 	if (max == -1 || max > (int) ncpus)
 		max = ncpus;
-
-	int size;
-	for (size = min; size <= max; size *= 2)
+	
+	assign_combinations_without_hwloc(workers,cpu_workers,ncpus,min,max);
+#ifdef STARPU_USE_MIC
+	mic_min = starpu_get_env_number("STARPU_MIN_WORKERSIZE");
+	if (mic_min < 2)
+		mic_min = 2;
+	for(i=0; i<nb_mics; i++)
 	{
-		unsigned first_cpu;
-		for (first_cpu = 0; first_cpu < ncpus; first_cpu += size)
-		{
-			if (first_cpu + size <= ncpus)
-			{
-				int found_workerids[size];
-
-				for (i = 0; i < size; i++)
-					found_workerids[i] = cpu_workers[first_cpu + i];
-
-				/* We register this combination */
-				int newworkerid;
-				newworkerid = starpu_combined_worker_assign_workerid(size, found_workerids);
-				STARPU_ASSERT(newworkerid >= 0);
-				workers->add(workers, newworkerid);
-			}
-		}
+		mic_max = starpu_get_env_number("STARPU_MAX_WORKERSIZE");
+		if (mic_max == -1 || mic_max > (int) nmics_table[i])
+			mic_max = nmics_table[i];
+		assign_combinations_without_hwloc(workers,mic_workers[i],nmics_table[i],mic_min,mic_max);
+		free(mic_workers[i]);
 	}
+	free(mic_id);
+	free(nmics_table);
+	free(mic_workers);
+#endif /* STARPU_USE_MIC */
 }
 
 #endif /* STARPU_HAVE_HWLOC */
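A standalone sketch of the grouping performed by assign_combinations_without_hwloc() above, assuming 8 workers and STARPU_MIN_WORKERSIZE=2 / STARPU_MAX_WORKERSIZE=8; it only prints the combinations that would be registered (contiguous chunks of size 2, 4 and 8):

#include <stdio.h>

int main(void)
{
	int workers[8] = {0, 1, 2, 3, 4, 5, 6, 7};
	unsigned n = 8, first, i;
	int size, min = 2, max = 8;	/* illustrative STARPU_{MIN,MAX}_WORKERSIZE */

	for (size = min; size <= max; size *= 2)
		for (first = 0; first + size <= n; first += size)
		{
			/* this chunk would become one combined worker */
			printf("combined worker of size %d:", size);
			for (i = 0; i < (unsigned) size; i++)
				printf(" %d", workers[first + i]);
			printf("\n");
		}
	return 0;
}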

+ 2 - 0
src/core/disk.h

@@ -28,6 +28,8 @@ extern "C"
 {
 #endif
 
+#include <datawizard/copy_driver.h>
+
 /* interface to manipulate memory disk */
 void * _starpu_disk_alloc (unsigned node, size_t size);
 

+ 2 - 2
src/core/disk_ops/disk_stdio.c

@@ -229,11 +229,11 @@ starpu_stdio_write (void *base STARPU_ATTRIBUTE_UNUSED, void *obj, const void *b
 	int res = fseek(tmp->file, offset, SEEK_SET); 
 	STARPU_ASSERT_MSG(res == 0, "Stdio write failed");
 
-	ssize_t nb = fwrite (buf, 1, size, tmp->file);
+	fwrite (buf, 1, size, tmp->file);
 
 	STARPU_PTHREAD_MUTEX_UNLOCK(&tmp->mutex);
 
-	return nb;
+	return 0;
 }
 
 static int

+ 2 - 2
src/core/disk_ops/unistd/disk_unistd_global.c

@@ -224,12 +224,12 @@ starpu_unistd_global_write (void *base STARPU_ATTRIBUTE_UNUSED, void *obj, const
 	int res = lseek(tmp->descriptor, offset, SEEK_SET); 
 	STARPU_ASSERT_MSG(res >= 0, "Starpu Disk unistd lseek for write failed: offset %lu got errno %d", (unsigned long) offset, errno);
 
-	ssize_t nb = write (tmp->descriptor, buf, size);
+	write (tmp->descriptor, buf, size);
 	STARPU_ASSERT_MSG(res >= 0, "Starpu Disk unistd write failed: size %lu got errno %d", (unsigned long) size, errno);
 
 	STARPU_PTHREAD_MUTEX_UNLOCK(&tmp->mutex);
 
-	return nb;
+	return 0;
 }
 
 

+ 5 - 2
src/core/jobs.c

@@ -129,9 +129,12 @@ void _starpu_wait_job(struct _starpu_job *j)
 	 * way, _starpu_wait_job won't return until the entire task was really
 	 * executed (so that we cannot destroy the task while it is still being
 	 * manipulated by the driver). */
+
 	while (j->terminated != 2)
+	{
 		STARPU_PTHREAD_COND_WAIT(&j->sync_cond, &j->sync_mutex);
-
+	}
+	
 	STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
         _STARPU_LOG_OUT();
 }
@@ -281,7 +284,7 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 #ifdef HAVE_AYUDAME_H
 		if (AYU_event)
 		{
-			int64_t AYU_data[2] = {j->exclude_from_dag?-1:_starpu_ayudame_get_func_id(task->cl), task->priority > STARPU_MIN_PRIO};
+			int64_t AYU_data[2] = {j->exclude_from_dag?0:_starpu_ayudame_get_func_id(task->cl), task->priority > STARPU_MIN_PRIO};
 			AYU_event(AYU_ADDTASK, j->job_id, AYU_data);
 		}
 #endif

+ 26 - 91
src/core/sched_ctx.c

@@ -26,24 +26,18 @@ static starpu_pthread_mutex_t finished_submit_mutex = STARPU_PTHREAD_MUTEX_INITI
 struct starpu_task stop_submission_task = STARPU_TASK_INITIALIZER;
 starpu_pthread_key_t sched_ctx_key;
 unsigned with_hypervisor = 0;
-double max_time_worker_on_ctx = -1.0;
 
 static unsigned _starpu_get_first_free_sched_ctx(struct _starpu_machine_config *config);
-static unsigned _starpu_worker_get_first_free_sched_ctx(struct _starpu_worker *worker);
-
-static unsigned _starpu_worker_get_sched_ctx_id(struct _starpu_worker *worker, unsigned sched_ctx_id);
 
 static void _starpu_worker_gets_into_ctx(unsigned sched_ctx_id, struct _starpu_worker *worker)
 {
-	unsigned worker_sched_ctx_id = _starpu_worker_get_sched_ctx_id(worker, sched_ctx_id);
+	unsigned ret_sched_ctx = _starpu_sched_ctx_list_get_sched_ctx(worker->sched_ctx_list, sched_ctx_id);
 	/* the worker was planning to go away in another ctx but finally he changed his mind & 
 	   he's staying */
-	if (worker_sched_ctx_id  == STARPU_NMAX_SCHED_CTXS)
+	if (ret_sched_ctx == STARPU_NMAX_SCHED_CTXS)
 	{
-		worker_sched_ctx_id = _starpu_worker_get_first_free_sched_ctx(worker);
-		struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 		/* add context to worker */
-		worker->sched_ctx[worker_sched_ctx_id] = sched_ctx;
+		_starpu_sched_ctx_list_add(&worker->sched_ctx_list, sched_ctx_id);
 		worker->nsched_ctxs++;
 		worker->active_ctx = sched_ctx_id;
 	}
@@ -53,12 +47,16 @@ static void _starpu_worker_gets_into_ctx(unsigned sched_ctx_id, struct _starpu_w
 
 void _starpu_worker_gets_out_of_ctx(unsigned sched_ctx_id, struct _starpu_worker *worker)
 {
-	unsigned worker_sched_ctx_id = _starpu_worker_get_sched_ctx_id(worker, sched_ctx_id);
+	unsigned ret_sched_ctx = _starpu_sched_ctx_list_get_sched_ctx(worker->sched_ctx_list, sched_ctx_id);
 	/* remove context from worker */
-	if(worker->sched_ctx[worker_sched_ctx_id]->sched_policy && worker->sched_ctx[worker_sched_ctx_id]->sched_policy->remove_workers)
-		worker->sched_ctx[worker_sched_ctx_id]->sched_policy->remove_workers(sched_ctx_id, &worker->workerid, 1);
-	worker->sched_ctx[worker_sched_ctx_id] = NULL;
-	worker->nsched_ctxs--;
+	if(ret_sched_ctx != STARPU_NMAX_SCHED_CTXS)
+	{
+		struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+		if(sched_ctx && sched_ctx->sched_policy && sched_ctx->sched_policy->remove_workers)
+			sched_ctx->sched_policy->remove_workers(sched_ctx_id, &worker->workerid, 1);
+		_starpu_sched_ctx_list_remove(&worker->sched_ctx_list, sched_ctx_id);
+		worker->nsched_ctxs--;
+	}
 	return;
 }
 
@@ -316,7 +314,9 @@ struct _starpu_sched_ctx*  _starpu_create_sched_ctx(struct starpu_sched_policy *
 		for(i = 0; i < nworkers; i++)
 		{
 			struct _starpu_worker *worker = _starpu_get_worker_struct(i);
-			worker->sched_ctx[_starpu_worker_get_first_free_sched_ctx(worker)] = sched_ctx;
+			worker->sched_ctx_list = (struct _starpu_sched_ctx_list*)malloc(sizeof(struct _starpu_sched_ctx_list));
+			_starpu_sched_ctx_list_init(worker->sched_ctx_list);
+			_starpu_sched_ctx_list_add(&worker->sched_ctx_list, sched_ctx->id);
 			worker->nsched_ctxs++;
 		}
 	}
@@ -527,6 +527,9 @@ static void _starpu_delete_sched_ctx(struct _starpu_sched_ctx *sched_ctx)
 	STARPU_PTHREAD_MUTEX_DESTROY(&sched_ctx->empty_ctx_mutex);
 	sem_destroy(&sched_ctx->parallel_code_sem);
 	sched_ctx->id = STARPU_NMAX_SCHED_CTXS;
+#ifdef STARPU_HAVE_HWLOC
+	hwloc_bitmap_free(sched_ctx->hwloc_workers_set);
+#endif //STARPU_HAVE_HWLOC
 
 	struct _starpu_machine_config *config = _starpu_get_machine_config();
 	STARPU_PTHREAD_MUTEX_LOCK(&sched_ctx_manag);
@@ -714,31 +717,9 @@ void _starpu_init_all_sched_ctxs(struct _starpu_machine_config *config)
 	for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
 		config->sched_ctxs[i].id = STARPU_NMAX_SCHED_CTXS;
 
-	char* max_time_on_ctx = getenv("STARPU_MAX_TIME_ON_CTX");
-	if (max_time_on_ctx != NULL)
-		max_time_worker_on_ctx = atof(max_time_on_ctx);
-
 	return;
 }
 
-/* unused sched_ctx pointers of a worker are NULL */
-void _starpu_init_sched_ctx_for_worker(unsigned workerid)
-{
-	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
-	worker->sched_ctx = (struct _starpu_sched_ctx**)malloc(STARPU_NMAX_SCHED_CTXS * sizeof(struct _starpu_sched_ctx*));
-	unsigned i;
-	for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
-		worker->sched_ctx[i] = NULL;
-
-	return;
-}
-
-void _starpu_delete_sched_ctx_for_worker(unsigned workerid)
-{
-	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
-	free(worker->sched_ctx);
-}
-
 /* sched_ctx aren't necessarly one next to another */
 /* for eg when we remove one its place is free */
 /* when we add  new one we reuse its place */
@@ -753,34 +734,6 @@ static unsigned _starpu_get_first_free_sched_ctx(struct _starpu_machine_config *
 	return STARPU_NMAX_SCHED_CTXS;
 }
 
-static unsigned _starpu_worker_get_first_free_sched_ctx(struct _starpu_worker *worker)
-{
-	unsigned i;
-	for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
-		if(worker->sched_ctx[i] == NULL)
-			return i;
-	STARPU_ASSERT(0);
-	return STARPU_NMAX_SCHED_CTXS;
-}
-
-static unsigned _starpu_worker_get_sched_ctx_id(struct _starpu_worker *worker, unsigned sched_ctx_id)
-{
-	unsigned to_be_deleted = STARPU_NMAX_SCHED_CTXS;
-	unsigned i;
-	for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
-	{
-		if(worker->sched_ctx[i] != NULL)
-		{
-			if(worker->sched_ctx[i]->id == sched_ctx_id)
-				return i;
-			else if(worker->sched_ctx[i]->id == STARPU_NMAX_SCHED_CTXS)
-				to_be_deleted = i;
-		}
-	}
-
-	return to_be_deleted;
-}
-
 int _starpu_wait_for_all_tasks_of_sched_ctx(unsigned sched_ctx_id)
 {
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
@@ -1000,13 +953,6 @@ unsigned starpu_sched_ctx_get_nshared_workers(unsigned sched_ctx_id, unsigned sc
 
 unsigned starpu_sched_ctx_contains_worker(int workerid, unsigned sched_ctx_id)
 {
-/* 	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid); */
-/* 	unsigned i; */
-/* 	for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++) */
-/* 	{ */
-/* 		if(worker->sched_ctx[i] && worker->sched_ctx[i]->id == sched_ctx_id) */
-/* 			return 1; */
-/* 	} */
         struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 
         struct starpu_worker_collection *workers = sched_ctx->workers;
@@ -1071,8 +1017,6 @@ unsigned starpu_sched_ctx_overlapping_ctxs_on_worker(int workerid)
 
 unsigned starpu_sched_ctx_is_ctxs_turn(int workerid, unsigned sched_ctx_id)
 {
-	if(max_time_worker_on_ctx == -1.0) return 1;
-
 	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
 	return worker->active_ctx == sched_ctx_id;
 }
@@ -1083,10 +1027,10 @@ void starpu_sched_ctx_set_turn_to_other_ctx(int workerid, unsigned sched_ctx_id)
 
 	struct _starpu_sched_ctx *other_sched_ctx = NULL;
 	struct _starpu_sched_ctx *active_sched_ctx = NULL;
-	int i;
-	for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
+	struct _starpu_sched_ctx_list *l = NULL;
+        for (l = worker->sched_ctx_list; l; l = l->next)
 	{
-		other_sched_ctx = worker->sched_ctx[i];
+		other_sched_ctx = _starpu_get_sched_ctx_struct(l->sched_ctx);
 		if(other_sched_ctx != NULL && other_sched_ctx->id != STARPU_NMAX_SCHED_CTXS &&
 		   other_sched_ctx->id != 0 && other_sched_ctx->id != sched_ctx_id)
 		{
@@ -1102,11 +1046,6 @@ void starpu_sched_ctx_set_turn_to_other_ctx(int workerid, unsigned sched_ctx_id)
 	}
 }
 
-double starpu_sched_ctx_get_max_time_worker_on_ctx(void)
-{
-	return max_time_worker_on_ctx;
-}
-
 void starpu_sched_ctx_set_inheritor(unsigned sched_ctx_id, unsigned inheritor)
 {
 	STARPU_ASSERT(inheritor < STARPU_NMAX_SCHED_CTXS);
@@ -1277,7 +1216,6 @@ void _starpu_sched_ctx_rebind_thread_to_its_cpu(unsigned cpuid)
 static void _starpu_sched_ctx_get_workers_to_sleep(unsigned sched_ctx_id)
 {
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
-
 	struct starpu_worker_collection *workers = sched_ctx->workers;
 	struct starpu_sched_ctx_iterator it;
 	struct _starpu_worker *worker = NULL;
@@ -1294,7 +1232,7 @@ static void _starpu_sched_ctx_get_workers_to_sleep(unsigned sched_ctx_id)
 
 	while(workers->has_next(workers, &it))
 	{
-		int w = workers->get_next(workers, &it);
+		workers->get_next(workers, &it);
 		sem_wait(&sched_ctx->parallel_code_sem);
 	}
 	return;
@@ -1304,15 +1242,12 @@ void _starpu_sched_ctx_signal_worker_blocked(int workerid)
 {
 	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
 	struct _starpu_sched_ctx *sched_ctx = NULL;
-	unsigned i;
-	for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
+	struct _starpu_sched_ctx_list *l = NULL;
+	for (l = worker->sched_ctx_list; l; l = l->next)
 	{
-		if(worker->sched_ctx[i] != NULL && worker->sched_ctx[i]->id != STARPU_NMAX_SCHED_CTXS
-			&& worker->sched_ctx[i]->id != 0)
-		{
-			sched_ctx = worker->sched_ctx[i];
+		sched_ctx = _starpu_get_sched_ctx_struct(l->sched_ctx);
+		if(sched_ctx->id != 0)
 			sem_post(&sched_ctx->parallel_code_sem);
-		}
 	}	
 	return;
 }

+ 1 - 6
src/core/sched_ctx.h

@@ -24,6 +24,7 @@
 #include <common/barrier_counter.h>
 #include <profiling/profiling.h>
 #include <semaphore.h>
+#include "sched_ctx_list.h"
 
 #ifdef STARPU_HAVE_HWLOC
 #include <hwloc.h>
@@ -117,12 +118,6 @@ struct _starpu_machine_config;
 /* init sched_ctx_id of all contextes*/
 void _starpu_init_all_sched_ctxs(struct _starpu_machine_config *config);
 
-/* init the list of contexts of the worker */
-void _starpu_init_sched_ctx_for_worker(unsigned workerid);
-
-/* free the list of contexts of the worker */
-void _starpu_delete_sched_ctx_for_worker(unsigned workerid);
-
 /* allocate all structures belonging to a context */
 struct _starpu_sched_ctx*  _starpu_create_sched_ctx(struct starpu_sched_policy *policy, int *workerid, int nworkerids, unsigned is_init_sched, const char *sched_name);
 

+ 86 - 0
src/core/sched_ctx_list.c

@@ -0,0 +1,86 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2013  INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include "sched_ctx_list.h"
+
+void _starpu_sched_ctx_list_init(struct _starpu_sched_ctx_list *list)
+{
+	list->next = NULL;
+	list->sched_ctx = STARPU_NMAX_SCHED_CTXS;
+}
+
+void _starpu_sched_ctx_list_add(struct _starpu_sched_ctx_list **list, unsigned sched_ctx)
+{
+	if((*list)->sched_ctx == STARPU_NMAX_SCHED_CTXS)
+		(*list)->sched_ctx = sched_ctx;
+	else
+	{
+		struct _starpu_sched_ctx_list *l = (struct _starpu_sched_ctx_list*)malloc(sizeof(struct _starpu_sched_ctx_list));
+		l->sched_ctx = sched_ctx;
+		l->next = *list;
+		*list = l;
+	}
+}
+
+void _starpu_sched_ctx_list_remove(struct _starpu_sched_ctx_list **list, unsigned sched_ctx)
+{
+	struct _starpu_sched_ctx_list *l = NULL;
+	struct _starpu_sched_ctx_list *prev = NULL;
+	for (l = (*list); l; l = l->next)
+	{
+		if(l->sched_ctx == sched_ctx)
+			break;
+		prev = l;
+	}
+	struct _starpu_sched_ctx_list *next = NULL;
+	if(l->next)
+		next = l->next;
+	free(l);
+	l = NULL;
+	
+	if(next)
+	{
+		if(prev)
+			prev->next = next;
+		else
+			*list = next;
+	}
+}
+
+unsigned _starpu_sched_ctx_list_get_sched_ctx(struct _starpu_sched_ctx_list *list, unsigned sched_ctx)
+{
+	struct _starpu_sched_ctx_list *l = NULL;
+	for (l = list; l; l = l->next)
+	{
+		if(l->sched_ctx == sched_ctx)
+			return sched_ctx;
+	}
+	return STARPU_NMAX_SCHED_CTXS;
+}
+
+void _starpu_sched_ctx_list_delete(struct _starpu_sched_ctx_list **list)
+{
+	while(*list)
+	{
+		struct _starpu_sched_ctx_list *next = (*list)->next;
+		free(*list);
+		*list = NULL;
+		if(next)
+			*list = next;
+	}
+		
+}
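The fixed per-worker array of context pointers is replaced by this small singly linked list of context ids; a minimal usage sketch of the helpers (the context ids are illustrative, and the code is assumed to live inside the StarPU sources where these internal symbols are visible):

struct _starpu_sched_ctx_list *list =
	(struct _starpu_sched_ctx_list *) malloc(sizeof(*list));
_starpu_sched_ctx_list_init(list);	/* empty: sched_ctx == STARPU_NMAX_SCHED_CTXS */

_starpu_sched_ctx_list_add(&list, 3);	/* fills the head node */
_starpu_sched_ctx_list_add(&list, 7);	/* prepends a new node */

if (_starpu_sched_ctx_list_get_sched_ctx(list, 7) != STARPU_NMAX_SCHED_CTXS)
	;	/* context 7 is in the list */

_starpu_sched_ctx_list_remove(&list, 3);	/* unlink and free that node */
_starpu_sched_ctx_list_delete(&list);	/* free whatever remains */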

+ 32 - 0
src/core/sched_ctx_list.h

@@ -0,0 +1,32 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2013  INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __SCHED_CONTEXT_LIST_H__
+#define __SCHED_CONTEXT_LIST_H__
+
+struct _starpu_sched_ctx_list
+{
+	struct _starpu_sched_ctx_list *next;
+	unsigned sched_ctx;
+};
+
+void _starpu_sched_ctx_list_init(struct _starpu_sched_ctx_list *list);
+void _starpu_sched_ctx_list_add(struct _starpu_sched_ctx_list **list, unsigned sched_ctx);
+void _starpu_sched_ctx_list_remove(struct _starpu_sched_ctx_list **list, unsigned sched_ctx);
+unsigned _starpu_sched_ctx_list_get_sched_ctx(struct _starpu_sched_ctx_list *list, unsigned sched_ctx);
+void _starpu_sched_ctx_list_delete(struct _starpu_sched_ctx_list **list);
+
+#endif // __SCHED_CONTEXT_LIST_H__

+ 33 - 55
src/core/sched_policy.c

@@ -209,19 +209,19 @@ static int _starpu_push_task_on_specific_worker(struct starpu_task *task, int wo
 		starpu_prefetch_task_input_on_node(task, memory_node);
 
 	/* if we push a task on a specific worker, notify all the sched_ctxs the worker belongs to */
-	unsigned i;
 	struct _starpu_sched_ctx *sched_ctx;
-	for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
-	{
-		sched_ctx = worker->sched_ctx[i];
-		if (sched_ctx != NULL && sched_ctx->sched_policy != NULL && sched_ctx->sched_policy->push_task_notify)
+	struct _starpu_sched_ctx_list *l = NULL;
+        for (l = worker->sched_ctx_list; l; l = l->next)
+        {
+		sched_ctx = _starpu_get_sched_ctx_struct(l->sched_ctx);
+		if (sched_ctx->sched_policy != NULL && sched_ctx->sched_policy->push_task_notify)
 			sched_ctx->sched_policy->push_task_notify(task, workerid, sched_ctx->id);
 	}
 
 #ifdef STARPU_USE_SC_HYPERVISOR
 	starpu_sched_ctx_call_pushed_task_cb(workerid, task->sched_ctx);
 #endif //STARPU_USE_SC_HYPERVISOR
-
+	unsigned i;
 	if (is_basic_worker)
 	{
 		unsigned node = starpu_worker_get_memory_node(workerid);
@@ -326,7 +326,7 @@ int _starpu_push_task(struct _starpu_job *j)
 #ifdef HAVE_AYUDAME_H
 	if (AYU_event)
 	{
-		int id = -1;
+		intptr_t id = -1;
 		AYU_event(AYU_ADDTASKTOQUEUE, j->job_id, &id);
 	}
 #endif
@@ -536,40 +536,33 @@ struct starpu_task *_starpu_create_conversion_task_for_arch(starpu_data_handle_t
 }
 
 struct _starpu_sched_ctx* _get_next_sched_ctx_to_pop_into(struct _starpu_worker *worker)
-{
-	while(1)
+{	
+	struct _starpu_sched_ctx *sched_ctx, *good_sched_ctx = NULL;
+	unsigned smallest_counter =  worker->nsched_ctxs;
+	struct _starpu_sched_ctx_list *l = NULL;
+	for (l = worker->sched_ctx_list; l; l = l->next)
 	{
-		struct _starpu_sched_ctx *sched_ctx, *good_sched_ctx = NULL;
-		unsigned smallest_counter =  worker->nsched_ctxs;
-		unsigned i;
-		for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
+		sched_ctx = _starpu_get_sched_ctx_struct(l->sched_ctx);
+		if(worker->removed_from_ctx[sched_ctx->id])
+			return sched_ctx;
+		if(sched_ctx->pop_counter[worker->workerid] < worker->nsched_ctxs &&
+		   smallest_counter > sched_ctx->pop_counter[worker->workerid])
 		{
-			sched_ctx = worker->sched_ctx[i];
-			
-			if(sched_ctx != NULL && sched_ctx->id != STARPU_NMAX_SCHED_CTXS && worker->removed_from_ctx[sched_ctx->id])
-				return sched_ctx;
-			if(sched_ctx != NULL && sched_ctx->id != STARPU_NMAX_SCHED_CTXS &&
-			   sched_ctx->pop_counter[worker->workerid] < worker->nsched_ctxs &&
-			   smallest_counter > sched_ctx->pop_counter[worker->workerid])
-			{
-				good_sched_ctx = sched_ctx;
-				smallest_counter = sched_ctx->pop_counter[worker->workerid];
-			}
+			good_sched_ctx = sched_ctx;
+			smallest_counter = sched_ctx->pop_counter[worker->workerid];
 		}
-		
-		if(good_sched_ctx == NULL)
+	}
+	
+	if(good_sched_ctx == NULL)
+	{
+		for (l = worker->sched_ctx_list; l; l = l->next)
 		{
-			for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
-			{
-				sched_ctx = worker->sched_ctx[i];
-				if(sched_ctx != NULL && sched_ctx->id != STARPU_NMAX_SCHED_CTXS)
-					sched_ctx->pop_counter[worker->workerid] = 0;
-			}
-			
-			continue;
+			sched_ctx = _starpu_get_sched_ctx_struct(l->sched_ctx);
+			sched_ctx->pop_counter[worker->workerid] = 0;
 		}
-		return good_sched_ctx;
+		return _starpu_get_sched_ctx_struct(worker->sched_ctx_list->sched_ctx);
 	}
+	return good_sched_ctx;
 }
 
 struct starpu_task *_starpu_pop_task(struct _starpu_worker *worker)
@@ -592,46 +585,31 @@ pick:
 
 	/* get tasks from the stacks of the strategy */
 	if(!task)
-	{
-		struct _starpu_sched_ctx *sched_ctx;
-
-		//unsigned lucky_ctx = STARPU_NMAX_SCHED_CTXS;
-
-		int been_here[STARPU_NMAX_SCHED_CTXS];
-		int i;
-		for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
-			been_here[i] = 0;
+	{		
+		struct _starpu_sched_ctx *sched_ctx ;
 
-		while(!task)
+		if(!task)
 		{
 			if(worker->nsched_ctxs == 1)
 				sched_ctx = _starpu_get_initial_sched_ctx();
 			else
 				sched_ctx = _get_next_sched_ctx_to_pop_into(worker);
 
-			if(sched_ctx != NULL && sched_ctx->id != STARPU_NMAX_SCHED_CTXS)
+			if(sched_ctx && sched_ctx->id != STARPU_NMAX_SCHED_CTXS)
 			{
 				if (sched_ctx->sched_policy && sched_ctx->sched_policy->pop_task)
 				{
 					task = sched_ctx->sched_policy->pop_task(sched_ctx->id);
-					//lucky_ctx = sched_ctx->id;
 				}
 			}
 
-			if(!task && worker->removed_from_ctx[sched_ctx->id])
+			if(!task && sched_ctx && worker->removed_from_ctx[sched_ctx->id])
 			{
 				_starpu_worker_gets_out_of_ctx(sched_ctx->id, worker);
 				worker->removed_from_ctx[sched_ctx->id] = 0;
 			}
 
-			if((!task && sched_ctx->pop_counter[worker->workerid] == 0 && been_here[sched_ctx->id]) || worker->nsched_ctxs == 1)
-				break;
-
-
-			been_here[sched_ctx->id] = 1;
-
 			sched_ctx->pop_counter[worker->workerid]++;
-
 		}
 	  }
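With the context list in place, _get_next_sched_ctx_to_pop_into() no longer spins: each call picks the context with the smallest per-worker pop counter, and once every counter has reached nsched_ctxs the counters are reset and the head of the list is used. A reduced sketch of that rule, outside StarPU, with two contexts only:

/* illustrative reduction of the selection rule to two contexts */
static unsigned pick_ctx(unsigned pop_counter[2], unsigned nsched_ctxs)
{
	unsigned i, best = nsched_ctxs, smallest = nsched_ctxs;

	for (i = 0; i < 2; i++)
		if (pop_counter[i] < nsched_ctxs && pop_counter[i] < smallest)
		{
			best = i;
			smallest = pop_counter[i];
		}

	if (best == nsched_ctxs)	/* every context reached the cap */
	{
		pop_counter[0] = pop_counter[1] = 0;
		best = 0;		/* fall back to the head of the list */
	}
	/* the caller pops from this context and increments its counter */
	return best;
}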
 

+ 11 - 10
src/core/topology.c

@@ -370,9 +370,9 @@ _starpu_init_mic_node (struct _starpu_machine_config *config, int mic_idx,
 					    suffixes);
 
 	if (0 != mic_file_found) {
-		fprintf(stderr, "No MIC program specified, use the environment"
-			"variable STARPU_MIC_SINK_PROGRAM_NAME or the environment"
-			"or the field 'starpu_conf.mic_sink_program_path'"
+		fprintf(stderr, "No MIC program specified, use the environment\n"
+			"variable STARPU_MIC_SINK_PROGRAM_NAME or the field\n"
+			"'starpu_conf.mic_sink_program_path'\n"
+			"to define it.\n");
 
 		return -1;
@@ -641,7 +641,6 @@ _starpu_init_mic_config (struct _starpu_machine_config *config,
 		config->workers[worker_idx].devid = miccore_id;
 		config->workers[worker_idx].worker_mask = STARPU_MIC;
 		config->worker_mask |= STARPU_MIC;
-		_starpu_init_sched_ctx_for_worker(config->workers[worker_idx].workerid);
 	}
 
 	topology->nworkers += topology->nmiccores[mic_idx];
@@ -683,8 +682,8 @@ _starpu_init_mp_config (struct _starpu_machine_config *config,
 		if (0 == _starpu_init_mic_node (config, i, &handles[i], &process[i]))
 			topology->nmicdevices++;
 
-	i = 0;
-	for (; i < topology->nmicdevices; i++)
+	
+	for (i = 0; i < topology->nmicdevices; i++)
 		_starpu_init_mic_config (config, user_conf, i);
 #endif
 }
@@ -779,7 +778,6 @@ _starpu_init_machine_config (struct _starpu_machine_config *config, int no_mp_co
 		config->workers[worker_idx].devid = devid;
 		config->workers[worker_idx].perf_arch = arch;
 		config->workers[worker_idx].worker_mask = STARPU_CUDA;
-		_starpu_init_sched_ctx_for_worker(config->workers[worker_idx].workerid);
 		config->worker_mask |= STARPU_CUDA;
 
 		struct handle_entry *entry;
@@ -854,7 +852,6 @@ _starpu_init_machine_config (struct _starpu_machine_config *config, int no_mp_co
 		config->workers[worker_idx].devid = devid;
 		config->workers[worker_idx].perf_arch = arch;
 		config->workers[worker_idx].worker_mask = STARPU_OPENCL;
-		_starpu_init_sched_ctx_for_worker(config->workers[worker_idx].workerid);
 		config->worker_mask |= STARPU_OPENCL;
 	}
 
@@ -979,7 +976,6 @@ _starpu_init_machine_config (struct _starpu_machine_config *config, int no_mp_co
 		config->workers[worker_idx].devid = cpu;
 		config->workers[worker_idx].worker_mask = STARPU_CPU;
 		config->worker_mask |= STARPU_CPU;
-		_starpu_init_sched_ctx_for_worker(config->workers[worker_idx].workerid);
 	}
 
 	topology->nworkers += topology->ncpus;
@@ -1098,7 +1094,11 @@ _starpu_bind_thread_on_cpus (
 		}
 	}
 #else
-#warning no parallel worker CPU binding support
+#ifdef __GLIBC__
+	sched_setaffinity(0,sizeof(combined_worker->cpu_set),&combined_worker->cpu_set);
+#else
+#  warning no parallel worker CPU binding support
+#endif
 #endif
 }
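A minimal sketch of the glibc fallback added above, binding the calling thread to a set of CPUs when hwloc is not available; the helper name and CPU ids are illustrative:

#define _GNU_SOURCE
#include <sched.h>

static void bind_self_to_cpus(const int *cpuids, int ncpus)
{
	cpu_set_t set;
	int i;

	CPU_ZERO(&set);
	for (i = 0; i < ncpus; i++)
		CPU_SET(cpuids[i], &set);

	/* pid 0 means the calling thread, as in the patch above */
	sched_setaffinity(0, sizeof(set), &set);
}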
 
@@ -1264,6 +1264,7 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 
 		if (is_a_set_of_accelerators)
 		{
+/* TODO: this should be changed when we change device */
 			if (accelerator_bindid == -1)
 				accelerator_bindid = _starpu_get_next_bindid(config, preferred_binding, npreferred);
 

+ 11 - 9
src/core/workers.c

@@ -293,10 +293,15 @@ int starpu_combined_worker_can_execute_task(unsigned workerid, struct starpu_tas
 	}
 	else
 	{
-		if ((cl->type == STARPU_SPMD)
+		if ((cl->type == STARPU_SPMD) 
 #ifdef STARPU_HAVE_HWLOC
 				|| (cl->type == STARPU_FORKJOIN)
+#else
+#ifdef __GLIBC__
+				|| (cl->type == STARPU_FORKJOIN)
+#endif
 #endif
+
 				)
 		{
 			/* TODO we should add other types of constraints */
@@ -432,13 +437,8 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 #endif
 
 #ifdef HAVE_AYUDAME_H
-	if (AYU_event)
-	{
-		unsigned long n = nworkers;
-		AYU_event(AYU_INIT, 0, (void*) &n);
-	}
+	if (AYU_event) AYU_event(AYU_INIT, 0, NULL);
 #endif
-
 	for (worker = 0; worker < nworkers; worker++)
 	{
 		struct _starpu_worker *workerarg = &pconfig->workers[worker];
@@ -1102,7 +1102,7 @@ static void _starpu_terminate_workers(struct _starpu_machine_config *pconfig)
 
 out:
 		STARPU_ASSERT(starpu_task_list_empty(&worker->local_tasks));
-		_starpu_delete_sched_ctx_for_worker(workerid);
+		_starpu_sched_ctx_list_delete(&worker->sched_ctx_list);
 		_starpu_job_list_delete(worker->terminated_jobs);
 	}
 }
@@ -1422,13 +1422,15 @@ unsigned starpu_worker_is_combined_worker(int id)
 
 struct _starpu_sched_ctx *_starpu_get_sched_ctx_struct(unsigned id)
 {
-	if (id == STARPU_NMAX_SCHED_CTXS) return NULL;
+	if(id == STARPU_NMAX_SCHED_CTXS) return NULL;
 	return &config.sched_ctxs[id];
 }
 
 struct _starpu_combined_worker *_starpu_get_combined_worker_struct(unsigned id)
 {
 	unsigned basic_worker_count = starpu_worker_get_count();
+	
+	//_STARPU_DEBUG("basic_worker_count:%d\n",basic_worker_count);
 
 	STARPU_ASSERT(id >= basic_worker_count);
 	return &config.combined_workers[id - basic_worker_count];

+ 6 - 2
src/core/workers.h

@@ -29,7 +29,7 @@
 #include <core/topology.h>
 #include <core/errorcheck.h>
 #include <core/sched_ctx.h>
-
+#include <core/sched_ctx_list.h>
 #ifdef STARPU_HAVE_HWLOC
 #include <hwloc.h>
 #endif
@@ -84,7 +84,7 @@ struct _starpu_worker
 	char short_name[10];
 	unsigned run_by_starpu; /* Is this run by StarPU or directly by the application ? */
 
-	struct _starpu_sched_ctx **sched_ctx;
+	struct _starpu_sched_ctx_list *sched_ctx_list;
 	unsigned nsched_ctxs; /* the no of contexts a worker belongs to*/
 	struct _starpu_barrier_counter tasks_barrier; /* wait for the tasks submitted */
        
@@ -123,6 +123,10 @@ struct _starpu_combined_worker
 	int worker_size;
 	unsigned memory_node; /* which memory node is associated that worker to ? */
 	int combined_workerid[STARPU_NMAXWORKERS];
+#ifdef STARPU_USE_MP
+	int count;
+	pthread_mutex_t count_mutex;
+#endif
 
 #ifdef __GLIBC__
 	cpu_set_t cpu_set;

+ 5 - 2
src/datawizard/coherency.c

@@ -102,10 +102,13 @@ unsigned _starpu_select_src_node(starpu_data_handle_t handle, unsigned destinati
 			 * 	Other should be ok */
 
 			if (starpu_node_get_kind(i) == STARPU_CUDA_RAM ||
-			    starpu_node_get_kind(i) == STARPU_OPENCL_RAM)
+			    starpu_node_get_kind(i) == STARPU_OPENCL_RAM ||
+			    starpu_node_get_kind(i) == STARPU_MIC_RAM)
 				i_gpu = i;
 
-			if (starpu_node_get_kind(i) == STARPU_CPU_RAM)
+			if (starpu_node_get_kind(i) == STARPU_CPU_RAM || 
+			    starpu_node_get_kind(i) == STARPU_SCC_RAM ||
+			    starpu_node_get_kind(i) == STARPU_SCC_SHM)
 				i_ram = i;
 			if (starpu_node_get_kind(i) == STARPU_DISK_RAM)			
 				i_disk = i;

+ 1 - 0
src/datawizard/data_request.c

@@ -358,6 +358,7 @@ static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned m
 	/* perform the transfer */
 	/* the header of the data must be locked by the worker that submitted the request */
 
+
 	r->retval = _starpu_driver_copy_data_1_to_1(handle, src_replicate,
 						    dst_replicate, !(r_mode & STARPU_R), r, may_alloc);
 

+ 0 - 2
src/datawizard/filters.c

@@ -258,9 +258,7 @@ void starpu_data_partition(starpu_data_handle_t initial_handle, struct starpu_da
 		void *ptr;
 		ptr = starpu_data_handle_to_pointer(child, 0);
 		if (ptr != NULL)
-		{
 			_starpu_data_register_ram_pointer(child, ptr);
-		}
 	}
 	/* now let the header */
 	_starpu_spin_unlock(&initial_handle->header_lock);

+ 9 - 11
src/datawizard/reduction.c

@@ -86,18 +86,16 @@ void _starpu_redux_init_data_replicate(starpu_data_handle_t handle, struct _star
 #ifdef STARPU_USE_MIC
 	if (starpu_worker_get_type(workerid) == STARPU_MIC_WORKER)
 	{
-		const struct _starpu_mp_node *node = _starpu_mic_src_get_actual_thread_mp_node();
-		enum _starpu_mp_command answer;
-		void *arg = NULL;
-		int arg_size = 0;
-
-		// XXX: give the correct coreid.
+		struct _starpu_mp_node *node = _starpu_mic_src_get_actual_thread_mp_node();
+		int devid = _starpu_get_worker_struct(workerid)->devid;
+		void * arg;
+		int arg_size;
 		_starpu_src_common_execute_kernel(node,
-						  (void(*)(void))init_func, 0,
-						  &handle, &(replicate->data_interface), 1,
-						  NULL, 0);
-		answer = _starpu_mp_common_recv_command (node, &arg, &arg_size);
-		STARPU_ASSERT (answer == STARPU_EXECUTION_COMPLETED);
+						 (void(*)(void))init_func, devid,
+						 STARPU_SEQ, 0, 0, &handle, 
+						 &(replicate->data_interface), 1,
+						 NULL, 0);
+		_starpu_src_common_wait_completed_execution(node,devid,&arg,&arg_size);
 	}
 	else
 #endif

+ 60 - 13
src/debug/traces/starpu_fxt.c

@@ -17,6 +17,7 @@
 #include <starpu.h>
 #include <common/config.h>
 #include <common/uthash.h>
+#include <string.h>
 
 #ifdef STARPU_HAVE_POTI
 #include <poti.h>
@@ -1890,25 +1891,52 @@ void starpu_fxt_generate_trace(struct starpu_fxt_options *options)
 	options->nworkers = nworkers;
 }
 
-static FILE *out_data_total_trace_file;
+#define DATA_STR_MAX_SIZE 15
 
 struct parse_task
 {
 	unsigned exec_time;
 	unsigned data_total;
+	char *codelet_name;
 };
 
 static struct parse_task tasks[STARPU_NMAXWORKERS];
 
+struct starpu_data_trace_kernel
+{
+	UT_hash_handle hh;
+	char *name;
+	FILE *file;
+} *kernels;
+
 #define NANO_SEC_TO_MILI_SEC 0.000001
 
+static FILE *codelet_list;
+
 static void write_task(struct parse_task pt)
 {
+	struct starpu_data_trace_kernel *kernel;
+	char *codelet_name = pt.codelet_name;
+	HASH_FIND_STR(kernels, codelet_name, kernel);
+	//fprintf(stderr, "%p %p %s\n", kernel, kernels, codelet_name);
+	if(kernel == NULL)
+	{
+		kernel = malloc(sizeof(*kernel));
+		kernel->name = strdup(codelet_name);
+		//fprintf(stderr, "%s\n", kernel->name);
+		kernel->file = fopen(codelet_name, "w+");
+		if(!kernel->file)
+		{
+			perror("open failed :");
+			exit(-1);
+		}
+		HASH_ADD_STR(kernels, name, kernel); 
+		fprintf(codelet_list, "%s\n", codelet_name);
+	}
 	double time = pt.exec_time * NANO_SEC_TO_MILI_SEC;
-	fprintf(out_data_total_trace_file, "%lf %d\n", time, pt.data_total);
+	fprintf(kernel->file, "%lf %d\n", time, pt.data_total);
 }
 
-
 void starpu_fxt_write_data_trace(char *filename_in)
 {
 	int fd_in;
@@ -1927,17 +1955,21 @@ void starpu_fxt_write_data_trace(char *filename_in)
 	        exit(-1);
 	}
 
+	codelet_list = fopen("codelet_list", "w+");
+	if(!codelet_list)
+	{
+		perror("open failed :");
+		exit(-1);
+	}
+
 	fxt_blockev_t block;
 	block = fxt_blockev_enter(fut);
 
-	out_data_total_trace_file = fopen("data_total.txt", "w+");
-	if(!out_data_total_trace_file)
-        {
-                perror("open failed :");
-                exit(-1);
-        }
-
 	struct fxt_ev_64 ev;
+
+	unsigned workerid;
+	unsigned long has_name = 0;
+
 	while(1)
 	{
 		int ret = fxt_next_ev(block, FXT_EV_TYPE_64, (struct fxt_ev *)&ev);
@@ -1946,8 +1978,6 @@ void starpu_fxt_write_data_trace(char *filename_in)
 			break;
 		}
 		
-		unsigned workerid;
-
 		switch (ev.code)
 		{
 		case _STARPU_FUT_WORKER_INIT_START:
@@ -1957,6 +1987,9 @@ void starpu_fxt_write_data_trace(char *filename_in)
 		case _STARPU_FUT_START_CODELET_BODY:
 			workerid = find_worker_id(ev.param[2]);
 			tasks[workerid].exec_time = ev.time;
+			has_name = ev.param[3];
+			tasks[workerid].codelet_name = strdup(has_name ? (char *) &ev.param[4] : "unknown");
+			//fprintf(stderr, "start codelet :[%d][%s]\n", workerid, tasks[workerid].codelet_name);
 			break;
 			
 		case _STARPU_FUT_END_CODELET_BODY:
@@ -1985,11 +2018,25 @@ void starpu_fxt_write_data_trace(char *filename_in)
 	        exit(-1);
 	}
 	
-	if(fclose(out_data_total_trace_file))
+	if(fclose(codelet_list))
 	{
 		perror("close failed :");
 		exit(-1);
 	}
+	
+	struct starpu_data_trace_kernel *kernel, *tmp;	
 
+	HASH_ITER(hh, kernels, kernel, tmp)
+	{
+		if(fclose(kernel->file))
+		{ 
+			perror("close failed :");
+			exit(-1);
+		}
+		HASH_DEL(kernels, kernel);
+		free(kernel->name);
+		free(kernel);
+	}
+		
 }
 #endif // STARPU_USE_FXT
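starpu_fxt_write_data_trace() now produces a "codelet_list" index file plus one file per codelet name, each line holding "time(ms) total_data"; a small reader sketch, assuming the files sit in the current directory (names and buffer sizes are illustrative):

#include <stdio.h>

int main(void)
{
	char name[256];
	FILE *list = fopen("codelet_list", "r");
	if (!list)
		return 1;

	while (fscanf(list, "%255s", name) == 1)
	{
		double time_ms;
		unsigned data;
		FILE *f = fopen(name, "r");
		if (!f)
			continue;
		while (fscanf(f, "%lf %u", &time_ms, &data) == 2)
			printf("%s: %f ms, data %u\n", name, time_ms, data);
		fclose(f);
	}
	fclose(list);
	return 0;
}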

+ 93 - 21
src/drivers/driver_common/driver_common.c

@@ -95,7 +95,7 @@ void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j,
 	}
 
 	if (starpu_top)
-	  _starpu_top_task_ended(task,workerid,codelet_end);
+		_starpu_top_task_ended(task,workerid,codelet_end);
 
 	args->status = STATUS_UNKNOWN;
 }
@@ -129,9 +129,9 @@ void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_wo
 			profiling_info->workerid = workerid;
 
 			_starpu_worker_update_profiling_info_executing(workerid, &measured_ts, 1,
-				profiling_info->used_cycles,
-				profiling_info->stall_cycles,
-				profiling_info->power_consumed);
+								       profiling_info->used_cycles,
+								       profiling_info->stall_cycles,
+								       profiling_info->power_consumed);
 			updated =  1;
 		}
 
@@ -150,6 +150,29 @@ void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_wo
 	}
 }
 
+
+
+static void _starpu_worker_set_status_sleeping(int workerid)
+{
+	if (_starpu_worker_get_status(workerid) != STATUS_SLEEPING)
+	{
+		_STARPU_TRACE_WORKER_SLEEP_START;
+		_starpu_worker_restart_sleeping(workerid);
+		_starpu_worker_set_status(workerid, STATUS_SLEEPING);
+	}
+
+}
+
+static void _starpu_worker_set_status_wakeup(int workerid)
+{
+	if (_starpu_worker_get_status(workerid) == STATUS_SLEEPING)
+	{
+		_STARPU_TRACE_WORKER_SLEEP_END;
+		_starpu_worker_stop_sleeping(workerid);
+		_starpu_worker_set_status(workerid, STATUS_UNKNOWN);
+	}
+}
+
 /* Workers may block when there is no work to do at all. */
 struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int workerid, unsigned memnode)
 {
@@ -175,12 +198,7 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int wor
 		 * driver may go block just after the scheduler got a new task to be
 		 * executed, and thus hanging. */
 
-		if (_starpu_worker_get_status(workerid) != STATUS_SLEEPING)
-		{
-			_STARPU_TRACE_WORKER_SLEEP_START;
-			_starpu_worker_restart_sleeping(workerid);
-			_starpu_worker_set_status(workerid, STATUS_SLEEPING);
-		}
+		_starpu_worker_set_status_sleeping(workerid);
 
 		if (_starpu_worker_can_block(memnode))
 			STARPU_PTHREAD_COND_WAIT(&args->sched_cond, &args->sched_mutex);
@@ -206,11 +224,11 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int wor
 #ifdef STARPU_USE_SC_HYPERVISOR
 		struct _starpu_sched_ctx *sched_ctx = NULL;
 		struct starpu_sched_ctx_performance_counters *perf_counters = NULL;
-		int j;
-		for(j = 0; j < STARPU_NMAX_SCHED_CTXS; j++)
+		struct _starpu_sched_ctx_list *l = NULL;
+		for (l = args->sched_ctx_list; l; l = l->next)
 		{
-			sched_ctx = args->sched_ctx[j];
-			if(sched_ctx != NULL && sched_ctx->id != 0 && sched_ctx->id != STARPU_NMAX_SCHED_CTXS)
+			sched_ctx = _starpu_get_sched_ctx_struct(l->sched_ctx);
+			if(sched_ctx->id != 0)
 			{
 				perf_counters = sched_ctx->perf_counters;
 				if(perf_counters != NULL && perf_counters->notify_idle_cycle)
@@ -235,20 +253,74 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int wor
 		perf_counters->notify_idle_end(task->sched_ctx, args->workerid);
 #endif //STARPU_USE_SC_HYPERVISOR
 
-	if (_starpu_worker_get_status(workerid) == STATUS_SLEEPING)
-	{
-		_STARPU_TRACE_WORKER_SLEEP_END;
-		_starpu_worker_stop_sleeping(workerid);
-		_starpu_worker_set_status(workerid, STATUS_UNKNOWN);
-	}
+	_starpu_worker_set_status_wakeup(workerid);
 
 #ifdef HAVE_AYUDAME_H
 	if (AYU_event)
 	{
-		int id = workerid;
+		intptr_t id = workerid;
 		AYU_event(AYU_PRERUNTASK, _starpu_get_job_associated_to_task(task)->job_id, &id);
 	}
 #endif
 
 	return task;
 }
+
+
+int _starpu_get_multi_worker_task(struct _starpu_worker *workers, struct starpu_task ** tasks, int nworkers)
+{
+	int i, count = 0;
+	struct _starpu_job * j;
+	int is_parallel_task;
+	struct _starpu_combined_worker *combined_worker;
+	/*for each worker*/
+	for (i = 0; i < nworkers; i++)
+	{
+		/* if the worker is already executing a task, skip it */
+		if(workers[i].current_task)
+		{
+			tasks[i] = NULL;
+		}
+		/*else try to pop a task*/
+		else
+		{
+			STARPU_PTHREAD_MUTEX_LOCK(&workers[i].sched_mutex);
+			_starpu_set_local_worker_key(&workers[i]);
+			tasks[i] = _starpu_pop_task(&workers[i]);
+			STARPU_PTHREAD_MUTEX_UNLOCK(&workers[i].sched_mutex);
+			if(tasks[i] != NULL)
+			{
+				count ++;
+				j = _starpu_get_job_associated_to_task(tasks[i]);
+				is_parallel_task = (j->task_size > 1);
+				workers[i].current_task = j->task;
+				/* Get the rank in case it is a parallel task */
+				if (is_parallel_task)
+				{
+
+					STARPU_PTHREAD_MUTEX_LOCK(&j->sync_mutex);
+					workers[i].current_rank = j->active_task_alias_count++;
+					STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
+					
+					combined_worker = _starpu_get_combined_worker_struct(j->combined_workerid);
+					workers[i].combined_workerid = j->combined_workerid;
+					workers[i].worker_size = combined_worker->worker_size;
+				}
+				else
+				{
+					workers[i].combined_workerid = workers[i].workerid;
+					workers[i].worker_size = 1;
+					workers[i].current_rank = 0;
+				}
+
+				_starpu_worker_set_status_wakeup(workers[i].workerid);
+			}
+			else
+			{
+				_starpu_worker_set_status_sleeping(workers[i].workerid);
+			}
+		}
+	}
+	return count;
+}
+
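A sketch of how a master driver thread could use the new _starpu_get_multi_worker_task() helper to feed a set of workers; the loop, the execute_on_worker() call and the surrounding variables are illustrative, not the actual MIC/SCC driver code:

/* hypothetical driver loop; "workers" points to the nworkers
 * _starpu_worker structures this thread is responsible for */
struct starpu_task *tasks[STARPU_NMAXWORKERS];
int count, i;

while (driver_is_running)
{
	/* pop at most one task per idle worker; the helper also updates
	 * the sleeping/wakeup status of each worker */
	count = _starpu_get_multi_worker_task(workers, tasks, nworkers);
	if (count == 0)
		continue;

	for (i = 0; i < nworkers; i++)
		if (tasks[i])
			execute_on_worker(&workers[i], tasks[i]);	/* hypothetical */
}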

+ 1 - 1
src/drivers/driver_common/driver_common.h

@@ -32,5 +32,5 @@ void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_wo
 					struct timespec *codelet_start, struct timespec *codelet_end, int profiling);
 
 struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int workerid, unsigned memnode);
-
+int _starpu_get_multi_worker_task(struct _starpu_worker *workers, struct starpu_task ** tasks, int nworker);
 #endif // __DRIVER_COMMON_H__

+ 18 - 4
src/drivers/mic/driver_mic_common.c

@@ -19,7 +19,6 @@
 #include <drivers/mp_common/mp_common.h>
 #include <drivers/mic/driver_mic_common.h>
 
-
 void _starpu_mic_common_report_scif_error(const char *func, const char *file, const int line, const int status)
 {
 	const char *errormsg = strerror(status);
@@ -33,10 +32,25 @@ void _starpu_mic_common_report_scif_error(const char *func, const char *file, co
 
 void _starpu_mic_common_send(const struct _starpu_mp_node *node, void *msg, int len)
 {
-	if ((scif_send(node->mp_connection.mic_endpoint, msg, len, SCIF_SEND_BLOCK)) < 0)
+  if ((scif_send(node->mp_connection.mic_endpoint, msg, len, SCIF_SEND_BLOCK)) < 0)
 		STARPU_MP_COMMON_REPORT_ERROR(node, errno);
 }
 
+
+/* Tell whether the mic endpoint is ready:
+ * return 1 if a message is pending, 0 if no message has been received
+ */
+int _starpu_mic_common_recv_is_ready(const struct _starpu_mp_node *mp_node)
+{
+  struct scif_pollepd pollepd;
+  pollepd.epd = mp_node->mp_connection.mic_endpoint;
+  pollepd.events = SCIF_POLLIN;
+  pollepd.revents = 0;
+  return  scif_poll(&pollepd,1,0);
+	
+}
+
+
 /* Handles the error so the caller (which must be generic) doesn't have to
  * care about it.
  */
@@ -49,7 +63,7 @@ void _starpu_mic_common_recv(const struct _starpu_mp_node *node, void *msg, int
 
 /* Handles the error so the caller (which must be generic) doesn't have to
  * care about it.
  */
 void _starpu_mic_common_dt_send(const struct _starpu_mp_node *mp_node, void *msg, int len)
 {
 	if ((scif_send(mp_node->host_sink_dt_connection.mic_endpoint, msg, len, SCIF_SEND_BLOCK)) < 0)
@@ -114,7 +128,7 @@ void _starpu_mic_common_accept(scif_epd_t *endpoint, uint16_t port_number)
 	_STARPU_DEBUG("MIC accepting connection on %u...\n", port_number);
 	if ((scif_accept(init_epd, &portID, endpoint, SCIF_ACCEPT_SYNC)) < 0)
 		STARPU_MIC_COMMON_REPORT_SCIF_ERROR(errno);
-	_STARPU_DEBUG("done\n", init_epd);
+	_STARPU_DEBUG("done : %d\n", init_epd);
 
 	scif_close(init_epd);
 }
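_starpu_mic_common_recv_is_ready() wraps scif_poll() with a zero timeout, so callers can test for a pending message without blocking; a sketch of a sink-side loop using it (the loop, the msg variable and handle_command() are illustrative):

/* hypothetical sink-side polling loop */
while (node->is_running)
{
	if (_starpu_mic_common_recv_is_ready(node))
	{
		/* a command is pending: the blocking receive returns at once */
		_starpu_mic_common_recv(node, &msg, sizeof(msg));
		handle_command(&msg);	/* hypothetical */
	}
	else
	{
		/* nothing from the host yet: let the worker threads progress */
		sched_yield();
	}
}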

+ 8 - 5
src/drivers/mic/driver_mic_common.h

@@ -27,16 +27,17 @@
 #define STARPU_TO_MIC_ID(id) ((id) + 1)
 
 /* TODO: rather allocate ports on the host and pass them as parameters to the device process */
-#define STARPU_MIC_PORTS_BEGIN 1099
+// We use the last SCIF reserved port and add 1000 to be safe
+#define STARPU_MIC_PORTS_BEGIN SCIF_PORT_RSVD+1000
 
 #define STARPU_MIC_SOURCE_PORT_NUMBER STARPU_MIC_PORTS_BEGIN
-#define STARPU_MIC_SINK_PORT_NUMBER(id) ((id) + STARPU_MIC_PORTS_BEGIN + 1)
+#define STARPU_MIC_SINK_PORT_NUMBER(id) ((id) + STARPU_MIC_PORTS_BEGIN)
 
-#define STARPU_MIC_SOURCE_DT_PORT_NUMBER (STARPU_MAXMICDEVS + STARPU_MIC_PORTS_BEGIN + 1)
-#define STARPU_MIC_SINK_DT_PORT_NUMBER(id) ((id) + STARPU_MAXMICDEVS + STARPU_MIC_PORTS_BEGIN + 2)
+#define STARPU_MIC_SOURCE_DT_PORT_NUMBER (STARPU_MAXMICDEVS + STARPU_MIC_PORTS_BEGIN)
+#define STARPU_MIC_SINK_DT_PORT_NUMBER(id) ((id) + STARPU_MAXMICDEVS + STARPU_MIC_PORTS_BEGIN + 1)
 
 #define STARPU_MIC_SINK_SINK_DT_PORT_NUMBER(me, peer_id) \
-((me) * STARPU_MAXMICDEVS + (peer_id) +  2 * STARPU_MAXMICDEVS + STARPU_MIC_PORTS_BEGIN + 2)
+((me) * STARPU_MAXMICDEVS + (peer_id) +  2 * STARPU_MAXMICDEVS + STARPU_MIC_PORTS_BEGIN + 1)
 
 #define STARPU_MIC_PAGE_SIZE 0x1000
 #define STARPU_MIC_GET_PAGE_SIZE_MULTIPLE(size) \
@@ -53,6 +54,8 @@ struct _starpu_mic_free_command
 
 void _starpu_mic_common_report_scif_error(const char *func, const char *file, int line, const int status);
 
+int _starpu_mic_common_recv_is_ready(const struct _starpu_mp_node *mp_node);
+
 void _starpu_mic_common_send(const struct _starpu_mp_node *node, void *msg, int len);
 
 void _starpu_mic_common_recv(const struct _starpu_mp_node *node, void *msg, int len);

+ 88 - 14
src/drivers/mic/driver_mic_sink.c

@@ -16,12 +16,14 @@
 
 
 #include <errno.h>
+#include <dlfcn.h>
 
 #include <common/COISysInfo_common.h>
 
 #include <starpu.h>
 #include <drivers/mp_common/mp_common.h>
 #include <drivers/mp_common/sink_common.h>
+#include <datawizard/interfaces/data_interface.h>
 
 #include "driver_mic_common.h"
 #include "driver_mic_sink.h"
@@ -29,17 +31,27 @@
 /* Initialize the MIC sink, initializing connection to the source
  * and to the other devices (not implemented yet).
  */
-
 void _starpu_mic_sink_init(struct _starpu_mp_node *node)
 {
-	//unsigned int i;
-	
+	pthread_t self;
+	cpu_set_t cpuset;
+
+	/*Bind on the first core*/
+	self = pthread_self();
+	CPU_ZERO(&cpuset);
+	CPU_SET(241,&cpuset);
+	pthread_setaffinity_np(self,sizeof(cpu_set_t),&cpuset);
+
+
 	/* Initialize connection with the source */
 	_starpu_mic_common_accept(&node->mp_connection.mic_endpoint,
 					 STARPU_MIC_SOURCE_PORT_NUMBER);
 
 	_starpu_mic_common_accept(&node->host_sink_dt_connection.mic_endpoint,
 									 STARPU_MIC_SOURCE_DT_PORT_NUMBER);
+	
+	node->nb_cores = COISysGetHardwareThreadCount() - COISysGetHardwareThreadCount() / COISysGetCoreCount();
+	node->thread_table = malloc(sizeof(pthread_t)*node->nb_cores);
 
 	//node->sink_sink_dt_connections = malloc(node->nb_mp_sinks * sizeof(union _starpu_mp_connection));
 
@@ -54,11 +66,58 @@ void _starpu_mic_sink_init(struct _starpu_mp_node *node)
 	//								STARPU_MIC_SINK_SINK_DT_PORT_NUMBER(node->devid, i));
 }
 
-/* Deinitialize the MIC sink, close all the connections.
+/* Launch all workers on the mic
  */
+void _starpu_mic_sink_launch_workers(struct _starpu_mp_node *node)
+{
+	int i, ret;
+	struct arg_sink_thread * arg;
+	cpu_set_t cpuset;
+	pthread_attr_t attr;
+	pthread_t thread;
+	
+	/*for each core init the mutex, the task pointer and launch the thread */
+	for(i=0; i<node->nb_cores; i++)
+	{
+		//init the set
+		CPU_ZERO(&cpuset);
+		CPU_SET(i,&cpuset);
+
+		ret = pthread_attr_init(&attr);
+		STARPU_ASSERT(ret == 0);
+		ret = pthread_attr_setaffinity_np(&attr, sizeof(cpu_set_t), &cpuset);
+		STARPU_ASSERT(ret == 0);
+
+		/*prepare the argument for the thread*/
+		arg= malloc(sizeof(struct arg_sink_thread));
+		arg->coreid = i;
+		arg->node = node;
+		
+		ret = pthread_create(&thread, &attr, _starpu_sink_thread, arg);
+		STARPU_ASSERT(ret == 0);
+		((pthread_t *)node->thread_table)[i] = thread;
+	}
+
+}
 
+/* Deinitialize the MIC sink, close all the connections.
+ */
 void _starpu_mic_sink_deinit(struct _starpu_mp_node *node)
 {
+	
+	int i;
+	node->is_running = 0;
+	for(i=0; i<node->nb_cores; i++)
+	{
+		sem_post(&node->sem_run_table[i]);
+		pthread_join(((pthread_t *)node->thread_table)[i],NULL);
+	}
+
+	free(node->thread_table);
+
+	scif_close(node->host_sink_dt_connection.mic_endpoint);
+	scif_close(node->mp_connection.mic_endpoint);
+
 	//unsigned int i;
 
 	//for (i = 0; i < node->nb_mp_sinks; ++i)
@@ -69,14 +128,11 @@ void _starpu_mic_sink_deinit(struct _starpu_mp_node *node)
 
 	//free(node->sink_sink_dt_connections);
 
-	scif_close(node->host_sink_dt_connection.mic_endpoint);
-	scif_close(node->mp_connection.mic_endpoint);
 }
 
 /* Report an error which occurred when using a MIC device
  * and print this error in a human-readable style
  */
-
 void _starpu_mic_sink_report_error(const char *func, const char *file, const int line, const int status)
 {
 	const char *errormsg = strerror(status);
@@ -84,13 +140,6 @@ void _starpu_mic_sink_report_error(const char *func, const char *file, const int
 	STARPU_ASSERT(0);
 }
 
-/* Return the number of cores on the callee, a MIC device or Processor Xeon
- */
-unsigned int _starpu_mic_sink_get_nb_core(void)
-{
-	return (unsigned int) COISysGetCoreCount();
-}
-
 /* Allocate memory on the MIC.
  * Memory is register for remote direct access. */
 void _starpu_mic_sink_allocate(const struct _starpu_mp_node *mp_node, void *arg, int arg_size)
@@ -133,3 +182,28 @@ void _starpu_mic_sink_free(const struct _starpu_mp_node *mp_node STARPU_ATTRIBUT
 #endif
 	free(addr);
 }
+
+
+/* Bind the thread associated with coreid to the given set of cores
+ */
+void _starpu_mic_sink_bind_thread(const struct _starpu_mp_node *mp_node STARPU_ATTRIBUTE_UNUSED, int coreid, int * core_table, int nb_core)
+{
+	cpu_set_t cpuset;
+	int i;
+
+  	//init the set
+	CPU_ZERO(&cpuset);
+
+	//adding the core to the set
+	for(i=0;i<nb_core;i++)
+		CPU_SET(core_table[i],&cpuset);
+
+	pthread_setaffinity_np(((pthread_t*)mp_node->thread_table)[coreid],sizeof(cpu_set_t),&cpuset);
+}
+
+void (*_starpu_mic_sink_lookup (const struct _starpu_mp_node * node STARPU_ATTRIBUTE_UNUSED, char* func_name))(void)
+{
+	void *dl_handle = dlopen(NULL, RTLD_NOW);
+	return dlsym(dl_handle, func_name);
+}
+

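For reference, the worker-count formula used in _starpu_mic_sink_init() keeps one full core free for the communication thread. A small stand-alone illustration, assuming a 61-core Knights Corner with 4 hardware threads per core (the real values come from COISysGetHardwareThreadCount() and COISysGetCoreCount() at run time):

#include <stdio.h>

int main(void)
{
	int hw_threads = 244;   /* COISysGetHardwareThreadCount() on a 61-core card */
	int cores      = 61;    /* COISysGetCoreCount() */

	/* Same formula as in _starpu_mic_sink_init(): reserve the hardware
	 * threads of one core for the communication thread. */
	int nb_cores = hw_threads - hw_threads / cores;

	printf("%d worker threads\n", nb_cores);   /* prints 240 */
	return 0;
}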
+ 5 - 3
src/drivers/mic/driver_mic_sink.h

@@ -34,13 +34,15 @@
 void _starpu_mic_sink_report_error(const char *func, const char *file, const int line, const int status);
 
 void _starpu_mic_sink_init(struct _starpu_mp_node *node);
-
+void _starpu_mic_sink_launch_workers(struct _starpu_mp_node *node);
 void _starpu_mic_sink_deinit(struct _starpu_mp_node *node);
 
-unsigned int _starpu_mic_sink_get_nb_core(void);
-
 void _starpu_mic_sink_allocate(const struct _starpu_mp_node *mp_node, void *arg, int arg_size);
 void _starpu_mic_sink_free(const struct _starpu_mp_node *mp_node STARPU_ATTRIBUTE_UNUSED, void *arg, int arg_size);
+void _starpu_mic_sink_bind_thread(const struct _starpu_mp_node *mp_node STARPU_ATTRIBUTE_UNUSED, int coreid, int * core_table, int nb_core);
+
+void (*_starpu_mic_sink_lookup (const struct _starpu_mp_node * node STARPU_ATTRIBUTE_UNUSED,
+			char* func_name))(void);
 
 #endif /* STARPU_USE_MIC */
 

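The declarator of _starpu_mic_sink_lookup() above reads as "a function taking a node and a symbol name, and returning a pointer to a function taking no argument and returning void". An equivalent, easier-to-read spelling with a hypothetical typedef (not part of the patch) would be:

/* Hypothetical typedef, for illustration only. */
typedef void (*starpu_sink_kernel_t)(void);

starpu_sink_kernel_t _starpu_mic_sink_lookup(const struct _starpu_mp_node *node,
                                             char *func_name);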
+ 50 - 215
src/drivers/mic/driver_mic_source.c

@@ -73,7 +73,7 @@ starpu_pthread_mutex_t nb_mic_worker_init_mutex = PTHREAD_MUTEX_INITIALIZER;
 //	return config->workers[workerid].devid;
 //}
 
-const struct _starpu_mp_node *_starpu_mic_src_get_actual_thread_mp_node()
+struct _starpu_mp_node *_starpu_mic_src_get_actual_thread_mp_node()
 {
 	struct _starpu_worker *actual_worker = _starpu_get_local_worker_key();
 	STARPU_ASSERT(actual_worker);
@@ -110,96 +110,6 @@ void _starpu_mic_clear_kernels(void)
 	}
 }
 
-static int
-_starpu_mic_src_finalize_job (struct _starpu_job *j, struct _starpu_worker *worker)
-{
-	uint32_t mask = 0;
-	int profiling = starpu_profiling_status_get();
-	struct timespec codelet_end;
-
-	_starpu_driver_end_job(worker, j, worker->perf_arch, &codelet_end, 0,
-			       profiling);
-
-	_starpu_driver_update_job_feedback(j, worker, worker->perf_arch,
-					   &j->cl_start, &codelet_end,
-					   profiling);
-
-	_starpu_push_task_output (j, mask);
-
-	_starpu_handle_job_termination(j);
-
-	return 0;
-}
-
-static int
-_starpu_mic_src_process_completed_job (struct _starpu_worker_set *workerset)
-{
-	struct _starpu_mp_node *node = mic_nodes[workerset->workers[0].mp_nodeid];
-	enum _starpu_mp_command answer;
-	void *arg;
-	int arg_size;
-
-	answer = _starpu_mp_common_recv_command (node, &arg, &arg_size);
-	STARPU_ASSERT (answer == STARPU_EXECUTION_COMPLETED);
-
-	void *arg_ptr = arg;
-	int coreid;
-
-	coreid = *(int *) arg_ptr;
-	arg_ptr += sizeof (coreid); // Useless.
-
-	struct _starpu_worker *worker = &workerset->workers[coreid];
-	struct starpu_task *task = worker->current_task;
-	struct _starpu_job *j = _starpu_get_job_associated_to_task (task);
-
-	_starpu_mic_src_finalize_job (j, worker);
-
-	worker->current_task = NULL;
-
-	return 0;
-}
-
-
-static int _starpu_mic_src_execute_job(struct _starpu_job *j, struct _starpu_worker *args)
-{
-	int ret;
-	uint32_t mask = 0;
-
-	STARPU_ASSERT(j);
-	struct starpu_task *task = j->task;
-
-	//struct timespec codelet_end;
-
-	int profiling = starpu_profiling_status_get();
-	unsigned calibrate_model = 0;
-
-	STARPU_ASSERT(task);
-	struct starpu_codelet *cl = task->cl;
-	STARPU_ASSERT(cl);
-
-	if (cl->model && cl->model->benchmarking)
-		calibrate_model = 1;
-
-	ret = _starpu_fetch_task_input(j, mask);
-	if (ret != 0)
-	{
-		/* there was not enough memory, so the input of
-		 * the codelet cannot be fetched ... put the
-		 * codelet back, and try it later */
-		return -EAGAIN;
-	}
-
-
-	starpu_mic_kernel_t kernel = _starpu_mic_src_get_kernel_from_codelet(j->task->cl, j->nimpl);
-
-	_starpu_driver_start_job (args, j, &j->cl_start, 0, profiling);
-
-	_starpu_src_common_execute_kernel_from_task(mic_nodes[args->mp_nodeid],
-						    (void (*)(void)) kernel, args->devid, task);
-
-	return 0;
-}
-
 int _starpu_mic_src_register_kernel(starpu_mic_func_symbol_t *symbol, const char *func_name)
 {
 	unsigned int func_name_size = (strlen(func_name) + 1) * sizeof(char);
@@ -248,9 +158,11 @@ int _starpu_mic_src_register_kernel(starpu_mic_func_symbol_t *symbol, const char
 	return 0;
 }
 
+
 starpu_mic_kernel_t _starpu_mic_src_get_kernel(starpu_mic_func_symbol_t symbol)
 {
 	int workerid = starpu_worker_get_id();
+	
 	/* This function has to be called in the codelet only, by the thread
 	 * which will handle the task */
 	if (workerid < 0)
@@ -365,6 +277,43 @@ starpu_mic_kernel_t _starpu_mic_src_get_kernel_from_codelet(struct starpu_codele
 	return kernel;
 }
 
+
+
+void(* _starpu_mic_src_get_kernel_from_job(const struct _starpu_mp_node *node STARPU_ATTRIBUTE_UNUSED, struct _starpu_job *j))(void)
+{
+	starpu_mic_kernel_t kernel = NULL;
+
+	starpu_mic_func_t func = _starpu_task_get_mic_nth_implementation(j->task->cl, j->nimpl);
+	if (func)
+	{
+		/* We execute the function contained in the codelet, it must return a
+		 * pointer to the function to execute on the device, either specified
+		 * directly by the user or by a call to starpu_mic_get_func().
+		 */
+		kernel = func();
+	}
+	else
+	{
+		/* If the user did not define any starpu_mic_func_t in cl->mic_funcs, we try to use
+		 * cpu_funcs_name.
+		 */
+		char *func_name = _starpu_task_get_cpu_name_nth_implementation(j->task->cl, j->nimpl);
+		if (func_name)
+		{
+			starpu_mic_func_symbol_t symbol;
+
+			_starpu_mic_src_register_kernel(&symbol, func_name);
+
+			kernel = _starpu_mic_src_get_kernel(symbol);
+		}
+	}
+	STARPU_ASSERT(kernel);
+
+	return (void (*)(void))kernel;
+}
+
+
+
 /* Initialize the node structure describing the MIC source.
  */
 void _starpu_mic_src_init(struct _starpu_mp_node *node)
@@ -553,18 +502,20 @@ int _starpu_mic_request_is_complete(struct _starpu_mic_async_event *event)
 	return 1;
 }
 
+
+
 void *_starpu_mic_src_worker(void *arg)
 {
-	struct _starpu_worker_set *args = arg;
+	struct _starpu_worker_set *worker_set = arg;
 	/* As all workers of a set share common data, we just use the first
 	 * one for initializing the following. */
-	struct _starpu_worker *baseworker = &args->workers[0];
+	struct _starpu_worker *baseworker = &worker_set->workers[0];
 	struct _starpu_machine_config *config = baseworker->config;
 	unsigned baseworkerid = baseworker - config->workers;
 	unsigned mp_nodeid = baseworker->mp_nodeid;
 	unsigned i;
 
-	unsigned memnode = baseworker->memory_node;
+	/* unsigned memnode = baseworker->memory_node; */
 
 	_starpu_worker_init(baseworker, _STARPU_FUT_MIC_KEY);
 
@@ -582,131 +533,15 @@ void *_starpu_mic_src_worker(void *arg)
 	_STARPU_TRACE_WORKER_INIT_END;
 
 	/* tell the main thread that this one is ready */
-	STARPU_PTHREAD_MUTEX_LOCK(&args->mutex);
-	args->set_is_initialized = 1;
-	STARPU_PTHREAD_COND_SIGNAL(&args->ready_cond);
-	STARPU_PTHREAD_MUTEX_UNLOCK(&args->mutex);
-
+	STARPU_PTHREAD_MUTEX_LOCK(&worker_set->mutex);
+	worker_set->set_is_initialized = 1;
+	STARPU_PTHREAD_COND_SIGNAL(&worker_set->ready_cond);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&worker_set->mutex);
 
-	while (_starpu_machine_is_running())
-	{
-		int res;
-		struct starpu_task *task = NULL;
-		struct _starpu_job * j;
-		unsigned micworkerid = 0;
-
-		_STARPU_TRACE_START_PROGRESS(memnode);
-		_starpu_datawizard_progress(memnode, 1);
-		_STARPU_TRACE_END_PROGRESS(memnode);
-
-		STARPU_PTHREAD_MUTEX_LOCK(&baseworker->sched_mutex);
-
-		/* We pop tasklists of each worker in the set and process the
-		 * first non-empty list. */
-		for (micworkerid = 0 ; (micworkerid < args->nworkers) && (task == NULL); micworkerid++)
-		    task = _starpu_pop_task (&args->workers[micworkerid]);
-
-		if (task != NULL) {
-			micworkerid--;
-			goto task_found;
-		}
-
-#if 0 // XXX: synchronous execution for now
-		/* No task to submit, so we can poll the MIC device for
-		 * completed jobs. */
-		struct pollfd fd = {
-		    .fd = mic_nodes[baseworker->mp_nodeid]->mp_connection.mic_endpoint,
-		    .events = POLLIN
-		};
-
-		if (0 < poll (&fd, 1, 0)) {
-		    _starpu_mic_src_process_completed_job (args);
-		    goto restart_loop;
-		}
-#endif
-
-		/* At this point, there is really nothing to do for the thread
-		 * so we can block.
-		 * XXX: blocking drivers is in fact broken. DO NOT USE IT ! */
-		if (_starpu_worker_get_status(baseworkerid) != STATUS_SLEEPING)
-		{
-			_STARPU_TRACE_WORKER_SLEEP_START;
-			_starpu_worker_restart_sleeping(baseworkerid);
-			_starpu_worker_set_status(baseworkerid, STATUS_SLEEPING);
-		}
-
-		if (_starpu_worker_can_block(memnode))
-			STARPU_PTHREAD_COND_WAIT(&baseworker->sched_cond, &baseworker->sched_mutex);
-		else
-		{
-			if (_starpu_machine_is_running())
-				STARPU_UYIELD();
-		}
-
-		if (_starpu_worker_get_status(baseworkerid) == STATUS_SLEEPING)
-		{
-			_STARPU_TRACE_WORKER_SLEEP_END;
-			_starpu_worker_stop_sleeping(baseworkerid);
-			_starpu_worker_set_status(baseworkerid, STATUS_UNKNOWN);
-		}
-
-	restart_loop:
-		STARPU_PTHREAD_MUTEX_UNLOCK(&baseworker->sched_mutex);
-		continue;
-
-	task_found:
-		/* If the MIC core associated to `micworkerid' is already
-		 * processing a job, we push back this one in the worker task
-		 * list. */
-		STARPU_PTHREAD_MUTEX_UNLOCK(&baseworker->sched_mutex);
-
-		if (args->workers[micworkerid].current_task) {
-		    _starpu_push_task_to_workers(task);
-		    continue;
-		}
-
-		STARPU_ASSERT(task);
-		j = _starpu_get_job_associated_to_task(task);
-
-		/* can a MIC device do that task ? */
-		if (!_STARPU_MIC_MAY_PERFORM(j))
-		{
-			/* this isn't a mic task */
-			_starpu_push_task_to_workers(task);
-			continue;
-		}
-
-		args->workers[micworkerid].current_task = j->task;
-
-		res = _starpu_mic_src_execute_job (j, &args->workers[micworkerid]);
-
-		if (res)
-		{
-			switch (res)
-			{
-				case -EAGAIN:
-					_STARPU_DISP("ouch, Xeon Phi could not actually run task %p, putting it back...\n", task);
-					_starpu_push_task_to_workers(task);
-					STARPU_ABORT();
-					continue;
-				default:
-					STARPU_ASSERT(0);
-			}
-		}
-
-		/* XXX: synchronous execution for now */
-		_starpu_mic_src_process_completed_job (args);
-	}
+	_starpu_src_common_worker(worker_set, baseworkerid, mic_nodes[mp_nodeid]);
 
 	_STARPU_TRACE_WORKER_DEINIT_START;
 
-	_starpu_handle_all_pending_node_data_requests(memnode);
-
-	/* In case there remains some memory that was automatically
-	 * allocated by StarPU, we release it now. Note that data
-	 * coherency is not maintained anymore at that point ! */
-	_starpu_free_all_automatically_allocated_buffers(memnode);
-
 	_STARPU_TRACE_WORKER_DEINIT_END(_STARPU_FUT_CUDA_KEY);
 
 	return NULL;

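The kernel resolution implemented by _starpu_mic_src_get_kernel_from_job() above first calls the codelet's MIC function, and otherwise falls back to looking up the CPU function name on the device. A sketch of what this means on the application side (the field names follow the public starpu_codelet structure of this StarPU series; treat them as indicative):

void vector_scal_cpu(void *buffers[], void *cl_arg);

static struct starpu_codelet scal_cl =
{
	.cpu_funcs      = { vector_scal_cpu, NULL },
	/* No mic_funcs entry: the source driver will ask the sink to
	 * resolve "vector_scal_cpu" through its lookup hook instead. */
	.cpu_funcs_name = { "vector_scal_cpu", NULL },
	.nbuffers       = 1,
	.modes          = { STARPU_RW },
};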
+ 3 - 1
src/drivers/mic/driver_mic_source.h

@@ -24,6 +24,7 @@
 
 #include <source/COIProcess_source.h>
 #include <source/COIEngine_source.h>
+#include <core/workers.h>
 
 #include <drivers/mp_common/mp_common.h>
 
@@ -41,9 +42,10 @@ struct _starpu_mic_async_event *event;
 #define STARPU_MIC_SRC_REPORT_SCIF_ERROR(status) \
 	_starpu_mic_src_report_scif_error(__starpu_func__, __FILE__, __LINE__, status)
 
-const struct _starpu_mp_node *_starpu_mic_src_get_actual_thread_mp_node();
+struct _starpu_mp_node *_starpu_mic_src_get_actual_thread_mp_node();
 const struct _starpu_mp_node *_starpu_mic_src_get_mp_node_from_memory_node(int memory_node);
 
+void(* _starpu_mic_src_get_kernel_from_job(const struct _starpu_mp_node *node STARPU_ATTRIBUTE_UNUSED, struct _starpu_job *j))(void);
 int _starpu_mic_src_register_kernel(starpu_mic_func_symbol_t *symbol, const char *func_name);
 starpu_mic_kernel_t _starpu_mic_src_get_kernel(starpu_mic_func_symbol_t symbol);
 

+ 155 - 101
src/drivers/mp_common/mp_common.c

@@ -27,12 +27,14 @@
 #include <drivers/scc/driver_scc_source.h>
 #include <drivers/scc/driver_scc_sink.h>
 
+#include <common/list.h>
+
 /* Allocate and initialize the sink structure, when the function returns
  * all the pointer of functions are linked to the right ones.
  */
 struct _starpu_mp_node * STARPU_ATTRIBUTE_MALLOC
-    _starpu_mp_common_node_create(enum _starpu_mp_node_kind node_kind,
-				  int peer_id)
+_starpu_mp_common_node_create(enum _starpu_mp_node_kind node_kind,
+			      int peer_id)
 {
 	struct _starpu_mp_node *node;
 
@@ -45,111 +47,120 @@ struct _starpu_mp_node * STARPU_ATTRIBUTE_MALLOC
 	switch(node->kind)
 	{
 #ifdef STARPU_USE_MIC
-		case STARPU_MIC_SOURCE:
-			{
-				node->nb_mp_sinks = starpu_mic_worker_get_count();
-				node->devid = peer_id;
-
-				node->init = _starpu_mic_src_init;
-				node->deinit = _starpu_mic_src_deinit;
-				node->report_error = _starpu_mic_src_report_scif_error;
-
-				node->mp_send = _starpu_mic_common_send;
-				node->mp_recv = _starpu_mic_common_recv;
-				node->dt_send = _starpu_mic_common_dt_send;
-				node->dt_recv = _starpu_mic_common_dt_recv;
-
-				node->execute = NULL;
-				node->nbcores = NULL;
-				node->allocate = NULL;
-				node->free = NULL;
-
-				/* A source node is only working on one core,
-				 * there is no need for this function */
-				node->get_nb_core = NULL;
-			}
-			break;
-
-		case STARPU_MIC_SINK:
-			{
-				node->devid = atoi(getenv("DEVID"));;
-				node->nb_mp_sinks = atoi(getenv("NB_MIC"));
-
-				node->init = _starpu_mic_sink_init;
-				node->deinit = _starpu_mic_sink_deinit;
-				node->report_error = _starpu_mic_sink_report_error;
-
-				node->mp_send = _starpu_mic_common_send;
-				node->mp_recv = _starpu_mic_common_recv;
-				node->dt_send = _starpu_mic_common_dt_send;
-				node->dt_recv = _starpu_mic_common_dt_recv;
-
-				node->execute = _starpu_sink_common_execute;
-				node->nbcores = _starpu_sink_nbcores;
-				node->allocate = _starpu_mic_sink_allocate;
-				node->free = _starpu_mic_sink_free;
-
-				node->get_nb_core = _starpu_mic_sink_get_nb_core;
-			}
-			break;
+	case STARPU_MIC_SOURCE:
+	{
+		node->nb_mp_sinks = starpu_mic_worker_get_count();
+		node->devid = peer_id;
+
+		node->init = _starpu_mic_src_init;
+		node->launch_workers= NULL;
+		node->deinit = _starpu_mic_src_deinit;
+		node->report_error = _starpu_mic_src_report_scif_error;
+
+		node->mp_recv_is_ready = _starpu_mic_common_recv_is_ready;
+		node->mp_send = _starpu_mic_common_send;
+		node->mp_recv = _starpu_mic_common_recv;
+		node->dt_send = _starpu_mic_common_dt_send;
+		node->dt_recv = _starpu_mic_common_dt_recv;
+
+		node->get_kernel_from_job =_starpu_mic_src_get_kernel_from_job;
+		node->lookup = NULL;
+		node->bind_thread = NULL;
+		node->execute = NULL;
+		node->allocate = NULL;
+		node->free = NULL;
+	}
+	break;
+
+	case STARPU_MIC_SINK:
+	{
+		node->devid = atoi(getenv("DEVID"));;
+		node->nb_mp_sinks = atoi(getenv("NB_MIC"));
+
+		node->init = _starpu_mic_sink_init;
+		node->launch_workers = _starpu_mic_sink_launch_workers;
+		node->deinit = _starpu_mic_sink_deinit;
+		node->report_error = _starpu_mic_sink_report_error;
+
+		node->mp_recv_is_ready = _starpu_mic_common_recv_is_ready;
+		node->mp_send = _starpu_mic_common_send;
+		node->mp_recv = _starpu_mic_common_recv;
+		node->dt_send = _starpu_mic_common_dt_send;
+		node->dt_recv = _starpu_mic_common_dt_recv;
+
+		node->get_kernel_from_job = NULL;
+		node->lookup = _starpu_mic_sink_lookup;
+		node->bind_thread = _starpu_mic_sink_bind_thread;
+		node->execute = _starpu_sink_common_execute;
+		node->allocate = _starpu_mic_sink_allocate;
+		node->free = _starpu_mic_sink_free;
+
+	}
+	break;
 #endif /* STARPU_USE_MIC */
 
 #ifdef STARPU_USE_SCC
-		case STARPU_SCC_SOURCE:
-			{
-				node->init = _starpu_scc_src_init;
-				node->deinit = NULL;
-				node->report_error = _starpu_scc_common_report_rcce_error;
-
-				node->mp_send = _starpu_scc_common_send;
-				node->mp_recv = _starpu_scc_common_recv;
-				node->dt_send = _starpu_scc_common_send;
-				node->dt_recv = _starpu_scc_common_recv;
-				node->dt_send_to_device = NULL;
-				node->dt_recv_from_device = NULL;
-
-				node->execute = NULL;
-				node->allocate = NULL;
-				node->free = NULL;
-
-				node->get_nb_core = NULL;
-			}
-			break;
-
-		case STARPU_SCC_SINK:
-			{
-				node->init = _starpu_scc_sink_init;
-				node->deinit = _starpu_scc_sink_deinit;
-				node->report_error = _starpu_scc_common_report_rcce_error;
-
-				node->mp_send = _starpu_scc_common_send;
-				node->mp_recv = _starpu_scc_common_recv;
-				node->dt_send = _starpu_scc_common_send;
-				node->dt_recv = _starpu_scc_common_recv;
-				node->dt_send_to_device = _starpu_scc_sink_send_to_device;
-				node->dt_recv_from_device = _starpu_scc_sink_recv_from_device;
-
-				node->execute = _starpu_scc_sink_execute;
-				node->allocate = _starpu_sink_common_allocate;
-				node->free = _starpu_sink_common_free;
-
-				node->get_nb_core = NULL;
-			}
-			break;
+	case STARPU_SCC_SOURCE:
+	{
+		node->init = _starpu_scc_src_init;
+		node->deinit = NULL;
+		node->report_error = _starpu_scc_common_report_rcce_error;
+				
+		node->mp_recv_is_ready = _starpu_scc_common_recv_is_ready;
+		node->mp_send = _starpu_scc_common_send;
+		node->mp_recv = _starpu_scc_common_recv;
+		node->dt_send = _starpu_scc_common_send;
+		node->dt_recv = _starpu_scc_common_recv;
+		node->dt_send_to_device = NULL;
+		node->dt_recv_from_device = NULL;
+
+		node->get_kernel_from_job =_starpu_scc_src_get_kernel_from_job;
+		node->lookup = NULL;
+		node->bind_thread = NULL;
+		node->execute = NULL;
+		node->allocate = NULL;
+		node->free = NULL;
+	}
+	break;
+
+	case STARPU_SCC_SINK:
+	{
+		node->init = _starpu_scc_sink_init;
+		node->launch_workers = _starpu_scc_sink_launch_workers;
+		node->deinit = _starpu_scc_sink_deinit;
+		node->report_error = _starpu_scc_common_report_rcce_error;
+
+		node->mp_recv_is_ready = _starpu_scc_common_recv_is_ready;
+		node->mp_send = _starpu_scc_common_send;
+		node->mp_recv = _starpu_scc_common_recv;
+		node->dt_send = _starpu_scc_common_send;
+		node->dt_recv = _starpu_scc_common_recv;
+		node->dt_send_to_device = _starpu_scc_sink_send_to_device;
+		node->dt_recv_from_device = _starpu_scc_sink_recv_from_device;
+
+		node->get_kernel_from_job = NULL;
+		node->lookup = _starpu_scc_sink_lookup;
+		node->bind_thread = _starpu_scc_sink_bind_thread;
+		node->execute = _starpu_scc_sink_execute;
+		node->allocate = _starpu_sink_common_allocate;
+		node->free = _starpu_sink_common_free;
+	}
+	break;
 #endif /* STARPU_USE_SCC */
 
 #ifdef STARPU_USE_MPI
-		case STARPU_MPI_SOURCE:
-			STARPU_ABORT();
-			break;
+	case STARPU_MPI_SOURCE:
+		STARPU_ABORT();
+		break;
 
-		case STARPU_MPI_SINK:
-			STARPU_ABORT();
-			break;
+	case STARPU_MPI_SINK:
+		STARPU_ABORT();
+		break;
 #endif /* STARPU_USE_MPI */
 
-		default:
-			STARPU_ASSERT(0);
+	default:
+		STARPU_ASSERT(0);
 	}
 
 	/* Let's allocate the buffer, we want it to be big enough to contain
@@ -159,15 +170,60 @@ struct _starpu_mp_node * STARPU_ATTRIBUTE_MALLOC
 	if (node->init)
 		node->init(node);
 
+	node->message_queue = mp_message_list_new();
+	STARPU_PTHREAD_MUTEX_INIT(&node->message_queue_mutex,NULL);
+
+	/* If the node is a sink then we must initialize some fields */
+	if(node->kind == STARPU_MIC_SINK || node->kind == STARPU_SCC_SINK)
+	{
+		int i;
+		node->is_running = 1;
+		node->run_table = malloc(sizeof(struct mp_task *)*node->nb_cores);
+		node->sem_run_table = malloc(sizeof(sem_t)*node->nb_cores);
+
+		for(i=0; i<node->nb_cores; i++)
+		{
+			node->run_table[i] = NULL;
+			sem_init(&node->sem_run_table[i],0,0);
+		}
+		node->barrier_list = mp_barrier_list_new();
+		STARPU_PTHREAD_MUTEX_INIT(&node->barrier_mutex,NULL);
+
+		STARPU_PTHREAD_BARRIER_INIT(&node->init_completed_barrier, NULL, node->nb_cores+1);
+
+		node->launch_workers(node);
+	}	
+
+
 	return node;
 }
 
 /* Deinitialize the sink structure and release the structure */
-
 void _starpu_mp_common_node_destroy(struct _starpu_mp_node *node)
 {
 	if (node->deinit)
 		node->deinit(node);
+		
+	mp_message_list_delete(node->message_queue);
+	STARPU_PTHREAD_MUTEX_DESTROY(&node->message_queue_mutex);
+
+	/* If the node is a sink then we must destroy some fields */
+	if(node->kind == STARPU_MIC_SINK || node->kind == STARPU_SCC_SINK)
+	{
+		int i;
+		for(i=0; i<node->nb_cores; i++)
+		{
+			sem_destroy(&node->sem_run_table[i]);
+		}
+
+		free(node->run_table);
+		free(node->sem_run_table);
+
+		mp_barrier_list_delete(node->barrier_list);
+
+		STARPU_PTHREAD_MUTEX_DESTROY(&node->barrier_mutex);
+		STARPU_PTHREAD_BARRIER_DESTROY(&node->init_completed_barrier);
+	}
 
 	free(node->buffer);
 
@@ -175,7 +231,6 @@ void _starpu_mp_common_node_destroy(struct _starpu_mp_node *node)
 }
 
 /* Send COMMAND to RECIPIENT, along with ARG if ARG_SIZE is non-zero */
-
 void _starpu_mp_common_send_command(const struct _starpu_mp_node *node,
 				    const enum _starpu_mp_command command,
 				    void *arg, int arg_size)
@@ -202,7 +257,6 @@ void _starpu_mp_common_send_command(const struct _starpu_mp_node *node,
  * However, the data pointed by arg shouldn't be relied on after a new call to
  * STARPU_MP_COMMON_RECV_COMMAND as it might corrupt it.
  */
-
 enum _starpu_mp_command _starpu_mp_common_recv_command(const struct _starpu_mp_node *node,
 						       void **arg, int *arg_size)
 {

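One detail worth noting in the sink branch above: init_completed_barrier is created for nb_cores + 1 participants because every worker thread launched by launch_workers() waits on it once at start-up (see _starpu_sink_thread below), and the communication thread itself joins it after the worker descriptions have been received (_starpu_sink_common_recv_workers). A reduced sketch of the hand-shake, with the STARPU_PTHREAD_BARRIER_* helpers assumed to behave like their pthread counterparts:

/* Sketch: nb_cores worker threads plus the communication thread. */
STARPU_PTHREAD_BARRIER_INIT(&node->init_completed_barrier, NULL,
                            node->nb_cores + 1);

/* ... each worker thread, at the top of _starpu_sink_thread() ... */
STARPU_PTHREAD_BARRIER_WAIT(&node->init_completed_barrier);

/* ... communication thread, once the STARPU_SYNC_WORKERS message has
 * been processed in _starpu_sink_common_recv_workers() ... */
STARPU_PTHREAD_BARRIER_WAIT(&node->init_completed_barrier);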
+ 88 - 29
src/drivers/mp_common/mp_common.h

@@ -18,10 +18,14 @@
 #define __MP_COMMON_H__
 
 #include <pthread.h>
+#include <semaphore.h>
 
 #include <starpu.h>
 #include <common/config.h>
-
+#include <common/list.h>
+#include <common/barrier.h>
+#include <common/thread.h>
+#include <datawizard/interfaces/data_interface.h>
 
 #ifdef STARPU_USE_MP
 
@@ -34,31 +38,31 @@
 #define STARPU_MP_SRC_NODE 0
 #define STARPU_MP_SINK_NODE(a) ((a) + 1)
 
-#define STARPU_MP_COMMON_REPORT_ERROR(node, status) \
+#define STARPU_MP_COMMON_REPORT_ERROR(node, status)			\
 	(node)->report_error(__starpu_func__, __FILE__, __LINE__, (status))
-
-
 enum _starpu_mp_command
 {
-	STARPU_EXIT = 0x00,
-	STARPU_EXECUTE = 0x01,
-	STARPU_ERROR_EXECUTE = 0x02,
-	STARPU_LOOKUP = 0X03,
-	STARPU_ANSWER_LOOKUP = 0X04,
-	STARPU_ERROR_LOOKUP = 0X05,
-	STARPU_ALLOCATE = 0x06,
-	STARPU_ANSWER_ALLOCATE = 0x07,
-	STARPU_ERROR_ALLOCATE = 0x08,
-	STARPU_FREE = 0x09,
-	STARPU_RECV_FROM_HOST = 0x10,
-	STARPU_SEND_TO_HOST = 0x11,
-	STARPU_RECV_FROM_SINK = 0x12,
-	STARPU_SEND_TO_SINK = 0x13,
-	STARPU_TRANSFER_COMPLETE = 0x14,
-	STARPU_SINK_NBCORES = 0x15,
-	STARPU_ANSWER_SINK_NBCORES = 0x16,
-	STARPU_EXECUTION_SUBMITTED = 0x17,
-	STARPU_EXECUTION_COMPLETED = 0x18
+	STARPU_EXIT,
+	STARPU_EXECUTE,
+	STARPU_ERROR_EXECUTE,
+	STARPU_LOOKUP,
+	STARPU_ANSWER_LOOKUP,
+	STARPU_ERROR_LOOKUP,
+	STARPU_ALLOCATE,
+	STARPU_ANSWER_ALLOCATE,
+	STARPU_ERROR_ALLOCATE,
+	STARPU_FREE,
+	STARPU_RECV_FROM_HOST,
+	STARPU_SEND_TO_HOST,
+	STARPU_RECV_FROM_SINK,
+	STARPU_SEND_TO_SINK,
+	STARPU_TRANSFER_COMPLETE,
+	STARPU_SINK_NBCORES,
+	STARPU_ANSWER_SINK_NBCORES,
+	STARPU_EXECUTION_SUBMITTED,
+	STARPU_EXECUTION_COMPLETED,
+	STARPU_PRE_EXECUTION,
+	STARPU_SYNC_WORKERS,
 };
 
 enum _starpu_mp_node_kind
@@ -96,12 +100,47 @@ struct _starpu_mp_transfer_command_to_device
 	void *addr;
 };
 
+LIST_TYPE(mp_barrier,
+		int id;
+		_starpu_pthread_barrier_t before_work_barrier;
+		_starpu_pthread_barrier_t after_work_barrier;
+	 );
+
+LIST_TYPE(mp_message,
+		enum _starpu_mp_command type;
+		char buffer[BUFFER_SIZE];
+		int size;
+	 );
+
+struct mp_task 
+{
+	void (*kernel)(void **, void *);
+	void * interfaces[STARPU_NMAXBUFS]; 
+	unsigned nb_interfaces;
+	void *cl_arg;
+	unsigned coreid;
+	enum starpu_codelet_type type;
+	int is_parallel_task;
+	int combined_workerid;
+ 	struct mp_barrier* mp_barrier;
+};
+
+
 /* Message-passing working node, whether source
  * or sink */
 struct _starpu_mp_node
 {
 	enum _starpu_mp_node_kind kind;
 
+	int baseworkerid;
+
+	/* The number of cores on the device.
+	 * Must be initialized by the init function. */
+	int nb_cores;
+
+	/* Is StarPU running? */
+	int is_running;
+
 	/* Buffer used for scif data transfers, allocated
 	 * during node initialization.
 	 * Size : BUFFER_SIZE */
@@ -117,7 +156,7 @@ struct _starpu_mp_node
 	int devid;
 
 	/* Only MIC use this for now !!
-	*  Is the number ok MIC on the system. */
+	 *  Is the number of MICs on the system. */
 	unsigned int nb_mp_sinks;
 
 	/* Connection used for command passing between the host thread and the
@@ -138,12 +177,32 @@ struct _starpu_mp_node
 	 *  - sink_sink_dt_connections[j] is not initialized for the sink number j. */
 	union _starpu_mp_connection *sink_sink_dt_connections;
 
+	/* Barrier used to wait until all the worker threads have been initialized */
+	_starpu_pthread_barrier_t init_completed_barrier;
+
+	/* Table storing the pthread_t of the worker threads */
+	void* thread_table;
+
+	/* List where worker threads push messages to be sent to the source node */
+	struct mp_message_list* message_queue;
+	starpu_pthread_mutex_t message_queue_mutex;
+
+	/* List of barriers for combined workers */
+	struct mp_barrier_list* barrier_list;
+	starpu_pthread_mutex_t barrier_mutex;
+
+	/* Table from which each worker thread picks its task */
+	struct mp_task ** run_table;
+	sem_t * sem_run_table;
+
 	/* Node general functions */
 	void (*init)(struct _starpu_mp_node *node);
+	void (*launch_workers)(struct _starpu_mp_node *node);
 	void (*deinit)(struct _starpu_mp_node *node);
 	void (*report_error)(const char *, const char *, const int, const int);
 
 	/* Message passing */
+	int (*mp_recv_is_ready)(const struct _starpu_mp_node *);
 	void (*mp_send)(const struct _starpu_mp_node *, void *, int);
 	void (*mp_recv)(const struct _starpu_mp_node *, void *, int);
 
@@ -153,12 +212,12 @@ struct _starpu_mp_node
 	void (*dt_send_to_device)(const struct _starpu_mp_node *, int, void *, int);
 	void (*dt_recv_from_device)(const struct _starpu_mp_node *, int, void *, int);
 
-	void (*execute)(const struct _starpu_mp_node *, void *, int);
-	void (*nbcores)(const struct _starpu_mp_node *);
+	void (*(*get_kernel_from_job)(const struct _starpu_mp_node *,struct _starpu_job *))(void);
+	void (*(*lookup)(const struct _starpu_mp_node *, char* ))(void);
+	void (*bind_thread)(const struct _starpu_mp_node *, int,int *,int);
+	void (*execute)(struct _starpu_mp_node *, void *, int);
 	void (*allocate)(const struct _starpu_mp_node *, void *, int);
 	void (*free)(const struct _starpu_mp_node *, void *, int);
-
-	unsigned int (*get_nb_core)(void);
 };
 
 struct _starpu_mp_node * _starpu_mp_common_node_create(enum _starpu_mp_node_kind node_kind, int peer_devid);
@@ -170,7 +229,7 @@ void _starpu_mp_common_send_command(const struct _starpu_mp_node *node,
 				    void *arg, int arg_size);
 
 enum _starpu_mp_command _starpu_mp_common_recv_command(const struct _starpu_mp_node *node,
-						    void **arg, int *arg_size);
+						       void **arg, int *arg_size);
 
 
 #endif /* STARPU_USE_MP */

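For readers unfamiliar with the LIST_TYPE() macro from common/list.h: it generates the constructors and list operations used throughout this patch. A short usage sketch restricted to the helpers that actually appear in the new code (the real code additionally holds message_queue_mutex around these operations):

/* Producer side (sink worker thread). */
struct mp_message *m = mp_message_new();
m->type = STARPU_EXECUTION_COMPLETED;
m->size = sizeof(int);
mp_message_list_push_front(node->message_queue, m);

/* Consumer side (communication thread). */
if (!mp_message_list_empty(node->message_queue))
{
	struct mp_message *out = mp_message_list_pop_back(node->message_queue);
	/* ... send the corresponding command to the host ... */
	mp_message_delete(out);
}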
+ 487 - 142
src/drivers/mp_common/sink_common.c

@@ -15,20 +15,21 @@
  */
 
 
-#include <dlfcn.h>
-
 #include <starpu.h>
 #include <common/config.h>
 #include <common/utils.h>
 #include <drivers/mp_common/mp_common.h>
 #include <datawizard/interfaces/data_interface.h>
-
+#include <common/barrier.h>
+#include <core/workers.h>
+#include <common/barrier_counter.h>
 #ifdef STARPU_USE_MIC
 #include <common/COISysInfo_common.h>
 #endif
 
 #include "sink_common.h"
 
+
 /* Return the sink kind of the running process, based on the value of the
  * STARPU_SINK environment variable.
  * If there is no valid value retrieved, return STARPU_INVALID_KIND
@@ -50,90 +51,25 @@ static enum _starpu_mp_node_kind _starpu_sink_common_get_kind(void)
 		return STARPU_INVALID_KIND;
 }
 
-void
-_starpu_sink_nbcores (const struct _starpu_mp_node *node)
-{
-    // Process packet received from `_starpu_src_common_sink_cores'.
-    int nbcores = 1;
-
-#ifdef STARPU_USE_MIC
-    // XXX I currently only support MIC for now.
-    if (STARPU_MIC_SINK == _starpu_sink_common_get_kind ())
-	nbcores = COISysGetCoreCount();
-#endif
 
-    _starpu_mp_common_send_command (node, STARPU_ANSWER_SINK_NBCORES,
-				    &nbcores, sizeof (int));
-}
-
-
-/* Receive paquet from _starpu_src_common_execute_kernel in the form below :
- * [Function pointer on sink, number of interfaces, interfaces
- * (union _starpu_interface), cl_arg]
- * Then call the function given, passing as argument an array containing the
- * addresses of the received interfaces
+/* Send to host the number of cores of the sink device
  */
-void _starpu_sink_common_execute(const struct _starpu_mp_node *node,
-					void *arg, int arg_size)
+static void _starpu_sink_common_get_nb_cores (struct _starpu_mp_node *node)
 {
-	unsigned id = 0;
-
-	void *arg_ptr = arg;
-	void (*kernel)(void **, void *) = NULL;
-	unsigned coreid = 0;
-	unsigned nb_interfaces = 0;
-	void *interfaces[STARPU_NMAXBUFS];
-	void *cl_arg;
-
-	kernel = *(void(**)(void **, void *)) arg_ptr;
-	arg_ptr += sizeof(kernel);
-
-	coreid = *(unsigned *) arg_ptr;
-	arg_ptr += sizeof(coreid);
-
-	nb_interfaces = *(unsigned *) arg_ptr;
-	arg_ptr += sizeof(nb_interfaces);
-
-	/* The function needs an array pointing to each interface it needs
-	 * during execution. As in sink-side there is no mean to know which
-	 * kind of interface to expect, the array is composed of unions of
-	 * interfaces, thus we expect the same size anyway */
-	for (id = 0; id < nb_interfaces; id++)
-	{
-		interfaces[id] = arg_ptr;
-		arg_ptr += sizeof(union _starpu_interface);
-	}
-
-	/* Was cl_arg sent ? */
-	if (arg_size > arg_ptr - arg)
-		cl_arg = arg_ptr;
-	else
-		cl_arg = NULL;
-
-	//_STARPU_DEBUG("telling host that we have submitted the task %p.\n", kernel);
-	/* XXX: in the future, we will not have to directly execute the kernel
-	 * but submit it to the correct local worker. */
-	_starpu_mp_common_send_command(node, STARPU_EXECUTION_SUBMITTED,
-				       NULL, 0);
-
-	//_STARPU_DEBUG("executing the task %p\n", kernel);
-	/* XXX: we keep the synchronous execution model on the sink side for
-	 * now. */
-	kernel(interfaces, cl_arg);
-
-	//_STARPU_DEBUG("telling host that we have finished the task %p.\n", kernel);
-	_starpu_mp_common_send_command(node, STARPU_EXECUTION_COMPLETED,
-				       &coreid, sizeof(coreid));
+	// Process the packet received from `_starpu_src_common_sink_nbcores'.
+	_starpu_mp_common_send_command (node, STARPU_ANSWER_SINK_NBCORES,
+					&node->nb_cores, sizeof (int));
 }
 
 
+/* Send to host the address of the function given in parameter
+ */
 static void _starpu_sink_common_lookup(const struct _starpu_mp_node *node,
 				       char *func_name)
 {
 	void (*func)(void);
-	void *dl_handle = dlopen(NULL, RTLD_NOW);
-	func = dlsym(dl_handle, func_name);
-
+	func = node->lookup(node,func_name);
+	
 	//_STARPU_DEBUG("Looked up %s, got %p\n", func_name, func);
 
 	/* If we couldn't find the function, let's send an error to the host.
@@ -146,21 +82,24 @@ static void _starpu_sink_common_lookup(const struct _starpu_mp_node *node,
 					       NULL, 0);
 }
 
+
+/* Allocate a memory space and send the address of this space to the host
+ */
 void _starpu_sink_common_allocate(const struct _starpu_mp_node *mp_node,
 				  void *arg, int arg_size)
 {
-    STARPU_ASSERT(arg_size == sizeof(size_t));
-
-    void *addr = malloc(*(size_t *)(arg));
-
-    /* If the allocation fail, let's send an error to the host.
-     */
-    if (addr)
-	_starpu_mp_common_send_command(mp_node, STARPU_ANSWER_ALLOCATE,
-				       &addr, sizeof(addr));
-    else
-	_starpu_mp_common_send_command(mp_node, STARPU_ERROR_ALLOCATE,
-				       NULL, 0);
+	STARPU_ASSERT(arg_size == sizeof(size_t));
+
+	void *addr = malloc(*(size_t *)(arg));
+
+	/* If the allocation fails, let's send an error to the host.
+	 */
+	if (addr)
+		_starpu_mp_common_send_command(mp_node, STARPU_ANSWER_ALLOCATE,
+					       &addr, sizeof(addr));
+	else
+		_starpu_mp_common_send_command(mp_node, STARPU_ERROR_ALLOCATE,
+					       NULL, 0);
 }
 
 void _starpu_sink_common_free(const struct _starpu_mp_node *mp_node STARPU_ATTRIBUTE_UNUSED,
@@ -174,56 +113,129 @@ void _starpu_sink_common_free(const struct _starpu_mp_node *mp_node STARPU_ATTRI
 static void _starpu_sink_common_copy_from_host(const struct _starpu_mp_node *mp_node,
 					       void *arg, int arg_size)
 {
-    STARPU_ASSERT(arg_size == sizeof(struct _starpu_mp_transfer_command));
+	STARPU_ASSERT(arg_size == sizeof(struct _starpu_mp_transfer_command));
 
-    struct _starpu_mp_transfer_command *cmd = (struct _starpu_mp_transfer_command *)arg;
+	struct _starpu_mp_transfer_command *cmd = (struct _starpu_mp_transfer_command *)arg;
 
-    mp_node->dt_recv(mp_node, cmd->addr, cmd->size);
+	mp_node->dt_recv(mp_node, cmd->addr, cmd->size);
 }
 
 static void _starpu_sink_common_copy_to_host(const struct _starpu_mp_node *mp_node,
 					     void *arg, int arg_size)
 {
-    STARPU_ASSERT(arg_size == sizeof(struct _starpu_mp_transfer_command));
+	STARPU_ASSERT(arg_size == sizeof(struct _starpu_mp_transfer_command));
 
-    struct _starpu_mp_transfer_command *cmd = (struct _starpu_mp_transfer_command *)arg;
+	struct _starpu_mp_transfer_command *cmd = (struct _starpu_mp_transfer_command *)arg;
 
-    mp_node->dt_send(mp_node, cmd->addr, cmd->size);
+	mp_node->dt_send(mp_node, cmd->addr, cmd->size);
 }
 
 static void _starpu_sink_common_copy_from_sink(const struct _starpu_mp_node *mp_node,
 					       void *arg, int arg_size)
 {
-    STARPU_ASSERT(arg_size == sizeof(struct _starpu_mp_transfer_command_to_device));
+	STARPU_ASSERT(arg_size == sizeof(struct _starpu_mp_transfer_command_to_device));
 
-    struct _starpu_mp_transfer_command_to_device *cmd = (struct _starpu_mp_transfer_command_to_device *)arg;
+	struct _starpu_mp_transfer_command_to_device *cmd = (struct _starpu_mp_transfer_command_to_device *)arg;
 
-    mp_node->dt_recv_from_device(mp_node, cmd->devid, cmd->addr, cmd->size);
+	mp_node->dt_recv_from_device(mp_node, cmd->devid, cmd->addr, cmd->size);
 
-    _starpu_mp_common_send_command(mp_node, STARPU_TRANSFER_COMPLETE, NULL, 0);
+	_starpu_mp_common_send_command(mp_node, STARPU_TRANSFER_COMPLETE, NULL, 0);
 }
 
 static void _starpu_sink_common_copy_to_sink(const struct _starpu_mp_node *mp_node,
 					     void *arg, int arg_size)
 {
-    STARPU_ASSERT(arg_size == sizeof(struct _starpu_mp_transfer_command_to_device));
+	STARPU_ASSERT(arg_size == sizeof(struct _starpu_mp_transfer_command_to_device));
+
+	struct _starpu_mp_transfer_command_to_device *cmd = (struct _starpu_mp_transfer_command_to_device *)arg;
+
+	mp_node->dt_send_to_device(mp_node, cmd->devid, cmd->addr, cmd->size);
+}
+
+
+/* Receive workers and combined workers and store them into the struct config
+ */
+static void _starpu_sink_common_recv_workers(struct _starpu_mp_node * node, void *arg, int arg_size)
+{
+	/* Retrieve information from the message */
+	STARPU_ASSERT(arg_size == (sizeof(int)*5));
+	void * arg_ptr = arg;
+	int i;
+	
+	int nworkers = *(int *)arg_ptr; 
+	arg_ptr += sizeof(nworkers);
+
+	int worker_size = *(int *)arg_ptr;
+	arg_ptr += sizeof(worker_size);
+
+	int combined_worker_size = *(int *)arg_ptr;
+	arg_ptr += sizeof(combined_worker_size);
+	
+	int baseworkerid = *(int *)arg_ptr;
+	arg_ptr += sizeof(baseworkerid);
+
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
+	config->topology.nworkers = *(int *)arg_ptr;
+
+
+	/* Retrieve workers */
+	struct _starpu_worker * workers = &config->workers[baseworkerid];
+	node->dt_recv(node,workers,worker_size);
+	
+	/* Update the workers so that their fields are coherent */
+	for(i=0; i<nworkers; i++)
+	{
+		workers[i].config = config;
+		starpu_pthread_mutex_init(&workers[i].mutex,NULL);
+		starpu_pthread_mutex_destroy(&workers[i].mutex);
+
+		starpu_pthread_cond_init(&workers[i].started_cond,NULL);
+		starpu_pthread_cond_destroy(&workers[i].started_cond);
+
+		starpu_pthread_cond_init(&workers[i].ready_cond,NULL);
+		starpu_pthread_cond_destroy(&workers[i].ready_cond);
+
+		starpu_pthread_mutex_init(&workers[i].sched_mutex,NULL);
+		starpu_pthread_mutex_destroy(&workers[i].sched_mutex);
+
+		starpu_pthread_cond_init(&workers[i].sched_cond,NULL);
+		starpu_pthread_cond_destroy(&workers[i].sched_cond);
 
-    struct _starpu_mp_transfer_command_to_device *cmd = (struct _starpu_mp_transfer_command_to_device *)arg;
+		workers[i].current_task = NULL;
+		workers[i].set = NULL;
+		workers[i].terminated_jobs = NULL;
+	
+		//_starpu_barrier_counter_init(&workers[i].tasks_barrier, 1);
+		//_starpu_barrier_counter_destroy(&workers[i].tasks_barrier);
 
-    mp_node->dt_send_to_device(mp_node, cmd->devid, cmd->addr, cmd->size);
+		starpu_pthread_mutex_init(&workers[i].parallel_sect_mutex,NULL);
+		starpu_pthread_mutex_destroy(&workers[i].parallel_sect_mutex);
+
+		starpu_pthread_cond_init(&workers[i].parallel_sect_cond,NULL);
+		starpu_pthread_cond_destroy(&workers[i].parallel_sect_cond);
+
+	}
+
+	/* Retrieve combined workers */
+	struct _starpu_combined_worker * combined_workers = config->combined_workers; 
+	node->dt_recv(node, combined_workers, combined_worker_size);
+
+	node->baseworkerid = baseworkerid;
+	STARPU_PTHREAD_BARRIER_WAIT(&node->init_completed_barrier);	
 }
 
+
+
 /* Function looping on the sink, waiting for tasks to execute.
  * If the caller is the host, don't do anything.
  */
-
 void _starpu_sink_common_worker(void)
 {
 	struct _starpu_mp_node *node = NULL;
 	enum _starpu_mp_command command = STARPU_EXIT;
 	int arg_size = 0;
 	void *arg = NULL;
-
+	int exit_starpu = 0;
 	enum _starpu_mp_node_kind node_kind = _starpu_sink_common_get_kind();
 
 	if (node_kind == STARPU_INVALID_KIND)
@@ -234,46 +246,82 @@ void _starpu_sink_common_worker(void)
 	/* Create and initialize the node */
 	node = _starpu_mp_common_node_create(node_kind, -1);
 
-	while ((command = _starpu_mp_common_recv_command(node, &arg, &arg_size)) != STARPU_EXIT)
+	starpu_pthread_key_t worker_key;
+	STARPU_PTHREAD_KEY_CREATE(&worker_key, NULL);
+
+
+	struct _starpu_machine_config *config;
+	while (!exit_starpu)
 	{
-		switch(command)
+		/* If we have received a message */
+		if(node->mp_recv_is_ready(node))
+		{
+
+			command = _starpu_mp_common_recv_command(node, &arg, &arg_size);
+			switch(command)
+			{
+				case STARPU_EXIT:
+					exit_starpu = 1;
+					break;
+				case STARPU_EXECUTE:
+					config = _starpu_get_machine_config();
+					node->execute(node, arg, arg_size);
+					break;
+				case STARPU_SINK_NBCORES:
+					_starpu_sink_common_get_nb_cores(node);
+					break;
+				case STARPU_LOOKUP:
+					_starpu_sink_common_lookup(node, (char *) arg);
+					break;
+
+				case STARPU_ALLOCATE:
+					node->allocate(node, arg, arg_size);
+					break;
+
+				case STARPU_FREE:
+					node->free(node, arg, arg_size);
+					break;
+
+				case STARPU_RECV_FROM_HOST:
+					_starpu_sink_common_copy_from_host(node, arg, arg_size);
+					break;
+
+				case STARPU_SEND_TO_HOST:
+					_starpu_sink_common_copy_to_host(node, arg, arg_size);
+					break;
+
+				case STARPU_RECV_FROM_SINK:
+					_starpu_sink_common_copy_from_sink(node, arg, arg_size);
+					break;
+
+				case STARPU_SEND_TO_SINK:
+					_starpu_sink_common_copy_to_sink(node, arg, arg_size);
+					break;
+
+				case STARPU_SYNC_WORKERS:
+					_starpu_sink_common_recv_workers(node, arg, arg_size);
+					break;
+				default:
+					printf("Oops, command %x unrecognized\n", command);
+			}
+		}
+
+		STARPU_PTHREAD_MUTEX_LOCK(&node->message_queue_mutex);
+		/* If the list is not empty */
+		if(!mp_message_list_empty(node->message_queue))
 		{
-			case STARPU_EXECUTE:
-				node->execute(node, arg, arg_size);
-				break;
-			case STARPU_SINK_NBCORES:
-				node->nbcores (node);
-				break;
-			case STARPU_LOOKUP:
-				_starpu_sink_common_lookup(node, (char *) arg);
-				break;
-
-			case STARPU_ALLOCATE:
-				node->allocate(node, arg, arg_size);
-				break;
-
-			case STARPU_FREE:
-				node->free(node, arg, arg_size);
-				break;
-
-			case STARPU_RECV_FROM_HOST:
-				_starpu_sink_common_copy_from_host(node, arg, arg_size);
-				break;
-
-			case STARPU_SEND_TO_HOST:
-				_starpu_sink_common_copy_to_host(node, arg, arg_size);
-				break;
-
-			case STARPU_RECV_FROM_SINK:
-				_starpu_sink_common_copy_from_sink(node, arg, arg_size);
-				break;
-
-			case STARPU_SEND_TO_SINK:
-				_starpu_sink_common_copy_to_sink(node, arg, arg_size);
-				break;
-
-			default:
-				printf("Oops, command %x unrecognized\n", command);
+			/* We pop a message and send it to the host */
+			struct mp_message * message = mp_message_list_pop_back(node->message_queue);
+			STARPU_PTHREAD_MUTEX_UNLOCK(&node->message_queue_mutex);
+			//_STARPU_DEBUG("telling host that we have finished the task %p sur %d.\n", task->kernel, task->coreid);
+			config = _starpu_get_machine_config();
+			_starpu_mp_common_send_command(node, message->type, 
+					&message->buffer, message->size);
+			mp_message_delete(message);
+		}
+		else
+		{
+			STARPU_PTHREAD_MUTEX_UNLOCK(&node->message_queue_mutex);
 		}
 	}
 
@@ -282,3 +330,300 @@ void _starpu_sink_common_worker(void)
 
 	exit(0);
 }
+
+
+/* Search for the mp_barrier corresponding to the specified combined worker
+ * and create it if it doesn't exist
+ */
+static struct mp_barrier * _starpu_sink_common_get_barrier(struct _starpu_mp_node * node, int cb_workerid, int cb_workersize)
+{
+	struct mp_barrier * b = NULL;
+	STARPU_PTHREAD_MUTEX_LOCK(&node->barrier_mutex);
+	/* Search if the barrier already exist */
+	for(b = mp_barrier_list_begin(node->barrier_list); 
+			b != mp_barrier_list_end(node->barrier_list) && b->id != cb_workerid; 
+			b = mp_barrier_list_next(b));
+
+	/* If we found the barrier */
+	if(b != NULL)
+	{
+		STARPU_PTHREAD_MUTEX_UNLOCK(&node->barrier_mutex);
+		return b;
+	}
+	else
+	{
+
+		/* Else we create, initialize and add it to the list*/
+		b = mp_barrier_new();
+		b->id = cb_workerid;
+		STARPU_PTHREAD_BARRIER_INIT(&b->before_work_barrier,NULL,cb_workersize);
+		STARPU_PTHREAD_BARRIER_INIT(&b->after_work_barrier,NULL,cb_workersize);
+		mp_barrier_list_push_back(node->barrier_list,b);
+		STARPU_PTHREAD_MUTEX_UNLOCK(&node->barrier_mutex);
+		return b;
+	}
+}
+
+
+/* Erase the mp_barrier corresponding to the specified combined worker
+ */
+static void _starpu_sink_common_erase_barrier(struct _starpu_mp_node * node, struct mp_barrier *barrier)
+{
+	STARPU_PTHREAD_MUTEX_LOCK(&node->barrier_mutex);
+	mp_barrier_list_erase(node->barrier_list,barrier);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&node->barrier_mutex);
+}
+
+/* Append the message given in parameter to the message list
+ */
+static void _starpu_sink_common_append_message(struct _starpu_mp_node *node, struct mp_message * message)
+{
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
+	STARPU_PTHREAD_MUTEX_LOCK(&node->message_queue_mutex);
+	mp_message_list_push_front(node->message_queue,message);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&node->message_queue_mutex);
+
+}
+/* Append to the message list a "STARPU_PRE_EXECUTION" message
+ */
+static void _starpu_sink_common_pre_execution_message(struct _starpu_mp_node *node, struct mp_task *task)
+{
+	/* Init message to tell the host that the execution has begun */
+	struct mp_message * message = mp_message_new();
+	message->type = STARPU_PRE_EXECUTION;
+	*(int *) message->buffer = task->combined_workerid;
+	message->size = sizeof(task->combined_workerid);
+
+
+	/* Append the message to the queue */	
+	_starpu_sink_common_append_message(node, message);
+
+}
+
+/* Append to the message list a "STARPU_EXECUTION_COMPLETED" message
+ */
+static void _starpu_sink_common_execution_completed_message(struct _starpu_mp_node *node, struct mp_task *task)
+{
+	/* Init message to tell the host that the execution is completed */
+	struct mp_message * message = mp_message_new();
+	message->type = STARPU_EXECUTION_COMPLETED;
+	message->size = sizeof(task->coreid);
+	*(int*) message->buffer = task->coreid;
+
+	/* Append the message to the queue */
+	_starpu_sink_common_append_message(node, message);
+}
+
+
+/* Bind the thread which is running on the specified core to the combined worker */
+static void _starpu_sink_common_bind_to_combined_worker(struct _starpu_mp_node *node, int coreid, struct _starpu_combined_worker * combined_worker)
+{
+	int i;
+	int * bind_set = malloc(sizeof(int)*combined_worker->worker_size);
+	for(i=0;i<combined_worker->worker_size;i++)
+		bind_set[i] = combined_worker->combined_workerid[i] - node->baseworkerid;
+	node->bind_thread(node, coreid, bind_set, combined_worker->worker_size);
+	free(bind_set);
+}
+
+
+
+/* Get the current rank of the worker in the combined worker 
+ */
+static int _starpu_sink_common_get_current_rank(int workerid, struct _starpu_combined_worker * combined_worker)
+{
+	int i;
+	for(i=0; i<combined_worker->worker_size; i++)
+		if(workerid == combined_worker->combined_workerid[i])
+			return i;
+
+	STARPU_ASSERT(0);
+	/* Not reached, but keeps the compiler happy about the return value */
+	return -1;
+}
+
+/* Execute the task 
+ */
+static void _starpu_sink_common_execute_kernel(struct _starpu_mp_node *node, int coreid, struct _starpu_worker * worker)
+{
+	struct _starpu_combined_worker * combined_worker = NULL;
+	struct mp_task* task = node->run_table[coreid];
+
+
+	/* If it's a parallel task */
+	if(task->is_parallel_task)
+	{
+		combined_worker = _starpu_get_combined_worker_struct(task->combined_workerid);
+		
+		worker->current_rank = _starpu_sink_common_get_current_rank(worker->workerid, combined_worker);
+		worker->combined_workerid = task->combined_workerid;
+		worker->worker_size = combined_worker->worker_size;
+		
+		/* Synchronize with the other threads of the combined worker */
+		STARPU_PTHREAD_BARRIER_WAIT(&task->mp_barrier->before_work_barrier);
+
+
+		/* The first thread of the combined worker */
+		if(worker->current_rank == 0)
+		{
+			/* tell the host that the execution has begun */
+			_starpu_sink_common_pre_execution_message(node,task);
+
+			/* If the mode is FORKJOIN,
+			 * the first thread binds itself
+			 * to all the cores of the combined worker */
+			if(task->type == STARPU_FORKJOIN)
+			{
+				_starpu_sink_common_bind_to_combined_worker(node, coreid, combined_worker);
+			}
+		}
+	}
+	else
+	{
+		worker->current_rank = 0;
+		worker->combined_workerid = 0;
+		worker->worker_size = 1;
+	}
+	if(task->type != STARPU_FORKJOIN || worker->current_rank == 0)
+	{
+		/* execute the task */
+		task->kernel(task->interfaces,task->cl_arg);
+	}
+
+	/* If it's a parallel task */
+	if(task->is_parallel_task)
+	{
+		/* Synchronize with the other threads of the combined worker */
+		STARPU_PTHREAD_BARRIER_WAIT(&task->mp_barrier->after_work_barrier);
+
+		/* The first thread of the combined worker */
+		if(worker->current_rank == 0)
+		{
+			/* Erase the barrier from the list */
+			_starpu_sink_common_erase_barrier(node,task->mp_barrier);
+
+			/* If the mode is FORKJOIN,
+			 * the first thread rebinds itself to its own core */
+			if(task->type == STARPU_FORKJOIN)
+				node->bind_thread(node, coreid, &coreid, 1);
+
+		}
+	}
+
+	node->run_table[coreid] = NULL;
+
+	/* tell the host that the execution is completed */
+	_starpu_sink_common_execution_completed_message(node,task);
+
+	/*free the task*/
+	unsigned i;
+	for (i = 0; i < task->nb_interfaces; i++)
+		free(task->interfaces[i]);
+	free(task);
+
+}
+
+
+/* The main function executed by the thread 
+ * thread_arg is a structure containing the information needed by the thread
+ */
+void* _starpu_sink_thread(void * thread_arg)
+{
+	/* Retrieve the information from the structure */
+	struct _starpu_mp_node *node = ((struct arg_sink_thread *)thread_arg)->node;
+	int coreid =((struct arg_sink_thread *)thread_arg)->coreid;
+	/* free the structure */
+	free(thread_arg);
+
+	STARPU_PTHREAD_BARRIER_WAIT(&node->init_completed_barrier);	
+
+	struct _starpu_worker *worker = &_starpu_get_machine_config()->workers[node->baseworkerid + coreid];
+
+	_starpu_set_local_worker_key(worker);
+	while(node->is_running)
+	{
+		/* Wait until a task is available */
+		sem_wait(&node->sem_run_table[coreid]);
+		if(node->run_table[coreid] != NULL)
+			_starpu_sink_common_execute_kernel(node,coreid,worker);
+
+	}
+	pthread_exit(NULL);
+}
+
+
+/* Add the task to the specified thread and wake it up
+ */
+static void _starpu_sink_common_execute_thread(struct _starpu_mp_node *node, struct mp_task *task)
+{
+	/* Add the task to the specified thread */
+	node->run_table[task->coreid] = task;
+	/* Post the semaphore to wake up the thread which will execute the task */
+	sem_post(&node->sem_run_table[task->coreid]);
+}
+
+
+
+/* Receive a packet from _starpu_src_common_execute_kernel in the form below:
+ * [Function pointer on sink, number of interfaces, interfaces
+ * (union _starpu_interface), cl_arg]
+ * Then call the function given, passing as argument an array containing the
+ * addresses of the received interfaces
+ */
+
+void _starpu_sink_common_execute(struct _starpu_mp_node *node,
+		void *arg, int arg_size)
+{
+	unsigned i;
+
+	void *arg_ptr = arg;
+	struct mp_task *task = malloc(sizeof(struct mp_task));
+
+	task->kernel = *(void(**)(void **, void *)) arg_ptr;
+	arg_ptr += sizeof(task->kernel);
+
+	task->type = *(enum starpu_codelet_type *) arg_ptr;
+	arg_ptr += sizeof(task->type);
+
+	task->is_parallel_task = *(int *) arg_ptr;
+	arg_ptr += sizeof(task->is_parallel_task);
+
+	if(task->is_parallel_task)
+	{
+		task->combined_workerid= *(int *) arg_ptr;
+		arg_ptr += sizeof(task->combined_workerid);
+
+		task->mp_barrier = _starpu_sink_common_get_barrier(node,task->combined_workerid,_starpu_get_combined_worker_struct(task->combined_workerid)->worker_size);
+	}
+
+	task->coreid = *(unsigned *) arg_ptr;
+	arg_ptr += sizeof(task->coreid);
+
+	task->nb_interfaces = *(unsigned *) arg_ptr;
+	arg_ptr += sizeof(task->nb_interfaces);
+
+	/* The function needs an array pointing to each interface it needs
+	 * during execution. As in sink-side there is no mean to know which
+	 * kind of interface to expect, the array is composed of unions of
+	 * interfaces, thus we expect the same size anyway */
+	for (i = 0; i < task->nb_interfaces; i++)
+	{
+		union _starpu_interface * interface = malloc(sizeof(union _starpu_interface));   
+		memcpy(interface, arg_ptr, 
+				sizeof(union _starpu_interface));
+		task->interfaces[i] = interface;
+		arg_ptr += sizeof(union _starpu_interface);
+	}
+
+	/* Was cl_arg sent ? */
+	if (arg_size > arg_ptr - arg)
+		task->cl_arg = arg_ptr;
+	else
+		task->cl_arg = NULL;
+
+
+	//_STARPU_DEBUG("telling host that we have submitted the task %p.\n", task->kernel);
+	_starpu_mp_common_send_command(node, STARPU_EXECUTION_SUBMITTED,
+			NULL, 0);
+
+	//_STARPU_DEBUG("executing the task %p\n", task->kernel);
+	_starpu_sink_common_execute_thread(node, task);	
+
+}

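The parser in _starpu_sink_common_execute() above fixes the wire layout of a STARPU_EXECUTE command: kernel pointer, codelet type, parallel flag, optional combined worker id, core id, interface count, the interfaces themselves, then an optional cl_arg. The matching source-side packing code is not part of this excerpt; a minimal sketch of a buffer this parser would accept, for the sequential case and with the local variables (kernel, coreid, nb_interfaces, interfaces) assumed to exist, looks like:

char buffer[BUFFER_SIZE];
char *p = buffer;

*(void (**)(void **, void *)) p = kernel;        p += sizeof(void (*)(void **, void *));
*(enum starpu_codelet_type *) p = STARPU_SEQ;    p += sizeof(enum starpu_codelet_type);
*(int *) p = 0;      /* not a parallel task */   p += sizeof(int);
*(unsigned *) p = coreid;                        p += sizeof(unsigned);
*(unsigned *) p = nb_interfaces;                 p += sizeof(unsigned);

unsigned i;
for (i = 0; i < nb_interfaces; i++)
{
	memcpy(p, &interfaces[i], sizeof(union _starpu_interface));
	p += sizeof(union _starpu_interface);
}
/* An optional cl_arg blob may follow; the sink detects it from arg_size. */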
+ 9 - 2
src/drivers/mp_common/sink_common.h

@@ -32,14 +32,21 @@ struct _starpu_sink_topology
 	unsigned nb_cpus;
 };
 
+struct arg_sink_thread
+{
+	struct _starpu_mp_node *node;
+	int coreid;
+};
+
 void _starpu_sink_common_worker(void);
 
-void _starpu_sink_common_execute(const struct _starpu_mp_node *node, void *arg, int arg_size);
-void _starpu_sink_nbcores (const struct _starpu_mp_node *node);
+void _starpu_sink_common_execute(struct _starpu_mp_node *node, void *arg, int arg_size);
 
 void _starpu_sink_common_allocate(const struct _starpu_mp_node *mp_node, void *arg, int arg_size);
 void _starpu_sink_common_free(const struct _starpu_mp_node *mp_node STARPU_ATTRIBUTE_UNUSED, void *arg, int arg_size);
 
+void* _starpu_sink_thread(void * thread_arg);
+
 #endif /* STARPU_USE_MP */
 
 

+ 432 - 64
src/drivers/mp_common/source_common.c

@@ -19,28 +19,241 @@
 #include <pthread.h>
 
 #include <starpu.h>
+#include <core/task.h>
+#include <core/sched_policy.h>
+
+#include <drivers/driver_common/driver_common.h>
+
+
 #include <datawizard/coherency.h>
 #include <datawizard/interfaces/data_interface.h>
 #include <drivers/mp_common/mp_common.h>
 
-int
-_starpu_src_common_sink_nbcores (const struct _starpu_mp_node *node, int *buf)
+
+/* Finalize the execution of a task by a worker*/
+static int _starpu_src_common_finalize_job (struct _starpu_job *j, struct _starpu_worker *worker) 
+{
+	uint32_t mask = 0;
+	int profiling = starpu_profiling_status_get();
+	struct timespec codelet_end;
+	_starpu_driver_end_job(worker, j, worker->perf_arch, &codelet_end, 0,
+			profiling);
+	
+	int count = worker->current_rank;
+
+	/* If it is a combined worker, check whether this worker is the last one of its combined worker */
+	if(j->task_size > 1)
+	{
+		struct _starpu_combined_worker * cb_worker = _starpu_get_combined_worker_struct(worker->combined_workerid); 
+
+		pthread_mutex_lock(&cb_worker->count_mutex);
+		count = cb_worker->count--;
+		if(count == 0)
+			cb_worker->count = cb_worker->worker_size - 1; 
+		pthread_mutex_unlock(&cb_worker->count_mutex);
+	}
+
+	/* Finalize the execution */
+	if(count == 0)
+	{
+
+		_starpu_driver_update_job_feedback(j, worker, worker->perf_arch,
+				&j->cl_start, &codelet_end,
+				profiling);
+
+		_starpu_push_task_output (j, mask);
+
+		_starpu_handle_job_termination(j);
+	}
+	return 0;
+}
+
+
+/* Complete the execution of the job */
+static int _starpu_src_common_process_completed_job(struct _starpu_worker_set *workerset, void * arg, int arg_size)
+{
+	int coreid;
+
+	STARPU_ASSERT(sizeof(coreid) == arg_size);	
+	
+	coreid = *(int *) arg;
+
+	struct _starpu_worker *worker = &workerset->workers[coreid];
+	struct _starpu_job *j = _starpu_get_job_associated_to_task(worker->current_task);
+
+	struct _starpu_worker * old_worker = _starpu_get_local_worker_key();
+
+	_starpu_set_local_worker_key(worker);
+	_starpu_src_common_finalize_job (j, worker);
+	_starpu_set_local_worker_key(old_worker);
+
+	worker->current_task = NULL;
+	return 0;
+}
+
+/* Tell the scheduler when the execution has begun */
+static void _starpu_src_common_pre_exec(void * arg, int arg_size)
+{
+	int cb_workerid, i;
+	STARPU_ASSERT(sizeof(cb_workerid) == arg_size);
+	cb_workerid = *(int *) arg;
+	struct _starpu_combined_worker *combined_worker = _starpu_get_combined_worker_struct(cb_workerid);
+	for(i=0; i < combined_worker->worker_size; i++)
+	{
+		struct _starpu_worker * worker = _starpu_get_worker_struct(combined_worker->combined_workerid[i]);
+		_starpu_set_local_worker_key(worker);
+		_starpu_sched_pre_exec_hook(worker->current_task);
+	}	
+}
+
+/* Receive a message and handle it if it is asynchronous.
+ * Return 0 if the message has not been handled (which most likely means it is a synchronous message),
+ * return 1 if the message has been handled.
+ */
+static int _starpu_src_common_handle_async(const struct _starpu_mp_node *node STARPU_ATTRIBUTE_UNUSED, 
+		void * arg, int arg_size, 
+		enum _starpu_mp_command answer)
+{
+	struct _starpu_worker_set * worker_set=NULL; 
+	switch(answer) 
+	{
+		case STARPU_EXECUTION_COMPLETED:
+			worker_set = _starpu_get_worker_struct(starpu_worker_get_id())->set;
+			_starpu_src_common_process_completed_job(worker_set, arg, arg_size);
+			break;
+		case STARPU_PRE_EXECUTION:
+			_starpu_src_common_pre_exec(arg,arg_size);
+			break;
+		default:
+			return 0;
+			break;
+	}
+	return 1;
+}
+
+/* Handle all the messages which have been stored in the message_queue */
+static void _starpu_src_common_handle_stored_async(struct _starpu_mp_node *node)
+{
+	STARPU_PTHREAD_MUTEX_LOCK(&node->message_queue_mutex);
+	/* while the list is not empty */
+	while(!mp_message_list_empty(node->message_queue))
+	{
+		/* We pop a message and handle it */
+		struct mp_message * message = mp_message_list_pop_back(node->message_queue);
+		_starpu_src_common_handle_async(node, message->buffer, 
+				message->size, message->type);
+		mp_message_delete(message);
+	}
+	STARPU_PTHREAD_MUTEX_UNLOCK(&node->message_queue_mutex);
+}
+
+/* Store a message if it is asynchronous.
+ * Return 1 if the message has been stored,
+ * 0 if the message is unknown or synchronous. */
+int _starpu_src_common_store_message(struct _starpu_mp_node *node, 
+		void * arg, int arg_size, enum _starpu_mp_command answer)
+{
+	struct mp_message * message = NULL;
+	switch(answer)
+	{
+		case STARPU_EXECUTION_COMPLETED:
+		case STARPU_PRE_EXECUTION:
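+			/* Asynchronous notifications are copied into a queued message so they can be handled later, outside of the current synchronous exchange. */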
+			message = mp_message_new();
+			message->type = answer;
+			memcpy(message->buffer, arg, arg_size); 
+			message->size = arg_size; 
+
+			STARPU_PTHREAD_MUTEX_LOCK(&node->message_queue_mutex);
+			mp_message_list_push_front(node->message_queue,message);
+			STARPU_PTHREAD_MUTEX_UNLOCK(&node->message_queue_mutex);
+			return 1;
+			break;
+		default:
+			return 0;
+			break;
+	}
+}
+
+/* Store all asynchronous messages and return when a synchronous message is received */
+static enum _starpu_mp_command _starpu_src_common_wait_command_sync(struct _starpu_mp_node *node, 
+		void ** arg, int* arg_size)
+{
+	enum _starpu_mp_command answer;
+	int is_sync = 0;
+	while(!is_sync)
+	{
+		answer = _starpu_mp_common_recv_command(node, arg, arg_size);
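+		/* Anything that can be stored is asynchronous; the first message that cannot be stored is the synchronous answer we are waiting for. */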
+		if(!_starpu_src_common_store_message(node,*arg,*arg_size,answer))
+			is_sync=1;
+	}
+	return answer;
+}
+
+/* Handle an asynchronous message; abort if a synchronous or unknown message is received */
+static void _starpu_src_common_recv_async(struct _starpu_mp_node * node)
+{
+	enum _starpu_mp_command answer;
+	void *arg;
+	int arg_size;
+	answer = _starpu_mp_common_recv_command(node, &arg, &arg_size);
+	if(!_starpu_src_common_handle_async(node,arg,arg_size,answer))
+	{
+		printf("incorrect command: unknown command or sync command\n");
+		STARPU_ASSERT(0);
+	}	
+}
+
+/* Store all asynchronous messages until the completed-execution message of a specific worker has been received */
+enum _starpu_mp_command _starpu_src_common_wait_completed_execution(struct _starpu_mp_node *node, int devid, void **arg, int * arg_size)
+{
+	enum _starpu_mp_command answer;
+
+	int completed = 0;	
+	/* While the expected completed-execution message has not been received */
+	while(!completed)
+	{
+		answer = _starpu_mp_common_recv_command (node, arg, arg_size);
+
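+		/* Only the completion message of the expected worker ends the wait; every other asynchronous message is stored for later processing. */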
+		if(answer == STARPU_EXECUTION_COMPLETED)
+		{
+			int coreid;
+			STARPU_ASSERT(sizeof(coreid) == *arg_size);	
+			coreid = *(int *) *arg;
+			if(devid == coreid)
+				completed = 1;
+			else
+				if(!_starpu_src_common_store_message(node, *arg, *arg_size, answer))
+					/* We received an unknown or synchronous message, which should not happen here */
+					STARPU_ASSERT(0);
+		}
+		else
+		{
+			if(!_starpu_src_common_store_message(node, *arg, *arg_size, answer))
+				/* We received an unknown or synchronous message, which should not happen here */
+				STARPU_ASSERT(0);
+		}
+	}
+	return answer;
+}
+
+
+/* Send a request to the sink NODE for the number of cores on it. */
+int _starpu_src_common_sink_nbcores (const struct _starpu_mp_node *node, int *buf)
 {
-    // Send a request to the sink NODE for the number of cores on it.
 
-    enum _starpu_mp_command answer;
-    void *arg;
-    int arg_size = sizeof (int);
+	enum _starpu_mp_command answer;
+	void *arg;
+	int arg_size = sizeof (int);
 
-    _starpu_mp_common_send_command (node, STARPU_SINK_NBCORES, NULL, 0);
+	_starpu_mp_common_send_command (node, STARPU_SINK_NBCORES, NULL, 0);
 
-    answer = _starpu_mp_common_recv_command (node, &arg, &arg_size);
+	answer = _starpu_mp_common_recv_command (node, &arg, &arg_size);
 
-    STARPU_ASSERT (answer == STARPU_ANSWER_SINK_NBCORES && arg_size == sizeof (int));
+	STARPU_ASSERT (answer == STARPU_ANSWER_SINK_NBCORES && arg_size == sizeof (int));
 
-    memcpy (buf, arg, arg_size);
+	memcpy (buf, arg, arg_size);
 
-    return 0;
+	return 0;
 }
 
 /* Send a request to the sink linked to NODE for the pointer to the
@@ -49,7 +262,7 @@ _starpu_src_common_sink_nbcores (const struct _starpu_mp_node *node, int *buf)
  * else it returns -ESPIPE if the function was not found.
  */
 int _starpu_src_common_lookup(struct _starpu_mp_node *node,
-			      void (**func_ptr)(void), const char *func_name)
+		void (**func_ptr)(void), const char *func_name)
 {
 	enum _starpu_mp_command answer;
 	void *arg;
@@ -60,19 +273,21 @@ int _starpu_src_common_lookup(struct _starpu_mp_node *node,
 
 	//_STARPU_DEBUG("Looking up %s\n", func_name);
 	_starpu_mp_common_send_command(node, STARPU_LOOKUP, (void *) func_name,
-				       arg_size);
-	answer = _starpu_mp_common_recv_command(node, (void **) &arg,
-						&arg_size);
+			arg_size);
+
+	answer = _starpu_src_common_wait_command_sync(node, (void **) &arg,
+			&arg_size);
 
-	if (answer == STARPU_ERROR_LOOKUP) {
+	if (answer == STARPU_ERROR_LOOKUP) 
+	{
 		_STARPU_DISP("Error looking up symbol %s\n", func_name);
 		return -ESPIPE;
 	}
 
 	/* We have to be sure the device answered the right question and the
 	 * answer has the right size */
-	STARPU_ASSERT(answer == STARPU_ANSWER_LOOKUP &&
-		      arg_size == sizeof(*func_ptr));
+	STARPU_ASSERT(answer == STARPU_ANSWER_LOOKUP);
+	STARPU_ASSERT(arg_size == sizeof(*func_ptr));
 
 	memcpy(func_ptr, arg, arg_size);
 
@@ -81,32 +296,46 @@ int _starpu_src_common_lookup(struct _starpu_mp_node *node,
 	return 0;
 }
 
- /* Send a message to the sink to execute a kernel.
+/* Send a message to the sink to execute a kernel.
  * The message sent has the form below :
  * [Function pointer on sink, number of interfaces, interfaces
  * (union _starpu_interface), cl_arg]
  */
-int _starpu_src_common_execute_kernel(const struct _starpu_mp_node *node,
-				      void (*kernel)(void), unsigned coreid,
-				      starpu_data_handle_t *handles,
-				      void **interfaces,
-				      unsigned nb_interfaces,
-				      void *cl_arg, size_t cl_arg_size)
+/* Launch the execution of the function KERNEL points to on the sink linked
+ * to NODE. Returns 0 in case of success, -EINVAL if kernel is an invalid
+ * pointer.
+ * Data interfaces of the task are sent to the sink.
+ */
+int _starpu_src_common_execute_kernel(struct _starpu_mp_node *node,
+		void (*kernel)(void), unsigned coreid,
+		enum starpu_codelet_type type,
+		int is_parallel_task, int cb_workerid,
+		starpu_data_handle_t *handles,
+		void **interfaces,
+		unsigned nb_interfaces,
+		void *cl_arg, size_t cl_arg_size)
 {
-	unsigned id;
-	void *buffer, *buffer_ptr, *arg = NULL;
-	int buffer_size = 0, arg_size = 0;
+
+	void *buffer, *buffer_ptr, *arg =NULL;
+	int buffer_size = 0, arg_size =0;
+	unsigned i;
+
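+	/* The execute message carries the kernel pointer, the codelet type, the parallel-task flag, the combined worker id when the task is parallel, the core id, the number of interfaces, the interfaces themselves and, when provided, cl_arg. */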
+	buffer_size = sizeof(kernel) + sizeof(coreid) + sizeof(type)
+		+ sizeof(nb_interfaces) + nb_interfaces * sizeof(union _starpu_interface) + sizeof(is_parallel_task);
+	
+	/* if the task is parallel */
+	if(is_parallel_task)
+	{
+		buffer_size += sizeof(cb_workerid); 
+	}
 
 	/* If the user didn't give any cl_arg, there is no need to send it */
-	buffer_size =
-	    sizeof(kernel) + sizeof(coreid) + sizeof(nb_interfaces) +
-	    nb_interfaces * sizeof(union _starpu_interface);
 	if (cl_arg)
 	{
 		STARPU_ASSERT(cl_arg_size);
 		buffer_size += cl_arg_size;
 	}
-
+	
 	/* We give to send_command a buffer we just allocated, which contains
 	 * a pointer to the function (sink-side), core on which execute this
 	 * function (sink-side), number of interfaces we send,
@@ -116,6 +345,18 @@ int _starpu_src_common_execute_kernel(const struct _starpu_mp_node *node,
 	*(void(**)(void)) buffer = kernel;
 	buffer_ptr += sizeof(kernel);
 
+	*(enum starpu_codelet_type *) buffer_ptr = type;
+	buffer_ptr += sizeof(type);
+
+	*(int *) buffer_ptr = is_parallel_task;
+	buffer_ptr += sizeof(is_parallel_task);
+
+	if(is_parallel_task)
+	{
+		*(int *) buffer_ptr = cb_workerid ;
+		buffer_ptr += sizeof(cb_workerid);
+	}
+
 	*(unsigned *) buffer_ptr = coreid;
 	buffer_ptr += sizeof(coreid);
 
@@ -126,11 +367,13 @@ int _starpu_src_common_execute_kernel(const struct _starpu_mp_node *node,
 	 * executed on a sink with a different memory, whereas a codelet is
 	 * executed on the host part for the other accelerators.
 	 * Thus we need to send a copy of each interface on the MP device */
-	for (id = 0; id < nb_interfaces; id++)
+
+	for (i = 0; i < nb_interfaces; i++)
 	{
-		starpu_data_handle_t handle = handles[id];
-		memcpy (buffer_ptr, interfaces[id],
-			handle->ops->interface_size);
+		starpu_data_handle_t handle = handles[i];
+
+		memcpy (buffer_ptr, interfaces[i],
+				handle->ops->interface_size);
 		/* The sink side has no mean to get the type of each
 		 * interface, we use a union to make it generic and permit the
 		 * sink to go through the array */
@@ -141,7 +384,7 @@ int _starpu_src_common_execute_kernel(const struct _starpu_mp_node *node,
 		memcpy(buffer_ptr, cl_arg, cl_arg_size);
 
 	_starpu_mp_common_send_command(node, STARPU_EXECUTE, buffer, buffer_size);
-	enum _starpu_mp_command answer = _starpu_mp_common_recv_command(node, &arg, &arg_size);
+	enum _starpu_mp_command answer = _starpu_src_common_wait_command_sync(node, &arg, &arg_size);
 
 	if (answer == STARPU_ERROR_EXECUTE)
 		return -EINVAL;
@@ -151,46 +394,76 @@ int _starpu_src_common_execute_kernel(const struct _starpu_mp_node *node,
 	free(buffer);
 
 	return 0;
-
 }
 
-/* Launch the execution of the function KERNEL points to on the sink linked
- * to NODE. Returns 0 in case of success, -EINVAL if kernel is an invalid
- * pointer.
- * Data interfaces in task are send to the sink.
- */
-int _starpu_src_common_execute_kernel_from_task(const struct _starpu_mp_node *node,
-						void (*kernel)(void), unsigned coreid,
-						struct starpu_task *task)
+
+/* Gather the job information and send the sink the message asking it to execute the task */
+static int _starpu_src_common_execute(struct _starpu_job *j, 
+		struct _starpu_worker *worker, 
+		struct _starpu_mp_node * node)
 {
-    return _starpu_src_common_execute_kernel(node, kernel, coreid,
-					     task->handles, task->interfaces, task->cl->nbuffers,
-					     task->cl_arg, task->cl_arg_size);
+	int ret;
+	uint32_t mask = 0;
+
+	STARPU_ASSERT(j);
+	struct starpu_task *task = j->task;
+
+	int profiling = starpu_profiling_status_get();
+
+	STARPU_ASSERT(task);
+	if (worker->current_rank == 0) 
+	{
+		ret = _starpu_fetch_task_input(j, mask);
+		if (ret != 0)
+		{
+			/* there was not enough memory, so the input of
+			 * the codelet cannot be fetched ... put the
+			 * codelet back, and try it later */
+			return -EAGAIN;
+		}
+	}
+
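+	/* Ask the driver-specific hook which kernel implementation must be run on the sink for this job. */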
+	void (*kernel)(void)  = node->get_kernel_from_job(node,j);
+
+	_starpu_driver_start_job(worker, j, &j->cl_start, 0, profiling);
+
+
+	//_STARPU_DEBUG("\nworkerid:%d, rank:%d, type:%d,	cb_workerid:%d, task_size:%d\n\n",worker->devid,worker->current_rank,task->cl->type,j->combined_workerid,j->task_size);
+
+	_starpu_src_common_execute_kernel(node, kernel, worker->devid, task->cl->type,
+			(j->task_size > 1),
+			j->combined_workerid, task->handles,
+			task->interfaces, task->cl->nbuffers,
+			task->cl_arg, task->cl_arg_size);
+
+
+	return 0;
 }
 
+
 /* Send a request to the sink linked to the MP_NODE to allocate SIZE bytes on
  * the sink.
  * In case of success, it returns 0 and *ADDR contains the address of the
  * allocated area ;
  * else it returns 1 if the allocation fail.
  */
-int _starpu_src_common_allocate(const struct _starpu_mp_node *mp_node,
-								void **addr, size_t size)
+int _starpu_src_common_allocate(struct _starpu_mp_node *mp_node,
+		void **addr, size_t size)
 {
 	enum _starpu_mp_command answer;
 	void *arg;
 	int arg_size;
 
 	_starpu_mp_common_send_command(mp_node, STARPU_ALLOCATE, &size,
-								   sizeof(size));
+			sizeof(size));
 
-	answer = _starpu_mp_common_recv_command(mp_node, &arg, &arg_size);
+	answer = _starpu_src_common_wait_command_sync(mp_node, &arg, &arg_size);
 
 	if (answer == STARPU_ERROR_ALLOCATE)
 		return 1;
 
 	STARPU_ASSERT(answer == STARPU_ANSWER_ALLOCATE &&
-				  arg_size == sizeof(*addr));
+			arg_size == sizeof(*addr));
 
 	memcpy(addr, arg, arg_size);
 
@@ -201,15 +474,15 @@ int _starpu_src_common_allocate(const struct _starpu_mp_node *mp_node,
  * area pointed by ADDR.
  */
 void _starpu_src_common_free(const struct _starpu_mp_node *mp_node,
-							 void *addr)
+		void *addr)
 {
 	_starpu_mp_common_send_command(mp_node, STARPU_FREE, &addr, sizeof(addr));
 }
 
 /* Send SIZE bytes pointed by SRC to DST on the sink linked to the MP_NODE.
- */
+*/
 int _starpu_src_common_copy_host_to_sink(const struct _starpu_mp_node *mp_node,
-										 void *src, void *dst, size_t size)
+		void *src, void *dst, size_t size)
 {
 	struct _starpu_mp_transfer_command cmd = {size, dst};
 
@@ -220,9 +493,9 @@ int _starpu_src_common_copy_host_to_sink(const struct _starpu_mp_node *mp_node,
 }
 
 /* Receive SIZE bytes pointed by SRC on the sink linked to the MP_NODE and store them in DST.
- */
+*/
 int _starpu_src_common_copy_sink_to_host(const struct _starpu_mp_node *mp_node,
-										 void *src, void *dst, size_t size)
+		void *src, void *dst, size_t size)
 {
 	struct _starpu_mp_transfer_command cmd = {size, src};
 
@@ -265,8 +538,8 @@ int _starpu_src_common_copy_sink_to_sink(const struct _starpu_mp_node *src_node,
 /* 5 functions to determine the executable to run on the device (MIC, SCC,
  * MPI).
  */
-static void _starpu_src_common_cat_3(char *final, const char *first, const char *second,
-										  const char *third)
+static void _starpu_src_common_cat_3(char *final, const char *first, 
+		const char *second, const char *third)
 {
 	strcpy(final, first);
 	strcat(final, second);
@@ -304,9 +577,9 @@ static int _starpu_src_common_test_suffixes(char *located_file_name, const char
 }
 
 int _starpu_src_common_locate_file(char *located_file_name,
-							const char *env_file_name, const char *env_mic_path,
-							const char *config_file_name, const char *actual_file_name,
-							const char **suffixes)
+		const char *env_file_name, const char *env_mic_path,
+		const char *config_file_name, const char *actual_file_name,
+		const char **suffixes)
 {
 	if (env_file_name != NULL)
 	{
@@ -372,3 +645,98 @@ int _starpu_src_common_locate_file(char *located_file_name,
 
 	return 1;
 }
+
+/* Send the description of the workers to the sink node */
+static void _starpu_src_common_send_workers(struct _starpu_mp_node * node, int baseworkerid, int nworkers)
+{	
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
+	int worker_size = sizeof(struct _starpu_worker)*nworkers;	
+	int combined_worker_size = STARPU_NMAX_COMBINEDWORKERS*sizeof(struct _starpu_combined_worker);
+	int msg[5];
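+	/* Header describing the data that follows: number of workers sent, size of the worker block, size of the combined-worker block, id of the first worker and total number of workers. */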
+	msg[0] = nworkers;
+	msg[1] = worker_size;
+	msg[2] = combined_worker_size;
+	msg[3] = baseworkerid;
+	msg[4] = starpu_worker_get_count();
+
+	/* tell the sink node that we will send it all the workers */
+	_starpu_mp_common_send_command(node, STARPU_SYNC_WORKERS, 
+			&msg, sizeof(msg));
+
+	/* Send all the workers to the sink node */
+	node->dt_send(node,&config->workers[baseworkerid],worker_size);
+
+	/* Send all combined workers to the sink node */
+	node->dt_send(node, &config->combined_workers,combined_worker_size);
+}	
+
+/* Function looping on the source node */
+void _starpu_src_common_worker(struct _starpu_worker_set * worker_set, 
+		unsigned baseworkerid, 
+		struct _starpu_mp_node * mp_node)
+{ 
+	unsigned memnode = worker_set->workers[0].memory_node;
+	struct starpu_task **tasks = malloc(sizeof(struct starpu_task *)*worker_set->nworkers);
+
+	_starpu_src_common_send_workers(mp_node, baseworkerid, worker_set->nworkers);
+
+	/*main loop*/
+	while (_starpu_machine_is_running())
+	{
+		int res;
+		struct _starpu_job * j;
+
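+		/* One iteration: make memory transfers progress, flush the stored asynchronous messages, poll the device for new ones, then try to launch one task per worker of the set. */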
+		_STARPU_TRACE_START_PROGRESS(memnode);
+		_starpu_datawizard_progress(memnode, 1);
+		_STARPU_TRACE_END_PROGRESS(memnode);
+
+		/* Handle the messages which have been stored */
+		_starpu_src_common_handle_stored_async(mp_node);
+
+		/* Poll the device for incoming asynchronous messages (e.g. completed jobs). */
+		while(mp_node->mp_recv_is_ready(mp_node))
+			_starpu_src_common_recv_async(mp_node);
+
+		/* get a task for each worker */
+		res = _starpu_get_multi_worker_task(worker_set->workers, tasks, worker_set->nworkers);
+
+		/* if at least one worker has popped a task */
+		if(res != 0)
+		{
+			unsigned i;
+			for(i=0; i<worker_set->nworkers; i++)
+			{
+				if(tasks[i] != NULL)
+				{
+					j = _starpu_get_job_associated_to_task(tasks[i]);
+					_starpu_set_local_worker_key(&worker_set->workers[i]);
+					res =  _starpu_src_common_execute(j, &worker_set->workers[i], mp_node);
+					switch (res)
+					{
+						case 0:
+							/* The task has been launched without error */
+							break;
+						case -EAGAIN:
+							_STARPU_DISP("ouch, Xeon Phi could not actually run task %p, putting it back...\n", tasks[i]);
+							_starpu_push_task_to_workers(tasks[i]);
+							STARPU_ABORT();
+							continue;
+							break;
+						default:
+							STARPU_ASSERT(0);
+					}
+				}
+			}
+		}
+	}
+	free(tasks);
+
+	_starpu_handle_all_pending_node_data_requests(memnode);
+
+	/* In case there remains some memory that was automatically
+	 * allocated by StarPU, we release it now. Note that data
+	 * coherency is not maintained anymore at that point ! */
+	_starpu_free_all_automatically_allocated_buffers(memnode);
+
+}

+ 28 - 9
src/drivers/mp_common/source_common.h

@@ -21,28 +21,42 @@
 
 #ifdef STARPU_USE_MP
 
+#include <core/sched_policy.h>
+#include <core/task.h>
 #include <drivers/mp_common/mp_common.h>
 
+
+enum _starpu_mp_command _starpu_src_common_wait_command_sync(struct _starpu_mp_node *node, 
+							     void ** arg, int* arg_size);
+void _starpu_src_common_recv_async(struct _starpu_worker_set *worker_set, 
+				   struct _starpu_mp_node * baseworker_node);
+
+int _starpu_src_common_store_message(struct _starpu_mp_node *node, 
+		void * arg, int arg_size, enum _starpu_mp_command answer);
+
+enum _starpu_mp_command _starpu_src_common_wait_completed_execution(struct _starpu_mp_node *node, int devid, void **arg, int * arg_size);
+
 int _starpu_src_common_sink_nbcores (const struct _starpu_mp_node *node, int *buf);
 
 int _starpu_src_common_lookup(const struct _starpu_mp_node *node,
 			      void (**func_ptr)(void), const char *func_name);
 
-int _starpu_src_common_execute_kernel(const struct _starpu_mp_node *node,
-				      void (*kernel)(void), unsigned coreid,
-				      starpu_data_handle_t *handles, void **interfaces, unsigned nb_interfaces,
-				      void *cl_arg, size_t cl_arg_size);
-
-int _starpu_src_common_execute_kernel_from_task(const struct _starpu_mp_node *node,
-						void (*kernel)(void), unsigned coreid,
-						struct starpu_task *task);
-
 int _starpu_src_common_allocate(const struct _starpu_mp_node *mp_node,
 				void **addr, size_t size);
 
 void _starpu_src_common_free(const struct _starpu_mp_node *mp_node,
 			     void *addr);
 
+int _starpu_src_common_execute_kernel(const struct _starpu_mp_node *node,
+				      void (*kernel)(void), unsigned coreid,
+				      enum starpu_codelet_type type,
+				      int is_parallel_task, int cb_workerid,
+				      starpu_data_handle_t *handles,
+				      void **interfaces,
+				      unsigned nb_interfaces,
+				      void *cl_arg, size_t cl_arg_size);
+
+
 int _starpu_src_common_copy_host_to_sink(const struct _starpu_mp_node *mp_node,
 					 void *src, void *dst, size_t size);
 
@@ -57,6 +71,11 @@ int _starpu_src_common_locate_file(char *located_file_name,
 				   const char *config_file_name, const char *actual_file_name,
 				   const char **suffixes);
 
+void _starpu_src_common_worker(struct _starpu_worker_set * worker_set, 
+			       unsigned baseworkerid, 
+			       struct _starpu_mp_node * node_set);
+
+
 #endif /* STARPU_USE_MP */
 
 

+ 8 - 0
src/drivers/scc/driver_scc_common.c

@@ -172,3 +172,11 @@ void _starpu_scc_common_report_rcce_error(const char *func, const char *file, co
 	fprintf(stderr, "RCCE error in %s (%s:%d): %s\n", func, file, line, error_string); 
 	STARPU_ABORT();
 }
+
+int _starpu_scc_common_recv_is_ready(const struct _starpu_mp_node *mp_node)
+{
+  /***********
+      TODO
+  ************/
+  STARPU_ASSERT(0);
+  return 0; /* not reached */
+}

+ 2 - 0
src/drivers/scc/driver_scc_common.h

@@ -44,6 +44,8 @@ void _starpu_scc_common_recv(const struct _starpu_mp_node *node, void *msg, int
 
 void _starpu_scc_common_report_rcce_error(const char *func, const char *file, const int line, const int err_no);
 
+int _starpu_scc_common_recv_is_ready(const struct _starpu_mp_node *mp_node);
+
 #endif /* STARPU_USE_SCC */
 
 

+ 34 - 0
src/drivers/scc/driver_scc_sink.c

@@ -16,6 +16,7 @@
 
 
 #include <RCCE.h>
+#include <dlfcn.h>
 
 #include <datawizard/interfaces/data_interface.h>
 #include <drivers/mp_common/sink_common.h>
@@ -27,6 +28,23 @@
 void _starpu_scc_sink_init(struct _starpu_mp_node *node)
 {
 	node->mp_connection.scc_nodeid = _starpu_scc_common_get_src_node_id();
+
+	/****************
+	 *     TODO     *
+	 * get nb_cores *
+	 ****************/
+	node->nb_cores = 1; 
+	STARPU_ASSERT(0);
+
+}
+
+void _starpu_scc_sink_launch_workers(struct _starpu_mp_node *node)
+{
+	/*****************
+	 *     TODO      *
+	 * init thread   *
+	 *****************/
+	STARPU_ASSERT(0);
 }
 
 void _starpu_scc_sink_deinit(struct _starpu_mp_node *node)
@@ -51,6 +69,15 @@ void _starpu_scc_sink_recv_from_device(const struct _starpu_mp_node *node, int s
 		STARPU_MP_COMMON_REPORT_ERROR(node, ret);
 }
 
+void _starpu_scc_sink_bind_thread(const struct _starpu_mp_node *mp_node STARPU_ATTRIBUTE_UNUSED, cpu_set_t * cpuset, int coreid, pthread_t *thread)
+{
+	/****************
+	 *     TODO     *
+	 ****************/
+	STARPU_ASSERT(0);
+}
+
+
 /* arg -> [Function pointer on sink, number of interfaces, interfaces
  * (union _starpu_interface), cl_arg]
  *
@@ -124,3 +151,10 @@ void _starpu_scc_sink_execute(const struct _starpu_mp_node *node, void *arg, int
 
 	_starpu_sink_common_execute(node, arg, arg_size);
 }
+
+void (*_starpu_scc_sink_lookup (const struct _starpu_mp_node * node STARPU_ATTRIBUTE_UNUSED, char* func_name))(void)
+{
+	void *dl_handle = dlopen(NULL, RTLD_NOW);
+	return dlsym(dl_handle, func_name);
+}
+

+ 5 - 0
src/drivers/scc/driver_scc_sink.h

@@ -25,13 +25,18 @@
 #include <drivers/mp_common/mp_common.h>
 
 void _starpu_scc_sink_init(struct _starpu_mp_node *node);
+void _starpu_scc_sink_launch_workers(struct _starpu_mp_node *node);
 void _starpu_scc_sink_deinit(struct _starpu_mp_node *node);
 
 void _starpu_scc_sink_send_to_device(const struct _starpu_mp_node *node, int dst_devid, void *msg, int len);
 void _starpu_scc_sink_recv_from_device(const struct _starpu_mp_node *node, int src_devid, void *msg, int len);
 
+void _starpu_scc_sink_bind_thread(const struct _starpu_mp_node *mp_node STARPU_ATTRIBUTE_UNUSED, cpu_set_t * cpuset, int coreid, pthread_t *thread);
+
 void _starpu_scc_sink_execute(const struct _starpu_mp_node *node, void *arg, int arg_size);
 
+void (*_starpu_scc_sink_lookup (const struct _starpu_mp_node * node STARPU_ATTRIBUTE_UNUSED, char* func_name))(void);
+
 #endif /* STARPU_USE_SCC */
 
 

+ 28 - 121
src/drivers/scc/driver_scc_source.c

@@ -60,79 +60,40 @@ static void _starpu_scc_src_deinit_context(int devid)
 
 	_starpu_mp_common_node_destroy(scc_mp_nodes[devid]);
 }
-
-static int _starpu_scc_src_execute_job(struct _starpu_job *j, struct _starpu_worker *args)
+void (*_starpu_scc_src_get_kernel_from_job(const struct _starpu_mp_node *,struct _starpu_job *j))(void)
 {
-	int ret;
-	uint32_t mask = 0;
-
-	STARPU_ASSERT(j);
-	struct starpu_task *task = j->task;
-
-	struct timespec codelet_start, codelet_end;
-
-	int profiling = starpu_profiling_status_get();
-	unsigned calibrate_model = 0;
-
-	STARPU_ASSERT(task);
-	struct starpu_codelet *cl = task->cl;
-	STARPU_ASSERT(cl);
-
-	if (cl->model && cl->model->benchmarking)
-		calibrate_model = 1;
-
-	ret = _starpu_fetch_task_input(j, mask);
-	if (ret != 0)
+  starpu_scc_kernel_t kernel = NULL;
+
+  starpu_scc_func_t func = _starpu_task_get_scc_nth_implementation(j->task->cl, j->nimpl);
+  if (func)
+    {
+      /* We execute the function contained in the codelet, it must return a
+       * pointer to the function to execute on the device, either specified
+       * directly by the user or by a call to starpu_scc_get_kernel().
+       */
+      kernel = func();
+    }
+  else
+    {
+      /* If user doesn't define any starpu_scc_func_t in cl->scc_funcs we try to use
+       * cpu_funcs_name.
+       */
+      char *func_name = _starpu_task_get_cpu_name_nth_implementation(j->task->cl, j->nimpl);
+      if (func_name)
 	{
-		/* there was not enough memory, so the input of
-		 * the codelet cannot be fetched ... put the
-		 * codelet back, and try it later */
-		return -EAGAIN;
-	}
-
+	  starpu_scc_func_symbol_t symbol;
 
-	starpu_scc_kernel_t kernel = NULL;
+	  _starpu_scc_src_register_kernel(&symbol, func_name);
 
-	starpu_scc_func_t func = _starpu_task_get_scc_nth_implementation(j->task->cl, j->nimpl);
-	if (func)
-	{
-		/* We execute the function contained in the codelet, it must return a
-		 * pointer to the function to execute on the device, either specified
-		 * directly by the user or by a call to starpu_scc_get_kernel().
-		 */
-		kernel = func();
+	  kernel = _starpu_scc_src_get_kernel(symbol);
 	}
-	else
-	{
-		/* If user doesn't define any starpu_scc_func_t in cl->scc_funcs we try to use
-		 * cpu_funcs_name.
-		 */
-		char *func_name = _starpu_task_get_cpu_name_nth_implementation(j->task->cl, j->nimpl);
-		if (func_name)
-		{
-			starpu_scc_func_symbol_t symbol;
-
-			_starpu_scc_src_register_kernel(&symbol, func_name);
-
-			kernel = _starpu_scc_src_get_kernel(symbol);
-		}
-	}
-	STARPU_ASSERT(kernel);
-
-	_starpu_driver_start_job(args, j, &codelet_start, 0, profiling);
-
-	_starpu_src_common_execute_kernel_from_task(scc_mp_nodes[args->devid], (void (*)(void)) kernel, 0, task);
-
-	_starpu_driver_end_job(args, j, args->perf_arch, &codelet_end, 0, profiling);
-
-	_starpu_driver_update_job_feedback(j, args, args->perf_arch, &codelet_start, &codelet_end, profiling);
-
-	_starpu_push_task_output(j, mask);
+    }
+  STARPU_ASSERT(kernel);  
 
-
-	return 0;
+  return (void (*)(void))kernel;
 }
 
+
 void _starpu_scc_src_mp_deinit()
 {
 	_starpu_scc_common_unmap_shared_memory();
@@ -320,7 +281,7 @@ int _starpu_scc_copy_sink_to_sink(void *src, unsigned src_node, void *dst, unsig
 
 void *_starpu_scc_src_worker(void *arg)
 {
-	struct _starpu_worker *args = arg;
+	struct _starpu_worker_set *args = arg;
 
 	int devid = args->devid;
 	int workerid = args->workerid;
@@ -350,64 +311,10 @@ void *_starpu_scc_src_worker(void *arg)
 	STARPU_PTHREAD_COND_SIGNAL(&args->ready_cond);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&args->mutex);
 
-	struct _starpu_job * j;
-	struct starpu_task *task;
-	int res;
-
-	while (_starpu_machine_is_running())
-	{
-		_STARPU_TRACE_START_PROGRESS(memnode);
-		_starpu_datawizard_progress(memnode, 1);
-		_STARPU_TRACE_END_PROGRESS(memnode);
-
-		task = _starpu_get_worker_task(args, workerid, memnode);
-		if (!task)
-			continue;
-
-		j = _starpu_get_job_associated_to_task(task);
-
-		/* can a SCC device do that task ? */
-		if (!_STARPU_SCC_MAY_PERFORM(j))
-		{
-			/* this isn't a SCC task */
-			_starpu_push_task_to_workers(task);
-			continue;
-		}
-
-		_starpu_set_current_task(task);
-		args->current_task = j->task;
-
-		res = _starpu_scc_src_execute_job(j, args);
-
-		_starpu_set_current_task(NULL);
-		args->current_task = NULL;
-
-		if (res)
-		{
-			switch (res)
-			{
-				case -EAGAIN:
-					_STARPU_DISP("ouch, SCC could not actually run task %p, putting it back...\n", task);
-					_starpu_push_task_to_workers(task);
-					STARPU_ABORT();
-					continue;
-				default:
-					STARPU_ASSERT(0);
-			}
-		}
-
-		_starpu_handle_job_termination(j);
-	}
+	_starpu_src_common_worker(args, baseworkerid, scc_mp_nodes[mp_nodeid]);
 
 	_STARPU_TRACE_WORKER_DEINIT_START;
 
-	_starpu_handle_all_pending_node_data_requests(memnode);
-
-	/* In case there remains some memory that was automatically
-	 * allocated by StarPU, we release it now. Note that data
-	 * coherency is not maintained anymore at that point ! */
-	_starpu_free_all_automatically_allocated_buffers(memnode);
-
 	_starpu_scc_src_deinit_context(args->devid);
 
 	_STARPU_TRACE_WORKER_DEINIT_END(_STARPU_FUT_SCC_KEY);

+ 1 - 0
src/drivers/scc/driver_scc_source.h

@@ -29,6 +29,7 @@
 
 void _starpu_scc_src_mp_deinit();
 
+void (*_starpu_scc_src_get_kernel_from_job(const struct _starpu_mp_node *,struct _starpu_job *j))(void);
 int _starpu_scc_src_register_kernel(starpu_scc_func_symbol_t *symbol, const char *func_name);
 starpu_scc_kernel_t _starpu_scc_src_get_kernel(starpu_scc_func_symbol_t symbol);
 

+ 3 - 3
src/sched_policies/deque_modeling_policy_data_aware.c

@@ -53,8 +53,6 @@ struct _starpu_dmda_data
 	long int ready_task_cnt;
 };
 
-static double idle_power = 0.0;
-
 /* The dmda scheduling policy uses
  *
  * alpha * T_computation + beta * T_communication + gamma * Consumption
@@ -70,6 +68,7 @@ static double idle_power = 0.0;
 static double alpha = _STARPU_SCHED_ALPHA_DEFAULT;
 static double beta = _STARPU_SCHED_BETA_DEFAULT;
 static double _gamma = _STARPU_SCHED_GAMMA_DEFAULT;
+static double idle_power = 0.0;
 static const float alpha_minimum=0;
 static const float alpha_maximum=10.0;
 static const float beta_minimum=0;
@@ -349,7 +348,7 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 #ifdef HAVE_AYUDAME_H
 	if (AYU_event)
 	{
-		int id = best_workerid;
+		intptr_t id = best_workerid;
 		AYU_event(AYU_ADDTASKTOQUEUE, _starpu_get_job_associated_to_task(task)->job_id, &id);
 	}
 #endif
@@ -866,6 +865,7 @@ static void initialize_dmda_policy(unsigned sched_ctx_id)
 		dt->idle_power = atof(strval_idle_power);
 
 #ifdef STARPU_USE_TOP
+	/* FIXME: broken, needs to access context variable */
 	starpu_top_register_parameter_float("DMDA_ALPHA", &alpha,
 					    alpha_minimum, alpha_maximum, param_modified);
 	starpu_top_register_parameter_float("DMDA_BETA", &beta,

+ 16 - 18
src/sched_policies/parallel_eager.c

@@ -29,8 +29,8 @@ struct _starpu_peager_data
         starpu_pthread_mutex_t policy_mutex;
 };
 
-#define STARPU_NMAXCOMBINED_WORKERS 10
-/* XXX instead of 10, we should use some "MAX combination .."*/
+#define STARPU_NMAXCOMBINED_WORKERS 520 
+/* XXX: instead of hard-coding this maximum, we should compute the real maximum number of worker combinations */
 static int possible_combinations_cnt[STARPU_NMAXWORKERS];
 static int possible_combinations[STARPU_NMAXWORKERS][STARPU_NMAXCOMBINED_WORKERS];
 static int possible_combinations_size[STARPU_NMAXWORKERS][STARPU_NMAXCOMBINED_WORKERS];
@@ -42,15 +42,11 @@ static int possible_combinations_size[STARPU_NMAXWORKERS][STARPU_NMAXCOMBINED_WO
 
 static void peager_add_workers(unsigned sched_ctx_id, int *workerids, unsigned nworkers)
 {
+	_starpu_sched_find_worker_combinations(workerids, nworkers);
 	struct _starpu_peager_data *data = (struct _starpu_peager_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
 	unsigned nbasic_workers = starpu_worker_get_count();
-		
-	_starpu_sched_find_worker_combinations(workerids, nworkers);
-
+	unsigned ncombined_workers= starpu_combined_worker_get_count();
 	unsigned workerid, i;
-	unsigned ncombinedworkers;
-
-	ncombinedworkers = starpu_combined_worker_get_count();
 
 	/* Find the master of each worker. We first assign the worker as its
 	 * own master, and then iterate over the different worker combinations
@@ -67,7 +63,7 @@ static void peager_add_workers(unsigned sched_ctx_id, int *workerids, unsigned n
 	}
 
 
-	for (i = 0; i < ncombinedworkers; i++)
+	for (i = 0; i < ncombined_workers; i++)
 	{
 		workerid = nbasic_workers + i;
 
@@ -77,20 +73,17 @@ static void peager_add_workers(unsigned sched_ctx_id, int *workerids, unsigned n
 		starpu_combined_worker_get_description(workerid, &size, &workers);
 
 		int master = workers[0];
-
 		int j;
 		for (j = 0; j < size; j++)
 		{
 			if (data->master_id[workers[j]] > master)
 				data->master_id[workers[j]] = master;
-
 			int cnt = possible_combinations_cnt[workers[j]]++;
 			possible_combinations[workers[j]][cnt] = workerid;
 			possible_combinations_size[workers[j]][cnt] = size;
 		}
 	}
 
-
 	for(i = 0; i < nworkers; i++)
 	{
 		workerid = workerids[i];
@@ -176,9 +169,11 @@ static int push_task_peager_policy(struct starpu_task *task)
 	{
 		worker = workers->get_next(workers, &it);
 		int master = data->master_id[worker];
-		/* If this is not a CPU, then the worker simply grabs tasks from the fifo */
-		if ((!starpu_worker_is_combined_worker(worker) && starpu_worker_get_type(worker) != STARPU_CPU_WORKER)
-		    || (master == worker))
+		/* If this is not a CPU or a MIC, then the worker simply grabs tasks from the fifo */
+		if ((!starpu_worker_is_combined_worker(worker) && 
+		    starpu_worker_get_type(worker) != STARPU_MIC_WORKER &&
+		    starpu_worker_get_type(worker) != STARPU_CPU_WORKER)  
+			|| (master == worker))
 		{
 			starpu_pthread_mutex_t *sched_mutex;
 			starpu_pthread_cond_t *sched_cond;
@@ -198,8 +193,8 @@ static struct starpu_task *pop_task_peager_policy(unsigned sched_ctx_id)
 
 	int workerid = starpu_worker_get_id();
 
-	/* If this is not a CPU, then the worker simply grabs tasks from the fifo */
-	if (starpu_worker_get_type(workerid) != STARPU_CPU_WORKER)
+	/* If this is not a CPU or a MIC, then the worker simply grabs tasks from the fifo */
+	if (starpu_worker_get_type(workerid) != STARPU_CPU_WORKER && starpu_worker_get_type(workerid) != STARPU_MIC_WORKER)
 	{
 		struct starpu_task *task = NULL;
 		STARPU_PTHREAD_MUTEX_LOCK(&data->policy_mutex);
@@ -211,6 +206,9 @@ static struct starpu_task *pop_task_peager_policy(unsigned sched_ctx_id)
 
 	int master = data->master_id[workerid];
 
+	//_STARPU_DEBUG("workerid:%d, master:%d\n",workerid,master);
+
+
 	if (master == workerid)
 	{
 		/* The worker is a master */
@@ -248,9 +246,9 @@ static struct starpu_task *pop_task_peager_policy(unsigned sched_ctx_id)
 		/* Is this a basic worker or a combined worker ? */
 		int nbasic_workers = (int)starpu_worker_get_count();
 		int is_basic_worker = (best_workerid < nbasic_workers);
-
 		if (is_basic_worker)
 		{
+
 			/* The master is alone */
 			return task;
 		}

+ 1 - 1
src/sched_policies/parallel_heft.c

@@ -502,7 +502,7 @@ static int parallel_heft_push_task(struct starpu_task *task)
 	return ret_val;
 }
 
-static void parallel_heft_add_workers(unsigned sched_ctx_id, int *workerids, unsigned nworkers)
+static void parallel_heft_add_workers(__attribute__((unused)) unsigned sched_ctx_id, int *workerids, unsigned nworkers)
 {
 	int workerid;
 	unsigned i;

+ 2 - 2
src/sched_policies/random_policy.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -89,7 +89,7 @@ static int _random_push_task(struct starpu_task *task, unsigned prio)
 #ifdef HAVE_AYUDAME_H
 	if (AYU_event)
 	{
-		int id = selected;
+		intptr_t id = selected;
 		AYU_event(AYU_ADDTASKTOQUEUE, _starpu_get_job_associated_to_task(task)->job_id, &id);
 	}
 #endif

+ 1 - 1
src/sched_policies/work_stealing_policy.c

@@ -369,7 +369,7 @@ int ws_push_task(struct starpu_task *task)
 #ifdef HAVE_AYUDAME_H
 	if (AYU_event)
 	{
-		int id = workerid;
+		intptr_t id = workerid;
 		AYU_event(AYU_ADDTASKTOQUEUE, j->job_id, &id);
 	}
 #endif

+ 7 - 2
src/util/starpu_data_cpy.c

@@ -78,8 +78,13 @@ void mp_cpy_kernel(void *descr[], void *cl_arg)
 	void *dst_interface = descr[0];
 	void *src_interface = descr[1];
 
-	STARPU_ASSERT(copy_methods->ram_to_ram);
-	copy_methods->ram_to_ram(src_interface, 0, dst_interface, 0);
+	if(copy_methods->ram_to_ram)
+		copy_methods->ram_to_ram(src_interface, STARPU_MAIN_RAM, dst_interface, STARPU_MAIN_RAM);
+	else if(copy_methods->any_to_any)
+		copy_methods->any_to_any(src_interface, STARPU_MAIN_RAM, dst_interface, STARPU_MAIN_RAM, NULL);
+	else
+		STARPU_ABORT();
+
 }
 
 static starpu_mic_kernel_t mic_cpy_func()

+ 3 - 3
tests/Makefile.am

@@ -15,10 +15,10 @@
 #
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
-AM_CFLAGS = $(HWLOC_CFLAGS) $(FXT_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(STARPU_COI_CPPFLAGS) $(GLOBAL_AM_CFLAGS) -Wno-unused
+AM_CFLAGS = $(HWLOC_CFLAGS) $(FXT_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(STARPU_COI_CPPFLAGS) $(STARPU_SCIF_CPPFLAGS) $(GLOBAL_AM_CFLAGS) -Wno-unused
 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ $(HWLOC_LIBS) $(FXT_LIBS) @LIBS@
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_builddir)/src -I$(top_srcdir)/src/
-AM_LDFLAGS = @STARPU_EXPORT_DYNAMIC@ $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(STARPU_COI_LDFLAGS) $(FXT_LDFLAGS)
+AM_LDFLAGS = @STARPU_EXPORT_DYNAMIC@ $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS) $(FXT_LDFLAGS)
 
 EXTRA_DIST =					\
 	helper.h				\
@@ -611,7 +611,7 @@ endif
 perfmodels_feed_SOURCES=\
 	perfmodels/feed.c
 
-sched_policies_execute_all_tasks_LDFLAGS = -lm
+sched_policies_execute_all_tasks_LDFLAGS = $(AM_LDFLAGS) -lm
 
 showcheck:
 	-cat $(TEST_LOGS) /dev/null

+ 2 - 0
tests/datawizard/acquire_cb_insert.c

@@ -77,6 +77,8 @@ int main(int argc, char **argv)
 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
+	if(starpu_cpu_worker_get_count() == 0) return STARPU_TEST_SKIPPED;
+
 	/* Declare x */
 	starpu_variable_data_register(&x_handle, STARPU_MAIN_RAM, (uintptr_t)&x, sizeof(x));
 

+ 6 - 2
tests/datawizard/commute.c

@@ -85,7 +85,9 @@ void end(void *descr[], void *_args STARPU_ATTRIBUTE_UNUSED)
 {
 	int *x = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
 
-	if (codelet_end.modes[0] & STARPU_W)
+	enum starpu_data_access_mode end_mode = *(enum starpu_data_access_mode*) _args;
+
+	if (end_mode & STARPU_W)
 		(*x)++;
 }
 
@@ -105,7 +107,7 @@ static void test(enum starpu_data_access_mode begin_mode, enum starpu_data_acces
 	int ret;
 
 	codelet_begin.modes[0] = begin_mode;
-	codelet_end.modes[0] = end_mode;
+	codelet_end.modes[0] = end_mode;	
 
 	begin_t = starpu_task_create();
 	begin_t->cl = &codelet_begin;
@@ -130,6 +132,8 @@ static void test(enum starpu_data_access_mode begin_mode, enum starpu_data_acces
 	end_t->cl = &codelet_end;
 	end_t->handles[0] = x_handle;
 	end_t->detach = 0;
+	end_t->cl_arg = &end_mode;
+	end_t->cl_arg_size = sizeof(end_mode);
 
 	if (starpu_task_submit(begin_t) == -ENODEV)
 		exit(STARPU_TEST_SKIPPED);

+ 2 - 0
tests/datawizard/data_invalidation.c

@@ -141,6 +141,8 @@ int main(int argc, char **argv)
 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
+	if(starpu_cpu_worker_get_count() == 0) return STARPU_TEST_SKIPPED;
+
 	/* The buffer should never be explicitely allocated */
 	starpu_vector_data_register(&v_handle, (uint32_t)-1, (uintptr_t)NULL, VECTORSIZE, sizeof(char));
 

+ 1 - 1
tests/datawizard/interfaces/coo/coo_interface.c

@@ -21,7 +21,7 @@
 #define MATRIX_SIZE (NX*NY)
 
 #if defined(STARPU_USE_CPU) || defined(STAPRU_USE_MIC)
-static void test_coo_cpu_func(void *buffers[], void *args);
+void test_coo_cpu_func(void *buffers[], void *args);
 #endif
 #ifdef STARPU_USE_CUDA
 extern void test_coo_cuda_func(void *buffers[], void *args);

+ 1 - 1
tests/disk/disk_copy.c

@@ -41,7 +41,7 @@ int main(int argc, char **argv)
 	if (ret == -ENODEV) goto enodev;
 
 	/* register a disk */
-	int new_dd = starpu_disk_register(&starpu_disk_leveldb_ops, (void *) "/tmp/tsss", 1024*1024*200);
+	int new_dd = starpu_disk_register(&starpu_disk_stdio_ops, (void *) "/tmp", 1024*1024*200);
 	/* can't write on /tmp/ */
 	if (new_dd == -ENOENT) goto enoent;
 	

+ 0 - 1
tests/errorcheck/invalid_blocking_calls.c

@@ -47,7 +47,6 @@ static struct starpu_codelet wrong_codelet =
 	.cpu_funcs = {wrong_func, NULL},
 	.cuda_funcs = {wrong_func, NULL},
         .opencl_funcs = {wrong_func, NULL},
-	.cpu_funcs_name = {"wrong_func", NULL},
 	.model = NULL,
 	.nbuffers = 0
 };

+ 0 - 0
tests/loader-cross.sh.in


Too many files were changed in this changeset; some files are not shown.