
Merge from trunk @11364:11518

Marc Sergent 11 years ago
parent
commit
596effeffa
100 changed files with 1187 additions and 693 deletions
  1. 6 0
      ChangeLog
  2. 2 2
      configure.ac
  3. 5 0
      doc/Makefile.am
  4. 1 1
      doc/doxygen/chapters/advanced_examples.doxy
  5. 3 1
      doc/doxygen/chapters/api/scheduling_contexts.doxy
  6. 4 0
      doc/doxygen/chapters/api/scheduling_policy.doxy
  7. 21 1
      doc/doxygen/chapters/environment_variables.doxy
  8. 17 0
      doc/doxygen/chapters/optimize_performance.doxy
  9. 13 11
      doc/doxygen/chapters/tips_and_tricks.doxy
  10. 0 3
      include/starpu.h
  11. 2 0
      include/starpu_bitmap.h
  12. 2 1
      include/starpu_sched_ctx_hypervisor.h
  13. 128 18
      include/starpu_task_list.h
  14. 4 0
      include/starpu_thread.h
  15. 47 1
      include/starpu_thread_util.h
  16. 10 0
      include/starpu_util.h
  17. 19 18
      mpi/src/starpu_mpi.c
  18. 1 0
      mpi/src/starpu_mpi_collective.c
  19. 7 0
      mpi/src/starpu_mpi_task_insert.c
  20. 8 0
      sc_hypervisor/include/sc_hypervisor_monitoring.h
  21. 2 2
      sc_hypervisor/src/hypervisor_policies/feft_lp_policy.c
  22. 9 2
      sc_hypervisor/src/policies_utils/lp_tools.c
  23. 1 1
      sc_hypervisor/src/policies_utils/speed.c
  24. 112 100
      sc_hypervisor/src/sc_hypervisor.c
  25. 2 0
      sc_hypervisor/src/sc_hypervisor_intern.h
  26. 0 2
      src/Makefile.am
  27. 1 0
      src/common/barrier.c
  28. 1 0
      src/common/barrier.h
  29. 17 3
      src/common/barrier_counter.c
  30. 5 3
      src/common/barrier_counter.h
  31. 11 0
      src/common/fxt.h
  32. 45 0
      src/common/thread.c
  33. 14 0
      src/common/utils.c
  34. 2 0
      src/common/utils.h
  35. 4 1
      src/core/debug.h
  36. 3 2
      src/core/dependencies/tags.c
  37. 10 7
      src/core/detect_combined_workers.c
  38. 2 2
      src/core/jobs.c
  39. 2 0
      src/core/perfmodel/perfmodel.h
  40. 2 2
      src/core/perfmodel/perfmodel_bus.c
  41. 0 6
      src/core/perfmodel/perfmodel_history.c
  42. 1 0
      src/core/perfmodel/perfmodel_nan.c
  43. 3 7
      src/core/progress_hook.c
  44. 100 7
      src/core/sched_ctx.c
  45. 23 0
      src/core/sched_ctx.h
  46. 4 27
      src/core/sched_policy.c
  47. 163 91
      src/core/task.c
  48. 3 8
      src/core/task.h
  49. 22 13
      src/core/topology.c
  50. 5 0
      src/core/workers.c
  51. 4 3
      src/core/workers.h
  52. 20 1
      src/datawizard/coherency.c
  53. 52 23
      src/datawizard/data_request.c
  54. 2 1
      src/datawizard/data_request.h
  55. 4 2
      src/datawizard/datawizard.c
  56. 3 1
      src/datawizard/filters.c
  57. 1 0
      src/datawizard/footprint.c
  58. 5 5
      src/datawizard/interfaces/block_interface.c
  59. 5 4
      src/datawizard/interfaces/data_interface.c
  60. 15 9
      src/datawizard/interfaces/matrix_interface.c
  61. 37 17
      src/datawizard/malloc.c
  62. 2 0
      src/datawizard/malloc.h
  63. 72 65
      src/datawizard/memalloc.c
  64. 2 1
      src/datawizard/memalloc.h
  65. 1 1
      src/datawizard/user_interactions.c
  66. 5 0
      src/debug/traces/starpu_fxt.c
  67. 7 3
      src/debug/traces/starpu_paje.c
  68. 2 3
      src/drivers/cpu/driver_cpu.c
  69. 7 6
      src/drivers/cuda/driver_cuda.c
  70. 1 0
      src/drivers/disk/driver_disk.c
  71. 6 5
      src/drivers/opencl/driver_opencl.c
  72. 2 1
      src/profiling/profiling.h
  73. 1 0
      src/sched_policies/eager_central_priority_policy.c
  74. 1 2
      src/sched_policies/parallel_eager.c
  75. 12 0
      src/sched_policies/parallel_heft.c
  76. 1 0
      src/sched_policies/work_stealing_policy.c
  77. 1 0
      src/util/starpu_task_insert.c
  78. 0 158
      src/util/starpu_task_list_inline.h
  79. 2 1
      tests/Makefile.am
  80. 1 0
      tests/datawizard/acquire_cb.c
  81. 1 0
      tests/datawizard/acquire_cb_insert.c
  82. 2 0
      tests/datawizard/acquire_release.c
  83. 2 0
      tests/datawizard/acquire_release2.c
  84. 3 3
      tests/datawizard/allocate.c
  85. 1 7
      tests/datawizard/commute.c
  86. 1 1
      tests/datawizard/critical_section_with_void_interface.c
  87. 2 1
      tests/datawizard/cuda_codelet_unsigned_inc.cu
  88. 1 1
      tests/datawizard/data_implicit_deps.c
  89. 1 1
      tests/datawizard/data_invalidation.c
  90. 2 1
      tests/datawizard/dining_philosophers.c
  91. 1 3
      tests/datawizard/increment_redux.c
  92. 1 3
      tests/datawizard/increment_redux_lazy.c
  93. 0 2
      tests/datawizard/increment_redux_v2.c
  94. 0 1
      tests/datawizard/interfaces/block/block_interface.c
  95. 2 0
      tests/datawizard/interfaces/multiformat/advanced/generic.c
  96. 2 2
      tests/datawizard/lazy_unregister.c
  97. 2 2
      tests/datawizard/manual_reduction.c
  98. 4 8
      tests/datawizard/mpi_like.c
  99. 16 2
      src/util/starpu_inlines.c
  100. 0 0
      tests/datawizard/readers_and_writers.c

+ 6 - 0
ChangeLog

@@ -54,6 +54,8 @@ New features:
     cudaMalloc overhead.
   * Prefetching is now done for all schedulers when it can be done whatever
     the scheduling decision.
+  * Add a watchdog which makes it easy to trigger a crash when StarPU gets
+    stuck.
 
 Small features:
   * New functions starpu_data_acquire_cb_sequential_consistency() and
@@ -89,6 +91,10 @@ Changes:
   * Function starpu_sched_ctx_create() now takes a variable argument
     list to define the scheduler to be used, and the minimum and
     maximum priority values
+  * The functions starpu_sched_set/get_min/max_priority set/get the
+    priorities of the current scheduling context, i.e. the one which
+    was set by a call to starpu_sched_ctx_set_context() or the initial
+    context if the function was not called yet.
 
 StarPU 1.1.0 (svn revision xxxx)
 ==============================================

+ 2 - 2
configure.ac

@@ -1536,8 +1536,8 @@ AC_DEFINE_UNQUOTED(STARPU_MAXNODES, [$maxnodes],
 
 
 AC_MSG_CHECKING(whether allocation cache should be used)
-AC_ARG_ENABLE(allocation-cache, [AS_HELP_STRING([--enable-allocation-cache],
-			[enable data allocation cache])],
+AC_ARG_ENABLE(allocation-cache, [AS_HELP_STRING([--disable-allocation-cache],
+			[disable data allocation cache])],
 			enable_allocation_cache=$enableval, enable_allocation_cache=yes)
 AC_MSG_RESULT($enable_allocation_cache)
 if test x$enable_allocation_cache = xyes; then

+ 5 - 0
doc/Makefile.am

@@ -28,3 +28,8 @@ EXTRA_DIST =    tutorial/hello_world.c \
 
 txtdir = ${docdir}/tutorial
 txt_DATA = $(EXTRA_DIST)
+
+showcheck:
+	for i in $(SUBDIRS) ; do \
+		make -C $$i showcheck ; \
+	done

+ 1 - 1
doc/doxygen/chapters/advanced_examples.doxy

@@ -347,7 +347,7 @@ tasks with varying size so that the regression can be computed. StarPU will not
 trust the regression unless there is at least 10% difference between the minimum
 and maximum observed input size. It can be useful to set the
 environment variable \ref STARPU_CALIBRATE to <c>1</c> and run the application
-on varying input sizes with \ref STARPU_SCHED set to <c>eager</c> scheduler,
+on varying input sizes with \ref STARPU_SCHED set to <c>dmda</c> scheduler,
 so as to feed the performance model for a variety of
 inputs. The application can also provide the measurements explicitly by
 using the function starpu_perfmodel_update_history(). The tools

+ 3 - 1
doc/doxygen/chapters/api/scheduling_contexts.doxy

@@ -124,7 +124,9 @@ Set the scheduling context the subsequent tasks will be submitted to
 
 \fn unsigned starpu_sched_ctx_get_context(void)
 \ingroup API_Scheduling_Contexts
-Return the scheduling context the tasks are currently submitted to
+Return the scheduling context the tasks are currently submitted to,
+or ::STARPU_NMAX_SCHED_CTXS if no default context has been defined
+by calling the function starpu_sched_ctx_set_context().
 
 \fn void starpu_sched_ctx_stop_task_submission(void)
 \ingroup API_Scheduling_Contexts
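
To illustrate the documented return value (this sketch is not part of the commit; the helper is hypothetical), an application can detect whether a default scheduling context was set before tagging a task with it:

#include <starpu.h>
#include <starpu_sched_ctx.h>

/* Use the current context if one was set with starpu_sched_ctx_set_context(),
 * otherwise leave the task in the initial context. */
static void submit_in_current_context(struct starpu_task *task)
{
	unsigned ctx = starpu_sched_ctx_get_context();

	if (ctx != STARPU_NMAX_SCHED_CTXS)
		task->sched_ctx = ctx; /* a default context was defined */

	starpu_task_submit(task);
}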

+ 4 - 0
doc/doxygen/chapters/api/scheduling_policy.doxy

@@ -76,6 +76,7 @@ block and wake up all workers.
 
 \fn int starpu_sched_set_min_priority(int min_prio)
 \ingroup API_Scheduling_Policy
+TODO: check if this is correct
 Defines the minimum task priority level supported by the scheduling
 policy. The default minimum priority level is the same as the default
 priority level which is 0 by convention. The application may access
@@ -86,6 +87,7 @@ application.
 
 \fn int starpu_sched_set_max_priority(int max_prio)
 \ingroup API_Scheduling_Policy
+TODO: check if this is correct
 Defines the maximum priority level supported by the scheduling policy.
 The default maximum priority level is 1. The application may access
 that value by calling the function starpu_sched_get_max_priority().
@@ -95,11 +97,13 @@ application.
 
 \fn int starpu_sched_get_min_priority(void)
 \ingroup API_Scheduling_Policy
+TODO: check if this is correct
 Returns the current minimum priority level supported by the scheduling
 policy
 
 \fn int starpu_sched_get_max_priority(void)
 \ingroup API_Scheduling_Policy
+TODO: check if this is correct
 Returns the current maximum priority level supported by the scheduling
 policy
 

+ 21 - 1
doc/doxygen/chapters/environment_variables.doxy

@@ -257,7 +257,8 @@ Disable asynchronous copies between CPU and MIC devices.
 <dd>
 \anchor STARPU_ENABLE_CUDA_GPU_GPU_DIRECT
 \addindex __env__STARPU_ENABLE_CUDA_GPU_GPU_DIRECT
-Enable direct CUDA transfers from GPU to GPU, without copying through RAM.
+Enable (1) or disable (0) direct CUDA transfers from GPU to GPU, without copying
+through RAM. It is enabled by default.
 This permits to test the performance effect of GPU-Direct.
 </dd>
 
@@ -548,6 +549,25 @@ When set to 0, data statistics will not be displayed at the
 end of the execution of an application (\ref DataStatistics).
 </dd>
 
+<dt>STARPU_WATCHDOG_TIMEOUT</dt>
+<dd>
+\anchor STARPU_WATCHDOG_TIMEOUT
+\addindex __env__STARPU_WATCHDOG_TIMEOUT
+When set to a value other than 0, makes StarPU print an error message
+whenever it does not terminate any task for the given amount of time
+(expressed in µs, e.g. 10000 for 10ms). Should be used in combination
+with \ref STARPU_WATCHDOG_CRASH (see \ref DetectionStuckConditions).
+</dd>
+
+<dt>STARPU_WATCHDOG_CRASH</dt>
+<dd>
+\anchor STARPU_WATCHDOG_CRASH
+\addindex __env__STARPU_WATCHDOG_CRASH
+When set to a value other than 0, triggers a crash when the watchdog
+timeout is reached, thus making it possible to catch the situation in
+gdb, etc. (see \ref DetectionStuckConditions).
+</dd>
+
 </dl>
 
 \section ConfiguringTheHypervisor Configuring The Hypervisor

+ 17 - 0
doc/doxygen/chapters/optimize_performance.doxy

@@ -382,6 +382,23 @@ Statistics on the execution can then be obtained by using <c>export
 STARPU_BUS_STATS=1</c> and <c>export STARPU_WORKER_STATS=1</c> .
  More details on performance feedback are provided by the next chapter.
 
+\section DetectionStuckConditions Detecting Stuck Conditions
+
+It may happen that for some reason StarPU does not make progress for a long
+period of time.  This is sometimes due to contention inside StarPU, but it can
+also have external causes, such as a stuck MPI or CUDA driver.
+
+<c>export STARPU_WATCHDOG_TIMEOUT=10000</c>
+
+makes StarPU print an error message whenever it does not terminate any task
+for 10ms (the timeout is expressed in µs). In addition to that,
+
+<c>export STARPU_WATCHDOG_CRASH=1</c>
+
+triggers a crash in that condition, thus making it possible to catch the
+situation in gdb, etc.
+
 \section CUDA-specificOptimizations CUDA-specific Optimizations
 
 Due to CUDA limitations, StarPU will have a hard time overlapping its own

+ 13 - 11
doc/doxygen/chapters/tips_and_tricks.doxy

@@ -15,7 +15,7 @@ may run on the machine. For instance, a C++ computation class which is not
 thread-safe by itself, but for which several instantiated objects of that class
 can be used concurrently. This can be used in StarPU by initializing one such
 object per worker. For instance, the libstarpufft example does the following to
-be able to use FFTW.
+be able to use FFTW on CPUs.
 
 Some global array stores the instantiated objects:
 
@@ -49,21 +49,23 @@ static void fft(void *descr[], void *_args)
 }
 \endcode
 
-Another way to go which may be needed is to execute some code from the workers
-themselves thanks to starpu_execute_on_each_worker(). This may be required
-by CUDA to behave properly due to threading issues. For instance, StarPU's
-starpu_cublas_init() looks like the following to call
-<c>cublasInit</c> from the workers themselves:
+This however is not sufficient for FFT on CUDA: initialization has
+to be done from the workers themselves.  This can be done thanks to
+starpu_execute_on_each_worker().  For instance libstarpufft does the following.
 
 \code{.c}
-static void init_cublas_func(void *args STARPU_ATTRIBUTE_UNUSED)
+static void fft_plan_gpu(void *args)
 {
-    cublasStatus cublasst = cublasInit();
-    cublasSetKernelStream(starpu_cuda_get_local_stream());
+    plan plan = args;
+    int n2 = plan->n2[0];
+    int workerid = starpu_worker_get_id();
+
+    cufftPlan1d(&plan->plans[workerid].plan_cuda, n2, _CUFFT_C2C, 1);
+    cufftSetStream(plan->plans[workerid].plan_cuda, starpu_cuda_get_local_stream());
 }
-void starpu_cublas_init(void)
+void starpufft_plan(void)
 {
-    starpu_execute_on_each_worker(init_cublas_func, NULL, STARPU_CUDA);
+    starpu_execute_on_each_worker(fft_plan_gpu, plan, STARPU_CUDA);
 }
 \endcode
 

+ 0 - 3
include/starpu.h

@@ -53,9 +53,6 @@ typedef UINT_PTR uintptr_t;
 #include <starpu_worker.h>
 #include <starpu_task.h>
 #include <starpu_task_list.h>
-#ifdef BUILDING_STARPU
-#include <util/starpu_task_list_inline.h>
-#endif
 #include <starpu_task_util.h>
 #include <starpu_sched_ctx.h>
 #include <starpu_expert.h>

+ 2 - 0
include/starpu_bitmap.h

@@ -44,4 +44,6 @@ int starpu_bitmap_first(struct starpu_bitmap *);
 int starpu_bitmap_last(struct starpu_bitmap *);
 //return the index of bit right after e, -1 if none
 int starpu_bitmap_next(struct starpu_bitmap *, int e);
+int starpu_bitmap_has_next(struct starpu_bitmap * b, int e);
+
 #endif
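
For reference, a minimal iteration sketch over such a bitmap (not part of the commit; it assumes starpu_bitmap_first() also returns -1 on an empty bitmap, consistent with starpu_bitmap_next()):

#include <starpu_bitmap.h>

static void visit_set_bits(struct starpu_bitmap *b)
{
	int e;
	/* starpu_bitmap_next() returns the bit right after e, or -1 if none. */
	for (e = starpu_bitmap_first(b); e != -1; e = starpu_bitmap_next(b, e))
	{
		/* ... work on bit e ... */
	}
}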

+ 2 - 1
include/starpu_sched_ctx_hypervisor.h

@@ -29,7 +29,8 @@ struct starpu_sched_ctx_performance_counters
 	void (*notify_idle_cycle)(unsigned sched_ctx_id, int worker, double idle_time);
 	void (*notify_poped_task)(unsigned sched_ctx_id, int worker);
 	void (*notify_pushed_task)(unsigned sched_ctx_id, int worker);
-	void (*notify_post_exec_task)(struct starpu_task *task, size_t data_size, uint32_t footprint, int hypervisor_tag);
+	void (*notify_post_exec_task)(struct starpu_task *task, size_t data_size, uint32_t footprint, int hypervisor_tag,
+				      int nready_tasks, double nready_flops);
 	void (*notify_submitted_job)(struct starpu_task *task, uint32_t footprint, size_t data_size);
 	void (*notify_ready_task)(unsigned sched_ctx_id, struct starpu_task *task);
 	void (*notify_empty_ctx)(unsigned sched_ctx_id, struct starpu_task *task);

+ 128 - 18
include/starpu_task_list.h

@@ -18,6 +18,7 @@
 #define __STARPU_TASK_LIST_H__
 
 #include <starpu_task.h>
+#include <starpu_util.h>
 
 #ifdef __cplusplus
 extern "C"
@@ -30,25 +31,134 @@ struct starpu_task_list
 	struct starpu_task *tail;
 };
 
-/* If we are building starpu and not using gnu inline, we have to avoid
- * declaring the functions extern, as in that case the compiler will compile
- * the inline into all .o files! */
-#if !defined(BUILDING_STARPU) || defined(__GNUC_GNU_INLINE__)
-
-void starpu_task_list_init(struct starpu_task_list *list);
-void starpu_task_list_push_front(struct starpu_task_list *list, struct starpu_task *task);
-void starpu_task_list_push_back(struct starpu_task_list *list, struct starpu_task *task);
-struct starpu_task *starpu_task_list_front(struct starpu_task_list *list);
-struct starpu_task *starpu_task_list_back(struct starpu_task_list *list);
-int starpu_task_list_empty(struct starpu_task_list *list);
-void starpu_task_list_erase(struct starpu_task_list *list, struct starpu_task *task);
-struct starpu_task *starpu_task_list_pop_front(struct starpu_task_list *list);
-struct starpu_task *starpu_task_list_pop_back(struct starpu_task_list *list);
-struct starpu_task *starpu_task_list_begin(struct starpu_task_list *list);
-struct starpu_task *starpu_task_list_end(struct starpu_task_list *list);
-struct starpu_task *starpu_task_list_next(struct starpu_task *task);
+static STARPU_INLINE
+void starpu_task_list_init(struct starpu_task_list *list)
+{
+	list->head = NULL;
+	list->tail = NULL;
+}
 
-#endif
+static STARPU_INLINE
+void starpu_task_list_push_front(struct starpu_task_list *list,
+				struct starpu_task *task)
+{
+	if (list->tail == NULL)
+	{
+		list->tail = task;
+	}
+	else
+	{
+		list->head->prev = task;
+	}
+
+	task->prev = NULL;
+	task->next = list->head;
+	list->head = task;
+}
+
+static STARPU_INLINE
+void starpu_task_list_push_back(struct starpu_task_list *list,
+				struct starpu_task *task)
+{
+	if (list->head == NULL)
+	{
+		list->head = task;
+	}
+	else
+	{
+		list->tail->next = task;
+	}
+
+	task->next = NULL;
+	task->prev = list->tail;
+	list->tail = task;
+}
+
+static STARPU_INLINE
+struct starpu_task *starpu_task_list_front(struct starpu_task_list *list)
+{
+	return list->head;
+}
+
+static STARPU_INLINE
+struct starpu_task *starpu_task_list_back(struct starpu_task_list *list)
+{
+	return list->tail;
+}
+
+static STARPU_INLINE
+int starpu_task_list_empty(struct starpu_task_list *list)
+{
+	return (list->head == NULL);
+}
+
+static STARPU_INLINE
+void starpu_task_list_erase(struct starpu_task_list *list,
+				struct starpu_task *task)
+{
+	struct starpu_task *p = task->prev;
+
+	if (p)
+	{
+		p->next = task->next;
+	}
+	else
+	{
+		list->head = task->next;
+	}
+
+	if (task->next)
+	{
+		task->next->prev = p;
+	}
+	else
+	{
+		list->tail = p;
+	}
+
+	task->prev = NULL;
+	task->next = NULL;
+}
+
+static STARPU_INLINE
+struct starpu_task *starpu_task_list_pop_front(struct starpu_task_list *list)
+{
+	struct starpu_task *task = list->head;
+
+	if (task)
+		starpu_task_list_erase(list, task);
+
+	return task;
+}
+
+static STARPU_INLINE
+struct starpu_task *starpu_task_list_pop_back(struct starpu_task_list *list)
+{
+	struct starpu_task *task = list->tail;
+
+	if (task)
+		starpu_task_list_erase(list, task);
+
+	return task;
+}
+
+static STARPU_INLINE
+struct starpu_task *starpu_task_list_begin(struct starpu_task_list *list)
+{
+	return list->head;
+}
+
+static STARPU_INLINE
+struct starpu_task *starpu_task_list_end(struct starpu_task_list *list STARPU_ATTRIBUTE_UNUSED)
+{
+	return NULL;
+}
+
+static STARPU_INLINE
+struct starpu_task *starpu_task_list_next(struct starpu_task *task)
+{
+	return task->next;
+}
 
 #ifdef __cplusplus
 }
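
For reference, a minimal usage sketch of the inlined list API above (not part of the commit; the codelet passed in is assumed to be defined by the application):

#include <starpu.h>
#include <starpu_task_list.h>

static void run_batch(struct starpu_codelet *cl, int n)
{
	struct starpu_task_list list;
	struct starpu_task *task;
	int i;

	starpu_task_list_init(&list);

	/* Build a private FIFO of tasks before handing them to StarPU. */
	for (i = 0; i < n; i++)
	{
		task = starpu_task_create();
		task->cl = cl;
		starpu_task_list_push_back(&list, task);
	}

	/* Drain the list in FIFO order and submit each task. */
	while (!starpu_task_list_empty(&list))
	{
		task = starpu_task_list_pop_front(&list);
		starpu_task_submit(task);
	}
}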

+ 4 - 0
include/starpu_thread.h

@@ -160,7 +160,9 @@ typedef int starpu_pthread_rwlockattr_t;
 int starpu_pthread_rwlock_init(starpu_pthread_rwlock_t *rwlock, const starpu_pthread_rwlockattr_t *attr);
 int starpu_pthread_rwlock_destroy(starpu_pthread_rwlock_t *rwlock);
 int starpu_pthread_rwlock_rdlock(starpu_pthread_rwlock_t *rwlock);
+int starpu_pthread_rwlock_tryrdlock(starpu_pthread_rwlock_t *rwlock);
 int starpu_pthread_rwlock_wrlock(starpu_pthread_rwlock_t *rwlock);
+int starpu_pthread_rwlock_trywrlock(starpu_pthread_rwlock_t *rwlock);
 int starpu_pthread_rwlock_unlock(starpu_pthread_rwlock_t *rwlock);
 
 #elif !defined(_MSC_VER) /* STARPU_SIMGRID */
@@ -172,7 +174,9 @@ typedef pthread_rwlockattr_t starpu_pthread_rwlockattr_t;
 #define starpu_pthread_rwlock_destroy pthread_rwlock_destroy
 
 int starpu_pthread_rwlock_rdlock(starpu_pthread_rwlock_t *rwlock);
+int starpu_pthread_rwlock_tryrdlock(starpu_pthread_rwlock_t *rwlock);
 int starpu_pthread_rwlock_wrlock(starpu_pthread_rwlock_t *rwlock);
+int starpu_pthread_rwlock_trywrlock(starpu_pthread_rwlock_t *rwlock);
 int starpu_pthread_rwlock_unlock(starpu_pthread_rwlock_t *rwlock);
 
 #endif /* STARPU_SIMGRID, _MSC_VER */

+ 47 - 1
include/starpu_thread_util.h

@@ -18,7 +18,8 @@
 #ifndef __STARPU_THREAD_UTIL_H__
 #define __STARPU_THREAD_UTIL_H__
 
-#include <starpu.h>
+#include <starpu_util.h>
+#include <errno.h>
 
 /*
  * Encapsulation of the starpu_pthread_create_* functions.
@@ -78,6 +79,21 @@
 	}                                                                      \
 } while (0)
 
+#define STARPU_PTHREAD_MUTEX_TRYLOCK(mutex) \
+	_STARPU_PTHREAD_MUTEX_TRYLOCK(mutex, __FILE__, __LINE__)
+static STARPU_INLINE
+int _STARPU_PTHREAD_MUTEX_TRYLOCK(starpu_pthread_mutex_t *mutex, char *file, int line)
+{
+	int p_ret = starpu_pthread_mutex_trylock(mutex);
+	if (STARPU_UNLIKELY(p_ret != 0 && p_ret != EBUSY)) {
+		fprintf(stderr,
+			"%s:%d starpu_pthread_mutex_trylock: %s\n",
+			file, line, strerror(p_ret));
+		STARPU_ABORT();
+	}
+	return p_ret;
+}
+
 #define STARPU_PTHREAD_MUTEX_UNLOCK(mutex) do {                               \
 	int p_ret = starpu_pthread_mutex_unlock(mutex);                        \
 	if (STARPU_UNLIKELY(p_ret)) {                                          \
@@ -143,6 +159,21 @@
 	}                                                                      \
 } while (0)
 
+#define STARPU_PTHREAD_RWLOCK_TRYRDLOCK(rwlock) \
+	_starpu_pthread_rwlock_tryrdlock(rwlock, __FILE__, __LINE__)
+static STARPU_INLINE
+int _starpu_pthread_rwlock_tryrdlock(starpu_pthread_rwlock_t *rwlock, char *file, int line)
+{
+	int p_ret = starpu_pthread_rwlock_tryrdlock(rwlock);
+	if (STARPU_UNLIKELY(p_ret != 0 && p_ret != EBUSY)) {
+		fprintf(stderr,
+			"%s:%d starpu_pthread_rwlock_tryrdlock: %s\n",
+			file, line, strerror(p_ret));
+		STARPU_ABORT();
+	}
+	return p_ret;
+}
+
 #define STARPU_PTHREAD_RWLOCK_WRLOCK(rwlock) do {                              \
 	int p_ret = starpu_pthread_rwlock_wrlock(rwlock);                      \
 	if (STARPU_UNLIKELY(p_ret)) {                                          \
@@ -153,6 +184,21 @@
 	}                                                                      \
 } while (0)
 
+#define STARPU_PTHREAD_RWLOCK_TRYWRLOCK(rwlock) \
+	_starpu_pthread_rwlock_trywrlock(rwlock, __FILE__, __LINE__)
+static STARPU_INLINE
+int _starpu_pthread_rwlock_trywrlock(starpu_pthread_rwlock_t *rwlock, char *file, int line)
+{
+	int p_ret = starpu_pthread_rwlock_trywrlock(rwlock);
+	if (STARPU_UNLIKELY(p_ret != 0 && p_ret != EBUSY)) {
+		fprintf(stderr,
+			"%s:%d starpu_pthread_rwlock_trywrlock: %s\n",
+			file, line, strerror(p_ret));
+		STARPU_ABORT();
+	}
+	return p_ret;
+}
+
 #define STARPU_PTHREAD_RWLOCK_UNLOCK(rwlock) do {                              \
 	int p_ret = starpu_pthread_rwlock_unlock(rwlock);                      \
 	if (STARPU_UNLIKELY(p_ret)) {                                          \
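
For illustration (not part of the commit), the new try-lock wrappers return 0 on success or EBUSY when the lock is already held, and abort on any other error; a minimal sketch, assuming a mutex initialized elsewhere with STARPU_PTHREAD_MUTEX_INITIALIZER:

#include <errno.h>
#include <starpu_thread_util.h>

static starpu_pthread_mutex_t stats_mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;

static void try_update_stats(void)
{
	/* Returns 0 when the lock was taken, EBUSY when somebody else holds it. */
	if (STARPU_PTHREAD_MUTEX_TRYLOCK(&stats_mutex) == EBUSY)
		return; /* skip the update rather than blocking */

	/* ... update shared counters here ... */

	STARPU_PTHREAD_MUTEX_UNLOCK(&stats_mutex);
}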

+ 10 - 0
include/starpu_util.h

@@ -57,6 +57,16 @@ extern "C"
 #  define STARPU_ATTRIBUTE_ALIGNED(size)
 #endif
 
+/* Note that if we're compiling C++, then just use the "inline"
+   keyword, since it's part of C++ */
+#if defined(c_plusplus) || defined(__cplusplus)
+#  define STARPU_INLINE inline
+#elif defined(_MSC_VER) || defined(__HP_cc)
+#  define STARPU_INLINE __inline
+#else
+#  define STARPU_INLINE __inline__
+#endif
+
 #if STARPU_GNUC_PREREQ(3, 1) && !defined(BUILDING_STARPU) && !defined(STARPU_USE_DEPRECATED_API) && !defined(STARPU_USE_DEPRECATED_ONE_ZERO_API)
 #define STARPU_DEPRECATED  __attribute__((__deprecated__))
 #else

+ 19 - 18
mpi/src/starpu_mpi.c

@@ -25,6 +25,7 @@
 #include <common/config.h>
 #include <common/config.h>
 #include <common/thread.h>
 
 static void _starpu_mpi_submit_new_mpi_request(void *arg);
 static void _starpu_mpi_submit_new_mpi_request(void *arg);
 static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req);
 #ifdef STARPU_VERBOSE
 #endif //STARPU_USE_FXT
 #endif //STARPU_USE_FXT
 	}
 
+	_starpu_mpi_comm_amounts_init(MPI_COMM_WORLD);
+	_starpu_mpi_cache_init(MPI_COMM_WORLD);
+
+	{
+		int nb_nodes, k;
+		MPI_Comm_size(MPI_COMM_WORLD, &nb_nodes);
+		_starpu_mpi_app_req_hashmap = malloc(nb_nodes * sizeof(struct _starpu_mpi_req *));
+		for(k=0 ; k<nb_nodes ; k++) _starpu_mpi_app_req_hashmap[k] = NULL;
+		_starpu_mpi_copy_handle_hashmap = malloc(nb_nodes * sizeof(struct _starpu_mpi_copy_handle_hash_list *));
+		for(k=0 ; k<nb_nodes ; k++) _starpu_mpi_copy_handle_hashmap[k] = NULL;
+	}
+
 	/* notify the main thread that the progression thread is ready */
 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	running = 1;
@@ -1534,30 +1548,17 @@ int _starpu_mpi_initialize(int *argc, char ***argv, int initialize_mpi)
 	argc_argv->argc = argc;
 	argc_argv->argv = argv;
 
-	STARPU_PTHREAD_CREATE(&progress_thread, NULL, _starpu_mpi_progress_thread_func, argc_argv);
-
-	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
-	while (!running)
-		STARPU_PTHREAD_COND_WAIT(&cond_progression, &mutex);
-	STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
-
 #ifdef STARPU_MPI_ACTIVITY
 	hookid = starpu_progression_hook_register(progression_hook_func, NULL);
 	STARPU_ASSERT_MSG(hookid >= 0, "starpu_progression_hook_register failed");
 #endif /* STARPU_MPI_ACTIVITY */
 
-	_starpu_mpi_add_sync_point_in_fxt();
-	_starpu_mpi_comm_amounts_init(MPI_COMM_WORLD);
-	_starpu_mpi_cache_init(MPI_COMM_WORLD);
+	STARPU_PTHREAD_CREATE(&progress_thread, NULL, _starpu_mpi_progress_thread_func, argc_argv);
 
-	{
-		int nb_nodes, k;
-		MPI_Comm_size(MPI_COMM_WORLD, &nb_nodes);
-		_starpu_mpi_app_req_hashmap = malloc(nb_nodes * sizeof(struct _starpu_mpi_req *));
-		for(k=0 ; k<nb_nodes ; k++) _starpu_mpi_app_req_hashmap[k] = NULL;
-		_starpu_mpi_copy_handle_hashmap = malloc(nb_nodes * sizeof(struct _starpu_mpi_copy_handle_hash_list *));
-		for(k=0 ; k<nb_nodes ; k++) _starpu_mpi_copy_handle_hashmap[k] = NULL;
-	}
+	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+	while (!running)
+		STARPU_PTHREAD_COND_WAIT(&cond_progression, &mutex);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 
 	return 0;
 }

+ 1 - 0
mpi/src/starpu_mpi_collective.c

@@ -27,6 +27,7 @@ struct _callback_arg
 	int count;
 };
 
+static
 void _callback_collective(void *arg)
 {
 	struct _callback_arg *callback_arg = arg;

+ 7 - 0
mpi/src/starpu_mpi_task_insert.c

@@ -27,6 +27,7 @@
 #include <core/task.h>
 
 #include <starpu_mpi_private.h>
+#include <starpu_mpi_task_insert.h>
 
 /* Whether we are allowed to keep copies of remote data. */
 struct _starpu_data_entry
@@ -64,6 +65,7 @@ void _starpu_mpi_cache_init(MPI_Comm comm)
 	for(i=0 ; i<nb_nodes ; i++) _cache_received_data[i] = NULL;
 }
 
+static
 void _starpu_mpi_cache_empty_tables(int world_size)
 {
 	int i;
@@ -308,6 +310,7 @@ void _starpu_mpi_exchange_data_after_execution(starpu_data_handle_t data, enum s
 	}
 }
 
+static
 void _starpu_mpi_clear_data_after_execution(starpu_data_handle_t data, enum starpu_data_access_mode mode, int me, int do_execute, MPI_Comm comm)
 {
 	if (_cache_enabled)
@@ -363,6 +366,7 @@ void _starpu_mpi_clear_data_after_execution(starpu_data_handle_t data, enum star
 	}
 }
 
+static
 int _starpu_mpi_task_insert_v(MPI_Comm comm, struct starpu_codelet *codelet, va_list varg_list)
 {
 	int arg_type;
@@ -857,6 +861,7 @@ void _starpu_mpi_redux_data_dummy_func(STARPU_ATTRIBUTE_UNUSED void *buffers[],
 {
 }
 
+static
 struct starpu_codelet _starpu_mpi_redux_data_read_cl =
 {
 	.cpu_funcs = {_starpu_mpi_redux_data_dummy_func, NULL},
@@ -877,6 +882,7 @@ struct starpu_codelet _starpu_mpi_redux_data_readwrite_cl =
 	.name = "_starpu_mpi_redux_data_write_cl"
 };
 
+static
 void _starpu_mpi_redux_data_detached_callback(void *arg)
 {
 	struct _starpu_mpi_redux_data_args *args = (struct _starpu_mpi_redux_data_args *) arg;
@@ -889,6 +895,7 @@ void _starpu_mpi_redux_data_detached_callback(void *arg)
 	free(args);
 }
 
+static
 void _starpu_mpi_redux_data_recv_callback(void *callback_arg)
 {
 	struct _starpu_mpi_redux_data_args *args = (struct _starpu_mpi_redux_data_args *) callback_arg;

+ 8 - 0
sc_hypervisor/include/sc_hypervisor_monitoring.h

@@ -47,6 +47,10 @@ struct sc_hypervisor_wrapper
 	/* user configuration meant to limit resizing */
 	struct sc_hypervisor_policy_config *config;
 
+
+	/* the start time of the resizing sample of the workers of this context*/
+	double start_time_w[STARPU_NMAXWORKERS];
+
 	/* idle time of workers in this context */
 	double current_idle_time[STARPU_NMAXWORKERS];
 
@@ -121,6 +125,10 @@ struct sc_hypervisor_wrapper
 
 	/* boolean indicating that a context is being sized */
 	unsigned to_be_sized;
+
+	/* boolean indicating if we add the idle of this worker to 
+	   the idle of the context */
+	unsigned compute_idle[STARPU_NMAXWORKERS];
 };
 
 /* return the wrapper of context that saves its monitoring information */

+ 2 - 2
sc_hypervisor/src/hypervisor_policies/feft_lp_policy.c

@@ -24,8 +24,8 @@ int resize_no = 0;
 static void _try_resizing(unsigned *sched_ctxs, int nsched_ctxs, int *workers, int nworkers)
 {
 	/* for vite */
-	printf("resize_no = %d\n", resize_no);
-	starpu_trace_user_event(resize_no++);
+/* 	printf("resize_no = %d\n", resize_no); */
+/* 	starpu_trace_user_event(resize_no++); */
 	int ns = sched_ctxs == NULL ? sc_hypervisor_get_nsched_ctxs() : nsched_ctxs;
 	unsigned *curr_sched_ctxs = sched_ctxs == NULL ? sc_hypervisor_get_sched_ctxs() : sched_ctxs;
 	unsigned curr_nworkers = nworkers == -1 ? starpu_worker_get_count() : (unsigned)nworkers;

+ 9 - 2
sc_hypervisor/src/policies_utils/lp_tools.c

@@ -729,13 +729,20 @@ void sc_hypervisor_lp_share_remaining_resources(int ns, unsigned *sched_ctxs,  i
 			}
 		}
 		if(!found)
+		{
 			remaining_workers[nw++] = worker;
+		}
 	}
 
 	if(nw > 0)
+	{
 		for(s = 0; s < ns; s++)
-			sc_hypervisor_add_workers_to_sched_ctx(remaining_workers, nw, sched_ctxs[s]);		
-
+		{
+			for(w = 0; w < nw; w++)
+				_sc_hypervisor_allow_compute_idle(sched_ctxs[s], remaining_workers[w], 0);
+			sc_hypervisor_add_workers_to_sched_ctx(remaining_workers, nw, sched_ctxs[s]);
+		}		
+	}
 }
 
 double sc_hypervisor_lp_find_tmax(double t1, double t2)

+ 1 - 1
sc_hypervisor/src/policies_utils/speed.c

@@ -138,7 +138,7 @@ double sc_hypervisor_get_speed_per_worker_type(struct sc_hypervisor_wrapper* sc_
 		{
 			worker = workers->get_next(workers, &it);
 			enum starpu_worker_archtype req_arch = starpu_worker_get_type(worker);
-			if(arch == req_arch)
+			if(arch == req_arch && sc_w->compute_idle[worker])
 			{
 				all_workers_flops += sc_w->elapsed_flops[worker] / 1000000000.0; /*in gflops */
 				if(max_workers_idle_time < sc_w->idle_time[worker])

+ 112 - 100
sc_hypervisor/src/sc_hypervisor.c

@@ -24,7 +24,8 @@ struct starpu_sched_ctx_performance_counters* perf_counters = NULL;
 
 static void notify_idle_cycle(unsigned sched_ctx, int worker, double idle_time);
 static void notify_pushed_task(unsigned sched_ctx, int worker);
-static void notify_post_exec_task(struct starpu_task *task, size_t data_size, uint32_t footprint, int hypervisor_tag);
+static void notify_post_exec_task(struct starpu_task *task, size_t data_size, uint32_t footprint, 
+				  int hypervisor_tag, int nready_tasks, double ready_flops);
 static void notify_poped_task(unsigned sched_ctx, int  worker);
 static void notify_submitted_job(struct starpu_task *task, unsigned footprint, size_t data_size);
 static void notify_ready_task(unsigned sched_ctx, struct starpu_task *task);
@@ -196,6 +197,7 @@ void* sc_hypervisor_init(struct sc_hypervisor_policy *hypervisor_policy)
 		int j;
 		for(j = 0; j < STARPU_NMAXWORKERS; j++)
 		{
+			hypervisor.sched_ctx_w[i].start_time_w[j] = 0.0;
 			hypervisor.sched_ctx_w[i].current_idle_time[j] = 0.0;
 			hypervisor.sched_ctx_w[i].idle_time[j] = 0.0;
 			hypervisor.sched_ctx_w[i].idle_start_time[j] = 0.0;
@@ -208,6 +210,7 @@ void* sc_hypervisor_init(struct sc_hypervisor_policy *hypervisor_policy)
 			hypervisor.sched_ctx_w[i].elapsed_tasks[j] = 0;
 			hypervisor.sched_ctx_w[i].total_elapsed_flops[j] = 0.0;
 			hypervisor.sched_ctx_w[i].worker_to_be_removed[j] = 0;
+			hypervisor.sched_ctx_w[i].compute_idle[j] = 1;
 		}
 	}
 
@@ -472,17 +475,6 @@ double sc_hypervisor_get_total_elapsed_flops_per_sched_ctx(struct sc_hypervisor_
 	return ret_val;
 }
 
-static void _reset_idle_time(unsigned sched_ctx)
-{
-	int i;
-	for(i = 0; i < STARPU_NMAXWORKERS; i++)
-	{
-		hypervisor.sched_ctx_w[sched_ctx].idle_time[i] = 0.0;
-		hypervisor.sched_ctx_w[sched_ctx].idle_start_time[i] = hypervisor.sched_ctx_w[sched_ctx].idle_start_time[i] != 0.0 ? starpu_timing_now() : 0.0;
-	}
-	return;
-}
-
 void _reset_resize_sample_info(unsigned sender_sched_ctx, unsigned receiver_sched_ctx)
 {
 	double start_time =  starpu_timing_now();
@@ -493,32 +485,32 @@ void _reset_resize_sample_info(unsigned sender_sched_ctx, unsigned receiver_sche
 		
 		sender_sc_w->start_time = start_time;
 		_set_elapsed_flops_per_sched_ctx(sender_sched_ctx, 0.0);
-		_reset_idle_time(sender_sched_ctx);
 		int i;
 		for(i = 0; i < STARPU_NMAXWORKERS; i++)
 		{
+			sender_sc_w->start_time_w[i] = start_time;
+			sender_sc_w->idle_time[i] = 0.0;
 			sender_sc_w->idle_start_time[i] = 0.0;
-			hypervisor.sched_ctx_w[sender_sched_ctx].exec_start_time[i] = 0.0;
 			hypervisor.sched_ctx_w[sender_sched_ctx].exec_time[i] = 0.0;
+			hypervisor.sched_ctx_w[sender_sched_ctx].exec_start_time[i] = (hypervisor.sched_ctx_w[sender_sched_ctx].exec_start_time[i] != 0.0) ? starpu_timing_now() : 0.0;
 		}
 		
 	}
 
 	if(receiver_sched_ctx != STARPU_NMAX_SCHED_CTXS)
 	{
-
 		struct sc_hypervisor_wrapper *receiver_sc_w = &hypervisor.sched_ctx_w[receiver_sched_ctx];
 		
 		receiver_sc_w->start_time = start_time;
 		_set_elapsed_flops_per_sched_ctx(receiver_sched_ctx, 0.0);
-		_reset_idle_time(receiver_sched_ctx);
 		int i;
 		for(i = 0; i < STARPU_NMAXWORKERS; i++)
 		{
-			receiver_sc_w->idle_start_time[i] = (hypervisor.sched_ctx_w[receiver_sched_ctx].idle_start_time[i] != 0.0) ? starpu_timing_now() : 0.0;
-			hypervisor.sched_ctx_w[receiver_sched_ctx].exec_start_time[i] = 0.0;
+			receiver_sc_w->start_time_w[i] = (receiver_sc_w->start_time_w[i] != 0.0) ? starpu_timing_now() : 0.0;
+			receiver_sc_w->idle_time[i] = 0.0;
+			receiver_sc_w->idle_start_time[i] = (receiver_sc_w->exec_start_time[i] != 0.0) ? 0.0 : starpu_timing_now();
+			hypervisor.sched_ctx_w[receiver_sched_ctx].exec_start_time[i] = (receiver_sc_w->exec_start_time[i] != 0.0) ? starpu_timing_now() : 0.0;
 			hypervisor.sched_ctx_w[receiver_sched_ctx].exec_time[i] = 0.0;
-
 		}
 
 	}
 		hypervisor.policy.resize_ctxs(sched_ctxs, nsched_ctxs, workers, nworkers);
 		hypervisor.policy.resize_ctxs(sched_ctxs, nsched_ctxs, workers, nworkers);
 }
 
+{
+	hypervisor.sched_ctx_w[sched_ctx].compute_idle[worker] = allow;
+}
+
 void sc_hypervisor_update_resize_interval(unsigned *sched_ctxs, int nsched_ctxs)
 void sc_hypervisor_update_resize_interval(unsigned *sched_ctxs, int nsched_ctxs)
 {
 	unsigned sched_ctx;
 		if(workers->init_iterator)
 		if(workers->init_iterator)
 			workers->init_iterator(workers, &it);
 		
-		int nshared_workers = 0;
-		double cpu_used_in_shared = 0.0;
-		double exec_time = 0.0;
+		double elapsed_time_worker[STARPU_NMAXWORKERS];
+		double norm_idle_time = 0.0;
+		double end_time  = starpu_timing_now();
 		while(workers->has_next(workers, &it))
 		while(workers->has_next(workers, &it))
 		{
+			double idle_time = 0.0;
 			worker = workers->get_next(workers, &it);
+			if(hypervisor.sched_ctx_w[sched_ctx].compute_idle[worker])
+			{
+				if(hypervisor.sched_ctx_w[sched_ctx].start_time_w[worker] == 0.0)
+					elapsed_time_worker[worker] = 0.0;
+				else
+					elapsed_time_worker[worker] = (end_time - hypervisor.sched_ctx_w[sched_ctx].start_time_w[worker]) / 1000000.0;
+				
+				if(hypervisor.sched_ctx_w[sched_ctx].idle_start_time[worker] == 0.0)
+				{
+					idle_time = hypervisor.sched_ctx_w[sched_ctx].idle_time[worker]; /* in seconds */
+				}
+				else
+				{
+					double idle = (end_time - hypervisor.sched_ctx_w[sched_ctx].idle_start_time[worker]) / 1000000.0; /* in seconds */ 
+					idle_time = hypervisor.sched_ctx_w[sched_ctx].idle_time[worker] + idle;
+				}		
+				norm_idle_time += (elapsed_time_worker[worker] == 0.0 ? 0.0 : (idle_time / elapsed_time_worker[worker]));
+/* 				printf("%d/%d: start time %lf elapsed time %lf idle time %lf norm_idle_time %lf \n",  */
+/* 				       worker, sched_ctx, hypervisor.sched_ctx_w[sched_ctx].start_time_w[worker], elapsed_time_worker[worker], idle_time, norm_idle_time); */
+			}
+		}
+
+		double norm_exec_time = 0.0;
+		for(worker = 0; worker < STARPU_NMAXWORKERS; worker++)
+		{
+			double exec_time = 0.0;
+			if(hypervisor.sched_ctx_w[sched_ctx].start_time_w[worker] == 0.0)
+				elapsed_time_worker[worker] = 0.0;
+			else
+				elapsed_time_worker[worker] = (end_time - hypervisor.sched_ctx_w[sched_ctx].start_time_w[worker]) / 1000000.0;
+
+			if(hypervisor.sched_ctx_w[sched_ctx].exec_start_time[worker] == 0.0)
 			{
 			{
+				exec_time = hypervisor.sched_ctx_w[sched_ctx].exec_time[worker];
+//				printf("%d/%d: exec_time %lf\n", worker, sched_ctx, hypervisor.sched_ctx_w[sched_ctx].exec_time[worker]);
 			}
 			}
 			else
 			{
-				double idle = (end_time - hypervisor.sched_ctx_w[sched_ctx].idle_start_time[worker]) / 1000000.0; /* in seconds */ 
-				max_workers_idle_time[i] += hypervisor.sched_ctx_w[sched_ctx].idle_time[worker] + idle;
+				double current_exec_time = (end_time - hypervisor.sched_ctx_w[sched_ctx].exec_start_time[worker]) / 1000000.0; /* in seconds */ 
+				exec_time = hypervisor.sched_ctx_w[sched_ctx].exec_time[worker] + current_exec_time;
+//				printf("%d/%d: exec_time %lf current_exec_time %lf\n", worker, sched_ctx, hypervisor.sched_ctx_w[sched_ctx].exec_time[worker], current_exec_time);
 			}		
 			}		
+			norm_exec_time += elapsed_time_worker[worker] == 0.0 ? 0.0 : exec_time / elapsed_time_worker[worker];
 		}			
 		}			
 
-		
 		double curr_time = starpu_timing_now();
 		double elapsed_time = (curr_time - hypervisor.sched_ctx_w[sched_ctx].start_time) / 1000000.0; /* in seconds */
-		double norm_idle_time = max_workers_idle_time[i] / elapsed_time;
-		double norm_exec_time = exec_time / elapsed_time;
-
+//		double norm_idle_time = max_workers_idle_time[i] / elapsed_time;
+//		double norm_exec_time = exec_time / elapsed_time;
 		if(norm_idle_time >= 0.9)
 		{
//			config->max_nworkers = 	workers->nworkers - lrint(norm_idle_time);
@@ -860,8 +887,8 @@ void sc_hypervisor_update_resize_interval(unsigned *sched_ctxs, int nsched_ctxs)
 		{
 			if(norm_idle_time < 0.1)//(max_workers_idle_time[i] < 0.000001)
 				config->max_nworkers = lrint(norm_exec_time)  + hypervisor.sched_ctx_w[sched_ctx].nready_tasks - 1; //workers->nworkers + hypervisor.sched_ctx_w[sched_ctx].nready_tasks - 1;
-/* 			else */
-/* 				config->max_nworkers = workers->nworkers; */
+			else
+				config->max_nworkers = lrint(norm_exec_time);
 		}
 		
 		if(config->max_nworkers < 0)
@@ -897,26 +924,31 @@ void sc_hypervisor_update_resize_interval(unsigned *sched_ctxs, int nsched_ctxs)
 		printf("%d: redib max_nworkers incr %d \n",  max_nready_sched_ctx, config->max_nworkers);
 	}
 }
-/* notifies the hypervisor that the worker is no longer idle and a new task was pushed on its queue */
-static void notify_poped_task(unsigned sched_ctx, int worker)
+
+/* notifies the hypervisor that a new task was pushed on the queue of the worker */
+static void notify_pushed_task(unsigned sched_ctx, int worker)
 {
-	hypervisor.sched_ctx_w[sched_ctx].exec_start_time[worker] = starpu_timing_now();
+	hypervisor.sched_ctx_w[sched_ctx].pushed_tasks[worker]++;
+	if(hypervisor.sched_ctx_w[sched_ctx].total_flops != 0.0 && hypervisor.sched_ctx_w[sched_ctx].start_time == 0.0)
+		hypervisor.sched_ctx_w[sched_ctx].start_time = starpu_timing_now();
 
-	if(hypervisor.resize[sched_ctx])
-		hypervisor.sched_ctx_w[sched_ctx].current_idle_time[worker] = 0.0;
+	if(hypervisor.sched_ctx_w[sched_ctx].total_flops != 0.0 && hypervisor.sched_ctx_w[sched_ctx].start_time_w[worker] == 0.0)
+		hypervisor.sched_ctx_w[sched_ctx].start_time_w[worker] = starpu_timing_now();
 
-	struct sc_hypervisor_wrapper *sc_w = &hypervisor.sched_ctx_w[sched_ctx];
+	if(hypervisor.sched_ctx_w[sched_ctx].total_flops != 0.0 && hypervisor.sched_ctx_w[sched_ctx].real_start_time == 0.0)
+		hypervisor.sched_ctx_w[sched_ctx].real_start_time = starpu_timing_now();
+
+	int ntasks = get_ntasks(hypervisor.sched_ctx_w[sched_ctx].pushed_tasks);
 
-	if(sc_w->idle_start_time[worker] != 0.0)
+	if((hypervisor.min_tasks == 0 || (!(hypervisor.resize[sched_ctx] == 0 && imposed_resize) && ntasks == hypervisor.min_tasks)) && hypervisor.check_min_tasks[sched_ctx])
 	{
-		double end_time  = starpu_timing_now();
-		sc_w->idle_time[worker] += (end_time - sc_w->idle_start_time[worker]) / 1000000.0; /* in seconds */ 
-		sc_w->idle_start_time[worker] = 0.0;
+		hypervisor.resize[sched_ctx] = 1;
+		if(imposed_resize) imposed_resize = 0;
+		hypervisor.check_min_tasks[sched_ctx] = 0;
 	}
-			
-	if(hypervisor.policy.handle_idle_end)
-		hypervisor.policy.handle_idle_end(sched_ctx, worker);
 
+	if(hypervisor.policy.handle_pushed_task)
+		hypervisor.policy.handle_pushed_task(sched_ctx, worker);
 }
 
 /* notifies the hypervisor that the worker spent another cycle in idle time */
@@ -938,36 +970,46 @@ static void notify_idle_cycle(unsigned sched_ctx, int worker, double idle_time)
 	return;
 }
 
-/* notifies the hypervisor that a new task was pushed on the queue of the worker */
-static void notify_pushed_task(unsigned sched_ctx, int worker)
+/* notifies the hypervisor that the worker is no longer idle and a new task was pushed on its queue */
+static void notify_poped_task(unsigned sched_ctx, int worker)
 {
-	hypervisor.sched_ctx_w[sched_ctx].pushed_tasks[worker]++;
-	if(hypervisor.sched_ctx_w[sched_ctx].total_flops != 0.0 && hypervisor.sched_ctx_w[sched_ctx].start_time == 0.0)
-		hypervisor.sched_ctx_w[sched_ctx].start_time = starpu_timing_now();
+	if(hypervisor.sched_ctx_w[sched_ctx].start_time_w[worker] == 0.0)
+		hypervisor.sched_ctx_w[sched_ctx].start_time_w[worker] = starpu_timing_now();
 
-	if(hypervisor.sched_ctx_w[sched_ctx].total_flops != 0.0 && hypervisor.sched_ctx_w[sched_ctx].real_start_time == 0.0)
-		hypervisor.sched_ctx_w[sched_ctx].real_start_time = starpu_timing_now();
+	hypervisor.sched_ctx_w[sched_ctx].exec_start_time[worker] = starpu_timing_now();
 
-	int ntasks = get_ntasks(hypervisor.sched_ctx_w[sched_ctx].pushed_tasks);
+	if(hypervisor.resize[sched_ctx])
+		hypervisor.sched_ctx_w[sched_ctx].current_idle_time[worker] = 0.0;
 
-	if((hypervisor.min_tasks == 0 || (!(hypervisor.resize[sched_ctx] == 0 && imposed_resize) && ntasks == hypervisor.min_tasks)) && hypervisor.check_min_tasks[sched_ctx])
+	struct sc_hypervisor_wrapper *sc_w = &hypervisor.sched_ctx_w[sched_ctx];
+
+	if(sc_w->idle_start_time[worker] > 0.0)
 	{
-		hypervisor.resize[sched_ctx] = 1;
-		if(imposed_resize) imposed_resize = 0;
-		hypervisor.check_min_tasks[sched_ctx] = 0;
+		double end_time  = starpu_timing_now();
+		sc_w->idle_time[worker] += (end_time - sc_w->idle_start_time[worker]) / 1000000.0; /* in seconds */ 
+		sc_w->idle_start_time[worker] = 0.0;
 	}
+			
+	if(hypervisor.policy.handle_idle_end)
+		hypervisor.policy.handle_idle_end(sched_ctx, worker);
 
-	if(hypervisor.policy.handle_pushed_task)
-		hypervisor.policy.handle_pushed_task(sched_ctx, worker);
 }
 
-
+ 
 /* notifies the hypervisor that a tagged task has just been executed */
-static void notify_post_exec_task(struct starpu_task *task, size_t data_size, uint32_t footprint, int task_tag)
+static void notify_post_exec_task(struct starpu_task *task, size_t data_size, uint32_t footprint, int task_tag, int ready_tasks, double ready_flops)
 {
 	unsigned sched_ctx = task->sched_ctx;
 	int worker = starpu_worker_get_id();
 
+	if(hypervisor.sched_ctx_w[sched_ctx].exec_start_time[worker] != 0.0)
+	{
+		double current_time = starpu_timing_now();
+		hypervisor.sched_ctx_w[sched_ctx].exec_time[worker] += (current_time - 
+									hypervisor.sched_ctx_w[sched_ctx].exec_start_time[worker]) / 1000000.0; /* in seconds */ 
+		hypervisor.sched_ctx_w[sched_ctx].exec_start_time[worker] = 0.0;
+	}
+
 	hypervisor.sched_ctx_w[sched_ctx].poped_tasks[worker]++;
 	hypervisor.sched_ctx_w[sched_ctx].poped_tasks[worker]++;
 	hypervisor.sched_ctx_w[sched_ctx].elapsed_flops[worker] += task->flops;
 	hypervisor.sched_ctx_w[sched_ctx].elapsed_flops[worker] += task->flops;
 	hypervisor.sched_ctx_w[sched_ctx].elapsed_data[worker] += data_size ;
 	hypervisor.sched_ctx_w[sched_ctx].elapsed_data[worker] += data_size ;
@@ -976,57 +1018,27 @@ static void notify_post_exec_task(struct starpu_task *task, size_t data_size, ui
 
 
 	starpu_pthread_mutex_lock(&act_hypervisor_mutex);
 	starpu_pthread_mutex_lock(&act_hypervisor_mutex);
 	hypervisor.sched_ctx_w[sched_ctx].remaining_flops -= task->flops;
 	hypervisor.sched_ctx_w[sched_ctx].remaining_flops -= task->flops;
-	hypervisor.sched_ctx_w[sched_ctx].nready_tasks--;
-	hypervisor.sched_ctx_w[sched_ctx].ready_flops -= task->flops;
+	hypervisor.sched_ctx_w[sched_ctx].nready_tasks = ready_tasks;
+	hypervisor.sched_ctx_w[sched_ctx].ready_flops = ready_flops;
 	if(hypervisor.sched_ctx_w[sched_ctx].ready_flops < 0.0)
 	if(hypervisor.sched_ctx_w[sched_ctx].ready_flops < 0.0)
 		hypervisor.sched_ctx_w[sched_ctx].ready_flops = 0.0;
 		hypervisor.sched_ctx_w[sched_ctx].ready_flops = 0.0;
+	_ack_resize_completed(sched_ctx, worker);
 	starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
 	starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
 
 
-/* 	struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(sched_ctx); */
-	
-/* 	unsigned finished_sample = 0; */
-/* 	char *speed_sample_criteria = getenv("SC_HYPERVISOR_SAMPLE_CRITERIA"); */
-/* 	if(speed_sample_criteria && (strcmp(speed_sample_criteria, "time") == 0)) */
-/* 	{ */
-
-/* 		double curr_time = starpu_timing_now(); */
-/* 		double elapsed_time = (curr_time - hypervisor.sched_ctx_w[sched_ctx].start_time) / 1000000.0; /\* in seconds *\/ */
-
-/* 		finished_sample = elapsed_time > config->time_sample; */
-/* 	} */
-/* 	else */
-/* 	{ */
-/* 		double ctx_elapsed_flops = sc_hypervisor_get_elapsed_flops_per_sched_ctx(&hypervisor.sched_ctx_w[sched_ctx]); */
-/* 		double ctx_sample = config->ispeed_ctx_sample; */
-
-/* 		finished_sample = ctx_elapsed_flops > ctx_sample; */
-/* 	} */
-
-/* 	if(finished_sample) */
-/* 	{ */
-/* 		sc_hypervisor_update_resize_interval(sched_ctx); */
-/* 	} */
 	
 	
 	if(hypervisor.resize[sched_ctx])
 	if(hypervisor.resize[sched_ctx])
 	{	
 	{	
 		if(hypervisor.policy.handle_poped_task)
 		if(hypervisor.policy.handle_poped_task)
 			hypervisor.policy.handle_poped_task(sched_ctx, worker, task, footprint);
 			hypervisor.policy.handle_poped_task(sched_ctx, worker, task, footprint);
 	}
 	}
-	starpu_pthread_mutex_lock(&act_hypervisor_mutex);
-	_ack_resize_completed(sched_ctx, worker);
-	starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
+/* 	starpu_pthread_mutex_lock(&act_hypervisor_mutex); */
+/* 	_ack_resize_completed(sched_ctx, worker); */
+/* 	starpu_pthread_mutex_unlock(&act_hypervisor_mutex); */
 	if(hypervisor.sched_ctx_w[sched_ctx].poped_tasks[worker] % 200 == 0)
 	if(hypervisor.sched_ctx_w[sched_ctx].poped_tasks[worker] % 200 == 0)
 		_print_current_time();
 		_print_current_time();
 
 
 	if(task_tag <= 0)
 	if(task_tag <= 0)
-	{
-		int workerid = starpu_worker_get_id();
-		double current_time = starpu_timing_now();
-		hypervisor.sched_ctx_w[sched_ctx].exec_time[worker] += current_time - 
-			hypervisor.sched_ctx_w[sched_ctx].exec_start_time[worker] / 1000000.0; /* in seconds */ 
-		hypervisor.sched_ctx_w[sched_ctx].exec_start_time[worker] = 0.0;
 		return; 
 		return; 
-	}
 	
 	
 	unsigned conf_sched_ctx;
 	unsigned conf_sched_ctx;
 	unsigned i;
 	unsigned i;
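The two hunks above move the per-worker time bookkeeping: notify_poped_task() now stamps exec_start_time and folds any pending idle interval into idle_time, while notify_post_exec_task() closes the execution interval before updating the flop counters. A minimal sketch of the same start/stop pattern, with a hypothetical per-worker accumulator (starpu_timing_now() returns microseconds, hence the division by 1000000.0 as in the patch):

    /* Illustrative only: not part of the patch. */
    struct worker_times { double exec_start_time; double exec_time; };

    static void mark_exec_start(struct worker_times *wt)
    {
    	wt->exec_start_time = starpu_timing_now();	/* microseconds */
    }

    static void mark_exec_end(struct worker_times *wt)
    {
    	if (wt->exec_start_time != 0.0)
    	{
    		double now = starpu_timing_now();
    		wt->exec_time += (now - wt->exec_start_time) / 1000000.0;	/* seconds */
    		wt->exec_start_time = 0.0;
    	}
    }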

+ 2 - 0
sc_hypervisor/src/sc_hypervisor_intern.h

@@ -124,3 +124,5 @@ double _get_optimal_v(unsigned sched_ctx);
 void _set_optimal_v(unsigned sched_ctx, double optimal_v);
 
 int _sc_hypervisor_use_lazy_resize(void);
+
+void _sc_hypervisor_allow_compute_idle(unsigned sched_ctx, int worker, unsigned allow);

+ 0 - 2
src/Makefile.am

@@ -129,7 +129,6 @@ noinst_HEADERS = 						\
 	profiling/profiling.h					\
 	util/starpu_task_insert_utils.h				\
 	util/starpu_data_cpy.h					\
-	util/starpu_task_list_inline.h				\
 	starpu_parameters.h					\
 	top/starpu_top_message_queue.h				\
 	top/starpu_top_connection.h				\
@@ -229,7 +228,6 @@ libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES = 		\
 	util/starpu_data_cpy.c					\
 	util/starpu_task_insert.c				\
 	util/starpu_task_insert_utils.c				\
-	util/starpu_inlines.c					\
 	debug/traces/starpu_fxt.c				\
 	debug/traces/starpu_fxt_mpi.c				\
 	debug/traces/starpu_fxt_dag.c				\

+ 1 - 0
src/common/barrier.c

@@ -23,6 +23,7 @@ int _starpu_barrier_init(struct _starpu_barrier *barrier, int count)
 	barrier->count = count;
 	barrier->reached_start = 0;
 	barrier->reached_exit = 0;
+	barrier->reached_flops = 0.0;
 	STARPU_PTHREAD_MUTEX_INIT(&barrier->mutex, NULL);
 	STARPU_PTHREAD_MUTEX_INIT(&barrier->mutex_exit, NULL);
 	STARPU_PTHREAD_COND_INIT(&barrier->cond, NULL);

+ 1 - 0
src/common/barrier.h

@@ -29,6 +29,7 @@ struct _starpu_barrier
 	int count;
 	int reached_start;
 	int reached_exit;
+	double reached_flops;
 	starpu_pthread_mutex_t mutex;
 	starpu_pthread_mutex_t mutex_exit;
 	starpu_pthread_cond_t cond;

+ 17 - 3
src/common/barrier_counter.c

@@ -56,7 +56,7 @@ int _starpu_barrier_counter_wait_for_full_counter(struct _starpu_barrier_counter
 	return 0;
 }
 
-int _starpu_barrier_counter_decrement_until_empty_counter(struct _starpu_barrier_counter *barrier_c)
+int _starpu_barrier_counter_decrement_until_empty_counter(struct _starpu_barrier_counter *barrier_c, double flops)
 {
 	struct _starpu_barrier *barrier = &barrier_c->barrier;
 	int ret = 0;
@@ -64,6 +64,7 @@ int _starpu_barrier_counter_decrement_until_empty_counter(struct _starpu_barrier
 
 	if (--barrier->reached_start == 0)
 	{
+		barrier->reached_flops -= flops;
 		ret = 1;
 		STARPU_PTHREAD_COND_BROADCAST(&barrier->cond);
 	}
@@ -72,7 +73,7 @@ int _starpu_barrier_counter_decrement_until_empty_counter(struct _starpu_barrier
 	return ret;
 }
 
-int _starpu_barrier_counter_increment_until_full_counter(struct _starpu_barrier_counter *barrier_c)
+int _starpu_barrier_counter_increment_until_full_counter(struct _starpu_barrier_counter *barrier_c, double flops)
 {
 	struct _starpu_barrier *barrier = &barrier_c->barrier;
 	int ret = 0;
@@ -80,6 +81,7 @@ int _starpu_barrier_counter_increment_until_full_counter(struct _starpu_barrier_
 
 	if(++barrier->reached_start == barrier->count)
 	{
+		barrier->reached_flops += flops;
 		ret = 1;
 		STARPU_PTHREAD_COND_BROADCAST(&barrier_c->cond2);
 	}
@@ -88,14 +90,26 @@ int _starpu_barrier_counter_increment_until_full_counter(struct _starpu_barrier_
 	return ret;
 }
 
-int _starpu_barrier_counter_increment(struct _starpu_barrier_counter *barrier_c)
+int _starpu_barrier_counter_increment(struct _starpu_barrier_counter *barrier_c, double flops)
 {
 	struct _starpu_barrier *barrier = &barrier_c->barrier;
 	STARPU_PTHREAD_MUTEX_LOCK(&barrier->mutex);
 
 	barrier->reached_start++;
+	barrier->reached_flops += flops;
 
 	STARPU_PTHREAD_MUTEX_UNLOCK(&barrier->mutex);
 	return 0;
 }
 
+int _starpu_barrier_counter_check(struct _starpu_barrier_counter *barrier_c)
+{
+	struct _starpu_barrier *barrier = &barrier_c->barrier;
+	STARPU_PTHREAD_MUTEX_LOCK(&barrier->mutex);
+
+	if(barrier->reached_start == 0)
+		STARPU_PTHREAD_COND_BROADCAST(&barrier->cond);
+
+	STARPU_PTHREAD_MUTEX_UNLOCK(&barrier->mutex);
+	return 0;
+}
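With these changes the barrier counter carries a flop count next to the task count: callers pass the task's flops when incrementing or decrementing, and reached_flops tracks the amount still in flight. A hedged usage sketch (the counter variable is illustrative and task stands for a struct starpu_task *; the init call mirrors the one used in _starpu_create_sched_ctx()):

    struct _starpu_barrier_counter counter;
    _starpu_barrier_counter_init(&counter, 0);

    /* when a task becomes ready, account for its work */
    _starpu_barrier_counter_increment(&counter, task->flops);

    /* when the task terminates, remove the same amount */
    _starpu_barrier_counter_decrement_until_empty_counter(&counter, task->flops);

    /* wake any waiter if the counter is already empty */
    _starpu_barrier_counter_check(&counter);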

+ 5 - 3
src/common/barrier_counter.h

@@ -34,10 +34,12 @@ int _starpu_barrier_counter_wait_for_empty_counter(struct _starpu_barrier_counte
 
 int _starpu_barrier_counter_wait_for_full_counter(struct _starpu_barrier_counter *barrier_c);
 
-int _starpu_barrier_counter_decrement_until_empty_counter(struct _starpu_barrier_counter *barrier_c);
+int _starpu_barrier_counter_decrement_until_empty_counter(struct _starpu_barrier_counter *barrier_c, double flops);
 
-int _starpu_barrier_counter_increment_until_full_counter(struct _starpu_barrier_counter *barrier_c);
+int _starpu_barrier_counter_increment_until_full_counter(struct _starpu_barrier_counter *barrier_c, double flops);
 
-int _starpu_barrier_counter_increment(struct _starpu_barrier_counter *barrier_c);
+int _starpu_barrier_counter_increment(struct _starpu_barrier_counter *barrier_c, double flops);
+
+int _starpu_barrier_counter_check(struct _starpu_barrier_counter *barrier_c);
 
 #endif

+ 11 - 0
src/common/fxt.h

@@ -137,6 +137,9 @@
 
 #define _STARPU_FUT_DATA_LOAD 0x5153
 
+#define _STARPU_FUT_START_UNPARTITION 0x5154
+#define _STARPU_FUT_END_UNPARTITION 0x5155
+
 #ifdef STARPU_USE_FXT
 #include <fxt/fxt.h>
 #include <fxt/fut.h>
@@ -564,6 +567,12 @@ do {										\
 #define _STARPU_TRACE_MEMORY_FULL(size)	\
 	FUT_DO_PROBE2(_STARPU_FUT_MEMORY_FULL,size,_starpu_gettid());
 
+#define _STARPU_TRACE_START_UNPARTITION(handle, memnode)		\
+	FUT_DO_PROBE3(_STARPU_FUT_START_UNPARTITION, memnode, _starpu_gettid(), handle);
+
+#define _STARPU_TRACE_END_UNPARTITION(handle, memnode)		\
+	FUT_DO_PROBE3(_STARPU_FUT_END_UNPARTITION, memnode, _starpu_gettid(), handle);
+
 #else // !STARPU_USE_FXT
 
 /* Dummy macros in case FxT is disabled */
@@ -629,6 +638,8 @@ do {										\
 #define _STARPU_TRACE_COND_WAIT_BEGIN()		do {} while(0)
 #define _STARPU_TRACE_COND_WAIT_END()			do {} while(0)
 #define _STARPU_TRACE_MEMORY_FULL(size)				do {} while(0)
+#define _STARPU_TRACE_START_UNPARTITION(handle, memnode)	do {} while(0)
+#define _STARPU_TRACE_END_UNPARTITION(handle, memnode)		do {} while(0)
 
 #endif // STARPU_USE_FXT
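The new probes pair a start and an end event around a data unpartition, and compile away to no-ops when FxT support is disabled. A sketch of how such a pair might be placed inside the runtime; the surrounding function is hypothetical, only the macros and their (handle, memory node) arguments come from this patch:

    static void unpartition_on_node(starpu_data_handle_t handle, unsigned node)
    {
    	_STARPU_TRACE_START_UNPARTITION(handle, node);
    	/* ... gather the children back into the root handle on this node ... */
    	_STARPU_TRACE_END_UNPARTITION(handle, node);
    }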
 
 

+ 45 - 0
src/common/thread.c

@@ -217,6 +217,16 @@ int starpu_pthread_rwlock_rdlock(starpu_pthread_rwlock_t *rwlock)
 	return p_ret;
 }
 
+int starpu_pthread_rwlock_tryrdlock(starpu_pthread_rwlock_t *rwlock)
+{
+	int p_ret = starpu_pthread_mutex_trylock(rwlock);
+
+	if (!p_ret)
+		_STARPU_TRACE_RWLOCK_RDLOCKED();
+
+	return p_ret;
+}
+
 int starpu_pthread_rwlock_wrlock(starpu_pthread_rwlock_t *rwlock)
 {
 	_STARPU_TRACE_WRLOCKING_RWLOCK();
@@ -228,6 +238,17 @@ int starpu_pthread_rwlock_wrlock(starpu_pthread_rwlock_t *rwlock)
 	return p_ret;
 }
 
+int starpu_pthread_rwlock_trywrlock(starpu_pthread_rwlock_t *rwlock)
+{
+	int p_ret =  starpu_pthread_mutex_trylock(rwlock);
+
+	if (!p_ret)
+		_STARPU_TRACE_RWLOCK_RDLOCKED();
+
+	return p_ret;
+}
+
+
 int starpu_pthread_rwlock_unlock(starpu_pthread_rwlock_t *rwlock)
 {
 	_STARPU_TRACE_UNLOCKING_RWLOCK();
@@ -298,6 +319,18 @@ int starpu_pthread_rwlock_rdlock(starpu_pthread_rwlock_t *rwlock)
 	return p_ret;
 }
 
+int starpu_pthread_rwlock_tryrdlock(starpu_pthread_rwlock_t *rwlock)
+{
+	_STARPU_TRACE_RDLOCKING_RWLOCK();
+
+ 	int p_ret = pthread_rwlock_tryrdlock(rwlock);
+
+	if (!p_ret)
+		_STARPU_TRACE_RWLOCK_RDLOCKED();
+
+	return p_ret;
+}
+
 int starpu_pthread_rwlock_wrlock(starpu_pthread_rwlock_t *rwlock)
 {
 	_STARPU_TRACE_WRLOCKING_RWLOCK();
@@ -309,6 +342,18 @@ int starpu_pthread_rwlock_wrlock(starpu_pthread_rwlock_t *rwlock)
 	return p_ret;
 }
 
+int starpu_pthread_rwlock_trywrlock(starpu_pthread_rwlock_t *rwlock)
+{
+	_STARPU_TRACE_WRLOCKING_RWLOCK();
+
+ 	int p_ret = pthread_rwlock_trywrlock(rwlock);
+
+	if (!p_ret)
+		_STARPU_TRACE_RWLOCK_WRLOCKED();
+
+	return p_ret;
+}
+
 int starpu_pthread_rwlock_unlock(starpu_pthread_rwlock_t *rwlock)
 {
 	_STARPU_TRACE_UNLOCKING_RWLOCK();
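The new try-lock wrappers follow the usual pthread convention: they return 0 when the lock was acquired and a non-zero error code otherwise, and only emit the trace event on success. A hedged usage sketch (the lock variable is illustrative):

    starpu_pthread_rwlock_t lock;
    starpu_pthread_rwlock_init(&lock, NULL);

    if (starpu_pthread_rwlock_tryrdlock(&lock) == 0)
    {
    	/* got the read lock: inspect the protected state */
    	starpu_pthread_rwlock_unlock(&lock);
    }
    else
    {
    	/* lock is busy: do other work instead of blocking */
    }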

+ 14 - 0
src/common/utils.c

@@ -159,3 +159,17 @@ void _starpu_gethostname(char *hostname, size_t size)
 			*c = 0;
 	}
 }
+
+void _starpu_sleep(struct timespec ts)
+{
+#ifdef STARPU_HAVE_WINDOWS
+	Sleep((ts.tv_sec * 1000) + (ts.tv_nsec / 1000000));
+#else
+	struct timespec req, rem;
+
+	req = ts;
+	while (nanosleep(&req, &rem))
+		req = rem;
+#endif
+}
+
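_starpu_sleep() takes the requested interval as a struct timespec, maps it to Sleep() on Windows and otherwise retries nanosleep() until the whole interval has elapsed. A small usage sketch (the 500 ms value is arbitrary):

    struct timespec ts;
    ts.tv_sec = 0;
    ts.tv_nsec = 500 * 1000000;	/* 500 ms */
    _starpu_sleep(ts);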

+ 2 - 0
src/common/utils.h

@@ -118,4 +118,6 @@ const char *_starpu_codelet_get_model_name(struct starpu_codelet *cl);
 
 int _starpu_check_mutex_deadlock(starpu_pthread_mutex_t *mutex);
 
+void _starpu_sleep(struct timespec ts);
+
 #endif // __COMMON_UTILS_H__

+ 4 - 1
src/core/debug.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2009-2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -44,4 +44,7 @@ extern int _starpu_use_fxt;
 /* Get an Ayudame id for CL */
 int64_t _starpu_ayudame_get_func_id(struct starpu_codelet *cl);
 
+void _starpu_watchdog_init(void);
+void _starpu_watchdog_shutdown(void);
+
 #endif // __DEBUG_H__

+ 3 - 2
src/core/dependencies/tags.c

@@ -288,7 +288,7 @@ void starpu_tag_restart(starpu_tag_t id)
 	struct _starpu_tag *tag = gettag_struct(id);
 
 	_starpu_spin_lock(&tag->lock);
-	STARPU_ASSERT_MSG(tag->state == STARPU_DONE, "Only completed tags can be restarted (%llu was %d)", (unsigned long long) id, tag->state);
+	STARPU_ASSERT_MSG(tag->state == STARPU_DONE || tag->state == STARPU_INVALID_STATE || tag->state == STARPU_ASSOCIATED || tag->state == STARPU_BLOCKED, "Only completed tags can be restarted (%llu was %d)", (unsigned long long) id, tag->state);
 	tag->state = STARPU_BLOCKED;
 	_starpu_spin_unlock(&tag->lock);
 }
@@ -313,8 +313,9 @@ void _starpu_tag_declare(starpu_tag_t id, struct _starpu_job *job)
 	 * detect when either of them are finished. We however don't allow
 	 * several tasks to share a tag when it is used to wake them by
 	 * dependency */
+	if (tag->job != job)
+		tag->is_assigned++;
 	tag->job = job;
-	tag->is_assigned++;
 
 	job->tag = tag;
 	/* the tag is now associated to a job */
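starpu_tag_restart() puts a tag back into the blocked state so it can be reused; the relaxed assertion above additionally tolerates tags that never reached the done state. A hedged reuse sketch (the tag value is arbitrary):

    starpu_tag_t tag = 42;

    starpu_tag_notify_from_apps(tag);	/* first use: release whoever waits on it */
    starpu_tag_wait(tag);

    starpu_tag_restart(tag);		/* back to the blocked state */
    starpu_tag_notify_from_apps(tag);	/* the same tag can be signalled again */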

+ 10 - 7
src/core/detect_combined_workers.c

@@ -39,13 +39,17 @@ static void find_workers(hwloc_obj_t obj, int cpu_workers[STARPU_NMAXWORKERS], u
 	}
 
 	/* Got to a PU leaf */
-	struct _starpu_worker *worker = obj->userdata;
-	/* is it a CPU worker? */
-	if (worker->perf_arch.type == STARPU_CPU_WORKER && worker->perf_arch.ncore == 0)
+	struct _starpu_worker_list *workers = obj->userdata;
+	struct _starpu_worker *worker;
+	for(worker = _starpu_worker_list_begin(workers); worker != _starpu_worker_list_end(workers); worker = _starpu_worker_list_next(worker))
 	{
-		_STARPU_DEBUG("worker %d is part of it\n", worker->workerid);
-		/* Add it to the combined worker */
-		cpu_workers[(*n)++] = worker->workerid;
+		/* is it a CPU worker? */
+		if (worker->perf_arch.type == STARPU_CPU_WORKER && worker->perf_arch.ncore == 0)
+		{
+			_STARPU_DEBUG("worker %d is part of it\n", worker->workerid);
+			/* Add it to the combined worker */
+			cpu_workers[(*n)++] = worker->workerid;
+		}
 	}
 }
 
@@ -177,7 +181,6 @@ static void find_and_assign_combinations_with_hwloc(int *workerids, int nworkers
 		if (worker->perf_arch.type == STARPU_CPU_WORKER && worker->perf_arch.ncore == 0)
 		{
 			hwloc_obj_t obj = hwloc_get_obj_by_depth(topology->hwtopology, config->cpu_depth, worker->bindid);
-			STARPU_ASSERT(obj->userdata == worker);
 			obj = obj->parent;
 			while (obj)
 			{

+ 2 - 2
src/core/jobs.c

@@ -143,6 +143,7 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 {
 	struct starpu_task *task = j->task;
 	unsigned sched_ctx = task->sched_ctx;
+	double flops = task->flops;
 	STARPU_PTHREAD_MUTEX_LOCK(&j->sync_mutex);
 
 	task->status = STARPU_TASK_FINISHED;
@@ -294,10 +295,9 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 		int ret = _starpu_submit_job(j);
 		STARPU_ASSERT(!ret);
 	}
-	_starpu_decrement_nsubmitted_tasks();
-	_starpu_decrement_nready_tasks();
 
 	_starpu_decrement_nsubmitted_tasks_of_sched_ctx(sched_ctx);
+	_starpu_decrement_nready_tasks_of_sched_ctx(sched_ctx, flops);
 
 	struct _starpu_worker *worker;
 	worker = _starpu_get_local_worker_key();

+ 2 - 0
src/core/perfmodel/perfmodel.h

@@ -63,6 +63,7 @@ void _starpu_load_history_based_model(struct starpu_perfmodel *model, unsigned s
 void _starpu_load_perfmodel(struct starpu_perfmodel *model);
 void _starpu_initialize_registered_performance_models(void);
 void _starpu_deinitialize_registered_performance_models(void);
+void _starpu_deinitialize_performance_model(struct starpu_perfmodel *model);
 
 double _starpu_regression_based_job_expected_perf(struct starpu_perfmodel *model,
 					struct starpu_perfmodel_arch* arch, struct _starpu_job *j, unsigned nimpl);
@@ -90,6 +91,7 @@ void _starpu_save_bandwidth_and_latency_disk(double bandwidth_write, double band
 					    double latency_write, double latency_read, unsigned node);
 
 int _starpu_read_double(FILE *f, char *format, double *val);
+void _starpu_simgrid_get_platform_path(char *path, size_t maxlen);
 
 #ifdef __cplusplus
 }

+ 2 - 2
src/core/perfmodel/perfmodel_bus.c

@@ -238,7 +238,7 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 	 * since we cleanly shutdown CUDA before returning. */
 	cudaSetDevice(src);
 
-	if (starpu_get_env_number("STARPU_ENABLE_CUDA_GPU_GPU_DIRECT") > 0)
+	if (starpu_get_env_number("STARPU_ENABLE_CUDA_GPU_GPU_DIRECT") != 0)
 	{
 		cures = cudaDeviceCanAccessPeer(&can, src, dst);
 		if (!cures && can)
@@ -260,7 +260,7 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 	 * since we cleanly shutdown CUDA before returning. */
 	cudaSetDevice(dst);
 
-	if (starpu_get_env_number("STARPU_ENABLE_CUDA_GPU_GPU_DIRECT") > 0)
+	if (starpu_get_env_number("STARPU_ENABLE_CUDA_GPU_GPU_DIRECT") != 0)
 	{
 		cures = cudaDeviceCanAccessPeer(&can, dst, src);
 		if (!cures && can)

+ 0 - 6
src/core/perfmodel/perfmodel_history.c

@@ -1014,12 +1014,6 @@ void _starpu_load_history_based_model(struct starpu_perfmodel *model, unsigned s
 		else
 		{
 			_STARPU_DEBUG("File does not exists\n");
-			if (!calibrate_flag)
-			{
-				_STARPU_DISP("Warning: model %s is not calibrated, forcing calibration for this run. Use the STARPU_CALIBRATE environment variable to control this.\n", model->symbol);
-				_starpu_set_calibrate_flag(1);
-				model->benchmarking = 1;
-			}
 		}
 
 		_STARPU_DEBUG("Performance model file %s for model %s is loaded\n", path, model->symbol);

+ 1 - 0
src/core/perfmodel/perfmodel_nan.c

@@ -22,6 +22,7 @@
 #include <math.h>
 #include <string.h>
 #include <config.h>
+#include <core/perfmodel/perfmodel.h>
 
 int _starpu_read_double(FILE *f, char *format, double *val)
 {

+ 3 - 7
src/core/progress_hook.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2013  Université de Bordeaux 1
  * Copyright (C) 2010-2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -41,6 +41,7 @@ static int active_hook_cnt = 0;
 void _starpu_init_progression_hooks(void)
 {
 	STARPU_PTHREAD_RWLOCK_INIT(&progression_hook_rwlock, NULL);
+	STARPU_HG_DISABLE_CHECKING(active_hook_cnt);
 }
 
 int starpu_progression_hook_register(unsigned (*func)(void *arg), void *arg)
@@ -85,12 +86,7 @@ void starpu_progression_hook_deregister(int hook_id)
 
 unsigned _starpu_execute_registered_progression_hooks(void)
 {
-	/* If there is no hook registered, we short-cut loop. */
-	STARPU_PTHREAD_RWLOCK_RDLOCK(&progression_hook_rwlock);
-	int no_hook = (active_hook_cnt == 0);
-	STARPU_PTHREAD_RWLOCK_UNLOCK(&progression_hook_rwlock);
-
-	if (no_hook)
+	if (active_hook_cnt == 0)
 		return 1;
 
 	/* By default, it is possible to block, but if some progression hooks

+ 100 - 7
src/core/sched_ctx.c

@@ -292,7 +292,9 @@ struct _starpu_sched_ctx* _starpu_create_sched_ctx(struct starpu_sched_policy *p
 	sem_init(&sched_ctx->parallel_code_sem, 0, 0);
 	sem_init(&sched_ctx->parallel_code_sem, 0, 0);
 
 
 	_starpu_barrier_counter_init(&sched_ctx->tasks_barrier, 0);
 	_starpu_barrier_counter_init(&sched_ctx->tasks_barrier, 0);
+	_starpu_barrier_counter_init(&sched_ctx->ready_tasks_barrier, 0);
 
 
+	sched_ctx->ready_flops = 0.0;
 	/*init the strategy structs and the worker_collection of the ressources of the context */
 	/*init the strategy structs and the worker_collection of the ressources of the context */
 	_starpu_init_sched_policy(config, sched_ctx, policy);
 	_starpu_init_sched_policy(config, sched_ctx, policy);
 
 
@@ -630,6 +632,7 @@ void _starpu_delete_all_sched_ctxs()
 		{
 		{
 			_starpu_sched_ctx_free_scheduling_data(sched_ctx);
 			_starpu_sched_ctx_free_scheduling_data(sched_ctx);
 			_starpu_barrier_counter_destroy(&sched_ctx->tasks_barrier);
 			_starpu_barrier_counter_destroy(&sched_ctx->tasks_barrier);
+			_starpu_barrier_counter_destroy(&sched_ctx->ready_tasks_barrier);
 			_starpu_delete_sched_ctx(sched_ctx);
 			_starpu_delete_sched_ctx(sched_ctx);
 		}
 		}
 		STARPU_PTHREAD_RWLOCK_UNLOCK(&changing_ctx_mutex[i]);
 		STARPU_PTHREAD_RWLOCK_UNLOCK(&changing_ctx_mutex[i]);
@@ -748,6 +751,28 @@ void starpu_sched_ctx_remove_workers(int *workers_to_remove, int nworkers_to_rem
 	return;
 	return;
 }
 }
 
 
+int _starpu_nworkers_able_to_execute_task(struct starpu_task *task, struct _starpu_sched_ctx *sched_ctx)
+{
+	unsigned worker = 0, nworkers = 0;
+	STARPU_PTHREAD_RWLOCK_WRLOCK(&changing_ctx_mutex[sched_ctx->id]);
+	struct starpu_worker_collection *workers = sched_ctx->workers;
+
+	struct starpu_sched_ctx_iterator it;
+	if(workers->init_iterator)
+		workers->init_iterator(workers, &it);
+
+	while(workers->has_next(workers, &it))
+	{
+		worker = workers->get_next(workers, &it);
+		STARPU_ASSERT_MSG(worker < STARPU_NMAXWORKERS, "worker id %d", worker);
+		if (starpu_worker_can_execute_task(worker, task, 0))
+			nworkers++;
+	}
+	STARPU_PTHREAD_RWLOCK_UNLOCK(&changing_ctx_mutex[sched_ctx->id]);
+
+	return nworkers;
+}
+
 /* unused sched_ctx have the id STARPU_NMAX_SCHED_CTXS */
 /* unused sched_ctx have the id STARPU_NMAX_SCHED_CTXS */
 void _starpu_init_all_sched_ctxs(struct _starpu_machine_config *config)
 void _starpu_init_all_sched_ctxs(struct _starpu_machine_config *config)
 {
 {
@@ -786,8 +811,12 @@ int _starpu_wait_for_all_tasks_of_sched_ctx(unsigned sched_ctx_id)
 
 
 void _starpu_decrement_nsubmitted_tasks_of_sched_ctx(unsigned sched_ctx_id)
 void _starpu_decrement_nsubmitted_tasks_of_sched_ctx(unsigned sched_ctx_id)
 {
 {
+	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
+	if (!config->watchdog_ok)
+		config->watchdog_ok = 1;
+
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
-	int finished = _starpu_barrier_counter_decrement_until_empty_counter(&sched_ctx->tasks_barrier);
+	int finished = _starpu_barrier_counter_decrement_until_empty_counter(&sched_ctx->tasks_barrier, 0.0);
         /* when finished decrementing the tasks if the user signaled he will not submit tasks anymore
         /* when finished decrementing the tasks if the user signaled he will not submit tasks anymore
            we can move all its workers to the inheritor context */
            we can move all its workers to the inheritor context */
 	if(finished && sched_ctx->inheritor != STARPU_NMAX_SCHED_CTXS)
 	if(finished && sched_ctx->inheritor != STARPU_NMAX_SCHED_CTXS)
@@ -818,14 +847,67 @@ void _starpu_decrement_nsubmitted_tasks_of_sched_ctx(unsigned sched_ctx_id)
 			return;
 			return;
 		}
 		}
 		STARPU_PTHREAD_MUTEX_UNLOCK(&finished_submit_mutex);
 		STARPU_PTHREAD_MUTEX_UNLOCK(&finished_submit_mutex);
+		/* FIXME: */
+		/* We also need to check for config->submitting = 0 (i.e. the
+		 * user calle starpu_drivers_request_termination()), in which
+		 * case we need to set config->running to 0 and wake workers,
+		 * so they can terminate, just like
+		 * starpu_drivers_request_termination() does.
+		 *
+		 * Set FIXME to 1 in tests/main/driver_api/run_driver.c to
+		 * check it is actually fixed.
+		 */
 	}
 	}
+
 	return;
 	return;
 }
 }
 
 
 void _starpu_increment_nsubmitted_tasks_of_sched_ctx(unsigned sched_ctx_id)
 void _starpu_increment_nsubmitted_tasks_of_sched_ctx(unsigned sched_ctx_id)
 {
 {
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
-	_starpu_barrier_counter_increment(&sched_ctx->tasks_barrier);
+	_starpu_barrier_counter_increment(&sched_ctx->tasks_barrier, 0.0);
+}
+
+int _starpu_get_nsubmitted_tasks_of_sched_ctx(unsigned sched_ctx_id)
+{
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	return sched_ctx->tasks_barrier.barrier.reached_start;
+}
+
+int _starpu_check_nsubmitted_tasks_of_sched_ctx(unsigned sched_ctx_id)
+{
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	return _starpu_barrier_counter_check(&sched_ctx->tasks_barrier);
+}
+
+void _starpu_increment_nready_tasks_of_sched_ctx(unsigned sched_ctx_id, double ready_flops)
+{
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	_starpu_barrier_counter_increment(&sched_ctx->ready_tasks_barrier, ready_flops);
+}
+
+void _starpu_decrement_nready_tasks_of_sched_ctx(unsigned sched_ctx_id, double ready_flops)
+{
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	_starpu_barrier_counter_decrement_until_empty_counter(&sched_ctx->ready_tasks_barrier, ready_flops);
+}
+
+int _starpu_get_nready_tasks_of_sched_ctx(unsigned sched_ctx_id)
+{
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	return sched_ctx->ready_tasks_barrier.barrier.reached_start;
+}
+
+double _starpu_get_nready_flops_of_sched_ctx(unsigned sched_ctx_id)
+{
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	return sched_ctx->ready_tasks_barrier.barrier.reached_flops;
+}
+
+int _starpu_wait_for_no_ready_of_sched_ctx(unsigned sched_ctx_id)
+{
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	return _starpu_barrier_counter_wait_for_empty_counter(&sched_ctx->ready_tasks_barrier);
 }
 }
 
 
 void starpu_sched_ctx_set_context(unsigned *sched_ctx)
 void starpu_sched_ctx_set_context(unsigned *sched_ctx)
@@ -842,6 +924,15 @@ unsigned starpu_sched_ctx_get_context()
 	return *sched_ctx;
 	return *sched_ctx;
 }
 }
 
 
+unsigned _starpu_sched_ctx_get_current_context()
+{
+	unsigned sched_ctx = starpu_sched_ctx_get_context();
+	if (sched_ctx == STARPU_NMAX_SCHED_CTXS)
+		return _starpu_get_initial_sched_ctx()->id;
+	else
+		return sched_ctx;
+}
+
 void starpu_sched_ctx_notify_hypervisor_exists()
 void starpu_sched_ctx_notify_hypervisor_exists()
 {
 {
 	with_hypervisor = 1;
 	with_hypervisor = 1;
@@ -1090,7 +1181,9 @@ void _starpu_sched_ctx_post_exec_task_cb(int workerid, struct starpu_task *task,
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(task->sched_ctx);
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(task->sched_ctx);
 	if(sched_ctx != NULL && task->sched_ctx != _starpu_get_initial_sched_ctx()->id && 
 	if(sched_ctx != NULL && task->sched_ctx != _starpu_get_initial_sched_ctx()->id && 
 	   task->sched_ctx != STARPU_NMAX_SCHED_CTXS  && sched_ctx->perf_counters != NULL)
 	   task->sched_ctx != STARPU_NMAX_SCHED_CTXS  && sched_ctx->perf_counters != NULL)
-		sched_ctx->perf_counters->notify_post_exec_task(task, data_size, footprint, task->hypervisor_tag);
+		sched_ctx->perf_counters->notify_post_exec_task(task, data_size, footprint, task->hypervisor_tag, 
+								_starpu_get_nready_tasks_of_sched_ctx(sched_ctx->id), 
+								_starpu_get_nready_flops_of_sched_ctx(sched_ctx->id));
 }
 }
 
 
 void starpu_sched_ctx_call_pushed_task_cb(int workerid, unsigned sched_ctx_id)
 void starpu_sched_ctx_call_pushed_task_cb(int workerid, unsigned sched_ctx_id)
@@ -1105,22 +1198,22 @@ void starpu_sched_ctx_call_pushed_task_cb(int workerid, unsigned sched_ctx_id)
 
 
 int starpu_sched_get_min_priority(void)
 int starpu_sched_get_min_priority(void)
 {
 {
-	return starpu_sched_ctx_get_min_priority(_starpu_get_initial_sched_ctx()->id);
+	return starpu_sched_ctx_get_min_priority(_starpu_sched_ctx_get_current_context());
 }
 }
 
 
 int starpu_sched_get_max_priority(void)
 int starpu_sched_get_max_priority(void)
 {
 {
-	return starpu_sched_ctx_get_max_priority(_starpu_get_initial_sched_ctx()->id);
+	return starpu_sched_ctx_get_max_priority(_starpu_sched_ctx_get_current_context());
 }
 }
 
 
 int starpu_sched_set_min_priority(int min_prio)
 int starpu_sched_set_min_priority(int min_prio)
 {
 {
-	return starpu_sched_ctx_set_min_priority(_starpu_get_initial_sched_ctx()->id, min_prio);
+	return starpu_sched_ctx_set_min_priority(_starpu_sched_ctx_get_current_context(), min_prio);
 }
 }
 
 
 int starpu_sched_set_max_priority(int max_prio)
 int starpu_sched_set_max_priority(int max_prio)
 {
 {
-	return starpu_sched_ctx_set_max_priority(_starpu_get_initial_sched_ctx()->id, max_prio);
+	return starpu_sched_ctx_set_max_priority(_starpu_sched_ctx_get_current_context(), max_prio);
 }
 }
 
 
 int starpu_sched_ctx_get_min_priority(unsigned sched_ctx_id)
 int starpu_sched_ctx_get_min_priority(unsigned sched_ctx_id)
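The priority wrappers above now resolve the scheduling context at call time: if the application selected a context with starpu_sched_ctx_set_context(), the global getters and setters act on it, otherwise they fall back to the initial context. A hedged sketch; my_ctx_id is a hypothetical variable holding an id obtained earlier from context creation:

    unsigned ctx = my_ctx_id;
    starpu_sched_ctx_set_context(&ctx);

    /* both calls now apply to ctx rather than to the initial context */
    starpu_sched_set_min_priority(-5);
    int max_prio = starpu_sched_get_max_priority();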

+ 23 - 0
src/core/sched_ctx.h

@@ -59,6 +59,12 @@ struct _starpu_sched_ctx
 	/* wait for the tasks submitted to the context to be executed */
 	struct _starpu_barrier_counter tasks_barrier;
 
+	/* wait for the tasks ready of the context to be executed */
+	struct _starpu_barrier_counter ready_tasks_barrier;
+
+	/* amount of ready flops in a context */
+	double ready_flops;
+
 	/* cond to block push when there are no workers in the ctx */
 	starpu_pthread_cond_t no_workers_cond;
 
@@ -141,6 +147,14 @@ int _starpu_wait_for_all_tasks_of_sched_ctx(unsigned sched_ctx_id);
  * task currently submitted to the context */
 void _starpu_decrement_nsubmitted_tasks_of_sched_ctx(unsigned sched_ctx_id);
 void _starpu_increment_nsubmitted_tasks_of_sched_ctx(unsigned sched_ctx_id);
+int _starpu_get_nsubmitted_tasks_of_sched_ctx(unsigned sched_ctx_id);
+int _starpu_check_nsubmitted_tasks_of_sched_ctx(unsigned sched_ctx_id);
+
+void _starpu_decrement_nready_tasks_of_sched_ctx(unsigned sched_ctx_id, double ready_flops);
+void _starpu_increment_nready_tasks_of_sched_ctx(unsigned sched_ctx_id, double ready_flops);
+int _starpu_get_nready_tasks_of_sched_ctx(unsigned sched_ctx_id);
+double _starpu_get_nready_flops_of_sched_ctx(unsigned sched_ctx_id);
+int _starpu_wait_for_no_ready_of_sched_ctx(unsigned sched_ctx_id);
 
 
 /* Return the corresponding index of the workerid in the ctx table */
 int _starpu_get_index_in_ctx_of_workerid(unsigned sched_ctx, unsigned workerid);
@@ -175,6 +189,15 @@ void _starpu_sched_ctx_rebind_thread_to_its_cpu(unsigned cpuid);
 /* let the appl know that the worker blocked to execute parallel code */
 void _starpu_sched_ctx_signal_worker_blocked(int workerid);
 
+/* If starpu_sched_ctx_set_context() has been called, returns the context
+ * id set by its last call, or the id of the initial context */
+unsigned _starpu_sched_ctx_get_current_context();
+
+/* verify how many workers can execute a certain task */
+int _starpu_nworkers_able_to_execute_task(struct starpu_task *task, struct _starpu_sched_ctx *sched_ctx);
+
+void _starpu_fetch_tasks_from_empty_ctx_list(struct _starpu_sched_ctx *sched_ctx);
+
 #ifdef STARPU_USE_SC_HYPERVISOR
 /* Notifies the hypervisor that a tasks was poped from the workers' list */
 void _starpu_sched_ctx_post_exec_task_cb(int workerid, struct starpu_task *task, size_t data_size, uint32_t footprint);
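Together these declarations replace the former global nready counter with a per-context barrier counter: _starpu_push_task() increments it with the task's flops when the task becomes ready, and _starpu_handle_job_termination() decrements it again, so waiting for "no ready task" can be done per context. A schematic pairing (task stands for a struct starpu_task * and is illustrative):

    /* when a task enters the ready state (see _starpu_push_task) */
    _starpu_increment_nready_tasks_of_sched_ctx(task->sched_ctx, task->flops);

    /* when the same task terminates (see _starpu_handle_job_termination) */
    _starpu_decrement_nready_tasks_of_sched_ctx(task->sched_ctx, task->flops);

    /* block until the context has no ready task left */
    _starpu_wait_for_no_ready_of_sched_ctx(task->sched_ctx);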

+ 4 - 27
src/core/sched_policy.c

@@ -299,25 +299,6 @@ static int _starpu_push_task_on_specific_worker(struct starpu_task *task, int wo
 	}
 	}
 }
 }
 
 
-static int _starpu_nworkers_able_to_execute_task(struct starpu_task *task, struct _starpu_sched_ctx *sched_ctx)
-{
-	unsigned worker = 0, nworkers = 0;
-	struct starpu_worker_collection *workers = sched_ctx->workers;
-
-	struct starpu_sched_ctx_iterator it;
-	if(workers->init_iterator)
-		workers->init_iterator(workers, &it);
-
-	while(workers->has_next(workers, &it))
-	{
-		worker = workers->get_next(workers, &it);
-		if (starpu_worker_can_execute_task(worker, task, 0))
-			nworkers++;
-	}
-
-	return nworkers;
-}
-
 /* the generic interface that call the proper underlying implementation */
 /* the generic interface that call the proper underlying implementation */
 
 
 int _starpu_push_task(struct _starpu_job *j)
 int _starpu_push_task(struct _starpu_job *j)
@@ -334,13 +315,8 @@ int _starpu_push_task(struct _starpu_job *j)
 	_STARPU_LOG_IN();
 	_STARPU_LOG_IN();
 
 
 	_STARPU_TRACE_JOB_PUSH(task, task->priority > 0);
 	_STARPU_TRACE_JOB_PUSH(task, task->priority > 0);
-	_starpu_increment_nready_tasks();
+	_starpu_increment_nready_tasks_of_sched_ctx(task->sched_ctx, task->flops);
 	task->status = STARPU_TASK_READY;
 	task->status = STARPU_TASK_READY;
-#ifdef STARPU_USE_SC_HYPERVISOR
-	if(sched_ctx != NULL && sched_ctx->id != 0 && sched_ctx->perf_counters != NULL 
-	   && sched_ctx->perf_counters->notify_ready_task)
-		sched_ctx->perf_counters->notify_ready_task(sched_ctx->id, task);
-#endif //STARPU_USE_SC_HYPERVISOR
 
 
 #ifdef HAVE_AYUDAME_H
 #ifdef HAVE_AYUDAME_H
 	if (AYU_event)
 	if (AYU_event)
@@ -587,6 +563,7 @@ struct starpu_task *_starpu_create_conversion_task_for_arch(starpu_data_handle_t
 	return conversion_task;
 	return conversion_task;
 }
 }
 
 
+static
 struct _starpu_sched_ctx* _get_next_sched_ctx_to_pop_into(struct _starpu_worker *worker)
 struct _starpu_sched_ctx* _get_next_sched_ctx_to_pop_into(struct _starpu_worker *worker)
 {	
 {	
 	struct _starpu_sched_ctx *sched_ctx, *good_sched_ctx = NULL;
 	struct _starpu_sched_ctx *sched_ctx, *good_sched_ctx = NULL;
@@ -595,8 +572,8 @@ struct _starpu_sched_ctx* _get_next_sched_ctx_to_pop_into(struct _starpu_worker
 	for (l = worker->sched_ctx_list; l; l = l->next)
 	for (l = worker->sched_ctx_list; l; l = l->next)
 	{
 	{
 		sched_ctx = _starpu_get_sched_ctx_struct(l->sched_ctx);
 		sched_ctx = _starpu_get_sched_ctx_struct(l->sched_ctx);
-		if(worker->removed_from_ctx[sched_ctx->id] == 1 && worker->shares_tasks_lists[sched_ctx->id] == 1)
-			return sched_ctx;
+/* 		if(worker->removed_from_ctx[sched_ctx->id] == 1 && worker->shares_tasks_lists[sched_ctx->id] == 1) */
+/* 			return sched_ctx; */
 		if(sched_ctx->pop_counter[worker->workerid] < worker->nsched_ctxs &&
 		if(sched_ctx->pop_counter[worker->workerid] < worker->nsched_ctxs &&
 		   smallest_counter > sched_ctx->pop_counter[worker->workerid])
 		   smallest_counter > sched_ctx->pop_counter[worker->workerid])
 		{
 		{

+ 163 - 91
src/core/task.c

@@ -31,15 +31,17 @@
 #include <math.h>
 #include <math.h>
 #include <string.h>
 #include <string.h>
 #include <core/debug.h>
 #include <core/debug.h>
+#include <core/sched_ctx.h>
+#include <time.h>
+#ifdef STARPU_HAVE_WINDOWS
+#include <windows.h>
+#endif
 
 
 /* XXX this should be reinitialized when StarPU is shutdown (or we should make
 /* XXX this should be reinitialized when StarPU is shutdown (or we should make
  * sure that no task remains !) */
  * sure that no task remains !) */
 /* TODO we could make this hierarchical to avoid contention ? */
 /* TODO we could make this hierarchical to avoid contention ? */
-static starpu_pthread_cond_t submitted_cond = STARPU_PTHREAD_COND_INITIALIZER;
+//static starpu_pthread_cond_t submitted_cond = STARPU_PTHREAD_COND_INITIALIZER;
 static starpu_pthread_mutex_t submitted_mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
 static starpu_pthread_mutex_t submitted_mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
-static long int nsubmitted = 0, nready = 0;
-
-static void _starpu_increment_nsubmitted_tasks(void);
 
 
 /* This key stores the task currently handled by the thread, note that we
 /* This key stores the task currently handled by the thread, note that we
  * cannot use the worker structure to store that information because it is
  * cannot use the worker structure to store that information because it is
@@ -74,7 +76,7 @@ void starpu_task_init(struct starpu_task *task)
 	task->predicted_transfer = NAN;
 	task->predicted_transfer = NAN;
 
 
 	task->magic = 42;
 	task->magic = 42;
-	task->sched_ctx = _starpu_get_initial_sched_ctx()->id;
+	task->sched_ctx = STARPU_NMAX_SCHED_CTXS;
 
 
 	task->flops = 0.0;
 	task->flops = 0.0;
 
 
@@ -236,7 +238,6 @@ int _starpu_submit_job(struct _starpu_job *j)
 	/* notify bound computation of a new task */
 	/* notify bound computation of a new task */
 	_starpu_bound_record(j);
 	_starpu_bound_record(j);
 
 
-	_starpu_increment_nsubmitted_tasks();
 	_starpu_increment_nsubmitted_tasks_of_sched_ctx(j->task->sched_ctx);
 	_starpu_increment_nsubmitted_tasks_of_sched_ctx(j->task->sched_ctx);
 
 
 #ifdef STARPU_USE_SC_HYPERVISOR
 #ifdef STARPU_USE_SC_HYPERVISOR
@@ -411,19 +412,21 @@ int starpu_task_submit(struct starpu_task *task)
 	int ret;
 	int ret;
 	unsigned is_sync = task->synchronous;
 	unsigned is_sync = task->synchronous;
 	starpu_task_bundle_t bundle = task->bundle;
 	starpu_task_bundle_t bundle = task->bundle;
-	unsigned nsched_ctxs = _starpu_get_nsched_ctxs();
-	unsigned set_sched_ctx = STARPU_NMAX_SCHED_CTXS;
 
 
 	/* internally, StarPU manipulates a struct _starpu_job * which is a wrapper around a
 	/* internally, StarPU manipulates a struct _starpu_job * which is a wrapper around a
 	* task structure, it is possible that this job structure was already
 	* task structure, it is possible that this job structure was already
 	* allocated. */
 	* allocated. */
 	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
 	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
 
 
-	if (task->sched_ctx == _starpu_get_initial_sched_ctx()->id  && nsched_ctxs != 1 && !j->internal)
+	if (j->internal)
 	{
 	{
-		set_sched_ctx = starpu_sched_ctx_get_context();
-		if (set_sched_ctx != STARPU_NMAX_SCHED_CTXS)
-			task->sched_ctx = set_sched_ctx;
+		// Internal tasks are submitted to initial context
+		task->sched_ctx = _starpu_get_initial_sched_ctx()->id;
+	}
+	else if (task->sched_ctx == STARPU_NMAX_SCHED_CTXS)
+	{
+		// If the task has not specified a context, we set the current context
+		task->sched_ctx = _starpu_sched_ctx_get_current_context();
 	}
 	}
 
 
 	if (is_sync)
 	if (is_sync)
@@ -575,7 +578,18 @@ int _starpu_task_submit_nodeps(struct starpu_task *task)
 	}
 	}
 
 
 	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
 	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
-	_starpu_increment_nsubmitted_tasks();
+
+	if (j->internal)
+	{
+		// Internal tasks are submitted to initial context
+		j->task->sched_ctx = _starpu_get_initial_sched_ctx()->id;
+	}
+	else if (task->sched_ctx == STARPU_NMAX_SCHED_CTXS)
+	{
+		// If the task has not specified a context, we set the current context
+		j->task->sched_ctx = _starpu_sched_ctx_get_current_context();
+	}
+
 	_starpu_increment_nsubmitted_tasks_of_sched_ctx(j->task->sched_ctx);
 	_starpu_increment_nsubmitted_tasks_of_sched_ctx(j->task->sched_ctx);
 	STARPU_PTHREAD_MUTEX_LOCK(&j->sync_mutex);
 	STARPU_PTHREAD_MUTEX_LOCK(&j->sync_mutex);
 
 
@@ -629,11 +643,22 @@ int _starpu_task_submit_conversion_task(struct starpu_task *task,
 	}
 	}
 
 
 	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
 	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
-	_starpu_increment_nsubmitted_tasks();
+
+	if (j->internal)
+	{
+		// Internal tasks are submitted to initial context
+		j->task->sched_ctx = _starpu_get_initial_sched_ctx()->id;
+	}
+	else if (task->sched_ctx == STARPU_NMAX_SCHED_CTXS)
+	{
+		// If the task has not specified a context, we set the current context
+		j->task->sched_ctx = _starpu_sched_ctx_get_current_context();
+	}
+
 	_starpu_increment_nsubmitted_tasks_of_sched_ctx(j->task->sched_ctx);
 	_starpu_increment_nsubmitted_tasks_of_sched_ctx(j->task->sched_ctx);
 	STARPU_PTHREAD_MUTEX_LOCK(&j->sync_mutex);
 	STARPU_PTHREAD_MUTEX_LOCK(&j->sync_mutex);
 	j->submitted = 1;
 	j->submitted = 1;
-	_starpu_increment_nready_tasks();
+	_starpu_increment_nready_tasks_of_sched_ctx(j->task->sched_ctx, j->task->flops);
 
 
 	for (i=0 ; i<task->cl->nbuffers ; i++)
 	for (i=0 ; i<task->cl->nbuffers ; i++)
 	{
 	{
@@ -711,38 +736,40 @@ int starpu_task_wait_for_all(void)
 		if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls()))
 		if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls()))
 			return -EDEADLK;
 			return -EDEADLK;
 
 
-		STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
-
-		_STARPU_TRACE_TASK_WAIT_FOR_ALL;
-
-		while (nsubmitted > 0)
-			STARPU_PTHREAD_COND_WAIT(&submitted_cond, &submitted_mutex);
-
-		STARPU_PTHREAD_MUTEX_UNLOCK(&submitted_mutex);
-
 #ifdef HAVE_AYUDAME_H
 #ifdef HAVE_AYUDAME_H
 		if (AYU_event) AYU_event(AYU_BARRIER, 0, NULL);
 		if (AYU_event) AYU_event(AYU_BARRIER, 0, NULL);
 #endif
 #endif
+		struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
+		if(config->topology.nsched_ctxs == 1)
+			starpu_task_wait_for_all_in_ctx(0);
+		else
+		{
+			int s;
+			for(s = 0; s < STARPU_NMAX_SCHED_CTXS; s++)
+			{
+				if(config->sched_ctxs[s].id != STARPU_NMAX_SCHED_CTXS)
+				{
+					starpu_task_wait_for_all_in_ctx(config->sched_ctxs[s].id);
+				}
+			}
+		}
+
+		return 0;
 	}
 	}
 	else
 	else
 	{
 	{
 		_STARPU_DEBUG("Waiting for tasks submitted to context %u\n", sched_ctx_id);
 		_STARPU_DEBUG("Waiting for tasks submitted to context %u\n", sched_ctx_id);
-		_starpu_wait_for_all_tasks_of_sched_ctx(sched_ctx_id);
-#ifdef HAVE_AYUDAME_H
-		/* TODO: improve Temanejo into knowing about contexts ... */
-		if (AYU_event) AYU_event(AYU_BARRIER, 0, NULL);
-#endif
+		return starpu_task_wait_for_all_in_ctx(sched_ctx_id);
 	}
 	}
-	return 0;
 }
 }
 
 
 int starpu_task_wait_for_all_in_ctx(unsigned sched_ctx)
 int starpu_task_wait_for_all_in_ctx(unsigned sched_ctx)
 {
 {
 	_starpu_wait_for_all_tasks_of_sched_ctx(sched_ctx);
 	_starpu_wait_for_all_tasks_of_sched_ctx(sched_ctx);
 #ifdef HAVE_AYUDAME_H
 #ifdef HAVE_AYUDAME_H
+	/* TODO: improve Temanejo into knowing about contexts ... */
 	if (AYU_event) AYU_event(AYU_BARRIER, 0, NULL);
 	if (AYU_event) AYU_event(AYU_BARRIER, 0, NULL);
 #endif
 #endif
-
 	return 0;
 	return 0;
 }
 }
 /*
 /*
@@ -754,39 +781,22 @@ int starpu_task_wait_for_no_ready(void)
 	if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls()))
 	if (STARPU_UNLIKELY(!_starpu_worker_may_perform_blocking_calls()))
 		return -EDEADLK;
 		return -EDEADLK;
 
 
-	STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
-
-	_STARPU_TRACE_TASK_WAIT_FOR_ALL;
-
-	while (nready > 0)
-		STARPU_PTHREAD_COND_WAIT(&submitted_cond, &submitted_mutex);
-
-	STARPU_PTHREAD_MUTEX_UNLOCK(&submitted_mutex);
-
-	return 0;
-}
-
-void _starpu_decrement_nsubmitted_tasks(void)
-{
-	struct _starpu_machine_config *config = _starpu_get_machine_config();
-
-	STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
-
-	if (--nsubmitted == 0)
+	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
+	if(config->topology.nsched_ctxs == 1)
+		_starpu_wait_for_no_ready_of_sched_ctx(0);
+	else
 	{
 	{
-		if (!config->submitting)
+		int s;
+		for(s = 0; s < STARPU_NMAX_SCHED_CTXS; s++)
 		{
 		{
-			ANNOTATE_HAPPENS_AFTER(&config->running);
-			config->running = 0;
-			ANNOTATE_HAPPENS_BEFORE(&config->running);
+			if(config->sched_ctxs[s].id != STARPU_NMAX_SCHED_CTXS)
+			{
+				_starpu_wait_for_no_ready_of_sched_ctx(config->sched_ctxs[s].id);
+			}
 		}
 		}
-		STARPU_PTHREAD_COND_BROADCAST(&submitted_cond);
 	}
 	}
 
 
-	_STARPU_TRACE_UPDATE_TASK_CNT(nsubmitted);
-
-	STARPU_PTHREAD_MUTEX_UNLOCK(&submitted_mutex);
-
+	return 0;
 }
 }
 
 
 void
 void
@@ -795,57 +805,65 @@ starpu_drivers_request_termination(void)
 	struct _starpu_machine_config *config = _starpu_get_machine_config();
 	struct _starpu_machine_config *config = _starpu_get_machine_config();
 
 
 	STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
 	STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
-
+	int nsubmitted = starpu_task_nsubmitted();
 	config->submitting = 0;
 	config->submitting = 0;
 	if (nsubmitted == 0)
 	if (nsubmitted == 0)
 	{
 	{
 		ANNOTATE_HAPPENS_AFTER(&config->running);
 		ANNOTATE_HAPPENS_AFTER(&config->running);
 		config->running = 0;
 		config->running = 0;
 		ANNOTATE_HAPPENS_BEFORE(&config->running);
 		ANNOTATE_HAPPENS_BEFORE(&config->running);
-		STARPU_PTHREAD_COND_BROADCAST(&submitted_cond);
+		int s;
+		for(s = 0; s < STARPU_NMAX_SCHED_CTXS; s++)
+		{
+			if(config->sched_ctxs[s].id != STARPU_NMAX_SCHED_CTXS)
+			{
+				_starpu_check_nsubmitted_tasks_of_sched_ctx(config->sched_ctxs[s].id);
+			}
+		}
 	}
 	}
 
 
 	STARPU_PTHREAD_MUTEX_UNLOCK(&submitted_mutex);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&submitted_mutex);
 }
 }
 
 
-static void _starpu_increment_nsubmitted_tasks(void)
-{
-	STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
-
-	nsubmitted++;
-
-	_STARPU_TRACE_UPDATE_TASK_CNT(nsubmitted);
-
-	STARPU_PTHREAD_MUTEX_UNLOCK(&submitted_mutex);
-}
-
 int starpu_task_nsubmitted(void)
 int starpu_task_nsubmitted(void)
 {
 {
+	int nsubmitted = 0;
+	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
+	if(config->topology.nsched_ctxs == 1)
+		nsubmitted = _starpu_get_nsubmitted_tasks_of_sched_ctx(0);
+	else
+	{
+		int s;
+		for(s = 0; s < STARPU_NMAX_SCHED_CTXS; s++)
+		{
+			if(config->sched_ctxs[s].id != STARPU_NMAX_SCHED_CTXS)
+			{
+				nsubmitted += _starpu_get_nsubmitted_tasks_of_sched_ctx(config->sched_ctxs[s].id);
+			}
+		}
+	}
 	return nsubmitted;
 	return nsubmitted;
 }
 }
 
 
-void _starpu_increment_nready_tasks(void)
-{
-	STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
-
-	nready++;
-
-	STARPU_PTHREAD_MUTEX_UNLOCK(&submitted_mutex);
-}
-
-void _starpu_decrement_nready_tasks(void)
-{
-	STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
-
-	if (--nready == 0)
-		STARPU_PTHREAD_COND_BROADCAST(&submitted_cond);
-
-	STARPU_PTHREAD_MUTEX_UNLOCK(&submitted_mutex);
-
-}
 
 
 int starpu_task_nready(void)
 int starpu_task_nready(void)
 {
 {
+	int nready = 0;
+	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
+	if(config->topology.nsched_ctxs == 1)
+		nready = _starpu_get_nready_tasks_of_sched_ctx(0);
+	else
+	{
+		int s;
+		for(s = 0; s < STARPU_NMAX_SCHED_CTXS; s++)
+		{
+			if(config->sched_ctxs[s].id != STARPU_NMAX_SCHED_CTXS)
+			{
+				nready += _starpu_get_nready_tasks_of_sched_ctx(config->sched_ctxs[s].id);
+			}
+		}
+	}
+
 	return nready;
 	return nready;
 }
 }
 
 
@@ -982,3 +1000,57 @@ char *_starpu_task_get_cpu_name_nth_implementation(struct starpu_codelet *cl, un
 {
 {
 	return cl->cpu_funcs_name[nimpl];
 	return cl->cpu_funcs_name[nimpl];
 }
 }
+
+static starpu_pthread_t watchdog_thread;
+
+/* Check from times to times that StarPU does finish some tasks */
+static void *watchdog_func(void *foo STARPU_ATTRIBUTE_UNUSED)
+{
+	struct timespec ts;
+	char *timeout_env;
+	unsigned long long timeout;
+
+	if (! (timeout_env = getenv("STARPU_WATCHDOG_TIMEOUT")))
+		return NULL;
+
+	timeout = atoll(timeout_env);
+	ts.tv_sec = timeout / 1000000;
+	ts.tv_nsec = (timeout % 1000000) * 1000;
+	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
+	
+	STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
+	while (_starpu_machine_is_running())
+	{
+		int last_nsubmitted = starpu_task_nsubmitted();
+		config->watchdog_ok = 0;
+		STARPU_PTHREAD_MUTEX_UNLOCK(&submitted_mutex);
+
+		_starpu_sleep(ts);
+
+		STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
+		if (!config->watchdog_ok && last_nsubmitted
+				&& last_nsubmitted == starpu_task_nsubmitted())
+		{
+			fprintf(stderr,"The StarPU watchdog detected that no task finished for %u.%06us (can be configure through STARPU_WATCHDOG_TIMEOUT)\n", (unsigned)ts.tv_sec, (unsigned)ts.tv_nsec/1000);
+			if (getenv("STARPU_WATCHDOG_CRASH"))
+			{
+				fprintf(stderr,"Crashing the process\n");
+				assert(0);
+			}
+			else
+				fprintf(stderr,"Set the STARPU_WATCHDOG_CRASH environment variable if you want to abort the process in such a case\n");
+		}
+	}
+	STARPU_PTHREAD_MUTEX_UNLOCK(&submitted_mutex);
+	return NULL;
+}
+
+void _starpu_watchdog_init(void)
+{
+	STARPU_PTHREAD_CREATE(&watchdog_thread, NULL, watchdog_func, NULL);
+}
+
+void _starpu_watchdog_shutdown(void)
+{
+	starpu_pthread_join(watchdog_thread, NULL);
+}
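The watchdog thread only runs when STARPU_WATCHDOG_TIMEOUT is set; it wakes up every timeout microseconds and warns if no task completed in the meantime, aborting only when STARPU_WATCHDOG_CRASH is also set. A hedged way to enable it from the application before starpu_init() (setenv is POSIX; exporting the variables in the shell works equally well):

    #include <stdlib.h>
    #include <starpu.h>

    int main(void)
    {
    	/* warn if no task finishes within one second (value in microseconds) */
    	setenv("STARPU_WATCHDOG_TIMEOUT", "1000000", 1);
    	/* optional: abort the process instead of only printing a warning */
    	/* setenv("STARPU_WATCHDOG_CRASH", "1", 1); */

    	if (starpu_init(NULL) != 0)
    		return 1;
    	/* ... submit tasks ... */
    	starpu_shutdown();
    	return 0;
    }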

+ 3 - 8
src/core/task.h

@@ -26,14 +26,6 @@
 /* Internal version of starpu_task_destroy: don't check task->destroy flag */
 void _starpu_task_destroy(struct starpu_task *task);
 
-/* In order to implement starpu_task_wait_for_all, we keep track of the number of
- * task currently submitted */
-void _starpu_decrement_nsubmitted_tasks(void);
-/* In order to implement starpu_task_wait_for_no_ready, we keep track of the number of
- * task currently ready */
-void _starpu_increment_nready_tasks(void);
-void _starpu_decrement_nready_tasks(void);
-
 /* A pthread key is used to store the task currently executed on the thread.
  * _starpu_initialize_current_task_key initializes this pthread key and
  * _starpu_set_current_task updates its current value. */
@@ -80,4 +72,7 @@ char *_starpu_task_get_cpu_name_nth_implementation(struct starpu_codelet *cl, un
 #define _STARPU_TASK_SET_INTERFACE(task, interface, i) do { if (task->dyn_handles) task->dyn_interfaces[i] = interface; else task->interfaces[i] = interface;} while(0)
 #define _STARPU_TASK_GET_INTERFACES(task) ((task->dyn_handles) ? task->dyn_interfaces : task->interfaces)
 
+void _starpu_watchdog_init(void);
+void _starpu_watchdog_shutdown(void);
+
 #endif // __CORE_TASK_H__

+ 22 - 13
src/core/topology.c

@@ -943,9 +943,9 @@ _starpu_init_machine_config (struct _starpu_machine_config *config, int no_mp_co
 		if (ncpu == -1)
 		if (ncpu == -1)
 		{
 		{
 			unsigned mic_busy_cpus = 0;
 			unsigned mic_busy_cpus = 0;
-			unsigned i = 0;
-			for (i = 0; i < STARPU_MAXMICDEVS; i++)
-				mic_busy_cpus += (topology->nmiccores[i] ? 1 : 0);
+			unsigned j = 0;
+			for (j = 0; j < STARPU_MAXMICDEVS; j++)
+				mic_busy_cpus += (topology->nmiccores[j] ? 1 : 0);

 			unsigned already_busy_cpus = mic_busy_cpus + topology->ncudagpus
 				+ topology->nopenclgpus + topology->nsccdevices;
@@ -1195,8 +1195,8 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 				unsigned worker2;
 				for (worker2 = 0; worker2 < worker; worker2++)
 				{
-					struct _starpu_worker *workerarg = &config->workers[worker];
-					if (workerarg->arch == STARPU_CUDA_WORKER)
+					struct _starpu_worker *workerarg2 = &config->workers[worker2];
+					if (workerarg2->arch == STARPU_CUDA_WORKER)
 					{
 						unsigned memory_node2 = starpu_worker_get_memory_node(worker2);
 						_starpu_register_bus(memory_node2, memory_node);
@@ -1291,16 +1291,17 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 #ifdef STARPU_HAVE_HWLOC
 		/* Put the worker descriptor in the userdata field of the
 		 * hwloc object describing the CPU */
-		hwloc_obj_t worker_obj;
-		worker_obj =
-			hwloc_get_obj_by_depth (config->topology.hwtopology,
-						config->cpu_depth,
-						workerarg->bindid);
-		worker_obj->userdata = &config->workers[worker];
+		hwloc_obj_t worker_obj = hwloc_get_obj_by_depth(config->topology.hwtopology,
+								config->cpu_depth,
+								workerarg->bindid);
+		if (worker_obj->userdata == NULL)
+		{
+			worker_obj->userdata = _starpu_worker_list_new();
+		}
+		_starpu_worker_list_push_front(worker_obj->userdata, workerarg);

 		/* Clear the cpu set and set the cpu */
-		workerarg->hwloc_cpu_set =
-			hwloc_bitmap_dup (worker_obj->cpuset);
+		workerarg->hwloc_cpu_set = hwloc_bitmap_dup (worker_obj->cpuset);
 #endif
 	}
 #ifdef STARPU_SIMGRID
@@ -1388,6 +1389,14 @@ _starpu_destroy_topology (
 #ifdef STARPU_HAVE_HWLOC
 		struct _starpu_worker *workerarg = &config->workers[worker];
 		hwloc_bitmap_free(workerarg->hwloc_cpu_set);
+		hwloc_obj_t worker_obj = hwloc_get_obj_by_depth(config->topology.hwtopology,
+								config->cpu_depth,
+								workerarg->bindid);
+		if (worker_obj->userdata)
+		{
+			_starpu_worker_list_delete(worker_obj->userdata);
+			worker_obj->userdata = NULL;
+		}
 #endif
 	}


+ 5 - 0
src/core/workers.c

@@ -483,6 +483,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 {
 	pconfig->running = 1;
 	pconfig->submitting = 1;
+	STARPU_HG_DISABLE_CHECKING(pconfig->watchdog_ok);

 	STARPU_PTHREAD_KEY_CREATE(&worker_key, NULL);

@@ -1041,6 +1042,8 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 	if (!is_a_sink)
 		_starpu_launch_drivers(&config);

+	_starpu_watchdog_init();
+
 	STARPU_PTHREAD_MUTEX_LOCK(&init_mutex);
 	initialized = INITIALIZED;
 	/* Tell everybody that we initialized */
@@ -1215,6 +1218,8 @@ void starpu_shutdown(void)

 	_starpu_deinitialize_registered_performance_models();

+	_starpu_watchdog_shutdown();
+
 	/* wait for their termination */
 	_starpu_terminate_workers(&config);


+ 4 - 3
src/core/workers.h

@@ -53,8 +53,7 @@
 #include <starpu_parameters.h>

 /* This is initialized from in _starpu_worker_init */
-struct _starpu_worker
-{
+LIST_TYPE(_starpu_worker,
 	struct _starpu_machine_config *config;
         starpu_pthread_mutex_t mutex;
 	enum starpu_worker_archtype arch; /* what is the type of worker ? */
@@ -116,7 +115,7 @@ struct _starpu_worker
 	/* hwloc_obj_t of the device controled by the worker*/
 	hwloc_obj_t hw_obj;
 #endif
-};
+);

 struct _starpu_combined_worker
 {
@@ -313,6 +312,8 @@ struct _starpu_machine_config

 	/* this flag is set until the application is finished submitting tasks */
 	unsigned submitting;
+
+	int watchdog_ok;
 };

 /* Three functions to manage argv, argc */

+ 20 - 1
src/datawizard/coherency.c

@@ -24,6 +24,7 @@
 #include <profiling/profiling.h>
 #include <math.h>
 #include <core/task.h>
+#include <starpu_scheduler.h>

 static int link_supports_direct_transfers(starpu_data_handle_t handle, unsigned src_node, unsigned dst_node, unsigned *handling_node);
 unsigned _starpu_select_src_node(starpu_data_handle_t handle, unsigned destination)
@@ -428,12 +429,30 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 	/* find someone who already has the data */
 	unsigned src_node = 0;

-	/* if the data is in write only mode, there is no need for a source */
 	if (mode & STARPU_R)
 	{
 		src_node = _starpu_select_src_node(handle, requesting_node);
 		STARPU_ASSERT(src_node != requesting_node);
 	}
+	else
+	{
+		/* if the data is in write only mode, there is no need for a source */
+		if (requesting_node == STARPU_MAIN_RAM) {
+			/* And this is the main RAM, really no need for a
+			 * request, just allocate */
+			if (_starpu_allocate_memory_on_node(handle, dst_replicate, is_prefetch) == 0)
+			{
+				_starpu_update_data_state(handle, dst_replicate, mode);
+
+				_starpu_spin_unlock(&handle->header_lock);
+
+				if (callback_func)
+					callback_func(callback_arg);
+				_STARPU_LOG_OUT_TAG("data immediately allocated");
+				return NULL;
+			}
+		}
+	}

 	/* We can safely assume that there won't be more than 2 hops in the
 	 * current implementation */
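Note: the branch added above lets a write-only request targeting main RAM be satisfied by a plain allocation, with no source replicate and no transfer. A hedged illustration from the application side, using only the public StarPU API (the vector size and fill value are arbitrary):

```c
#include <starpu.h>

static void fill_cpu(void *buffers[], void *arg)
{
	(void)arg;
	float *ptr = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);
	unsigned i, n = STARPU_VECTOR_GET_NX(buffers[0]);
	for (i = 0; i < n; i++)
		ptr[i] = 1.0f;
}

static struct starpu_codelet fill_cl =
{
	.cpu_funcs = { fill_cpu, NULL },
	.nbuffers = 1,
	.modes = { STARPU_W },	/* write-only: no previous contents are fetched */
};

int submit_fill(unsigned n)
{
	starpu_data_handle_t handle;
	/* Home node -1: StarPU allocates the buffer itself when first needed,
	 * which is precisely the case the change above short-circuits. */
	starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, n, sizeof(float));
	int ret = starpu_task_insert(&fill_cl, STARPU_W, handle, 0);
	starpu_task_wait_for_all();
	starpu_data_unregister(handle);
	return ret;
}
```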

+ 52 - 23
src/datawizard/data_request.c

@@ -24,8 +24,8 @@
  * Data interfaces should also have to declare how many asynchronous requests
  * they have actually started (think of e.g. csr).
  */
-#define MAX_PENDING_REQUESTS_PER_NODE 400
-#define MAX_PENDING_PREFETCH_REQUESTS_PER_NODE 200
+#define MAX_PENDING_REQUESTS_PER_NODE 20
+#define MAX_PENDING_PREFETCH_REQUESTS_PER_NODE 10

 /* requests that have not been treated at all */
 static struct _starpu_data_request_list *data_requests[STARPU_MAXNODES];
@@ -356,17 +356,12 @@ static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned m
 {
 	starpu_data_handle_t handle = r->handle;

-	if (prefetch) {
-		if (_starpu_spin_trylock(&handle->header_lock))
-			return -EBUSY;
-		if (_starpu_spin_trylock(&r->lock))
-		{
-			_starpu_spin_unlock(&handle->header_lock);
-			return -EBUSY;
-		}
-	} else {
-		_starpu_spin_lock(&handle->header_lock);
-		_starpu_spin_lock(&r->lock);
+	if (_starpu_spin_trylock(&handle->header_lock))
+		return -EBUSY;
+	if (_starpu_spin_trylock(&r->lock))
+	{
+		_starpu_spin_unlock(&handle->header_lock);
+		return -EBUSY;
 	}

 	struct _starpu_data_replicate *src_replicate = r->src_replicate;
@@ -420,19 +415,25 @@ static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned m
 	return 0;
 }

-void _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc)
+int _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc)
 {
 	struct _starpu_data_request *r;
 	struct _starpu_data_request_list *new_data_requests;
+	struct _starpu_data_request_list *empty_list;
+	int ret = 0;

 	/* Here helgrind would should that this is an un protected access.
 	 * We however don't care about missing an entry, we will get called
 	 * again sooner or later. */
 	if (_starpu_data_request_list_empty(data_requests[src_node]))
-		return;
+		return 0;
+
+	empty_list = _starpu_data_request_list_new();

 	/* take all the entries from the request list */
-        STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[src_node]);
+	if (STARPU_PTHREAD_MUTEX_TRYLOCK(&data_requests_list_mutex[src_node]))
+		/* List is busy, do not bother with it */
+		return -EBUSY;

 	struct _starpu_data_request_list *local_list = data_requests[src_node];

@@ -441,13 +442,14 @@ void _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc)
 		/* there is no request */
                 STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[src_node]);

-		return;
+		_starpu_data_request_list_delete(empty_list);
+		return 0;
 	}

 	/* There is an entry: we create a new empty list to replace the list of
 	 * requests, and we handle the request(s) one by one in the former
 	 * list, without concurrency issues.*/
-	data_requests[src_node] = _starpu_data_request_list_new();
+	data_requests[src_node] = empty_list;
 	STARPU_HG_DISABLE_CHECKING(data_requests[src_node]->_head);

 	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[src_node]);
@@ -463,6 +465,7 @@ void _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc)
 		{
 			/* Too many requests at the same time, skip pushing
 			 * more for now */
+			ret = -EBUSY;
 			break;
 		}

@@ -471,6 +474,8 @@ void _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc)
 		res = starpu_handle_data_request(r, may_alloc, 0);
 		if (res != 0 && res != -EAGAIN)
 		{
+			/* handle is busy, or not enough memory, postpone for now */
+			ret = res;
 			_starpu_data_request_list_push_back(new_data_requests, r);
 			break;
 		}
@@ -491,6 +496,8 @@ void _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc)

 	_starpu_data_request_list_delete(new_data_requests);
 	_starpu_data_request_list_delete(local_list);
+
+	return ret;
 }

 void _starpu_handle_node_prefetch_requests(unsigned src_node, unsigned may_alloc)
@@ -498,12 +505,17 @@ void _starpu_handle_node_prefetch_requests(unsigned src_node, unsigned may_alloc
 	struct _starpu_data_request *r;
 	struct _starpu_data_request_list *new_data_requests;
 	struct _starpu_data_request_list *new_prefetch_requests;
+	struct _starpu_data_request_list *empty_list;

 	if (_starpu_data_request_list_empty(prefetch_requests[src_node]))
 		return;

+	empty_list = _starpu_data_request_list_new();
+
 	/* take all the entries from the request list */
-        STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[src_node]);
+	if (STARPU_PTHREAD_MUTEX_TRYLOCK(&data_requests_list_mutex[src_node]))
+		/* List is busy, do not bother with it */
+		return;

 	struct _starpu_data_request_list *local_list = prefetch_requests[src_node];

@@ -511,13 +523,14 @@ void _starpu_handle_node_prefetch_requests(unsigned src_node, unsigned may_alloc
 	{
 		/* there is no request */
                 STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[src_node]);
+		_starpu_data_request_list_delete(empty_list);
 		return;
 	}

 	/* There is an entry: we create a new empty list to replace the list of
 	 * requests, and we handle the request(s) one by one in the former
 	 * list, without concurrency issues.*/
-	prefetch_requests[src_node] = _starpu_data_request_list_new();
+	prefetch_requests[src_node] = empty_list;

 	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[src_node]);

@@ -582,12 +595,16 @@ static void _handle_pending_node_data_requests(unsigned src_node, unsigned force
 //	_STARPU_DEBUG("_starpu_handle_pending_node_data_requests ...\n");
 //
 	struct _starpu_data_request_list *new_data_requests_pending;
+	struct _starpu_data_request_list *empty_list;
 	unsigned taken, kept;

 	if (_starpu_data_request_list_empty(data_requests_pending[src_node]))
 		return;

-	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[src_node]);
+	empty_list = _starpu_data_request_list_new();
+	if (STARPU_PTHREAD_MUTEX_TRYLOCK(&data_requests_pending_list_mutex[src_node]) && !force)
+		/* List is busy, do not bother with it */
+		return;

 	/* for all entries of the list */
 	struct _starpu_data_request_list *local_list = data_requests_pending[src_node];
@@ -595,9 +612,10 @@ static void _handle_pending_node_data_requests(unsigned src_node, unsigned force
 	{
 		/* there is no request */
 		STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[src_node]);
+		_starpu_data_request_list_delete(empty_list);
 		return;
 	}
-	data_requests_pending[src_node] = _starpu_data_request_list_new();
+	data_requests_pending[src_node] = empty_list;

 	STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_pending_list_mutex[src_node]);

@@ -613,8 +631,19 @@ static void _handle_pending_node_data_requests(unsigned src_node, unsigned force

 		starpu_data_handle_t handle = r->handle;

-		_starpu_spin_lock(&handle->header_lock);
+		if (force)
+			/* Have to wait for the handle, whatever it takes */
+			_starpu_spin_lock(&handle->header_lock);
+		else
+			if (_starpu_spin_trylock(&handle->header_lock))
+			{
+				/* Handle is busy, retry this later */
+				_starpu_data_request_list_push_back(new_data_requests_pending, r);
+				kept++;
+				continue;
+			}

+		/* This shouldn't be too hard to acquire */
 		_starpu_spin_lock(&r->lock);

 		/* wait until the transfer is terminated */
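Note: the changes above replace unconditional locking with trylocks so the progress path never blocks on a busy list or handle; callers simply retry on the next progression call. A standalone, hypothetical sketch of that pattern in plain POSIX (not StarPU internals):

```c
#include <errno.h>
#include <pthread.h>

struct work_queue
{
	pthread_mutex_t mutex;
	/* ... pending items ... */
};

/* Try to service the queue; return -EBUSY instead of blocking, so the
 * caller (a driver progression loop) can just try again later. */
static int try_drain(struct work_queue *q)
{
	if (pthread_mutex_trylock(&q->mutex))
		return -EBUSY;	/* somebody else is already draining it */

	/* ... pop and handle items, stopping early if a resource is busy ... */

	pthread_mutex_unlock(&q->mutex);
	return 0;
}
```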

+ 2 - 1
src/datawizard/data_request.h

@@ -109,7 +109,8 @@ LIST_TYPE(_starpu_data_requester,
 void _starpu_init_data_request_lists(void);
 void _starpu_deinit_data_request_lists(void);
 void _starpu_post_data_request(struct _starpu_data_request *r, unsigned handling_node);
-void _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc);
+/* returns 0 if we have pushed all requests, -EBUSY or -ENOMEM otherwise */
+int _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc);
 void _starpu_handle_node_prefetch_requests(unsigned src_node, unsigned may_alloc);

 void _starpu_handle_pending_node_data_requests(unsigned src_node);

+ 4 - 2
src/datawizard/datawizard.c

@@ -36,8 +36,10 @@ void _starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc)

 	/* in case some other driver requested data */
 	_starpu_handle_pending_node_data_requests(memory_node);
-	_starpu_handle_node_data_requests(memory_node, may_alloc);
-	_starpu_handle_node_prefetch_requests(memory_node, may_alloc);
+	if (_starpu_handle_node_data_requests(memory_node, may_alloc) == 0)
+		/* We pushed all pending requests, we can afford pushing
+		 * prefetch requests */
+		_starpu_handle_node_prefetch_requests(memory_node, may_alloc);
 	_starpu_execute_registered_progression_hooks();
 }


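Note: the intent of the change above is that prefetch traffic is issued only once every demand (non-prefetch) request of the node could be pushed. A simplified, hypothetical sketch of that policy in isolation (the helper functions are stand-ins, not StarPU symbols):

```c
/* Hypothetical stand-ins for the per-node request handlers. */
static void handle_pending(unsigned node) { (void)node; }
static int  handle_demand(unsigned node, unsigned may_alloc) { (void)node; (void)may_alloc; return 0; }
static void handle_prefetch(unsigned node, unsigned may_alloc) { (void)node; (void)may_alloc; }

/* Demand requests are always serviced; prefetches only when every demand
 * request could be pushed, so they never delay data a task is waiting for. */
static void progress_node(unsigned node, unsigned may_alloc)
{
	handle_pending(node);
	if (handle_demand(node, may_alloc) == 0)
		handle_prefetch(node, may_alloc);
}
```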
+ 3 - 1
src/datawizard/filters.c

@@ -258,7 +258,7 @@ void starpu_data_partition(starpu_data_handle_t initial_handle, struct starpu_da
 		child->footprint = _starpu_compute_data_footprint(child);

 		void *ptr;
-		ptr = starpu_data_handle_to_pointer(child, 0);
+		ptr = starpu_data_handle_to_pointer(child, STARPU_MAIN_RAM);
 		if (ptr != NULL)
 			_starpu_data_register_ram_pointer(child, ptr);
 	}
@@ -281,6 +281,7 @@ void starpu_data_unpartition(starpu_data_handle_t root_handle, unsigned gatherin
 	unsigned node;
 	unsigned sizes[root_handle->nchildren];

+	_STARPU_TRACE_START_UNPARTITION(root_handle, gathering_node);
 	_starpu_spin_lock(&root_handle->header_lock);

 	STARPU_ASSERT_MSG(root_handle->nchildren != 0, "data %p is not partitioned, can not unpartition it", root_handle);
@@ -435,6 +436,7 @@ void starpu_data_unpartition(starpu_data_handle_t root_handle, unsigned gatherin

 	/* now the parent may be used again so we release the lock */
 	_starpu_spin_unlock(&root_handle->header_lock);
+	_STARPU_TRACE_END_UNPARTITION(root_handle, gathering_node);
 }

 /* each child may have his own interface type */
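Note: the new START/END_UNPARTITION trace events bracket starpu_data_unpartition(). For reference, a minimal partition/unpartition sequence on the public API (sizes and the block filter choice are only illustrative) is:

```c
#include <starpu.h>

void scale_by_blocks(float *data, unsigned n, unsigned nparts)
{
	starpu_data_handle_t handle;
	starpu_vector_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)data, n, sizeof(float));

	struct starpu_data_filter f =
	{
		.filter_func = starpu_vector_filter_block,
		.nchildren = nparts,
	};
	starpu_data_partition(handle, &f);

	/* ... submit one task per child via starpu_data_get_sub_data(handle, 1, i) ... */

	/* This is the call now traced as the "Unpartitioning" worker state. */
	starpu_data_unpartition(handle, STARPU_MAIN_RAM);
	starpu_data_unregister(handle);
}
```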

+ 1 - 0
src/datawizard/footprint.c

@@ -18,6 +18,7 @@
 #include <datawizard/footprint.h>
 #include <starpu_hash.h>
 #include <core/task.h>
+#include <starpu_scheduler.h>

 uint32_t _starpu_compute_buffers_footprint(struct starpu_perfmodel *model, struct starpu_perfmodel_arch * arch, unsigned nimpl, struct _starpu_job *j)
 {

+ 5 - 5
src/datawizard/interfaces/block_interface.c

@@ -589,9 +589,9 @@ static int copy_opencl_common(void *src_interface, unsigned src_node, void *dst_
 		if (((nx*ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
 		{
 			ret = starpu_opencl_copy_async_sync(src_block->dev_handle, src_block->offset, src_node,
-								dst_block->dev_handle, dst_block->offset, dst_node,
-							       src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
-							       event);
+							    dst_block->dev_handle, dst_block->offset, dst_node,
+							    src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
+							    event);
                 }
 		else
 		{
@@ -615,8 +615,8 @@ static int copy_opencl_common(void *src_interface, unsigned src_node, void *dst_
 								    dst_block->dev_handle,
 								    dst_block->offset + layer*dst_block->ldz*dst_block->elemsize + j*dst_block->ldy*dst_block->elemsize,
 								    dst_node,
-								       src_block->nx*src_block->elemsize,
-								       event);
+								    src_block->nx*src_block->elemsize,
+								    event);
                         }
                 }
         }

+ 5 - 4
src/datawizard/interfaces/data_interface.c

@@ -283,7 +283,7 @@ static void _starpu_register_new_data(starpu_data_handle_t handle,
 	/* now the data is available ! */
 	_starpu_spin_unlock(&handle->header_lock);

-	ptr = starpu_data_handle_to_pointer(handle, 0);
+	ptr = starpu_data_handle_to_pointer(handle, STARPU_MAIN_RAM);
 	if (ptr != NULL)
 	{
 		_starpu_data_register_ram_pointer(handle, ptr);
@@ -442,7 +442,8 @@ int starpu_data_set_tag(starpu_data_handle_t handle, int tag)
 	return 0;
 }

-int starpu_data_release_tag(starpu_data_handle_t handle)
+static
+int _starpu_data_release_tag(starpu_data_handle_t handle)
 {
 	struct handle_tag_entry *tag_entry;

@@ -476,7 +477,7 @@ void _starpu_data_free_interfaces(starpu_data_handle_t handle)
 	unsigned worker;
 	unsigned nworkers = starpu_worker_get_count();

-	ram_ptr = starpu_data_handle_to_pointer(handle, 0);
+	ram_ptr = starpu_data_handle_to_pointer(handle, STARPU_MAIN_RAM);

 	for (node = 0; node < STARPU_MAXNODES; node++)
 		free(handle->per_node[node].data_interface);
@@ -734,7 +735,7 @@ static void _starpu_data_unregister(starpu_data_handle_t handle, unsigned cohere
 	STARPU_PTHREAD_COND_DESTROY(&handle->busy_cond);
 	STARPU_PTHREAD_MUTEX_DESTROY(&handle->sequential_consistency_mutex);

-	starpu_data_release_tag(handle);
+	_starpu_data_release_tag(handle);

 	free(handle);
 }

+ 15 - 9
src/datawizard/interfaces/matrix_interface.c

@@ -404,7 +404,11 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
 		(char *)src_matrix->ptr, src_matrix->ld*elemsize,
 		src_matrix->nx*elemsize, src_matrix->ny, kind);
 	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
+	{
+		int ret = copy_any_to_any(src_interface, src_node, dst_interface, dst_node, (void*)(uintptr_t)is_async);
+		if (ret == -EAGAIN) return ret;
+		if (ret) STARPU_CUDA_REPORT_ERROR(cures);
+	}
 #endif

 	_STARPU_TRACE_DATA_COPY(src_node, dst_node, (size_t)src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
@@ -644,15 +648,17 @@ static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_int
 			ret = -EAGAIN;
 	}
 	else
-	for (y = 0; y < ny; y++)
 	{
-		uint32_t src_offset = y*ld_src*elemsize;
-		uint32_t dst_offset = y*ld_dst*elemsize;
-
-		if (starpu_interface_copy(src_matrix->dev_handle, src_matrix->offset + src_offset, src_node,
-		                          dst_matrix->dev_handle, dst_matrix->offset + dst_offset, dst_node,
-		                          nx*elemsize, async_data))
-			ret = -EAGAIN;
+	     for (y = 0; y < ny; y++)
+	     {
+		     uint32_t src_offset = y*ld_src*elemsize;
+		     uint32_t dst_offset = y*ld_dst*elemsize;
+
+		     if (starpu_interface_copy(src_matrix->dev_handle, src_matrix->offset + src_offset, src_node,
+					       dst_matrix->dev_handle, dst_matrix->offset + dst_offset, dst_node,
+					       nx*elemsize, async_data))
+			     ret = -EAGAIN;
+	     }
 	}

 	_STARPU_TRACE_DATA_COPY(src_node, dst_node, (size_t)nx*ny*elemsize);

+ 37 - 17
src/datawizard/malloc.c

@@ -24,6 +24,7 @@
 #include <starpu.h>
 #include <drivers/opencl/driver_opencl.h>
 #include <datawizard/memory_manager.h>
+#include <datawizard/malloc.h>

 static size_t _malloc_align = sizeof(void*);

@@ -91,7 +92,7 @@ int starpu_malloc_flags(void **A, size_t dim, int flags)

 	if (flags & STARPU_MALLOC_COUNT)
 	{
-		if (_starpu_memory_manager_can_allocate_size(dim, 0) == 0)
+		if (_starpu_memory_manager_can_allocate_size(dim, STARPU_MAIN_RAM) == 0)
 		{
 			size_t freed;
 			size_t reclaim = 2 * dim;
@@ -108,9 +109,15 @@ int starpu_malloc_flags(void **A, size_t dim, int flags)
 		}
 	}

-#ifndef STARPU_SIMGRID
 	if (flags & STARPU_MALLOC_PINNED)
 	{
+#ifdef STARPU_SIMGRID
+		/* FIXME: CUDA seems to be taking 650µs every 1MiB.
+		 * Ideally we would simulate this batching in 1MiB requests
+		 * instead of computing an average value.
+		 */
+		MSG_process_sleep((float) dim * 0.000650 / 1048576.);
+#else /* STARPU_SIMGRID */
 		if (_starpu_can_submit_cuda_task())
 		{
 #ifdef STARPU_USE_CUDA
@@ -176,8 +183,8 @@ int starpu_malloc_flags(void **A, size_t dim, int flags)
 //			goto end;
 //#endif /* STARPU_USE_OPENCL */
 //		}
-	}
 #endif /* STARPU_SIMGRID */
+	}

 	if (_starpu_can_submit_scc_task())
 	{
@@ -338,7 +345,7 @@ int starpu_free_flags(void *A, size_t dim, int flags)
 out:
 	if (flags & STARPU_MALLOC_COUNT)
 	{
-		_starpu_memory_manager_deallocate_size(dim, 0);
+		_starpu_memory_manager_deallocate_size(dim, STARPU_MAIN_RAM);
 	}

 	return 0;
@@ -381,14 +388,20 @@ _starpu_malloc_on_node(unsigned dst_node, size_t size)
 		}
 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
 		case STARPU_CUDA_RAM:
+		{
 #ifdef STARPU_SIMGRID
+			static uintptr_t last[STARPU_MAXNODES];
 #ifdef STARPU_DEVEL
 #warning TODO: record used memory, using a simgrid property to know the available memory
 #endif
-			/* Sleep 10µs for the allocation */
+			/* Sleep for the allocation */
 			STARPU_PTHREAD_MUTEX_LOCK(&cuda_alloc_mutex);
-			MSG_process_sleep(0.000010);
-			addr = 1;
+			MSG_process_sleep(0.000175);
+			if (!last[dst_node])
+				last[dst_node] = 1<<10;
+			addr = last[dst_node];
+			last[dst_node]+=size;
+			STARPU_ASSERT(last[dst_node] >= addr);
 			STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_alloc_mutex);
 #else
 			status = cudaMalloc((void **)&addr, size);
@@ -400,15 +413,21 @@ _starpu_malloc_on_node(unsigned dst_node, size_t size)
 			}
 #endif
 			break;
+		}
 #endif
 #if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
 	        case STARPU_OPENCL_RAM:
-			{
+		{
 #ifdef STARPU_SIMGRID
-				/* Sleep 10µs for the allocation */
+				static uintptr_t last[STARPU_MAXNODES];
+				/* Sleep for the allocation */
 				STARPU_PTHREAD_MUTEX_LOCK(&opencl_alloc_mutex);
-				MSG_process_sleep(0.000010);
-				addr = 1;
+				MSG_process_sleep(0.000175);
+				if (!last[dst_node])
+					last[dst_node] = 1<<10;
+				addr = last[dst_node];
+				last[dst_node]+=size;
+				STARPU_ASSERT(last[dst_node] >= addr);
 				STARPU_PTHREAD_MUTEX_UNLOCK(&opencl_alloc_mutex);
 #else
                                 int ret;
@@ -425,14 +444,14 @@ _starpu_malloc_on_node(unsigned dst_node, size_t size)
 				}
 				break;
 #endif
-			}
+		}
 #endif
 	        case STARPU_DISK_RAM:
 		{
 			addr = (uintptr_t) _starpu_disk_alloc(dst_node, size);
 			break;
 		}
-			
+
 #ifdef STARPU_USE_MIC
 		case STARPU_MIC_RAM:
 			if (_starpu_mic_allocate_memory((void **)(&addr), size, dst_node))
@@ -481,8 +500,8 @@ _starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size)
 		{
 #ifdef STARPU_SIMGRID
 			STARPU_PTHREAD_MUTEX_LOCK(&cuda_alloc_mutex);
-			/* Sleep 10µs for the free */
-			MSG_process_sleep(0.000010);
+			/* Sleep for the free */
+			MSG_process_sleep(0.000125);
 			STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_alloc_mutex);
 #else
 			cudaError_t err;
@@ -498,8 +517,8 @@ _starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size)
 		{
 #ifdef STARPU_SIMGRID
 			STARPU_PTHREAD_MUTEX_LOCK(&opencl_alloc_mutex);
-			/* Sleep 10µs for the free */
-			MSG_process_sleep(0.000010);
+			/* Sleep for the free */
+			MSG_process_sleep(0.000125);
 			STARPU_PTHREAD_MUTEX_UNLOCK(&opencl_alloc_mutex);
 #else
 			cl_int err;
@@ -774,6 +793,7 @@ starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size)
 	{
 		STARPU_ASSERT(prevblock >= 0 && prevblock <= CHUNK_NBLOCKS);
 		nextblock = bitmap[prevblock].next;
+		STARPU_ASSERT_MSG(nextblock != block, "It seems data 0x%lx (size %u) on node %u is being freed a second time\n", (unsigned long) addr, (unsigned) size, dst_node);
 		if (nextblock > block || nextblock == -1)
 			break;
 	}
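Note: in simulation mode the code above no longer returns the constant fake address 1 but hands out distinct addresses from a per-node counter, so later frees and sanity checks remain meaningful. A hedged, standalone sketch of that bump scheme (MAXNODES is an illustrative bound standing in for STARPU_MAXNODES):

```c
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#define MAXNODES 16

/* Hand out fake, monotonically increasing device addresses per node.
 * Nothing is really allocated; only bookkeeping (and simulated time) matters. */
static uintptr_t sim_alloc(unsigned node, size_t size)
{
	static uintptr_t last[MAXNODES];

	if (!last[node])
		last[node] = 1 << 10;	/* keep 0 free to mean "no address" */
	uintptr_t addr = last[node];
	last[node] += size;
	assert(last[node] >= addr);	/* catch wrap-around */
	return addr;
}
```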

+ 2 - 0
src/datawizard/malloc.h

@@ -20,4 +20,6 @@
 void _starpu_malloc_init(unsigned dst_node);
 void _starpu_malloc_shutdown(unsigned dst_node);

+void _starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size);
+
 #endif

+ 72 - 65
src/datawizard/memalloc.c

@@ -24,7 +24,7 @@

 /* This per-node RW-locks protect mc_list and memchunk_cache entries */
 /* Note: handle header lock is always taken before this */
-static starpu_pthread_rwlock_t mc_rwlock[STARPU_MAXNODES];
+static struct _starpu_spinlock mc_lock[STARPU_MAXNODES];

 /* Potentially in use memory chunks */
 static struct _starpu_mem_chunk_list *mc_list[STARPU_MAXNODES];
@@ -34,6 +34,7 @@ struct mc_cache_entry
 {
 	UT_hash_handle hh;
 	struct _starpu_mem_chunk_list *list;
+	uint32_t footprint;
 };
 static struct mc_cache_entry *mc_cache[STARPU_MAXNODES];

@@ -49,7 +50,7 @@ void _starpu_init_mem_chunk_lists(void)
 	unsigned i;
 	for (i = 0; i < STARPU_MAXNODES; i++)
 	{
-		STARPU_PTHREAD_RWLOCK_INIT(&mc_rwlock[i], NULL);
+		_starpu_spin_init(&mc_lock[i]);
 		mc_list[i] = _starpu_mem_chunk_list_new();
 	}
 }
@@ -67,7 +68,7 @@ void _starpu_deinit_mem_chunk_lists(void)
 			_starpu_mem_chunk_list_delete(entry->list);
 			free(entry);
 		}
-		STARPU_PTHREAD_RWLOCK_DESTROY(&mc_rwlock[i]);
+		_starpu_spin_destroy(&mc_lock[i]);
 	}
 }

@@ -399,7 +400,7 @@ static size_t try_to_free_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node)
 }

 #ifdef STARPU_USE_ALLOCATION_CACHE
-/* We assume that mc_rwlock[node] is taken. is_already_in_mc_list indicates
+/* We assume that mc_lock[node] is taken. is_already_in_mc_list indicates
  * that the mc is already in the list of buffers that are possibly used, and
  * therefore not in the cache. */
 static void reuse_mem_chunk(unsigned node, struct _starpu_data_replicate *new_replicate, struct _starpu_mem_chunk *mc, unsigned is_already_in_mc_list)
@@ -408,14 +409,6 @@ static void reuse_mem_chunk(unsigned node, struct _starpu_data_replicate *new_re
 	 * of the "to free" list, and reassign it to the new
 	 * piece of data */

-	if (!is_already_in_mc_list)
-	{
-		uint32_t footprint = _starpu_compute_data_footprint(new_replicate->handle);
-		struct mc_cache_entry *entry;
-		HASH_FIND(hh, mc_cache[node], &footprint, sizeof(footprint), entry);
-		_starpu_mem_chunk_list_erase(entry->list, mc);
-	}
-
 	struct _starpu_data_replicate *old_replicate = mc->replicate;
 	old_replicate->allocated = 0;
 	old_replicate->automatically_allocated = 0;
@@ -427,7 +420,7 @@ static void reuse_mem_chunk(unsigned node, struct _starpu_data_replicate *new_re

 	STARPU_ASSERT(new_replicate->data_interface);
 	STARPU_ASSERT(mc->chunk_interface);
-	memcpy(new_replicate->data_interface, mc->chunk_interface, old_replicate->handle->ops->interface_size);
+	memcpy(new_replicate->data_interface, mc->chunk_interface, mc->size_interface);

 	mc->data = new_replicate->handle;
 	/* mc->ops, mc->footprint and mc->interface should be
@@ -484,7 +477,7 @@ static int _starpu_data_interface_compare(void *data_interface_a, struct starpu_
 	return ret;
 }

-/* This function must be called with mc_rwlock[node] taken in write mode */
+/* This function must be called with mc_lock[node] taken */
 static struct _starpu_mem_chunk *_starpu_memchunk_cache_lookup_locked(unsigned node, starpu_data_handle_t handle, uint32_t footprint)
 {
 	/* go through all buffers in the cache */
@@ -517,7 +510,7 @@ static struct _starpu_mem_chunk *_starpu_memchunk_cache_lookup_locked(unsigned n

 /* this function looks for a memory chunk that matches a given footprint in the
  * list of mem chunk that need to be freed. This function must be called with
- * mc_rwlock[node] taken in write mode. */
+ * mc_lock[node] taken. */
 static unsigned try_to_find_reusable_mem_chunk(unsigned node, starpu_data_handle_t data, struct _starpu_data_replicate *replicate, uint32_t footprint)
 {
 	struct _starpu_mem_chunk *mc, *next_mc;
@@ -558,7 +551,7 @@ static unsigned try_to_find_reusable_mem_chunk(unsigned node, starpu_data_handle

 /*
  * Free the memory chuncks that are explicitely tagged to be freed. The
- * mc_rwlock[node] rw-lock should be taken prior to calling this function.
+ * mc_lock[node] rw-lock should be taken prior to calling this function.
  */
 static size_t flush_memchunk_cache(unsigned node, size_t reclaim)
 {
@@ -568,7 +561,7 @@ static size_t flush_memchunk_cache(unsigned node, size_t reclaim)

 	size_t freed = 0;

-	STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[node]);
+	_starpu_spin_lock(&mc_lock[node]);
 	HASH_ITER(hh, mc_cache[node], entry, tmp)
 	{
 		busy_mc_cache = _starpu_mem_chunk_list_new();
@@ -600,7 +593,7 @@ static size_t flush_memchunk_cache(unsigned node, size_t reclaim)
 		if (reclaim && freed >= reclaim)
 			break;
 	}
-	STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
+	_starpu_spin_unlock(&mc_lock[node]);
 	return freed;
 }

@@ -608,7 +601,7 @@ static size_t flush_memchunk_cache(unsigned node, size_t reclaim)
  * Try to free the buffers currently in use on the memory node. If the force
  * flag is set, the memory is freed regardless of coherency concerns (this
  * should only be used at the termination of StarPU for instance). The
- * mc_rwlock[node] rw-lock should be taken prior to calling this function.
+ * mc_lock[node] should be taken prior to calling this function.
  */
 static size_t free_potentially_in_use_mc(unsigned node, unsigned force, size_t reclaim)
 {
@@ -617,7 +610,7 @@ static size_t free_potentially_in_use_mc(unsigned node, unsigned force, size_t r
 	struct _starpu_mem_chunk *mc, *next_mc;

 	/*
-	 * We have to unlock mc_rwlock before locking header_lock, so we have
+	 * We have to unlock mc_lock before locking header_lock, so we have
 	 * to be careful with the list.  We try to do just one pass, by
 	 * remembering the next mc to be tried. If it gets dropped, we restart
 	 * from zero. So we continue until we go through the whole list without
@@ -625,7 +618,7 @@ static size_t free_potentially_in_use_mc(unsigned node, unsigned force, size_t r
 	 */

 restart:
-	STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[node]);
+	_starpu_spin_lock(&mc_lock[node]);

 	for (mc = _starpu_mem_chunk_list_begin(mc_list[node]);
 	     mc != _starpu_mem_chunk_list_end(mc_list[node]);
@@ -652,7 +645,7 @@ restart:
 				 * still locking the handle. That's not
 				 * supposed to happen, but better be safe by
 				 * letting it go through. */
-				STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
+				_starpu_spin_unlock(&mc_lock[node]);
 				goto restart;
 			}

@@ -664,7 +657,7 @@ restart:
 			_starpu_spin_unlock(&handle->header_lock);
 		}
 	}
-	STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
+	_starpu_spin_unlock(&mc_lock[node]);

 	return freed;
 }
@@ -721,6 +714,7 @@ static struct _starpu_mem_chunk *_starpu_memchunk_init(struct _starpu_data_repli

 	/* Save a copy of the interface */
 	mc->chunk_interface = malloc(interface_size);
+	mc->size_interface = interface_size;
 	STARPU_ASSERT(mc->chunk_interface);
 	memcpy(mc->chunk_interface, replicate->data_interface, interface_size);

@@ -739,11 +733,11 @@ static void register_mem_chunk(struct _starpu_data_replicate *replicate, unsigne
 	/* Put this memchunk in the list of memchunk in use */
 	mc = _starpu_memchunk_init(replicate, interface_size, automatically_allocated);

-	STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[dst_node]);
+	_starpu_spin_lock(&mc_lock[dst_node]);

 	_starpu_mem_chunk_list_push_back(mc_list[dst_node], mc);

-	STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[dst_node]);
+	_starpu_spin_unlock(&mc_lock[dst_node]);
 }

 /* This function is called when the handle is destroyed (eg. when calling
@@ -766,13 +760,13 @@ void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, struct _star
 	replicate->allocated = 0;
 	replicate->automatically_allocated = 0;

-	STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[node]);
+	_starpu_spin_lock(&mc_lock[node]);

 	mc->data = NULL;
 	/* remove it from the main list */
 	_starpu_mem_chunk_list_erase(mc_list[node], mc);

-	STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
+	_starpu_spin_unlock(&mc_lock[node]);

 	/* We would only flush the RAM nodes cache if memory gets tight, either
 	 * because StarPU automatically knows the total memory size of the
@@ -798,15 +792,16 @@ void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, struct _star
 		/* put it in the list of buffers to be removed */
 		uint32_t footprint = mc->footprint;
 		struct mc_cache_entry *entry;
-		STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[node]);
+		_starpu_spin_lock(&mc_lock[node]);
 		HASH_FIND(hh, mc_cache[node], &footprint, sizeof(footprint), entry);
 		if (!entry) {
 			entry = malloc(sizeof(*entry));
 			entry->list = _starpu_mem_chunk_list_new();
-			HASH_ADD_KEYPTR(hh, mc_cache[node], &footprint, sizeof(footprint), entry);
+			entry->footprint = footprint;
+			HASH_ADD(hh, mc_cache[node], footprint, sizeof(entry->footprint), entry);
 		}
 		_starpu_mem_chunk_list_push_front(entry->list, mc);
-		STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
+		_starpu_spin_unlock(&mc_lock[node]);
 	}
 }

@@ -838,26 +833,34 @@ static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, st
 	uint32_t footprint = _starpu_compute_data_footprint(handle);

 	_STARPU_TRACE_START_ALLOC_REUSE(dst_node, data_size);
-	STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[dst_node]);
+	_starpu_spin_lock(&mc_lock[dst_node]);

 	if (try_to_find_reusable_mem_chunk(dst_node, handle, replicate, footprint))
 	{
-		STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[dst_node]);
+		_starpu_spin_unlock(&mc_lock[dst_node]);
 		_starpu_allocation_cache_hit(dst_node);
 		return data_size;
 	}

-	STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[dst_node]);
+	_starpu_spin_unlock(&mc_lock[dst_node]);
 	_STARPU_TRACE_END_ALLOC_REUSE(dst_node);
 #endif
+	STARPU_ASSERT(handle->ops);
+	STARPU_ASSERT(handle->ops->allocate_data_on_node);
+	STARPU_ASSERT(replicate->data_interface);
+
+	char data_interface[handle->ops->interface_size];
+
+	memcpy(data_interface, replicate->data_interface, handle->ops->interface_size);
+
+	/* Take temporary reference on the replicate */
+	replicate->refcnt++;
+	handle->busy_count++;
+	_starpu_spin_unlock(&handle->header_lock);

 	do
 	{
-		STARPU_ASSERT(handle->ops);
-		STARPU_ASSERT(handle->ops->allocate_data_on_node);
-
 		_STARPU_TRACE_START_ALLOC(dst_node, data_size);
-		STARPU_ASSERT(replicate->data_interface);

 #if defined(STARPU_USE_CUDA) && defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
 		if (starpu_node_get_kind(dst_node) == STARPU_CUDA_RAM)
@@ -870,7 +873,7 @@ static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, st
 		}
 #endif

-		allocated_memory = handle->ops->allocate_data_on_node(replicate->data_interface, dst_node);
+		allocated_memory = handle->ops->allocate_data_on_node(data_interface, dst_node);
 		_STARPU_TRACE_END_ALLOC(dst_node);

 		if (allocated_memory == -ENOMEM)
@@ -880,11 +883,6 @@ static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, st
 			if (starpu_memstrategy_data_size_coefficient*handle_size > reclaim)
 				reclaim = starpu_memstrategy_data_size_coefficient*handle_size;

-			/* Take temporary reference on the replicate */
-			replicate->refcnt++;
-			handle->busy_count++;
-			_starpu_spin_unlock(&handle->header_lock);
-
 			_STARPU_TRACE_START_MEMRECLAIM(dst_node,is_prefetch);
 			if (is_prefetch)
 			{
@@ -893,26 +891,35 @@ static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, st
 			else
 				_starpu_memory_reclaim_generic(dst_node, 0, reclaim);
 			_STARPU_TRACE_END_MEMRECLAIM(dst_node,is_prefetch);
+		}
+	}
+	while((allocated_memory == -ENOMEM) && attempts++ < 2);

-			int cpt = 0;
-			while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
-			{
-				cpt++;
-				_starpu_datawizard_progress(_starpu_memory_node_get_local_key(), 0);
-			}
-			if (cpt == STARPU_SPIN_MAXTRY)
-				_starpu_spin_lock(&handle->header_lock);
+	int cpt = 0;
+	while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
+	{
+		cpt++;
+		_starpu_datawizard_progress(_starpu_memory_node_get_local_key(), 0);
+	}
+	if (cpt == STARPU_SPIN_MAXTRY)
+		_starpu_spin_lock(&handle->header_lock);

-			replicate->refcnt--;
-			STARPU_ASSERT(replicate->refcnt >= 0);
-			STARPU_ASSERT(handle->busy_count > 0);
-			handle->busy_count--;
-			ret = _starpu_data_check_not_busy(handle);
-			STARPU_ASSERT(ret == 0);
-		}
+	replicate->refcnt--;
+	STARPU_ASSERT(replicate->refcnt >= 0);
+	STARPU_ASSERT(handle->busy_count > 0);
+	handle->busy_count--;
+	ret = _starpu_data_check_not_busy(handle);
+	STARPU_ASSERT(ret == 0);

+	if (replicate->allocated)
+	{
+		/* Argl, somebody allocated it in between already, drop this one */
+		handle->ops->free_data_on_node(data_interface, dst_node);
+		allocated_memory = 0;
 	}
-	while((allocated_memory == -ENOMEM) && attempts++ < 2);
+	else
+		/* Install allocated interface */
+		memcpy(replicate->data_interface, data_interface, handle->ops->interface_size);

 	return allocated_memory;
 }
@@ -941,9 +948,9 @@ int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_
 	replicate->allocated = 1;
 	replicate->automatically_allocated = 1;

-	if (dst_node == 0)
+	if (dst_node == STARPU_MAIN_RAM)
 	{
-		void *ptr = starpu_data_handle_to_pointer(handle, 0);
+		void *ptr = starpu_data_handle_to_pointer(handle, STARPU_MAIN_RAM);
 		if (ptr != NULL)
 		{
 			_starpu_data_register_ram_pointer(handle, ptr);
@@ -965,16 +972,16 @@ void _starpu_memchunk_recently_used(struct _starpu_mem_chunk *mc, unsigned node)
 	if (!mc)
 		/* user-allocated memory */
 		return;
-	STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[node]);
+	_starpu_spin_lock(&mc_lock[node]);
 	_starpu_mem_chunk_list_erase(mc_list[node], mc);
 	_starpu_mem_chunk_list_push_back(mc_list[node], mc);
-	STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
+	_starpu_spin_unlock(&mc_lock[node]);
 }

 #ifdef STARPU_MEMORY_STATS
 void _starpu_memory_display_stats_by_node(int node)
 {
-	STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[node]);
+	_starpu_spin_lock(&mc_lock[node]);

 	if (!_starpu_mem_chunk_list_empty(mc_list[node]))
 	{
@@ -993,7 +1000,7 @@ void _starpu_memory_display_stats_by_node(int node)

 	}

-	STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
+	_starpu_spin_unlock(&mc_lock[node]);
 }
 #endif


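Note: the restructuring above performs the (possibly slow) driver allocation on a private copy of the interface, outside the handle lock, and only installs it afterwards if nobody allocated the replicate in the meantime. A simplified, hypothetical sketch of that publish-or-discard step (all names are stand-ins, not StarPU symbols):

```c
#include <pthread.h>
#include <stdlib.h>
#include <string.h>

struct replicate
{
	pthread_mutex_t lock;	/* stands in for the handle header lock */
	int allocated;
	void *iface;		/* interface description, fixed size */
	size_t iface_size;
};

/* dev_alloc()/dev_free() stand in for the per-interface allocation hooks. */
static int dev_alloc(void *iface) { (void)iface; return 0; }
static void dev_free(void *iface) { (void)iface; }

static int allocate_outside_lock(struct replicate *r)
{
	void *scratch = malloc(r->iface_size);
	memcpy(scratch, r->iface, r->iface_size);	/* private copy */

	int ret = dev_alloc(scratch);			/* slow part, lock not held */

	pthread_mutex_lock(&r->lock);
	if (ret == 0 && r->allocated)
		dev_free(scratch);			/* raced: drop our allocation */
	else if (ret == 0)
	{
		memcpy(r->iface, scratch, r->iface_size);	/* publish */
		r->allocated = 1;
	}
	pthread_mutex_unlock(&r->lock);
	free(scratch);
	return ret;
}
```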
+ 2 - 1
src/datawizard/memalloc.h

@@ -41,9 +41,10 @@ LIST_TYPE(_starpu_mem_chunk,
 	 */
 	struct starpu_data_interface_ops *ops;
 	void *chunk_interface;
+	size_t size_interface;
 	unsigned automatically_allocated;

-	/* the size is only set when calling _starpu_request_mem_chunk_removal(),
+	/* the size of the data is only set when calling _starpu_request_mem_chunk_removal(),
          * it is needed by free_memory_on_node() which is called when
          * the handle is no longer valid. It should not be used otherwise.
 	 */

+ 1 - 1
src/datawizard/user_interactions.c

@@ -191,7 +191,7 @@ int starpu_data_acquire_on_node_cb(starpu_data_handle_t handle, unsigned node,
 int starpu_data_acquire_cb(starpu_data_handle_t handle,
 			   enum starpu_data_access_mode mode, void (*callback)(void *), void *arg)
 {
-	return starpu_data_acquire_on_node_cb(handle, 0, mode, callback, arg);
+	return starpu_data_acquire_on_node_cb(handle, STARPU_MAIN_RAM, mode, callback, arg);
 }

 int starpu_data_acquire_cb_sequential_consistency(starpu_data_handle_t handle,
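Note: starpu_data_acquire_cb() now forwards to the main-RAM node explicitly; its use from an application is unchanged. A short usage reminder (the variable, callback body and printed text are only illustrative):

```c
#include <stdio.h>
#include <starpu.h>

static float value;
static starpu_data_handle_t value_handle;

static void read_it(void *arg)
{
	(void)arg;
	/* Inside the callback the main-RAM copy is valid and may be read. */
	printf("value = %f\n", value);
	starpu_data_release(value_handle);
}

void print_when_ready(void)
{
	starpu_variable_data_register(&value_handle, STARPU_MAIN_RAM,
				      (uintptr_t)&value, sizeof(value));
	/* Asynchronous: the callback runs once all previously submitted tasks
	 * on the handle are done and the data is available in main memory. */
	starpu_data_acquire_cb(value_handle, STARPU_R, read_it, NULL);
}
```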

+ 5 - 0
src/debug/traces/starpu_fxt.c

@@ -1380,9 +1380,14 @@ void starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *opt
 				handle_worker_status(&ev, options, "P");
 				break;

+			case _STARPU_FUT_START_UNPARTITION:
+				handle_worker_status(&ev, options, "U");
+				break;
+
 			case _STARPU_FUT_END_FETCH_INPUT:
 			case _STARPU_FUT_END_PROGRESS:
 			case _STARPU_FUT_END_PUSH_OUTPUT:
+			case _STARPU_FUT_END_UNPARTITION:
 				handle_worker_status(&ev, options, "B");
 				break;


+ 7 - 3
src/debug/traces/starpu_paje.c

@@ -163,6 +163,7 @@ void _starpu_fxt_write_paje_header(FILE *file)
 	poti_DefineEntityValue("B", "S", "Overhead", ".5 .18 .0");
 	poti_DefineEntityValue("Sl", "S", "Sleeping", ".9 .1 .0");
 	poti_DefineEntityValue("P", "S", "Progressing", ".4 .1 .6");
+	poti_DefineEntityValue("U", "S", "Unpartitioning", ".0 .0 1.0");
 
 
 	/* Types for the MPI Communication Thread of the Memory Node */
 	/* Types for the MPI Communication Thread of the Memory Node */
 	poti_DefineEventType("MPIev", "MPICt", "MPI event type");
 	poti_DefineEventType("MPIev", "MPICt", "MPI event type");
@@ -190,6 +191,7 @@ void _starpu_fxt_write_paje_header(FILE *file)
 		poti_DefineEntityValue("B", ctx, "Overhead", ".5 .18 .0");
 		poti_DefineEntityValue("B", ctx, "Overhead", ".5 .18 .0");
 		poti_DefineEntityValue("Sl", ctx, "Sleeping", ".9 .1 .0");
 		poti_DefineEntityValue("Sl", ctx, "Sleeping", ".9 .1 .0");
 		poti_DefineEntityValue("P", ctx, "Progressing", ".4 .1 .6");
 		poti_DefineEntityValue("P", ctx, "Progressing", ".4 .1 .6");
+		poti_DefineEntityValue("U", ctx, "Unpartitioning", ".0 .0 1.0");
 	}
 	}
 
 
 	/* Types for the Scheduler */
 	/* Types for the Scheduler */
@@ -228,7 +230,8 @@ void _starpu_fxt_write_paje_header(FILE *file)
 6       C       S       Callback       \".0 .3 .8\"            \n\
 6       C       S       Callback       \".0 .3 .8\"            \n\
 6       B       S       Overhead         \".5 .18 .0\"		\n\
 6       B       S       Overhead         \".5 .18 .0\"		\n\
 6       Sl       S      Sleeping         \".9 .1 .0\"		\n\
 6       Sl       S      Sleeping         \".9 .1 .0\"		\n\
-6       P       S       Progressing         \".4 .1 .6\"		\n");
+6       P       S       Progressing         \".4 .1 .6\"		\n\
+6       U       S       Unpartitioning      \".0 .0 1.0\"		\n");
 	fprintf(file, "\
 	fprintf(file, "\
 6       P       CtS       Processing         \"0 0 0\"		\n\
 6       P       CtS       Processing         \"0 0 0\"		\n\
 6       Sl       CtS      Sleeping         \".9 .1 .0\"		\n\
 6       Sl       CtS      Sleeping         \".9 .1 .0\"		\n\
@@ -247,8 +250,9 @@ void _starpu_fxt_write_paje_header(FILE *file)
 6       C       Ctx%u       Callback       \".0 .3 .8\"            \n\
 6       C       Ctx%u       Callback       \".0 .3 .8\"            \n\
 6       B       Ctx%u       Overhead         \".5 .18 .0\"		\n\
 6       B       Ctx%u       Overhead         \".5 .18 .0\"		\n\
 6       Sl       Ctx%u      Sleeping         \".9 .1 .0\"		\n\
 6       Sl       Ctx%u      Sleeping         \".9 .1 .0\"		\n\
-6       P       Ctx%u       Progressing         \".4 .1 .6\"		\n",
-		i, i, i, i, i, i, i, i);
+6       P       Ctx%u       Progressing         \".4 .1 .6\"		\n\
+6       U       Ctx%u       Unpartitioning         \".0 .0 1.0\"		\n",
+		i, i, i, i, i, i, i, i, i);
 	fprintf(file, "\
 	fprintf(file, "\
 6       A       MS      Allocating         \".4 .1 .0\"		\n\
 6       A       MS      Allocating         \".4 .1 .0\"		\n\
 6       Ar       MS      AllocatingReuse       \".1 .1 .8\"		\n\
 6       Ar       MS      AllocatingReuse       \".1 .1 .8\"		\n\

+ 2 - 3
src/drivers/cpu/driver_cpu.c

@@ -190,7 +190,7 @@ _starpu_get_worker_from_driver(struct starpu_driver *d)
 	return _starpu_get_worker_struct(n);
 }
 
-static size_t _starpu_cpu_get_global_mem_size(int nodeid, struct _starpu_machine_config *config)
+static size_t _starpu_cpu_get_global_mem_size(int nodeid STARPU_ATTRIBUTE_UNUSED, struct _starpu_machine_config *config)
 {
 	size_t global_mem;
 	starpu_ssize_t limit;
@@ -201,12 +201,11 @@ static size_t _starpu_cpu_get_global_mem_size(int nodeid, struct _starpu_machine
 #endif
 
 #if defined(STARPU_HAVE_HWLOC)
-        int depth_node;
 	struct _starpu_machine_topology *topology = &config->topology;
 
 #if 0
 	/* Do not limit ourself to a single NUMA node yet, as we don't have real NUMA support for now */
-        depth_node = hwloc_get_type_depth(topology->hwtopology, HWLOC_OBJ_NODE);
+        int depth_node = hwloc_get_type_depth(topology->hwtopology, HWLOC_OBJ_NODE);
 
 	if (depth_node == HWLOC_TYPE_DEPTH_UNKNOWN)
 	     global_mem = hwloc_get_root_obj(topology->hwtopology)->memory.total_memory;

+ 7 - 6
src/drivers/cuda/driver_cuda.c

@@ -80,7 +80,8 @@ static void _starpu_cuda_limit_gpu_mem_if_needed(unsigned devid)
 	char name[30];
 
 #ifdef STARPU_USE_CUDA
-	global_mem[devid] = props[devid].totalGlobalMem;
+	/* Find the size of the memory on the device */
+	totalGlobalMem = props[devid].totalGlobalMem;
 #endif
 
 	limit = starpu_get_env_number("STARPU_LIMIT_CUDA_MEM");
@@ -89,17 +90,17 @@ static void _starpu_cuda_limit_gpu_mem_if_needed(unsigned devid)
 		sprintf(name, "STARPU_LIMIT_CUDA_%u_MEM", devid);
 		limit = starpu_get_env_number(name);
 	}
+#ifdef STARPU_USE_CUDA
 	if (limit == -1)
 	{
-		return;
+		/* Use 90% of the available memory by default.  */
+		limit = totalGlobalMem / (1024*1024) * 0.9;
 	}
+#endif
 
 	global_mem[devid] = limit * 1024*1024;
 
 #ifdef STARPU_USE_CUDA
-	/* Find the size of the memory on the device */
-	totalGlobalMem = props[devid].totalGlobalMem;
-
 	/* How much memory to waste ? */
 	to_waste = totalGlobalMem - global_mem[devid];
 
@@ -201,7 +202,7 @@ static void init_context(unsigned devid)
 	starpu_cuda_set_device(devid);
 
 #ifdef HAVE_CUDA_MEMCPY_PEER
-	if (starpu_get_env_number("STARPU_ENABLE_CUDA_GPU_GPU_DIRECT") > 0)
+	if (starpu_get_env_number("STARPU_ENABLE_CUDA_GPU_GPU_DIRECT") != 0)
 	{
 		int nworkers = starpu_worker_get_count();
 		for (workerid = 0; workerid < nworkers; workerid++)

+ 1 - 0
src/drivers/disk/driver_disk.c

@@ -17,6 +17,7 @@
 #include <starpu.h>
 #include <core/disk.h>
 #include <starpu_profiling.h>
+#include <drivers/disk/driver_disk.h>
 
 int _starpu_disk_copy_src_to_disk(void * src, unsigned src_node, void * dst, size_t dst_offset, unsigned dst_node, size_t size, void * async_channel)
 {

+ 6 - 5
src/drivers/opencl/driver_opencl.c

@@ -81,14 +81,15 @@ static void _starpu_opencl_limit_gpu_mem_if_needed(unsigned devid)
 		sprintf(name, "STARPU_LIMIT_OPENCL_%u_MEM", devid);
 		limit = starpu_get_env_number(name);
 	}
+#ifdef STARPU_USE_OPENCL
 	if (limit == -1)
 	{
-		global_mem[devid] = totalGlobalMem;
-	}
-	else
-	{
-		global_mem[devid] = limit * 1024*1024;
+		/* Use 90% of the available memory by default.  */
+		limit = totalGlobalMem / (1024*1024) * 0.9;
 	}
+#endif
+
+	global_mem[devid] = limit * 1024*1024;
 
 #ifdef STARPU_USE_OPENCL
 	/* How much memory to waste ? */

+ 2 - 1
src/profiling/profiling.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2012  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -69,5 +69,6 @@ void _starpu_profiling_set_task_push_end_time(struct starpu_task *task);
 void _starpu_profiling_init(void);
 
 void _starpu_profiling_terminate(void);
+void _starpu_profiling_reset_counters();
 
 #endif // __PROFILING_H__

+ 1 - 0
src/sched_policies/eager_central_priority_policy.c

@@ -230,6 +230,7 @@ static struct starpu_task *_starpu_priority_pop_task(unsigned sched_ctx_id)
 						taskq->ntasks[priolevel]--;
 						taskq->total_ntasks--;
 						_STARPU_TRACE_JOB_POP(task, 0);
+						break;
 					} else skipped = 1;
 				}
 			}

+ 1 - 2
src/sched_policies/parallel_eager.c

@@ -157,15 +157,14 @@ static int push_task_peager_policy(struct starpu_task *task)
 
         /*if there are no tasks block */
         /* wake people waiting for a task */
-        int worker = -1;
         struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
 
         struct starpu_sched_ctx_iterator it;
         if(workers->init_iterator)
                 workers->init_iterator(workers, &it);
 
-
 #ifndef STARPU_NON_BLOCKING_DRIVERS
+        int worker = -1;
 	while(workers->has_next(workers, &it))
 	{
 		worker = workers->get_next(workers, &it);

+ 12 - 0
src/sched_policies/parallel_heft.c

@@ -33,6 +33,10 @@
 #define DBL_MAX __DBL_MAX__
 #endif
 
+/* if no priority is set when creating the scheduling context, we use the following ones */
+#define DEFAULT_MIN_PRIORITY 0
+#define DEFAULT_MAX_PRIORITY 1
+
 //static unsigned ncombinedworkers;
 //static enum starpu_perfmodel_archtype applicable_perf_archtypes[STARPU_NARCH_VARIATIONS];
 //static unsigned napplicable_perf_archtypes = 0;
@@ -552,6 +556,14 @@ static void initialize_parallel_heft_policy(unsigned sched_ctx_id)
 	hd->_gamma = _STARPU_SCHED_GAMMA_DEFAULT;
 	hd->idle_power = 0.0;
 
+	if (starpu_sched_ctx_min_priority_is_set(sched_ctx_id) == 0)
+		starpu_sched_ctx_set_min_priority(sched_ctx_id, DEFAULT_MIN_PRIORITY);
+	if (starpu_sched_ctx_max_priority_is_set(sched_ctx_id) == 0)
+		starpu_sched_ctx_set_max_priority(sched_ctx_id, DEFAULT_MAX_PRIORITY);
+	STARPU_ASSERT_MSG(starpu_sched_ctx_get_min_priority(sched_ctx_id) < starpu_sched_ctx_get_max_priority(sched_ctx_id),
+			  "Priority min %d should be lower than priority max %d\n",
+			  starpu_sched_ctx_get_min_priority(sched_ctx_id), starpu_sched_ctx_get_max_priority(sched_ctx_id));
+
 	starpu_sched_ctx_set_policy_data(sched_ctx_id, (void*)hd);
 
 	const char *strval_alpha = getenv("STARPU_SCHED_ALPHA");

+ 1 - 0
src/sched_policies/work_stealing_policy.c

@@ -332,6 +332,7 @@ static struct starpu_task *ws_pop_task(unsigned sched_ctx_id)
 	return task;
 }
 
+static
 int ws_push_task(struct starpu_task *task)
 {
 	unsigned sched_ctx_id = task->sched_ctx;

+ 1 - 0
src/util/starpu_task_insert.c

@@ -64,6 +64,7 @@ void starpu_codelet_unpack_args(void *_cl_arg, ...)
 	va_end(varg_list);
 }
 
+static
 int _starpu_task_insert_v(struct starpu_codelet *cl, va_list varg_list)
 {
 	void *arg_buffer = NULL;

+ 0 - 158
src/util/starpu_task_list_inline.h

@@ -1,158 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#ifndef __STARPU_TASK_LIST_INLINE_H
-#define __STARPU_TASK_LIST_INLINE_H
-
-#include <starpu_task.h>
-
-#ifndef STARPU_INLINE
-#ifdef __GNUC_GNU_INLINE__
-#define STARPU_INLINE extern inline
-#else
-#define STARPU_INLINE static inline
-#endif
-#endif
-
-STARPU_INLINE
-void starpu_task_list_init(struct starpu_task_list *list)
-{
-	list->head = NULL;
-	list->tail = NULL;
-}
-
-STARPU_INLINE
-void starpu_task_list_push_front(struct starpu_task_list *list,
-				struct starpu_task *task)
-{
-	if (list->tail == NULL)
-	{
-		list->tail = task;
-	}
-	else
-	{
-		list->head->prev = task;
-	}
-
-	task->prev = NULL;
-	task->next = list->head;
-	list->head = task;
-}
-
-STARPU_INLINE
-void starpu_task_list_push_back(struct starpu_task_list *list,
-				struct starpu_task *task)
-{
-	if (list->head == NULL)
-	{
-		list->head = task;
-	}
-	else
-	{
-		list->tail->next = task;
-	}
-
-	task->next = NULL;
-	task->prev = list->tail;
-	list->tail = task;
-}
-
-STARPU_INLINE
-struct starpu_task *starpu_task_list_front(struct starpu_task_list *list)
-{
-	return list->head;
-}
-
-STARPU_INLINE
-struct starpu_task *starpu_task_list_back(struct starpu_task_list *list)
-{
-	return list->tail;
-}
-
-STARPU_INLINE
-int starpu_task_list_empty(struct starpu_task_list *list)
-{
-	return (list->head == NULL);
-}
-
-STARPU_INLINE
-void starpu_task_list_erase(struct starpu_task_list *list,
-				struct starpu_task *task)
-{
-	struct starpu_task *p = task->prev;
-
-	if (p)
-	{
-		p->next = task->next;
-	}
-	else
-	{
-		list->head = task->next;
-	}
-
-	if (task->next)
-	{
-		task->next->prev = p;
-	}
-	else
-	{
-		list->tail = p;
-	}
-
-	task->prev = NULL;
-	task->next = NULL;
-}
-
-STARPU_INLINE
-struct starpu_task *starpu_task_list_pop_front(struct starpu_task_list *list)
-{
-	struct starpu_task *task = list->head;
-
-	if (task)
-		starpu_task_list_erase(list, task);
-
-	return task;
-}
-
-STARPU_INLINE
-struct starpu_task *starpu_task_list_pop_back(struct starpu_task_list *list)
-{
-	struct starpu_task *task = list->tail;
-
-	if (task)
-		starpu_task_list_erase(list, task);
-
-	return task;
-}
-
-STARPU_INLINE
-struct starpu_task *starpu_task_list_begin(struct starpu_task_list *list)
-{
-	return list->head;
-}
-
-STARPU_INLINE
-struct starpu_task *starpu_task_list_end(struct starpu_task_list *list STARPU_ATTRIBUTE_UNUSED)
-{
-	return NULL;
-}
-
-STARPU_INLINE
-struct starpu_task *starpu_task_list_next(struct starpu_task *task)
-{
-	return task->next;
-}
-#endif /* __STARPU_TASK_LIST_INLINE_H */

+ 2 - 1
tests/Makefile.am

@@ -23,6 +23,7 @@ AM_LDFLAGS = @STARPU_EXPORT_DYNAMIC@ $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFL
 EXTRA_DIST =					\
 	helper.h				\
 	datawizard/scal.h			\
+	datawizard/mpi_like.h			\
 	microbenchs/tasks_size_overhead.sh	\
 	microbenchs/tasks_size_overhead.gp	\
 	datawizard/scratch_opencl_kernel.cl     \
@@ -61,7 +62,7 @@ if STARPU_USE_CUDA
 # TODO define NVCCFLAGS
 NVCC ?= nvcc
 
-NVCCFLAGS += -I$(top_srcdir)/include/ -I$(top_srcdir)/src -I$(top_builddir)/src -I$(top_builddir)/include $(HWLOC_CFLAGS) $(SIMGRID_CFLAGS)
+NVCCFLAGS += -I$(top_srcdir)/include/ -I$(top_srcdir)/src -I$(top_builddir)/src -I$(top_builddir)/src/common -I$(top_builddir)/include $(HWLOC_CFLAGS) $(SIMGRID_CFLAGS)
 
 .cu.cubin:
 	$(MKDIR_P) `dirname $@`

+ 1 - 0
tests/datawizard/acquire_cb.c

@@ -20,6 +20,7 @@
 unsigned token = 0;
 starpu_data_handle_t token_handle;
 
+static
 void callback(void *arg STARPU_ATTRIBUTE_UNUSED)
 {
 	token = 42;

+ 1 - 0
tests/datawizard/acquire_cb_insert.c

@@ -58,6 +58,7 @@ struct starpu_codelet work =
 static int x;
 static starpu_data_handle_t x_handle, f_handle;
 
+static
 void callback(void *arg)
 {
 	starpu_task_insert(&work, STARPU_W, starpu_data_get_sub_data(f_handle, 1, x), 0);

+ 2 - 0
tests/datawizard/acquire_release.c

@@ -55,6 +55,7 @@ static struct starpu_codelet increment_cl =
 unsigned token = 0;
 starpu_data_handle_t token_handle;
 
+static
 int increment_token(void)
 {
 	int ret;
@@ -66,6 +67,7 @@ int increment_token(void)
 	return ret;
 }
 
+static
 void callback(void *arg STARPU_ATTRIBUTE_UNUSED)
 {
         starpu_data_release(token_handle);

+ 2 - 0
tests/datawizard/acquire_release2.c

@@ -55,6 +55,7 @@ static struct starpu_codelet increment_cl =
 unsigned token = 0;
 starpu_data_handle_t token_handle;
 
+static
 int increment_token(int synchronous)
 {
 	struct starpu_task *task = starpu_task_create();
@@ -64,6 +65,7 @@ int increment_token(int synchronous)
 	return starpu_task_submit(task);
 }
 
+static
 void callback(void *arg STARPU_ATTRIBUTE_UNUSED)
 {
         starpu_data_release(token_handle);

+ 3 - 3
tests/datawizard/allocate.c

@@ -31,6 +31,7 @@ int main(int argc, char **argv)
 }
 #else
 
+static
 int test_prefetch(unsigned memnodes)
 {
 	int ret;
@@ -114,14 +115,13 @@ int test_prefetch(unsigned memnodes)
 	{
 		available_size = starpu_memory_get_available(i);
 		FPRINTF(stderr, "Available memory size on node %u: %ld\n", i, available_size);
-#ifndef STARPU_USE_ALLOCATION_CACHE
-		STARPU_CHECK_RETURN_VALUE_IS((int)available_size, SIZE_ALLOC*1024*1024, "starpu_memory_get_available (node %u)", i);
-#endif
+		/* STARPU_CHECK_RETURN_VALUE_IS((int)available_size, SIZE_ALLOC*1024*1024, "starpu_memory_get_available (node %u)", i); */
 	}
 
 	return 0;
 }
 
+static
 void test_malloc()
 {
 	int ret;

+ 1 - 7
tests/datawizard/commute.c

@@ -32,8 +32,6 @@ static struct starpu_codelet codelet_begin =
 	.nbuffers = 1,
 };
 
-
-
 void commute1(void *descr[], void *_args STARPU_ATTRIBUTE_UNUSED)
 {
 	int *x = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
@@ -49,8 +47,6 @@ static struct starpu_codelet codelet_commute1 =
 	.modes = {STARPU_RW | STARPU_COMMUTE}
 };
 
-
-
 void commute2(void *descr[], void *_args STARPU_ATTRIBUTE_UNUSED)
 {
 	int *x = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
@@ -78,8 +74,6 @@ static struct starpu_codelet codelet_commute3 =
 	.modes = {STARPU_RW | STARPU_COMMUTE}
 };
 
-
-
 static struct starpu_codelet codelet_end;
 void end(void *descr[], void *_args STARPU_ATTRIBUTE_UNUSED)
 {
@@ -107,7 +101,7 @@ static void test(enum starpu_data_access_mode begin_mode, enum starpu_data_acces
 	int ret;
 
 	codelet_begin.modes[0] = begin_mode;
-	codelet_end.modes[0] = end_mode;	
+	codelet_end.modes[0] = end_mode;
 
 	begin_t = starpu_task_create();
 	begin_t->cl = &codelet_begin;

+ 1 - 1
tests/datawizard/critical_section_with_void_interface.c

@@ -27,7 +27,7 @@ starpu_data_handle_t void_handle;
 
 int critical_var;
 
-static void critical_section(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+void critical_section(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
 {
 	/* We do not protect this variable because it is only accessed when the
 	 * "void_handle" piece of data is accessed. */

+ 2 - 1
tests/datawizard/cuda_codelet_unsigned_inc.cu

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -16,6 +16,7 @@
  */
 
 #include <starpu.h>
+#include "./mpi_like.h"
 
 static __global__ void _cuda_unsigned_inc(unsigned *val)
 {

+ 1 - 1
tests/datawizard/data_implicit_deps.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2013  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
tests/datawizard/data_invalidation.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2012  Université de Bordeaux 1
- * Copyright (C) 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 2 - 1
tests/datawizard/dining_philosophers.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -38,6 +38,7 @@ static struct starpu_codelet eating_cl =
 	.nbuffers = 2
 };
 
+static
 int submit_one_task(unsigned p)
 {
 	struct starpu_task *task = starpu_task_create();

+ 1 - 3
tests/datawizard/increment_redux.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2012-2013  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -98,8 +98,6 @@ static void neutral_opencl_kernel(void *descr[], void *arg)
 }
 #endif
 
-
-
 void redux_cpu_kernel(void *descr[], void *arg)
 {
 	STARPU_SKIP_IF_VALGRIND;

+ 1 - 3
tests/datawizard/increment_redux_lazy.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2012-2013  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -88,8 +88,6 @@ static void neutral_opencl_kernel(void *descr[], void *arg)
 }
 #endif
 
-
-
 void redux_cpu_kernel(void *descr[], void *arg)
 {
 	unsigned *dst = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);

+ 0 - 2
tests/datawizard/increment_redux_v2.c

@@ -97,8 +97,6 @@ static void neutral_opencl_kernel(void *descr[], void *arg)
 }
 #endif
 
-
-
 void redux_cpu_kernel(void *descr[], void *arg)
 {
 	STARPU_SKIP_IF_VALGRIND;

+ 0 - 1
tests/datawizard/interfaces/block/block_interface.c

@@ -33,7 +33,6 @@ extern void test_block_cuda_func(void *buffers[], void *_args);
 extern void test_block_opencl_func(void *buffers[], void *args);
 #endif
 
-
 static starpu_data_handle_t _block_handle;
 static starpu_data_handle_t _block2_handle;
 

+ 2 - 0
tests/datawizard/interfaces/multiformat/advanced/generic.c

@@ -73,6 +73,7 @@ void opencl_func(void *buffers[], void *args)
 	global_stats.opencl++;
 }
 
+static
 void cpu_to_opencl_func(void *buffers[], void *args)
 {
 	STARPU_SKIP_IF_VALGRIND;
@@ -80,6 +81,7 @@ void cpu_to_opencl_func(void *buffers[], void *args)
 	global_stats.cpu_to_opencl++;
 }
 
+static
 void opencl_to_cpu_func(void *buffers[], void *args)
 {
 	STARPU_SKIP_IF_VALGRIND;

+ 2 - 2
tests/datawizard/lazy_unregister.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -21,7 +21,7 @@
 
 #include "../helper.h"
 
-static void dummy_func(void ** buffers, void * args)
+void dummy_func(void ** buffers, void * args)
 {
 	(void) buffers;
 	(void) args;

+ 2 - 2
tests/datawizard/manual_reduction.c

@@ -75,7 +75,7 @@ static void initialize_per_worker_handle(void *arg STARPU_ATTRIBUTE_UNUSED)
  *	Implement reduction method
  */
 
-static void cpu_redux_func(void *descr[], void *cl_arg STARPU_ATTRIBUTE_UNUSED)
+void cpu_redux_func(void *descr[], void *cl_arg STARPU_ATTRIBUTE_UNUSED)
 {
 	unsigned *a = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	unsigned *b = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[1]);
@@ -98,7 +98,7 @@ static struct starpu_codelet reduction_codelet =
  *	Use per-worker local copy
  */
 
-static void cpu_func_incr(void *descr[], void *cl_arg STARPU_ATTRIBUTE_UNUSED)
+void cpu_func_incr(void *descr[], void *cl_arg STARPU_ATTRIBUTE_UNUSED)
 {
 	unsigned *val = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	*val = *val + 1;

+ 4 - 8
tests/datawizard/mpi_like.c

@@ -18,8 +18,9 @@
 #include <config.h>
 #include <starpu.h>
 #include <errno.h>
-#include "../helper.h"
 #include <common/thread.h>
+#include "../helper.h"
+#include "./mpi_like.h"
 
 #define NTHREADS	4
 #define NITER		2
@@ -46,12 +47,6 @@ static struct thread_data problem_data[NTHREADS];
 /* We implement some ring transfer, every thread will try to receive a piece of
  * data from its neighbour and increment it before transmitting it to its
  * successor. */
-#ifdef STARPU_USE_CUDA
-void cuda_codelet_unsigned_inc(void *descr[], STARPU_ATTRIBUTE_UNUSED void *cl_arg);
-#endif
-#ifdef STARPU_USE_OPENCL
-void opencl_codelet_unsigned_inc(void *buffers[], void *args);
-#endif
 
 void increment_handle_cpu_kernel(void *descr[], void *cl_arg STARPU_ATTRIBUTE_UNUSED)
 {
@@ -73,7 +68,8 @@ static struct starpu_codelet increment_handle_cl =
 	.nbuffers = 1
 };
 
-static void increment_handle(struct thread_data *thread_data)
+static
+void increment_handle(struct thread_data *thread_data)
 {
 	struct starpu_task *task = starpu_task_create();
 	task->cl = &increment_handle_cl;

+ 16 - 2
src/util/starpu_inlines.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -14,5 +15,18 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#define STARPU_INLINE
+#include <config.h>
 #include <starpu.h>
+
+#ifdef STARPU_USE_CUDA
+#ifdef __CUDACC__
+extern "C"
+#endif
+void cuda_codelet_unsigned_inc(void *descr[], STARPU_ATTRIBUTE_UNUSED void *cl_arg);
+#endif
+
+#ifdef STARPU_USE_OPENCL
+void opencl_codelet_unsigned_inc(void *buffers[], void *args);
+#endif
+
+

+ 0 - 0
tests/datawizard/readers_and_writers.c


Some files were not shown because too many files changed in this diff