浏览代码

- merge trunk

Olivier Aumage 11 年之前
父节点
当前提交
f281168e46
共有 73 个文件被更改,包括 1526 次插入292 次删除
  1. 1 0
      AUTHORS
  2. 71 47
      ChangeLog
  3. 11 1
      configure.ac
  4. 5 0
      doc/doxygen/chapters/08scheduling.doxy
  5. 2 2
      doc/doxygen/chapters/12online_performance_tools.doxy
  6. 3 0
      doc/doxygen/chapters/13offline_performance_tools.doxy
  7. 46 42
      doc/doxygen/chapters/16mpi_support.doxy
  8. 8 0
      doc/doxygen/chapters/40environment_variables.doxy
  9. 8 0
      doc/doxygen/chapters/41configure_options.doxy
  10. 5 0
      examples/Makefile.am
  11. 1 0
      examples/binary/binary.c
  12. 3 1
      examples/cpp/incrementer_cpp.cpp
  13. 212 0
      examples/sched_ctx/nested_sched_ctxs.c
  14. 6 9
      examples/sched_ctx/sched_ctx_without_sched_policy.c
  15. 1 0
      examples/worker_collections/worker_list_example.c
  16. 1 0
      include/starpu_config.h.in
  17. 8 0
      include/starpu_sched_ctx.h
  18. 2 0
      include/starpu_task.h
  19. 6 0
      include/starpu_thread.h
  20. 7 0
      include/starpu_worker.h
  21. 3 1
      mpi/include/starpu_mpi.h
  22. 61 45
      mpi/src/starpu_mpi.c
  23. 1 0
      src/Makefile.am
  24. 42 4
      src/common/fxt.h
  25. 27 1
      src/common/thread.c
  26. 9 0
      src/core/jobs.c
  27. 3 0
      src/core/jobs.h
  28. 66 2
      src/core/sched_ctx.c
  29. 3 0
      src/core/sched_ctx.h
  30. 26 2
      src/core/sched_policy.c
  31. 1 0
      src/core/sched_policy.h
  32. 21 3
      src/core/simgrid.c
  33. 7 0
      src/core/task.c
  34. 54 38
      src/core/workers.c
  35. 3 0
      src/core/workers.h
  36. 15 8
      src/datawizard/coherency.c
  37. 11 11
      src/datawizard/coherency.h
  38. 2 1
      src/datawizard/filters.c
  39. 2 1
      src/datawizard/interfaces/data_interface.c
  40. 5 1
      src/datawizard/reduction.c
  41. 2 4
      src/datawizard/user_interactions.c
  42. 109 7
      src/debug/traces/starpu_fxt.c
  43. 13 0
      src/debug/traces/starpu_paje.c
  44. 1 1
      src/drivers/cpu/driver_cpu.c
  45. 7 3
      src/drivers/cuda/driver_cuda.c
  46. 4 2
      src/drivers/driver_common/driver_common.c
  47. 2 2
      src/drivers/driver_common/driver_common.h
  48. 1 1
      src/drivers/mp_common/source_common.c
  49. 4 2
      src/drivers/opencl/driver_opencl.c
  50. 17 10
      src/sched_policies/deque_modeling_policy_data_aware.c
  51. 13 2
      src/sched_policies/eager_central_policy.c
  52. 373 0
      src/sched_policies/locality_work_stealing_policy.c
  53. 2 2
      src/starpu_parameters.h
  54. 65 3
      src/worker_collection/worker_list.c
  55. 84 4
      src/worker_collection/worker_tree.c
  56. 1 0
      tests/datawizard/commute.c
  57. 2 2
      tests/heat/dmda.sh
  58. 5 3
      tests/heat/gflops_sched.gp
  59. 10 1
      tests/heat/gflops_sched.sh
  60. 2 2
      tests/heat/granularity.r
  61. 2 2
      tests/heat/granularity_model.r
  62. 2 2
      tests/heat/model.r
  63. 4 3
      tests/heat/random.r
  64. 2 2
      tests/heat/sched.r
  65. 2 2
      tests/heat/sched.sh
  66. 5 2
      tests/main/driver_api/init_run_deinit.c
  67. 3 1
      tests/main/subgraph_repeat.c
  68. 3 1
      tests/main/subgraph_repeat_regenerate.c
  69. 2 1
      tests/main/subgraph_repeat_regenerate_tag.c
  70. 2 1
      tests/main/subgraph_repeat_tag.c
  71. 5 2
      tests/perfmodels/feed.c
  72. 3 1
      tests/regression/profiles.in
  73. 5 1
      tests/regression/regression_test.sh

+ 1 - 0
AUTHORS

@@ -1,6 +1,7 @@
 Simon Archipoff <simon.archipoff@etu.u-bordeaux1.fr>
 Simon Archipoff <simon.archipoff@etu.u-bordeaux1.fr>
 Cédric Augonnet <cedric.augonnet@inria.fr>
 Cédric Augonnet <cedric.augonnet@inria.fr>
 William Braik <wbraik@gmail.com>
 William Braik <wbraik@gmail.com>
+Alfredo Buttari <alfredo.buttari@enseeiht.fr>
 Jérôme Clet-Ortega <jerome.clet-ortega@labri.fr>
 Jérôme Clet-Ortega <jerome.clet-ortega@labri.fr>
 Nicolas Collin <nicolas.collin@inria.fr>
 Nicolas Collin <nicolas.collin@inria.fr>
 Ludovic Courtès <ludovic.courtes@inria.fr>
 Ludovic Courtès <ludovic.courtes@inria.fr>

+ 71 - 47
ChangeLog

@@ -17,24 +17,6 @@
 StarPU 1.2.0 (svn revision xxxx)
 StarPU 1.2.0 (svn revision xxxx)
 ==============================================
 ==============================================
 
 
-Small features:
-  * New function starpu_sched_ctx_display_workers() to display worker
-    information belonging to a given scheduler context
-  * The option --enable-verbose can be called with
-    --enable-verbose=extra to increase the verbosity
-
-StarPU 1.1.2 (svn revision xxxx)
-==============================================
-The scheduling context release
-
-New features:
-  * The reduction init codelet is automatically used to initialize temporary
-    buffers.
-
-StarPU 1.1.1 (svn revision 12638)
-==============================================
-The scheduling context release
-
 New features:
 New features:
   * Xeon Phi support
   * Xeon Phi support
   * SCC support
   * SCC support
@@ -48,46 +30,88 @@ New features:
 	  before the corresponding data, which allows the receiver to
 	  before the corresponding data, which allows the receiver to
 	  allocate data correctly, and to submit the matching receive of
 	  allocate data correctly, and to submit the matching receive of
 	  the envelope.
 	  the envelope.
+        - New function
+   	  starpu_mpi_irecv_detached_sequential_consistency which
+	  allows to enable or disable the sequential consistency for
+	  the given data handle (sequential consistency will be
+	  enabled or disabled based on the value of the function
+	  parameter and the value of the sequential consistency
+	  defined for the given data)
+        - New functions starpu_mpi_task_build() and
+  	  starpu_mpi_task_post_build()
   * New STARPU_COMMUTE flag which can be passed along STARPU_W or STARPU_RW to
   * New STARPU_COMMUTE flag which can be passed along STARPU_W or STARPU_RW to
     let starpu commute write accesses.
     let starpu commute write accesses.
   * Out-of-core support, through registration of disk areas as additional memory
   * Out-of-core support, through registration of disk areas as additional memory
     nodes.
     nodes.
-  * StarPU-MPI: new function
-    starpu_mpi_irecv_detached_sequential_consistency which allows to
-    enable or disable the sequential consistency for the given data
-    handle (sequential consistency will be enabled or disabled based
-    on the value of the function parameter and the value of the
-    sequential consistency defined for the given data)
-  * New functions starpu_mpi_task_build() and starpu_mpi_task_post_build()
-  * New functions starpu_pause() and starpu_resume()
-  * New codelet specific_nodes field to specify explicit target nodes for data.
-  * Use streams for all CUDA transfers, even initiated by CPUs.
   * Add STARPU_CUDA_ASYNC and STARPU_OPENCL_ASYNC flags to allow asynchronous
   * Add STARPU_CUDA_ASYNC and STARPU_OPENCL_ASYNC flags to allow asynchronous
     CUDA and OpenCL kernel execution.
     CUDA and OpenCL kernel execution.
-  * Add paje traces statistics tools.
   * Add CUDA concurrent kernel execution support through
   * Add CUDA concurrent kernel execution support through
     the STARPU_NWORKER_PER_CUDA environment variable.
     the STARPU_NWORKER_PER_CUDA environment variable.
-  * Use streams for GPUA->GPUB and GPUB->GPUA transfers.
 
 
 Small features:
 Small features:
+  * Tasks can now have a name (via the field const char *name of
+    struct starpu_task)
   * New functions starpu_data_acquire_cb_sequential_consistency() and
   * New functions starpu_data_acquire_cb_sequential_consistency() and
     starpu_data_acquire_on_node_cb_sequential_consistency() which allows
     starpu_data_acquire_on_node_cb_sequential_consistency() which allows
     to enable or disable sequential consistency
     to enable or disable sequential consistency
   * New configure option --enable-fxt-lock which enables additional
   * New configure option --enable-fxt-lock which enables additional
     trace events focused on locks behaviour during the execution
     trace events focused on locks behaviour during the execution
-  * New function starpu_perfmodel_directory() to print directory
-    storing performance models. Available through the new option -d of
-    the tool starpu_perfmodel_display
-  * New batch files to execute StarPU applications under Microsoft
-    Visual Studio (They are installed in path_to_starpu/bin/msvc)/
   * Functions starpu_insert_task and starpu_mpi_insert_task are
   * Functions starpu_insert_task and starpu_mpi_insert_task are
     renamed in starpu_task_insert and starpu_mpi_task_insert. Old
     renamed in starpu_task_insert and starpu_mpi_task_insert. Old
     names are kept to avoid breaking old codes.
     names are kept to avoid breaking old codes.
   * New configure option --enable-calibration-heuristic which allows
   * New configure option --enable-calibration-heuristic which allows
     the user to set the maximum authorized deviation of the
     the user to set the maximum authorized deviation of the
     history-based calibrator.
     history-based calibrator.
-  * Tasks can now have a name (via the field const char *name of
-    struct starpu_task)
+  * Allow application to provide the task footprint itself.
+  * New function starpu_sched_ctx_display_workers() to display worker
+    information belonging to a given scheduler context
+  * The option --enable-verbose can be called with
+    --enable-verbose=extra to increase the verbosity
+  * Add codelet size, footprint and tag id in the paje trace.
+
+Changes:
+  * Data interfaces (variable, vector, matrix and block) now define
+    pack und unpack functions
+  * StarPU-MPI: Fix for being able to receive data which have not yet
+    been registered by the application (i.e it did not call
+    starpu_data_set_tag(), data are received as a raw memory)
+  * StarPU-MPI: Fix for being able to receive data with the same tag
+    from several nodes (see mpi/tests/gather.c)
+
+Small changes:
+  * Rename function starpu_trace_user_event() as
+    starpu_fxt_trace_user_event()
+
+StarPU 1.1.2 (svn revision xxx)
+==============================================
+The scheduling context release
+
+New features:
+  * The reduction init codelet is automatically used to initialize temporary
+    buffers.
+  * Traces now include a "scheduling" state, to show the overhead of the
+    scheduler.
+  * Add STARPU_CALIBRATE_MINIMUM environment variable to specify the minimum
+    number of calibration measurements.
+
+StarPU 1.1.1 (svn revision 12638)
+==============================================
+The scheduling context release
+
+New features:
+  * MPI:
+        - New variable STARPU_MPI_CACHE_STATS to print statistics on
+   	  cache holding received data.
+        - New function starpu_mpi_data_register() which sets the rank
+  	  and tag of a data, and also allows to automatically clear
+	  the MPI communication cache when unregistering the data. It
+	  should be called instead of both calling
+	  starpu_data_set_tag() and starpu_data_set_rank()
+  * Use streams for all CUDA transfers, even initiated by CPUs.
+  * Add paje traces statistics tools.
+  * Use streams for GPUA->GPUB and GPUB->GPUA transfers.
+
+Small features:
   * New STARPU_EXECUTE_ON_WORKER flag to specify the worker on which
   * New STARPU_EXECUTE_ON_WORKER flag to specify the worker on which
     to execute the task.
     to execute the task.
   * New STARPU_DISABLE_PINNING environment variable to disable host memory
   * New STARPU_DISABLE_PINNING environment variable to disable host memory
@@ -97,23 +121,23 @@ Small features:
   * New starpu_memory_get_total function to get the size of a memory node.
   * New starpu_memory_get_total function to get the size of a memory node.
   * New starpu_parallel_task_barrier_init_n function to let a scheduler decide
   * New starpu_parallel_task_barrier_init_n function to let a scheduler decide
     a set of workers without going through combined workers.
     a set of workers without going through combined workers.
-  * Allow application to provide the task footprint itself.
 
 
 Changes:
 Changes:
-  * Data interfaces (variable, vector, matrix and block) now define
-    pack und unpack functions
-  * StarPU-MPI: Fix for being able to receive data which have not yet
-    been registered by the application (i.e it did not call
-    starpu_data_set_tag(), data are received as a raw memory)
-  * StarPU-MPI: Fix for being able to receive data with the same tag
-    from several nodes (see mpi/tests/gather.c)
+  * Fix simgrid execution.
+  * Rename starpu_get_nready_tasks_of_sched_ctx to starpu_sched_ctx_get_nready_tasks
+  * Rename starpu_get_nready_flops_of_sched_ctx to starpu_sched_ctx_get_nready_flops
+  * New functions starpu_pause() and starpu_resume()
+  * New codelet specific_nodes field to specify explicit target nodes for data.
   * StarPU-MPI: Fix overzealous allocation of memory.
   * StarPU-MPI: Fix overzealous allocation of memory.
   * Interfaces: Allow interface implementation to change pointers at will, in
   * Interfaces: Allow interface implementation to change pointers at will, in
     unpack notably.
     unpack notably.
 
 
 Small changes:
 Small changes:
-  * Rename function starpu_trace_user_event() as
-    starpu_fxt_trace_user_event()
+  * Use big fat abortions when one tries to make a task or callback
+    sleep, instead of just returning EDEADLCK which few people will test
+  * By default, StarPU FFT examples are not compiled and checked, the
+    configure option --enable-starpufft-examples needs to be specified
+    to change this behaviour.
 
 
 StarPU 1.1.0 (svn revision 11960)
 StarPU 1.1.0 (svn revision 11960)
 ==============================================
 ==============================================

+ 11 - 1
configure.ac

@@ -975,16 +975,19 @@ AC_ARG_ENABLE(simgrid, [AS_HELP_STRING([--enable-simgrid],
 if test x$enable_simgrid = xyes ; then
 if test x$enable_simgrid = xyes ; then
    	if test -n "$SIMGRID_CFLAGS" ; then
    	if test -n "$SIMGRID_CFLAGS" ; then
 	   	CFLAGS="$SIMGRID_CFLAGS $CFLAGS"
 	   	CFLAGS="$SIMGRID_CFLAGS $CFLAGS"
+	   	CXXFLAGS="$SIMGRID_CFLAGS $CXXFLAGS"
 	fi
 	fi
 	if test -n "$SIMGRID_LIBS" ; then
 	if test -n "$SIMGRID_LIBS" ; then
 		LDFLAGS="$SIMGRID_LIBS $LDFLAGS"
 		LDFLAGS="$SIMGRID_LIBS $LDFLAGS"
 	fi
 	fi
 	if test "$simgrid_dir" != "no" ; then
 	if test "$simgrid_dir" != "no" ; then
 	   	CFLAGS="-I$simgrid_dir/include $CFLAGS"
 	   	CFLAGS="-I$simgrid_dir/include $CFLAGS"
+	   	CXXFLAGS="-I$simgrid_dir/include $CXXFLAGS"
 	   	LDFLAGS="-L$simgrid_dir/lib $LDFLAGS"
 	   	LDFLAGS="-L$simgrid_dir/lib $LDFLAGS"
 	fi
 	fi
 	if test "$simgrid_include_dir" != "no" ; then
 	if test "$simgrid_include_dir" != "no" ; then
 	   	CFLAGS="-I$simgrid_include_dir $CFLAGS"
 	   	CFLAGS="-I$simgrid_include_dir $CFLAGS"
+	   	CXXFLAGS="-I$simgrid_include_dir $CXXFLAGS"
 	fi
 	fi
 	if test "$simgrid_lib_dir" != "no" ; then
 	if test "$simgrid_lib_dir" != "no" ; then
 	   	LDFLAGS="-L$simgrid_lib_dir $LDFLAGS"
 	   	LDFLAGS="-L$simgrid_lib_dir $LDFLAGS"
@@ -994,7 +997,8 @@ if test x$enable_simgrid = xyes ; then
 			AC_MSG_ERROR(Simgrid support needs simgrid installed)
 			AC_MSG_ERROR(Simgrid support needs simgrid installed)
 		]
 		]
 	)
 	)
-   	AC_CHECK_FUNCS([MSG_process_join])
+   	AC_CHECK_FUNCS([MSG_process_join MSG_get_as_by_name MSG_environment_get_routing_root])
+	AC_CHECK_FUNCS([xbt_barrier_init], [AC_DEFINE([STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT], [1], [Define to 1 if you have the `xbt_barrier_init' function.])])
 	AC_COMPILE_IFELSE([AC_LANG_PROGRAM(
 	AC_COMPILE_IFELSE([AC_LANG_PROGRAM(
 		    		[[#include <msg/msg.h>]],
 		    		[[#include <msg/msg.h>]],
 				[[msg_host_t foo; ]]
 				[[msg_host_t foo; ]]
@@ -1478,6 +1482,12 @@ if test x$use_fxt = xyes; then
 	AC_CHECK_DECLS([fut_set_filename])
 	AC_CHECK_DECLS([fut_set_filename])
 	CFLAGS="$save_CFLAGS"
 	CFLAGS="$save_CFLAGS"
 
 
+        AC_ARG_ENABLE(paje-codelet-details, [AS_HELP_STRING([--enable-paje-codelet-details],
+			[enable details about codelets in the paje trace])],
+			enable_paje_codelet_details=$enableval, enable_paje_codelet_details=no)
+        if  test x$enable_paje_codelet_details = xyes; then
+        	AC_DEFINE(STARPU_ENABLE_PAJE_CODELET_DETAILS, [1], [enable details about codelets in the paje trace])
+        fi
 	##########################################
 	##########################################
 	# Poti is a library to generate paje trace files
 	# Poti is a library to generate paje trace files
 	##########################################
 	##########################################

+ 5 - 0
doc/doxygen/chapters/08scheduling.doxy

@@ -45,6 +45,11 @@ a task on the worker which released it by
 default. When a worker becomes idle, it steals a task from the most loaded
 default. When a worker becomes idle, it steals a task from the most loaded
 worker.
 worker.
 
 
+The <b>lws</b> (locality work stealing) scheduler uses a queue per worker, and schedules
+a task on the worker which released it by
+default. When a worker becomes idle, it steals a task from neighbour workers. It
+also takes into account priorities.
+
 The <b>dm</b> (deque model) scheduler uses task execution performance models into account to
 The <b>dm</b> (deque model) scheduler uses task execution performance models into account to
 perform a HEFT-similar scheduling strategy: it schedules tasks where their
 perform a HEFT-similar scheduling strategy: it schedules tasks where their
 termination time will be minimal. The difference with HEFT is that <b>dm</b>
 termination time will be minimal. The difference with HEFT is that <b>dm</b>

+ 2 - 2
doc/doxygen/chapters/12online_performance_tools.doxy

@@ -389,11 +389,11 @@ parameters through starpu_hash_crc32c_be for instance.
 StarPU will automatically determine when the performance model is calibrated,
 StarPU will automatically determine when the performance model is calibrated,
 or rather, it will assume the performance model is calibrated until the
 or rather, it will assume the performance model is calibrated until the
 application submits a task for which the performance can not be predicted. For
 application submits a task for which the performance can not be predicted. For
-::STARPU_HISTORY_BASED, StarPU will require 10 (_STARPU_CALIBRATION_MINIMUM)
+::STARPU_HISTORY_BASED, StarPU will require 10 (STARPU_CALIBRATE_MINIMUM)
 measurements for a given size before estimating that an average can be taken as
 measurements for a given size before estimating that an average can be taken as
 estimation for further executions with the same size. For
 estimation for further executions with the same size. For
 ::STARPU_REGRESSION_BASED and ::STARPU_NL_REGRESSION_BASED, StarPU will require
 ::STARPU_REGRESSION_BASED and ::STARPU_NL_REGRESSION_BASED, StarPU will require
-10 (_STARPU_CALIBRATION_MINIMUM) measurements, and that the minimum measured
+10 (STARPU_CALIBRATE_MINIMUM) measurements, and that the minimum measured
 data size is smaller than 90% of the maximum measured data size (i.e. the
 data size is smaller than 90% of the maximum measured data size (i.e. the
 measurement interval is large enough for a regression to have a meaning).
 measurement interval is large enough for a regression to have a meaning).
 Calibration can also be forced by setting the \ref STARPU_CALIBRATE environment
 Calibration can also be forced by setting the \ref STARPU_CALIBRATE environment

+ 3 - 0
doc/doxygen/chapters/13offline_performance_tools.doxy

@@ -118,6 +118,9 @@ $ vite paje.trace
 
 
 To get names of tasks instead of "unknown", fill the optional
 To get names of tasks instead of "unknown", fill the optional
 starpu_codelet::name, or use a performance model for them.
 starpu_codelet::name, or use a performance model for them.
+Details of the codelet execution can be obtained by passing
+<c>--enable-paje-codelet-details</c> and using a recent enough version of ViTE
+(at least r1430).
 
 
 In the MPI execution case, collect the trace files from the MPI nodes, and
 In the MPI execution case, collect the trace files from the MPI nodes, and
 specify them all on the command <c>starpu_fxt_tool</c>, for instance:
 specify them all on the command <c>starpu_fxt_tool</c>, for instance:

+ 46 - 42
doc/doxygen/chapters/16mpi_support.doxy

@@ -121,49 +121,53 @@ automatically released. This mechanism is similar to the pthread
 detach state attribute which determines whether a thread will be
 detach state attribute which determines whether a thread will be
 created in a joinable or a detached state.
 created in a joinable or a detached state.
 
 
-For any communication, the call of the function will result in the
-creation of a StarPU-MPI request, the function
-starpu_data_acquire_cb() is then called to asynchronously request
-StarPU to fetch the data in main memory; when the data is available in
-main memory, a StarPU-MPI function is called to put the new request in
-the list of the ready requests if it is a send request, or in an
-hashmap if it is a receive request.
-
-Internally, all MPI communications submitted by StarPU uses a unique
-tag which has a default value, and can be accessed with the functions
+Internally, all communication are divided in 2 communications, a first
+message is used to exchange an envelope describing the data (i.e its
+tag and its size), the data itself is sent in a second message. All
+MPI communications submitted by StarPU uses a unique tag which has a
+default value, and can be accessed with the functions
 starpu_mpi_get_communication_tag() and
 starpu_mpi_get_communication_tag() and
-starpu_mpi_set_communication_tag().
-
-The matching of tags with corresponding requests is done into StarPU-MPI.
-To handle this, any communication is a double-communication based on a
-envelope + data system. Every data which will be sent needs to send an
-envelope which describes the data (particularly its tag) before sending
-the data, so the receiver can get the matching pending receive request
-from the hashmap, and submit it to recieve the data correctly.
-
-To this aim, the StarPU-MPI progression thread has a permanent-submitted
-request destined to receive incoming envelopes from all sources.
-
-The StarPU-MPI progression thread regularly polls this list of ready
-requests. For each new ready request, the appropriate function is
-called to post the corresponding MPI call. For example, calling
-starpu_mpi_isend() will result in posting <c>MPI_Isend</c>. If
-the request is marked as detached, the request will be put in the list
-of detached requests.
-
-The StarPU-MPI progression thread also polls the list of detached
-requests. For each detached request, it regularly tests the completion
-of the MPI request by calling <c>MPI_Test</c>. On completion, the data
-handle is released, and if a callback was defined, it is called.
-
-Finally, the StarPU-MPI progression thread checks if an envelope has
-arrived. If it is, it'll check if the corresponding receive has already
-been submitted by the application. If it is, it'll submit the request
-just as like as it does with those on the list of ready requests.
-If it is not, it'll allocate a temporary handle to store the data that
-will arrive just after, so as when the corresponding receive request
-will be submitted by the application, it'll copy this temporary handle
-into its one instead of submitting a new StarPU-MPI request.
+starpu_mpi_set_communication_tag(). The matching of tags with
+corresponding requests is done within StarPU-MPI.
+
+For any userland communication, the call of the corresponding function
+(e.g starpu_mpi_isend()) will result in the creation of a StarPU-MPI
+request, the function starpu_data_acquire_cb() is then called to
+asynchronously request StarPU to fetch the data in main memory; when
+the data is ready and the corresponding buffer has already been
+received by MPI, it will be copied in the memory of the data,
+otherwise the request is stored in the <em>early requests list</em>. Sending
+requests are stored in the <em>ready requests list</em>.
+
+While requests need to be processed, the StarPU-MPI progression thread
+does the following:
+
+<ol>
+<li> it polls the <em>ready requests list</em>. For all the ready
+requests, the appropriate function is called to post the corresponding
+MPI call. For example, an initial call to starpu_mpi_isend() will
+result in a call to <c>MPI_Isend</c>. If the request is marked as
+detached, the request will then be added in the <em>detached requests
+list</em>.
+</li>
+<li> it posts a <c>MPI_Irecv()</c> to retrieve a data envelope.
+</li>
+<li> it polls the <em>detached requests list</em>. For all the detached
+requests, it tests its completion of the MPI request by calling
+<c>MPI_Test</c>. On completion, the data handle is released, and if a
+callback was defined, it is called.
+</li>
+<li> finally, it checks if a data envelope has been received. If so,
+if the data envelope matches a request in the <em>early requests list</em> (i.e
+the request has already been posted by the application), the
+corresponding MPI call is posted (similarly to the first step above).
+
+If the data envelope does not match any application request, a
+temporary handle is created to receive the data, a StarPU-MPI request
+is created and added into the <em>ready requests list</em>, and thus will be
+processed in the first step of the next loop.
+</li>
+</ol>
 
 
 \ref MPIPtpCommunication "Communication" gives the list of all the
 \ref MPIPtpCommunication "Communication" gives the list of all the
 point to point communications defined in StarPU-MPI.
 point to point communications defined in StarPU-MPI.

+ 8 - 0
doc/doxygen/chapters/40environment_variables.doxy

@@ -314,6 +314,14 @@ is the default behaviour.
 Note: this currently only applies to <c>dm</c> and <c>dmda</c> scheduling policies.
 Note: this currently only applies to <c>dm</c> and <c>dmda</c> scheduling policies.
 </dd>
 </dd>
 
 
+<dt>STARPU_CALIBRATE_MINIMUM</dt>
+<dd>
+\anchor STARPU_CALIBRATE_MINIMUM
+\addindex __env__STARPU_CALIBRATE_MINIMUM
+This defines the minimum number of calibration measurements that will be made
+before considering that the performance model is calibrated. The default value is 10.
+</dd>
+
 <dt>STARPU_BUS_CALIBRATE</dt>
 <dt>STARPU_BUS_CALIBRATE</dt>
 <dd>
 <dd>
 \anchor STARPU_BUS_CALIBRATE
 \anchor STARPU_BUS_CALIBRATE

+ 8 - 0
doc/doxygen/chapters/41configure_options.doxy

@@ -372,6 +372,14 @@ Enable performance debugging through gprof.
 Enable performance model debugging.
 Enable performance model debugging.
 </dd>
 </dd>
 
 
+<dt>--enable-paje-codelet-details</dt>
+<dd>
+\anchor enable-paje-codelet-details
+\addindex __configure__--enable-paje-codelet-details
+Enable details about codelets in the paje trace. This requires a recent enough
+version of ViTE (at least r1430).
+</dd>
+
 <dt>--enable-fxt-lock</dt>
 <dt>--enable-fxt-lock</dt>
 <dd>
 <dd>
 \anchor enable-fxt-lock
 \anchor enable-fxt-lock

+ 5 - 0
examples/Makefile.am

@@ -190,6 +190,7 @@ examplebin_PROGRAMS +=				\
 	sched_ctx/dummy_sched_with_ctx		\
 	sched_ctx/dummy_sched_with_ctx		\
 	sched_ctx/prio				\
 	sched_ctx/prio				\
 	sched_ctx/sched_ctx_without_sched_policy\
 	sched_ctx/sched_ctx_without_sched_policy\
+	sched_ctx/nested_sched_ctxs		\
 	worker_collections/worker_tree_example  \
 	worker_collections/worker_tree_example  \
 	worker_collections/worker_list_example  \
 	worker_collections/worker_list_example  \
 	reductions/dot_product			\
 	reductions/dot_product			\
@@ -270,6 +271,7 @@ STARPU_EXAMPLES +=				\
 	sched_ctx/prio				\
 	sched_ctx/prio				\
 	sched_ctx/dummy_sched_with_ctx		\
 	sched_ctx/dummy_sched_with_ctx		\
 	sched_ctx/sched_ctx_without_sched_policy\
 	sched_ctx/sched_ctx_without_sched_policy\
+	sched_ctx/nested_sched_ctxs		\
 	worker_collections/worker_tree_example  \
 	worker_collections/worker_tree_example  \
 	worker_collections/worker_list_example  \
 	worker_collections/worker_list_example  \
 	reductions/dot_product			\
 	reductions/dot_product			\
@@ -925,6 +927,9 @@ sched_ctx_parallel_code_CFLAGS = \
 sched_ctx_sched_ctx_without_sched_policy_CFLAGS = \
 sched_ctx_sched_ctx_without_sched_policy_CFLAGS = \
 	$(AM_CFLAGS) -fopenmp
 	$(AM_CFLAGS) -fopenmp
 
 
+sched_ctx_nested_sched_ctxs_CFLAGS = \
+	$(AM_CFLAGS) -fopenmp
+
 endif
 endif
 
 
 showcheck:
 showcheck:

+ 1 - 0
examples/binary/binary.c

@@ -29,6 +29,7 @@ struct starpu_codelet cl =
 {
 {
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {opencl_codelet, NULL},
 	.opencl_funcs = {opencl_codelet, NULL},
+	.opencl_flags = {STARPU_OPENCL_ASYNC},
 #endif
 #endif
 	.nbuffers = 1,
 	.nbuffers = 1,
 	.modes = {STARPU_RW}
 	.modes = {STARPU_RW}

+ 3 - 1
examples/cpp/incrementer_cpp.cpp

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2009, 2010-2011, 2013  Université de Bordeaux 1
  * Copyright (C) 2009, 2010-2011, 2013  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2012 inria
  * Copyright (C) 2012 inria
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -61,9 +61,11 @@ int main(int argc, char **argv)
         cl.cpu_funcs[0] = cpu_codelet;
         cl.cpu_funcs[0] = cpu_codelet;
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
         cl.cuda_funcs[0] = cuda_codelet;
         cl.cuda_funcs[0] = cuda_codelet;
+	cl.cuda_flags[0] = STARPU_CUDA_ASYNC;
 #endif
 #endif
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
 	cl.opencl_funcs[0] = opencl_codelet;
 	cl.opencl_funcs[0] = opencl_codelet;
+	cl.opencl_flags[0] = STARPU_OPENCL_ASYNC;
 #endif
 #endif
         cl.nbuffers = 1;
         cl.nbuffers = 1;
         cl.modes[0] = STARPU_RW;
         cl.modes[0] = STARPU_RW;

+ 212 - 0
examples/sched_ctx/nested_sched_ctxs.c

@@ -0,0 +1,212 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <omp.h>
+
+#ifdef STARPU_QUICK_CHECK
+#define NTASKS 64
+#else
+#define NTASKS 100
+#endif
+
+int tasks_executed[2];
+starpu_pthread_mutex_t mut;
+
+int parallel_code(int sched_ctx)
+{
+	int i;
+	int t = 0;
+	int *cpuids = NULL;
+	int ncpuids = 0;
+	starpu_sched_ctx_get_available_cpuids(sched_ctx, &cpuids, &ncpuids);
+
+//	printf("execute task of %d threads \n", ncpuids);
+#pragma omp parallel num_threads(ncpuids)
+	{
+		starpu_sched_ctx_bind_current_thread_to_cpuid(cpuids[omp_get_thread_num()]);
+// 			printf("cpu = %d ctx%d nth = %d\n", sched_getcpu(), sched_ctx, omp_get_num_threads());
+#pragma omp for
+		for(i = 0; i < NTASKS; i++)
+			t++;
+	}
+
+	free(cpuids);
+	return t;
+}
+
+static void sched_ctx_func(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg)
+{
+	int w = starpu_worker_get_id();
+	unsigned sched_ctx = (unsigned)arg;
+	int n = parallel_code(sched_ctx);
+//	printf("w %d executed %d it \n", w, n);
+}
+
+
+static struct starpu_codelet sched_ctx_codelet =
+{
+	.cpu_funcs = {sched_ctx_func, NULL},
+	.cuda_funcs = {NULL},
+	.opencl_funcs = {NULL},
+	.model = NULL,
+	.nbuffers = 0,
+	.name = "sched_ctx"
+};
+
+int main(int argc, char **argv)
+{
+	tasks_executed[0] = 0;
+	tasks_executed[1] = 0;
+	int ntasks = NTASKS;
+	int ret, j, k;
+	unsigned ncpus = 0;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	starpu_pthread_mutex_init(&mut, NULL);
+	int nprocs1 = 1;
+	int nprocs2 = 1;
+	int *procs1, *procs2;
+
+#ifdef STARPU_USE_CPU
+	ncpus =  starpu_cpu_worker_get_count();
+	procs1 = (int*)malloc(ncpus*sizeof(int));
+	starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, procs1, ncpus);
+
+	if (ncpus > 1)
+	{
+		nprocs1 = ncpus/2;
+		nprocs2 =  nprocs1;
+		k = 0;
+		procs2 = (int*)malloc(nprocs2*sizeof(int));
+		for(j = nprocs1; j < nprocs1+nprocs2; j++)
+			procs2[k++] = procs1[j];
+	}
+	else
+	{
+		procs2 = (int*)malloc(nprocs2*sizeof(int));
+		procs2[0] = procs1[0];
+	}
+#endif
+
+	if (ncpus == 0)
+	{
+#ifdef STARPU_USE_CPU
+		free(procs1);
+		free(procs2);
+#endif
+		starpu_shutdown();
+		return 77;
+	}
+
+	/*create contexts however you want*/
+	unsigned sched_ctx1 = starpu_sched_ctx_create(procs1, nprocs1, "ctx1", STARPU_SCHED_CTX_POLICY_NAME, "eager", 0);
+	unsigned sched_ctx2 = starpu_sched_ctx_create(procs2, nprocs2, "ctx2", STARPU_SCHED_CTX_POLICY_NAME, "dmda", 0);
+
+	/*indicate what to do with the resources when context 2 finishes (it depends on your application)*/
+//	starpu_sched_ctx_set_inheritor(sched_ctx2, sched_ctx1);
+
+	int nprocs3 = nprocs1/2;
+	int nprocs4 = nprocs1/2;
+	int nprocs5 = nprocs2/2;
+	int nprocs6 = nprocs2/2;
+	int procs3[nprocs3];
+	int procs4[nprocs4];
+	int procs5[nprocs5];
+	int procs6[nprocs6];
+
+	k = 0;
+	for(j = 0; j < nprocs3; j++)
+		procs3[k++] = procs1[j];
+	k = 0;
+	for(j = nprocs3; j < nprocs3+nprocs4; j++)
+		procs4[k++] = procs1[j];
+
+	k = 0;
+	for(j = 0; j < nprocs5; j++)
+		procs5[k++] = procs2[j];
+	k = 0;
+	for(j = nprocs5; j < nprocs5+nprocs6; j++)
+		procs6[k++] = procs2[j];
+
+	unsigned sched_ctx3 = starpu_sched_ctx_create(procs3, nprocs3, "ctx3", STARPU_SCHED_CTX_NESTED, sched_ctx1, 0);
+	unsigned sched_ctx4 = starpu_sched_ctx_create(procs4, nprocs4, "ctx4", STARPU_SCHED_CTX_NESTED, sched_ctx1, 0);
+
+	unsigned sched_ctx5 = starpu_sched_ctx_create(procs5, nprocs5, "ctx5", STARPU_SCHED_CTX_NESTED, sched_ctx2, 0);
+	unsigned sched_ctx6 = starpu_sched_ctx_create(procs6, nprocs6, "ctx6", STARPU_SCHED_CTX_NESTED, sched_ctx2, 0);
+
+
+	int i;
+	for (i = 0; i < ntasks; i++)
+	{
+		struct starpu_task *task = starpu_task_create();
+
+		task->cl = &sched_ctx_codelet;
+		task->cl_arg = sched_ctx1;
+
+		/*submit tasks to context*/
+		ret = starpu_task_submit_to_ctx(task,sched_ctx1);
+
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	for (i = 0; i < ntasks; i++)
+	{
+		struct starpu_task *task = starpu_task_create();
+
+		task->cl = &sched_ctx_codelet;
+		task->cl_arg = sched_ctx2;
+
+		/*submit tasks to context*/
+		ret = starpu_task_submit_to_ctx(task,sched_ctx2);
+
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+
+	/* tell starpu when you have finished submitting tasks to this context
+	   in order to allow moving resources from this context to the inheritor one
+	   when its corresponding tasks have finished executing */
+
+
+
+	/* wait for all tasks at the end*/
+	starpu_task_wait_for_all();
+
+	starpu_sched_ctx_delete(sched_ctx3);
+	starpu_sched_ctx_delete(sched_ctx4);
+
+	starpu_sched_ctx_delete(sched_ctx5);
+	starpu_sched_ctx_delete(sched_ctx6);
+
+	starpu_sched_ctx_delete(sched_ctx1);
+	starpu_sched_ctx_delete(sched_ctx2);
+
+	printf("ctx%d: tasks starpu executed %d out of %d\n", sched_ctx1, tasks_executed[0], NTASKS);
+	printf("ctx%d: tasks starpu executed %d out of %d\n", sched_ctx2, tasks_executed[1], NTASKS);
+
+#ifdef STARPU_USE_CPU
+	free(procs1);
+	free(procs2);
+#endif
+	starpu_shutdown();
+	return 0;
+}

+ 6 - 9
examples/sched_ctx/sched_ctx_without_sched_policy.c

@@ -88,7 +88,6 @@ int main(int argc, char **argv)
 #ifdef STARPU_USE_CPU
 #ifdef STARPU_USE_CPU
 	ncpus = starpu_cpu_worker_get_count();
 	ncpus = starpu_cpu_worker_get_count();
 	procs1 = (int*)malloc(ncpus*sizeof(int));
 	procs1 = (int*)malloc(ncpus*sizeof(int));
-	procs2 = (int*)malloc(ncpus*sizeof(int));
 	starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, procs1, ncpus);
 	starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, procs1, ncpus);
 
 
 	if(ncpus > 1)
 	if(ncpus > 1)
@@ -96,22 +95,16 @@ int main(int argc, char **argv)
 		nprocs1 = ncpus/2;
 		nprocs1 = ncpus/2;
 		nprocs2 =  ncpus-nprocs1;
 		nprocs2 =  ncpus-nprocs1;
 		k = 0;
 		k = 0;
+		procs2 = (int*)malloc(nprocs2*sizeof(int));
 		for(j = nprocs1; j < nprocs1+nprocs2; j++)
 		for(j = nprocs1; j < nprocs1+nprocs2; j++)
 			procs2[k++] = procs1[j];
 			procs2[k++] = procs1[j];
 	}
 	}
 	else
 	else
 	{
 	{
-		procs1 = (int*)malloc(nprocs1*sizeof(int));
 		procs2 = (int*)malloc(nprocs2*sizeof(int));
 		procs2 = (int*)malloc(nprocs2*sizeof(int));
-		procs1[0] = 0;
-		procs2[0] = 0;
+		procs2[0] = procs1[0];
 
 
 	}
 	}
-#else
-	procs1 = (int*)malloc(nprocs1*sizeof(int));
-	procs2 = (int*)malloc(nprocs2*sizeof(int));
-	procs1[0] = 0;
-	procs2[0] = 0;
 #endif
 #endif
 
 
 	if (ncpus == 0) goto enodev;
 	if (ncpus == 0) goto enodev;
@@ -163,6 +156,10 @@ int main(int argc, char **argv)
 	printf("ctx%d: tasks starpu executed %d out of %d\n", sched_ctx2, tasks_executed[1], NTASKS*NTASKS);
 	printf("ctx%d: tasks starpu executed %d out of %d\n", sched_ctx2, tasks_executed[1], NTASKS*NTASKS);
 
 
 enodev:
 enodev:
+#ifdef STARPU_USE_CPU
+	free(procs1);
+	free(procs2);
+#endif
 	starpu_shutdown();
 	starpu_shutdown();
 	return ncpus == 0 ? 77 : 0;
 	return ncpus == 0 ? 77 : 0;
 }
 }

+ 1 - 0
examples/worker_collections/worker_list_example.c

@@ -85,6 +85,7 @@ int main()
 
 
 	FPRINTF(stderr, "timing init = %lf \n", timing);
 	FPRINTF(stderr, "timing init = %lf \n", timing);
 	co->deinit(co);
 	co->deinit(co);
+	free(co);
 	starpu_shutdown();
 	starpu_shutdown();
 
 
 	return 0;
 	return 0;

+ 1 - 0
include/starpu_config.h.in

@@ -32,6 +32,7 @@
 #undef STARPU_OPENMP
 #undef STARPU_OPENMP
 
 
 #undef STARPU_SIMGRID
 #undef STARPU_SIMGRID
+#undef STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT
 
 
 #undef STARPU_HAVE_ICC
 #undef STARPU_HAVE_ICC
 
 

+ 8 - 0
include/starpu_sched_ctx.h

@@ -29,6 +29,7 @@ extern "C"
 #define STARPU_SCHED_CTX_POLICY_MIN_PRIO	 (3<<16)
 #define STARPU_SCHED_CTX_POLICY_MIN_PRIO	 (3<<16)
 #define STARPU_SCHED_CTX_POLICY_MAX_PRIO	 (4<<16)
 #define STARPU_SCHED_CTX_POLICY_MAX_PRIO	 (4<<16)
 #define STARPU_SCHED_CTX_HIERARCHY_LEVEL         (5<<16)
 #define STARPU_SCHED_CTX_HIERARCHY_LEVEL         (5<<16)
+#define STARPU_SCHED_CTX_NESTED                  (6<<16)
 
 
 unsigned starpu_sched_ctx_create(int *workerids_ctx, int nworkers_ctx, const char *sched_ctx_name, ...);
 unsigned starpu_sched_ctx_create(int *workerids_ctx, int nworkers_ctx, const char *sched_ctx_name, ...);
 
 
@@ -127,6 +128,13 @@ int starpu_sched_ctx_book_workers_for_task(unsigned sched_ctx_id, int *workerids
 
 
 void starpu_sched_ctx_unbook_workers_for_task(unsigned sched_ctx_id, int master);
 void starpu_sched_ctx_unbook_workers_for_task(unsigned sched_ctx_id, int master);
 
 
+/* return the first context (child of sched_ctx_id) where the workerid is master */
+unsigned starpu_sched_ctx_worker_is_master_for_child_ctx(int workerid, unsigned sched_ctx_id);
+
+void starpu_sched_ctx_revert_task_counters(unsigned sched_ctx_id, double flops);
+
+void starpu_sched_ctx_move_task_to_ctx(struct starpu_task *task, unsigned sched_ctx);
+
 #ifdef STARPU_USE_SC_HYPERVISOR
 #ifdef STARPU_USE_SC_HYPERVISOR
 void starpu_sched_ctx_call_pushed_task_cb(int workerid, unsigned sched_ctx_id);
 void starpu_sched_ctx_call_pushed_task_cb(int workerid, unsigned sched_ctx_id);
 #endif /* STARPU_USE_SC_HYPERVISOR */
 #endif /* STARPU_USE_SC_HYPERVISOR */

+ 2 - 0
include/starpu_task.h

@@ -255,6 +255,8 @@ void starpu_task_destroy(struct starpu_task *task);
 int starpu_task_submit(struct starpu_task *task) STARPU_WARN_UNUSED_RESULT;
 int starpu_task_submit(struct starpu_task *task) STARPU_WARN_UNUSED_RESULT;
 int starpu_task_submit_to_ctx(struct starpu_task *task, unsigned sched_ctx_id);
 int starpu_task_submit_to_ctx(struct starpu_task *task, unsigned sched_ctx_id);
 
 
+int starpu_task_finished(struct starpu_task *task) STARPU_WARN_UNUSED_RESULT;
+
 int starpu_task_wait(struct starpu_task *task) STARPU_WARN_UNUSED_RESULT;
 int starpu_task_wait(struct starpu_task *task) STARPU_WARN_UNUSED_RESULT;
 
 
 int starpu_task_wait_for_all(void);
 int starpu_task_wait_for_all(void);

+ 6 - 0
include/starpu_thread.h

@@ -200,6 +200,11 @@ int starpu_pthread_rwlock_unlock(starpu_pthread_rwlock_t *rwlock);
 
 
 #if defined(STARPU_SIMGRID) || !defined(STARPU_HAVE_PTHREAD_BARRIER)
 #if defined(STARPU_SIMGRID) || !defined(STARPU_HAVE_PTHREAD_BARRIER)
 
 
+#if defined(STARPU_SIMGRID) && defined(STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT)
+typedef xbt_bar_t starpu_pthread_barrier_t;
+typedef int starpu_pthread_barrierattr_t;
+#define STARPU_PTHREAD_BARRIER_SERIAL_THREAD XBT_BARRIER_SERIAL_PROCESS
+#else
 typedef struct {
 typedef struct {
 	starpu_pthread_mutex_t mutex;
 	starpu_pthread_mutex_t mutex;
 	starpu_pthread_cond_t cond;
 	starpu_pthread_cond_t cond;
@@ -208,6 +213,7 @@ typedef struct {
 } starpu_pthread_barrier_t;
 } starpu_pthread_barrier_t;
 typedef int starpu_pthread_barrierattr_t;
 typedef int starpu_pthread_barrierattr_t;
 #define STARPU_PTHREAD_BARRIER_SERIAL_THREAD -1
 #define STARPU_PTHREAD_BARRIER_SERIAL_THREAD -1
+#endif
 
 
 int starpu_pthread_barrier_init(starpu_pthread_barrier_t *barrier, const starpu_pthread_barrierattr_t *attr, unsigned count);
 int starpu_pthread_barrier_init(starpu_pthread_barrier_t *barrier, const starpu_pthread_barrierattr_t *attr, unsigned count);
 int starpu_pthread_barrier_destroy(starpu_pthread_barrier_t *barrier);
 int starpu_pthread_barrier_destroy(starpu_pthread_barrier_t *barrier);

+ 7 - 0
include/starpu_worker.h

@@ -57,10 +57,15 @@ struct starpu_worker_collection
 {
 {
 	void *workerids;
 	void *workerids;
 	unsigned nworkers;
 	unsigned nworkers;
+	void *masters;
+	unsigned nmasters;
 	int present[STARPU_NMAXWORKERS];
 	int present[STARPU_NMAXWORKERS];
+	int is_master[STARPU_NMAXWORKERS];
 	enum starpu_worker_collection_type type;
 	enum starpu_worker_collection_type type;
 	unsigned (*has_next)(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it);
 	unsigned (*has_next)(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it);
 	int (*get_next)(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it);
 	int (*get_next)(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it);
+	unsigned (*has_next_master)(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it);
+	int (*get_next_master)(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it);
 	int (*add)(struct starpu_worker_collection *workers, int worker);
 	int (*add)(struct starpu_worker_collection *workers, int worker);
 	int (*remove)(struct starpu_worker_collection *workers, int worker);
 	int (*remove)(struct starpu_worker_collection *workers, int worker);
 	void (*init)(struct starpu_worker_collection *workers);
 	void (*init)(struct starpu_worker_collection *workers);
@@ -109,6 +114,8 @@ int starpu_worker_get_mp_nodeid(int id);
 struct starpu_tree* starpu_workers_get_tree(void);
 struct starpu_tree* starpu_workers_get_tree(void);
 
 
 unsigned starpu_worker_get_sched_ctx_list(int worker, unsigned **sched_ctx);
 unsigned starpu_worker_get_sched_ctx_list(int worker, unsigned **sched_ctx);
+
+unsigned starpu_worker_is_slave(int workerid);
 #ifdef __cplusplus
 #ifdef __cplusplus
 }
 }
 #endif
 #endif

+ 3 - 1
mpi/include/starpu_mpi.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2009-2012, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -71,6 +71,8 @@ void starpu_mpi_comm_amounts_retrieve(size_t *comm_amounts);
 void starpu_mpi_cache_flush(MPI_Comm comm, starpu_data_handle_t data_handle);
 void starpu_mpi_cache_flush(MPI_Comm comm, starpu_data_handle_t data_handle);
 void starpu_mpi_cache_flush_all_data(MPI_Comm comm);
 void starpu_mpi_cache_flush_all_data(MPI_Comm comm);
 
 
+int starpu_mpi_world_rank(void);
+
 int starpu_mpi_get_communication_tag(void);
 int starpu_mpi_get_communication_tag(void);
 void starpu_mpi_set_communication_tag(int tag);
 void starpu_mpi_set_communication_tag(int tag);
 
 

+ 61 - 45
mpi/src/starpu_mpi.c

@@ -30,7 +30,7 @@
 #include <datawizard/coherency.h>
 #include <datawizard/coherency.h>
 
 
 static void _starpu_mpi_add_sync_point_in_fxt(void);
 static void _starpu_mpi_add_sync_point_in_fxt(void);
-static void _starpu_mpi_submit_new_mpi_request(void *arg);
+static void _starpu_mpi_submit_ready_request(void *arg);
 static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req);
 static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req);
 #ifdef STARPU_VERBOSE
 #ifdef STARPU_VERBOSE
 static char *_starpu_mpi_request_type(enum _starpu_mpi_request_type request_type);
 static char *_starpu_mpi_request_type(enum _starpu_mpi_request_type request_type);
@@ -46,8 +46,8 @@ static struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t dat
 							ssize_t count);
 							ssize_t count);
 static void _starpu_mpi_handle_detached_request(struct _starpu_mpi_req *req);
 static void _starpu_mpi_handle_detached_request(struct _starpu_mpi_req *req);
 
 
-/* The list of requests that have been newly submitted by the application */
-static struct _starpu_mpi_req_list *new_requests;
+/* The list of ready requests */
+static struct _starpu_mpi_req_list *ready_requests;
 
 
 /* The list of detached requests that have already been submitted to MPI */
 /* The list of detached requests that have already been submitted to MPI */
 static struct _starpu_mpi_req_list *detached_requests;
 static struct _starpu_mpi_req_list *detached_requests;
@@ -61,7 +61,7 @@ static starpu_pthread_mutex_t mutex;
 static starpu_pthread_t progress_thread;
 static starpu_pthread_t progress_thread;
 static int running = 0;
 static int running = 0;
 
 
-/* Count requests posted by the application and not yet submitted to MPI, i.e pushed into the new_requests list */
+/* Count requests posted by the application and not yet submitted to MPI */
 static starpu_pthread_mutex_t mutex_posted_requests;
 static starpu_pthread_mutex_t mutex_posted_requests;
 static int posted_requests = 0, newer_requests, barrier_running = 0;
 static int posted_requests = 0, newer_requests, barrier_running = 0;
 
 
@@ -151,9 +151,9 @@ static void _starpu_mpi_request_init(struct _starpu_mpi_req **req)
 	req->count = count;
 	req->count = count;
 
 
 	/* Asynchronously request StarPU to fetch the data in main memory: when
 	/* Asynchronously request StarPU to fetch the data in main memory: when
-	 * it is available in main memory, _starpu_mpi_submit_new_mpi_request(req) is called and
+	 * it is available in main memory, _starpu_mpi_submit_ready_request(req) is called and
 	 * the request is actually submitted */
 	 * the request is actually submitted */
-	starpu_data_acquire_cb_sequential_consistency(data_handle, mode, _starpu_mpi_submit_new_mpi_request, (void *)req, sequential_consistency);
+	starpu_data_acquire_cb_sequential_consistency(data_handle, mode, _starpu_mpi_submit_ready_request, (void *)req, sequential_consistency);
 
 
 	_STARPU_MPI_LOG_OUT();
 	_STARPU_MPI_LOG_OUT();
 	return req;
 	return req;
@@ -447,7 +447,7 @@ int starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status)
 	waiting_req->func = _starpu_mpi_wait_func;
 	waiting_req->func = _starpu_mpi_wait_func;
 	waiting_req->request_type = WAIT_REQ;
 	waiting_req->request_type = WAIT_REQ;
 
 
-	_starpu_mpi_submit_new_mpi_request(waiting_req);
+	_starpu_mpi_submit_ready_request(waiting_req);
 
 
 	/* We wait for the MPI request to finish */
 	/* We wait for the MPI request to finish */
 	STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
 	STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
@@ -532,7 +532,7 @@ int starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 		testing_req->request_type = TEST_REQ;
 		testing_req->request_type = TEST_REQ;
 
 
 		_STARPU_MPI_INC_POSTED_REQUESTS(1);
 		_STARPU_MPI_INC_POSTED_REQUESTS(1);
-		_starpu_mpi_submit_new_mpi_request(testing_req);
+		_starpu_mpi_submit_ready_request(testing_req);
 
 
 		/* We wait for the test request to finish */
 		/* We wait for the test request to finish */
 		STARPU_PTHREAD_MUTEX_LOCK(&(testing_req->req_mutex));
 		STARPU_PTHREAD_MUTEX_LOCK(&(testing_req->req_mutex));
@@ -619,7 +619,7 @@ int starpu_mpi_barrier(MPI_Comm comm)
 	barrier_req->comm = comm;
 	barrier_req->comm = comm;
 
 
 	_STARPU_MPI_INC_POSTED_REQUESTS(1);
 	_STARPU_MPI_INC_POSTED_REQUESTS(1);
-	_starpu_mpi_submit_new_mpi_request(barrier_req);
+	_starpu_mpi_submit_ready_request(barrier_req);
 
 
 	/* We wait for the MPI request to finish */
 	/* We wait for the MPI request to finish */
 	STARPU_PTHREAD_MUTEX_LOCK(&barrier_req->req_mutex);
 	STARPU_PTHREAD_MUTEX_LOCK(&barrier_req->req_mutex);
@@ -785,24 +785,25 @@ static void _starpu_mpi_early_data_cb(void* arg)
 	free(args);
 	free(args);
 }
 }
 
 
-static void _starpu_mpi_submit_new_mpi_request(void *arg)
+static void _starpu_mpi_submit_ready_request(void *arg)
 {
 {
 	_STARPU_MPI_LOG_IN();
 	_STARPU_MPI_LOG_IN();
 	struct _starpu_mpi_req *req = arg;
 	struct _starpu_mpi_req *req = arg;
 
 
 	_STARPU_MPI_INC_POSTED_REQUESTS(-1);
 	_STARPU_MPI_INC_POSTED_REQUESTS(-1);
 
 
-	_STARPU_MPI_DEBUG(3, "calling _starpu_mpi_submit_new_mpi_request with req %p srcdst %d tag %d and type %s\n", req, req->srcdst, req->mpi_tag, _starpu_mpi_request_type(req->request_type));
+	_STARPU_MPI_DEBUG(3, "new req %p srcdst %d tag %d and type %s\n", req, req->srcdst, req->mpi_tag, _starpu_mpi_request_type(req->request_type));
 
 
 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 
 
 	if (req->request_type == RECV_REQ)
 	if (req->request_type == RECV_REQ)
 	{
 	{
-		/* Case : the request is the internal receive request submitted by StarPU-MPI to receive
-		 * incoming data without a matching pending receive already submitted by the application.
-		 * We immediately allocate the pointer associated to the data_handle, and pushing it into
-		 * the list of new_requests, so as the real MPI request can be submitted before the next
-		 * submission of the envelope-catching request. */
+		/* Case : the request is the internal receive request submitted
+		 * by StarPU-MPI to receive incoming data without a matching
+		 * early_request from the application. We immediately allocate the
+		 * pointer associated to the data_handle, and push it into the
+		 * ready_requests list, so that the real MPI request can be submitted
+		 * before the next submission of the envelope-catching request. */
 		if (req->is_internal_req)
 		if (req->is_internal_req)
 		{
 		{
 			_starpu_mpi_handle_allocate_datatype(req->data_handle, &req->datatype, &req->user_datatype);
 			_starpu_mpi_handle_allocate_datatype(req->data_handle, &req->datatype, &req->user_datatype);
@@ -818,10 +819,12 @@ static void _starpu_mpi_submit_new_mpi_request(void *arg)
 				STARPU_ASSERT_MSG(req->ptr, "cannot allocate message of size %ld\n", req->count);
 				STARPU_ASSERT_MSG(req->ptr, "cannot allocate message of size %ld\n", req->count);
 			}
 			}
 
 
-			_STARPU_MPI_DEBUG(3, "Pushing internal starpu_mpi_irecv request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n", req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr, _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
-			_starpu_mpi_req_list_push_front(new_requests, req);
+			_STARPU_MPI_DEBUG(3, "Pushing internal starpu_mpi_irecv request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n",
+					  req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr,
+					  _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
+			_starpu_mpi_req_list_push_front(ready_requests, req);
 
 
-			/* inform the starpu mpi thread that the request has beenbe pushed in the new_requests list */
+			/* inform the starpu mpi thread that the request has been pushed in the ready_requests list */
 			STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 			STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 			STARPU_PTHREAD_MUTEX_LOCK(&req->posted_mutex);
 			STARPU_PTHREAD_MUTEX_LOCK(&req->posted_mutex);
 			req->posted = 1;
 			req->posted = 1;
@@ -834,10 +837,10 @@ static void _starpu_mpi_submit_new_mpi_request(void *arg)
 			/* test whether the receive request has already been submitted internally by StarPU-MPI*/
 			/* test whether the receive request has already been submitted internally by StarPU-MPI*/
 			struct _starpu_mpi_early_data_handle *early_data_handle = _starpu_mpi_early_data_find(req->mpi_tag, req->srcdst);
 			struct _starpu_mpi_early_data_handle *early_data_handle = _starpu_mpi_early_data_find(req->mpi_tag, req->srcdst);
 
 
-			/* Case : the request has already been submitted internally by StarPU.
-			 * We'll asynchronously ask a Read permission over the temporary handle, so as when
-			 * the internal receive will be over, the _starpu_mpi_early_data_cb function will be called to
-			 * bring the data back to the original data handle associated to the request.*/
+			/* Case: a receive request for a data with the given tag and source has already been
+			 * posted by StarPU. Asynchronously requests a Read permission over the temporary handle,
+			 * so that when the internal receive is completed, the _starpu_mpi_early_data_cb function
+			 * will be called to bring the data back to the original data handle associated to the request.*/
 			if (early_data_handle)
 			if (early_data_handle)
 			{
 			{
 				STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 				STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
@@ -861,8 +864,7 @@ static void _starpu_mpi_submit_new_mpi_request(void *arg)
 				_STARPU_MPI_DEBUG(3, "Calling data_acquire_cb on starpu_mpi_copy_cb..\n");
 				_STARPU_MPI_DEBUG(3, "Calling data_acquire_cb on starpu_mpi_copy_cb..\n");
 				starpu_data_acquire_cb(early_data_handle->handle,STARPU_R,_starpu_mpi_early_data_cb,(void*) cb_args);
 				starpu_data_acquire_cb(early_data_handle->handle,STARPU_R,_starpu_mpi_early_data_cb,(void*) cb_args);
 			}
 			}
-			/* Case : a classic receive request with no send received earlier than expected.
-			 * We just add the pending receive request to the requests' hashmap. */
+			/* Case: no matching data has been received. Store the receive request as an early_request. */
 			else
 			else
 			{
 			{
 				_STARPU_MPI_DEBUG(3, "Adding the pending receive request %p (srcdst %d tag %d) into the request hashmap\n", req, req->srcdst, req->mpi_tag);
 				_STARPU_MPI_DEBUG(3, "Adding the pending receive request %p (srcdst %d tag %d) into the request hashmap\n", req, req->srcdst, req->mpi_tag);
@@ -872,7 +874,7 @@ static void _starpu_mpi_submit_new_mpi_request(void *arg)
 	}
 	}
 	else
 	else
 	{
 	{
-		_starpu_mpi_req_list_push_front(new_requests, req);
+		_starpu_mpi_req_list_push_front(ready_requests, req);
 		_STARPU_MPI_DEBUG(3, "Pushing new request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n",
 		_STARPU_MPI_DEBUG(3, "Pushing new request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n",
 				  req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr, _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
 				  req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr, _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
 	}
 	}
@@ -986,7 +988,7 @@ static void _starpu_mpi_handle_detached_request(struct _starpu_mpi_req *req)
 	}
 	}
 }
 }
 
 
-static void _starpu_mpi_handle_new_request(struct _starpu_mpi_req *req)
+static void _starpu_mpi_handle_ready_request(struct _starpu_mpi_req *req)
 {
 {
 	_STARPU_MPI_LOG_IN();
 	_STARPU_MPI_LOG_IN();
 	STARPU_ASSERT_MSG(req, "Invalid request");
 	STARPU_ASSERT_MSG(req, "Invalid request");
@@ -1080,10 +1082,10 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
 
  	int header_req_submitted = 0;
  	int header_req_submitted = 0;
 
 
-	while (running || posted_requests || !(_starpu_mpi_req_list_empty(new_requests)) || !(_starpu_mpi_req_list_empty(detached_requests)))
+	while (running || posted_requests || !(_starpu_mpi_req_list_empty(ready_requests)) || !(_starpu_mpi_req_list_empty(detached_requests)))
 	{
 	{
 		/* shall we block ? */
 		/* shall we block ? */
-		unsigned block = _starpu_mpi_req_list_empty(new_requests) && _starpu_mpi_early_request_count() == 0;
+		unsigned block = _starpu_mpi_req_list_empty(ready_requests) && _starpu_mpi_early_request_count() == 0;
 
 
 #ifndef STARPU_MPI_ACTIVITY
 #ifndef STARPU_MPI_ACTIVITY
 		STARPU_PTHREAD_MUTEX_LOCK(&detached_requests_mutex);
 		STARPU_PTHREAD_MUTEX_LOCK(&detached_requests_mutex);
@@ -1107,21 +1109,22 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
 
 		/* get one request */
 		/* get one request */
 		struct _starpu_mpi_req *req;
 		struct _starpu_mpi_req *req;
-		while (!_starpu_mpi_req_list_empty(new_requests))
+		while (!_starpu_mpi_req_list_empty(ready_requests))
 		{
 		{
-			req = _starpu_mpi_req_list_pop_back(new_requests);
+			req = _starpu_mpi_req_list_pop_back(ready_requests);
 
 
 			/* handling a request is likely to block for a while
 			/* handling a request is likely to block for a while
 			 * (on a sync_data_with_mem call), we want to let the
 			 * (on a sync_data_with_mem call), we want to let the
 			 * application submit requests in the meantime, so we
 			 * application submit requests in the meantime, so we
 			 * release the lock. */
 			 * release the lock. */
 			STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 			STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
-			_starpu_mpi_handle_new_request(req);
+			_starpu_mpi_handle_ready_request(req);
 			STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 			STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 		}
 		}
 
 
-		/* If there is no currently submitted header_req submitted to catch envelopes from senders, and there is some pending receive
-		 * requests in our side, we resubmit a header request. */
+		/* If there is no header_req currently submitted to
+                 * catch envelopes from senders, and there are some pending
+                 * receive requests on our side, we resubmit a header request. */
 		MPI_Request header_req;
 		MPI_Request header_req;
 		if ((_starpu_mpi_early_request_count() > 0) && (header_req_submitted == 0))// && (HASH_COUNT(_starpu_mpi_early_data_handle_hashmap) == 0))
 		if ((_starpu_mpi_early_request_count() > 0) && (header_req_submitted == 0))// && (HASH_COUNT(_starpu_mpi_early_data_handle_hashmap) == 0))
 		{
 		{
@@ -1151,11 +1154,14 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
 
 				struct _starpu_mpi_req *found_req = _starpu_mpi_early_request_find(recv_env->mpi_tag, status.MPI_SOURCE);
 				struct _starpu_mpi_req *found_req = _starpu_mpi_early_request_find(recv_env->mpi_tag, status.MPI_SOURCE);
 
 
-				/* Case : a data will arrive before the matching receive has been submitted in our side of the application.
-				 * We will allow a temporary handle to store the incoming data, by submitting a starpu_mpi_irecv_detached
-				 * on this handle, and register this so as the StarPU-MPI layer can remember it.*/
+				/* Case: a data will arrive before a matching receive is
+				 * posted by the application. Create a temporary handle to
+				 * store the incoming data, submit a starpu_mpi_irecv_detached
+				 * on this handle, and store it as an early_data
+				 */
 				if (!found_req)
 				if (!found_req)
 				{
 				{
+
 					_STARPU_MPI_DEBUG(3, "Request with tag %d and source %d not found, creating a early_handle to receive incoming data..\n", recv_env->mpi_tag, status.MPI_SOURCE);
 					_STARPU_MPI_DEBUG(3, "Request with tag %d and source %d not found, creating a early_handle to receive incoming data..\n", recv_env->mpi_tag, status.MPI_SOURCE);
 
 
 					starpu_data_handle_t data_handle = NULL;
 					starpu_data_handle_t data_handle = NULL;
@@ -1198,8 +1204,8 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 					STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 					STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 
 
 					// We wait until the request is pushed in the
 					// We wait until the request is pushed in the
-					// new_request list, that ensures that the next loop
-					// will call _starpu_mpi_handle_new_request
+					// ready_request list, that ensures that the next loop
+					// will call _starpu_mpi_handle_ready_request
 					// on the request and post the corresponding mpi_irecv,
 					// on the request and post the corresponding mpi_irecv,
 					// otherwise, it may lead to read data as envelop
 					// otherwise, it may lead to read data as envelop
 					STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 					STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
@@ -1214,8 +1220,11 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 					STARPU_PTHREAD_MUTEX_UNLOCK(&early_data_handle->req_mutex);
 					STARPU_PTHREAD_MUTEX_UNLOCK(&early_data_handle->req_mutex);
 					STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 					STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 				}
 				}
-				/* Case : a matching receive has been found for the incoming data, we handle the correct allocation of the pointer associated to
-				 * the data handle, then submit the corresponding receive with _starpu_mpi_handle_new_request. */
+				/* Case: a matching application request has been found for
+				 * the incoming data, we handle the correct allocation
+				 * of the pointer associated to the data handle, then
+				 * submit the corresponding receive with
+				 * _starpu_mpi_handle_ready_request. */
 				else
 				else
 				{
 				{
 					_STARPU_MPI_DEBUG(3, "A matching receive has been found for the incoming data with tag %d\n", recv_env->mpi_tag);
 					_STARPU_MPI_DEBUG(3, "A matching receive has been found for the incoming data with tag %d\n", recv_env->mpi_tag);
@@ -1242,7 +1251,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 					 * application submit requests in the meantime, so we
 					 * application submit requests in the meantime, so we
 					 * release the lock. */
 					 * release the lock. */
 					STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 					STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
-					_starpu_mpi_handle_new_request(found_req);
+					_starpu_mpi_handle_ready_request(found_req);
 					STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 					STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 				}
 				}
 				header_req_submitted = 0;
 				header_req_submitted = 0;
@@ -1255,7 +1264,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 	}
 	}
 
 
 	STARPU_ASSERT_MSG(_starpu_mpi_req_list_empty(detached_requests), "List of detached requests not empty");
 	STARPU_ASSERT_MSG(_starpu_mpi_req_list_empty(detached_requests), "List of detached requests not empty");
-	STARPU_ASSERT_MSG(_starpu_mpi_req_list_empty(new_requests), "List of new requests not empty");
+	STARPU_ASSERT_MSG(_starpu_mpi_req_list_empty(ready_requests), "List of ready requests not empty");
 	STARPU_ASSERT_MSG(posted_requests == 0, "Number of posted request is not zero");
 	STARPU_ASSERT_MSG(posted_requests == 0, "Number of posted request is not zero");
 	_starpu_mpi_early_request_check_termination();
 	_starpu_mpi_early_request_check_termination();
 	_starpu_mpi_early_data_check_termination();
 	_starpu_mpi_early_data_check_termination();
@@ -1326,7 +1335,7 @@ int _starpu_mpi_initialize(int *argc, char ***argv, int initialize_mpi)
 	STARPU_PTHREAD_MUTEX_INIT(&mutex, NULL);
 	STARPU_PTHREAD_MUTEX_INIT(&mutex, NULL);
 	STARPU_PTHREAD_COND_INIT(&cond_progression, NULL);
 	STARPU_PTHREAD_COND_INIT(&cond_progression, NULL);
 	STARPU_PTHREAD_COND_INIT(&cond_finished, NULL);
 	STARPU_PTHREAD_COND_INIT(&cond_finished, NULL);
-	new_requests = _starpu_mpi_req_list_new();
+	ready_requests = _starpu_mpi_req_list_new();
 
 
 	STARPU_PTHREAD_MUTEX_INIT(&detached_requests_mutex, NULL);
 	STARPU_PTHREAD_MUTEX_INIT(&detached_requests_mutex, NULL);
 	detached_requests = _starpu_mpi_req_list_new();
 	detached_requests = _starpu_mpi_req_list_new();
@@ -1402,7 +1411,7 @@ int starpu_mpi_shutdown(void)
 
 
 	/* free the request queues */
 	/* free the request queues */
 	_starpu_mpi_req_list_delete(detached_requests);
 	_starpu_mpi_req_list_delete(detached_requests);
-	_starpu_mpi_req_list_delete(new_requests);
+	_starpu_mpi_req_list_delete(ready_requests);
 
 
 	_starpu_mpi_comm_amounts_display(rank);
 	_starpu_mpi_comm_amounts_display(rank);
 	_starpu_mpi_comm_amounts_free();
 	_starpu_mpi_comm_amounts_free();
@@ -1423,3 +1432,10 @@ void starpu_mpi_data_register(starpu_data_handle_t data_handle, int tag, int ran
 	_starpu_data_set_unregister_hook(data_handle, _starpu_mpi_clear_cache);
 	_starpu_data_set_unregister_hook(data_handle, _starpu_mpi_clear_cache);
 
 
 }
 }
+
+int starpu_mpi_world_rank(void)
+{
+	int rank;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	return rank;
+}

+ 1 - 0
src/Makefile.am

@@ -181,6 +181,7 @@ libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES = 		\
 	sched_policies/eager_central_policy.c			\
 	sched_policies/eager_central_policy.c			\
 	sched_policies/eager_central_priority_policy.c		\
 	sched_policies/eager_central_priority_policy.c		\
 	sched_policies/work_stealing_policy.c			\
 	sched_policies/work_stealing_policy.c			\
+	sched_policies/locality_work_stealing_policy.c		\
 	sched_policies/deque_modeling_policy_data_aware.c	\
 	sched_policies/deque_modeling_policy_data_aware.c	\
 	sched_policies/random_policy.c				\
 	sched_policies/random_policy.c				\
 	sched_policies/stack_queues.c				\
 	sched_policies/stack_queues.c				\

+ 42 - 4
src/common/fxt.h

@@ -106,6 +106,9 @@
 #define _STARPU_FUT_TASK_WAIT_FOR_ALL	0x513b
 #define _STARPU_FUT_TASK_WAIT_FOR_ALL	0x513b
 
 
 #define _STARPU_FUT_EVENT	0x513c
 #define _STARPU_FUT_EVENT	0x513c
+#define _STARPU_FUT_THREAD_EVENT	0x513d
+
+#define	_STARPU_FUT_CODELET_DETAILS	0x513e
 
 
 #define _STARPU_FUT_LOCKING_MUTEX	0x5140	
 #define _STARPU_FUT_LOCKING_MUTEX	0x5140	
 #define _STARPU_FUT_MUTEX_LOCKED	0x5141	
 #define _STARPU_FUT_MUTEX_LOCKED	0x5141	
@@ -193,6 +196,31 @@ void _starpu_fxt_register_thread(unsigned);
 #define _STARPU_FUT_COMMIT(size) do { } while (0)
 #define _STARPU_FUT_COMMIT(size) do { } while (0)
 #endif
 #endif
 
 
+#ifdef FUT_DO_PROBE1STR
+#define _STARPU_FUT_DO_PROBE1STR(CODE, P1, str) FUT_DO_PROBE1STR(CODE, P1, str)
+#else
+/* Sometimes we need something a little more specific than the wrappers from
+ * FxT: these macro permit to put add an event with 3 (or 4) numbers followed
+ * by a string. */
+#define _STARPU_FUT_DO_PROBE1STR(CODE, P1, str)			\
+do {									\
+    if(fut_active) {							\
+	/* No more than FXT_MAX_PARAMS args are allowed */		\
+	/* we add a \0 just in case ... */				\
+	size_t len = STARPU_MIN(strlen(str)+1, (FXT_MAX_PARAMS - 1)*sizeof(unsigned long));\
+	unsigned nbargs_str = (len + sizeof(unsigned long) - 1)/(sizeof(unsigned long));\
+	unsigned nbargs = 1 + nbargs_str;				\
+	size_t total_len = FUT_SIZE(nbargs);				\
+	unsigned long *futargs =					\
+		fut_getstampedbuffer(FUT_CODE(CODE, nbargs), total_len);\
+	*(futargs++) = (unsigned long)(P1);				\
+	snprintf((char *)futargs, len, "%s", str);			\
+	((char *)futargs)[len - 1] = '\0';				\
+	_STARPU_FUT_COMMIT(total_len);					\
+    }									\
+} while (0);
+#endif
+
 #ifdef FUT_DO_PROBE2STR
 #ifdef FUT_DO_PROBE2STR
 #define _STARPU_FUT_DO_PROBE2STR(CODE, P1, P2, str) FUT_DO_PROBE2STR(CODE, P1, P2, str)
 #define _STARPU_FUT_DO_PROBE2STR(CODE, P1, P2, str) FUT_DO_PROBE2STR(CODE, P1, P2, str)
 #else
 #else
@@ -297,7 +325,7 @@ do {									\
 #ifdef FUT_DO_PROBE6STR
 #ifdef FUT_DO_PROBE6STR
 #define _STARPU_FUT_DO_PROBE6STR(CODE, P1, P2, P3, P4, P5, P6, str) FUT_DO_PROBE6STR(CODE, P1, P2, P3, P4, P5, P6, str)
 #define _STARPU_FUT_DO_PROBE6STR(CODE, P1, P2, P3, P4, P5, P6, str) FUT_DO_PROBE6STR(CODE, P1, P2, P3, P4, P5, P6, str)
 #else
 #else
-#define _STARPU_FUT_DO_PROBE5STR(CODE, P1, P2, P3, P4, P5, P6, str)	\
+#define _STARPU_FUT_DO_PROBE6STR(CODE, P1, P2, P3, P4, P5, P6, str)	\
 do {									\
 do {									\
     if(fut_active) {							\
     if(fut_active) {							\
 	/* No more than FXT_MAX_PARAMS args are allowed */		\
 	/* No more than FXT_MAX_PARAMS args are allowed */		\
@@ -324,7 +352,7 @@ do {									\
 #ifdef FUT_DO_PROBE7STR
 #ifdef FUT_DO_PROBE7STR
 #define _STARPU_FUT_DO_PROBE7STR(CODE, P1, P2, P3, P4, P5, P6, P7, str) FUT_DO_PROBE7STR(CODE, P1, P2, P3, P4, P5, P6, P7, str)
 #define _STARPU_FUT_DO_PROBE7STR(CODE, P1, P2, P3, P4, P5, P6, P7, str) FUT_DO_PROBE7STR(CODE, P1, P2, P3, P4, P5, P6, P7, str)
 #else
 #else
-#define _STARPU_FUT_DO_PROBE6STR(CODE, P1, P2, P3, P4, P5, P6, P7, str)	\
+#define _STARPU_FUT_DO_PROBE7STR(CODE, P1, P2, P3, P4, P5, P6, P7, str)	\
 do {									\
 do {									\
     if(fut_active) {							\
     if(fut_active) {							\
 	/* No more than FXT_MAX_PARAMS args are allowed */		\
 	/* No more than FXT_MAX_PARAMS args are allowed */		\
@@ -378,7 +406,7 @@ do {									\
 #define _STARPU_TRACE_WORKER_INIT_END(workerid)				\
 #define _STARPU_TRACE_WORKER_INIT_END(workerid)				\
 	FUT_DO_PROBE2(_STARPU_FUT_WORKER_INIT_END, _starpu_gettid(), (workerid));
 	FUT_DO_PROBE2(_STARPU_FUT_WORKER_INIT_END, _starpu_gettid(), (workerid));
 
 
-#define _STARPU_TRACE_START_CODELET_BODY(job)				\
+#define _STARPU_TRACE_START_CODELET_BODY(job, nimpl, archtype)				\
 do {									\
 do {									\
         const char *model_name = _starpu_job_get_model_name((job));         \
         const char *model_name = _starpu_job_get_model_name((job));         \
 	if (model_name)                                                 \
 	if (model_name)                                                 \
@@ -389,6 +417,11 @@ do {									\
 	else {                                                          \
 	else {                                                          \
 		FUT_DO_PROBE4(_STARPU_FUT_START_CODELET_BODY, (job), ((job)->task)->sched_ctx, _starpu_gettid(), 0); \
 		FUT_DO_PROBE4(_STARPU_FUT_START_CODELET_BODY, (job), ((job)->task)->sched_ctx, _starpu_gettid(), 0); \
 	}								\
 	}								\
+	{								\
+		const size_t __job_size = _starpu_job_get_data_size((job)->task->cl?(job)->task->cl->model:NULL, archtype, nimpl, (job));	\
+		const uint32_t __job_hash = _starpu_compute_buffers_footprint((job)->task->cl?(job)->task->cl->model:NULL, archtype, nimpl, (job));\
+		FUT_DO_PROBE6(_STARPU_FUT_CODELET_DETAILS, (job), ((job)->task)->sched_ctx, __job_size, __job_hash, (job)->task->tag_id, _starpu_gettid());	\
+	}								\
 } while(0);
 } while(0);
 
 
 #define _STARPU_TRACE_END_CODELET_BODY(job, nimpl, archtype)			\
 #define _STARPU_TRACE_END_CODELET_BODY(job, nimpl, archtype)			\
@@ -563,6 +596,9 @@ do {										\
 #define _STARPU_TRACE_EVENT(S)			\
 #define _STARPU_TRACE_EVENT(S)			\
 	FUT_DO_PROBESTR(_STARPU_FUT_EVENT,S)
 	FUT_DO_PROBESTR(_STARPU_FUT_EVENT,S)
 
 
+#define _STARPU_TRACE_THREAD_EVENT(S)			\
+	_STARPU_FUT_DO_PROBE1STR(_STARPU_FUT_THREAD_EVENT, _starpu_gettid(), S)
+
 #define _STARPU_TRACE_HYPERVISOR_BEGIN()  \
 #define _STARPU_TRACE_HYPERVISOR_BEGIN()  \
 	FUT_DO_PROBE1(_STARPU_FUT_HYPERVISOR_BEGIN, _starpu_gettid());
 	FUT_DO_PROBE1(_STARPU_FUT_HYPERVISOR_BEGIN, _starpu_gettid());
 
 
@@ -746,7 +782,7 @@ do {										\
 #define _STARPU_TRACE_NEW_MEM_NODE(nodeid)	do {} while(0)
 #define _STARPU_TRACE_NEW_MEM_NODE(nodeid)	do {} while(0)
 #define _STARPU_TRACE_WORKER_INIT_START(a,b,c)	do {} while(0)
 #define _STARPU_TRACE_WORKER_INIT_START(a,b,c)	do {} while(0)
 #define _STARPU_TRACE_WORKER_INIT_END(workerid)	do {} while(0)
 #define _STARPU_TRACE_WORKER_INIT_END(workerid)	do {} while(0)
-#define _STARPU_TRACE_START_CODELET_BODY(job)	do {} while(0)
+#define _STARPU_TRACE_START_CODELET_BODY(job, nimpl, archtype)	do {} while(0)
 #define _STARPU_TRACE_END_CODELET_BODY(job, nimpl, a)	do {} while(0)
 #define _STARPU_TRACE_END_CODELET_BODY(job, nimpl, a)	do {} while(0)
 #define _STARPU_TRACE_START_CALLBACK(job)	do {} while(0)
 #define _STARPU_TRACE_START_CALLBACK(job)	do {} while(0)
 #define _STARPU_TRACE_END_CALLBACK(job)		do {} while(0)
 #define _STARPU_TRACE_END_CALLBACK(job)		do {} while(0)
@@ -794,6 +830,8 @@ do {										\
 #define _STARPU_TRACE_USER_EVENT(code)		do {} while(0)
 #define _STARPU_TRACE_USER_EVENT(code)		do {} while(0)
 #define _STARPU_TRACE_SET_PROFILING(status)	do {} while(0)
 #define _STARPU_TRACE_SET_PROFILING(status)	do {} while(0)
 #define _STARPU_TRACE_TASK_WAIT_FOR_ALL		do {} while(0)
 #define _STARPU_TRACE_TASK_WAIT_FOR_ALL		do {} while(0)
+#define _STARPU_TRACE_EVENT(S)		do {} while(0)
+#define _STARPU_TRACE_THREAD_EVENT(S)		do {} while(0)
 #define _STARPU_TRACE_LOCKING_MUTEX()			do {} while(0)
 #define _STARPU_TRACE_LOCKING_MUTEX()			do {} while(0)
 #define _STARPU_TRACE_MUTEX_LOCKED()			do {} while(0)
 #define _STARPU_TRACE_MUTEX_LOCKED()			do {} while(0)
 #define _STARPU_TRACE_UNLOCKING_MUTEX()		do {} while(0)
 #define _STARPU_TRACE_UNLOCKING_MUTEX()		do {} while(0)

+ 27 - 1
src/common/thread.c

@@ -288,9 +288,35 @@ int starpu_pthread_rwlock_unlock(starpu_pthread_rwlock_t *rwlock)
 
 
 	return p_ret;
 	return p_ret;
 }
 }
+
+#if defined(STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT)
+int starpu_pthread_barrier_init(starpu_pthread_barrier_t *restrict barrier, const starpu_pthread_barrierattr_t *restrict attr, unsigned count)
+{
+	*barrier = xbt_barrier_init(count);
+	return 0;
+}
+
+int starpu_pthread_barrier_destroy(starpu_pthread_barrier_t *barrier)
+{
+	if (*barrier)
+		xbt_barrier_destroy(*barrier);
+	return 0;
+}
+
+int starpu_pthread_barrier_wait(starpu_pthread_barrier_t *barrier)
+{
+	_STARPU_TRACE_BARRIER_WAIT_BEGIN();
+
+	xbt_barrier_wait(*barrier);
+
+	_STARPU_TRACE_BARRIER_WAIT_END();
+	return 0;
+}
+#endif /* defined(STARPU_SIMGRID) */
+
 #endif /* STARPU_SIMGRID */
 #endif /* STARPU_SIMGRID */
 
 
-#if defined(STARPU_SIMGRID) || !defined(STARPU_HAVE_PTHREAD_BARRIER)
+#if (defined(STARPU_SIMGRID) && !defined(STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT)) || !defined(STARPU_HAVE_PTHREAD_BARRIER)
 int starpu_pthread_barrier_init(starpu_pthread_barrier_t *restrict barrier, const starpu_pthread_barrierattr_t *restrict attr, unsigned count)
 int starpu_pthread_barrier_init(starpu_pthread_barrier_t *restrict barrier, const starpu_pthread_barrierattr_t *restrict attr, unsigned count)
 {
 {
 	int ret = starpu_pthread_mutex_init(&barrier->mutex, NULL);
 	int ret = starpu_pthread_mutex_init(&barrier->mutex, NULL);

+ 9 - 0
src/core/jobs.c

@@ -116,6 +116,15 @@ void _starpu_job_destroy(struct _starpu_job *j)
 	_starpu_job_delete(j);
 	_starpu_job_delete(j);
 }
 }
 
 
+int _starpu_job_finished(struct _starpu_job *j)
+{
+	int ret;
+	STARPU_PTHREAD_MUTEX_LOCK(&j->sync_mutex);
+	ret = j->terminated == 2;
+	STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
+	return ret;
+}
+
 void _starpu_wait_job(struct _starpu_job *j)
 void _starpu_wait_job(struct _starpu_job *j)
 {
 {
 	STARPU_ASSERT(j->task);
 	STARPU_ASSERT(j->task);

+ 3 - 0
src/core/jobs.h

@@ -182,6 +182,9 @@ struct _starpu_job* STARPU_ATTRIBUTE_MALLOC _starpu_job_create(struct starpu_tas
 /* Destroy the data structure associated to the job structure */
 /* Destroy the data structure associated to the job structure */
 void _starpu_job_destroy(struct _starpu_job *j);
 void _starpu_job_destroy(struct _starpu_job *j);
 
 
+/* Test for the termination of the job */
+int _starpu_job_finished(struct _starpu_job *j);
+
 /* Wait for the termination of the job */
 /* Wait for the termination of the job */
 void _starpu_wait_job(struct _starpu_job *j);
 void _starpu_wait_job(struct _starpu_job *j);
 
 

+ 66 - 2
src/core/sched_ctx.c

@@ -60,7 +60,11 @@ void _starpu_worker_gets_out_of_ctx(unsigned sched_ctx_id, struct _starpu_worker
 	{
 	{
 		struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 		struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 		if(sched_ctx && sched_ctx->sched_policy && sched_ctx->sched_policy->remove_workers)
 		if(sched_ctx && sched_ctx->sched_policy && sched_ctx->sched_policy->remove_workers)
+		{
+			_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
 			sched_ctx->sched_policy->remove_workers(sched_ctx_id, &worker->workerid, 1);
 			sched_ctx->sched_policy->remove_workers(sched_ctx_id, &worker->workerid, 1);
+			_STARPU_TRACE_WORKER_SCHEDULING_POP;
+		}
 		_starpu_sched_ctx_list_remove(&worker->sched_ctx_list, sched_ctx_id);
 		_starpu_sched_ctx_list_remove(&worker->sched_ctx_list, sched_ctx_id);
 		worker->nsched_ctxs--;
 		worker->nsched_ctxs--;
 	}
 	}
@@ -185,6 +189,7 @@ static void _starpu_add_workers_to_sched_ctx(struct _starpu_sched_ctx *sched_ctx
 	}
 	}
 	else if(sched_ctx->sched_policy->add_workers)
 	else if(sched_ctx->sched_policy->add_workers)
 	{
 	{
+		_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
 		if(added_workers)
 		if(added_workers)
 		{
 		{
 			if(*n_added_workers > 0)
 			if(*n_added_workers > 0)
@@ -192,6 +197,7 @@ static void _starpu_add_workers_to_sched_ctx(struct _starpu_sched_ctx *sched_ctx
 		}
 		}
 		else
 		else
 			sched_ctx->sched_policy->add_workers(sched_ctx->id, workers_to_add, nworkers_to_add);
 			sched_ctx->sched_policy->add_workers(sched_ctx->id, workers_to_add, nworkers_to_add);
+		_STARPU_TRACE_WORKER_SCHEDULING_POP;
 	}
 	}
 	return;
 	return;
 }
 }
@@ -229,7 +235,11 @@ static void _starpu_sched_ctx_free_scheduling_data(struct _starpu_sched_ctx *sch
 	unsigned nworkers_ctx = starpu_sched_ctx_get_workers_list(sched_ctx->id, &workerids);
 	unsigned nworkers_ctx = starpu_sched_ctx_get_workers_list(sched_ctx->id, &workerids);
 
 
 	if(nworkers_ctx > 0 && sched_ctx->sched_policy->remove_workers)
 	if(nworkers_ctx > 0 && sched_ctx->sched_policy->remove_workers)
+	{
+		_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
 		sched_ctx->sched_policy->remove_workers(sched_ctx->id, workerids, nworkers_ctx);
 		sched_ctx->sched_policy->remove_workers(sched_ctx->id, workerids, nworkers_ctx);
+		_STARPU_TRACE_WORKER_SCHEDULING_POP;
+	}
 
 
 	free(workerids);
 	free(workerids);
 	return;
 	return;
@@ -523,6 +533,7 @@ unsigned starpu_sched_ctx_create(int *workerids, int nworkers, const char *sched
 	int max_prio = 0;
 	int max_prio = 0;
 	struct starpu_sched_policy *sched_policy = NULL;
 	struct starpu_sched_policy *sched_policy = NULL;
 	unsigned hierarchy_level = 0;
 	unsigned hierarchy_level = 0;
+	unsigned nesting_sched_ctx = STARPU_NMAX_SCHED_CTXS;
 
 
 	va_start(varg_list, sched_ctx_name);
 	va_start(varg_list, sched_ctx_name);
 	while ((arg_type = va_arg(varg_list, int)) != 0)
 	while ((arg_type = va_arg(varg_list, int)) != 0)
@@ -551,6 +562,10 @@ unsigned starpu_sched_ctx_create(int *workerids, int nworkers, const char *sched
 		{
 		{
 			hierarchy_level = va_arg(varg_list, unsigned);
 			hierarchy_level = va_arg(varg_list, unsigned);
 		}
 		}
+		else if (arg_type == STARPU_SCHED_CTX_NESTED)
+		{
+			nesting_sched_ctx = va_arg(varg_list, unsigned);
+		}
 		else
 		else
 		{
 		{
 			STARPU_ABORT_MSG("Unrecognized argument %d\n", arg_type);
 			STARPU_ABORT_MSG("Unrecognized argument %d\n", arg_type);
@@ -562,6 +577,7 @@ unsigned starpu_sched_ctx_create(int *workerids, int nworkers, const char *sched
 	struct _starpu_sched_ctx *sched_ctx = NULL;
 	struct _starpu_sched_ctx *sched_ctx = NULL;
 	sched_ctx = _starpu_create_sched_ctx(sched_policy, workerids, nworkers, 0, sched_ctx_name, min_prio_set, min_prio, max_prio_set, max_prio);
 	sched_ctx = _starpu_create_sched_ctx(sched_policy, workerids, nworkers, 0, sched_ctx_name, min_prio_set, min_prio, max_prio_set, max_prio);
 	sched_ctx->hierarchy_level = hierarchy_level;
 	sched_ctx->hierarchy_level = hierarchy_level;
+	sched_ctx->nesting_sched_ctx = nesting_sched_ctx;
 
 
 	_starpu_unlock_mutex_if_prev_locked();
 	_starpu_unlock_mutex_if_prev_locked();
 	int *added_workerids;
 	int *added_workerids;
@@ -1132,6 +1148,8 @@ struct starpu_worker_collection* starpu_sched_ctx_create_worker_collection(unsig
 	case STARPU_WORKER_TREE:
 	case STARPU_WORKER_TREE:
 		sched_ctx->workers->has_next = worker_tree.has_next;
 		sched_ctx->workers->has_next = worker_tree.has_next;
 		sched_ctx->workers->get_next = worker_tree.get_next;
 		sched_ctx->workers->get_next = worker_tree.get_next;
+		sched_ctx->workers->has_next_master = worker_tree.has_next_master;
+		sched_ctx->workers->get_next_master = worker_tree.get_next_master;
 		sched_ctx->workers->add = worker_tree.add;
 		sched_ctx->workers->add = worker_tree.add;
 		sched_ctx->workers->remove = worker_tree.remove;
 		sched_ctx->workers->remove = worker_tree.remove;
 		sched_ctx->workers->init = worker_tree.init;
 		sched_ctx->workers->init = worker_tree.init;
@@ -1144,6 +1162,8 @@ struct starpu_worker_collection* starpu_sched_ctx_create_worker_collection(unsig
 	default:
 	default:
 		sched_ctx->workers->has_next = worker_list.has_next;
 		sched_ctx->workers->has_next = worker_list.has_next;
 		sched_ctx->workers->get_next = worker_list.get_next;
 		sched_ctx->workers->get_next = worker_list.get_next;
+		sched_ctx->workers->has_next_master = worker_list.has_next_master;
+		sched_ctx->workers->get_next_master = worker_list.get_next_master;
 		sched_ctx->workers->add = worker_list.add;
 		sched_ctx->workers->add = worker_list.add;
 		sched_ctx->workers->remove = worker_list.remove;
 		sched_ctx->workers->remove = worker_list.remove;
 		sched_ctx->workers->init = worker_list.init;
 		sched_ctx->workers->init = worker_list.init;
@@ -1171,6 +1191,7 @@ void starpu_sched_ctx_display_workers(unsigned sched_ctx_id, FILE *f)
 		starpu_worker_get_name(workerids[i], name, 256);
 		starpu_worker_get_name(workerids[i], name, 256);
 		fprintf(f, "\t\t%s\n", name);
 		fprintf(f, "\t\t%s\n", name);
 	}
 	}
+	free(workerids);
 }
 }
 
 
 unsigned starpu_sched_ctx_get_workers_list(unsigned sched_ctx_id, int **workerids)
 unsigned starpu_sched_ctx_get_workers_list(unsigned sched_ctx_id, int **workerids)
@@ -1605,6 +1626,44 @@ void starpu_sched_ctx_bind_current_thread_to_cpuid(unsigned cpuid STARPU_ATTRIBU
 
 
 }
 }
 
 
+unsigned starpu_sched_ctx_worker_is_master_for_child_ctx(int workerid, unsigned sched_ctx_id)
+{
+	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
+	struct _starpu_sched_ctx_list *l = NULL;
+	struct _starpu_sched_ctx *sched_ctx = NULL;
+	for (l = worker->sched_ctx_list; l; l = l->next)
+	{ 
+		 sched_ctx = _starpu_get_sched_ctx_struct(l->sched_ctx);
+		if(sched_ctx-> main_master == workerid && sched_ctx->nesting_sched_ctx == sched_ctx_id)
+			return sched_ctx->id;
+	}
+	return STARPU_NMAX_SCHED_CTXS;
+
+}
+
+void starpu_sched_ctx_revert_task_counters(unsigned sched_ctx_id, double flops)
+{
+        _starpu_decrement_nsubmitted_tasks_of_sched_ctx(sched_ctx_id);
+        _starpu_decrement_nready_tasks_of_sched_ctx(sched_ctx_id, flops);
+}
+
+void starpu_sched_ctx_move_task_to_ctx(struct starpu_task *task, unsigned sched_ctx)
+{
+	int workerid = starpu_worker_get_id();
+	struct _starpu_worker *worker  = NULL;
+	if(workerid != -1)
+	{
+		worker = _starpu_get_worker_struct(workerid);
+		STARPU_PTHREAD_MUTEX_UNLOCK(&worker->sched_mutex);
+	}
+
+	task->sched_ctx = sched_ctx;
+	_starpu_task_submit_nodeps(task);
+
+	if(workerid != -1)
+		STARPU_PTHREAD_MUTEX_LOCK(&worker->sched_mutex);
+}
+
 static unsigned _worker_sleeping_in_other_ctx(unsigned sched_ctx_id, int workerid)
 static unsigned _worker_sleeping_in_other_ctx(unsigned sched_ctx_id, int workerid)
 {
 {
 	int s;
 	int s;
@@ -1620,6 +1679,7 @@ static unsigned _worker_sleeping_in_other_ctx(unsigned sched_ctx_id, int workeri
 	return 0;
 	return 0;
 
 
 }
 }
+
 static void _starpu_sched_ctx_get_workers_to_sleep(unsigned sched_ctx_id, int *workerids, int nworkers, int master)
 static void _starpu_sched_ctx_get_workers_to_sleep(unsigned sched_ctx_id, int *workerids, int nworkers, int master)
 {
 {
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
@@ -1643,7 +1703,6 @@ static void _starpu_sched_ctx_get_workers_to_sleep(unsigned sched_ctx_id, int *w
 		workerid = workerids[w];
 		workerid = workerids[w];
 		if((current_worker_id == -1 || workerid != current_worker_id) && !sleeping[w])
 		if((current_worker_id == -1 || workerid != current_worker_id) && !sleeping[w])
 		{
 		{
-			sched_ctx->sleeping[workerids[w]] = 1;
 			sem_wait(&sched_ctx->fall_asleep_sem[master]);
 			sem_wait(&sched_ctx->fall_asleep_sem[master]);
 		}
 		}
 	}
 	}
@@ -1652,7 +1711,10 @@ static void _starpu_sched_ctx_get_workers_to_sleep(unsigned sched_ctx_id, int *w
 
 
 void _starpu_sched_ctx_signal_worker_blocked(unsigned sched_ctx_id, int workerid)
 void _starpu_sched_ctx_signal_worker_blocked(unsigned sched_ctx_id, int workerid)
 {
 {
+	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
+	worker->slave = 1;
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	sched_ctx->sleeping[workerid] = 1;
 	int master = sched_ctx->master[workerid];
 	int master = sched_ctx->master[workerid];
 	sem_post(&sched_ctx->fall_asleep_sem[master]);
 	sem_post(&sched_ctx->fall_asleep_sem[master]);
 
 
@@ -1666,6 +1728,9 @@ void _starpu_sched_ctx_signal_worker_woke_up(unsigned sched_ctx_id, int workerid
 	sem_post(&sched_ctx->wake_up_sem[master]);
 	sem_post(&sched_ctx->wake_up_sem[master]);
 	sched_ctx->sleeping[workerid] = 0;
 	sched_ctx->sleeping[workerid] = 0;
 	sched_ctx->master[workerid] = -1;
 	sched_ctx->master[workerid] = -1;
+	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
+	worker->slave = 0;
+
 	return;
 	return;
 }
 }
 
 
@@ -1720,7 +1785,6 @@ void starpu_sched_ctx_get_available_cpuids(unsigned sched_ctx_id, int **cpuids,
 	int current_worker_id = starpu_worker_get_id();
 	int current_worker_id = starpu_worker_get_id();
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 	struct starpu_worker_collection *workers = sched_ctx->workers;
 	struct starpu_worker_collection *workers = sched_ctx->workers;
-
 	(*cpuids) = (int*)malloc(workers->nworkers*sizeof(int));
 	(*cpuids) = (int*)malloc(workers->nworkers*sizeof(int));
 	int w = 0;
 	int w = 0;
 
 

+ 3 - 0
src/core/sched_ctx.h

@@ -147,6 +147,9 @@ struct _starpu_sched_ctx
 	/* bool indicating if the workers is sleeping in this ctx */
 	/* bool indicating if the workers is sleeping in this ctx */
 	unsigned sleeping[STARPU_NMAXWORKERS];
 	unsigned sleeping[STARPU_NMAXWORKERS];
 
 
+	/* ctx nesting the current ctx */
+	unsigned nesting_sched_ctx;
+
 };
 };
 
 
 struct _starpu_machine_config;
 struct _starpu_machine_config;

+ 26 - 2
src/core/sched_policy.c

@@ -38,6 +38,7 @@ static struct starpu_sched_policy *predefined_policies[] =
 	&_starpu_sched_eager_policy,
 	&_starpu_sched_eager_policy,
 	&_starpu_sched_prio_policy,
 	&_starpu_sched_prio_policy,
 	&_starpu_sched_random_policy,
 	&_starpu_sched_random_policy,
+	&_starpu_sched_lws_policy,
 	&_starpu_sched_ws_policy,
 	&_starpu_sched_ws_policy,
 	&_starpu_sched_dm_policy,
 	&_starpu_sched_dm_policy,
 	&_starpu_sched_dmda_policy,
 	&_starpu_sched_dmda_policy,
@@ -174,14 +175,20 @@ void _starpu_init_sched_policy(struct _starpu_machine_config *config, struct _st
 
 
 	load_sched_policy(selected_policy, sched_ctx);
 	load_sched_policy(selected_policy, sched_ctx);
 
 
+	_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
 	sched_ctx->sched_policy->init_sched(sched_ctx->id);
 	sched_ctx->sched_policy->init_sched(sched_ctx->id);
+	_STARPU_TRACE_WORKER_SCHEDULING_POP;
 }
 }
 
 
 void _starpu_deinit_sched_policy(struct _starpu_sched_ctx *sched_ctx)
 void _starpu_deinit_sched_policy(struct _starpu_sched_ctx *sched_ctx)
 {
 {
 	struct starpu_sched_policy *policy = sched_ctx->sched_policy;
 	struct starpu_sched_policy *policy = sched_ctx->sched_policy;
 	if (policy->deinit_sched)
 	if (policy->deinit_sched)
+	{
+		_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
 		policy->deinit_sched(sched_ctx->id);
 		policy->deinit_sched(sched_ctx->id);
+		_STARPU_TRACE_WORKER_SCHEDULING_POP;
+	}
 }
 }
 
 
 static void _starpu_push_task_on_specific_worker_notify_sched(struct starpu_task *task, struct _starpu_worker *worker, int workerid, int perf_workerid)
 static void _starpu_push_task_on_specific_worker_notify_sched(struct starpu_task *task, struct _starpu_worker *worker, int workerid, int perf_workerid)
@@ -193,7 +200,11 @@ static void _starpu_push_task_on_specific_worker_notify_sched(struct starpu_task
         {
         {
 		sched_ctx = _starpu_get_sched_ctx_struct(l->sched_ctx);
 		sched_ctx = _starpu_get_sched_ctx_struct(l->sched_ctx);
 		if (sched_ctx->sched_policy != NULL && sched_ctx->sched_policy->push_task_notify)
 		if (sched_ctx->sched_policy != NULL && sched_ctx->sched_policy->push_task_notify)
+		{
+			_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
 			sched_ctx->sched_policy->push_task_notify(task, workerid, perf_workerid, sched_ctx->id);
 			sched_ctx->sched_policy->push_task_notify(task, workerid, perf_workerid, sched_ctx->id);
+			_STARPU_TRACE_WORKER_SCHEDULING_POP;
+		}
 	}
 	}
 }
 }
 
 
@@ -867,22 +878,31 @@ profiling:
 
 
 struct starpu_task *_starpu_pop_every_task(struct _starpu_sched_ctx *sched_ctx)
 struct starpu_task *_starpu_pop_every_task(struct _starpu_sched_ctx *sched_ctx)
 {
 {
+	struct starpu_task *task = NULL;
 	if(sched_ctx->sched_policy)
 	if(sched_ctx->sched_policy)
 	{
 	{
 		STARPU_ASSERT(sched_ctx->sched_policy->pop_every_task);
 		STARPU_ASSERT(sched_ctx->sched_policy->pop_every_task);
 		
 		
 		/* TODO set profiling info */
 		/* TODO set profiling info */
 		if(sched_ctx->sched_policy->pop_every_task)
 		if(sched_ctx->sched_policy->pop_every_task)
-			return sched_ctx->sched_policy->pop_every_task(sched_ctx->id);
+		{
+			_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
+			task = sched_ctx->sched_policy->pop_every_task(sched_ctx->id);
+			_STARPU_TRACE_WORKER_SCHEDULING_POP;
+		}
 	}
 	}
-	return NULL;
+	return task;
 }
 }
 
 
 void _starpu_sched_pre_exec_hook(struct starpu_task *task)
 void _starpu_sched_pre_exec_hook(struct starpu_task *task)
 {
 {
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(task->sched_ctx);
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(task->sched_ctx);
 	if (sched_ctx->sched_policy && sched_ctx->sched_policy->pre_exec_hook)
 	if (sched_ctx->sched_policy && sched_ctx->sched_policy->pre_exec_hook)
+	{
+		_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
 		sched_ctx->sched_policy->pre_exec_hook(task);
 		sched_ctx->sched_policy->pre_exec_hook(task);
+		_STARPU_TRACE_WORKER_SCHEDULING_POP;
+	}
 }
 }
 
 
 void _starpu_sched_post_exec_hook(struct starpu_task *task)
 void _starpu_sched_post_exec_hook(struct starpu_task *task)
@@ -890,7 +910,11 @@ void _starpu_sched_post_exec_hook(struct starpu_task *task)
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(task->sched_ctx);
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(task->sched_ctx);
 
 
 	if (sched_ctx->sched_policy && sched_ctx->sched_policy->post_exec_hook)
 	if (sched_ctx->sched_policy && sched_ctx->sched_policy->post_exec_hook)
+	{
+		_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
 		sched_ctx->sched_policy->post_exec_hook(task);
 		sched_ctx->sched_policy->post_exec_hook(task);
+		_STARPU_TRACE_WORKER_SCHEDULING_POP;
+	}
 }
 }
 
 
 void _starpu_wait_on_sched_event(void)
 void _starpu_wait_on_sched_event(void)

+ 1 - 0
src/core/sched_policy.h

@@ -58,6 +58,7 @@ void _starpu_print_idle_time();
 /*
 /*
  *	Predefined policies
  *	Predefined policies
  */
  */
+extern struct starpu_sched_policy _starpu_sched_lws_policy;
 extern struct starpu_sched_policy _starpu_sched_ws_policy;
 extern struct starpu_sched_policy _starpu_sched_ws_policy;
 extern struct starpu_sched_policy _starpu_sched_prio_policy;
 extern struct starpu_sched_policy _starpu_sched_prio_policy;
 extern struct starpu_sched_policy _starpu_sched_random_policy;
 extern struct starpu_sched_policy _starpu_sched_random_policy;

+ 21 - 3
src/core/simgrid.c

@@ -33,6 +33,8 @@ extern int starpu_main(int argc, char *argv[]);
 extern int smpi_main(int (*realmain) (int argc, char *argv[]), int argc, char *argv[]);
 extern int smpi_main(int (*realmain) (int argc, char *argv[]), int argc, char *argv[]);
 #pragma weak smpi_simulated_main_
 #pragma weak smpi_simulated_main_
 extern int smpi_simulated_main_(int argc, char *argv[]);
 extern int smpi_simulated_main_(int argc, char *argv[]);
+#pragma weak starpu_mpi_world_rank
+extern int starpu_mpi_world_rank(void);
 
 
 #define _starpu_simgrid_running_smpi() (getenv("SMPI_GLOBAL_SIZE") != NULL)
 #define _starpu_simgrid_running_smpi() (getenv("SMPI_GLOBAL_SIZE") != NULL)
 
 
@@ -48,6 +50,13 @@ int do_starpu_main(int argc STARPU_ATTRIBUTE_UNUSED, char *argv[] STARPU_ATTRIBU
 	return starpu_main(args->argc, args->argv);
 	return starpu_main(args->argc, args->argv);
 }
 }
 
 
+#ifdef HAVE_MSG_ENVIRONMENT_GET_ROUTING_ROOT
+#ifdef HAVE_MSG_GET_AS_BY_NAME
+static msg_as_t _starpu_simgrid_get_as_by_name(const char *name)
+{
+	return MSG_get_as_by_name(name);
+}
+#else /* HAVE_MSG_GET_AS_BY_NAME */
 static msg_as_t __starpu_simgrid_get_as_by_name(msg_as_t root, const char *name)
 static msg_as_t __starpu_simgrid_get_as_by_name(msg_as_t root, const char *name)
 {
 {
 	xbt_dict_t dict;
 	xbt_dict_t dict;
@@ -69,6 +78,8 @@ static msg_as_t _starpu_simgrid_get_as_by_name(const char *name)
 {
 {
 	return __starpu_simgrid_get_as_by_name(MSG_environment_get_routing_root(), name);
 	return __starpu_simgrid_get_as_by_name(MSG_environment_get_routing_root(), name);
 }
 }
+#endif /* HAVE_MSG_GET_AS_BY_NAME */
+#endif /* HAVE_MSG_ENVIRONMENT_GET_ROUTING_ROOT */
 
 
 int _starpu_simgrid_get_nbhosts(const char *prefix)
 int _starpu_simgrid_get_nbhosts(const char *prefix)
 {
 {
@@ -77,13 +88,16 @@ int _starpu_simgrid_get_nbhosts(const char *prefix)
 	unsigned i, nb;
 	unsigned i, nb;
 	unsigned len = strlen(prefix);
 	unsigned len = strlen(prefix);
 
 
+#ifdef HAVE_MSG_ENVIRONMENT_GET_ROUTING_ROOT
 	if (_starpu_simgrid_running_smpi())
 	if (_starpu_simgrid_running_smpi())
 	{
 	{
 		char name[16];
 		char name[16];
-		snprintf(name, sizeof(name), STARPU_MPI_AS_PREFIX"%u", smpi_current_rank);
+		STARPU_ASSERT(starpu_mpi_world_rank);
+		snprintf(name, sizeof(name), STARPU_MPI_AS_PREFIX"%u", starpu_mpi_world_rank());
 		hosts = MSG_environment_as_get_hosts(_starpu_simgrid_get_as_by_name(name));
 		hosts = MSG_environment_as_get_hosts(_starpu_simgrid_get_as_by_name(name));
 	}
 	}
 	else
 	else
+#endif /* HAVE_MSG_ENVIRONMENT_GET_ROUTING_ROOT */
 		hosts = MSG_hosts_as_dynar();
 		hosts = MSG_hosts_as_dynar();
 	nb = xbt_dynar_length(hosts);
 	nb = xbt_dynar_length(hosts);
 
 
@@ -125,7 +139,8 @@ msg_host_t _starpu_simgrid_get_host_by_name(const char *name)
 	if (_starpu_simgrid_running_smpi())
 	if (_starpu_simgrid_running_smpi())
 	{
 	{
 		char mpiname[16];
 		char mpiname[16];
-		snprintf(mpiname, sizeof(mpiname), "%d-%s", smpi_current_rank, name);
+		STARPU_ASSERT(starpu_mpi_world_rank);
+		snprintf(mpiname, sizeof(mpiname), "%d-%s", starpu_mpi_world_rank(), name);
 		return MSG_get_host_by_name(mpiname);
 		return MSG_get_host_by_name(mpiname);
 	}
 	}
 	else
 	else
@@ -178,6 +193,7 @@ void _starpu_simgrid_init()
 	xbt_dynar_t hosts;
 	xbt_dynar_t hosts;
 	int i;
 	int i;
 
 
+#ifdef HAVE_MSG_ENVIRONMENT_GET_ROUTING_ROOT
 	if (_starpu_simgrid_running_smpi())
 	if (_starpu_simgrid_running_smpi())
 	{
 	{
 		/* Take back hand to create the local platform for this MPI
 		/* Take back hand to create the local platform for this MPI
@@ -191,7 +207,8 @@ void _starpu_simgrid_init()
 		char template[] = "/tmp/"STARPU_MPI_AS_PREFIX"-platform-XXXXXX.xml";
 		char template[] = "/tmp/"STARPU_MPI_AS_PREFIX"-platform-XXXXXX.xml";
 		int ret;
 		int ret;
 
 
-		snprintf(asname, sizeof(asname), STARPU_MPI_AS_PREFIX"%u", smpi_current_rank);
+		STARPU_ASSERT(starpu_mpi_world_rank);
+		snprintf(asname, sizeof(asname), STARPU_MPI_AS_PREFIX"%u", starpu_mpi_world_rank());
 
 
 		/* Get XML platform */
 		/* Get XML platform */
 		_starpu_simgrid_get_platform_path(path, sizeof(path));
 		_starpu_simgrid_get_platform_path(path, sizeof(path));
@@ -212,6 +229,7 @@ void _starpu_simgrid_init()
 		hosts = MSG_environment_as_get_hosts(_starpu_simgrid_get_as_by_name(asname));
 		hosts = MSG_environment_as_get_hosts(_starpu_simgrid_get_as_by_name(asname));
 	}
 	}
 	else
 	else
+#endif /* HAVE_MSG_ENVIRONMENT_GET_ROUTING_ROOT */
 		hosts = MSG_hosts_as_dynar();
 		hosts = MSG_hosts_as_dynar();
 
 
 	int nb = xbt_dynar_length(hosts);
 	int nb = xbt_dynar_length(hosts);

+ 7 - 0
src/core/task.c

@@ -187,6 +187,13 @@ void starpu_task_destroy(struct starpu_task *task)
 	_starpu_task_destroy(task);
 	_starpu_task_destroy(task);
 }
 }
 
 
+int starpu_task_finished(struct starpu_task *task)
+{
+	STARPU_ASSERT(task);
+	STARPU_ASSERT_MSG(!task->detach, "starpu_task_finished can only be called on tasks with detach = 0");
+	return _starpu_job_finished(_starpu_get_job_associated_to_task(task));
+}
+
 int starpu_task_wait(struct starpu_task *task)
 int starpu_task_wait(struct starpu_task *task)
 {
 {
         _STARPU_LOG_IN();
         _STARPU_LOG_IN();

+ 54 - 38
src/core/workers.c

@@ -467,6 +467,7 @@ static void _starpu_worker_init(struct _starpu_worker *workerarg, struct _starpu
 	workerarg->reverse_phase[1] = 0;
 	workerarg->reverse_phase[1] = 0;
 	workerarg->pop_ctx_priority = 1;
 	workerarg->pop_ctx_priority = 1;
 	workerarg->sched_mutex_locked = 0;
 	workerarg->sched_mutex_locked = 0;
+	workerarg->slave = 0;
 
 
 	/* cpu_set/hwloc_cpu_set initialized in topology.c */
 	/* cpu_set/hwloc_cpu_set initialized in topology.c */
 }
 }
@@ -516,7 +517,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 
 
 	/* Launch workers asynchronously */
 	/* Launch workers asynchronously */
 	unsigned cpu = 0;
 	unsigned cpu = 0;
-	unsigned worker;
+	unsigned worker, i;
 
 
 #if defined(STARPU_PERF_DEBUG) && !defined(STARPU_SIMGRID)
 #if defined(STARPU_PERF_DEBUG) && !defined(STARPU_SIMGRID)
 	/* Get itimer of the main thread, to set it for the worker threads */
 	/* Get itimer of the main thread, to set it for the worker threads */
@@ -526,6 +527,16 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 #ifdef HAVE_AYUDAME_H
 #ifdef HAVE_AYUDAME_H
 	if (AYU_event) AYU_event(AYU_INIT, 0, NULL);
 	if (AYU_event) AYU_event(AYU_INIT, 0, NULL);
 #endif
 #endif
+
+#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
+	for (i = 0; i < sizeof(cuda_worker_set)/sizeof(cuda_worker_set[0]); i++)
+		cuda_worker_set[i].workers = NULL;
+#endif
+#ifdef STARPU_USE_MIC
+	for (i = 0; i < sizeof(mic_worker_set)/sizeof(mic_worker_set[0]); i++)
+		mic_worker_set[i].workers = NULL;
+#endif
+
 	for (worker = 0; worker < nworkers; worker++)
 	for (worker = 0; worker < nworkers; worker++)
 	{
 	{
 		struct _starpu_worker *workerarg = &pconfig->workers[worker];
 		struct _starpu_worker *workerarg = &pconfig->workers[worker];
@@ -575,44 +586,44 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
 			case STARPU_CUDA_WORKER:
 			case STARPU_CUDA_WORKER:
 				driver.id.cuda_id = workerarg->devid;
 				driver.id.cuda_id = workerarg->devid;
-				if (_starpu_may_launch_driver(pconfig->conf, &driver))
-				{
-					/* We spawn only one thread per CUDA device,
-					 * which will control all CUDA workers of this
-					 * device. (by using a worker set). */
-					if (cuda_worker_set[devid].started)
-						goto worker_set_initialized;
+				workerarg->set = &cuda_worker_set[devid];
 
 
-					cuda_worker_set[devid].nworkers = starpu_get_env_number_default("STARPU_NWORKER_PER_CUDA", 1);
-					cuda_worker_set[devid].workers = workerarg;
-					cuda_worker_set[devid].set_is_initialized = 0;
+				/* We spawn only one thread per CUDA device,
+				 * which will control all CUDA workers of this
+				 * device. (by using a worker set). */
+				if (cuda_worker_set[devid].workers)
+					break;
 
 
-					STARPU_PTHREAD_CREATE_ON(
-						workerarg->name,
-						&cuda_worker_set[devid].worker_thread,
-						NULL,
-						_starpu_cuda_worker,
-						&cuda_worker_set[devid],
-						worker+1);
-#ifdef STARPU_USE_FXT
-					STARPU_PTHREAD_MUTEX_LOCK(&workerarg->mutex);
-					while (!workerarg->worker_is_running)
-						STARPU_PTHREAD_COND_WAIT(&workerarg->started_cond, &workerarg->mutex);
-					STARPU_PTHREAD_MUTEX_UNLOCK(&workerarg->mutex);
-#endif
-					STARPU_PTHREAD_MUTEX_LOCK(&cuda_worker_set[devid].mutex);
-					while (!cuda_worker_set[devid].set_is_initialized)
-						STARPU_PTHREAD_COND_WAIT(&cuda_worker_set[devid].ready_cond,
-									 &cuda_worker_set[devid].mutex);
-					STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_worker_set[devid].mutex);
-					cuda_worker_set[devid].started = 1;
-		worker_set_initialized:
-					workerarg->set = &cuda_worker_set[devid];
-				}
-				else
+				cuda_worker_set[devid].nworkers = starpu_get_env_number_default("STARPU_NWORKER_PER_CUDA", 1);
+				cuda_worker_set[devid].workers = workerarg;
+				cuda_worker_set[devid].set_is_initialized = 0;
+
+				if (!_starpu_may_launch_driver(pconfig->conf, &driver))
 				{
 				{
 					workerarg->run_by_starpu = 0;
 					workerarg->run_by_starpu = 0;
+					break;
 				}
 				}
+
+				STARPU_PTHREAD_CREATE_ON(
+					workerarg->name,
+					&cuda_worker_set[devid].worker_thread,
+					NULL,
+					_starpu_cuda_worker,
+					&cuda_worker_set[devid],
+					worker+1);
+#ifdef STARPU_USE_FXT
+				STARPU_PTHREAD_MUTEX_LOCK(&workerarg->mutex);
+				while (!workerarg->worker_is_running)
+					STARPU_PTHREAD_COND_WAIT(&workerarg->started_cond, &workerarg->mutex);
+				STARPU_PTHREAD_MUTEX_UNLOCK(&workerarg->mutex);
+#endif
+				STARPU_PTHREAD_MUTEX_LOCK(&cuda_worker_set[devid].mutex);
+				while (!cuda_worker_set[devid].set_is_initialized)
+					STARPU_PTHREAD_COND_WAIT(&cuda_worker_set[devid].ready_cond,
+								 &cuda_worker_set[devid].mutex);
+				STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_worker_set[devid].mutex);
+				cuda_worker_set[devid].started = 1;
+
 				break;
 				break;
 #endif
 #endif
 #if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
 #if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
@@ -642,11 +653,13 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 #endif
 #endif
 #ifdef STARPU_USE_MIC
 #ifdef STARPU_USE_MIC
 			case STARPU_MIC_WORKER:
 			case STARPU_MIC_WORKER:
+				workerarg->set = &mic_worker_set[devid];
+
 				/* We spawn only one thread
 				/* We spawn only one thread
 				 * per MIC device, which will control all MIC
 				 * per MIC device, which will control all MIC
 				 * workers of this device. (by using a worker set). */
 				 * workers of this device. (by using a worker set). */
-				if (mic_worker_set[devid].started)
-					goto worker_set_initialized;
+				if (mic_worker_set[devid].workers)
+					break;
 
 
 				mic_worker_set[devid].nworkers = pconfig->topology.nmiccores[devid];
 				mic_worker_set[devid].nworkers = pconfig->topology.nmiccores[devid];
 
 
@@ -678,8 +691,6 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 				STARPU_PTHREAD_MUTEX_UNLOCK(&mic_worker_set[devid].mutex);
 				STARPU_PTHREAD_MUTEX_UNLOCK(&mic_worker_set[devid].mutex);
 
 
 				mic_worker_set[devid].started = 1;
 				mic_worker_set[devid].started = 1;
-		worker_set_initialized:
-				workerarg->set = &mic_worker_set[devid];
 
 
 				break;
 				break;
 #endif /* STARPU_USE_MIC */
 #endif /* STARPU_USE_MIC */
@@ -1374,6 +1385,11 @@ unsigned starpu_worker_get_count(void)
 	return config.topology.nworkers;
 	return config.topology.nworkers;
 }
 }
 
 
+unsigned starpu_worker_is_slave(int workerid)
+{
+	return config.workers[workerid].slave;
+}
+
 int starpu_worker_get_count_by_type(enum starpu_worker_archtype type)
 int starpu_worker_get_count_by_type(enum starpu_worker_archtype type)
 {
 {
 	switch (type)
 	switch (type)

+ 3 - 0
src/core/workers.h

@@ -112,6 +112,9 @@ LIST_TYPE(_starpu_worker,
 	/* flag to know if sched_mutex is locked or not */
 	/* flag to know if sched_mutex is locked or not */
 	unsigned sched_mutex_locked;
 	unsigned sched_mutex_locked;
 
 
+	/* bool to indicate if the worker is slave in a ctx */
+	unsigned slave;
+
 #ifdef __GLIBC__
 #ifdef __GLIBC__
 	cpu_set_t cpu_set;
 	cpu_set_t cpu_set;
 #endif /* __GLIBC__ */
 #endif /* __GLIBC__ */

+ 15 - 8
src/datawizard/coherency.c

@@ -151,7 +151,7 @@ void _starpu_update_data_state(starpu_data_handle_t handle,
 
 
 	/* the data is present now */
 	/* the data is present now */
 	unsigned requesting_node = requesting_replicate->memory_node;
 	unsigned requesting_node = requesting_replicate->memory_node;
-	requesting_replicate->requested[requesting_node] = 0;
+	requesting_replicate->requested &= ~(1UL << requesting_node);
 
 
 	if (mode & STARPU_W)
 	if (mode & STARPU_W)
 	{
 	{
@@ -656,18 +656,25 @@ void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_
 		_starpu_spin_unlock(&handle->header_lock);
 		_starpu_spin_unlock(&handle->header_lock);
 }
 }
 
 
-static void _starpu_set_data_requested_flag_if_needed(struct _starpu_data_replicate *replicate)
+static void _starpu_set_data_requested_flag_if_needed(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate)
 {
 {
-// XXX : this is just a hint, so we don't take the lock ...
-//	_starpu_spin_lock(&handle->header_lock);
+	unsigned local_node = _starpu_memory_node_get_local_key();
+	int cpt = 0;
+	while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
+	{
+		cpt++;
+		_starpu_datawizard_progress(local_node, 1);
+	}
+	if (cpt == STARPU_SPIN_MAXTRY)
+		_starpu_spin_lock(&handle->header_lock);
 
 
 	if (replicate->state == STARPU_INVALID)
 	if (replicate->state == STARPU_INVALID)
 	{
 	{
 		unsigned dst_node = replicate->memory_node;
 		unsigned dst_node = replicate->memory_node;
-		replicate->requested[dst_node] = 1;
+		replicate->requested |= 1UL << dst_node;
 	}
 	}
 
 
-//	_starpu_spin_unlock(&handle->header_lock);
+	_starpu_spin_unlock(&handle->header_lock);
 }
 }
 
 
 int starpu_prefetch_task_input_on_node(struct starpu_task *task, unsigned node)
 int starpu_prefetch_task_input_on_node(struct starpu_task *task, unsigned node)
@@ -686,7 +693,7 @@ int starpu_prefetch_task_input_on_node(struct starpu_task *task, unsigned node)
 		struct _starpu_data_replicate *replicate = &handle->per_node[node];
 		struct _starpu_data_replicate *replicate = &handle->per_node[node];
 		prefetch_data_on_node(handle, replicate, mode);
 		prefetch_data_on_node(handle, replicate, mode);
 
 
-		_starpu_set_data_requested_flag_if_needed(replicate);
+		_starpu_set_data_requested_flag_if_needed(handle, replicate);
 	}
 	}
 
 
 	return 0;
 	return 0;
@@ -880,7 +887,7 @@ unsigned _starpu_is_data_present_or_requested(starpu_data_handle_t handle, unsig
 
 
 		for (i = 0; i < nnodes; i++)
 		for (i = 0; i < nnodes; i++)
 		{
 		{
-			if (handle->per_node[node].requested[i] || handle->per_node[node].request[i])
+			if ((handle->per_node[node].requested & (1UL << i)) || handle->per_node[node].request[i])
 				ret = 1;
 				ret = 1;
 		}
 		}
 
 

+ 11 - 11
src/datawizard/coherency.h

@@ -48,26 +48,26 @@ LIST_TYPE(_starpu_data_replicate,
 
 
 	unsigned memory_node;
 	unsigned memory_node;
 
 
-	/* A buffer that is used for SCRATCH or reduction cannnot be used with
-	 * filters. */
-	unsigned relaxed_coherency;
-
-	/* We may need to initialize the replicate with some value before using it. */
-	unsigned initialized;
-
 	/* describes the state of the local data in term of coherency */
 	/* describes the state of the local data in term of coherency */
 	enum _starpu_cache_state	state;
 	enum _starpu_cache_state	state;
 
 
 	int refcnt;
 	int refcnt;
 
 
+	/* A buffer that is used for SCRATCH or reduction cannnot be used with
+	 * filters. */
+	unsigned relaxed_coherency:2;
+
+	/* We may need to initialize the replicate with some value before using it. */
+	unsigned initialized:1;
+
 	/* is the data locally allocated ? */
 	/* is the data locally allocated ? */
-	uint8_t allocated;
+	unsigned allocated:1;
 	/* was it automatically allocated ? (else it's the application-provided
 	/* was it automatically allocated ? (else it's the application-provided
 	 * buffer, don't ever try to free it!) */
 	 * buffer, don't ever try to free it!) */
 	/* perhaps the allocation was perform higher in the hiearchy
 	/* perhaps the allocation was perform higher in the hiearchy
 	 * for now this is just translated into !automatically_allocated
 	 * for now this is just translated into !automatically_allocated
 	 * */
 	 * */
-	uint8_t automatically_allocated;
+	unsigned automatically_allocated:1;
 
 
         /* Pointer to memchunk for LRU strategy */
         /* Pointer to memchunk for LRU strategy */
 	struct _starpu_mem_chunk * mc;
 	struct _starpu_mem_chunk * mc;
@@ -79,7 +79,7 @@ LIST_TYPE(_starpu_data_replicate,
 	   flag when it assigns a task to a queue, policies which do not
 	   flag when it assigns a task to a queue, policies which do not
 	   use this hint can simply ignore it.
 	   use this hint can simply ignore it.
 	 */
 	 */
-	uint8_t requested[STARPU_MAXNODES];
+	uint32_t requested;
 	struct _starpu_data_request *request[STARPU_MAXNODES];
 	struct _starpu_data_request *request[STARPU_MAXNODES];
 )
 )
 
 
@@ -207,7 +207,7 @@ struct _starpu_data_state
 	 * the end of the reduction. */
 	 * the end of the reduction. */
 	struct _starpu_data_requester_list *reduction_req_list;
 	struct _starpu_data_requester_list *reduction_req_list;
 
 
-	starpu_data_handle_t reduction_tmp_handles[STARPU_NMAXWORKERS];
+	starpu_data_handle_t *reduction_tmp_handles;
 
 
 	unsigned lazy_unregister;
 	unsigned lazy_unregister;
 
 

+ 2 - 1
src/datawizard/filters.c

@@ -176,6 +176,7 @@ void starpu_data_partition(starpu_data_handle_t initial_handle, struct starpu_da
 		/* initialize the chunk lock */
 		/* initialize the chunk lock */
 		child->req_list = _starpu_data_requester_list_new();
 		child->req_list = _starpu_data_requester_list_new();
 		child->reduction_req_list = _starpu_data_requester_list_new();
 		child->reduction_req_list = _starpu_data_requester_list_new();
+		child->reduction_tmp_handles = NULL;
 		child->refcnt = 0;
 		child->refcnt = 0;
 		child->busy_count = 0;
 		child->busy_count = 0;
 		child->busy_waiting = 0;
 		child->busy_waiting = 0;
@@ -240,10 +241,10 @@ void starpu_data_partition(starpu_data_handle_t initial_handle, struct starpu_da
 			child_replicate->automatically_allocated = 0;
 			child_replicate->automatically_allocated = 0;
 			child_replicate->refcnt = 0;
 			child_replicate->refcnt = 0;
 			child_replicate->memory_node = starpu_worker_get_memory_node(worker);
 			child_replicate->memory_node = starpu_worker_get_memory_node(worker);
+			child_replicate->requested = 0;
 
 
 			for (node = 0; node < STARPU_MAXNODES; node++)
 			for (node = 0; node < STARPU_MAXNODES; node++)
 			{
 			{
-				child_replicate->requested[node] = 0;
 				child_replicate->request[node] = NULL;
 				child_replicate->request[node] = NULL;
 			}
 			}
 
 

+ 2 - 1
src/datawizard/interfaces/data_interface.c

@@ -291,6 +291,7 @@ static void _starpu_register_new_data(starpu_data_handle_t handle,
 
 
 	handle->reduction_refcnt = 0;
 	handle->reduction_refcnt = 0;
 	handle->reduction_req_list = _starpu_data_requester_list_new();
 	handle->reduction_req_list = _starpu_data_requester_list_new();
+	handle->reduction_tmp_handles = NULL;
 
 
 #ifdef STARPU_USE_FXT
 #ifdef STARPU_USE_FXT
 	handle->last_submitted_ghost_sync_id_is_valid = 0;
 	handle->last_submitted_ghost_sync_id_is_valid = 0;
@@ -346,10 +347,10 @@ static void _starpu_register_new_data(starpu_data_handle_t handle,
 		replicate->state = STARPU_INVALID;
 		replicate->state = STARPU_INVALID;
 		replicate->refcnt = 0;
 		replicate->refcnt = 0;
 		replicate->handle = handle;
 		replicate->handle = handle;
+		replicate->requested = 0;
 
 
 		for (node = 0; node < STARPU_MAXNODES; node++)
 		for (node = 0; node < STARPU_MAXNODES; node++)
 		{
 		{
-			replicate->requested[node] = 0;
 			replicate->request[node] = NULL;
 			replicate->request[node] = NULL;
 		}
 		}
 
 

+ 5 - 1
src/datawizard/reduction.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -156,6 +156,8 @@ void _starpu_data_end_reduction_mode(starpu_data_handle_t handle)
 
 
 	/* Register all valid per-worker replicates */
 	/* Register all valid per-worker replicates */
 	unsigned nworkers = starpu_worker_get_count();
 	unsigned nworkers = starpu_worker_get_count();
+	STARPU_ASSERT(!handle->reduction_tmp_handles);
+	handle->reduction_tmp_handles = malloc(nworkers * sizeof(handle->reduction_tmp_handles[0]));
 	for (worker = 0; worker < nworkers; worker++)
 	for (worker = 0; worker < nworkers; worker++)
 	{
 	{
 		if (handle->per_worker[worker].initialized)
 		if (handle->per_worker[worker].initialized)
@@ -390,4 +392,6 @@ void _starpu_data_end_reduction_mode_terminate(starpu_data_handle_t handle)
 			/* TODO put in cache */
 			/* TODO put in cache */
 		}
 		}
 	}
 	}
+	free(handle->reduction_tmp_handles);
+	handle->reduction_tmp_handles = NULL;
 }
 }

+ 2 - 4
src/datawizard/user_interactions.c

@@ -519,9 +519,7 @@ void starpu_data_set_default_sequential_consistency_flag(unsigned flag)
 /* Query the status of the handle on the specified memory node. */
 /* Query the status of the handle on the specified memory node. */
 void starpu_data_query_status(starpu_data_handle_t handle, int memory_node, int *is_allocated, int *is_valid, int *is_requested)
 void starpu_data_query_status(starpu_data_handle_t handle, int memory_node, int *is_allocated, int *is_valid, int *is_requested)
 {
 {
-#ifdef STARPU_DEVEL
-#warning FIXME
-#endif
+// XXX : this is just a hint, so we don't take the lock ...
 //	_starpu_spin_lock(&handle->header_lock);
 //	_starpu_spin_lock(&handle->header_lock);
 
 
 	if (is_allocated)
 	if (is_allocated)
@@ -537,7 +535,7 @@ void starpu_data_query_status(starpu_data_handle_t handle, int memory_node, int
 		unsigned node;
 		unsigned node;
 		for (node = 0; node < STARPU_MAXNODES; node++)
 		for (node = 0; node < STARPU_MAXNODES; node++)
 		{
 		{
-			if (handle->per_node[memory_node].requested[node])
+			if (handle->per_node[memory_node].requested & (1UL << node))
 			{
 			{
 				requested = 1;
 				requested = 1;
 				break;
 				break;

+ 109 - 7
src/debug/traces/starpu_fxt.c

@@ -275,6 +275,18 @@ static void worker_set_state(double time, const char *prefix, long unsigned int
 #endif
 #endif
 }
 }
 
 
+static void worker_set_detailed_state(double time, const char *prefix, long unsigned int workerid, const char *name, unsigned long size, unsigned long footprint, unsigned long long tag)
+{
+#ifdef STARPU_HAVE_POTI
+	char container[STARPU_POTI_STR_LEN];
+	thread_container_alias(container, STARPU_POTI_STR_LEN, prefix, workerid);
+	/* TODO: set detailed state */
+	poti_SetState(time, container, "S", name);
+#else
+	fprintf(out_paje_file, "20	%.9f	%st%lu	S	%s	%lu	%08lx	%016llx\n", time, prefix, workerid, name, size, footprint, tag);
+#endif
+}
+
 static void worker_push_state(double time, const char *prefix, long unsigned int workerid, const char *name)
 static void worker_push_state(double time, const char *prefix, long unsigned int workerid, const char *name)
 {
 {
 #ifdef STARPU_HAVE_POTI
 #ifdef STARPU_HAVE_POTI
@@ -631,11 +643,8 @@ static void handle_start_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_op
 	int worker;
 	int worker;
 	worker = find_worker_id(ev->param[2]);
 	worker = find_worker_id(ev->param[2]);
 
 
-	unsigned sched_ctx = ev->param[1];
 	if (worker < 0) return;
 	if (worker < 0) return;
 
 
-	char *prefix = options->file_prefix;
-
 	unsigned long has_name = ev->param[3];
 	unsigned long has_name = ev->param[3];
 	char *name = has_name?(char *)&ev->param[4]:"unknown";
 	char *name = has_name?(char *)&ev->param[4]:"unknown";
 
 
@@ -646,8 +655,12 @@ static void handle_start_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_op
 
 
 	create_paje_state_if_not_found(name, options);
 	create_paje_state_if_not_found(name, options);
 
 
+#ifndef STARPU_ENABLE_PAJE_CODELET_DETAILS
 	if (out_paje_file)
 	if (out_paje_file)
 	{
 	{
+		char *prefix = options->file_prefix;
+		unsigned sched_ctx = ev->param[1];
+
 		worker_set_state(start_codelet_time, prefix, ev->param[2], name);
 		worker_set_state(start_codelet_time, prefix, ev->param[2], name);
 		if (sched_ctx != 0)
 		if (sched_ctx != 0)
 		{
 		{
@@ -662,9 +675,40 @@ static void handle_start_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_op
 #endif
 #endif
 		}
 		}
 	}
 	}
+#endif /* STARPU_ENABLE_PAJE_CODELET_DETAILS */
 
 
 }
 }
 
 
+static void handle_codelet_details(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
+{
+#ifdef STARPU_ENABLE_PAJE_CODELET_DETAILS
+	int worker;
+	worker = find_worker_id(ev->param[5]);
+
+	unsigned sched_ctx = ev->param[1];
+	if (worker < 0) return;
+
+	char *prefix = options->file_prefix;
+
+	if (out_paje_file)
+	{
+		worker_set_detailed_state(last_codelet_start[worker], prefix, ev->param[5], last_codelet_symbol[worker], ev->param[2], ev->param[3], ev->param[4]);
+		if (sched_ctx != 0)
+		{
+#ifdef STARPU_HAVE_POTI
+			char container[STARPU_POTI_STR_LEN];
+			char ctx[6];
+			snprintf(ctx, sizeof(ctx), "Ctx%d", sched_ctx);
+			thread_container_alias(container, STARPU_POTI_STR_LEN, prefix, ev->param[5]);
+			poti_SetState(last_codelet_start[worker], container, ctx, last_codelet_symbol[worker]);
+#else
+			fprintf(out_paje_file, "20	%.9f	%st%"PRIu64"	Ctx%d	%s	%08lx	%lu	%016llx\n", last_codelet_start[worker], prefix, ev->param[2], sched_ctx, last_codelet_symbol[worker], (unsigned long) ev->param[2], (unsigned long) ev->param[3], (unsigned long long) ev->param[4]);
+#endif
+		}
+	}
+#endif /* STARPU_ENABLE_PAJE_CODELET_DETAILS */
+}
+
 static long dumped_codelets_count;
 static long dumped_codelets_count;
 static struct starpu_fxt_codelet_event *dumped_codelets;
 static struct starpu_fxt_codelet_event *dumped_codelets;
 
 
@@ -727,7 +771,7 @@ static void handle_user_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *o
 #ifdef STARPU_HAVE_POTI
 #ifdef STARPU_HAVE_POTI
 			program_container_alias (container, STARPU_POTI_STR_LEN, prefix);
 			program_container_alias (container, STARPU_POTI_STR_LEN, prefix);
 #else
 #else
-			fprintf(out_paje_file, "9	%.9f	event	%sp	%lu\n", get_event_time_stamp(ev, options), prefix, code);
+			fprintf(out_paje_file, "9	%.9f	user_event	%sp	%lu\n", get_event_time_stamp(ev, options), prefix, code);
 #endif
 #endif
 	}
 	}
 	else
 	else
@@ -736,12 +780,12 @@ static void handle_user_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *o
 #ifdef STARPU_HAVE_POTI
 #ifdef STARPU_HAVE_POTI
 			thread_container_alias (container, STARPU_POTI_STR_LEN, prefix, ev->param[1]);
 			thread_container_alias (container, STARPU_POTI_STR_LEN, prefix, ev->param[1]);
 #else
 #else
-			fprintf(out_paje_file, "9	%.9f	event	%st%"PRIu64"	%lu\n", get_event_time_stamp(ev, options), prefix, ev->param[1], code);
+			fprintf(out_paje_file, "9	%.9f	user_event	%st%"PRIu64"	%lu\n", get_event_time_stamp(ev, options), prefix, ev->param[1], code);
 #endif
 #endif
 	}
 	}
 #ifdef STARPU_HAVE_POTI
 #ifdef STARPU_HAVE_POTI
 	if (out_paje_file)
 	if (out_paje_file)
-		poti_NewEvent(get_event_time_stamp(ev, options), container, "thread_event", paje_value);
+		poti_NewEvent(get_event_time_stamp(ev, options), container, "user_event", paje_value);
 #endif
 #endif
 }
 }
 
 
@@ -916,6 +960,40 @@ static void handle_start_driver_copy(struct fxt_ev_64 *ev, struct starpu_fxt_opt
 
 
 }
 }
 
 
+
+static void handle_work_stealing(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
+{
+	unsigned dst = ev->param[0];
+	unsigned src = ev->param[1];
+	unsigned size = 0;
+	unsigned comid = 0;
+	
+	char *prefix = options->file_prefix;
+
+	
+	if (out_paje_file)
+	{
+		double time = get_event_time_stamp(ev, options);
+#ifdef STARPU_HAVE_POTI
+		char paje_value[STARPU_POTI_STR_LEN], paje_key[STARPU_POTI_STR_LEN], src_worker_container[STARPU_POTI_STR_LEN], dst_worker_container[STARPU_POTI_STR_LEN];
+		char program_container[STARPU_POTI_STR_LEN];
+		snprintf(paje_value, STARPU_POTI_STR_LEN, "%u", size);
+		snprintf(paje_key, STARPU_POTI_STR_LEN, "steal_%u", comid);
+		program_container_alias(program_container, STARPU_POTI_STR_LEN, prefix);
+		worker_container_alias(src_worker_container, STARPU_POTI_STR_LEN, prefix, src);
+		worker_container_alias(dst_worker_container, STARPU_POTI_STR_LEN, prefix, dst);
+		poti_StartLink(time, program_container, "L", src_worker_container, paje_value, paje_key);
+		poti_EndLink(time+0.000000001, program_container, "L", dst_worker_container, paje_value, paje_key);
+#else
+
+		fprintf(out_paje_file, "18	%.9f	L	%sp	%u	%sw%d	steal_%u\n", time, prefix, size, prefix, src, comid);
+		fprintf(out_paje_file, "19	%.9f	L	%sp	%u	%sw%d	steal_%u\n", time+0.000000001, prefix, size, prefix, dst, comid);
+#endif
+	}
+
+}
+
+
 static void handle_end_driver_copy(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
 static void handle_end_driver_copy(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
 {
 {
 	unsigned dst = ev->param[1];
 	unsigned dst = ev->param[1];
@@ -1380,6 +1458,23 @@ static void handle_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *option
 	}
 	}
 }
 }
 
 
+static void handle_thread_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
+{
+	/* Add an event in the trace */
+	if (out_paje_file)
+	{
+		char *event = (char*)&ev->param[1];
+
+#ifdef STARPU_HAVE_POTI
+		char container[STARPU_POTI_STR_LEN];
+		thread_container_alias(container, STARPU_POTI_STR_LEN, options->file_prefix, ev->param[0]);
+		poti_NewEvent(get_event_time_stamp(ev, options), container, "thread_event", event);
+#else
+		fprintf(out_paje_file, "9	%.9f	thread_event	%st%"PRIu64"	%s\n", get_event_time_stamp(ev, options), options->file_prefix, ev->param[0], event);
+#endif
+	}
+}
+
 static
 static
 void _starpu_fxt_display_bandwidth(struct starpu_fxt_options *options)
 void _starpu_fxt_display_bandwidth(struct starpu_fxt_options *options)
 {
 {
@@ -1507,6 +1602,9 @@ void starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *opt
 			case _STARPU_FUT_START_CODELET_BODY:
 			case _STARPU_FUT_START_CODELET_BODY:
 				handle_start_codelet_body(&ev, options);
 				handle_start_codelet_body(&ev, options);
 				break;
 				break;
+			case _STARPU_FUT_CODELET_DETAILS:
+				handle_codelet_details(&ev, options);
+				break;
 			case _STARPU_FUT_END_CODELET_BODY:
 			case _STARPU_FUT_END_CODELET_BODY:
 				handle_end_codelet_body(&ev, options);
 				handle_end_codelet_body(&ev, options);
 				break;
 				break;
@@ -1624,7 +1722,7 @@ void starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *opt
 				break;
 				break;
 
 
 			case _STARPU_FUT_WORK_STEALING:
 			case _STARPU_FUT_WORK_STEALING:
-				/* XXX */
+				handle_work_stealing(&ev, options);
 				break;
 				break;
 
 
 			case _STARPU_FUT_WORKER_DEINIT_START:
 			case _STARPU_FUT_WORKER_DEINIT_START:
@@ -1797,6 +1895,10 @@ void starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *opt
 				handle_event(&ev, options);
 				handle_event(&ev, options);
 				break;
 				break;
 
 
+			case _STARPU_FUT_THREAD_EVENT:
+				handle_thread_event(&ev, options);
+				break;
+
 			case _STARPU_FUT_LOCKING_MUTEX:
 			case _STARPU_FUT_LOCKING_MUTEX:
 				break;
 				break;
 
 

+ 13 - 0
src/debug/traces/starpu_paje.c

@@ -130,6 +130,17 @@ void _starpu_fxt_write_paje_header(FILE *file)
 	fprintf(file, "%%	DestContainer	string\n");
 	fprintf(file, "%%	DestContainer	string\n");
 	fprintf(file, "%%	Key	string\n");
 	fprintf(file, "%%	Key	string\n");
 	fprintf(file, "%%EndEventDef\n");
 	fprintf(file, "%%EndEventDef\n");
+#ifdef STARPU_ENABLE_PAJE_CODELET_DETAILS
+	fprintf(file, "%%EventDef PajeSetState 20\n");
+	fprintf(file, "%%	Time	date\n");
+	fprintf(file, "%%	Container	string\n");
+	fprintf(file, "%%	Type	string\n");
+	fprintf(file, "%%	Value	string\n");
+	fprintf(file, "%%	Size	string\n");
+	fprintf(file, "%%	Footprint	string\n");
+	fprintf(file, "%%	Tag	string\n");
+	fprintf(file, "%%EndEventDef\n");
+#endif
 #endif
 #endif
 
 
 #ifdef STARPU_HAVE_POTI
 #ifdef STARPU_HAVE_POTI
@@ -156,6 +167,7 @@ void _starpu_fxt_write_paje_header(FILE *file)
 	poti_DefineEntityValue("No", "MS", "Nothing", ".0 .0 .0");
 	poti_DefineEntityValue("No", "MS", "Nothing", ".0 .0 .0");
 
 
 	/* Types for the Worker of the Memory Node */
 	/* Types for the Worker of the Memory Node */
+	poti_DefineEventType("user_event", "T", "user event type");
 	poti_DefineEventType("thread_event", "T", "thread event type");
 	poti_DefineEventType("thread_event", "T", "thread event type");
 	poti_DefineStateType("S", "T", "Thread State");
 	poti_DefineStateType("S", "T", "Thread State");
 	poti_DefineEntityValue("I", "S", "Initializing", "0.0 .7 1.0");
 	poti_DefineEntityValue("I", "S", "Initializing", "0.0 .7 1.0");
@@ -220,6 +232,7 @@ void _starpu_fxt_write_paje_header(FILE *file)
 1       MPICt   T       \"MPI Communication Thread\"              \n\
 1       MPICt   T       \"MPI Communication Thread\"              \n\
 1       Sc       P       \"Scheduler State\"                        \n\
 1       Sc       P       \"Scheduler State\"                        \n\
 2       prog_event   P       \"program event type\"				\n\
 2       prog_event   P       \"program event type\"				\n\
+2       user_event   T       \"user event type\"				\n\
 2       thread_event   T       \"thread event type\"				\n\
 2       thread_event   T       \"thread event type\"				\n\
 2       MPIev   MPICt    \"MPI event type\"			\n\
 2       MPIev   MPICt    \"MPI event type\"			\n\
 3       S       T       \"Thread State\"                        \n\
 3       S       T       \"Thread State\"                        \n\

+ 1 - 1
src/drivers/cpu/driver_cpu.c

@@ -89,7 +89,7 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_
 	}
 	}
 
 
 	/* Give profiling variable */
 	/* Give profiling variable */
-	_starpu_driver_start_job(cpu_args, j, &codelet_start, rank, profiling);
+	_starpu_driver_start_job(cpu_args, j, perf_arch, &codelet_start, rank, profiling);
 
 
 	/* In case this is a Fork-join parallel task, the worker does not
 	/* In case this is a Fork-join parallel task, the worker does not
 	 * execute the kernel at all. */
 	 * execute the kernel at all. */

+ 7 - 3
src/drivers/cuda/driver_cuda.c

@@ -396,7 +396,7 @@ static int start_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *args)
 		return -EAGAIN;
 		return -EAGAIN;
 	}
 	}
 
 
-	_starpu_driver_start_job(args, j, &j->cl_start, 0, profiling);
+	_starpu_driver_start_job(args, j, &args->perf_arch, &j->cl_start, 0, profiling);
 
 
 #if defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
 #if defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
 	/* We make sure we do manipulate the proper device */
 	/* We make sure we do manipulate the proper device */
@@ -517,7 +517,10 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 	unsigned memnode = worker0->memory_node;
 	unsigned memnode = worker0->memory_node;
 	struct starpu_task *tasks[worker_set->nworkers], *task;
 	struct starpu_task *tasks[worker_set->nworkers], *task;
 	struct _starpu_job *j;
 	struct _starpu_job *j;
-	int i, res, idle;
+	int i, res;
+
+#ifndef STARPU_SIMGRID
+	int idle;
 
 
 	/* First poll for completed jobs */
 	/* First poll for completed jobs */
 	idle = 0;
 	idle = 0;
@@ -540,13 +543,13 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 		if (cures != cudaSuccess)
 		if (cures != cudaSuccess)
 		{
 		{
 			STARPU_ASSERT(cures == cudaErrorNotReady);
 			STARPU_ASSERT(cures == cudaErrorNotReady);
-			idle++;
 		}
 		}
 		else
 		else
 		{
 		{
 			/* Asynchronous task completed! */
 			/* Asynchronous task completed! */
 			_starpu_set_local_worker_key(args);
 			_starpu_set_local_worker_key(args);
 			finish_job_on_cuda(_starpu_get_job_associated_to_task(task), args);
 			finish_job_on_cuda(_starpu_get_job_associated_to_task(task), args);
+			idle++;
 		}
 		}
 	}
 	}
 
 
@@ -556,6 +559,7 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 		__starpu_datawizard_progress(memnode, 1, 0);
 		__starpu_datawizard_progress(memnode, 1, 0);
 		return 0;
 		return 0;
 	}
 	}
+#endif /* STARPU_SIMGRID */
 
 
 	/* Something done, make some progress */
 	/* Something done, make some progress */
 	__starpu_datawizard_progress(memnode, 1, 1);
 	__starpu_datawizard_progress(memnode, 1, 1);

+ 4 - 2
src/drivers/driver_common/driver_common.c

@@ -34,7 +34,7 @@
 #define BACKOFF_MAX 32  /* TODO : use parameter to define them */
 #define BACKOFF_MAX 32  /* TODO : use parameter to define them */
 #define BACKOFF_MIN 1
 #define BACKOFF_MIN 1
 
 
-void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j, struct timespec *codelet_start, int rank, int profiling)
+void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch, struct timespec *codelet_start, int rank, int profiling)
 {
 {
 	struct starpu_task *task = j->task;
 	struct starpu_task *task = j->task;
 	struct starpu_codelet *cl = task->cl;
 	struct starpu_codelet *cl = task->cl;
@@ -74,7 +74,7 @@ void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j
 	if (starpu_top)
 	if (starpu_top)
 		_starpu_top_task_started(task,workerid,codelet_start);
 		_starpu_top_task_started(task,workerid,codelet_start);
 
 
-	_STARPU_TRACE_START_CODELET_BODY(j);
+	_STARPU_TRACE_START_CODELET_BODY(j, j->nimpl, perf_arch);
 }
 }
 
 
 void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch STARPU_ATTRIBUTE_UNUSED, struct timespec *codelet_end, int rank, int profiling)
 void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch STARPU_ATTRIBUTE_UNUSED, struct timespec *codelet_end, int rank, int profiling)
@@ -398,6 +398,7 @@ int _starpu_get_multi_worker_task(struct _starpu_worker *workers, struct starpu_
 		/*else try to pop a task*/
 		/*else try to pop a task*/
 		else
 		else
 		{
 		{
+			_starpu_worker_set_status_scheduling(workers[i].workerid);
 			STARPU_PTHREAD_MUTEX_LOCK(&workers[i].sched_mutex);
 			STARPU_PTHREAD_MUTEX_LOCK(&workers[i].sched_mutex);
 			_starpu_set_local_worker_key(&workers[i]);
 			_starpu_set_local_worker_key(&workers[i]);
 			tasks[i] = _starpu_pop_task(&workers[i]);
 			tasks[i] = _starpu_pop_task(&workers[i]);
@@ -427,6 +428,7 @@ int _starpu_get_multi_worker_task(struct _starpu_worker *workers, struct starpu_
 					workers[i].current_rank = 0;
 					workers[i].current_rank = 0;
 				}
 				}
 
 
+				_starpu_worker_set_status_scheduling_done(workers[i].workerid);
 				_starpu_worker_set_status_wakeup(workers[i].workerid);
 				_starpu_worker_set_status_wakeup(workers[i].workerid);
 			}
 			}
 			else
 			else

+ 2 - 2
src/drivers/driver_common/driver_common.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2012, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -23,7 +23,7 @@
 #include <core/jobs.h>
 #include <core/jobs.h>
 #include <common/utils.h>
 #include <common/utils.h>
 
 
-void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j,
+void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch,
 			      struct timespec *codelet_start, int rank, int profiling);
 			      struct timespec *codelet_start, int rank, int profiling);
 void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch,
 void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch,
 			    struct timespec *codelet_end, int rank, int profiling);
 			    struct timespec *codelet_end, int rank, int profiling);

+ 1 - 1
src/drivers/mp_common/source_common.c

@@ -421,7 +421,7 @@ static int _starpu_src_common_execute(struct _starpu_job *j,
 
 
 	void (*kernel)(void)  = node->get_kernel_from_job(node,j);
 	void (*kernel)(void)  = node->get_kernel_from_job(node,j);
 
 
-	_starpu_driver_start_job(worker, j, &j->cl_start, 0, profiling);
+	_starpu_driver_start_job(worker, j, &worker->perf_arch, &j->cl_start, 0, profiling);
 
 
 
 
 	//_STARPU_DEBUG("\nworkerid:%d, rank:%d, type:%d,	cb_workerid:%d, task_size:%d\n\n",worker->devid,worker->current_rank,task->cl->type,j->combined_workerid,j->task_size);
 	//_STARPU_DEBUG("\nworkerid:%d, rank:%d, type:%d,	cb_workerid:%d, task_size:%d\n\n",worker->devid,worker->current_rank,task->cl->type,j->combined_workerid,j->task_size);

+ 4 - 2
src/drivers/opencl/driver_opencl.c

@@ -619,6 +619,7 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *args)
 	struct starpu_task *task;
 	struct starpu_task *task;
 	int res;
 	int res;
 
 
+#ifndef STARPU_SIMGRID
 	task = starpu_task_get_current();
 	task = starpu_task_get_current();
 
 
 	if (task)
 	if (task)
@@ -642,6 +643,7 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *args)
 		/* Asynchronous task completed! */
 		/* Asynchronous task completed! */
 		_starpu_opencl_stop_job(_starpu_get_job_associated_to_task(task), args);
 		_starpu_opencl_stop_job(_starpu_get_job_associated_to_task(task), args);
 	}
 	}
+#endif /* STARPU_SIMGRID */
 
 
 	__starpu_datawizard_progress(memnode, 1, 1);
 	__starpu_datawizard_progress(memnode, 1, 1);
 
 
@@ -700,7 +702,7 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *args)
 	else
 	else
 #else
 #else
 #ifdef STARPU_DEVEL
 #ifdef STARPU_DEVEL
-#warning No CUDA asynchronous execution with simgrid yet.
+#warning No OpenCL asynchronous execution with simgrid yet.
 #endif
 #endif
 #endif
 #endif
 	/* Synchronous execution */
 	/* Synchronous execution */
@@ -823,7 +825,7 @@ static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker
 		return -EAGAIN;
 		return -EAGAIN;
 	}
 	}
 
 
-	_starpu_driver_start_job(args, j, &j->cl_start, 0, profiling);
+	_starpu_driver_start_job(args, j, &args->perf_arch, &j->cl_start, 0, profiling);
 
 
 	starpu_opencl_func_t func = _starpu_task_get_opencl_nth_implementation(cl, j->nimpl);
 	starpu_opencl_func_t func = _starpu_task_get_opencl_nth_implementation(cl, j->nimpl);
 	STARPU_ASSERT_MSG(func, "when STARPU_OPENCL is defined in 'where', opencl_func or opencl_funcs has to be defined");
 	STARPU_ASSERT_MSG(func, "when STARPU_OPENCL is defined in 'where', opencl_func or opencl_funcs has to be defined");

+ 17 - 10
src/sched_policies/deque_modeling_policy_data_aware.c

@@ -286,6 +286,13 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 	struct _starpu_dmda_data *dt = (struct _starpu_dmda_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
 	struct _starpu_dmda_data *dt = (struct _starpu_dmda_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
 	/* make sure someone coule execute that task ! */
 	/* make sure someone coule execute that task ! */
 	STARPU_ASSERT(best_workerid != -1);
 	STARPU_ASSERT(best_workerid != -1);
+	unsigned child_sched_ctx = starpu_sched_ctx_worker_is_master_for_child_ctx(best_workerid, sched_ctx_id);
+        if(child_sched_ctx != STARPU_NMAX_SCHED_CTXS)
+        {
+		starpu_sched_ctx_revert_task_counters(sched_ctx_id, task->flops);
+                starpu_sched_ctx_move_task_to_ctx(task, child_sched_ctx);
+                return 0;
+        }
 
 
 	struct _starpu_fifo_taskq *fifo = dt->queue_array[best_workerid];
 	struct _starpu_fifo_taskq *fifo = dt->queue_array[best_workerid];
 
 
@@ -405,9 +412,9 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio, unsigned sched
 	if(workers->init_iterator)
 	if(workers->init_iterator)
 		workers->init_iterator(workers, &it);
 		workers->init_iterator(workers, &it);
 
 
-	while(workers->has_next(workers, &it))
+	while(workers->has_next_master(workers, &it))
 	{
 	{
-		worker = workers->get_next(workers, &it);
+		worker = workers->get_next_master(workers, &it);
 		struct _starpu_fifo_taskq *fifo  = dt->queue_array[worker];
 		struct _starpu_fifo_taskq *fifo  = dt->queue_array[worker];
 		unsigned memory_node = starpu_worker_get_memory_node(worker);
 		unsigned memory_node = starpu_worker_get_memory_node(worker);
 		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker);
 		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker);
@@ -543,9 +550,9 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 	if(workers->init_iterator)
 	if(workers->init_iterator)
 		workers->init_iterator(workers, &it);
 		workers->init_iterator(workers, &it);
 
 
-	while(workers->has_next(workers, &it))
+	while(workers->has_next_master(workers, &it))
 	{
 	{
-		worker = workers->get_next(workers, &it);
+		worker = workers->get_next_master(workers, &it);
 
 
 		struct _starpu_fifo_taskq *fifo = dt->queue_array[worker];
 		struct _starpu_fifo_taskq *fifo = dt->queue_array[worker];
 		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker);
 		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker);
@@ -692,10 +699,6 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 
 
 	double fitness[nworkers_ctx][STARPU_MAXIMPLEMENTATIONS];
 	double fitness[nworkers_ctx][STARPU_MAXIMPLEMENTATIONS];
 
 
-	struct starpu_sched_ctx_iterator it;
-	if(workers->init_iterator)
-		workers->init_iterator(workers, &it);
-
 	compute_all_performance_predictions(task,
 	compute_all_performance_predictions(task,
 					    nworkers_ctx,
 					    nworkers_ctx,
 					    local_task_length,
 					    local_task_length,
@@ -712,9 +715,13 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 	unsigned nimpl;
 	unsigned nimpl;
 	if (forced_best == -1)
 	if (forced_best == -1)
 	{
 	{
-		while(workers->has_next(workers, &it))
+		struct starpu_sched_ctx_iterator it;
+		if(workers->init_iterator)
+			workers->init_iterator(workers, &it);
+
+		while(workers->has_next_master(workers, &it))
 		{
 		{
-			worker = workers->get_next(workers, &it);
+			worker = workers->get_next_master(workers, &it);
 			for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
 			for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
 			{
 			{
 				if (!starpu_worker_can_execute_task(worker, task, nimpl))
 				if (!starpu_worker_can_execute_task(worker, task, nimpl))

+ 13 - 2
src/sched_policies/eager_central_policy.c

@@ -94,9 +94,9 @@ static int push_task_eager_policy(struct starpu_task *task)
 	if(workers->init_iterator)
 	if(workers->init_iterator)
 		workers->init_iterator(workers, &it);
 		workers->init_iterator(workers, &it);
 	
 	
-	while(workers->has_next(workers, &it))
+	while(workers->has_next_master(workers, &it))
 	{
 	{
-		worker = workers->get_next(workers, &it);
+		worker = workers->get_next_master(workers, &it);
 
 
 #ifdef STARPU_NON_BLOCKING_DRIVERS
 #ifdef STARPU_NON_BLOCKING_DRIVERS
 		if (!starpu_bitmap_get(data->waiters, worker))
 		if (!starpu_bitmap_get(data->waiters, worker))
@@ -167,6 +167,17 @@ static struct starpu_task *pop_task_eager_policy(unsigned sched_ctx_id)
 
 
 	STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex);
 
 
+	if(task)
+	{
+		unsigned child_sched_ctx = starpu_sched_ctx_worker_is_master_for_child_ctx(workerid, sched_ctx_id);
+		if(child_sched_ctx != STARPU_NMAX_SCHED_CTXS)
+		{
+			starpu_sched_ctx_revert_task_counters(sched_ctx_id, task->flops);
+			starpu_sched_ctx_move_task_to_ctx(task, child_sched_ctx);
+			return NULL;
+		}
+	}
+
 	return task;
 	return task;
 }
 }
 
 

+ 373 - 0
src/sched_policies/locality_work_stealing_policy.c

@@ -0,0 +1,373 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012  INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/* Work stealing policy */
+
+#include <float.h>
+
+#include <core/workers.h>
+#include <sched_policies/fifo_queues.h>
+#include <core/debug.h>
+#include <starpu_bitmap.h>
+
+/* Per-scheduling-context data of the locality work-stealing policy */
+struct _starpu_lws_data
+{
+	struct _starpu_fifo_taskq **queue_array; /* one task FIFO per worker, indexed by workerid */
+	int **proxlist; /* per-worker proximity list of workerids; only allocated when hwloc is available */
+	unsigned last_pop_worker; /* next victim candidate for round-robin stealing (non-hwloc builds) */
+	unsigned last_push_worker; /* next round-robin target for pushes coming from the main thread */
+};
+
+
+#ifdef STARPU_HAVE_HWLOC
+
+/* Return a worker to steal a task from. The worker is selected
+ * according to the proximity list built using the info on the
+ * architecture provided by hwloc: the first neighbour with a
+ * non-empty queue wins, so closer workers are preferred. */
+static unsigned select_victim_neighborhood(unsigned sched_ctx_id, int workerid)
+{
+
+	struct _starpu_lws_data *ws = (struct _starpu_lws_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
+
+	int nworkers = starpu_sched_ctx_get_nworkers(sched_ctx_id);
+
+	int i;
+	int neighbor;
+	for(i=0; i<nworkers; i++){
+		neighbor = ws->proxlist[workerid][i];
+		/* ntasks is read without the victim's lock: an estimation is
+		 * enough here, stealing re-checks under the lock */
+		int ntasks = ws->queue_array[neighbor]->ntasks;
+		
+		if (ntasks)
+			return neighbor;
+	}
+
+	/* No neighbour has queued work: fall back to our own queue */
+	return workerid;
+}
+#else
+/* Return a worker to steal a task from. The worker is selected
+ * in a round-robin fashion, starting right after the previous
+ * victim, and stopping after one full turn with no work found. */
+static unsigned select_victim_round_robin(unsigned sched_ctx_id)
+{
+	struct _starpu_lws_data *ws = (struct _starpu_lws_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
+	unsigned worker = ws->last_pop_worker;
+	unsigned nworkers = starpu_sched_ctx_get_nworkers(sched_ctx_id);
+
+	/* If the worker's queue is empty, let's try
+	 * the next ones */
+	while (1)
+	{
+		unsigned ntasks;
+
+		/* ntasks is read without the victim's lock: an estimation is
+		 * enough here, the actual steal re-checks under the lock.
+		 * (The previous version also fetched the victim's sched
+		 * mutex/cond here without ever using them — dead work.) */
+		ntasks = ws->queue_array[worker]->ntasks;
+		if (ntasks)
+			break;
+
+		worker = (worker + 1) % nworkers;
+		if (worker == ws->last_pop_worker)
+		{
+			/* We got back to the first worker,
+			 * don't go in infinite loop */
+			break;
+		}
+	}
+
+	ws->last_pop_worker = (worker + 1) % nworkers;
+
+	return worker;
+}
+
+
+#endif
+
+
+/**
+ * Return a worker on whose queue a task should be pushed.
+ * Selection is done in a round-robin fashion over the context's workers.
+ */
+static unsigned select_worker_round_robin(unsigned sched_ctx_id)
+{
+	struct _starpu_lws_data *ws = (struct _starpu_lws_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
+	unsigned worker = ws->last_push_worker;
+	unsigned nworkers = starpu_sched_ctx_get_nworkers(sched_ctx_id);
+	/* TODO: use an atomic update operation for this
+	 * (concurrent pushes may race on last_push_worker) */
+	ws->last_push_worker = (ws->last_push_worker + 1) % nworkers;
+
+	return worker;
+}
+
+
+/**
+ * Return a worker from which a task can be stolen.
+ * Uses the hwloc proximity list when available, plain round-robin otherwise.
+ */
+static inline unsigned select_victim(unsigned sched_ctx_id, int workerid)
+{
+
+#ifdef STARPU_HAVE_HWLOC
+	return select_victim_neighborhood(sched_ctx_id, workerid);
+#else
+	return select_victim_round_robin(sched_ctx_id);
+#endif
+}
+
+/**
+ * Return a worker on whose queue a task can be pushed. This is only
+ * needed when the push is done by the master (i.e. a non-worker thread).
+ */
+static inline unsigned select_worker(unsigned sched_ctx_id)
+{
+	return select_worker_round_robin(sched_ctx_id);
+}
+
+
+/* Pop a task for the calling worker: first from its own queue, then by
+ * stealing from a selected victim, then by re-checking its own queue
+ * (a task may have been pushed to it while its mutex was released).
+ * Entered with the worker's sched mutex held; returns with it held. */
+static struct starpu_task *lws_pop_task(unsigned sched_ctx_id)
+{
+	struct _starpu_lws_data *ws = (struct _starpu_lws_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
+
+	struct starpu_task *task = NULL;
+
+	int workerid = starpu_worker_get_id();
+
+	/* only workers may pop; the main thread never calls this */
+	STARPU_ASSERT(workerid != -1);
+
+	task = _starpu_fifo_pop_task(ws->queue_array[workerid], workerid);
+	if (task)
+	{
+		/* there was a local task */
+		return task;
+	}
+	starpu_pthread_mutex_t *worker_sched_mutex;
+	starpu_pthread_cond_t *worker_sched_cond;
+	starpu_worker_get_sched_condition(workerid, &worker_sched_mutex, &worker_sched_cond);
+
+	/* Note: Releasing this mutex before taking the victim mutex, to avoid interlock*/
+	STARPU_PTHREAD_MUTEX_UNLOCK(worker_sched_mutex);
+       
+
+	/* we need to steal someone's job */
+	unsigned victim = select_victim(sched_ctx_id, workerid);
+
+	starpu_pthread_mutex_t *victim_sched_mutex;
+	starpu_pthread_cond_t *victim_sched_cond;
+
+	starpu_worker_get_sched_condition(victim, &victim_sched_mutex, &victim_sched_cond);
+	STARPU_PTHREAD_MUTEX_LOCK(victim_sched_mutex);
+
+	task = _starpu_fifo_pop_task(ws->queue_array[victim], workerid);
+	if (task)
+	{
+		/* record the steal in the trace for offline analysis */
+		_STARPU_TRACE_WORK_STEALING(workerid, victim);
+	}
+
+	STARPU_PTHREAD_MUTEX_UNLOCK(victim_sched_mutex);
+
+	/* re-take our own mutex before touching our queue / returning */
+	STARPU_PTHREAD_MUTEX_LOCK(worker_sched_mutex);
+	if(!task)
+	{
+		/* the steal failed: something may have landed on our own
+		 * queue while we did not hold our mutex */
+		task = _starpu_fifo_pop_task(ws->queue_array[workerid], workerid);
+		if (task)
+		{
+			/* there was a local task */
+			return task;
+		}
+	}
+
+	return task;
+}
+
+/* Push a task on the local queue of the calling worker (or, when called
+ * from a non-worker thread, on the queue of a worker chosen round-robin),
+ * then wake the context's workers so one of them can pick it up. */
+static int lws_push_task(struct starpu_task *task)
+{
+	unsigned sched_ctx_id = task->sched_ctx;
+	struct _starpu_lws_data *ws = (struct _starpu_lws_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
+
+	int workerid = starpu_worker_get_id();
+
+	/* If the current thread is not a worker but
+	 * the main thread (-1), we find the better one to
+	 * put task on its queue */
+	if (workerid == -1)
+		workerid = select_worker(sched_ctx_id);
+
+	starpu_pthread_mutex_t *sched_mutex;
+	starpu_pthread_cond_t *sched_cond;
+	starpu_worker_get_sched_condition(workerid, &sched_mutex, &sched_cond);
+
+	/* the per-worker queue is protected by that worker's sched mutex */
+	STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
+
+	_starpu_fifo_push_task(ws->queue_array[workerid], task);
+
+	starpu_push_task_end(task);
+
+	STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
+
+#ifndef STARPU_NON_BLOCKING_DRIVERS
+	/* Blocking drivers sleep on their sched condition: signal every
+	 * worker of the context so a sleeping one can come steal the task. */
+	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
+	struct starpu_sched_ctx_iterator it;
+	int worker;	/* BUGFIX: was used below without being declared */
+	if(workers->init_iterator)
+		workers->init_iterator(workers, &it);
+	while(workers->has_next(workers, &it))
+	{
+		worker = workers->get_next(workers, &it);
+		/* renamed so we do not shadow the outer sched_mutex/sched_cond */
+		starpu_pthread_mutex_t *worker_sched_mutex;
+		starpu_pthread_cond_t *worker_sched_cond;
+		starpu_worker_get_sched_condition(worker, &worker_sched_mutex, &worker_sched_cond);
+		STARPU_PTHREAD_COND_SIGNAL(worker_sched_cond);
+	}
+#endif
+
+	return 0;
+}
+
+/* Policy callback: register 'nworkers' workers (listed in 'workerids')
+ * with the context: create one FIFO per worker and, when hwloc is
+ * available, precompute each worker's proximity list once so steals do
+ * not have to walk the hwloc tree every time. */
+static void lws_add_workers(unsigned sched_ctx_id, int *workerids,unsigned nworkers)
+{
+	struct _starpu_lws_data *ws = (struct _starpu_lws_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
+
+	unsigned i;
+	int workerid;
+
+	for (i = 0; i < nworkers; i++)
+	{
+		workerid = workerids[i];
+		starpu_sched_ctx_worker_shares_tasks_lists(workerid, sched_ctx_id);
+		ws->queue_array[workerid] = _starpu_create_fifo();
+
+		/* Tell helgrind that we are fine with getting outdated values,
+		 * this is just an estimation */
+		STARPU_HG_DISABLE_CHECKING(ws->queue_array[workerid]->ntasks);
+
+		ws->queue_array[workerid]->nprocessed = 0;
+		ws->queue_array[workerid]->ntasks = 0;
+	}
+
+
+#ifdef STARPU_HAVE_HWLOC
+	/* Build a proximity list for every worker. It is cheaper to
+	 * build this once and then use it for popping tasks rather
+	 * than traversing the hwloc tree every time a task must be
+	 * stolen */
+	ws->proxlist = (int**)malloc(nworkers*sizeof(int*));
+	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
+	/* with STARPU_WORKER_TREE collections, workerids holds the hwloc tree */
+	struct starpu_tree *tree = (struct starpu_tree*)workers->workerids;
+	for (i = 0; i < nworkers; i++)
+	{
+		workerid = workerids[i];
+		ws->proxlist[workerid] = (int*)malloc(nworkers*sizeof(int));
+		int bindid;
+		
+		struct starpu_tree *neighbour = NULL;
+		struct starpu_sched_ctx_iterator it;
+		if(workers->init_iterator)
+			workers->init_iterator(workers, &it);
+	
+		/* start the walk at this worker's own binding point in the tree */
+		bindid   = starpu_worker_get_bindid(workerid);
+		it.value = starpu_tree_get(tree, bindid);
+		int cnt = 0;
+		for(;;)
+		{
+			neighbour = (struct starpu_tree*)it.value;
+			/* NOTE(review): the inner 'workerids'/'nworkers' below shadow
+			 * the function parameters of the same names */
+			int workerids[STARPU_NMAXWORKERS];
+			int nworkers = _starpu_worker_get_workerids(neighbour->id, workerids);
+			int w;
+			for(w = 0; w < nworkers; w++)
+			{
+				/* only record workers of this context, each exactly once */
+				if(!it.visited[workerids[w]] && workers->present[workerids[w]])
+				{
+					ws->proxlist[workerid][cnt++] = workerids[w];
+					it.visited[workerids[w]] = 1;
+				}
+			}
+			if(!workers->has_next(workers, &it))
+				break;
+			it.value = it.possible_value;
+			it.possible_value = NULL;
+		} 
+	}
+#endif	
+}
+
+/* Policy callback: unregister workers from the context, destroying their
+ * FIFOs and (hwloc builds) freeing their proximity lists. */
+static void lws_remove_workers(unsigned sched_ctx_id, int *workerids, unsigned nworkers)
+{
+	struct _starpu_lws_data *ws = (struct _starpu_lws_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
+
+	unsigned i;
+	int workerid;
+
+	for (i = 0; i < nworkers; i++)
+	{
+		workerid = workerids[i];
+		_starpu_destroy_fifo(ws->queue_array[workerid]);
+#ifdef STARPU_HAVE_HWLOC
+		free(ws->proxlist[workerid]);
+#endif
+	}
+}
+
+/* Policy callback: set up the context's worker collection (a tree when
+ * hwloc is available, a flat list otherwise) and allocate the policy's
+ * per-context data. */
+static void lws_initialize_policy(unsigned sched_ctx_id)
+{
+#ifdef STARPU_HAVE_HWLOC
+	starpu_sched_ctx_create_worker_collection(sched_ctx_id, STARPU_WORKER_TREE);
+#else
+	starpu_sched_ctx_create_worker_collection(sched_ctx_id, STARPU_WORKER_LIST);
+#endif
+
+	struct _starpu_lws_data *ws = (struct _starpu_lws_data*)malloc(sizeof(struct _starpu_lws_data));
+	starpu_sched_ctx_set_policy_data(sched_ctx_id, (void*)ws);
+
+	ws->last_pop_worker = 0;
+	ws->last_push_worker = 0;
+
+	/* size the queue array on the global worker count, so it can be
+	 * indexed directly by workerid even for workers added later */
+	unsigned nw = starpu_worker_get_count();
+	ws->queue_array = (struct _starpu_fifo_taskq**)malloc(nw*sizeof(struct _starpu_fifo_taskq*));
+
+}
+	
+/* Policy callback: release the per-context data allocated by
+ * lws_initialize_policy. The individual FIFOs and proxlist rows are
+ * freed by lws_remove_workers; only the top-level arrays remain here.
+ * NOTE(review): assumes remove_workers was called for every worker
+ * beforehand, otherwise rows of proxlist would leak — confirm. */
+static void lws_deinit_policy(unsigned sched_ctx_id)
+{
+	struct _starpu_lws_data *ws = (struct _starpu_lws_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
+
+	free(ws->queue_array);
+#ifdef STARPU_HAVE_HWLOC
+	free(ws->proxlist);
+#endif
+	free(ws);
+	starpu_sched_ctx_delete_worker_collection(sched_ctx_id);
+}
+
+/* Registration of the locality work-stealing policy.
+ * NOTE(review): registered under the name "nws" ("new work stealing")
+ * although the file is locality_work_stealing_policy.c — confirm the
+ * intended user-facing name. */
+struct starpu_sched_policy _starpu_sched_lws_policy =
+{
+	.init_sched = lws_initialize_policy,
+	.deinit_sched = lws_deinit_policy,
+	.add_workers = lws_add_workers,
+	.remove_workers = lws_remove_workers,
+	.push_task = lws_push_task,
+	.pop_task = lws_pop_task,
+	.pre_exec_hook = NULL,
+	.post_exec_hook = NULL,
+	.pop_every_task = NULL,
+	.policy_name = "nws",
+	.policy_description = "new work stealing"
+};

+ 2 - 2
src/starpu_parameters.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2011  Université de Bordeaux 1
+ * Copyright (C) 2011, 2014  Université de Bordeaux 1
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -22,7 +22,7 @@
 
 
 /* How many executions a codelet will have to be measured before we
 /* How many executions a codelet will have to be measured before we
  * consider that calibration will provide a value good enough for scheduling */
  * consider that calibration will provide a value good enough for scheduling */
-#define _STARPU_CALIBRATION_MINIMUM 10
+#define _STARPU_CALIBRATION_MINIMUM ((unsigned) starpu_get_env_number_default("STARPU_CALIBRATE_MINIMUM", 10))
 
 
 /* Assumed relative performance ratios */
 /* Assumed relative performance ratios */
 /* TODO: benchmark a bit instead */
 /* TODO: benchmark a bit instead */

+ 65 - 3
src/worker_collection/worker_list.c

@@ -42,6 +42,30 @@ static int list_get_next(struct starpu_worker_collection *workers, struct starpu
 	return ret;
 	return ret;
 }
 }
 
 
+/* Return non-zero while the iterator has not yet visited every master
+ * worker of the collection; resets the cursor to 0 once exhausted, so
+ * the same iterator can be reused for another pass. */
+static unsigned list_has_next_master(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it)
+{
+	int nworkers = workers->nmasters;
+	STARPU_ASSERT(it != NULL);
+
+	unsigned ret = it->cursor < nworkers ;
+
+	if(!ret) it->cursor = 0;
+
+	return ret;
+}
+
+/* Return the workerid of the next master worker and advance the cursor.
+ * Callers must check list_has_next_master first; the cursor bound is
+ * asserted here. */
+static int list_get_next_master(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it)
+{
+	int *workerids = (int *)workers->masters;
+	int nworkers = (int)workers->nmasters;
+
+	STARPU_ASSERT_MSG(it->cursor < nworkers, "cursor %d nworkers %d\n", it->cursor, nworkers);
+
+	int ret = workerids[it->cursor++];
+
+	return ret;
+}
+
 static unsigned _worker_belongs_to_ctx(struct starpu_worker_collection *workers, int workerid)
 static unsigned _worker_belongs_to_ctx(struct starpu_worker_collection *workers, int workerid)
 {
 {
 	int *workerids = (int *)workers->workerids;
 	int *workerids = (int *)workers->workerids;
@@ -108,9 +132,12 @@ static int list_remove(struct starpu_worker_collection *workers, int worker)
 {
 {
 	int *workerids = (int *)workers->workerids;
 	int *workerids = (int *)workers->workerids;
 	unsigned nworkers = workers->nworkers;
 	unsigned nworkers = workers->nworkers;
+
+	int *masters = (int *)workers->masters;
+	unsigned nmasters = workers->nmasters;
 	
 	
-	int found_worker = -1;
 	unsigned i;
 	unsigned i;
+	int found_worker = -1;
 	for(i = 0; i < nworkers; i++)
 	for(i = 0; i < nworkers; i++)
 	{
 	{
 		if(workerids[i] == worker)
 		if(workerids[i] == worker)
@@ -125,13 +152,29 @@ static int list_remove(struct starpu_worker_collection *workers, int worker)
 	if(found_worker != -1)
 	if(found_worker != -1)
 		workers->nworkers--;
 		workers->nworkers--;
 
 
+	int found_master = -1;
+	for(i = 0; i < nmasters; i++)
+	{
+		if(masters[i] == worker)
+		{
+			masters[i] = -1;
+			found_master = worker;
+			break;
+		}
+	}
+
+	_rearange_workerids(masters, nmasters);
+	if(found_master != -1)
+		workers->nmasters--;
+	printf("rem %d\n", found_worker);
 	return found_worker;
 	return found_worker;
 }
 }
 
 
 static void _init_workers(int *workerids)
 static void _init_workers(int *workerids)
 {
 {
 	unsigned i;
 	unsigned i;
-	for(i = 0; i < STARPU_NMAXWORKERS; i++)
+	int nworkers = starpu_worker_get_count();
+	for(i = 0; i < nworkers; i++)
 		workerids[i] = -1;
 		workerids[i] = -1;
 	return;
 	return;
 }
 }
@@ -139,10 +182,14 @@ static void _init_workers(int *workerids)
 static void list_init(struct starpu_worker_collection *workers)
 static void list_init(struct starpu_worker_collection *workers)
 {
 {
 	int *workerids = (int*)malloc(STARPU_NMAXWORKERS * sizeof(int));
 	int *workerids = (int*)malloc(STARPU_NMAXWORKERS * sizeof(int));
+	int *masters = (int*)malloc(STARPU_NMAXWORKERS * sizeof(int));
 	_init_workers(workerids);
 	_init_workers(workerids);
+	_init_workers(masters);
 
 
 	workers->workerids = (void*)workerids;
 	workers->workerids = (void*)workerids;
 	workers->nworkers = 0;
 	workers->nworkers = 0;
+	workers->masters = (void*)masters;
+	workers->nmasters = 0;
 
 
 	return;
 	return;
 }
 }
@@ -150,17 +197,32 @@ static void list_init(struct starpu_worker_collection *workers)
 static void list_deinit(struct starpu_worker_collection *workers)
 static void list_deinit(struct starpu_worker_collection *workers)
 {
 {
 	free(workers->workerids);
 	free(workers->workerids);
+	free(workers->masters);
 }
 }
 
 
-static void list_init_iterator(struct starpu_worker_collection *workers STARPU_ATTRIBUTE_UNUSED, struct starpu_sched_ctx_iterator *it)
+static void list_init_iterator(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it)
 {
 {
 	it->cursor = 0;
 	it->cursor = 0;
+
+	int *workerids = (int *)workers->workerids;
+	unsigned nworkers = workers->nworkers;
+	unsigned i;
+	int nm = 0;
+	for(i = 0;  i < nworkers; i++)
+	{
+		if(!starpu_worker_is_slave(workerids[i]))
+			((int*)workers->masters)[nm++] = workerids[i];
+	}
+	workers->nmasters = nm;
+
 }
 }
 
 
 struct starpu_worker_collection worker_list =
 struct starpu_worker_collection worker_list =
 {
 {
 	.has_next = list_has_next,
 	.has_next = list_has_next,
 	.get_next = list_get_next,
 	.get_next = list_get_next,
+	.has_next_master = list_has_next_master,
+	.get_next_master = list_get_next_master,
 	.add = list_add,
 	.add = list_add,
 	.remove = list_remove,
 	.remove = list_remove,
 	.init = list_init,
 	.init = list_init,

+ 84 - 4
src/worker_collection/worker_tree.c

@@ -89,6 +89,75 @@ static int tree_get_next(struct starpu_worker_collection *workers, struct starpu
 	return ret;
 	return ret;
 }
 }
 
 
+static unsigned tree_has_next_master(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it)
+{
+	STARPU_ASSERT(it != NULL);
+	if(workers->nworkers == 0)
+		return 0;
+
+	struct starpu_tree *tree = (struct starpu_tree*)workers->workerids;
+	struct starpu_tree *neighbour = starpu_tree_get_neighbour(tree, (struct starpu_tree*)it->value, it->visited, workers->is_master);
+	
+	if(!neighbour)
+	{
+		starpu_tree_reset_visited(tree, it->visited);
+		it->value = NULL;
+		it->possible_value = NULL;
+		return 0;
+	}
+	int id = -1;
+	int workerids[STARPU_NMAXWORKERS];
+	int nworkers = _starpu_worker_get_workerids(neighbour->id, workerids);
+	int w;
+	for(w = 0; w < nworkers; w++)
+	{
+		if(!it->visited[workerids[w]] && workers->is_master[workerids[w]])
+		{
+			id = workerids[w];
+			it->possible_value = neighbour;
+		}
+	}
+
+	STARPU_ASSERT_MSG(id != -1, "bind id (%d) for workerid (%d) not correct", neighbour->id, id);
+
+	return 1;
+}
+
+static int tree_get_next_master(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it)
+{
+	int ret = -1;
+	
+	struct starpu_tree *tree = (struct starpu_tree *)workers->workerids;
+	struct starpu_tree *neighbour = NULL;
+	if(it->possible_value)
+	{
+		neighbour = it->possible_value;
+		it->possible_value = NULL;
+	}
+	else
+		neighbour = starpu_tree_get_neighbour(tree, (struct starpu_tree*)it->value, it->visited, workers->is_master);
+	
+	STARPU_ASSERT_MSG(neighbour, "no element anymore");
+	
+	
+	int workerids[STARPU_NMAXWORKERS];
+	int nworkers = _starpu_worker_get_workerids(neighbour->id, workerids);
+	int w;
+	for(w = 0; w < nworkers; w++)
+	{
+		if(!it->visited[workerids[w]] && workers->is_master[workerids[w]])
+		{
+			ret = workerids[w];
+			it->visited[workerids[w]] = 1;
+			it->value = neighbour;
+		}
+	}
+	STARPU_ASSERT_MSG(ret != -1, "bind id not correct");
+
+	return ret;
+}
+
+
 static int tree_add(struct starpu_worker_collection *workers, int worker)
 static int tree_add(struct starpu_worker_collection *workers, int worker)
 {
 {
 	struct starpu_tree *tree = (struct starpu_tree *)workers->workerids;
 	struct starpu_tree *tree = (struct starpu_tree *)workers->workerids;
@@ -111,6 +180,7 @@ static int tree_remove(struct starpu_worker_collection *workers, int worker)
 	if(workers->present[worker])
 	if(workers->present[worker])
 	{
 	{
 		workers->present[worker] = 0;
 		workers->present[worker] = 0;
+		workers->is_master[worker] = 0;
 		workers->nworkers--;
 		workers->nworkers--;
 		return worker;
 		return worker;
 	}
 	}
@@ -122,10 +192,14 @@ static void tree_init(struct starpu_worker_collection *workers)
 {
 {
 	workers->workerids = (void*)starpu_workers_get_tree();
 	workers->workerids = (void*)starpu_workers_get_tree();
 	workers->nworkers = 0;
 	workers->nworkers = 0;
-	
+
 	int i;
 	int i;
-	for(i = 0; i < STARPU_NMAXWORKERS; i++)
+	int nworkers = starpu_worker_get_count();
+	for(i = 0; i < nworkers; i++)
+	{
 		workers->present[i] = 0;
 		workers->present[i] = 0;
+		workers->is_master[i] = 0;
+	}
 	
 	
 	return;
 	return;
 }
 }
@@ -135,19 +209,25 @@ static void tree_deinit(struct starpu_worker_collection *workers)
 //	free(workers->workerids);
 //	free(workers->workerids);
 }
 }
 
 
-static void tree_init_iterator(struct starpu_worker_collection *workers STARPU_ATTRIBUTE_UNUSED, struct starpu_sched_ctx_iterator *it)
+static void tree_init_iterator(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it)
 {
 {
 	it->value = NULL;
 	it->value = NULL;
 	it->possible_value = NULL;
 	it->possible_value = NULL;
 	int i;
 	int i;
-	for(i = 0; i < STARPU_NMAXWORKERS; i++)
+	int nworkers = starpu_worker_get_count();
+	for(i = 0; i < nworkers; i++)
+	{
+		workers->is_master[i] = (workers->present[i] && !starpu_worker_is_slave(i));
 		it->visited[i] = 0;
 		it->visited[i] = 0;
+	}
 }
 }
 
 
 struct starpu_worker_collection worker_tree =
 struct starpu_worker_collection worker_tree =
 {
 {
 	.has_next = tree_has_next,
 	.has_next = tree_has_next,
 	.get_next = tree_get_next,
 	.get_next = tree_get_next,
+	.has_next_master = tree_has_next_master,
+	.get_next_master = tree_get_next_master,
 	.add = tree_add,
 	.add = tree_add,
 	.remove = tree_remove,
 	.remove = tree_remove,
 	.init = tree_init,
 	.init = tree_init,

+ 1 - 0
tests/datawizard/commute.c

@@ -171,6 +171,7 @@ int main(int argc, char **argv)
 		test(STARPU_R, STARPU_RW, i);
 		test(STARPU_R, STARPU_RW, i);
 	}
 	}
 
 
+	starpu_data_unregister(x_handle);
 	starpu_shutdown();
 	starpu_shutdown();
 	STARPU_RETURN(0);
 	STARPU_RETURN(0);
 
 

+ 2 - 2
tests/heat/dmda.sh

@@ -2,7 +2,7 @@
 
 
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # 
 # 
-# Copyright (C) 2009, 2010  Université de Bordeaux 1
+# Copyright (C) 2009, 2010, 2014  Université de Bordeaux 1
 # Copyright (C) 2010  Centre National de la Recherche Scientifique
 # Copyright (C) 2010  Centre National de la Recherche Scientifique
 # 
 # 
 # StarPU is free software; you can redistribute it and/or modify
 # StarPU is free software; you can redistribute it and/or modify
@@ -52,7 +52,7 @@ export STARPU_PERF_MODEL_DIR=$SAMPLINGDIR
 mkdir -p $TIMINGDIR
 mkdir -p $TIMINGDIR
 mkdir -p $SAMPLINGDIR
 mkdir -p $SAMPLINGDIR
 
 
-#schedlist="ws no-prio greedy prio dm random"
+#schedlist="ws lws no-prio greedy prio dm random"
 #schedlist="random random random random"
 #schedlist="random random random random"
 
 
 export STARPU_NCUDA=3
 export STARPU_NCUDA=3

+ 5 - 3
tests/heat/gflops_sched.gp

@@ -3,7 +3,7 @@
 
 
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # 
 # 
-# Copyright (C) 2008, 2009  Université de Bordeaux 1
+# Copyright (C) 2008, 2009, 2014  Université de Bordeaux 1
 # Copyright (C) 2010  Centre National de la Recherche Scientifique
 # Copyright (C) 2010  Centre National de la Recherche Scientifique
 # 
 # 
 # StarPU is free software; you can redistribute it and/or modify
 # StarPU is free software; you can redistribute it and/or modify
@@ -30,7 +30,8 @@ set key right bottom
 set datafile missing 'x'
 set datafile missing 'x'
 plot "timings/gflops.merged.data" usi 1:(2*$1*$1*$1 / (3*$2* 1000000)) with linespoint title "greedy"  ,\
 plot "timings/gflops.merged.data" usi 1:(2*$1*$1*$1 / (3*$2* 1000000)) with linespoint title "greedy"  ,\
      "timings/gflops.merged.data" usi 1:(2*$1*$1*$1 / (3*$4* 1000000)) with linespoint title "prio" 	    ,\
      "timings/gflops.merged.data" usi 1:(2*$1*$1*$1 / (3*$4* 1000000)) with linespoint title "prio" 	    ,\
-     "timings/gflops.merged.data" usi 1:(2*$1*$1*$1 / (3*$6* 1000000)) with linespoint title "ws" 
+     "timings/gflops.merged.data" usi 1:(2*$1*$1*$1 / (3*$4* 1000000)) with linespoint title "ws" 	    ,\
+     "timings/gflops.merged.data" usi 1:(2*$1*$1*$1 / (3*$6* 1000000)) with linespoint title "lws" 
 
 
 set output "gflops_sched_gain.eps"
 set output "gflops_sched_gain.eps"
 set title "LU Decomposition : scheduling strategies : gain"
 set title "LU Decomposition : scheduling strategies : gain"
@@ -43,4 +44,5 @@ set logscale x
 set key right bottom
 set key right bottom
 set datafile missing 'x'
 set datafile missing 'x'
 plot "timings/gflops.merged.data" usi 1:(100*(($2 / $4)-1)) with linespoint title "gain prio"	,\
 plot "timings/gflops.merged.data" usi 1:(100*(($2 / $4)-1)) with linespoint title "gain prio"	,\
-	"timings/gflops.merged.data" usi 1:(100*(($2 / $6)-1)) with linespoint title "gain ws"    
+	"timings/gflops.merged.data" usi 1:(100*(($2 / $6)-1)) with linespoint title "gain ws"    ,\
+	"timings/gflops.merged.data" usi 1:(100*(($2 / $6)-1)) with linespoint title "gain lws"    

+ 10 - 1
tests/heat/gflops_sched.sh

@@ -2,7 +2,7 @@
 
 
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # 
 # 
-# Copyright (C) 2008, 2009, 2010  Université de Bordeaux 1
+# Copyright (C) 2008, 2009, 2010, 2014  Université de Bordeaux 1
 # Copyright (C) 2010  Centre National de la Recherche Scientifique
 # Copyright (C) 2010  Centre National de la Recherche Scientifique
 # 
 # 
 # StarPU is free software; you can redistribute it and/or modify
 # StarPU is free software; you can redistribute it and/or modify
@@ -137,6 +137,15 @@ do
 done
 done
 
 
 
 
+filename=$TIMINGDIR/gflops.lws.data
+policy=lws
+trace_header 
+for size in $sizelist
+do
+	trace_size $size;
+done
+
+
 filename=$TIMINGDIR/gflops.noprio.data
 filename=$TIMINGDIR/gflops.noprio.data
 policy=no-prio
 policy=no-prio
 trace_header 
 trace_header 

+ 2 - 2
tests/heat/granularity.r

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 #
-# Copyright (C) 2010  Université de Bordeaux 1
+# Copyright (C) 2010, 2014  Université de Bordeaux 1
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 #
 #
 # StarPU is free software; you can redistribute it and/or modify
 # StarPU is free software; you can redistribute it and/or modify
@@ -19,7 +19,7 @@ max <- 28
 maxy <- 400
 maxy <- 400
 
 
 sizelist <- seq(2048, max*1024, 64);
 sizelist <- seq(2048, max*1024, 64);
-#schedlist <- c("greedy", "prio", "dm", "random", "no-prio", "ws");
+#schedlist <- c("greedy", "prio", "dm", "random", "no-prio", "ws", "lws");
 #schedlist <- c("greedy", "prio", "dm", "random");
 #schedlist <- c("greedy", "prio", "dm", "random");
 # grainlist <- c(64, 128, 256, 512, 768, 1024, 1280, 1536, 2048);
 # grainlist <- c(64, 128, 256, 512, 768, 1024, 1280, 1536, 2048);
 grainlist <- c(256, 512, 1024, 2048);
 grainlist <- c(256, 512, 1024, 2048);

+ 2 - 2
tests/heat/granularity_model.r

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 #
-# Copyright (C) 2010  Université de Bordeaux 1
+# Copyright (C) 2010, 2014  Université de Bordeaux 1
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 #
 #
 # StarPU is free software; you can redistribute it and/or modify
 # StarPU is free software; you can redistribute it and/or modify
@@ -17,7 +17,7 @@
 max <- 30
 max <- 30
 
 
 sizelist <- seq(64, max*1024, 64);
 sizelist <- seq(64, max*1024, 64);
-#schedlist <- c("greedy", "prio", "dm", "random", "no-prio", "ws");
+#schedlist <- c("greedy", "prio", "dm", "random", "no-prio", "ws", "lws");
 #schedlist <- c("greedy", "prio", "dm", "random");
 #schedlist <- c("greedy", "prio", "dm", "random");
 #grainlist <- c(256, 512, 1024)
 #grainlist <- c(256, 512, 1024)
 grainlist <- c(512, 1024)
 grainlist <- c(512, 1024)

+ 2 - 2
tests/heat/model.r

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 #
-# Copyright (C) 2010  Université de Bordeaux 1
+# Copyright (C) 2010, 2014  Université de Bordeaux 1
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 #
 #
 # StarPU is free software; you can redistribute it and/or modify
 # StarPU is free software; you can redistribute it and/or modify
@@ -15,7 +15,7 @@
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
 
 sizelist <- seq(2048, 24576, 2048);
 sizelist <- seq(2048, 24576, 2048);
-#schedlist <- c("greedy", "prio", "dm", "random", "no-prio", "ws");
+#schedlist <- c("greedy", "prio", "dm", "random", "no-prio", "ws", "lws");
 schedlist <- c("prio", "dm", "random");
 schedlist <- c("prio", "dm", "random");
 
 
 print(schedlist);
 print(schedlist);

+ 4 - 3
tests/heat/random.r

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 #
-# Copyright (C) 2010  Université de Bordeaux 1
+# Copyright (C) 2010, 2014  Université de Bordeaux 1
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 #
 #
 # StarPU is free software; you can redistribute it and/or modify
 # StarPU is free software; you can redistribute it and/or modify
@@ -15,7 +15,7 @@
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
 
 sizelist <- seq(2048, 24576, 2048);
 sizelist <- seq(2048, 24576, 2048);
-#schedlist <- c("greedy", "prio", "dm", "random", "no-prio", "ws");
+#schedlist <- c("greedy", "prio", "dm", "random", "no-prio", "ws", "lws");
 schedlist <- c("prio","random");
 schedlist <- c("prio","random");
 
 
 print(schedlist);
 print(schedlist);
@@ -97,13 +97,14 @@ display_sched <- function()
 	trace_sched("prio", "red", 4);
 	trace_sched("prio", "red", 4);
 	#trace_sched("no-prio", "black");
 	#trace_sched("no-prio", "black");
 	#trace_sched("ws", "purple");
 	#trace_sched("ws", "purple");
+	#trace_sched("lws", "purple");
 
 
 	axis(1, at=sizelist)
 	axis(1, at=sizelist)
 	axis(2, at=seq(0, 100, 10), tck=1)
 	axis(2, at=seq(0, 100, 10), tck=1)
 #	axis(4, at=seq(0, 100, 10))
 #	axis(4, at=seq(0, 100, 10))
 	box(bty="u")
 	box(bty="u")
 
 
-        #labels <- c("greedy", "priority", "model", "random", "black", "ws")
+        #labels <- c("greedy", "priority", "model", "random", "black", "ws", "lws")
 #        labels <- c("greedy", "priority", "model", "random")
 #        labels <- c("greedy", "priority", "model", "random")
 	#labels <- c("model", "weighted random", "greedy", "priority")
 	#labels <- c("model", "weighted random", "greedy", "priority")
 	labels <- c("weighted random", "priority")
 	labels <- c("weighted random", "priority")

+ 2 - 2
tests/heat/sched.r

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 #
-# Copyright (C) 2010  Université de Bordeaux 1
+# Copyright (C) 2010, 2014  Université de Bordeaux 1
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 #
 #
 # StarPU is free software; you can redistribute it and/or modify
 # StarPU is free software; you can redistribute it and/or modify
@@ -15,7 +15,7 @@
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
 
 sizelist <- seq(2048, 24576, 2048);
 sizelist <- seq(2048, 24576, 2048);
-#schedlist <- c("greedy", "prio", "dm", "random", "no-prio", "ws");
+#schedlist <- c("greedy", "prio", "dm", "random", "no-prio", "ws", "lws");
 schedlist <- c("greedy", "prio", "dm", "random");
 schedlist <- c("greedy", "prio", "dm", "random");
 
 
 print(schedlist);
 print(schedlist);

+ 2 - 2
tests/heat/sched.sh

@@ -2,7 +2,7 @@
 
 
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # 
 # 
-# Copyright (C) 2008, 2009, 2010  Université de Bordeaux 1
+# Copyright (C) 2008, 2009, 2010, 2014  Université de Bordeaux 1
 # Copyright (C) 2010  Centre National de la Recherche Scientifique
 # Copyright (C) 2010  Centre National de la Recherche Scientifique
 # 
 # 
 # StarPU is free software; you can redistribute it and/or modify
 # StarPU is free software; you can redistribute it and/or modify
@@ -94,7 +94,7 @@ export STARPU_PERF_MODEL_DIR=$SAMPLINGDIR
 mkdir -p $TIMINGDIR
 mkdir -p $TIMINGDIR
 mkdir -p $SAMPLINGDIR
 mkdir -p $SAMPLINGDIR
 
 
-#schedlist="ws no-prio greedy prio dm random"
+#schedlist="ws lws no-prio greedy prio dm random"
 #schedlist="random random random random"
 #schedlist="random random random random"
 
 
 export STARPU_NCUDA=3
 export STARPU_NCUDA=3

+ 5 - 2
tests/main/driver_api/init_run_deinit.c

@@ -49,8 +49,11 @@ run(struct starpu_task *task, struct starpu_driver *d)
 	int ret;
 	int ret;
 	ret = starpu_task_submit(task);
 	ret = starpu_task_submit(task);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
-	ret = starpu_driver_run_once(d);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_driver_run_once");
+	while (!starpu_task_finished(task))
+	{
+		ret = starpu_driver_run_once(d);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_driver_run_once");
+	}
 	ret = starpu_task_wait(task);
 	ret = starpu_task_wait(task);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait");
 }
 }

+ 3 - 1
tests/main/subgraph_repeat.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010, 2012-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2012-2014  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -164,6 +164,7 @@ int main(int argc, char **argv)
 	STARPU_ASSERT(*check_cnt == (4*loop_cnt));
 	STARPU_ASSERT(*check_cnt == (4*loop_cnt));
 
 
 	starpu_free(check_cnt);
 	starpu_free(check_cnt);
+	starpu_data_unregister(check_data);
 
 
 	starpu_shutdown();
 	starpu_shutdown();
 
 
@@ -179,6 +180,7 @@ enodev:
 	fprintf(stderr, "WARNING: No one can execute this task\n");
 	fprintf(stderr, "WARNING: No one can execute this task\n");
 	/* yes, we do not perform the computation but we did detect that no one
 	/* yes, we do not perform the computation but we did detect that no one
  	 * could perform the kernel, so this is not an error from StarPU */
  	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_data_unregister(check_data);
 	starpu_shutdown();
 	starpu_shutdown();
 	return STARPU_TEST_SKIPPED;
 	return STARPU_TEST_SKIPPED;
 }
 }

+ 3 - 1
tests/main/subgraph_repeat_regenerate.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2010-2014  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -168,6 +168,7 @@ int main(int argc, char **argv)
 	STARPU_ASSERT(*check_cnt == (4*loop_cnt));
 	STARPU_ASSERT(*check_cnt == (4*loop_cnt));
 
 
 	starpu_free(check_cnt);
 	starpu_free(check_cnt);
+	starpu_data_unregister(check_data);
 
 
 	starpu_shutdown();
 	starpu_shutdown();
 
 
@@ -183,6 +184,7 @@ enodev:
 	fprintf(stderr, "WARNING: No one can execute this task\n");
 	fprintf(stderr, "WARNING: No one can execute this task\n");
 	/* yes, we do not perform the computation but we did detect that no one
 	/* yes, we do not perform the computation but we did detect that no one
  	 * could perform the kernel, so this is not an error from StarPU */
  	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_data_unregister(check_data);
 	starpu_shutdown();
 	starpu_shutdown();
 	return STARPU_TEST_SKIPPED;
 	return STARPU_TEST_SKIPPED;
 }
 }

+ 2 - 1
tests/main/subgraph_repeat_regenerate_tag.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2010-2014  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -198,6 +198,7 @@ enodev:
 	fprintf(stderr, "WARNING: No one can execute this task\n");
 	fprintf(stderr, "WARNING: No one can execute this task\n");
 	/* yes, we do not perform the computation but we did detect that no one
 	/* yes, we do not perform the computation but we did detect that no one
  	 * could perform the kernel, so this is not an error from StarPU */
  	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_data_unregister(check_data);
 	starpu_shutdown();
 	starpu_shutdown();
 	return STARPU_TEST_SKIPPED;
 	return STARPU_TEST_SKIPPED;
 }
 }

+ 2 - 1
tests/main/subgraph_repeat_tag.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2010-2014  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -182,6 +182,7 @@ enodev:
 	fprintf(stderr, "WARNING: No one can execute this task\n");
 	fprintf(stderr, "WARNING: No one can execute this task\n");
 	/* yes, we do not perform the computation but we did detect that no one
 	/* yes, we do not perform the computation but we did detect that no one
  	 * could perform the kernel, so this is not an error from StarPU */
  	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_data_unregister(check_data);
 	starpu_shutdown();
 	starpu_shutdown();
 	return STARPU_TEST_SKIPPED;
 	return STARPU_TEST_SKIPPED;
 }
 }

+ 5 - 2
tests/perfmodels/feed.c

@@ -50,8 +50,11 @@ int main(int argc, char **argv)
 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 
-	 if(starpu_worker_get_count_by_type(STARPU_CUDA_WORKER) < 2)
+	 if (starpu_worker_get_count_by_type(STARPU_CUDA_WORKER) < 2)
+	 {
+		 starpu_shutdown();
 		 return STARPU_TEST_SKIPPED;
 		 return STARPU_TEST_SKIPPED;
+	 }
 
 
 	starpu_task_init(&task);
 	starpu_task_init(&task);
 	task.cl = &cl;
 	task.cl = &cl;
@@ -76,7 +79,7 @@ int main(int argc, char **argv)
 		arch.devid = 0;
 		arch.devid = 0;
 		starpu_perfmodel_update_history(&model, &task, &arch, 0, 0, measured_fast);
 		starpu_perfmodel_update_history(&model, &task, &arch, 0, 0, measured_fast);
 		starpu_perfmodel_update_history(&nl_model, &task, &arch, 0, 0, measured_fast);
 		starpu_perfmodel_update_history(&nl_model, &task, &arch, 0, 0, measured_fast);
-		
+
 		/* Simulate Slow GPU */
 		/* Simulate Slow GPU */
 		arch.devid = 1;
 		arch.devid = 1;
 		starpu_perfmodel_update_history(&model, &task, &arch, 0, 0, measured_slow);
 		starpu_perfmodel_update_history(&model, &task, &arch, 0, 0, measured_slow);

+ 3 - 1
tests/regression/profiles.in

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 #
-# Copyright (C) 2010  Université de Bordeaux 1
+# Copyright (C) 2010, 2014  Université de Bordeaux 1
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 #
 #
 # StarPU is free software; you can redistribute it and/or modify
 # StarPU is free software; you can redistribute it and/or modify
@@ -32,6 +32,8 @@ STARPU_NCUDA=1
 # Execution configuration
 # Execution configuration
 STARPU_SCHED=ws
 STARPU_SCHED=ws
 # Execution configuration
 # Execution configuration
+STARPU_SCHED=lws
+# Execution configuration
 STARPU_SCHED=prio
 STARPU_SCHED=prio
 # Execution configuration
 # Execution configuration
 STARPU_SCHED=no-prio
 STARPU_SCHED=no-prio

+ 5 - 1
tests/regression/regression_test.sh

@@ -2,7 +2,7 @@
 
 
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # 
 # 
-# Copyright (C) 2008, 2009, 2010  Université de Bordeaux 1
+# Copyright (C) 2008, 2009, 2010, 2014  Université de Bordeaux 1
 # Copyright (C) 2010  Centre National de la Recherche Scientifique
 # Copyright (C) 2010  Centre National de la Recherche Scientifique
 # 
 # 
 # StarPU is free software; you can redistribute it and/or modify
 # StarPU is free software; you can redistribute it and/or modify
@@ -65,6 +65,10 @@ echo "heat.ws.8k.v2"
 timing=`STARPU_SCHED="ws" $ROOTDIR/examples/heat/heat -ntheta 66 -nthick 130 -nblocks 8 -pin -v2 2> log`
 timing=`STARPU_SCHED="ws" $ROOTDIR/examples/heat/heat -ntheta 66 -nthick 130 -nblocks 8 -pin -v2 2> log`
 save_cov "heat.ws.8k.v2";
 save_cov "heat.ws.8k.v2";
 
 
+echo "heat.lws.8k.v2"
+timing=`STARPU_SCHED="lws" $ROOTDIR/examples/heat/heat -ntheta 66 -nthick 130 -nblocks 8 -pin -v2 2> log`
+save_cov "heat.lws.8k.v2";
+
 echo "heat.greedy.8k.v2"
 echo "heat.greedy.8k.v2"
 timing=`STARPU_SCHED="greedy" $ROOTDIR/examples/heat/heat -ntheta 66 -nthick 130 -nblocks 8 -pin -v2 2> log`
 timing=`STARPU_SCHED="greedy" $ROOTDIR/examples/heat/heat -ntheta 66 -nthick 130 -nblocks 8 -pin -v2 2> log`
 save_cov "heat.greedy.8k.v2";
 save_cov "heat.greedy.8k.v2";