Browse Source

- merge trunk

Olivier Aumage 11 years ago
parent
commit
f281168e46
73 changed files with 1526 additions and 292 deletions
  1. 1 0
      AUTHORS
  2. 71 47
      ChangeLog
  3. 11 1
      configure.ac
  4. 5 0
      doc/doxygen/chapters/08scheduling.doxy
  5. 2 2
      doc/doxygen/chapters/12online_performance_tools.doxy
  6. 3 0
      doc/doxygen/chapters/13offline_performance_tools.doxy
  7. 46 42
      doc/doxygen/chapters/16mpi_support.doxy
  8. 8 0
      doc/doxygen/chapters/40environment_variables.doxy
  9. 8 0
      doc/doxygen/chapters/41configure_options.doxy
  10. 5 0
      examples/Makefile.am
  11. 1 0
      examples/binary/binary.c
  12. 3 1
      examples/cpp/incrementer_cpp.cpp
  13. 212 0
      examples/sched_ctx/nested_sched_ctxs.c
  14. 6 9
      examples/sched_ctx/sched_ctx_without_sched_policy.c
  15. 1 0
      examples/worker_collections/worker_list_example.c
  16. 1 0
      include/starpu_config.h.in
  17. 8 0
      include/starpu_sched_ctx.h
  18. 2 0
      include/starpu_task.h
  19. 6 0
      include/starpu_thread.h
  20. 7 0
      include/starpu_worker.h
  21. 3 1
      mpi/include/starpu_mpi.h
  22. 61 45
      mpi/src/starpu_mpi.c
  23. 1 0
      src/Makefile.am
  24. 42 4
      src/common/fxt.h
  25. 27 1
      src/common/thread.c
  26. 9 0
      src/core/jobs.c
  27. 3 0
      src/core/jobs.h
  28. 66 2
      src/core/sched_ctx.c
  29. 3 0
      src/core/sched_ctx.h
  30. 26 2
      src/core/sched_policy.c
  31. 1 0
      src/core/sched_policy.h
  32. 21 3
      src/core/simgrid.c
  33. 7 0
      src/core/task.c
  34. 54 38
      src/core/workers.c
  35. 3 0
      src/core/workers.h
  36. 15 8
      src/datawizard/coherency.c
  37. 11 11
      src/datawizard/coherency.h
  38. 2 1
      src/datawizard/filters.c
  39. 2 1
      src/datawizard/interfaces/data_interface.c
  40. 5 1
      src/datawizard/reduction.c
  41. 2 4
      src/datawizard/user_interactions.c
  42. 109 7
      src/debug/traces/starpu_fxt.c
  43. 13 0
      src/debug/traces/starpu_paje.c
  44. 1 1
      src/drivers/cpu/driver_cpu.c
  45. 7 3
      src/drivers/cuda/driver_cuda.c
  46. 4 2
      src/drivers/driver_common/driver_common.c
  47. 2 2
      src/drivers/driver_common/driver_common.h
  48. 1 1
      src/drivers/mp_common/source_common.c
  49. 4 2
      src/drivers/opencl/driver_opencl.c
  50. 17 10
      src/sched_policies/deque_modeling_policy_data_aware.c
  51. 13 2
      src/sched_policies/eager_central_policy.c
  52. 373 0
      src/sched_policies/locality_work_stealing_policy.c
  53. 2 2
      src/starpu_parameters.h
  54. 65 3
      src/worker_collection/worker_list.c
  55. 84 4
      src/worker_collection/worker_tree.c
  56. 1 0
      tests/datawizard/commute.c
  57. 2 2
      tests/heat/dmda.sh
  58. 5 3
      tests/heat/gflops_sched.gp
  59. 10 1
      tests/heat/gflops_sched.sh
  60. 2 2
      tests/heat/granularity.r
  61. 2 2
      tests/heat/granularity_model.r
  62. 2 2
      tests/heat/model.r
  63. 4 3
      tests/heat/random.r
  64. 2 2
      tests/heat/sched.r
  65. 2 2
      tests/heat/sched.sh
  66. 5 2
      tests/main/driver_api/init_run_deinit.c
  67. 3 1
      tests/main/subgraph_repeat.c
  68. 3 1
      tests/main/subgraph_repeat_regenerate.c
  69. 2 1
      tests/main/subgraph_repeat_regenerate_tag.c
  70. 2 1
      tests/main/subgraph_repeat_tag.c
  71. 5 2
      tests/perfmodels/feed.c
  72. 3 1
      tests/regression/profiles.in
  73. 5 1
      tests/regression/regression_test.sh

+ 1 - 0
AUTHORS

@@ -1,6 +1,7 @@
 Simon Archipoff <simon.archipoff@etu.u-bordeaux1.fr>
 Cédric Augonnet <cedric.augonnet@inria.fr>
 William Braik <wbraik@gmail.com>
+Alfredo Buttari <alfredo.buttari@enseeiht.fr>
 Jérôme Clet-Ortega <jerome.clet-ortega@labri.fr>
 Nicolas Collin <nicolas.collin@inria.fr>
 Ludovic Courtès <ludovic.courtes@inria.fr>

+ 71 - 47
ChangeLog

@@ -17,24 +17,6 @@
 StarPU 1.2.0 (svn revision xxxx)
 ==============================================
 
-Small features:
-  * New function starpu_sched_ctx_display_workers() to display worker
-    information belonging to a given scheduler context
-  * The option --enable-verbose can be called with
-    --enable-verbose=extra to increase the verbosity
-
-StarPU 1.1.2 (svn revision xxxx)
-==============================================
-The scheduling context release
-
-New features:
-  * The reduction init codelet is automatically used to initialize temporary
-    buffers.
-
-StarPU 1.1.1 (svn revision 12638)
-==============================================
-The scheduling context release
-
 New features:
   * Xeon Phi support
   * SCC support
@@ -48,46 +30,88 @@ New features:
 	  before the corresponding data, which allows the receiver to
 	  allocate data correctly, and to submit the matching receive of
 	  the envelope.
+        - New function
+   	  starpu_mpi_irecv_detached_sequential_consistency which
+	  allows to enable or disable the sequential consistency for
+	  the given data handle (sequential consistency will be
+	  enabled or disabled based on the value of the function
+	  parameter and the value of the sequential consistency
+	  defined for the given data)
+        - New functions starpu_mpi_task_build() and
+  	  starpu_mpi_task_post_build()
   * New STARPU_COMMUTE flag which can be passed along STARPU_W or STARPU_RW to
     let starpu commute write accesses.
   * Out-of-core support, through registration of disk areas as additional memory
     nodes.
-  * StarPU-MPI: new function
-    starpu_mpi_irecv_detached_sequential_consistency which allows to
-    enable or disable the sequential consistency for the given data
-    handle (sequential consistency will be enabled or disabled based
-    on the value of the function parameter and the value of the
-    sequential consistency defined for the given data)
-  * New functions starpu_mpi_task_build() and starpu_mpi_task_post_build()
-  * New functions starpu_pause() and starpu_resume()
-  * New codelet specific_nodes field to specify explicit target nodes for data.
-  * Use streams for all CUDA transfers, even initiated by CPUs.
   * Add STARPU_CUDA_ASYNC and STARPU_OPENCL_ASYNC flags to allow asynchronous
     CUDA and OpenCL kernel execution.
-  * Add paje traces statistics tools.
   * Add CUDA concurrent kernel execution support through
     the STARPU_NWORKER_PER_CUDA environment variable.
-  * Use streams for GPUA->GPUB and GPUB->GPUA transfers.
 
 Small features:
+  * Tasks can now have a name (via the field const char *name of
+    struct starpu_task)
   * New functions starpu_data_acquire_cb_sequential_consistency() and
     starpu_data_acquire_on_node_cb_sequential_consistency() which allows
     to enable or disable sequential consistency
   * New configure option --enable-fxt-lock which enables additional
     trace events focused on locks behaviour during the execution
-  * New function starpu_perfmodel_directory() to print directory
-    storing performance models. Available through the new option -d of
-    the tool starpu_perfmodel_display
-  * New batch files to execute StarPU applications under Microsoft
-    Visual Studio (They are installed in path_to_starpu/bin/msvc)/
   * Functions starpu_insert_task and starpu_mpi_insert_task are
     renamed in starpu_task_insert and starpu_mpi_task_insert. Old
     names are kept to avoid breaking old codes.
   * New configure option --enable-calibration-heuristic which allows
     the user to set the maximum authorized deviation of the
     history-based calibrator.
-  * Tasks can now have a name (via the field const char *name of
-    struct starpu_task)
+  * Allow application to provide the task footprint itself.
+  * New function starpu_sched_ctx_display_workers() to display worker
+    information belonging to a given scheduler context
+  * The option --enable-verbose can be called with
+    --enable-verbose=extra to increase the verbosity
+  * Add codelet size, footprint and tag id in the paje trace.
+
+Changes:
+  * Data interfaces (variable, vector, matrix and block) now define
+    pack and unpack functions
+  * StarPU-MPI: Fix for being able to receive data which have not yet
+    been registered by the application (i.e it did not call
+    starpu_data_set_tag(), data are received as a raw memory)
+  * StarPU-MPI: Fix for being able to receive data with the same tag
+    from several nodes (see mpi/tests/gather.c)
+
+Small changes:
+  * Rename function starpu_trace_user_event() as
+    starpu_fxt_trace_user_event()
+
+StarPU 1.1.2 (svn revision xxx)
+==============================================
+The scheduling context release
+
+New features:
+  * The reduction init codelet is automatically used to initialize temporary
+    buffers.
+  * Traces now include a "scheduling" state, to show the overhead of the
+    scheduler.
+  * Add STARPU_CALIBRATE_MINIMUM environment variable to specify the minimum
+    number of calibration measurements.
+
+StarPU 1.1.1 (svn revision 12638)
+==============================================
+The scheduling context release
+
+New features:
+  * MPI:
+        - New variable STARPU_MPI_CACHE_STATS to print statistics on
+   	  cache holding received data.
+        - New function starpu_mpi_data_register() which sets the rank
+  	  and tag of a data, and also allows to automatically clear
+	  the MPI communication cache when unregistering the data. It
+	  should be called instead of both calling
+	  starpu_data_set_tag() and starpu_data_set_rank()
+  * Use streams for all CUDA transfers, even initiated by CPUs.
+  * Add paje traces statistics tools.
+  * Use streams for GPUA->GPUB and GPUB->GPUA transfers.
+
+Small features:
   * New STARPU_EXECUTE_ON_WORKER flag to specify the worker on which
     to execute the task.
   * New STARPU_DISABLE_PINNING environment variable to disable host memory
@@ -97,23 +121,23 @@ Small features:
   * New starpu_memory_get_total function to get the size of a memory node.
   * New starpu_parallel_task_barrier_init_n function to let a scheduler decide
     a set of workers without going through combined workers.
-  * Allow application to provide the task footprint itself.
 
 Changes:
-  * Data interfaces (variable, vector, matrix and block) now define
-    pack und unpack functions
-  * StarPU-MPI: Fix for being able to receive data which have not yet
-    been registered by the application (i.e it did not call
-    starpu_data_set_tag(), data are received as a raw memory)
-  * StarPU-MPI: Fix for being able to receive data with the same tag
-    from several nodes (see mpi/tests/gather.c)
+  * Fix simgrid execution.
+  * Rename starpu_get_nready_tasks_of_sched_ctx to starpu_sched_ctx_get_nready_tasks
+  * Rename starpu_get_nready_flops_of_sched_ctx to starpu_sched_ctx_get_nready_flops
+  * New functions starpu_pause() and starpu_resume()
+  * New codelet specific_nodes field to specify explicit target nodes for data.
   * StarPU-MPI: Fix overzealous allocation of memory.
   * Interfaces: Allow interface implementation to change pointers at will, in
     unpack notably.
 
 Small changes:
-  * Rename function starpu_trace_user_event() as
-    starpu_fxt_trace_user_event()
+  * Use big fat abortions when one tries to make a task or callback
+    sleep, instead of just returning EDEADLCK which few people will test
+  * By default, StarPU FFT examples are not compiled and checked, the
+    configure option --enable-starpufft-examples needs to be specified
+    to change this behaviour.
 
 StarPU 1.1.0 (svn revision 11960)
 ==============================================

+ 11 - 1
configure.ac

@@ -975,16 +975,19 @@ AC_ARG_ENABLE(simgrid, [AS_HELP_STRING([--enable-simgrid],
 if test x$enable_simgrid = xyes ; then
    	if test -n "$SIMGRID_CFLAGS" ; then
 	   	CFLAGS="$SIMGRID_CFLAGS $CFLAGS"
+	   	CXXFLAGS="$SIMGRID_CFLAGS $CXXFLAGS"
 	fi
 	if test -n "$SIMGRID_LIBS" ; then
 		LDFLAGS="$SIMGRID_LIBS $LDFLAGS"
 	fi
 	if test "$simgrid_dir" != "no" ; then
 	   	CFLAGS="-I$simgrid_dir/include $CFLAGS"
+	   	CXXFLAGS="-I$simgrid_dir/include $CXXFLAGS"
 	   	LDFLAGS="-L$simgrid_dir/lib $LDFLAGS"
 	fi
 	if test "$simgrid_include_dir" != "no" ; then
 	   	CFLAGS="-I$simgrid_include_dir $CFLAGS"
+	   	CXXFLAGS="-I$simgrid_include_dir $CXXFLAGS"
 	fi
 	if test "$simgrid_lib_dir" != "no" ; then
 	   	LDFLAGS="-L$simgrid_lib_dir $LDFLAGS"
@@ -994,7 +997,8 @@ if test x$enable_simgrid = xyes ; then
 			AC_MSG_ERROR(Simgrid support needs simgrid installed)
 		]
 	)
-   	AC_CHECK_FUNCS([MSG_process_join])
+   	AC_CHECK_FUNCS([MSG_process_join MSG_get_as_by_name MSG_environment_get_routing_root])
+	AC_CHECK_FUNCS([xbt_barrier_init], [AC_DEFINE([STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT], [1], [Define to 1 if you have the `xbt_barrier_init' function.])])
 	AC_COMPILE_IFELSE([AC_LANG_PROGRAM(
 		    		[[#include <msg/msg.h>]],
 				[[msg_host_t foo; ]]
@@ -1478,6 +1482,12 @@ if test x$use_fxt = xyes; then
 	AC_CHECK_DECLS([fut_set_filename])
 	CFLAGS="$save_CFLAGS"
 
+        AC_ARG_ENABLE(paje-codelet-details, [AS_HELP_STRING([--enable-paje-codelet-details],
+			[enable details about codelets in the paje trace])],
+			enable_paje_codelet_details=$enableval, enable_paje_codelet_details=no)
+        if  test x$enable_paje_codelet_details = xyes; then
+        	AC_DEFINE(STARPU_ENABLE_PAJE_CODELET_DETAILS, [1], [enable details about codelets in the paje trace])
+        fi
 	##########################################
 	# Poti is a library to generate paje trace files
 	##########################################

+ 5 - 0
doc/doxygen/chapters/08scheduling.doxy

@@ -45,6 +45,11 @@ a task on the worker which released it by
 default. When a worker becomes idle, it steals a task from the most loaded
 worker.
 
+The <b>lws</b> (locality work stealing) scheduler uses a queue per worker, and schedules
+a task on the worker which released it by
+default. When a worker becomes idle, it steals a task from neighbour workers. It
+also takes into account priorities.
+
 The <b>dm</b> (deque model) scheduler uses task execution performance models into account to
 perform a HEFT-similar scheduling strategy: it schedules tasks where their
 termination time will be minimal. The difference with HEFT is that <b>dm</b>

+ 2 - 2
doc/doxygen/chapters/12online_performance_tools.doxy

@@ -389,11 +389,11 @@ parameters through starpu_hash_crc32c_be for instance.
 StarPU will automatically determine when the performance model is calibrated,
 or rather, it will assume the performance model is calibrated until the
 application submits a task for which the performance can not be predicted. For
-::STARPU_HISTORY_BASED, StarPU will require 10 (_STARPU_CALIBRATION_MINIMUM)
+::STARPU_HISTORY_BASED, StarPU will require 10 (STARPU_CALIBRATE_MINIMUM)
 measurements for a given size before estimating that an average can be taken as
 estimation for further executions with the same size. For
 ::STARPU_REGRESSION_BASED and ::STARPU_NL_REGRESSION_BASED, StarPU will require
-10 (_STARPU_CALIBRATION_MINIMUM) measurements, and that the minimum measured
+10 (STARPU_CALIBRATE_MINIMUM) measurements, and that the minimum measured
 data size is smaller than 90% of the maximum measured data size (i.e. the
 measurement interval is large enough for a regression to have a meaning).
 Calibration can also be forced by setting the \ref STARPU_CALIBRATE environment

+ 3 - 0
doc/doxygen/chapters/13offline_performance_tools.doxy

@@ -118,6 +118,9 @@ $ vite paje.trace
 
 To get names of tasks instead of "unknown", fill the optional
 starpu_codelet::name, or use a performance model for them.
+Details of the codelet execution can be obtained by passing
+<c>--enable-paje-codelet-details</c> and using a recent enough version of ViTE
+(at least r1430).
 
 In the MPI execution case, collect the trace files from the MPI nodes, and
 specify them all on the command <c>starpu_fxt_tool</c>, for instance:

+ 46 - 42
doc/doxygen/chapters/16mpi_support.doxy

@@ -121,49 +121,53 @@ automatically released. This mechanism is similar to the pthread
 detach state attribute which determines whether a thread will be
 created in a joinable or a detached state.
 
-For any communication, the call of the function will result in the
-creation of a StarPU-MPI request, the function
-starpu_data_acquire_cb() is then called to asynchronously request
-StarPU to fetch the data in main memory; when the data is available in
-main memory, a StarPU-MPI function is called to put the new request in
-the list of the ready requests if it is a send request, or in an
-hashmap if it is a receive request.
-
-Internally, all MPI communications submitted by StarPU uses a unique
-tag which has a default value, and can be accessed with the functions
+Internally, each communication is divided into two messages: a first
+message is used to exchange an envelope describing the data (i.e. its
+tag and its size), the data itself is sent in a second message. All
+MPI communications submitted by StarPU uses a unique tag which has a
+default value, and can be accessed with the functions
 starpu_mpi_get_communication_tag() and
-starpu_mpi_set_communication_tag().
-
-The matching of tags with corresponding requests is done into StarPU-MPI.
-To handle this, any communication is a double-communication based on a
-envelope + data system. Every data which will be sent needs to send an
-envelope which describes the data (particularly its tag) before sending
-the data, so the receiver can get the matching pending receive request
-from the hashmap, and submit it to recieve the data correctly.
-
-To this aim, the StarPU-MPI progression thread has a permanent-submitted
-request destined to receive incoming envelopes from all sources.
-
-The StarPU-MPI progression thread regularly polls this list of ready
-requests. For each new ready request, the appropriate function is
-called to post the corresponding MPI call. For example, calling
-starpu_mpi_isend() will result in posting <c>MPI_Isend</c>. If
-the request is marked as detached, the request will be put in the list
-of detached requests.
-
-The StarPU-MPI progression thread also polls the list of detached
-requests. For each detached request, it regularly tests the completion
-of the MPI request by calling <c>MPI_Test</c>. On completion, the data
-handle is released, and if a callback was defined, it is called.
-
-Finally, the StarPU-MPI progression thread checks if an envelope has
-arrived. If it is, it'll check if the corresponding receive has already
-been submitted by the application. If it is, it'll submit the request
-just as like as it does with those on the list of ready requests.
-If it is not, it'll allocate a temporary handle to store the data that
-will arrive just after, so as when the corresponding receive request
-will be submitted by the application, it'll copy this temporary handle
-into its one instead of submitting a new StarPU-MPI request.
+starpu_mpi_set_communication_tag(). The matching of tags with
+corresponding requests is done within StarPU-MPI.
+
+For any userland communication, the call of the corresponding function
+(e.g. starpu_mpi_isend()) will result in the creation of a StarPU-MPI
+request, the function starpu_data_acquire_cb() is then called to
+asynchronously request StarPU to fetch the data in main memory; when
+the data is ready and the corresponding buffer has already been
+received by MPI, it will be copied in the memory of the data,
+otherwise the request is stored in the <em>early requests list</em>. Sending
+requests are stored in the <em>ready requests list</em>.
+
+While requests need to be processed, the StarPU-MPI progression thread
+does the following:
+
+<ol>
+<li> it polls the <em>ready requests list</em>. For all the ready
+requests, the appropriate function is called to post the corresponding
+MPI call. For example, an initial call to starpu_mpi_isend() will
+result in a call to <c>MPI_Isend</c>. If the request is marked as
+detached, the request will then be added in the <em>detached requests
+list</em>.
+</li>
+<li> it posts a <c>MPI_Irecv()</c> to retrieve a data envelope.
+</li>
+<li> it polls the <em>detached requests list</em>. For all the detached
+requests, it tests the completion of the MPI request by calling
+<c>MPI_Test</c>. On completion, the data handle is released, and if a
+callback was defined, it is called.
+</li>
+<li> finally, it checks if a data envelope has been received. If so,
+if the data envelope matches a request in the <em>early requests list</em> (i.e
+the request has already been posted by the application), the
+corresponding MPI call is posted (similarly to the first step above).
+
+If the data envelope does not match any application request, a
+temporary handle is created to receive the data, a StarPU-MPI request
+is created and added into the <em>ready requests list</em>, and thus will be
+processed in the first step of the next loop.
+</li>
+</ol>
 
 \ref MPIPtpCommunication "Communication" gives the list of all the
 point to point communications defined in StarPU-MPI.

+ 8 - 0
doc/doxygen/chapters/40environment_variables.doxy

@@ -314,6 +314,14 @@ is the default behaviour.
 Note: this currently only applies to <c>dm</c> and <c>dmda</c> scheduling policies.
 </dd>
 
+<dt>STARPU_CALIBRATE_MINIMUM</dt>
+<dd>
+\anchor STARPU_CALIBRATE_MINIMUM
+\addindex __env__STARPU_CALIBRATE_MINIMUM
+This defines the minimum number of calibration measurements that will be made
+before considering that the performance model is calibrated. The default value is 10.
+</dd>
+
 <dt>STARPU_BUS_CALIBRATE</dt>
 <dd>
 \anchor STARPU_BUS_CALIBRATE

+ 8 - 0
doc/doxygen/chapters/41configure_options.doxy

@@ -372,6 +372,14 @@ Enable performance debugging through gprof.
 Enable performance model debugging.
 </dd>
 
+<dt>--enable-paje-codelet-details</dt>
+<dd>
+\anchor enable-paje-codelet-details
+\addindex __configure__--enable-paje-codelet-details
+Enable details about codelets in the paje trace. This requires a recent enough
+version of ViTE (at least r1430).
+</dd>
+
 <dt>--enable-fxt-lock</dt>
 <dd>
 \anchor enable-fxt-lock

+ 5 - 0
examples/Makefile.am

@@ -190,6 +190,7 @@ examplebin_PROGRAMS +=				\
 	sched_ctx/dummy_sched_with_ctx		\
 	sched_ctx/prio				\
 	sched_ctx/sched_ctx_without_sched_policy\
+	sched_ctx/nested_sched_ctxs		\
 	worker_collections/worker_tree_example  \
 	worker_collections/worker_list_example  \
 	reductions/dot_product			\
@@ -270,6 +271,7 @@ STARPU_EXAMPLES +=				\
 	sched_ctx/prio				\
 	sched_ctx/dummy_sched_with_ctx		\
 	sched_ctx/sched_ctx_without_sched_policy\
+	sched_ctx/nested_sched_ctxs		\
 	worker_collections/worker_tree_example  \
 	worker_collections/worker_list_example  \
 	reductions/dot_product			\
@@ -925,6 +927,9 @@ sched_ctx_parallel_code_CFLAGS = \
 sched_ctx_sched_ctx_without_sched_policy_CFLAGS = \
 	$(AM_CFLAGS) -fopenmp
 
+sched_ctx_nested_sched_ctxs_CFLAGS = \
+	$(AM_CFLAGS) -fopenmp
+
 endif
 
 showcheck:

+ 1 - 0
examples/binary/binary.c

@@ -29,6 +29,7 @@ struct starpu_codelet cl =
 {
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {opencl_codelet, NULL},
+	.opencl_flags = {STARPU_OPENCL_ASYNC},
 #endif
 	.nbuffers = 1,
 	.modes = {STARPU_RW}

+ 3 - 1
examples/cpp/incrementer_cpp.cpp

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010-2011, 2013  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2012 inria
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -61,9 +61,11 @@ int main(int argc, char **argv)
         cl.cpu_funcs[0] = cpu_codelet;
 #ifdef STARPU_USE_CUDA
         cl.cuda_funcs[0] = cuda_codelet;
+	cl.cuda_flags[0] = STARPU_CUDA_ASYNC;
 #endif
 #ifdef STARPU_USE_OPENCL
 	cl.opencl_funcs[0] = opencl_codelet;
+	cl.opencl_flags[0] = STARPU_OPENCL_ASYNC;
 #endif
         cl.nbuffers = 1;
         cl.modes[0] = STARPU_RW;

+ 212 - 0
examples/sched_ctx/nested_sched_ctxs.c

@@ -0,0 +1,212 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <omp.h>
+
+#ifdef STARPU_QUICK_CHECK
+#define NTASKS 64
+#else
+#define NTASKS 100
+#endif
+
+int tasks_executed[2];
+starpu_pthread_mutex_t mut;
+
+int parallel_code(int sched_ctx)
+{
+	int i;
+	int t = 0;
+	int *cpuids = NULL;
+	int ncpuids = 0;
+	starpu_sched_ctx_get_available_cpuids(sched_ctx, &cpuids, &ncpuids);
+
+//	printf("execute task of %d threads \n", ncpuids);
+#pragma omp parallel num_threads(ncpuids)
+	{
+		starpu_sched_ctx_bind_current_thread_to_cpuid(cpuids[omp_get_thread_num()]);
+// 			printf("cpu = %d ctx%d nth = %d\n", sched_getcpu(), sched_ctx, omp_get_num_threads());
+#pragma omp for
+		for(i = 0; i < NTASKS; i++)
+			t++;
+	}
+
+	free(cpuids);
+	return t;
+}
+
+static void sched_ctx_func(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg)
+{
+	int w = starpu_worker_get_id();
+	unsigned sched_ctx = (unsigned)arg;
+	int n = parallel_code(sched_ctx);
+//	printf("w %d executed %d it \n", w, n);
+}
+
+
+static struct starpu_codelet sched_ctx_codelet =
+{
+	.cpu_funcs = {sched_ctx_func, NULL},
+	.cuda_funcs = {NULL},
+	.opencl_funcs = {NULL},
+	.model = NULL,
+	.nbuffers = 0,
+	.name = "sched_ctx"
+};
+
+int main(int argc, char **argv)
+{
+	tasks_executed[0] = 0;
+	tasks_executed[1] = 0;
+	int ntasks = NTASKS;
+	int ret, j, k;
+	unsigned ncpus = 0;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	starpu_pthread_mutex_init(&mut, NULL);
+	int nprocs1 = 1;
+	int nprocs2 = 1;
+	int *procs1, *procs2;
+
+#ifdef STARPU_USE_CPU
+	ncpus =  starpu_cpu_worker_get_count();
+	procs1 = (int*)malloc(ncpus*sizeof(int));
+	starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, procs1, ncpus);
+
+	if (ncpus > 1)
+	{
+		nprocs1 = ncpus/2;
+		nprocs2 =  nprocs1;
+		k = 0;
+		procs2 = (int*)malloc(nprocs2*sizeof(int));
+		for(j = nprocs1; j < nprocs1+nprocs2; j++)
+			procs2[k++] = procs1[j];
+	}
+	else
+	{
+		procs2 = (int*)malloc(nprocs2*sizeof(int));
+		procs2[0] = procs1[0];
+	}
+#endif
+
+	if (ncpus == 0)
+	{
+#ifdef STARPU_USE_CPU
+		free(procs1);
+		free(procs2);
+#endif
+		starpu_shutdown();
+		return 77;
+	}
+
+	/*create contexts however you want*/
+	unsigned sched_ctx1 = starpu_sched_ctx_create(procs1, nprocs1, "ctx1", STARPU_SCHED_CTX_POLICY_NAME, "eager", 0);
+	unsigned sched_ctx2 = starpu_sched_ctx_create(procs2, nprocs2, "ctx2", STARPU_SCHED_CTX_POLICY_NAME, "dmda", 0);
+
+	/*indicate what to do with the resources when context 2 finishes (it depends on your application)*/
+//	starpu_sched_ctx_set_inheritor(sched_ctx2, sched_ctx1);
+
+	int nprocs3 = nprocs1/2;
+	int nprocs4 = nprocs1/2;
+	int nprocs5 = nprocs2/2;
+	int nprocs6 = nprocs2/2;
+	int procs3[nprocs3];
+	int procs4[nprocs4];
+	int procs5[nprocs5];
+	int procs6[nprocs6];
+
+	k = 0;
+	for(j = 0; j < nprocs3; j++)
+		procs3[k++] = procs1[j];
+	k = 0;
+	for(j = nprocs3; j < nprocs3+nprocs4; j++)
+		procs4[k++] = procs1[j];
+
+	k = 0;
+	for(j = 0; j < nprocs5; j++)
+		procs5[k++] = procs2[j];
+	k = 0;
+	for(j = nprocs5; j < nprocs5+nprocs6; j++)
+		procs6[k++] = procs2[j];
+
+	unsigned sched_ctx3 = starpu_sched_ctx_create(procs3, nprocs3, "ctx3", STARPU_SCHED_CTX_NESTED, sched_ctx1, 0);
+	unsigned sched_ctx4 = starpu_sched_ctx_create(procs4, nprocs4, "ctx4", STARPU_SCHED_CTX_NESTED, sched_ctx1, 0);
+
+	unsigned sched_ctx5 = starpu_sched_ctx_create(procs5, nprocs5, "ctx5", STARPU_SCHED_CTX_NESTED, sched_ctx2, 0);
+	unsigned sched_ctx6 = starpu_sched_ctx_create(procs6, nprocs6, "ctx6", STARPU_SCHED_CTX_NESTED, sched_ctx2, 0);
+
+
+	int i;
+	for (i = 0; i < ntasks; i++)
+	{
+		struct starpu_task *task = starpu_task_create();
+
+		task->cl = &sched_ctx_codelet;
+		task->cl_arg = sched_ctx1;
+
+		/*submit tasks to context*/
+		ret = starpu_task_submit_to_ctx(task,sched_ctx1);
+
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	for (i = 0; i < ntasks; i++)
+	{
+		struct starpu_task *task = starpu_task_create();
+
+		task->cl = &sched_ctx_codelet;
+		task->cl_arg = sched_ctx2;
+
+		/*submit tasks to context*/
+		ret = starpu_task_submit_to_ctx(task,sched_ctx2);
+
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+
+	/* tell starpu when you finished submitting tasks to this context
+	   in order to allow moving resources from this context to the inheritor one
+	   when its corresponding tasks finished executing */
+
+
+
+	/* wait for all tasks at the end*/
+	starpu_task_wait_for_all();
+
+	starpu_sched_ctx_delete(sched_ctx3);
+	starpu_sched_ctx_delete(sched_ctx4);
+
+	starpu_sched_ctx_delete(sched_ctx5);
+	starpu_sched_ctx_delete(sched_ctx6);
+
+	starpu_sched_ctx_delete(sched_ctx1);
+	starpu_sched_ctx_delete(sched_ctx2);
+
+	printf("ctx%d: tasks starpu executed %d out of %d\n", sched_ctx1, tasks_executed[0], NTASKS);
+	printf("ctx%d: tasks starpu executed %d out of %d\n", sched_ctx2, tasks_executed[1], NTASKS);
+
+#ifdef STARPU_USE_CPU
+	free(procs1);
+	free(procs2);
+#endif
+	starpu_shutdown();
+	return 0;
+}

+ 6 - 9
examples/sched_ctx/sched_ctx_without_sched_policy.c

@@ -88,7 +88,6 @@ int main(int argc, char **argv)
 #ifdef STARPU_USE_CPU
 	ncpus = starpu_cpu_worker_get_count();
 	procs1 = (int*)malloc(ncpus*sizeof(int));
-	procs2 = (int*)malloc(ncpus*sizeof(int));
 	starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, procs1, ncpus);
 
 	if(ncpus > 1)
@@ -96,22 +95,16 @@ int main(int argc, char **argv)
 		nprocs1 = ncpus/2;
 		nprocs2 =  ncpus-nprocs1;
 		k = 0;
+		procs2 = (int*)malloc(nprocs2*sizeof(int));
 		for(j = nprocs1; j < nprocs1+nprocs2; j++)
 			procs2[k++] = procs1[j];
 	}
 	else
 	{
-		procs1 = (int*)malloc(nprocs1*sizeof(int));
 		procs2 = (int*)malloc(nprocs2*sizeof(int));
-		procs1[0] = 0;
-		procs2[0] = 0;
+		procs2[0] = procs1[0];
 
 	}
-#else
-	procs1 = (int*)malloc(nprocs1*sizeof(int));
-	procs2 = (int*)malloc(nprocs2*sizeof(int));
-	procs1[0] = 0;
-	procs2[0] = 0;
 #endif
 
 	if (ncpus == 0) goto enodev;
@@ -163,6 +156,10 @@ int main(int argc, char **argv)
 	printf("ctx%d: tasks starpu executed %d out of %d\n", sched_ctx2, tasks_executed[1], NTASKS*NTASKS);
 
 enodev:
+#ifdef STARPU_USE_CPU
+	free(procs1);
+	free(procs2);
+#endif
 	starpu_shutdown();
 	return ncpus == 0 ? 77 : 0;
 }

+ 1 - 0
examples/worker_collections/worker_list_example.c

@@ -85,6 +85,7 @@ int main()
 
 	FPRINTF(stderr, "timing init = %lf \n", timing);
 	co->deinit(co);
+	free(co);
 	starpu_shutdown();
 
 	return 0;

+ 1 - 0
include/starpu_config.h.in

@@ -32,6 +32,7 @@
 #undef STARPU_OPENMP
 
 #undef STARPU_SIMGRID
+#undef STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT
 
 #undef STARPU_HAVE_ICC
 

+ 8 - 0
include/starpu_sched_ctx.h

@@ -29,6 +29,7 @@ extern "C"
 #define STARPU_SCHED_CTX_POLICY_MIN_PRIO	 (3<<16)
 #define STARPU_SCHED_CTX_POLICY_MAX_PRIO	 (4<<16)
 #define STARPU_SCHED_CTX_HIERARCHY_LEVEL         (5<<16)
+#define STARPU_SCHED_CTX_NESTED                  (6<<16)
 
 unsigned starpu_sched_ctx_create(int *workerids_ctx, int nworkers_ctx, const char *sched_ctx_name, ...);
 
@@ -127,6 +128,13 @@ int starpu_sched_ctx_book_workers_for_task(unsigned sched_ctx_id, int *workerids
 
 void starpu_sched_ctx_unbook_workers_for_task(unsigned sched_ctx_id, int master);
 
+/* return the first context (child of sched_ctx_id) where the workerid is master */
+unsigned starpu_sched_ctx_worker_is_master_for_child_ctx(int workerid, unsigned sched_ctx_id);
+
+void starpu_sched_ctx_revert_task_counters(unsigned sched_ctx_id, double flops);
+
+void starpu_sched_ctx_move_task_to_ctx(struct starpu_task *task, unsigned sched_ctx);
+
 #ifdef STARPU_USE_SC_HYPERVISOR
 void starpu_sched_ctx_call_pushed_task_cb(int workerid, unsigned sched_ctx_id);
 #endif /* STARPU_USE_SC_HYPERVISOR */

+ 2 - 0
include/starpu_task.h

@@ -255,6 +255,8 @@ void starpu_task_destroy(struct starpu_task *task);
 int starpu_task_submit(struct starpu_task *task) STARPU_WARN_UNUSED_RESULT;
 int starpu_task_submit_to_ctx(struct starpu_task *task, unsigned sched_ctx_id);
 
+int starpu_task_finished(struct starpu_task *task) STARPU_WARN_UNUSED_RESULT;
+
 int starpu_task_wait(struct starpu_task *task) STARPU_WARN_UNUSED_RESULT;
 
 int starpu_task_wait_for_all(void);

+ 6 - 0
include/starpu_thread.h

@@ -200,6 +200,11 @@ int starpu_pthread_rwlock_unlock(starpu_pthread_rwlock_t *rwlock);
 
 #if defined(STARPU_SIMGRID) || !defined(STARPU_HAVE_PTHREAD_BARRIER)
 
+#if defined(STARPU_SIMGRID) && defined(STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT)
+typedef xbt_bar_t starpu_pthread_barrier_t;
+typedef int starpu_pthread_barrierattr_t;
+#define STARPU_PTHREAD_BARRIER_SERIAL_THREAD XBT_BARRIER_SERIAL_PROCESS
+#else
 typedef struct {
 	starpu_pthread_mutex_t mutex;
 	starpu_pthread_cond_t cond;
@@ -208,6 +213,7 @@ typedef struct {
 } starpu_pthread_barrier_t;
 typedef int starpu_pthread_barrierattr_t;
 #define STARPU_PTHREAD_BARRIER_SERIAL_THREAD -1
+#endif
 
 int starpu_pthread_barrier_init(starpu_pthread_barrier_t *barrier, const starpu_pthread_barrierattr_t *attr, unsigned count);
 int starpu_pthread_barrier_destroy(starpu_pthread_barrier_t *barrier);

+ 7 - 0
include/starpu_worker.h

@@ -57,10 +57,15 @@ struct starpu_worker_collection
 {
 	void *workerids;
 	unsigned nworkers;
+	void *masters;
+	unsigned nmasters;
 	int present[STARPU_NMAXWORKERS];
+	int is_master[STARPU_NMAXWORKERS];
 	enum starpu_worker_collection_type type;
 	unsigned (*has_next)(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it);
 	int (*get_next)(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it);
+	unsigned (*has_next_master)(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it);
+	int (*get_next_master)(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it);
 	int (*add)(struct starpu_worker_collection *workers, int worker);
 	int (*remove)(struct starpu_worker_collection *workers, int worker);
 	void (*init)(struct starpu_worker_collection *workers);
@@ -109,6 +114,8 @@ int starpu_worker_get_mp_nodeid(int id);
 struct starpu_tree* starpu_workers_get_tree(void);
 
 unsigned starpu_worker_get_sched_ctx_list(int worker, unsigned **sched_ctx);
+
+unsigned starpu_worker_is_slave(int workerid);
 #ifdef __cplusplus
 }
 #endif

+ 3 - 1
mpi/include/starpu_mpi.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2009-2012, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -71,6 +71,8 @@ void starpu_mpi_comm_amounts_retrieve(size_t *comm_amounts);
 void starpu_mpi_cache_flush(MPI_Comm comm, starpu_data_handle_t data_handle);
 void starpu_mpi_cache_flush_all_data(MPI_Comm comm);
 
+int starpu_mpi_world_rank(void);
+
 int starpu_mpi_get_communication_tag(void);
 void starpu_mpi_set_communication_tag(int tag);
 

+ 61 - 45
mpi/src/starpu_mpi.c

@@ -30,7 +30,7 @@
 #include <datawizard/coherency.h>
 
 static void _starpu_mpi_add_sync_point_in_fxt(void);
-static void _starpu_mpi_submit_new_mpi_request(void *arg);
+static void _starpu_mpi_submit_ready_request(void *arg);
 static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req);
 #ifdef STARPU_VERBOSE
 static char *_starpu_mpi_request_type(enum _starpu_mpi_request_type request_type);
@@ -46,8 +46,8 @@ static struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t dat
 							ssize_t count);
 static void _starpu_mpi_handle_detached_request(struct _starpu_mpi_req *req);
 
-/* The list of requests that have been newly submitted by the application */
-static struct _starpu_mpi_req_list *new_requests;
+/* The list of ready requests */
+static struct _starpu_mpi_req_list *ready_requests;
 
 /* The list of detached requests that have already been submitted to MPI */
 static struct _starpu_mpi_req_list *detached_requests;
@@ -61,7 +61,7 @@ static starpu_pthread_mutex_t mutex;
 static starpu_pthread_t progress_thread;
 static int running = 0;
 
-/* Count requests posted by the application and not yet submitted to MPI, i.e pushed into the new_requests list */
+/* Count requests posted by the application and not yet submitted to MPI */
 static starpu_pthread_mutex_t mutex_posted_requests;
 static int posted_requests = 0, newer_requests, barrier_running = 0;
 
@@ -151,9 +151,9 @@ static void _starpu_mpi_request_init(struct _starpu_mpi_req **req)
 	req->count = count;
 
 	/* Asynchronously request StarPU to fetch the data in main memory: when
-	 * it is available in main memory, _starpu_mpi_submit_new_mpi_request(req) is called and
+	 * it is available in main memory, _starpu_mpi_submit_ready_request(req) is called and
 	 * the request is actually submitted */
-	starpu_data_acquire_cb_sequential_consistency(data_handle, mode, _starpu_mpi_submit_new_mpi_request, (void *)req, sequential_consistency);
+	starpu_data_acquire_cb_sequential_consistency(data_handle, mode, _starpu_mpi_submit_ready_request, (void *)req, sequential_consistency);
 
 	_STARPU_MPI_LOG_OUT();
 	return req;
@@ -447,7 +447,7 @@ int starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status)
 	waiting_req->func = _starpu_mpi_wait_func;
 	waiting_req->request_type = WAIT_REQ;
 
-	_starpu_mpi_submit_new_mpi_request(waiting_req);
+	_starpu_mpi_submit_ready_request(waiting_req);
 
 	/* We wait for the MPI request to finish */
 	STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
@@ -532,7 +532,7 @@ int starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 		testing_req->request_type = TEST_REQ;
 
 		_STARPU_MPI_INC_POSTED_REQUESTS(1);
-		_starpu_mpi_submit_new_mpi_request(testing_req);
+		_starpu_mpi_submit_ready_request(testing_req);
 
 		/* We wait for the test request to finish */
 		STARPU_PTHREAD_MUTEX_LOCK(&(testing_req->req_mutex));
@@ -619,7 +619,7 @@ int starpu_mpi_barrier(MPI_Comm comm)
 	barrier_req->comm = comm;
 
 	_STARPU_MPI_INC_POSTED_REQUESTS(1);
-	_starpu_mpi_submit_new_mpi_request(barrier_req);
+	_starpu_mpi_submit_ready_request(barrier_req);
 
 	/* We wait for the MPI request to finish */
 	STARPU_PTHREAD_MUTEX_LOCK(&barrier_req->req_mutex);
@@ -785,24 +785,25 @@ static void _starpu_mpi_early_data_cb(void* arg)
 	free(args);
 }
 
-static void _starpu_mpi_submit_new_mpi_request(void *arg)
+static void _starpu_mpi_submit_ready_request(void *arg)
 {
 	_STARPU_MPI_LOG_IN();
 	struct _starpu_mpi_req *req = arg;
 
 	_STARPU_MPI_INC_POSTED_REQUESTS(-1);
 
-	_STARPU_MPI_DEBUG(3, "calling _starpu_mpi_submit_new_mpi_request with req %p srcdst %d tag %d and type %s\n", req, req->srcdst, req->mpi_tag, _starpu_mpi_request_type(req->request_type));
+	_STARPU_MPI_DEBUG(3, "new req %p srcdst %d tag %d and type %s\n", req, req->srcdst, req->mpi_tag, _starpu_mpi_request_type(req->request_type));
 
 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 
 	if (req->request_type == RECV_REQ)
 	{
-		/* Case : the request is the internal receive request submitted by StarPU-MPI to receive
-		 * incoming data without a matching pending receive already submitted by the application.
-		 * We immediately allocate the pointer associated to the data_handle, and pushing it into
-		 * the list of new_requests, so as the real MPI request can be submitted before the next
-		 * submission of the envelope-catching request. */
+		/* Case : the request is the internal receive request submitted
+		 * by StarPU-MPI to receive incoming data without a matching
+		 * early_request from the application. We immediately allocate the
+		 * pointer associated to the data_handle, and push it into the
+		 * ready_requests list, so as the real MPI request can be submitted
+		 * before the next submission of the envelope-catching request. */
 		if (req->is_internal_req)
 		{
 			_starpu_mpi_handle_allocate_datatype(req->data_handle, &req->datatype, &req->user_datatype);
@@ -818,10 +819,12 @@ static void _starpu_mpi_submit_new_mpi_request(void *arg)
 				STARPU_ASSERT_MSG(req->ptr, "cannot allocate message of size %ld\n", req->count);
 			}
 
-			_STARPU_MPI_DEBUG(3, "Pushing internal starpu_mpi_irecv request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n", req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr, _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
-			_starpu_mpi_req_list_push_front(new_requests, req);
+			_STARPU_MPI_DEBUG(3, "Pushing internal starpu_mpi_irecv request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n",
+					  req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr,
+					  _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
+			_starpu_mpi_req_list_push_front(ready_requests, req);
 
-			/* inform the starpu mpi thread that the request has beenbe pushed in the new_requests list */
+			/* inform the starpu mpi thread that the request has been pushed in the ready_requests list */
 			STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 			STARPU_PTHREAD_MUTEX_LOCK(&req->posted_mutex);
 			req->posted = 1;
@@ -834,10 +837,10 @@ static void _starpu_mpi_submit_new_mpi_request(void *arg)
 			/* test whether the receive request has already been submitted internally by StarPU-MPI*/
 			struct _starpu_mpi_early_data_handle *early_data_handle = _starpu_mpi_early_data_find(req->mpi_tag, req->srcdst);
 
-			/* Case : the request has already been submitted internally by StarPU.
-			 * We'll asynchronously ask a Read permission over the temporary handle, so as when
-			 * the internal receive will be over, the _starpu_mpi_early_data_cb function will be called to
-			 * bring the data back to the original data handle associated to the request.*/
+			/* Case: a receive request for a data with the given tag and source has already been
+			 * posted by StarPU. Asynchronously request a Read permission over the temporary handle,
+			 * so as when the internal receive is completed, the _starpu_mpi_early_data_cb function
+			 * will be called to bring the data back to the original data handle associated to the request.*/
 			if (early_data_handle)
 			{
 				STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
@@ -861,8 +864,7 @@ static void _starpu_mpi_submit_new_mpi_request(void *arg)
 				_STARPU_MPI_DEBUG(3, "Calling data_acquire_cb on starpu_mpi_copy_cb..\n");
 				starpu_data_acquire_cb(early_data_handle->handle,STARPU_R,_starpu_mpi_early_data_cb,(void*) cb_args);
 			}
-			/* Case : a classic receive request with no send received earlier than expected.
-			 * We just add the pending receive request to the requests' hashmap. */
+			/* Case: no matching data has been received. Store the receive request as an early_request. */
 			else
 			{
 				_STARPU_MPI_DEBUG(3, "Adding the pending receive request %p (srcdst %d tag %d) into the request hashmap\n", req, req->srcdst, req->mpi_tag);
@@ -872,7 +874,7 @@ static void _starpu_mpi_submit_new_mpi_request(void *arg)
 	}
 	else
 	{
-		_starpu_mpi_req_list_push_front(new_requests, req);
+		_starpu_mpi_req_list_push_front(ready_requests, req);
 		_STARPU_MPI_DEBUG(3, "Pushing new request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n",
 				  req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr, _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
 	}
@@ -986,7 +988,7 @@ static void _starpu_mpi_handle_detached_request(struct _starpu_mpi_req *req)
 	}
 }
 
-static void _starpu_mpi_handle_new_request(struct _starpu_mpi_req *req)
+static void _starpu_mpi_handle_ready_request(struct _starpu_mpi_req *req)
 {
 	_STARPU_MPI_LOG_IN();
 	STARPU_ASSERT_MSG(req, "Invalid request");
@@ -1080,10 +1082,10 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
  	int header_req_submitted = 0;
 
-	while (running || posted_requests || !(_starpu_mpi_req_list_empty(new_requests)) || !(_starpu_mpi_req_list_empty(detached_requests)))
+	while (running || posted_requests || !(_starpu_mpi_req_list_empty(ready_requests)) || !(_starpu_mpi_req_list_empty(detached_requests)))
 	{
 		/* shall we block ? */
-		unsigned block = _starpu_mpi_req_list_empty(new_requests) && _starpu_mpi_early_request_count() == 0;
+		unsigned block = _starpu_mpi_req_list_empty(ready_requests) && _starpu_mpi_early_request_count() == 0;
 
 #ifndef STARPU_MPI_ACTIVITY
 		STARPU_PTHREAD_MUTEX_LOCK(&detached_requests_mutex);
@@ -1107,21 +1109,22 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
 		/* get one request */
 		struct _starpu_mpi_req *req;
-		while (!_starpu_mpi_req_list_empty(new_requests))
+		while (!_starpu_mpi_req_list_empty(ready_requests))
 		{
-			req = _starpu_mpi_req_list_pop_back(new_requests);
+			req = _starpu_mpi_req_list_pop_back(ready_requests);
 
 			/* handling a request is likely to block for a while
 			 * (on a sync_data_with_mem call), we want to let the
 			 * application submit requests in the meantime, so we
 			 * release the lock. */
 			STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
-			_starpu_mpi_handle_new_request(req);
+			_starpu_mpi_handle_ready_request(req);
 			STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 		}
 
-		/* If there is no currently submitted header_req submitted to catch envelopes from senders, and there is some pending receive
-		 * requests in our side, we resubmit a header request. */
+		/* If there is no header_req currently submitted to catch
+                 * envelopes from senders, and there are some pending
+                 * receive requests on our side, we resubmit a header request. */
 		MPI_Request header_req;
 		if ((_starpu_mpi_early_request_count() > 0) && (header_req_submitted == 0))// && (HASH_COUNT(_starpu_mpi_early_data_handle_hashmap) == 0))
 		{
@@ -1151,11 +1154,14 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
 				struct _starpu_mpi_req *found_req = _starpu_mpi_early_request_find(recv_env->mpi_tag, status.MPI_SOURCE);
 
-				/* Case : a data will arrive before the matching receive has been submitted in our side of the application.
-				 * We will allow a temporary handle to store the incoming data, by submitting a starpu_mpi_irecv_detached
-				 * on this handle, and register this so as the StarPU-MPI layer can remember it.*/
+				/* Case: a data will arrive before a matching receive is
+				 * posted by the application. Create a temporary handle to
+				 * store the incoming data, submit a starpu_mpi_irecv_detached
+				 * on this handle, and store it as an early_data
+				 */
 				if (!found_req)
 				{
+
 					_STARPU_MPI_DEBUG(3, "Request with tag %d and source %d not found, creating a early_handle to receive incoming data..\n", recv_env->mpi_tag, status.MPI_SOURCE);
 
 					starpu_data_handle_t data_handle = NULL;
@@ -1198,8 +1204,8 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 					STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 
 					// We wait until the request is pushed in the
-					// new_request list, that ensures that the next loop
-					// will call _starpu_mpi_handle_new_request
+					// ready_request list, that ensures that the next loop
+					// will call _starpu_mpi_handle_ready_request
 					// on the request and post the corresponding mpi_irecv,
 					// otherwise, it may lead to read data as envelop
 					STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
@@ -1214,8 +1220,11 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 					STARPU_PTHREAD_MUTEX_UNLOCK(&early_data_handle->req_mutex);
 					STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 				}
-				/* Case : a matching receive has been found for the incoming data, we handle the correct allocation of the pointer associated to
-				 * the data handle, then submit the corresponding receive with _starpu_mpi_handle_new_request. */
+				/* Case: a matching application request has been found for
+				 * the incoming data, we handle the correct allocation
+				 * of the pointer associated to the data handle, then
+				 * submit the corresponding receive with
+				 * _starpu_mpi_handle_ready_request. */
 				else
 				{
 					_STARPU_MPI_DEBUG(3, "A matching receive has been found for the incoming data with tag %d\n", recv_env->mpi_tag);
@@ -1242,7 +1251,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 					 * application submit requests in the meantime, so we
 					 * release the lock. */
 					STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
-					_starpu_mpi_handle_new_request(found_req);
+					_starpu_mpi_handle_ready_request(found_req);
 					STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 				}
 				header_req_submitted = 0;
@@ -1255,7 +1264,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 	}
 
 	STARPU_ASSERT_MSG(_starpu_mpi_req_list_empty(detached_requests), "List of detached requests not empty");
-	STARPU_ASSERT_MSG(_starpu_mpi_req_list_empty(new_requests), "List of new requests not empty");
+	STARPU_ASSERT_MSG(_starpu_mpi_req_list_empty(ready_requests), "List of ready requests not empty");
 	STARPU_ASSERT_MSG(posted_requests == 0, "Number of posted request is not zero");
 	_starpu_mpi_early_request_check_termination();
 	_starpu_mpi_early_data_check_termination();
@@ -1326,7 +1335,7 @@ int _starpu_mpi_initialize(int *argc, char ***argv, int initialize_mpi)
 	STARPU_PTHREAD_MUTEX_INIT(&mutex, NULL);
 	STARPU_PTHREAD_COND_INIT(&cond_progression, NULL);
 	STARPU_PTHREAD_COND_INIT(&cond_finished, NULL);
-	new_requests = _starpu_mpi_req_list_new();
+	ready_requests = _starpu_mpi_req_list_new();
 
 	STARPU_PTHREAD_MUTEX_INIT(&detached_requests_mutex, NULL);
 	detached_requests = _starpu_mpi_req_list_new();
@@ -1402,7 +1411,7 @@ int starpu_mpi_shutdown(void)
 
 	/* free the request queues */
 	_starpu_mpi_req_list_delete(detached_requests);
-	_starpu_mpi_req_list_delete(new_requests);
+	_starpu_mpi_req_list_delete(ready_requests);
 
 	_starpu_mpi_comm_amounts_display(rank);
 	_starpu_mpi_comm_amounts_free();
@@ -1423,3 +1432,10 @@ void starpu_mpi_data_register(starpu_data_handle_t data_handle, int tag, int ran
 	_starpu_data_set_unregister_hook(data_handle, _starpu_mpi_clear_cache);
 
 }
+
+int starpu_mpi_world_rank(void)
+{
+	int rank;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	return rank;
+}

+ 1 - 0
src/Makefile.am

@@ -181,6 +181,7 @@ libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES = 		\
 	sched_policies/eager_central_policy.c			\
 	sched_policies/eager_central_priority_policy.c		\
 	sched_policies/work_stealing_policy.c			\
+	sched_policies/locality_work_stealing_policy.c		\
 	sched_policies/deque_modeling_policy_data_aware.c	\
 	sched_policies/random_policy.c				\
 	sched_policies/stack_queues.c				\

+ 42 - 4
src/common/fxt.h

@@ -106,6 +106,9 @@
 #define _STARPU_FUT_TASK_WAIT_FOR_ALL	0x513b
 
 #define _STARPU_FUT_EVENT	0x513c
+#define _STARPU_FUT_THREAD_EVENT	0x513d
+
+#define	_STARPU_FUT_CODELET_DETAILS	0x513e
 
 #define _STARPU_FUT_LOCKING_MUTEX	0x5140	
 #define _STARPU_FUT_MUTEX_LOCKED	0x5141	
@@ -193,6 +196,31 @@ void _starpu_fxt_register_thread(unsigned);
 #define _STARPU_FUT_COMMIT(size) do { } while (0)
 #endif
 
+#ifdef FUT_DO_PROBE1STR
+#define _STARPU_FUT_DO_PROBE1STR(CODE, P1, str) FUT_DO_PROBE1STR(CODE, P1, str)
+#else
+/* Sometimes we need something a little more specific than the wrappers from
+ * FxT: these macros make it possible to add an event with a few numbers
+ * followed by a string. */
+#define _STARPU_FUT_DO_PROBE1STR(CODE, P1, str)			\
+do {									\
+    if(fut_active) {							\
+	/* No more than FXT_MAX_PARAMS args are allowed */		\
+	/* we add a \0 just in case ... */				\
+	size_t len = STARPU_MIN(strlen(str)+1, (FXT_MAX_PARAMS - 1)*sizeof(unsigned long));\
+	unsigned nbargs_str = (len + sizeof(unsigned long) - 1)/(sizeof(unsigned long));\
+	unsigned nbargs = 1 + nbargs_str;				\
+	size_t total_len = FUT_SIZE(nbargs);				\
+	unsigned long *futargs =					\
+		fut_getstampedbuffer(FUT_CODE(CODE, nbargs), total_len);\
+	*(futargs++) = (unsigned long)(P1);				\
+	snprintf((char *)futargs, len, "%s", str);			\
+	((char *)futargs)[len - 1] = '\0';				\
+	_STARPU_FUT_COMMIT(total_len);					\
+    }									\
+} while (0);
+#endif
+
 #ifdef FUT_DO_PROBE2STR
 #define _STARPU_FUT_DO_PROBE2STR(CODE, P1, P2, str) FUT_DO_PROBE2STR(CODE, P1, P2, str)
 #else
@@ -297,7 +325,7 @@ do {									\
 #ifdef FUT_DO_PROBE6STR
 #define _STARPU_FUT_DO_PROBE6STR(CODE, P1, P2, P3, P4, P5, P6, str) FUT_DO_PROBE6STR(CODE, P1, P2, P3, P4, P5, P6, str)
 #else
-#define _STARPU_FUT_DO_PROBE5STR(CODE, P1, P2, P3, P4, P5, P6, str)	\
+#define _STARPU_FUT_DO_PROBE6STR(CODE, P1, P2, P3, P4, P5, P6, str)	\
 do {									\
     if(fut_active) {							\
 	/* No more than FXT_MAX_PARAMS args are allowed */		\
@@ -324,7 +352,7 @@ do {									\
 #ifdef FUT_DO_PROBE7STR
 #define _STARPU_FUT_DO_PROBE7STR(CODE, P1, P2, P3, P4, P5, P6, P7, str) FUT_DO_PROBE7STR(CODE, P1, P2, P3, P4, P5, P6, P7, str)
 #else
-#define _STARPU_FUT_DO_PROBE6STR(CODE, P1, P2, P3, P4, P5, P6, P7, str)	\
+#define _STARPU_FUT_DO_PROBE7STR(CODE, P1, P2, P3, P4, P5, P6, P7, str)	\
 do {									\
     if(fut_active) {							\
 	/* No more than FXT_MAX_PARAMS args are allowed */		\
@@ -378,7 +406,7 @@ do {									\
 #define _STARPU_TRACE_WORKER_INIT_END(workerid)				\
 	FUT_DO_PROBE2(_STARPU_FUT_WORKER_INIT_END, _starpu_gettid(), (workerid));
 
-#define _STARPU_TRACE_START_CODELET_BODY(job)				\
+#define _STARPU_TRACE_START_CODELET_BODY(job, nimpl, archtype)				\
 do {									\
         const char *model_name = _starpu_job_get_model_name((job));         \
 	if (model_name)                                                 \
@@ -389,6 +417,11 @@ do {									\
 	else {                                                          \
 		FUT_DO_PROBE4(_STARPU_FUT_START_CODELET_BODY, (job), ((job)->task)->sched_ctx, _starpu_gettid(), 0); \
 	}								\
+	{								\
+		const size_t __job_size = _starpu_job_get_data_size((job)->task->cl?(job)->task->cl->model:NULL, archtype, nimpl, (job));	\
+		const uint32_t __job_hash = _starpu_compute_buffers_footprint((job)->task->cl?(job)->task->cl->model:NULL, archtype, nimpl, (job));\
+		FUT_DO_PROBE6(_STARPU_FUT_CODELET_DETAILS, (job), ((job)->task)->sched_ctx, __job_size, __job_hash, (job)->task->tag_id, _starpu_gettid());	\
+	}								\
 } while(0);
 
 #define _STARPU_TRACE_END_CODELET_BODY(job, nimpl, archtype)			\
@@ -563,6 +596,9 @@ do {										\
 #define _STARPU_TRACE_EVENT(S)			\
 	FUT_DO_PROBESTR(_STARPU_FUT_EVENT,S)
 
+#define _STARPU_TRACE_THREAD_EVENT(S)			\
+	_STARPU_FUT_DO_PROBE1STR(_STARPU_FUT_THREAD_EVENT, _starpu_gettid(), S)
+
 #define _STARPU_TRACE_HYPERVISOR_BEGIN()  \
 	FUT_DO_PROBE1(_STARPU_FUT_HYPERVISOR_BEGIN, _starpu_gettid());
 
@@ -746,7 +782,7 @@ do {										\
 #define _STARPU_TRACE_NEW_MEM_NODE(nodeid)	do {} while(0)
 #define _STARPU_TRACE_WORKER_INIT_START(a,b,c)	do {} while(0)
 #define _STARPU_TRACE_WORKER_INIT_END(workerid)	do {} while(0)
-#define _STARPU_TRACE_START_CODELET_BODY(job)	do {} while(0)
+#define _STARPU_TRACE_START_CODELET_BODY(job, nimpl, archtype)	do {} while(0)
 #define _STARPU_TRACE_END_CODELET_BODY(job, nimpl, a)	do {} while(0)
 #define _STARPU_TRACE_START_CALLBACK(job)	do {} while(0)
 #define _STARPU_TRACE_END_CALLBACK(job)		do {} while(0)
@@ -794,6 +830,8 @@ do {										\
 #define _STARPU_TRACE_USER_EVENT(code)		do {} while(0)
 #define _STARPU_TRACE_SET_PROFILING(status)	do {} while(0)
 #define _STARPU_TRACE_TASK_WAIT_FOR_ALL		do {} while(0)
+#define _STARPU_TRACE_EVENT(S)		do {} while(0)
+#define _STARPU_TRACE_THREAD_EVENT(S)		do {} while(0)
 #define _STARPU_TRACE_LOCKING_MUTEX()			do {} while(0)
 #define _STARPU_TRACE_MUTEX_LOCKED()			do {} while(0)
 #define _STARPU_TRACE_UNLOCKING_MUTEX()		do {} while(0)

+ 27 - 1
src/common/thread.c

@@ -288,9 +288,35 @@ int starpu_pthread_rwlock_unlock(starpu_pthread_rwlock_t *rwlock)
 
 	return p_ret;
 }
+
+#if defined(STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT)
+int starpu_pthread_barrier_init(starpu_pthread_barrier_t *restrict barrier, const starpu_pthread_barrierattr_t *restrict attr, unsigned count)
+{
+	*barrier = xbt_barrier_init(count);
+	return 0;
+}
+
+int starpu_pthread_barrier_destroy(starpu_pthread_barrier_t *barrier)
+{
+	if (*barrier)
+		xbt_barrier_destroy(*barrier);
+	return 0;
+}
+
+int starpu_pthread_barrier_wait(starpu_pthread_barrier_t *barrier)
+{
+	_STARPU_TRACE_BARRIER_WAIT_BEGIN();
+
+	xbt_barrier_wait(*barrier);
+
+	_STARPU_TRACE_BARRIER_WAIT_END();
+	return 0;
+}
+#endif /* defined(STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT) */
+
 #endif /* STARPU_SIMGRID */
 
-#if defined(STARPU_SIMGRID) || !defined(STARPU_HAVE_PTHREAD_BARRIER)
+#if (defined(STARPU_SIMGRID) && !defined(STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT)) || !defined(STARPU_HAVE_PTHREAD_BARRIER)
 int starpu_pthread_barrier_init(starpu_pthread_barrier_t *restrict barrier, const starpu_pthread_barrierattr_t *restrict attr, unsigned count)
 {
 	int ret = starpu_pthread_mutex_init(&barrier->mutex, NULL);

+ 9 - 0
src/core/jobs.c

@@ -116,6 +116,15 @@ void _starpu_job_destroy(struct _starpu_job *j)
 	_starpu_job_delete(j);
 }
 
+int _starpu_job_finished(struct _starpu_job *j)
+{
+	int ret;
+	STARPU_PTHREAD_MUTEX_LOCK(&j->sync_mutex);
+	ret = j->terminated == 2;
+	STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
+	return ret;
+}
+
 void _starpu_wait_job(struct _starpu_job *j)
 {
 	STARPU_ASSERT(j->task);

+ 3 - 0
src/core/jobs.h

@@ -182,6 +182,9 @@ struct _starpu_job* STARPU_ATTRIBUTE_MALLOC _starpu_job_create(struct starpu_tas
 /* Destroy the data structure associated to the job structure */
 void _starpu_job_destroy(struct _starpu_job *j);
 
+/* Test for the termination of the job */
+int _starpu_job_finished(struct _starpu_job *j);
+
 /* Wait for the termination of the job */
 void _starpu_wait_job(struct _starpu_job *j);
 

+ 66 - 2
src/core/sched_ctx.c

@@ -60,7 +60,11 @@ void _starpu_worker_gets_out_of_ctx(unsigned sched_ctx_id, struct _starpu_worker
 	{
 		struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 		if(sched_ctx && sched_ctx->sched_policy && sched_ctx->sched_policy->remove_workers)
+		{
+			_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
 			sched_ctx->sched_policy->remove_workers(sched_ctx_id, &worker->workerid, 1);
+			_STARPU_TRACE_WORKER_SCHEDULING_POP;
+		}
 		_starpu_sched_ctx_list_remove(&worker->sched_ctx_list, sched_ctx_id);
 		worker->nsched_ctxs--;
 	}
@@ -185,6 +189,7 @@ static void _starpu_add_workers_to_sched_ctx(struct _starpu_sched_ctx *sched_ctx
 	}
 	else if(sched_ctx->sched_policy->add_workers)
 	{
+		_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
 		if(added_workers)
 		{
 			if(*n_added_workers > 0)
@@ -192,6 +197,7 @@ static void _starpu_add_workers_to_sched_ctx(struct _starpu_sched_ctx *sched_ctx
 		}
 		else
 			sched_ctx->sched_policy->add_workers(sched_ctx->id, workers_to_add, nworkers_to_add);
+		_STARPU_TRACE_WORKER_SCHEDULING_POP;
 	}
 	return;
 }
@@ -229,7 +235,11 @@ static void _starpu_sched_ctx_free_scheduling_data(struct _starpu_sched_ctx *sch
 	unsigned nworkers_ctx = starpu_sched_ctx_get_workers_list(sched_ctx->id, &workerids);
 
 	if(nworkers_ctx > 0 && sched_ctx->sched_policy->remove_workers)
+	{
+		_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
 		sched_ctx->sched_policy->remove_workers(sched_ctx->id, workerids, nworkers_ctx);
+		_STARPU_TRACE_WORKER_SCHEDULING_POP;
+	}
 
 	free(workerids);
 	return;
@@ -523,6 +533,7 @@ unsigned starpu_sched_ctx_create(int *workerids, int nworkers, const char *sched
 	int max_prio = 0;
 	struct starpu_sched_policy *sched_policy = NULL;
 	unsigned hierarchy_level = 0;
+	unsigned nesting_sched_ctx = STARPU_NMAX_SCHED_CTXS;
 
 	va_start(varg_list, sched_ctx_name);
 	while ((arg_type = va_arg(varg_list, int)) != 0)
@@ -551,6 +562,10 @@ unsigned starpu_sched_ctx_create(int *workerids, int nworkers, const char *sched
 		{
 			hierarchy_level = va_arg(varg_list, unsigned);
 		}
+		else if (arg_type == STARPU_SCHED_CTX_NESTED)
+		{
+			nesting_sched_ctx = va_arg(varg_list, unsigned);
+		}
 		else
 		{
 			STARPU_ABORT_MSG("Unrecognized argument %d\n", arg_type);
@@ -562,6 +577,7 @@ unsigned starpu_sched_ctx_create(int *workerids, int nworkers, const char *sched
 	struct _starpu_sched_ctx *sched_ctx = NULL;
 	sched_ctx = _starpu_create_sched_ctx(sched_policy, workerids, nworkers, 0, sched_ctx_name, min_prio_set, min_prio, max_prio_set, max_prio);
 	sched_ctx->hierarchy_level = hierarchy_level;
+	sched_ctx->nesting_sched_ctx = nesting_sched_ctx;
 
 	_starpu_unlock_mutex_if_prev_locked();
 	int *added_workerids;
@@ -1132,6 +1148,8 @@ struct starpu_worker_collection* starpu_sched_ctx_create_worker_collection(unsig
 	case STARPU_WORKER_TREE:
 		sched_ctx->workers->has_next = worker_tree.has_next;
 		sched_ctx->workers->get_next = worker_tree.get_next;
+		sched_ctx->workers->has_next_master = worker_tree.has_next_master;
+		sched_ctx->workers->get_next_master = worker_tree.get_next_master;
 		sched_ctx->workers->add = worker_tree.add;
 		sched_ctx->workers->remove = worker_tree.remove;
 		sched_ctx->workers->init = worker_tree.init;
@@ -1144,6 +1162,8 @@ struct starpu_worker_collection* starpu_sched_ctx_create_worker_collection(unsig
 	default:
 		sched_ctx->workers->has_next = worker_list.has_next;
 		sched_ctx->workers->get_next = worker_list.get_next;
+		sched_ctx->workers->has_next_master = worker_list.has_next_master;
+		sched_ctx->workers->get_next_master = worker_list.get_next_master;
 		sched_ctx->workers->add = worker_list.add;
 		sched_ctx->workers->remove = worker_list.remove;
 		sched_ctx->workers->init = worker_list.init;
@@ -1171,6 +1191,7 @@ void starpu_sched_ctx_display_workers(unsigned sched_ctx_id, FILE *f)
 		starpu_worker_get_name(workerids[i], name, 256);
 		fprintf(f, "\t\t%s\n", name);
 	}
+	free(workerids);
 }
 
 unsigned starpu_sched_ctx_get_workers_list(unsigned sched_ctx_id, int **workerids)
@@ -1605,6 +1626,44 @@ void starpu_sched_ctx_bind_current_thread_to_cpuid(unsigned cpuid STARPU_ATTRIBU
 
 }
 
+unsigned starpu_sched_ctx_worker_is_master_for_child_ctx(int workerid, unsigned sched_ctx_id)
+{
+	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
+	struct _starpu_sched_ctx_list *l = NULL;
+	struct _starpu_sched_ctx *sched_ctx = NULL;
+	for (l = worker->sched_ctx_list; l; l = l->next)
+	{ 
+		 sched_ctx = _starpu_get_sched_ctx_struct(l->sched_ctx);
+		if(sched_ctx-> main_master == workerid && sched_ctx->nesting_sched_ctx == sched_ctx_id)
+			return sched_ctx->id;
+	}
+	return STARPU_NMAX_SCHED_CTXS;
+
+}
+
+void starpu_sched_ctx_revert_task_counters(unsigned sched_ctx_id, double flops)
+{
+        _starpu_decrement_nsubmitted_tasks_of_sched_ctx(sched_ctx_id);
+        _starpu_decrement_nready_tasks_of_sched_ctx(sched_ctx_id, flops);
+}
+
+void starpu_sched_ctx_move_task_to_ctx(struct starpu_task *task, unsigned sched_ctx)
+{
+	int workerid = starpu_worker_get_id();
+	struct _starpu_worker *worker  = NULL;
+	if(workerid != -1)
+	{
+		worker = _starpu_get_worker_struct(workerid);
+		STARPU_PTHREAD_MUTEX_UNLOCK(&worker->sched_mutex);
+	}
+
+	task->sched_ctx = sched_ctx;
+	_starpu_task_submit_nodeps(task);
+
+	if(workerid != -1)
+		STARPU_PTHREAD_MUTEX_LOCK(&worker->sched_mutex);
+}
+
 static unsigned _worker_sleeping_in_other_ctx(unsigned sched_ctx_id, int workerid)
 {
 	int s;
@@ -1620,6 +1679,7 @@ static unsigned _worker_sleeping_in_other_ctx(unsigned sched_ctx_id, int workeri
 	return 0;
 
 }
+
 static void _starpu_sched_ctx_get_workers_to_sleep(unsigned sched_ctx_id, int *workerids, int nworkers, int master)
 {
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
@@ -1643,7 +1703,6 @@ static void _starpu_sched_ctx_get_workers_to_sleep(unsigned sched_ctx_id, int *w
 		workerid = workerids[w];
 		if((current_worker_id == -1 || workerid != current_worker_id) && !sleeping[w])
 		{
-			sched_ctx->sleeping[workerids[w]] = 1;
 			sem_wait(&sched_ctx->fall_asleep_sem[master]);
 		}
 	}
@@ -1652,7 +1711,10 @@ static void _starpu_sched_ctx_get_workers_to_sleep(unsigned sched_ctx_id, int *w
 
 void _starpu_sched_ctx_signal_worker_blocked(unsigned sched_ctx_id, int workerid)
 {
+	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
+	worker->slave = 1;
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	sched_ctx->sleeping[workerid] = 1;
 	int master = sched_ctx->master[workerid];
 	sem_post(&sched_ctx->fall_asleep_sem[master]);
 
@@ -1666,6 +1728,9 @@ void _starpu_sched_ctx_signal_worker_woke_up(unsigned sched_ctx_id, int workerid
 	sem_post(&sched_ctx->wake_up_sem[master]);
 	sched_ctx->sleeping[workerid] = 0;
 	sched_ctx->master[workerid] = -1;
+	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
+	worker->slave = 0;
+
 	return;
 }
 
@@ -1720,7 +1785,6 @@ void starpu_sched_ctx_get_available_cpuids(unsigned sched_ctx_id, int **cpuids,
 	int current_worker_id = starpu_worker_get_id();
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 	struct starpu_worker_collection *workers = sched_ctx->workers;
-
 	(*cpuids) = (int*)malloc(workers->nworkers*sizeof(int));
 	int w = 0;
 

+ 3 - 0
src/core/sched_ctx.h

@@ -147,6 +147,9 @@ struct _starpu_sched_ctx
 	/* bool indicating if the workers is sleeping in this ctx */
 	unsigned sleeping[STARPU_NMAXWORKERS];
 
+	/* ctx nesting the current ctx */
+	unsigned nesting_sched_ctx;
+
 };
 
 struct _starpu_machine_config;

+ 26 - 2
src/core/sched_policy.c

@@ -38,6 +38,7 @@ static struct starpu_sched_policy *predefined_policies[] =
 	&_starpu_sched_eager_policy,
 	&_starpu_sched_prio_policy,
 	&_starpu_sched_random_policy,
+	&_starpu_sched_lws_policy,
 	&_starpu_sched_ws_policy,
 	&_starpu_sched_dm_policy,
 	&_starpu_sched_dmda_policy,
@@ -174,14 +175,20 @@ void _starpu_init_sched_policy(struct _starpu_machine_config *config, struct _st
 
 	load_sched_policy(selected_policy, sched_ctx);
 
+	_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
 	sched_ctx->sched_policy->init_sched(sched_ctx->id);
+	_STARPU_TRACE_WORKER_SCHEDULING_POP;
 }
 
 void _starpu_deinit_sched_policy(struct _starpu_sched_ctx *sched_ctx)
 {
 	struct starpu_sched_policy *policy = sched_ctx->sched_policy;
 	if (policy->deinit_sched)
+	{
+		_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
 		policy->deinit_sched(sched_ctx->id);
+		_STARPU_TRACE_WORKER_SCHEDULING_POP;
+	}
 }
 
 static void _starpu_push_task_on_specific_worker_notify_sched(struct starpu_task *task, struct _starpu_worker *worker, int workerid, int perf_workerid)
@@ -193,7 +200,11 @@ static void _starpu_push_task_on_specific_worker_notify_sched(struct starpu_task
         {
 		sched_ctx = _starpu_get_sched_ctx_struct(l->sched_ctx);
 		if (sched_ctx->sched_policy != NULL && sched_ctx->sched_policy->push_task_notify)
+		{
+			_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
 			sched_ctx->sched_policy->push_task_notify(task, workerid, perf_workerid, sched_ctx->id);
+			_STARPU_TRACE_WORKER_SCHEDULING_POP;
+		}
 	}
 }
 
@@ -867,22 +878,31 @@ profiling:
 
 struct starpu_task *_starpu_pop_every_task(struct _starpu_sched_ctx *sched_ctx)
 {
+	struct starpu_task *task = NULL;
 	if(sched_ctx->sched_policy)
 	{
 		STARPU_ASSERT(sched_ctx->sched_policy->pop_every_task);
 		
 		/* TODO set profiling info */
 		if(sched_ctx->sched_policy->pop_every_task)
-			return sched_ctx->sched_policy->pop_every_task(sched_ctx->id);
+		{
+			_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
+			task = sched_ctx->sched_policy->pop_every_task(sched_ctx->id);
+			_STARPU_TRACE_WORKER_SCHEDULING_POP;
+		}
 	}
-	return NULL;
+	return task;
 }
 
 void _starpu_sched_pre_exec_hook(struct starpu_task *task)
 {
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(task->sched_ctx);
 	if (sched_ctx->sched_policy && sched_ctx->sched_policy->pre_exec_hook)
+	{
+		_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
 		sched_ctx->sched_policy->pre_exec_hook(task);
+		_STARPU_TRACE_WORKER_SCHEDULING_POP;
+	}
 }
 
 void _starpu_sched_post_exec_hook(struct starpu_task *task)
@@ -890,7 +910,11 @@ void _starpu_sched_post_exec_hook(struct starpu_task *task)
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(task->sched_ctx);
 
 	if (sched_ctx->sched_policy && sched_ctx->sched_policy->post_exec_hook)
+	{
+		_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
 		sched_ctx->sched_policy->post_exec_hook(task);
+		_STARPU_TRACE_WORKER_SCHEDULING_POP;
+	}
 }
 
 void _starpu_wait_on_sched_event(void)

+ 1 - 0
src/core/sched_policy.h

@@ -58,6 +58,7 @@ void _starpu_print_idle_time();
 /*
  *	Predefined policies
  */
+extern struct starpu_sched_policy _starpu_sched_lws_policy;
 extern struct starpu_sched_policy _starpu_sched_ws_policy;
 extern struct starpu_sched_policy _starpu_sched_prio_policy;
 extern struct starpu_sched_policy _starpu_sched_random_policy;

+ 21 - 3
src/core/simgrid.c

@@ -33,6 +33,8 @@ extern int starpu_main(int argc, char *argv[]);
 extern int smpi_main(int (*realmain) (int argc, char *argv[]), int argc, char *argv[]);
 #pragma weak smpi_simulated_main_
 extern int smpi_simulated_main_(int argc, char *argv[]);
+#pragma weak starpu_mpi_world_rank
+extern int starpu_mpi_world_rank(void);
 
 #define _starpu_simgrid_running_smpi() (getenv("SMPI_GLOBAL_SIZE") != NULL)
 
@@ -48,6 +50,13 @@ int do_starpu_main(int argc STARPU_ATTRIBUTE_UNUSED, char *argv[] STARPU_ATTRIBU
 	return starpu_main(args->argc, args->argv);
 }
 
+#ifdef HAVE_MSG_ENVIRONMENT_GET_ROUTING_ROOT
+#ifdef HAVE_MSG_GET_AS_BY_NAME
+static msg_as_t _starpu_simgrid_get_as_by_name(const char *name)
+{
+	return MSG_get_as_by_name(name);
+}
+#else /* HAVE_MSG_GET_AS_BY_NAME */
 static msg_as_t __starpu_simgrid_get_as_by_name(msg_as_t root, const char *name)
 {
 	xbt_dict_t dict;
@@ -69,6 +78,8 @@ static msg_as_t _starpu_simgrid_get_as_by_name(const char *name)
 {
 	return __starpu_simgrid_get_as_by_name(MSG_environment_get_routing_root(), name);
 }
+#endif /* HAVE_MSG_GET_AS_BY_NAME */
+#endif /* HAVE_MSG_ENVIRONMENT_GET_ROUTING_ROOT */
 
 int _starpu_simgrid_get_nbhosts(const char *prefix)
 {
@@ -77,13 +88,16 @@ int _starpu_simgrid_get_nbhosts(const char *prefix)
 	unsigned i, nb;
 	unsigned len = strlen(prefix);
 
+#ifdef HAVE_MSG_ENVIRONMENT_GET_ROUTING_ROOT
 	if (_starpu_simgrid_running_smpi())
 	{
 		char name[16];
-		snprintf(name, sizeof(name), STARPU_MPI_AS_PREFIX"%u", smpi_current_rank);
+		STARPU_ASSERT(starpu_mpi_world_rank);
+		snprintf(name, sizeof(name), STARPU_MPI_AS_PREFIX"%u", starpu_mpi_world_rank());
 		hosts = MSG_environment_as_get_hosts(_starpu_simgrid_get_as_by_name(name));
 	}
 	else
+#endif /* HAVE_MSG_ENVIRONMENT_GET_ROUTING_ROOT */
 		hosts = MSG_hosts_as_dynar();
 	nb = xbt_dynar_length(hosts);
 
@@ -125,7 +139,8 @@ msg_host_t _starpu_simgrid_get_host_by_name(const char *name)
 	if (_starpu_simgrid_running_smpi())
 	{
 		char mpiname[16];
-		snprintf(mpiname, sizeof(mpiname), "%d-%s", smpi_current_rank, name);
+		STARPU_ASSERT(starpu_mpi_world_rank);
+		snprintf(mpiname, sizeof(mpiname), "%d-%s", starpu_mpi_world_rank(), name);
 		return MSG_get_host_by_name(mpiname);
 	}
 	else
@@ -178,6 +193,7 @@ void _starpu_simgrid_init()
 	xbt_dynar_t hosts;
 	int i;
 
+#ifdef HAVE_MSG_ENVIRONMENT_GET_ROUTING_ROOT
 	if (_starpu_simgrid_running_smpi())
 	{
 		/* Take back hand to create the local platform for this MPI
@@ -191,7 +207,8 @@ void _starpu_simgrid_init()
 		char template[] = "/tmp/"STARPU_MPI_AS_PREFIX"-platform-XXXXXX.xml";
 		int ret;
 
-		snprintf(asname, sizeof(asname), STARPU_MPI_AS_PREFIX"%u", smpi_current_rank);
+		STARPU_ASSERT(starpu_mpi_world_rank);
+		snprintf(asname, sizeof(asname), STARPU_MPI_AS_PREFIX"%u", starpu_mpi_world_rank());
 
 		/* Get XML platform */
 		_starpu_simgrid_get_platform_path(path, sizeof(path));
@@ -212,6 +229,7 @@ void _starpu_simgrid_init()
 		hosts = MSG_environment_as_get_hosts(_starpu_simgrid_get_as_by_name(asname));
 	}
 	else
+#endif /* HAVE_MSG_ENVIRONMENT_GET_ROUTING_ROOT */
 		hosts = MSG_hosts_as_dynar();
 
 	int nb = xbt_dynar_length(hosts);

+ 7 - 0
src/core/task.c

@@ -187,6 +187,13 @@ void starpu_task_destroy(struct starpu_task *task)
 	_starpu_task_destroy(task);
 }
 
+int starpu_task_finished(struct starpu_task *task)
+{
+	STARPU_ASSERT(task);
+	STARPU_ASSERT_MSG(!task->detach, "starpu_task_finished can only be called on tasks with detach = 0");
+	return _starpu_job_finished(_starpu_get_job_associated_to_task(task));
+}
+
 int starpu_task_wait(struct starpu_task *task)
 {
         _STARPU_LOG_IN();

+ 54 - 38
src/core/workers.c

@@ -467,6 +467,7 @@ static void _starpu_worker_init(struct _starpu_worker *workerarg, struct _starpu
 	workerarg->reverse_phase[1] = 0;
 	workerarg->pop_ctx_priority = 1;
 	workerarg->sched_mutex_locked = 0;
+	workerarg->slave = 0;
 
 	/* cpu_set/hwloc_cpu_set initialized in topology.c */
 }
@@ -516,7 +517,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 
 	/* Launch workers asynchronously */
 	unsigned cpu = 0;
-	unsigned worker;
+	unsigned worker, i;
 
 #if defined(STARPU_PERF_DEBUG) && !defined(STARPU_SIMGRID)
 	/* Get itimer of the main thread, to set it for the worker threads */
@@ -526,6 +527,16 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 #ifdef HAVE_AYUDAME_H
 	if (AYU_event) AYU_event(AYU_INIT, 0, NULL);
 #endif
+
+#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
+	for (i = 0; i < sizeof(cuda_worker_set)/sizeof(cuda_worker_set[0]); i++)
+		cuda_worker_set[i].workers = NULL;
+#endif
+#ifdef STARPU_USE_MIC
+	for (i = 0; i < sizeof(mic_worker_set)/sizeof(mic_worker_set[0]); i++)
+		mic_worker_set[i].workers = NULL;
+#endif
+
 	for (worker = 0; worker < nworkers; worker++)
 	{
 		struct _starpu_worker *workerarg = &pconfig->workers[worker];
@@ -575,44 +586,44 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
 			case STARPU_CUDA_WORKER:
 				driver.id.cuda_id = workerarg->devid;
-				if (_starpu_may_launch_driver(pconfig->conf, &driver))
-				{
-					/* We spawn only one thread per CUDA device,
-					 * which will control all CUDA workers of this
-					 * device. (by using a worker set). */
-					if (cuda_worker_set[devid].started)
-						goto worker_set_initialized;
+				workerarg->set = &cuda_worker_set[devid];
 
-					cuda_worker_set[devid].nworkers = starpu_get_env_number_default("STARPU_NWORKER_PER_CUDA", 1);
-					cuda_worker_set[devid].workers = workerarg;
-					cuda_worker_set[devid].set_is_initialized = 0;
+				/* We spawn only one thread per CUDA device,
+				 * which will control all CUDA workers of this
+				 * device. (by using a worker set). */
+				if (cuda_worker_set[devid].workers)
+					break;
 
-					STARPU_PTHREAD_CREATE_ON(
-						workerarg->name,
-						&cuda_worker_set[devid].worker_thread,
-						NULL,
-						_starpu_cuda_worker,
-						&cuda_worker_set[devid],
-						worker+1);
-#ifdef STARPU_USE_FXT
-					STARPU_PTHREAD_MUTEX_LOCK(&workerarg->mutex);
-					while (!workerarg->worker_is_running)
-						STARPU_PTHREAD_COND_WAIT(&workerarg->started_cond, &workerarg->mutex);
-					STARPU_PTHREAD_MUTEX_UNLOCK(&workerarg->mutex);
-#endif
-					STARPU_PTHREAD_MUTEX_LOCK(&cuda_worker_set[devid].mutex);
-					while (!cuda_worker_set[devid].set_is_initialized)
-						STARPU_PTHREAD_COND_WAIT(&cuda_worker_set[devid].ready_cond,
-									 &cuda_worker_set[devid].mutex);
-					STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_worker_set[devid].mutex);
-					cuda_worker_set[devid].started = 1;
-		worker_set_initialized:
-					workerarg->set = &cuda_worker_set[devid];
-				}
-				else
+				cuda_worker_set[devid].nworkers = starpu_get_env_number_default("STARPU_NWORKER_PER_CUDA", 1);
+				cuda_worker_set[devid].workers = workerarg;
+				cuda_worker_set[devid].set_is_initialized = 0;
+
+				if (!_starpu_may_launch_driver(pconfig->conf, &driver))
 				{
 					workerarg->run_by_starpu = 0;
+					break;
 				}
+
+				STARPU_PTHREAD_CREATE_ON(
+					workerarg->name,
+					&cuda_worker_set[devid].worker_thread,
+					NULL,
+					_starpu_cuda_worker,
+					&cuda_worker_set[devid],
+					worker+1);
+#ifdef STARPU_USE_FXT
+				STARPU_PTHREAD_MUTEX_LOCK(&workerarg->mutex);
+				while (!workerarg->worker_is_running)
+					STARPU_PTHREAD_COND_WAIT(&workerarg->started_cond, &workerarg->mutex);
+				STARPU_PTHREAD_MUTEX_UNLOCK(&workerarg->mutex);
+#endif
+				STARPU_PTHREAD_MUTEX_LOCK(&cuda_worker_set[devid].mutex);
+				while (!cuda_worker_set[devid].set_is_initialized)
+					STARPU_PTHREAD_COND_WAIT(&cuda_worker_set[devid].ready_cond,
+								 &cuda_worker_set[devid].mutex);
+				STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_worker_set[devid].mutex);
+				cuda_worker_set[devid].started = 1;
+
 				break;
 #endif
 #if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
@@ -642,11 +653,13 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 #endif
 #ifdef STARPU_USE_MIC
 			case STARPU_MIC_WORKER:
+				workerarg->set = &mic_worker_set[devid];
+
 				/* We spawn only one thread
 				 * per MIC device, which will control all MIC
 				 * workers of this device. (by using a worker set). */
-				if (mic_worker_set[devid].started)
-					goto worker_set_initialized;
+				if (mic_worker_set[devid].workers)
+					break;
 
 				mic_worker_set[devid].nworkers = pconfig->topology.nmiccores[devid];
 
@@ -678,8 +691,6 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 				STARPU_PTHREAD_MUTEX_UNLOCK(&mic_worker_set[devid].mutex);
 
 				mic_worker_set[devid].started = 1;
-		worker_set_initialized:
-				workerarg->set = &mic_worker_set[devid];
 
 				break;
 #endif /* STARPU_USE_MIC */
@@ -1374,6 +1385,11 @@ unsigned starpu_worker_get_count(void)
 	return config.topology.nworkers;
 }
 
+unsigned starpu_worker_is_slave(int workerid)
+{
+	return config.workers[workerid].slave;
+}
+
 int starpu_worker_get_count_by_type(enum starpu_worker_archtype type)
 {
 	switch (type)

+ 3 - 0
src/core/workers.h

@@ -112,6 +112,9 @@ LIST_TYPE(_starpu_worker,
 	/* flag to know if sched_mutex is locked or not */
 	unsigned sched_mutex_locked;
 
+	/* bool to indicate if the worker is slave in a ctx */
+	unsigned slave;
+
 #ifdef __GLIBC__
 	cpu_set_t cpu_set;
 #endif /* __GLIBC__ */

+ 15 - 8
src/datawizard/coherency.c

@@ -151,7 +151,7 @@ void _starpu_update_data_state(starpu_data_handle_t handle,
 
 	/* the data is present now */
 	unsigned requesting_node = requesting_replicate->memory_node;
-	requesting_replicate->requested[requesting_node] = 0;
+	requesting_replicate->requested &= ~(1UL << requesting_node);
 
 	if (mode & STARPU_W)
 	{
@@ -656,18 +656,25 @@ void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_
 		_starpu_spin_unlock(&handle->header_lock);
 }
 
-static void _starpu_set_data_requested_flag_if_needed(struct _starpu_data_replicate *replicate)
+static void _starpu_set_data_requested_flag_if_needed(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate)
 {
-// XXX : this is just a hint, so we don't take the lock ...
-//	_starpu_spin_lock(&handle->header_lock);
+	unsigned local_node = _starpu_memory_node_get_local_key();
+	int cpt = 0;
+	while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
+	{
+		cpt++;
+		_starpu_datawizard_progress(local_node, 1);
+	}
+	if (cpt == STARPU_SPIN_MAXTRY)
+		_starpu_spin_lock(&handle->header_lock);
 
 	if (replicate->state == STARPU_INVALID)
 	{
 		unsigned dst_node = replicate->memory_node;
-		replicate->requested[dst_node] = 1;
+		replicate->requested |= 1UL << dst_node;
 	}
 
-//	_starpu_spin_unlock(&handle->header_lock);
+	_starpu_spin_unlock(&handle->header_lock);
 }
 
 int starpu_prefetch_task_input_on_node(struct starpu_task *task, unsigned node)
@@ -686,7 +693,7 @@ int starpu_prefetch_task_input_on_node(struct starpu_task *task, unsigned node)
 		struct _starpu_data_replicate *replicate = &handle->per_node[node];
 		prefetch_data_on_node(handle, replicate, mode);
 
-		_starpu_set_data_requested_flag_if_needed(replicate);
+		_starpu_set_data_requested_flag_if_needed(handle, replicate);
 	}
 
 	return 0;
@@ -880,7 +887,7 @@ unsigned _starpu_is_data_present_or_requested(starpu_data_handle_t handle, unsig
 
 		for (i = 0; i < nnodes; i++)
 		{
-			if (handle->per_node[node].requested[i] || handle->per_node[node].request[i])
+			if ((handle->per_node[node].requested & (1UL << i)) || handle->per_node[node].request[i])
 				ret = 1;
 		}
 

+ 11 - 11
src/datawizard/coherency.h

@@ -48,26 +48,26 @@ LIST_TYPE(_starpu_data_replicate,
 
 	unsigned memory_node;
 
-	/* A buffer that is used for SCRATCH or reduction cannnot be used with
-	 * filters. */
-	unsigned relaxed_coherency;
-
-	/* We may need to initialize the replicate with some value before using it. */
-	unsigned initialized;
-
 	/* describes the state of the local data in term of coherency */
 	enum _starpu_cache_state	state;
 
 	int refcnt;
 
+	/* A buffer that is used for SCRATCH or reduction cannot be used with
+	 * filters. */
+	unsigned relaxed_coherency:2;
+
+	/* We may need to initialize the replicate with some value before using it. */
+	unsigned initialized:1;
+
 	/* is the data locally allocated ? */
-	uint8_t allocated;
+	unsigned allocated:1;
 	/* was it automatically allocated ? (else it's the application-provided
 	 * buffer, don't ever try to free it!) */
 	/* perhaps the allocation was perform higher in the hiearchy
 	 * for now this is just translated into !automatically_allocated
 	 * */
-	uint8_t automatically_allocated;
+	unsigned automatically_allocated:1;
 
         /* Pointer to memchunk for LRU strategy */
 	struct _starpu_mem_chunk * mc;
@@ -79,7 +79,7 @@ LIST_TYPE(_starpu_data_replicate,
 	   flag when it assigns a task to a queue, policies which do not
 	   use this hint can simply ignore it.
 	 */
-	uint8_t requested[STARPU_MAXNODES];
+	uint32_t requested;
 	struct _starpu_data_request *request[STARPU_MAXNODES];
 )
 
@@ -207,7 +207,7 @@ struct _starpu_data_state
 	 * the end of the reduction. */
 	struct _starpu_data_requester_list *reduction_req_list;
 
-	starpu_data_handle_t reduction_tmp_handles[STARPU_NMAXWORKERS];
+	starpu_data_handle_t *reduction_tmp_handles;
 
 	unsigned lazy_unregister;
 

+ 2 - 1
src/datawizard/filters.c

@@ -176,6 +176,7 @@ void starpu_data_partition(starpu_data_handle_t initial_handle, struct starpu_da
 		/* initialize the chunk lock */
 		child->req_list = _starpu_data_requester_list_new();
 		child->reduction_req_list = _starpu_data_requester_list_new();
+		child->reduction_tmp_handles = NULL;
 		child->refcnt = 0;
 		child->busy_count = 0;
 		child->busy_waiting = 0;
@@ -240,10 +241,10 @@ void starpu_data_partition(starpu_data_handle_t initial_handle, struct starpu_da
 			child_replicate->automatically_allocated = 0;
 			child_replicate->refcnt = 0;
 			child_replicate->memory_node = starpu_worker_get_memory_node(worker);
+			child_replicate->requested = 0;
 
 			for (node = 0; node < STARPU_MAXNODES; node++)
 			{
-				child_replicate->requested[node] = 0;
 				child_replicate->request[node] = NULL;
 			}
 

+ 2 - 1
src/datawizard/interfaces/data_interface.c

@@ -291,6 +291,7 @@ static void _starpu_register_new_data(starpu_data_handle_t handle,
 
 	handle->reduction_refcnt = 0;
 	handle->reduction_req_list = _starpu_data_requester_list_new();
+	handle->reduction_tmp_handles = NULL;
 
 #ifdef STARPU_USE_FXT
 	handle->last_submitted_ghost_sync_id_is_valid = 0;
@@ -346,10 +347,10 @@ static void _starpu_register_new_data(starpu_data_handle_t handle,
 		replicate->state = STARPU_INVALID;
 		replicate->refcnt = 0;
 		replicate->handle = handle;
+		replicate->requested = 0;
 
 		for (node = 0; node < STARPU_MAXNODES; node++)
 		{
-			replicate->requested[node] = 0;
 			replicate->request[node] = NULL;
 		}
 

+ 5 - 1
src/datawizard/reduction.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -156,6 +156,8 @@ void _starpu_data_end_reduction_mode(starpu_data_handle_t handle)
 
 	/* Register all valid per-worker replicates */
 	unsigned nworkers = starpu_worker_get_count();
+	STARPU_ASSERT(!handle->reduction_tmp_handles);
+	handle->reduction_tmp_handles = malloc(nworkers * sizeof(handle->reduction_tmp_handles[0]));
 	for (worker = 0; worker < nworkers; worker++)
 	{
 		if (handle->per_worker[worker].initialized)
@@ -390,4 +392,6 @@ void _starpu_data_end_reduction_mode_terminate(starpu_data_handle_t handle)
 			/* TODO put in cache */
 		}
 	}
+	free(handle->reduction_tmp_handles);
+	handle->reduction_tmp_handles = NULL;
 }

+ 2 - 4
src/datawizard/user_interactions.c

@@ -519,9 +519,7 @@ void starpu_data_set_default_sequential_consistency_flag(unsigned flag)
 /* Query the status of the handle on the specified memory node. */
 void starpu_data_query_status(starpu_data_handle_t handle, int memory_node, int *is_allocated, int *is_valid, int *is_requested)
 {
-#ifdef STARPU_DEVEL
-#warning FIXME
-#endif
+// XXX : this is just a hint, so we don't take the lock ...
 //	_starpu_spin_lock(&handle->header_lock);
 
 	if (is_allocated)
@@ -537,7 +535,7 @@ void starpu_data_query_status(starpu_data_handle_t handle, int memory_node, int
 		unsigned node;
 		for (node = 0; node < STARPU_MAXNODES; node++)
 		{
-			if (handle->per_node[memory_node].requested[node])
+			if (handle->per_node[memory_node].requested & (1UL << node))
 			{
 				requested = 1;
 				break;

+ 109 - 7
src/debug/traces/starpu_fxt.c

@@ -275,6 +275,18 @@ static void worker_set_state(double time, const char *prefix, long unsigned int
 #endif
 }
 
+static void worker_set_detailed_state(double time, const char *prefix, long unsigned int workerid, const char *name, unsigned long size, unsigned long footprint, unsigned long long tag)
+{
+#ifdef STARPU_HAVE_POTI
+	char container[STARPU_POTI_STR_LEN];
+	thread_container_alias(container, STARPU_POTI_STR_LEN, prefix, workerid);
+	/* TODO: set detailed state */
+	poti_SetState(time, container, "S", name);
+#else
+	fprintf(out_paje_file, "20	%.9f	%st%lu	S	%s	%lu	%08lx	%016llx\n", time, prefix, workerid, name, size, footprint, tag);
+#endif
+}
+
 static void worker_push_state(double time, const char *prefix, long unsigned int workerid, const char *name)
 {
 #ifdef STARPU_HAVE_POTI
@@ -631,11 +643,8 @@ static void handle_start_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_op
 	int worker;
 	worker = find_worker_id(ev->param[2]);
 
-	unsigned sched_ctx = ev->param[1];
 	if (worker < 0) return;
 
-	char *prefix = options->file_prefix;
-
 	unsigned long has_name = ev->param[3];
 	char *name = has_name?(char *)&ev->param[4]:"unknown";
 
@@ -646,8 +655,12 @@ static void handle_start_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_op
 
 	create_paje_state_if_not_found(name, options);
 
+#ifndef STARPU_ENABLE_PAJE_CODELET_DETAILS
 	if (out_paje_file)
 	{
+		char *prefix = options->file_prefix;
+		unsigned sched_ctx = ev->param[1];
+
 		worker_set_state(start_codelet_time, prefix, ev->param[2], name);
 		if (sched_ctx != 0)
 		{
@@ -662,9 +675,40 @@ static void handle_start_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_op
 #endif
 		}
 	}
+#endif /* STARPU_ENABLE_PAJE_CODELET_DETAILS */
 
 }
 
+static void handle_codelet_details(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
+{
+#ifdef STARPU_ENABLE_PAJE_CODELET_DETAILS
+	int worker;
+	worker = find_worker_id(ev->param[5]);
+
+	unsigned sched_ctx = ev->param[1];
+	if (worker < 0) return;
+
+	char *prefix = options->file_prefix;
+
+	if (out_paje_file)
+	{
+		worker_set_detailed_state(last_codelet_start[worker], prefix, ev->param[5], last_codelet_symbol[worker], ev->param[2], ev->param[3], ev->param[4]);
+		if (sched_ctx != 0)
+		{
+#ifdef STARPU_HAVE_POTI
+			char container[STARPU_POTI_STR_LEN];
+			char ctx[6];
+			snprintf(ctx, sizeof(ctx), "Ctx%d", sched_ctx);
+			thread_container_alias(container, STARPU_POTI_STR_LEN, prefix, ev->param[5]);
+			poti_SetState(last_codelet_start[worker], container, ctx, last_codelet_symbol[worker]);
+#else
+			fprintf(out_paje_file, "20	%.9f	%st%"PRIu64"	Ctx%d	%s	%08lx	%lu	%016llx\n", last_codelet_start[worker], prefix, ev->param[2], sched_ctx, last_codelet_symbol[worker], (unsigned long) ev->param[2], (unsigned long) ev->param[3], (unsigned long long) ev->param[4]);
+#endif
+		}
+	}
+#endif /* STARPU_ENABLE_PAJE_CODELET_DETAILS */
+}
+
 static long dumped_codelets_count;
 static struct starpu_fxt_codelet_event *dumped_codelets;
 
@@ -727,7 +771,7 @@ static void handle_user_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *o
 #ifdef STARPU_HAVE_POTI
 			program_container_alias (container, STARPU_POTI_STR_LEN, prefix);
 #else
-			fprintf(out_paje_file, "9	%.9f	event	%sp	%lu\n", get_event_time_stamp(ev, options), prefix, code);
+			fprintf(out_paje_file, "9	%.9f	user_event	%sp	%lu\n", get_event_time_stamp(ev, options), prefix, code);
 #endif
 	}
 	else
@@ -736,12 +780,12 @@ static void handle_user_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *o
 #ifdef STARPU_HAVE_POTI
 			thread_container_alias (container, STARPU_POTI_STR_LEN, prefix, ev->param[1]);
 #else
-			fprintf(out_paje_file, "9	%.9f	event	%st%"PRIu64"	%lu\n", get_event_time_stamp(ev, options), prefix, ev->param[1], code);
+			fprintf(out_paje_file, "9	%.9f	user_event	%st%"PRIu64"	%lu\n", get_event_time_stamp(ev, options), prefix, ev->param[1], code);
 #endif
 	}
 #ifdef STARPU_HAVE_POTI
 	if (out_paje_file)
-		poti_NewEvent(get_event_time_stamp(ev, options), container, "thread_event", paje_value);
+		poti_NewEvent(get_event_time_stamp(ev, options), container, "user_event", paje_value);
 #endif
 }
 
@@ -916,6 +960,40 @@ static void handle_start_driver_copy(struct fxt_ev_64 *ev, struct starpu_fxt_opt
 
 }
 
+
+static void handle_work_stealing(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
+{
+	unsigned dst = ev->param[0];
+	unsigned src = ev->param[1];
+	unsigned size = 0;
+	unsigned comid = 0;
+	
+	char *prefix = options->file_prefix;
+
+	
+	if (out_paje_file)
+	{
+		double time = get_event_time_stamp(ev, options);
+#ifdef STARPU_HAVE_POTI
+		char paje_value[STARPU_POTI_STR_LEN], paje_key[STARPU_POTI_STR_LEN], src_worker_container[STARPU_POTI_STR_LEN], dst_worker_container[STARPU_POTI_STR_LEN];
+		char program_container[STARPU_POTI_STR_LEN];
+		snprintf(paje_value, STARPU_POTI_STR_LEN, "%u", size);
+		snprintf(paje_key, STARPU_POTI_STR_LEN, "steal_%u", comid);
+		program_container_alias(program_container, STARPU_POTI_STR_LEN, prefix);
+		worker_container_alias(src_worker_container, STARPU_POTI_STR_LEN, prefix, src);
+		worker_container_alias(dst_worker_container, STARPU_POTI_STR_LEN, prefix, dst);
+		poti_StartLink(time, program_container, "L", src_worker_container, paje_value, paje_key);
+		poti_EndLink(time+0.000000001, program_container, "L", dst_worker_container, paje_value, paje_key);
+#else
+
+		fprintf(out_paje_file, "18	%.9f	L	%sp	%u	%sw%d	steal_%u\n", time, prefix, size, prefix, src, comid);
+		fprintf(out_paje_file, "19	%.9f	L	%sp	%u	%sw%d	steal_%u\n", time+0.000000001, prefix, size, prefix, dst, comid);
+#endif
+	}
+
+}
+
+
 static void handle_end_driver_copy(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
 {
 	unsigned dst = ev->param[1];
@@ -1380,6 +1458,23 @@ static void handle_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *option
 	}
 }
 
+static void handle_thread_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
+{
+	/* Add an event in the trace */
+	if (out_paje_file)
+	{
+		char *event = (char*)&ev->param[1];
+
+#ifdef STARPU_HAVE_POTI
+		char container[STARPU_POTI_STR_LEN];
+		thread_container_alias(container, STARPU_POTI_STR_LEN, options->file_prefix, ev->param[0]);
+		poti_NewEvent(get_event_time_stamp(ev, options), container, "thread_event", event);
+#else
+		fprintf(out_paje_file, "9	%.9f	thread_event	%st%"PRIu64"	%s\n", get_event_time_stamp(ev, options), options->file_prefix, ev->param[0], event);
+#endif
+	}
+}
+
 static
 void _starpu_fxt_display_bandwidth(struct starpu_fxt_options *options)
 {
@@ -1507,6 +1602,9 @@ void starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *opt
 			case _STARPU_FUT_START_CODELET_BODY:
 				handle_start_codelet_body(&ev, options);
 				break;
+			case _STARPU_FUT_CODELET_DETAILS:
+				handle_codelet_details(&ev, options);
+				break;
 			case _STARPU_FUT_END_CODELET_BODY:
 				handle_end_codelet_body(&ev, options);
 				break;
@@ -1624,7 +1722,7 @@ void starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *opt
 				break;
 
 			case _STARPU_FUT_WORK_STEALING:
-				/* XXX */
+				handle_work_stealing(&ev, options);
 				break;
 
 			case _STARPU_FUT_WORKER_DEINIT_START:
@@ -1797,6 +1895,10 @@ void starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *opt
 				handle_event(&ev, options);
 				break;
 
+			case _STARPU_FUT_THREAD_EVENT:
+				handle_thread_event(&ev, options);
+				break;
+
 			case _STARPU_FUT_LOCKING_MUTEX:
 				break;
 

+ 13 - 0
src/debug/traces/starpu_paje.c

@@ -130,6 +130,17 @@ void _starpu_fxt_write_paje_header(FILE *file)
 	fprintf(file, "%%	DestContainer	string\n");
 	fprintf(file, "%%	Key	string\n");
 	fprintf(file, "%%EndEventDef\n");
+#ifdef STARPU_ENABLE_PAJE_CODELET_DETAILS
+	fprintf(file, "%%EventDef PajeSetState 20\n");
+	fprintf(file, "%%	Time	date\n");
+	fprintf(file, "%%	Container	string\n");
+	fprintf(file, "%%	Type	string\n");
+	fprintf(file, "%%	Value	string\n");
+	fprintf(file, "%%	Size	string\n");
+	fprintf(file, "%%	Footprint	string\n");
+	fprintf(file, "%%	Tag	string\n");
+	fprintf(file, "%%EndEventDef\n");
+#endif
 #endif
 
 #ifdef STARPU_HAVE_POTI
@@ -156,6 +167,7 @@ void _starpu_fxt_write_paje_header(FILE *file)
 	poti_DefineEntityValue("No", "MS", "Nothing", ".0 .0 .0");
 
 	/* Types for the Worker of the Memory Node */
+	poti_DefineEventType("user_event", "T", "user event type");
 	poti_DefineEventType("thread_event", "T", "thread event type");
 	poti_DefineStateType("S", "T", "Thread State");
 	poti_DefineEntityValue("I", "S", "Initializing", "0.0 .7 1.0");
@@ -220,6 +232,7 @@ void _starpu_fxt_write_paje_header(FILE *file)
 1       MPICt   T       \"MPI Communication Thread\"              \n\
 1       Sc       P       \"Scheduler State\"                        \n\
 2       prog_event   P       \"program event type\"				\n\
+2       user_event   T       \"user event type\"				\n\
 2       thread_event   T       \"thread event type\"				\n\
 2       MPIev   MPICt    \"MPI event type\"			\n\
 3       S       T       \"Thread State\"                        \n\

+ 1 - 1
src/drivers/cpu/driver_cpu.c

@@ -89,7 +89,7 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_
 	}
 
 	/* Give profiling variable */
-	_starpu_driver_start_job(cpu_args, j, &codelet_start, rank, profiling);
+	_starpu_driver_start_job(cpu_args, j, perf_arch, &codelet_start, rank, profiling);
 
 	/* In case this is a Fork-join parallel task, the worker does not
 	 * execute the kernel at all. */

+ 7 - 3
src/drivers/cuda/driver_cuda.c

@@ -396,7 +396,7 @@ static int start_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *args)
 		return -EAGAIN;
 	}
 
-	_starpu_driver_start_job(args, j, &j->cl_start, 0, profiling);
+	_starpu_driver_start_job(args, j, &args->perf_arch, &j->cl_start, 0, profiling);
 
 #if defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
 	/* We make sure we do manipulate the proper device */
@@ -517,7 +517,10 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 	unsigned memnode = worker0->memory_node;
 	struct starpu_task *tasks[worker_set->nworkers], *task;
 	struct _starpu_job *j;
-	int i, res, idle;
+	int i, res;
+
+#ifndef STARPU_SIMGRID
+	int idle;
 
 	/* First poll for completed jobs */
 	idle = 0;
@@ -540,13 +543,13 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 		if (cures != cudaSuccess)
 		{
 			STARPU_ASSERT(cures == cudaErrorNotReady);
-			idle++;
 		}
 		else
 		{
 			/* Asynchronous task completed! */
 			_starpu_set_local_worker_key(args);
 			finish_job_on_cuda(_starpu_get_job_associated_to_task(task), args);
+			idle++;
 		}
 	}
 
@@ -556,6 +559,7 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 		__starpu_datawizard_progress(memnode, 1, 0);
 		return 0;
 	}
+#endif /* STARPU_SIMGRID */
 
 	/* Something done, make some progress */
 	__starpu_datawizard_progress(memnode, 1, 1);

+ 4 - 2
src/drivers/driver_common/driver_common.c

@@ -34,7 +34,7 @@
 #define BACKOFF_MAX 32  /* TODO : use parameter to define them */
 #define BACKOFF_MIN 1
 
-void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j, struct timespec *codelet_start, int rank, int profiling)
+void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch, struct timespec *codelet_start, int rank, int profiling)
 {
 	struct starpu_task *task = j->task;
 	struct starpu_codelet *cl = task->cl;
@@ -74,7 +74,7 @@ void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j
 	if (starpu_top)
 		_starpu_top_task_started(task,workerid,codelet_start);
 
-	_STARPU_TRACE_START_CODELET_BODY(j);
+	_STARPU_TRACE_START_CODELET_BODY(j, j->nimpl, perf_arch);
 }
 
 void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch STARPU_ATTRIBUTE_UNUSED, struct timespec *codelet_end, int rank, int profiling)
@@ -398,6 +398,7 @@ int _starpu_get_multi_worker_task(struct _starpu_worker *workers, struct starpu_
 		/*else try to pop a task*/
 		else
 		{
+			_starpu_worker_set_status_scheduling(workers[i].workerid);
 			STARPU_PTHREAD_MUTEX_LOCK(&workers[i].sched_mutex);
 			_starpu_set_local_worker_key(&workers[i]);
 			tasks[i] = _starpu_pop_task(&workers[i]);
@@ -427,6 +428,7 @@ int _starpu_get_multi_worker_task(struct _starpu_worker *workers, struct starpu_
 					workers[i].current_rank = 0;
 				}
 
+				_starpu_worker_set_status_scheduling_done(workers[i].workerid);
 				_starpu_worker_set_status_wakeup(workers[i].workerid);
 			}
 			else

+ 2 - 2
src/drivers/driver_common/driver_common.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2012, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -23,7 +23,7 @@
 #include <core/jobs.h>
 #include <common/utils.h>
 
-void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j,
+void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch,
 			      struct timespec *codelet_start, int rank, int profiling);
 void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch,
 			    struct timespec *codelet_end, int rank, int profiling);

+ 1 - 1
src/drivers/mp_common/source_common.c

@@ -421,7 +421,7 @@ static int _starpu_src_common_execute(struct _starpu_job *j,
 
 	void (*kernel)(void)  = node->get_kernel_from_job(node,j);
 
-	_starpu_driver_start_job(worker, j, &j->cl_start, 0, profiling);
+	_starpu_driver_start_job(worker, j, &worker->perf_arch, &j->cl_start, 0, profiling);
 
 
 	//_STARPU_DEBUG("\nworkerid:%d, rank:%d, type:%d,	cb_workerid:%d, task_size:%d\n\n",worker->devid,worker->current_rank,task->cl->type,j->combined_workerid,j->task_size);

+ 4 - 2
src/drivers/opencl/driver_opencl.c

@@ -619,6 +619,7 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *args)
 	struct starpu_task *task;
 	int res;
 
+#ifndef STARPU_SIMGRID
 	task = starpu_task_get_current();
 
 	if (task)
@@ -642,6 +643,7 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *args)
 		/* Asynchronous task completed! */
 		_starpu_opencl_stop_job(_starpu_get_job_associated_to_task(task), args);
 	}
+#endif /* STARPU_SIMGRID */
 
 	__starpu_datawizard_progress(memnode, 1, 1);
 
@@ -700,7 +702,7 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *args)
 	else
 #else
 #ifdef STARPU_DEVEL
-#warning No CUDA asynchronous execution with simgrid yet.
+#warning No OpenCL asynchronous execution with simgrid yet.
 #endif
 #endif
 	/* Synchronous execution */
@@ -823,7 +825,7 @@ static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker
 		return -EAGAIN;
 	}
 
-	_starpu_driver_start_job(args, j, &j->cl_start, 0, profiling);
+	_starpu_driver_start_job(args, j, &args->perf_arch, &j->cl_start, 0, profiling);
 
 	starpu_opencl_func_t func = _starpu_task_get_opencl_nth_implementation(cl, j->nimpl);
 	STARPU_ASSERT_MSG(func, "when STARPU_OPENCL is defined in 'where', opencl_func or opencl_funcs has to be defined");

+ 17 - 10
src/sched_policies/deque_modeling_policy_data_aware.c

@@ -286,6 +286,13 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 	struct _starpu_dmda_data *dt = (struct _starpu_dmda_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
 	/* make sure someone coule execute that task ! */
 	STARPU_ASSERT(best_workerid != -1);
+	unsigned child_sched_ctx = starpu_sched_ctx_worker_is_master_for_child_ctx(best_workerid, sched_ctx_id);
+        if(child_sched_ctx != STARPU_NMAX_SCHED_CTXS)
+        {
+		starpu_sched_ctx_revert_task_counters(sched_ctx_id, task->flops);
+                starpu_sched_ctx_move_task_to_ctx(task, child_sched_ctx);
+                return 0;
+        }
 
 	struct _starpu_fifo_taskq *fifo = dt->queue_array[best_workerid];
 
@@ -405,9 +412,9 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio, unsigned sched
 	if(workers->init_iterator)
 		workers->init_iterator(workers, &it);
 
-	while(workers->has_next(workers, &it))
+	while(workers->has_next_master(workers, &it))
 	{
-		worker = workers->get_next(workers, &it);
+		worker = workers->get_next_master(workers, &it);
 		struct _starpu_fifo_taskq *fifo  = dt->queue_array[worker];
 		unsigned memory_node = starpu_worker_get_memory_node(worker);
 		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker);
@@ -543,9 +550,9 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 	if(workers->init_iterator)
 		workers->init_iterator(workers, &it);
 
-	while(workers->has_next(workers, &it))
+	while(workers->has_next_master(workers, &it))
 	{
-		worker = workers->get_next(workers, &it);
+		worker = workers->get_next_master(workers, &it);
 
 		struct _starpu_fifo_taskq *fifo = dt->queue_array[worker];
 		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker);
@@ -692,10 +699,6 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 
 	double fitness[nworkers_ctx][STARPU_MAXIMPLEMENTATIONS];
 
-	struct starpu_sched_ctx_iterator it;
-	if(workers->init_iterator)
-		workers->init_iterator(workers, &it);
-
 	compute_all_performance_predictions(task,
 					    nworkers_ctx,
 					    local_task_length,
@@ -712,9 +715,13 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 	unsigned nimpl;
 	if (forced_best == -1)
 	{
-		while(workers->has_next(workers, &it))
+		struct starpu_sched_ctx_iterator it;
+		if(workers->init_iterator)
+			workers->init_iterator(workers, &it);
+
+		while(workers->has_next_master(workers, &it))
 		{
-			worker = workers->get_next(workers, &it);
+			worker = workers->get_next_master(workers, &it);
 			for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
 			{
 				if (!starpu_worker_can_execute_task(worker, task, nimpl))

+ 13 - 2
src/sched_policies/eager_central_policy.c

@@ -94,9 +94,9 @@ static int push_task_eager_policy(struct starpu_task *task)
 	if(workers->init_iterator)
 		workers->init_iterator(workers, &it);
 	
-	while(workers->has_next(workers, &it))
+	while(workers->has_next_master(workers, &it))
 	{
-		worker = workers->get_next(workers, &it);
+		worker = workers->get_next_master(workers, &it);
 
 #ifdef STARPU_NON_BLOCKING_DRIVERS
 		if (!starpu_bitmap_get(data->waiters, worker))
@@ -167,6 +167,17 @@ static struct starpu_task *pop_task_eager_policy(unsigned sched_ctx_id)
 
 	STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex);
 
+	if(task)
+	{
+		unsigned child_sched_ctx = starpu_sched_ctx_worker_is_master_for_child_ctx(workerid, sched_ctx_id);
+		if(child_sched_ctx != STARPU_NMAX_SCHED_CTXS)
+		{
+			starpu_sched_ctx_revert_task_counters(sched_ctx_id, task->flops);
+			starpu_sched_ctx_move_task_to_ctx(task, child_sched_ctx);
+			return NULL;
+		}
+	}
+
 	return task;
 }
 

+ 373 - 0
src/sched_policies/locality_work_stealing_policy.c

@@ -0,0 +1,373 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012  INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/* Work stealing policy */
+
+#include <float.h>
+
+#include <core/workers.h>
+#include <sched_policies/fifo_queues.h>
+#include <core/debug.h>
+#include <starpu_bitmap.h>
+
+struct _starpu_lws_data
+{
+	struct _starpu_fifo_taskq **queue_array;
+	int **proxlist;
+	unsigned last_pop_worker;
+	unsigned last_push_worker;
+};
+
+
+#ifdef STARPU_HAVE_HWLOC
+
+/* Return a worker to steal a task from. The worker is selected
+ * according to the proximity list built using the info on the
+ * architecture provided by hwloc */
+static unsigned select_victim_neighborhood(unsigned sched_ctx_id, int workerid)
+{
+
+	struct _starpu_lws_data *ws = (struct _starpu_lws_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
+
+	int nworkers = starpu_sched_ctx_get_nworkers(sched_ctx_id);
+
+	int i;
+	int neighbor;
+	for(i=0; i<nworkers; i++){
+		neighbor = ws->proxlist[workerid][i];
+		int ntasks = ws->queue_array[neighbor]->ntasks;
+		
+		if (ntasks)
+			return neighbor;
+	}
+
+	return workerid;
+}
+#else
+/* Return a worker to steal a task from. The worker is selected
+ * in a round-robin fashion */
+static unsigned select_victim_round_robin(unsigned sched_ctx_id)
+{
+	struct _starpu_lws_data *ws = (struct _starpu_lws_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
+	unsigned worker = ws->last_pop_worker;
+	unsigned nworkers = starpu_sched_ctx_get_nworkers(sched_ctx_id);
+
+	starpu_pthread_mutex_t *victim_sched_mutex;
+	starpu_pthread_cond_t *victim_sched_cond;
+
+	/* If the worker's queue is empty, let's try
+	 * the next ones */
+	while (1)
+	{
+		unsigned ntasks;
+
+		starpu_worker_get_sched_condition(worker, &victim_sched_mutex, &victim_sched_cond);
+		ntasks = ws->queue_array[worker]->ntasks;
+		if (ntasks)
+			break;
+
+		worker = (worker + 1) % nworkers;
+		if (worker == ws->last_pop_worker)
+		{
+			/* We got back to the first worker,
+			 * don't go in infinite loop */
+			break;
+		}
+	}
+
+	ws->last_pop_worker = (worker + 1) % nworkers;
+
+	return worker;
+}
+
+
+#endif
+
+
+/**
+ * Return a worker to whom a task can be assigned.
+ * Selecting a worker is done in a round-robin fashion.
+ */
+static unsigned select_worker_round_robin(unsigned sched_ctx_id)
+{
+	struct _starpu_lws_data *ws = (struct _starpu_lws_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
+	unsigned worker = ws->last_push_worker;
+	unsigned nworkers = starpu_sched_ctx_get_nworkers(sched_ctx_id);
+	/* TODO: use an atomic update operation for this */
+	ws->last_push_worker = (ws->last_push_worker + 1) % nworkers;
+
+	return worker;
+}
+
+
+/**
+ * Return a worker from which a task can be stolen.
+ */
+static inline unsigned select_victim(unsigned sched_ctx_id, int workerid)
+{
+
+#ifdef STARPU_HAVE_HWLOC
+	return select_victim_neighborhood(sched_ctx_id, workerid);
+#else
+	return select_victim_round_robin(sched_ctx_id);
+#endif
+}
+
+/**
+ * Return a worker on whose queue a task can be pushed. This is only
+ * needed when the push is done by the master
+ */
+static inline unsigned select_worker(unsigned sched_ctx_id)
+{
+	return select_worker_round_robin(sched_ctx_id);
+}
+
+
+static struct starpu_task *lws_pop_task(unsigned sched_ctx_id)
+{
+	struct _starpu_lws_data *ws = (struct _starpu_lws_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
+
+	struct starpu_task *task = NULL;
+
+	int workerid = starpu_worker_get_id();
+
+	STARPU_ASSERT(workerid != -1);
+
+	task = _starpu_fifo_pop_task(ws->queue_array[workerid], workerid);
+	if (task)
+	{
+		/* there was a local task */
+		/* printf("Own    task!%d\n",workerid); */
+		return task;
+	}
+	starpu_pthread_mutex_t *worker_sched_mutex;
+	starpu_pthread_cond_t *worker_sched_cond;
+	starpu_worker_get_sched_condition(workerid, &worker_sched_mutex, &worker_sched_cond);
+
+	/* Note: Releasing this mutex before taking the victim mutex, to avoid deadlock */
+	STARPU_PTHREAD_MUTEX_UNLOCK(worker_sched_mutex);
+       
+
+	/* we need to steal someone's job */
+	unsigned victim = select_victim(sched_ctx_id, workerid);
+
+	starpu_pthread_mutex_t *victim_sched_mutex;
+	starpu_pthread_cond_t *victim_sched_cond;
+
+	starpu_worker_get_sched_condition(victim, &victim_sched_mutex, &victim_sched_cond);
+	STARPU_PTHREAD_MUTEX_LOCK(victim_sched_mutex);
+
+	task = _starpu_fifo_pop_task(ws->queue_array[victim], workerid);
+	if (task)
+	{
+		_STARPU_TRACE_WORK_STEALING(workerid, victim);
+	}
+
+	STARPU_PTHREAD_MUTEX_UNLOCK(victim_sched_mutex);
+
+	STARPU_PTHREAD_MUTEX_LOCK(worker_sched_mutex);
+	if(!task)
+	{
+		task = _starpu_fifo_pop_task(ws->queue_array[workerid], workerid);
+		if (task)
+		{
+			/* there was a local task */
+			return task;
+		}
+	}
+
+	return task;
+}
+
+static int lws_push_task(struct starpu_task *task)
+{
+	unsigned sched_ctx_id = task->sched_ctx;
+	struct _starpu_lws_data *ws = (struct _starpu_lws_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
+
+	int workerid = starpu_worker_get_id();
+
+	/* If the current thread is not a worker but
+	 * the main thread (-1), we find the better one to
+	 * put task on its queue */
+	if (workerid == -1)
+		workerid = select_worker(sched_ctx_id);
+
+	/* int workerid = starpu_worker_get_id(); */
+	/* print_neighborhood(sched_ctx_id, 0); */
+	
+	starpu_pthread_mutex_t *sched_mutex;
+	starpu_pthread_cond_t *sched_cond;
+	starpu_worker_get_sched_condition(workerid, &sched_mutex, &sched_cond);
+	STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
+
+	_starpu_fifo_push_task(ws->queue_array[workerid], task);
+	
+	starpu_push_task_end(task);
+
+	STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
+
+#ifndef STARPU_NON_BLOCKING_DRIVERS
+	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
+	struct starpu_sched_ctx_iterator it;
+	if(workers->init_iterator)
+		workers->init_iterator(workers, &it);
+	while(workers->has_next(workers, &it))
+	{
+		worker = workers->get_next(workers, &it);
+		starpu_pthread_mutex_t *sched_mutex;
+		starpu_pthread_cond_t *sched_cond;
+		starpu_worker_get_sched_condition(worker, &sched_mutex, &sched_cond);
+		STARPU_PTHREAD_COND_SIGNAL(sched_cond);
+	}
+#endif
+
+
+	
+	return 0;
+}
+
+static void lws_add_workers(unsigned sched_ctx_id, int *workerids,unsigned nworkers)
+{
+	struct _starpu_lws_data *ws = (struct _starpu_lws_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
+
+	unsigned i;
+	int workerid;
+
+	for (i = 0; i < nworkers; i++)
+	{
+		workerid = workerids[i];
+		starpu_sched_ctx_worker_shares_tasks_lists(workerid, sched_ctx_id);
+		ws->queue_array[workerid] = _starpu_create_fifo();
+
+		/* Tell helgrind that we are fine with getting outdated values,
+		 * this is just an estimation */
+		STARPU_HG_DISABLE_CHECKING(ws->queue_array[workerid]->ntasks);
+
+		ws->queue_array[workerid]->nprocessed = 0;
+		ws->queue_array[workerid]->ntasks = 0;
+	}
+
+
+#ifdef STARPU_HAVE_HWLOC
+	/* Build a proximity list for every worker. It is cheaper to
+	 * build this once and then use it for popping tasks rather
+	 * than traversing the hwloc tree every time a task must be
+	 * stolen */
+	ws->proxlist = (int**)malloc(nworkers*sizeof(int*));
+	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
+	struct starpu_tree *tree = (struct starpu_tree*)workers->workerids;
+	for (i = 0; i < nworkers; i++)
+	{
+		workerid = workerids[i];
+		ws->proxlist[workerid] = (int*)malloc(nworkers*sizeof(int));
+		int bindid;
+		
+		struct starpu_tree *neighbour = NULL;
+		struct starpu_sched_ctx_iterator it;
+		if(workers->init_iterator)
+			workers->init_iterator(workers, &it);
+	
+		bindid   = starpu_worker_get_bindid(workerid);
+		it.value = starpu_tree_get(tree, bindid);
+		int cnt = 0;
+		for(;;)
+		{
+			neighbour = (struct starpu_tree*)it.value;
+			int workerids[STARPU_NMAXWORKERS];
+			int nworkers = _starpu_worker_get_workerids(neighbour->id, workerids);
+			int w;
+			for(w = 0; w < nworkers; w++)
+			{
+				if(!it.visited[workerids[w]] && workers->present[workerids[w]])
+				{
+					ws->proxlist[workerid][cnt++] = workerids[w];
+					it.visited[workerids[w]] = 1;
+				}
+			}
+			if(!workers->has_next(workers, &it))
+				break;
+			it.value = it.possible_value;
+			it.possible_value = NULL;
+		} 
+	}
+#endif	
+}
+
+static void lws_remove_workers(unsigned sched_ctx_id, int *workerids, unsigned nworkers)
+{
+	struct _starpu_lws_data *ws = (struct _starpu_lws_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
+
+	unsigned i;
+	int workerid;
+
+	for (i = 0; i < nworkers; i++)
+	{
+		workerid = workerids[i];
+		_starpu_destroy_fifo(ws->queue_array[workerid]);
+#ifdef STARPU_HAVE_HWLOC
+		free(ws->proxlist[workerid]);
+#endif
+	}
+}
+
+static void lws_initialize_policy(unsigned sched_ctx_id)
+{
+#ifdef STARPU_HAVE_HWLOC
+	starpu_sched_ctx_create_worker_collection(sched_ctx_id, STARPU_WORKER_TREE);
+#else
+	starpu_sched_ctx_create_worker_collection(sched_ctx_id, STARPU_WORKER_LIST);
+#endif
+
+	struct _starpu_lws_data *ws = (struct _starpu_lws_data*)malloc(sizeof(struct _starpu_lws_data));
+	starpu_sched_ctx_set_policy_data(sched_ctx_id, (void*)ws);
+
+	ws->last_pop_worker = 0;
+	ws->last_push_worker = 0;
+
+	/* unsigned nw = starpu_sched_ctx_get_nworkers(sched_ctx_id); */
+	unsigned nw = starpu_worker_get_count();
+	ws->queue_array = (struct _starpu_fifo_taskq**)malloc(nw*sizeof(struct _starpu_fifo_taskq*));
+
+}
+	
+static void lws_deinit_policy(unsigned sched_ctx_id)
+{
+	struct _starpu_lws_data *ws = (struct _starpu_lws_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
+
+	free(ws->queue_array);
+#ifdef STARPU_HAVE_HWLOC
+	free(ws->proxlist);
+#endif
+	free(ws);
+	starpu_sched_ctx_delete_worker_collection(sched_ctx_id);
+}
+
+struct starpu_sched_policy _starpu_sched_lws_policy =
+{
+	.init_sched = lws_initialize_policy,
+	.deinit_sched = lws_deinit_policy,
+	.add_workers = lws_add_workers,
+	.remove_workers = lws_remove_workers,
+	.push_task = lws_push_task,
+	.pop_task = lws_pop_task,
+	.pre_exec_hook = NULL,
+	.post_exec_hook = NULL,
+	.pop_every_task = NULL,
+	.policy_name = "nws",
+	.policy_description = "new work stealing"
+};

+ 2 - 2
src/starpu_parameters.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011  Université de Bordeaux 1
+ * Copyright (C) 2011, 2014  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -22,7 +22,7 @@
 
 /* How many executions a codelet will have to be measured before we
  * consider that calibration will provide a value good enough for scheduling */
-#define _STARPU_CALIBRATION_MINIMUM 10
+#define _STARPU_CALIBRATION_MINIMUM ((unsigned) starpu_get_env_number_default("STARPU_CALIBRATE_MINIMUM", 10))
 
 /* Assumed relative performance ratios */
 /* TODO: benchmark a bit instead */

+ 65 - 3
src/worker_collection/worker_list.c

@@ -42,6 +42,30 @@ static int list_get_next(struct starpu_worker_collection *workers, struct starpu
 	return ret;
 }
 
+static unsigned list_has_next_master(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it)
+{
+	int nworkers = workers->nmasters;
+	STARPU_ASSERT(it != NULL);
+
+	unsigned ret = it->cursor < nworkers ;
+
+	if(!ret) it->cursor = 0;
+
+	return ret;
+}
+
+static int list_get_next_master(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it)
+{
+	int *workerids = (int *)workers->masters;
+	int nworkers = (int)workers->nmasters;
+
+	STARPU_ASSERT_MSG(it->cursor < nworkers, "cursor %d nworkers %d\n", it->cursor, nworkers);
+
+	int ret = workerids[it->cursor++];
+
+	return ret;
+}
+
 static unsigned _worker_belongs_to_ctx(struct starpu_worker_collection *workers, int workerid)
 {
 	int *workerids = (int *)workers->workerids;
@@ -108,9 +132,12 @@ static int list_remove(struct starpu_worker_collection *workers, int worker)
 {
 	int *workerids = (int *)workers->workerids;
 	unsigned nworkers = workers->nworkers;
+
+	int *masters = (int *)workers->masters;
+	unsigned nmasters = workers->nmasters;
 	
-	int found_worker = -1;
 	unsigned i;
+	int found_worker = -1;
 	for(i = 0; i < nworkers; i++)
 	{
 		if(workerids[i] == worker)
@@ -125,13 +152,29 @@ static int list_remove(struct starpu_worker_collection *workers, int worker)
 	if(found_worker != -1)
 		workers->nworkers--;
 
+	int found_master = -1;
+	for(i = 0; i < nmasters; i++)
+	{
+		if(masters[i] == worker)
+		{
+			masters[i] = -1;
+			found_master = worker;
+			break;
+		}
+	}
+
+	_rearange_workerids(masters, nmasters);
+	if(found_master != -1)
+		workers->nmasters--;
+	printf("rem %d\n", found_worker);
 	return found_worker;
 }
 
 static void _init_workers(int *workerids)
 {
 	unsigned i;
-	for(i = 0; i < STARPU_NMAXWORKERS; i++)
+	int nworkers = starpu_worker_get_count();
+	for(i = 0; i < nworkers; i++)
 		workerids[i] = -1;
 	return;
 }
@@ -139,10 +182,14 @@ static void _init_workers(int *workerids)
 static void list_init(struct starpu_worker_collection *workers)
 {
 	int *workerids = (int*)malloc(STARPU_NMAXWORKERS * sizeof(int));
+	int *masters = (int*)malloc(STARPU_NMAXWORKERS * sizeof(int));
 	_init_workers(workerids);
+	_init_workers(masters);
 
 	workers->workerids = (void*)workerids;
 	workers->nworkers = 0;
+	workers->masters = (void*)masters;
+	workers->nmasters = 0;
 
 	return;
 }
@@ -150,17 +197,32 @@ static void list_init(struct starpu_worker_collection *workers)
 static void list_deinit(struct starpu_worker_collection *workers)
 {
 	free(workers->workerids);
+	free(workers->masters);
 }
 
-static void list_init_iterator(struct starpu_worker_collection *workers STARPU_ATTRIBUTE_UNUSED, struct starpu_sched_ctx_iterator *it)
+static void list_init_iterator(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it)
 {
 	it->cursor = 0;
+
+	int *workerids = (int *)workers->workerids;
+	unsigned nworkers = workers->nworkers;
+	unsigned i;
+	int nm = 0;
+	for(i = 0;  i < nworkers; i++)
+	{
+		if(!starpu_worker_is_slave(workerids[i]))
+			((int*)workers->masters)[nm++] = workerids[i];
+	}
+	workers->nmasters = nm;
+
 }
 
 struct starpu_worker_collection worker_list =
 {
 	.has_next = list_has_next,
 	.get_next = list_get_next,
+	.has_next_master = list_has_next_master,
+	.get_next_master = list_get_next_master,
 	.add = list_add,
 	.remove = list_remove,
 	.init = list_init,

+ 84 - 4
src/worker_collection/worker_tree.c

@@ -89,6 +89,75 @@ static int tree_get_next(struct starpu_worker_collection *workers, struct starpu
 	return ret;
 }
 
+static unsigned tree_has_next_master(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it)
+{
+	STARPU_ASSERT(it != NULL);
+	if(workers->nworkers == 0)
+		return 0;
+
+	struct starpu_tree *tree = (struct starpu_tree*)workers->workerids;
+	struct starpu_tree *neighbour = starpu_tree_get_neighbour(tree, (struct starpu_tree*)it->value, it->visited, workers->is_master);
+	
+	if(!neighbour)
+	{
+		starpu_tree_reset_visited(tree, it->visited);
+		it->value = NULL;
+		it->possible_value = NULL;
+		return 0;
+	}
+	int id = -1;
+	int workerids[STARPU_NMAXWORKERS];
+	int nworkers = _starpu_worker_get_workerids(neighbour->id, workerids);
+	int w;
+	for(w = 0; w < nworkers; w++)
+	{
+		if(!it->visited[workerids[w]] && workers->is_master[workerids[w]])
+		{
+			id = workerids[w];
+			it->possible_value = neighbour;
+		}
+	}
+
+	STARPU_ASSERT_MSG(id != -1, "bind id (%d) for workerid (%d) not correct", neighbour->id, id);
+
+	return 1;
+}
+
+static int tree_get_next_master(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it)
+{
+	int ret = -1;
+	
+	struct starpu_tree *tree = (struct starpu_tree *)workers->workerids;
+	struct starpu_tree *neighbour = NULL;
+	if(it->possible_value)
+	{
+		neighbour = it->possible_value;
+		it->possible_value = NULL;
+	}
+	else
+		neighbour = starpu_tree_get_neighbour(tree, (struct starpu_tree*)it->value, it->visited, workers->is_master);
+	
+	STARPU_ASSERT_MSG(neighbour, "no element anymore");
+	
+	
+	int workerids[STARPU_NMAXWORKERS];
+	int nworkers = _starpu_worker_get_workerids(neighbour->id, workerids);
+	int w;
+	for(w = 0; w < nworkers; w++)
+	{
+		if(!it->visited[workerids[w]] && workers->is_master[workerids[w]])
+		{
+			ret = workerids[w];
+			it->visited[workerids[w]] = 1;
+			it->value = neighbour;
+		}
+	}
+	STARPU_ASSERT_MSG(ret != -1, "bind id not correct");
+
+	return ret;
+}
+
+
 static int tree_add(struct starpu_worker_collection *workers, int worker)
 {
 	struct starpu_tree *tree = (struct starpu_tree *)workers->workerids;
@@ -111,6 +180,7 @@ static int tree_remove(struct starpu_worker_collection *workers, int worker)
 	if(workers->present[worker])
 	{
 		workers->present[worker] = 0;
+		workers->is_master[worker] = 0;
 		workers->nworkers--;
 		return worker;
 	}
@@ -122,10 +192,14 @@ static void tree_init(struct starpu_worker_collection *workers)
 {
 	workers->workerids = (void*)starpu_workers_get_tree();
 	workers->nworkers = 0;
-	
+
 	int i;
-	for(i = 0; i < STARPU_NMAXWORKERS; i++)
+	int nworkers = starpu_worker_get_count();
+	for(i = 0; i < nworkers; i++)
+	{
 		workers->present[i] = 0;
+		workers->is_master[i] = 0;
+	}
 	
 	return;
 }
@@ -135,19 +209,25 @@ static void tree_deinit(struct starpu_worker_collection *workers)
 //	free(workers->workerids);
 }
 
-static void tree_init_iterator(struct starpu_worker_collection *workers STARPU_ATTRIBUTE_UNUSED, struct starpu_sched_ctx_iterator *it)
+static void tree_init_iterator(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it)
 {
 	it->value = NULL;
 	it->possible_value = NULL;
 	int i;
-	for(i = 0; i < STARPU_NMAXWORKERS; i++)
+	int nworkers = starpu_worker_get_count();
+	for(i = 0; i < nworkers; i++)
+	{
+		workers->is_master[i] = (workers->present[i] && !starpu_worker_is_slave(i));
 		it->visited[i] = 0;
+	}
 }
 
 struct starpu_worker_collection worker_tree =
 {
 	.has_next = tree_has_next,
 	.get_next = tree_get_next,
+	.has_next_master = tree_has_next_master,
+	.get_next_master = tree_get_next_master,
 	.add = tree_add,
 	.remove = tree_remove,
 	.init = tree_init,

+ 1 - 0
tests/datawizard/commute.c

@@ -171,6 +171,7 @@ int main(int argc, char **argv)
 		test(STARPU_R, STARPU_RW, i);
 	}
 
+	starpu_data_unregister(x_handle);
 	starpu_shutdown();
 	STARPU_RETURN(0);
 

+ 2 - 2
tests/heat/dmda.sh

@@ -2,7 +2,7 @@
 
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # 
-# Copyright (C) 2009, 2010  Université de Bordeaux 1
+# Copyright (C) 2009, 2010, 2014  Université de Bordeaux 1
 # Copyright (C) 2010  Centre National de la Recherche Scientifique
 # 
 # StarPU is free software; you can redistribute it and/or modify
@@ -52,7 +52,7 @@ export STARPU_PERF_MODEL_DIR=$SAMPLINGDIR
 mkdir -p $TIMINGDIR
 mkdir -p $SAMPLINGDIR
 
-#schedlist="ws no-prio greedy prio dm random"
+#schedlist="ws lws no-prio greedy prio dm random"
 #schedlist="random random random random"
 
 export STARPU_NCUDA=3

+ 5 - 3
tests/heat/gflops_sched.gp

@@ -3,7 +3,7 @@
 
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # 
-# Copyright (C) 2008, 2009  Université de Bordeaux 1
+# Copyright (C) 2008, 2009, 2014  Université de Bordeaux 1
 # Copyright (C) 2010  Centre National de la Recherche Scientifique
 # 
 # StarPU is free software; you can redistribute it and/or modify
@@ -30,7 +30,8 @@ set key right bottom
 set datafile missing 'x'
 plot "timings/gflops.merged.data" usi 1:(2*$1*$1*$1 / (3*$2* 1000000)) with linespoint title "greedy"  ,\
      "timings/gflops.merged.data" usi 1:(2*$1*$1*$1 / (3*$4* 1000000)) with linespoint title "prio" 	    ,\
-     "timings/gflops.merged.data" usi 1:(2*$1*$1*$1 / (3*$6* 1000000)) with linespoint title "ws" 
+     "timings/gflops.merged.data" usi 1:(2*$1*$1*$1 / (3*$4* 1000000)) with linespoint title "ws" 	    ,\
+     "timings/gflops.merged.data" usi 1:(2*$1*$1*$1 / (3*$6* 1000000)) with linespoint title "lws" 
 
 set output "gflops_sched_gain.eps"
 set title "LU Decomposition : scheduling strategies : gain"
@@ -43,4 +44,5 @@ set logscale x
 set key right bottom
 set datafile missing 'x'
 plot "timings/gflops.merged.data" usi 1:(100*(($2 / $4)-1)) with linespoint title "gain prio"	,\
-	"timings/gflops.merged.data" usi 1:(100*(($2 / $6)-1)) with linespoint title "gain ws"    
+	"timings/gflops.merged.data" usi 1:(100*(($2 / $6)-1)) with linespoint title "gain ws"    ,\
+	"timings/gflops.merged.data" usi 1:(100*(($2 / $6)-1)) with linespoint title "gain lws"    

+ 10 - 1
tests/heat/gflops_sched.sh

@@ -2,7 +2,7 @@
 
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # 
-# Copyright (C) 2008, 2009, 2010  Université de Bordeaux 1
+# Copyright (C) 2008, 2009, 2010, 2014  Université de Bordeaux 1
 # Copyright (C) 2010  Centre National de la Recherche Scientifique
 # 
 # StarPU is free software; you can redistribute it and/or modify
@@ -137,6 +137,15 @@ do
 done
 
 
+filename=$TIMINGDIR/gflops.lws.data
+policy=lws
+trace_header 
+for size in $sizelist
+do
+	trace_size $size;
+done
+
+
 filename=$TIMINGDIR/gflops.noprio.data
 policy=no-prio
 trace_header 

+ 2 - 2
tests/heat/granularity.r

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2010  Université de Bordeaux 1
+# Copyright (C) 2010, 2014  Université de Bordeaux 1
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -19,7 +19,7 @@ max <- 28
 maxy <- 400
 
 sizelist <- seq(2048, max*1024, 64);
-#schedlist <- c("greedy", "prio", "dm", "random", "no-prio", "ws");
+#schedlist <- c("greedy", "prio", "dm", "random", "no-prio", "ws", "lws");
 #schedlist <- c("greedy", "prio", "dm", "random");
 # grainlist <- c(64, 128, 256, 512, 768, 1024, 1280, 1536, 2048);
 grainlist <- c(256, 512, 1024, 2048);

+ 2 - 2
tests/heat/granularity_model.r

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2010  Université de Bordeaux 1
+# Copyright (C) 2010, 2014  Université de Bordeaux 1
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -17,7 +17,7 @@
 max <- 30
 
 sizelist <- seq(64, max*1024, 64);
-#schedlist <- c("greedy", "prio", "dm", "random", "no-prio", "ws");
+#schedlist <- c("greedy", "prio", "dm", "random", "no-prio", "ws", "lws");
 #schedlist <- c("greedy", "prio", "dm", "random");
 #grainlist <- c(256, 512, 1024)
 grainlist <- c(512, 1024)

+ 2 - 2
tests/heat/model.r

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2010  Université de Bordeaux 1
+# Copyright (C) 2010, 2014  Université de Bordeaux 1
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -15,7 +15,7 @@
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
 sizelist <- seq(2048, 24576, 2048);
-#schedlist <- c("greedy", "prio", "dm", "random", "no-prio", "ws");
+#schedlist <- c("greedy", "prio", "dm", "random", "no-prio", "ws", "lws");
 schedlist <- c("prio", "dm", "random");
 
 print(schedlist);

+ 4 - 3
tests/heat/random.r

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2010  Université de Bordeaux 1
+# Copyright (C) 2010, 2014  Université de Bordeaux 1
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -15,7 +15,7 @@
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
 sizelist <- seq(2048, 24576, 2048);
-#schedlist <- c("greedy", "prio", "dm", "random", "no-prio", "ws");
+#schedlist <- c("greedy", "prio", "dm", "random", "no-prio", "ws", "lws");
 schedlist <- c("prio","random");
 
 print(schedlist);
@@ -97,13 +97,14 @@ display_sched <- function()
 	trace_sched("prio", "red", 4);
 	#trace_sched("no-prio", "black");
 	#trace_sched("ws", "purple");
+	#trace_sched("lws", "purple");
 
 	axis(1, at=sizelist)
 	axis(2, at=seq(0, 100, 10), tck=1)
 #	axis(4, at=seq(0, 100, 10))
 	box(bty="u")
 
-        #labels <- c("greedy", "priority", "model", "random", "black", "ws")
+        #labels <- c("greedy", "priority", "model", "random", "black", "ws", "lws")
 #        labels <- c("greedy", "priority", "model", "random")
 	#labels <- c("model", "weighted random", "greedy", "priority")
 	labels <- c("weighted random", "priority")

+ 2 - 2
tests/heat/sched.r

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2010  Université de Bordeaux 1
+# Copyright (C) 2010, 2014  Université de Bordeaux 1
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -15,7 +15,7 @@
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
 sizelist <- seq(2048, 24576, 2048);
-#schedlist <- c("greedy", "prio", "dm", "random", "no-prio", "ws");
+#schedlist <- c("greedy", "prio", "dm", "random", "no-prio", "ws", "lws");
 schedlist <- c("greedy", "prio", "dm", "random");
 
 print(schedlist);

+ 2 - 2
tests/heat/sched.sh

@@ -2,7 +2,7 @@
 
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # 
-# Copyright (C) 2008, 2009, 2010  Université de Bordeaux 1
+# Copyright (C) 2008, 2009, 2010, 2014  Université de Bordeaux 1
 # Copyright (C) 2010  Centre National de la Recherche Scientifique
 # 
 # StarPU is free software; you can redistribute it and/or modify
@@ -94,7 +94,7 @@ export STARPU_PERF_MODEL_DIR=$SAMPLINGDIR
 mkdir -p $TIMINGDIR
 mkdir -p $SAMPLINGDIR
 
-#schedlist="ws no-prio greedy prio dm random"
+#schedlist="ws lws no-prio greedy prio dm random"
 #schedlist="random random random random"
 
 export STARPU_NCUDA=3

+ 5 - 2
tests/main/driver_api/init_run_deinit.c

@@ -49,8 +49,11 @@ run(struct starpu_task *task, struct starpu_driver *d)
 	int ret;
 	ret = starpu_task_submit(task);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
-	ret = starpu_driver_run_once(d);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_driver_run_once");
+	while (!starpu_task_finished(task))
+	{
+		ret = starpu_driver_run_once(d);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_driver_run_once");
+	}
 	ret = starpu_task_wait(task);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait");
 }

+ 3 - 1
tests/main/subgraph_repeat.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2012-2014  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -164,6 +164,7 @@ int main(int argc, char **argv)
 	STARPU_ASSERT(*check_cnt == (4*loop_cnt));
 
 	starpu_free(check_cnt);
+	starpu_data_unregister(check_data);
 
 	starpu_shutdown();
 
@@ -179,6 +180,7 @@ enodev:
 	fprintf(stderr, "WARNING: No one can execute this task\n");
 	/* yes, we do not perform the computation but we did detect that no one
  	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_data_unregister(check_data);
 	starpu_shutdown();
 	return STARPU_TEST_SKIPPED;
 }

+ 3 - 1
tests/main/subgraph_repeat_regenerate.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2014  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -168,6 +168,7 @@ int main(int argc, char **argv)
 	STARPU_ASSERT(*check_cnt == (4*loop_cnt));
 
 	starpu_free(check_cnt);
+	starpu_data_unregister(check_data);
 
 	starpu_shutdown();
 
@@ -183,6 +184,7 @@ enodev:
 	fprintf(stderr, "WARNING: No one can execute this task\n");
 	/* yes, we do not perform the computation but we did detect that no one
  	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_data_unregister(check_data);
 	starpu_shutdown();
 	return STARPU_TEST_SKIPPED;
 }

+ 2 - 1
tests/main/subgraph_repeat_regenerate_tag.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2014  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -198,6 +198,7 @@ enodev:
 	fprintf(stderr, "WARNING: No one can execute this task\n");
 	/* yes, we do not perform the computation but we did detect that no one
  	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_data_unregister(check_data);
 	starpu_shutdown();
 	return STARPU_TEST_SKIPPED;
 }

+ 2 - 1
tests/main/subgraph_repeat_tag.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2014  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -182,6 +182,7 @@ enodev:
 	fprintf(stderr, "WARNING: No one can execute this task\n");
 	/* yes, we do not perform the computation but we did detect that no one
  	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_data_unregister(check_data);
 	starpu_shutdown();
 	return STARPU_TEST_SKIPPED;
 }

+ 5 - 2
tests/perfmodels/feed.c

@@ -50,8 +50,11 @@ int main(int argc, char **argv)
 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-	 if(starpu_worker_get_count_by_type(STARPU_CUDA_WORKER) < 2)
+	 if (starpu_worker_get_count_by_type(STARPU_CUDA_WORKER) < 2)
+	 {
+		 starpu_shutdown();
 		 return STARPU_TEST_SKIPPED;
+	 }
 
 	starpu_task_init(&task);
 	task.cl = &cl;
@@ -76,7 +79,7 @@ int main(int argc, char **argv)
 		arch.devid = 0;
 		starpu_perfmodel_update_history(&model, &task, &arch, 0, 0, measured_fast);
 		starpu_perfmodel_update_history(&nl_model, &task, &arch, 0, 0, measured_fast);
-		
+
 		/* Simulate Slow GPU */
 		arch.devid = 1;
 		starpu_perfmodel_update_history(&model, &task, &arch, 0, 0, measured_slow);

+ 3 - 1
tests/regression/profiles.in

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2010  Université de Bordeaux 1
+# Copyright (C) 2010, 2014  Université de Bordeaux 1
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -32,6 +32,8 @@ STARPU_NCUDA=1
 # Execution configuration
 STARPU_SCHED=ws
 # Execution configuration
+STARPU_SCHED=lws
+# Execution configuration
 STARPU_SCHED=prio
 # Execution configuration
 STARPU_SCHED=no-prio

+ 5 - 1
tests/regression/regression_test.sh

@@ -2,7 +2,7 @@
 
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # 
-# Copyright (C) 2008, 2009, 2010  Université de Bordeaux 1
+# Copyright (C) 2008, 2009, 2010, 2014  Université de Bordeaux 1
 # Copyright (C) 2010  Centre National de la Recherche Scientifique
 # 
 # StarPU is free software; you can redistribute it and/or modify
@@ -65,6 +65,10 @@ echo "heat.ws.8k.v2"
 timing=`STARPU_SCHED="ws" $ROOTDIR/examples/heat/heat -ntheta 66 -nthick 130 -nblocks 8 -pin -v2 2> log`
 save_cov "heat.ws.8k.v2";
 
+echo "heat.lws.8k.v2"
+timing=`STARPU_SCHED="lws" $ROOTDIR/examples/heat/heat -ntheta 66 -nthick 130 -nblocks 8 -pin -v2 2> log`
+save_cov "heat.lws.8k.v2";
+
 echo "heat.greedy.8k.v2"
 timing=`STARPU_SCHED="greedy" $ROOTDIR/examples/heat/heat -ntheta 66 -nthick 130 -nblocks 8 -pin -v2 2> log`
 save_cov "heat.greedy.8k.v2";