浏览代码

- merge trunk

Olivier Aumage 11 年之前
父节点
当前提交
f281168e46
共有 73 个文件被更改,包括 1526 次插入292 次删除
  1. 1 0
      AUTHORS
  2. 71 47
      ChangeLog
  3. 11 1
      configure.ac
  4. 5 0
      doc/doxygen/chapters/08scheduling.doxy
  5. 2 2
      doc/doxygen/chapters/12online_performance_tools.doxy
  6. 3 0
      doc/doxygen/chapters/13offline_performance_tools.doxy
  7. 46 42
      doc/doxygen/chapters/16mpi_support.doxy
  8. 8 0
      doc/doxygen/chapters/40environment_variables.doxy
  9. 8 0
      doc/doxygen/chapters/41configure_options.doxy
  10. 5 0
      examples/Makefile.am
  11. 1 0
      examples/binary/binary.c
  12. 3 1
      examples/cpp/incrementer_cpp.cpp
  13. 212 0
      examples/sched_ctx/nested_sched_ctxs.c
  14. 6 9
      examples/sched_ctx/sched_ctx_without_sched_policy.c
  15. 1 0
      examples/worker_collections/worker_list_example.c
  16. 1 0
      include/starpu_config.h.in
  17. 8 0
      include/starpu_sched_ctx.h
  18. 2 0
      include/starpu_task.h
  19. 6 0
      include/starpu_thread.h
  20. 7 0
      include/starpu_worker.h
  21. 3 1
      mpi/include/starpu_mpi.h
  22. 61 45
      mpi/src/starpu_mpi.c
  23. 1 0
      src/Makefile.am
  24. 42 4
      src/common/fxt.h
  25. 27 1
      src/common/thread.c
  26. 9 0
      src/core/jobs.c
  27. 3 0
      src/core/jobs.h
  28. 66 2
      src/core/sched_ctx.c
  29. 3 0
      src/core/sched_ctx.h
  30. 26 2
      src/core/sched_policy.c
  31. 1 0
      src/core/sched_policy.h
  32. 21 3
      src/core/simgrid.c
  33. 7 0
      src/core/task.c
  34. 54 38
      src/core/workers.c
  35. 3 0
      src/core/workers.h
  36. 15 8
      src/datawizard/coherency.c
  37. 11 11
      src/datawizard/coherency.h
  38. 2 1
      src/datawizard/filters.c
  39. 2 1
      src/datawizard/interfaces/data_interface.c
  40. 5 1
      src/datawizard/reduction.c
  41. 2 4
      src/datawizard/user_interactions.c
  42. 109 7
      src/debug/traces/starpu_fxt.c
  43. 13 0
      src/debug/traces/starpu_paje.c
  44. 1 1
      src/drivers/cpu/driver_cpu.c
  45. 7 3
      src/drivers/cuda/driver_cuda.c
  46. 4 2
      src/drivers/driver_common/driver_common.c
  47. 2 2
      src/drivers/driver_common/driver_common.h
  48. 1 1
      src/drivers/mp_common/source_common.c
  49. 4 2
      src/drivers/opencl/driver_opencl.c
  50. 17 10
      src/sched_policies/deque_modeling_policy_data_aware.c
  51. 13 2
      src/sched_policies/eager_central_policy.c
  52. 373 0
      src/sched_policies/locality_work_stealing_policy.c
  53. 2 2
      src/starpu_parameters.h
  54. 65 3
      src/worker_collection/worker_list.c
  55. 84 4
      src/worker_collection/worker_tree.c
  56. 1 0
      tests/datawizard/commute.c
  57. 2 2
      tests/heat/dmda.sh
  58. 5 3
      tests/heat/gflops_sched.gp
  59. 10 1
      tests/heat/gflops_sched.sh
  60. 2 2
      tests/heat/granularity.r
  61. 2 2
      tests/heat/granularity_model.r
  62. 2 2
      tests/heat/model.r
  63. 4 3
      tests/heat/random.r
  64. 2 2
      tests/heat/sched.r
  65. 2 2
      tests/heat/sched.sh
  66. 5 2
      tests/main/driver_api/init_run_deinit.c
  67. 3 1
      tests/main/subgraph_repeat.c
  68. 3 1
      tests/main/subgraph_repeat_regenerate.c
  69. 2 1
      tests/main/subgraph_repeat_regenerate_tag.c
  70. 2 1
      tests/main/subgraph_repeat_tag.c
  71. 5 2
      tests/perfmodels/feed.c
  72. 3 1
      tests/regression/profiles.in
  73. 5 1
      tests/regression/regression_test.sh

+ 1 - 0
AUTHORS

@@ -1,6 +1,7 @@
 Simon Archipoff <simon.archipoff@etu.u-bordeaux1.fr>
 Simon Archipoff <simon.archipoff@etu.u-bordeaux1.fr>
 Cédric Augonnet <cedric.augonnet@inria.fr>
 Cédric Augonnet <cedric.augonnet@inria.fr>
 William Braik <wbraik@gmail.com>
 William Braik <wbraik@gmail.com>
+Alfredo Buttari <alfredo.buttari@enseeiht.fr>
 Jérôme Clet-Ortega <jerome.clet-ortega@labri.fr>
 Jérôme Clet-Ortega <jerome.clet-ortega@labri.fr>
 Nicolas Collin <nicolas.collin@inria.fr>
 Nicolas Collin <nicolas.collin@inria.fr>
 Ludovic Courtès <ludovic.courtes@inria.fr>
 Ludovic Courtès <ludovic.courtes@inria.fr>

+ 71 - 47
ChangeLog

@@ -17,24 +17,6 @@
 StarPU 1.2.0 (svn revision xxxx)
 StarPU 1.2.0 (svn revision xxxx)
 ==============================================
 ==============================================
 
 
-Small features:
-  * New function starpu_sched_ctx_display_workers() to display worker
-    information belonging to a given scheduler context
-  * The option --enable-verbose can be called with
-    --enable-verbose=extra to increase the verbosity
-
-StarPU 1.1.2 (svn revision xxxx)
-==============================================
-The scheduling context release
-
-New features:
-  * The reduction init codelet is automatically used to initialize temporary
-    buffers.
-
-StarPU 1.1.1 (svn revision 12638)
-==============================================
-The scheduling context release
-
 New features:
 New features:
   * Xeon Phi support
   * Xeon Phi support
   * SCC support
   * SCC support
@@ -48,46 +30,88 @@ New features:
 	  before the corresponding data, which allows the receiver to
 	  before the corresponding data, which allows the receiver to
 	  allocate data correctly, and to submit the matching receive of
 	  allocate data correctly, and to submit the matching receive of
 	  the envelope.
 	  the envelope.
+        - New function
+   	  starpu_mpi_irecv_detached_sequential_consistency which
+	  allows to enable or disable the sequential consistency for
+	  the given data handle (sequential consistency will be
+	  enabled or disabled based on the value of the function
+	  parameter and the value of the sequential consistency
+	  defined for the given data)
+        - New functions starpu_mpi_task_build() and
+  	  starpu_mpi_task_post_build()
   * New STARPU_COMMUTE flag which can be passed along STARPU_W or STARPU_RW to
   * New STARPU_COMMUTE flag which can be passed along STARPU_W or STARPU_RW to
     let starpu commute write accesses.
     let starpu commute write accesses.
   * Out-of-core support, through registration of disk areas as additional memory
   * Out-of-core support, through registration of disk areas as additional memory
     nodes.
     nodes.
-  * StarPU-MPI: new function
-    starpu_mpi_irecv_detached_sequential_consistency which allows to
-    enable or disable the sequential consistency for the given data
-    handle (sequential consistency will be enabled or disabled based
-    on the value of the function parameter and the value of the
-    sequential consistency defined for the given data)
-  * New functions starpu_mpi_task_build() and starpu_mpi_task_post_build()
-  * New functions starpu_pause() and starpu_resume()
-  * New codelet specific_nodes field to specify explicit target nodes for data.
-  * Use streams for all CUDA transfers, even initiated by CPUs.
   * Add STARPU_CUDA_ASYNC and STARPU_OPENCL_ASYNC flags to allow asynchronous
   * Add STARPU_CUDA_ASYNC and STARPU_OPENCL_ASYNC flags to allow asynchronous
     CUDA and OpenCL kernel execution.
     CUDA and OpenCL kernel execution.
-  * Add paje traces statistics tools.
   * Add CUDA concurrent kernel execution support through
   * Add CUDA concurrent kernel execution support through
     the STARPU_NWORKER_PER_CUDA environment variable.
     the STARPU_NWORKER_PER_CUDA environment variable.
-  * Use streams for GPUA->GPUB and GPUB->GPUA transfers.
 
 
 Small features:
 Small features:
+  * Tasks can now have a name (via the field const char *name of
+    struct starpu_task)
   * New functions starpu_data_acquire_cb_sequential_consistency() and
   * New functions starpu_data_acquire_cb_sequential_consistency() and
     starpu_data_acquire_on_node_cb_sequential_consistency() which allows
     starpu_data_acquire_on_node_cb_sequential_consistency() which allows
     to enable or disable sequential consistency
     to enable or disable sequential consistency
   * New configure option --enable-fxt-lock which enables additional
   * New configure option --enable-fxt-lock which enables additional
     trace events focused on locks behaviour during the execution
     trace events focused on locks behaviour during the execution
-  * New function starpu_perfmodel_directory() to print directory
-    storing performance models. Available through the new option -d of
-    the tool starpu_perfmodel_display
-  * New batch files to execute StarPU applications under Microsoft
-    Visual Studio (They are installed in path_to_starpu/bin/msvc)/
   * Functions starpu_insert_task and starpu_mpi_insert_task are
   * Functions starpu_insert_task and starpu_mpi_insert_task are
     renamed in starpu_task_insert and starpu_mpi_task_insert. Old
     renamed in starpu_task_insert and starpu_mpi_task_insert. Old
     names are kept to avoid breaking old codes.
     names are kept to avoid breaking old codes.
   * New configure option --enable-calibration-heuristic which allows
   * New configure option --enable-calibration-heuristic which allows
     the user to set the maximum authorized deviation of the
     the user to set the maximum authorized deviation of the
     history-based calibrator.
     history-based calibrator.
-  * Tasks can now have a name (via the field const char *name of
-    struct starpu_task)
+  * Allow application to provide the task footprint itself.
+  * New function starpu_sched_ctx_display_workers() to display worker
+    information belonging to a given scheduler context
+  * The option --enable-verbose can be called with
+    --enable-verbose=extra to increase the verbosity
+  * Add codelet size, footprint and tag id in the paje trace.
+
+Changes:
+  * Data interfaces (variable, vector, matrix and block) now define
+    pack und unpack functions
+  * StarPU-MPI: Fix for being able to receive data which have not yet
+    been registered by the application (i.e it did not call
+    starpu_data_set_tag(), data are received as a raw memory)
+  * StarPU-MPI: Fix for being able to receive data with the same tag
+    from several nodes (see mpi/tests/gather.c)
+
+Small changes:
+  * Rename function starpu_trace_user_event() as
+    starpu_fxt_trace_user_event()
+
+StarPU 1.1.2 (svn revision xxx)
+==============================================
+The scheduling context release
+
+New features:
+  * The reduction init codelet is automatically used to initialize temporary
+    buffers.
+  * Traces now include a "scheduling" state, to show the overhead of the
+    scheduler.
+  * Add STARPU_CALIBRATE_MINIMUM environment variable to specify the minimum
+    number of calibration measurements.
+
+StarPU 1.1.1 (svn revision 12638)
+==============================================
+The scheduling context release
+
+New features:
+  * MPI:
+        - New variable STARPU_MPI_CACHE_STATS to print statistics on
+   	  cache holding received data.
+        - New function starpu_mpi_data_register() which sets the rank
+  	  and tag of a data, and also allows to automatically clear
+	  the MPI communication cache when unregistering the data. It
+	  should be called instead of both calling
+	  starpu_data_set_tag() and starpu_data_set_rank()
+  * Use streams for all CUDA transfers, even initiated by CPUs.
+  * Add paje traces statistics tools.
+  * Use streams for GPUA->GPUB and GPUB->GPUA transfers.
+
+Small features:
   * New STARPU_EXECUTE_ON_WORKER flag to specify the worker on which
   * New STARPU_EXECUTE_ON_WORKER flag to specify the worker on which
     to execute the task.
     to execute the task.
   * New STARPU_DISABLE_PINNING environment variable to disable host memory
   * New STARPU_DISABLE_PINNING environment variable to disable host memory
@@ -97,23 +121,23 @@ Small features:
   * New starpu_memory_get_total function to get the size of a memory node.
   * New starpu_memory_get_total function to get the size of a memory node.
   * New starpu_parallel_task_barrier_init_n function to let a scheduler decide
   * New starpu_parallel_task_barrier_init_n function to let a scheduler decide
     a set of workers without going through combined workers.
     a set of workers without going through combined workers.
-  * Allow application to provide the task footprint itself.
 
 
 Changes:
 Changes:
-  * Data interfaces (variable, vector, matrix and block) now define
-    pack und unpack functions
-  * StarPU-MPI: Fix for being able to receive data which have not yet
-    been registered by the application (i.e it did not call
-    starpu_data_set_tag(), data are received as a raw memory)
-  * StarPU-MPI: Fix for being able to receive data with the same tag
-    from several nodes (see mpi/tests/gather.c)
+  * Fix simgrid execution.
+  * Rename starpu_get_nready_tasks_of_sched_ctx to starpu_sched_ctx_get_nready_tasks
+  * Rename starpu_get_nready_flops_of_sched_ctx to starpu_sched_ctx_get_nready_flops
+  * New functions starpu_pause() and starpu_resume()
+  * New codelet specific_nodes field to specify explicit target nodes for data.
   * StarPU-MPI: Fix overzealous allocation of memory.
   * StarPU-MPI: Fix overzealous allocation of memory.
   * Interfaces: Allow interface implementation to change pointers at will, in
   * Interfaces: Allow interface implementation to change pointers at will, in
     unpack notably.
     unpack notably.
 
 
 Small changes:
 Small changes:
-  * Rename function starpu_trace_user_event() as
-    starpu_fxt_trace_user_event()
+  * Use big fat abortions when one tries to make a task or callback
+    sleep, instead of just returning EDEADLCK which few people will test
+  * By default, StarPU FFT examples are not compiled and checked, the
+    configure option --enable-starpufft-examples needs to be specified
+    to change this behaviour.
 
 
 StarPU 1.1.0 (svn revision 11960)
 StarPU 1.1.0 (svn revision 11960)
 ==============================================
 ==============================================

+ 11 - 1
configure.ac

@@ -975,16 +975,19 @@ AC_ARG_ENABLE(simgrid, [AS_HELP_STRING([--enable-simgrid],
 if test x$enable_simgrid = xyes ; then
 if test x$enable_simgrid = xyes ; then
    	if test -n "$SIMGRID_CFLAGS" ; then
    	if test -n "$SIMGRID_CFLAGS" ; then
 	   	CFLAGS="$SIMGRID_CFLAGS $CFLAGS"
 	   	CFLAGS="$SIMGRID_CFLAGS $CFLAGS"
+	   	CXXFLAGS="$SIMGRID_CFLAGS $CXXFLAGS"
 	fi
 	fi
 	if test -n "$SIMGRID_LIBS" ; then
 	if test -n "$SIMGRID_LIBS" ; then
 		LDFLAGS="$SIMGRID_LIBS $LDFLAGS"
 		LDFLAGS="$SIMGRID_LIBS $LDFLAGS"
 	fi
 	fi
 	if test "$simgrid_dir" != "no" ; then
 	if test "$simgrid_dir" != "no" ; then
 	   	CFLAGS="-I$simgrid_dir/include $CFLAGS"
 	   	CFLAGS="-I$simgrid_dir/include $CFLAGS"
+	   	CXXFLAGS="-I$simgrid_dir/include $CXXFLAGS"
 	   	LDFLAGS="-L$simgrid_dir/lib $LDFLAGS"
 	   	LDFLAGS="-L$simgrid_dir/lib $LDFLAGS"
 	fi
 	fi
 	if test "$simgrid_include_dir" != "no" ; then
 	if test "$simgrid_include_dir" != "no" ; then
 	   	CFLAGS="-I$simgrid_include_dir $CFLAGS"
 	   	CFLAGS="-I$simgrid_include_dir $CFLAGS"
+	   	CXXFLAGS="-I$simgrid_include_dir $CXXFLAGS"
 	fi
 	fi
 	if test "$simgrid_lib_dir" != "no" ; then
 	if test "$simgrid_lib_dir" != "no" ; then
 	   	LDFLAGS="-L$simgrid_lib_dir $LDFLAGS"
 	   	LDFLAGS="-L$simgrid_lib_dir $LDFLAGS"
@@ -994,7 +997,8 @@ if test x$enable_simgrid = xyes ; then
 			AC_MSG_ERROR(Simgrid support needs simgrid installed)
 			AC_MSG_ERROR(Simgrid support needs simgrid installed)
 		]
 		]
 	)
 	)
-   	AC_CHECK_FUNCS([MSG_process_join])
+   	AC_CHECK_FUNCS([MSG_process_join MSG_get_as_by_name MSG_environment_get_routing_root])
+	AC_CHECK_FUNCS([xbt_barrier_init], [AC_DEFINE([STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT], [1], [Define to 1 if you have the `xbt_barrier_init' function.])])
 	AC_COMPILE_IFELSE([AC_LANG_PROGRAM(
 	AC_COMPILE_IFELSE([AC_LANG_PROGRAM(
 		    		[[#include <msg/msg.h>]],
 		    		[[#include <msg/msg.h>]],
 				[[msg_host_t foo; ]]
 				[[msg_host_t foo; ]]
@@ -1478,6 +1482,12 @@ if test x$use_fxt = xyes; then
 	AC_CHECK_DECLS([fut_set_filename])
 	AC_CHECK_DECLS([fut_set_filename])
 	CFLAGS="$save_CFLAGS"
 	CFLAGS="$save_CFLAGS"
 
 
+        AC_ARG_ENABLE(paje-codelet-details, [AS_HELP_STRING([--enable-paje-codelet-details],
+			[enable details about codelets in the paje trace])],
+			enable_paje_codelet_details=$enableval, enable_paje_codelet_details=no)
+        if  test x$enable_paje_codelet_details = xyes; then
+        	AC_DEFINE(STARPU_ENABLE_PAJE_CODELET_DETAILS, [1], [enable details about codelets in the paje trace])
+        fi
 	##########################################
 	##########################################
 	# Poti is a library to generate paje trace files
 	# Poti is a library to generate paje trace files
 	##########################################
 	##########################################

+ 5 - 0
doc/doxygen/chapters/08scheduling.doxy

@@ -45,6 +45,11 @@ a task on the worker which released it by
 default. When a worker becomes idle, it steals a task from the most loaded
 default. When a worker becomes idle, it steals a task from the most loaded
 worker.
 worker.
 
 
+The <b>lws</b> (locality work stealing) scheduler uses a queue per worker, and schedules
+a task on the worker which released it by
+default. When a worker becomes idle, it steals a task from neighbour workers. It
+also takes into account priorities.
+
 The <b>dm</b> (deque model) scheduler uses task execution performance models into account to
 The <b>dm</b> (deque model) scheduler uses task execution performance models into account to
 perform a HEFT-similar scheduling strategy: it schedules tasks where their
 perform a HEFT-similar scheduling strategy: it schedules tasks where their
 termination time will be minimal. The difference with HEFT is that <b>dm</b>
 termination time will be minimal. The difference with HEFT is that <b>dm</b>

+ 2 - 2
doc/doxygen/chapters/12online_performance_tools.doxy

@@ -389,11 +389,11 @@ parameters through starpu_hash_crc32c_be for instance.
 StarPU will automatically determine when the performance model is calibrated,
 StarPU will automatically determine when the performance model is calibrated,
 or rather, it will assume the performance model is calibrated until the
 or rather, it will assume the performance model is calibrated until the
 application submits a task for which the performance can not be predicted. For
 application submits a task for which the performance can not be predicted. For
-::STARPU_HISTORY_BASED, StarPU will require 10 (_STARPU_CALIBRATION_MINIMUM)
+::STARPU_HISTORY_BASED, StarPU will require 10 (STARPU_CALIBRATE_MINIMUM)
 measurements for a given size before estimating that an average can be taken as
 measurements for a given size before estimating that an average can be taken as
 estimation for further executions with the same size. For
 estimation for further executions with the same size. For
 ::STARPU_REGRESSION_BASED and ::STARPU_NL_REGRESSION_BASED, StarPU will require
 ::STARPU_REGRESSION_BASED and ::STARPU_NL_REGRESSION_BASED, StarPU will require
-10 (_STARPU_CALIBRATION_MINIMUM) measurements, and that the minimum measured
+10 (STARPU_CALIBRATE_MINIMUM) measurements, and that the minimum measured
 data size is smaller than 90% of the maximum measured data size (i.e. the
 data size is smaller than 90% of the maximum measured data size (i.e. the
 measurement interval is large enough for a regression to have a meaning).
 measurement interval is large enough for a regression to have a meaning).
 Calibration can also be forced by setting the \ref STARPU_CALIBRATE environment
 Calibration can also be forced by setting the \ref STARPU_CALIBRATE environment

+ 3 - 0
doc/doxygen/chapters/13offline_performance_tools.doxy

@@ -118,6 +118,9 @@ $ vite paje.trace
 
 
 To get names of tasks instead of "unknown", fill the optional
 To get names of tasks instead of "unknown", fill the optional
 starpu_codelet::name, or use a performance model for them.
 starpu_codelet::name, or use a performance model for them.
+Details of the codelet execution can be obtained by passing
+<c>--enable-paje-codelet-details</c> and using a recent enough version of ViTE
+(at least r1430).
 
 
 In the MPI execution case, collect the trace files from the MPI nodes, and
 In the MPI execution case, collect the trace files from the MPI nodes, and
 specify them all on the command <c>starpu_fxt_tool</c>, for instance:
 specify them all on the command <c>starpu_fxt_tool</c>, for instance:

+ 46 - 42
doc/doxygen/chapters/16mpi_support.doxy

@@ -121,49 +121,53 @@ automatically released. This mechanism is similar to the pthread
 detach state attribute which determines whether a thread will be
 detach state attribute which determines whether a thread will be
 created in a joinable or a detached state.
 created in a joinable or a detached state.
 
 
-For any communication, the call of the function will result in the
-creation of a StarPU-MPI request, the function
-starpu_data_acquire_cb() is then called to asynchronously request
-StarPU to fetch the data in main memory; when the data is available in
-main memory, a StarPU-MPI function is called to put the new request in
-the list of the ready requests if it is a send request, or in an
-hashmap if it is a receive request.
-
-Internally, all MPI communications submitted by StarPU uses a unique
-tag which has a default value, and can be accessed with the functions
+Internally, all communication are divided in 2 communications, a first
+message is used to exchange an envelope describing the data (i.e its
+tag and its size), the data itself is sent in a second message. All
+MPI communications submitted by StarPU uses a unique tag which has a
+default value, and can be accessed with the functions
 starpu_mpi_get_communication_tag() and
 starpu_mpi_get_communication_tag() and
-starpu_mpi_set_communication_tag().
-
-The matching of tags with corresponding requests is done into StarPU-MPI.
-To handle this, any communication is a double-communication based on a
-envelope + data system. Every data which will be sent needs to send an
-envelope which describes the data (particularly its tag) before sending
-the data, so the receiver can get the matching pending receive request
-from the hashmap, and submit it to recieve the data correctly.
-
-To this aim, the StarPU-MPI progression thread has a permanent-submitted
-request destined to receive incoming envelopes from all sources.
-
-The StarPU-MPI progression thread regularly polls this list of ready
-requests. For each new ready request, the appropriate function is
-called to post the corresponding MPI call. For example, calling
-starpu_mpi_isend() will result in posting <c>MPI_Isend</c>. If
-the request is marked as detached, the request will be put in the list
-of detached requests.
-
-The StarPU-MPI progression thread also polls the list of detached
-requests. For each detached request, it regularly tests the completion
-of the MPI request by calling <c>MPI_Test</c>. On completion, the data
-handle is released, and if a callback was defined, it is called.
-
-Finally, the StarPU-MPI progression thread checks if an envelope has
-arrived. If it is, it'll check if the corresponding receive has already
-been submitted by the application. If it is, it'll submit the request
-just as like as it does with those on the list of ready requests.
-If it is not, it'll allocate a temporary handle to store the data that
-will arrive just after, so as when the corresponding receive request
-will be submitted by the application, it'll copy this temporary handle
-into its one instead of submitting a new StarPU-MPI request.
+starpu_mpi_set_communication_tag(). The matching of tags with
+corresponding requests is done within StarPU-MPI.
+
+For any userland communication, the call of the corresponding function
+(e.g starpu_mpi_isend()) will result in the creation of a StarPU-MPI
+request, the function starpu_data_acquire_cb() is then called to
+asynchronously request StarPU to fetch the data in main memory; when
+the data is ready and the corresponding buffer has already been
+received by MPI, it will be copied in the memory of the data,
+otherwise the request is stored in the <em>early requests list</em>. Sending
+requests are stored in the <em>ready requests list</em>.
+
+While requests need to be processed, the StarPU-MPI progression thread
+does the following:
+
+<ol>
+<li> it polls the <em>ready requests list</em>. For all the ready
+requests, the appropriate function is called to post the corresponding
+MPI call. For example, an initial call to starpu_mpi_isend() will
+result in a call to <c>MPI_Isend</c>. If the request is marked as
+detached, the request will then be added in the <em>detached requests
+list</em>.
+</li>
+<li> it posts a <c>MPI_Irecv()</c> to retrieve a data envelope.
+</li>
+<li> it polls the <em>detached requests list</em>. For all the detached
+requests, it tests its completion of the MPI request by calling
+<c>MPI_Test</c>. On completion, the data handle is released, and if a
+callback was defined, it is called.
+</li>
+<li> finally, it checks if a data envelope has been received. If so,
+if the data envelope matches a request in the <em>early requests list</em> (i.e
+the request has already been posted by the application), the
+corresponding MPI call is posted (similarly to the first step above).
+
+If the data envelope does not match any application request, a
+temporary handle is created to receive the data, a StarPU-MPI request
+is created and added into the <em>ready requests list</em>, and thus will be
+processed in the first step of the next loop.
+</li>
+</ol>
 
 
 \ref MPIPtpCommunication "Communication" gives the list of all the
 \ref MPIPtpCommunication "Communication" gives the list of all the
 point to point communications defined in StarPU-MPI.
 point to point communications defined in StarPU-MPI.

+ 8 - 0
doc/doxygen/chapters/40environment_variables.doxy

@@ -314,6 +314,14 @@ is the default behaviour.
 Note: this currently only applies to <c>dm</c> and <c>dmda</c> scheduling policies.
 Note: this currently only applies to <c>dm</c> and <c>dmda</c> scheduling policies.
 </dd>
 </dd>
 
 
+<dt>STARPU_CALIBRATE_MINIMUM</dt>
+<dd>
+\anchor STARPU_CALIBRATE_MINIMUM
+\addindex __env__STARPU_CALIBRATE_MINIMUM
+This defines the minimum number of calibration measurements that will be made
+before considering that the performance model is calibrated. The default value is 10.
+</dd>
+
 <dt>STARPU_BUS_CALIBRATE</dt>
 <dt>STARPU_BUS_CALIBRATE</dt>
 <dd>
 <dd>
 \anchor STARPU_BUS_CALIBRATE
 \anchor STARPU_BUS_CALIBRATE

+ 8 - 0
doc/doxygen/chapters/41configure_options.doxy

@@ -372,6 +372,14 @@ Enable performance debugging through gprof.
 Enable performance model debugging.
 Enable performance model debugging.
 </dd>
 </dd>
 
 
+<dt>--enable-paje-codelet-details</dt>
+<dd>
+\anchor enable-paje-codelet-details
+\addindex __configure__--enable-paje-codelet-details
+Enable details about codelets in the paje trace. This requires a recent enough
+version of ViTE (at least r1430).
+</dd>
+
 <dt>--enable-fxt-lock</dt>
 <dt>--enable-fxt-lock</dt>
 <dd>
 <dd>
 \anchor enable-fxt-lock
 \anchor enable-fxt-lock

+ 5 - 0
examples/Makefile.am

@@ -190,6 +190,7 @@ examplebin_PROGRAMS +=				\
 	sched_ctx/dummy_sched_with_ctx		\
 	sched_ctx/dummy_sched_with_ctx		\
 	sched_ctx/prio				\
 	sched_ctx/prio				\
 	sched_ctx/sched_ctx_without_sched_policy\
 	sched_ctx/sched_ctx_without_sched_policy\
+	sched_ctx/nested_sched_ctxs		\
 	worker_collections/worker_tree_example  \
 	worker_collections/worker_tree_example  \
 	worker_collections/worker_list_example  \
 	worker_collections/worker_list_example  \
 	reductions/dot_product			\
 	reductions/dot_product			\
@@ -270,6 +271,7 @@ STARPU_EXAMPLES +=				\
 	sched_ctx/prio				\
 	sched_ctx/prio				\
 	sched_ctx/dummy_sched_with_ctx		\
 	sched_ctx/dummy_sched_with_ctx		\
 	sched_ctx/sched_ctx_without_sched_policy\
 	sched_ctx/sched_ctx_without_sched_policy\
+	sched_ctx/nested_sched_ctxs		\
 	worker_collections/worker_tree_example  \
 	worker_collections/worker_tree_example  \
 	worker_collections/worker_list_example  \
 	worker_collections/worker_list_example  \
 	reductions/dot_product			\
 	reductions/dot_product			\
@@ -925,6 +927,9 @@ sched_ctx_parallel_code_CFLAGS = \
 sched_ctx_sched_ctx_without_sched_policy_CFLAGS = \
 sched_ctx_sched_ctx_without_sched_policy_CFLAGS = \
 	$(AM_CFLAGS) -fopenmp
 	$(AM_CFLAGS) -fopenmp
 
 
+sched_ctx_nested_sched_ctxs_CFLAGS = \
+	$(AM_CFLAGS) -fopenmp
+
 endif
 endif
 
 
 showcheck:
 showcheck:

+ 1 - 0
examples/binary/binary.c

@@ -29,6 +29,7 @@ struct starpu_codelet cl =
 {
 {
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {opencl_codelet, NULL},
 	.opencl_funcs = {opencl_codelet, NULL},
+	.opencl_flags = {STARPU_OPENCL_ASYNC},
 #endif
 #endif
 	.nbuffers = 1,
 	.nbuffers = 1,
 	.modes = {STARPU_RW}
 	.modes = {STARPU_RW}

+ 3 - 1
examples/cpp/incrementer_cpp.cpp

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2009, 2010-2011, 2013  Université de Bordeaux 1
  * Copyright (C) 2009, 2010-2011, 2013  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2012 inria
  * Copyright (C) 2012 inria
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -61,9 +61,11 @@ int main(int argc, char **argv)
         cl.cpu_funcs[0] = cpu_codelet;
         cl.cpu_funcs[0] = cpu_codelet;
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
         cl.cuda_funcs[0] = cuda_codelet;
         cl.cuda_funcs[0] = cuda_codelet;
+	cl.cuda_flags[0] = STARPU_CUDA_ASYNC;
 #endif
 #endif
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
 	cl.opencl_funcs[0] = opencl_codelet;
 	cl.opencl_funcs[0] = opencl_codelet;
+	cl.opencl_flags[0] = STARPU_OPENCL_ASYNC;
 #endif
 #endif
         cl.nbuffers = 1;
         cl.nbuffers = 1;
         cl.modes[0] = STARPU_RW;
         cl.modes[0] = STARPU_RW;

+ 212 - 0
examples/sched_ctx/nested_sched_ctxs.c

@@ -0,0 +1,212 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <omp.h>
+
+#ifdef STARPU_QUICK_CHECK
+#define NTASKS 64
+#else
+#define NTASKS 100
+#endif
+
+int tasks_executed[2];
+starpu_pthread_mutex_t mut;
+
+int parallel_code(int sched_ctx)
+{
+	int i;
+	int t = 0;
+	int *cpuids = NULL;
+	int ncpuids = 0;
+	starpu_sched_ctx_get_available_cpuids(sched_ctx, &cpuids, &ncpuids);
+
+//	printf("execute task of %d threads \n", ncpuids);
+#pragma omp parallel num_threads(ncpuids)
+	{
+		starpu_sched_ctx_bind_current_thread_to_cpuid(cpuids[omp_get_thread_num()]);
+// 			printf("cpu = %d ctx%d nth = %d\n", sched_getcpu(), sched_ctx, omp_get_num_threads());
+#pragma omp for
+		for(i = 0; i < NTASKS; i++)
+			t++;
+	}
+
+	free(cpuids);
+	return t;
+}
+
+static void sched_ctx_func(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg)
+{
+	int w = starpu_worker_get_id();
+	unsigned sched_ctx = (unsigned)arg;
+	int n = parallel_code(sched_ctx);
+//	printf("w %d executed %d it \n", w, n);
+}
+
+
+static struct starpu_codelet sched_ctx_codelet =
+{
+	.cpu_funcs = {sched_ctx_func, NULL},
+	.cuda_funcs = {NULL},
+	.opencl_funcs = {NULL},
+	.model = NULL,
+	.nbuffers = 0,
+	.name = "sched_ctx"
+};
+
+int main(int argc, char **argv)
+{
+	tasks_executed[0] = 0;
+	tasks_executed[1] = 0;
+	int ntasks = NTASKS;
+	int ret, j, k;
+	unsigned ncpus = 0;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	starpu_pthread_mutex_init(&mut, NULL);
+	int nprocs1 = 1;
+	int nprocs2 = 1;
+	int *procs1, *procs2;
+
+#ifdef STARPU_USE_CPU
+	ncpus =  starpu_cpu_worker_get_count();
+	procs1 = (int*)malloc(ncpus*sizeof(int));
+	starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, procs1, ncpus);
+
+	if (ncpus > 1)
+	{
+		nprocs1 = ncpus/2;
+		nprocs2 =  nprocs1;
+		k = 0;
+		procs2 = (int*)malloc(nprocs2*sizeof(int));
+		for(j = nprocs1; j < nprocs1+nprocs2; j++)
+			procs2[k++] = procs1[j];
+	}
+	else
+	{
+		procs2 = (int*)malloc(nprocs2*sizeof(int));
+		procs2[0] = procs1[0];
+	}
+#endif
+
+	if (ncpus == 0)
+	{
+#ifdef STARPU_USE_CPU
+		free(procs1);
+		free(procs2);
+#endif
+		starpu_shutdown();
+		return 77;
+	}
+
+	/*create contexts however you want*/
+	unsigned sched_ctx1 = starpu_sched_ctx_create(procs1, nprocs1, "ctx1", STARPU_SCHED_CTX_POLICY_NAME, "eager", 0);
+	unsigned sched_ctx2 = starpu_sched_ctx_create(procs2, nprocs2, "ctx2", STARPU_SCHED_CTX_POLICY_NAME, "dmda", 0);
+
+	/*indicate what to do with the resources when context 2 finishes (it depends on your application)*/
+//	starpu_sched_ctx_set_inheritor(sched_ctx2, sched_ctx1);
+
+	int nprocs3 = nprocs1/2;
+	int nprocs4 = nprocs1/2;
+	int nprocs5 = nprocs2/2;
+	int nprocs6 = nprocs2/2;
+	int procs3[nprocs3];
+	int procs4[nprocs4];
+	int procs5[nprocs5];
+	int procs6[nprocs6];
+
+	k = 0;
+	for(j = 0; j < nprocs3; j++)
+		procs3[k++] = procs1[j];
+	k = 0;
+	for(j = nprocs3; j < nprocs3+nprocs4; j++)
+		procs4[k++] = procs1[j];
+
+	k = 0;
+	for(j = 0; j < nprocs5; j++)
+		procs5[k++] = procs2[j];
+	k = 0;
+	for(j = nprocs5; j < nprocs5+nprocs6; j++)
+		procs6[k++] = procs2[j];
+
+	unsigned sched_ctx3 = starpu_sched_ctx_create(procs3, nprocs3, "ctx3", STARPU_SCHED_CTX_NESTED, sched_ctx1, 0);
+	unsigned sched_ctx4 = starpu_sched_ctx_create(procs4, nprocs4, "ctx4", STARPU_SCHED_CTX_NESTED, sched_ctx1, 0);
+
+	unsigned sched_ctx5 = starpu_sched_ctx_create(procs5, nprocs5, "ctx5", STARPU_SCHED_CTX_NESTED, sched_ctx2, 0);
+	unsigned sched_ctx6 = starpu_sched_ctx_create(procs6, nprocs6, "ctx6", STARPU_SCHED_CTX_NESTED, sched_ctx2, 0);
+
+
+	int i;
+	for (i = 0; i < ntasks; i++)
+	{
+		struct starpu_task *task = starpu_task_create();
+
+		task->cl = &sched_ctx_codelet;
+		task->cl_arg = sched_ctx1;
+
+		/*submit tasks to context*/
+		ret = starpu_task_submit_to_ctx(task,sched_ctx1);
+
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	for (i = 0; i < ntasks; i++)
+	{
+		struct starpu_task *task = starpu_task_create();
+
+		task->cl = &sched_ctx_codelet;
+		task->cl_arg = sched_ctx2;
+
+		/*submit tasks to context*/
+		ret = starpu_task_submit_to_ctx(task,sched_ctx2);
+
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+
+	/* tell starpu when you have finished submitting tasks to this context
+	   in order to allow moving resources from this context to the inheritor one
+	   when its corresponding tasks have finished executing */
+
+
+
+	/* wait for all tasks at the end*/
+	starpu_task_wait_for_all();
+
+	starpu_sched_ctx_delete(sched_ctx3);
+	starpu_sched_ctx_delete(sched_ctx4);
+
+	starpu_sched_ctx_delete(sched_ctx5);
+	starpu_sched_ctx_delete(sched_ctx6);
+
+	starpu_sched_ctx_delete(sched_ctx1);
+	starpu_sched_ctx_delete(sched_ctx2);
+
+	printf("ctx%d: tasks starpu executed %d out of %d\n", sched_ctx1, tasks_executed[0], NTASKS);
+	printf("ctx%d: tasks starpu executed %d out of %d\n", sched_ctx2, tasks_executed[1], NTASKS);
+
+#ifdef STARPU_USE_CPU
+	free(procs1);
+	free(procs2);
+#endif
+	starpu_shutdown();
+	return 0;
+}

+ 6 - 9
examples/sched_ctx/sched_ctx_without_sched_policy.c

@@ -88,7 +88,6 @@ int main(int argc, char **argv)
 #ifdef STARPU_USE_CPU
 #ifdef STARPU_USE_CPU
 	ncpus = starpu_cpu_worker_get_count();
 	ncpus = starpu_cpu_worker_get_count();
 	procs1 = (int*)malloc(ncpus*sizeof(int));
 	procs1 = (int*)malloc(ncpus*sizeof(int));
-	procs2 = (int*)malloc(ncpus*sizeof(int));
 	starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, procs1, ncpus);
 	starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, procs1, ncpus);
 
 
 	if(ncpus > 1)
 	if(ncpus > 1)
@@ -96,22 +95,16 @@ int main(int argc, char **argv)
 		nprocs1 = ncpus/2;
 		nprocs1 = ncpus/2;
 		nprocs2 =  ncpus-nprocs1;
 		nprocs2 =  ncpus-nprocs1;
 		k = 0;
 		k = 0;
+		procs2 = (int*)malloc(nprocs2*sizeof(int));
 		for(j = nprocs1; j < nprocs1+nprocs2; j++)
 		for(j = nprocs1; j < nprocs1+nprocs2; j++)
 			procs2[k++] = procs1[j];
 			procs2[k++] = procs1[j];
 	}
 	}
 	else
 	else
 	{
 	{
-		procs1 = (int*)malloc(nprocs1*sizeof(int));
 		procs2 = (int*)malloc(nprocs2*sizeof(int));
 		procs2 = (int*)malloc(nprocs2*sizeof(int));
-		procs1[0] = 0;
-		procs2[0] = 0;
+		procs2[0] = procs1[0];
 
 
 	}
 	}
-#else
-	procs1 = (int*)malloc(nprocs1*sizeof(int));
-	procs2 = (int*)malloc(nprocs2*sizeof(int));
-	procs1[0] = 0;
-	procs2[0] = 0;
 #endif
 #endif
 
 
 	if (ncpus == 0) goto enodev;
 	if (ncpus == 0) goto enodev;
@@ -163,6 +156,10 @@ int main(int argc, char **argv)
 	printf("ctx%d: tasks starpu executed %d out of %d\n", sched_ctx2, tasks_executed[1], NTASKS*NTASKS);
 	printf("ctx%d: tasks starpu executed %d out of %d\n", sched_ctx2, tasks_executed[1], NTASKS*NTASKS);
 
 
 enodev:
 enodev:
+#ifdef STARPU_USE_CPU
+	free(procs1);
+	free(procs2);
+#endif
 	starpu_shutdown();
 	starpu_shutdown();
 	return ncpus == 0 ? 77 : 0;
 	return ncpus == 0 ? 77 : 0;
 }
 }

+ 1 - 0
examples/worker_collections/worker_list_example.c

@@ -85,6 +85,7 @@ int main()
 
 
 	FPRINTF(stderr, "timing init = %lf \n", timing);
 	FPRINTF(stderr, "timing init = %lf \n", timing);
 	co->deinit(co);
 	co->deinit(co);
+	free(co);
 	starpu_shutdown();
 	starpu_shutdown();
 
 
 	return 0;
 	return 0;

+ 1 - 0
include/starpu_config.h.in

@@ -32,6 +32,7 @@
 #undef STARPU_OPENMP
 #undef STARPU_OPENMP
 
 
 #undef STARPU_SIMGRID
 #undef STARPU_SIMGRID
+#undef STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT
 
 
 #undef STARPU_HAVE_ICC
 #undef STARPU_HAVE_ICC
 
 

+ 8 - 0
include/starpu_sched_ctx.h

@@ -29,6 +29,7 @@ extern "C"
 #define STARPU_SCHED_CTX_POLICY_MIN_PRIO	 (3<<16)
 #define STARPU_SCHED_CTX_POLICY_MIN_PRIO	 (3<<16)
 #define STARPU_SCHED_CTX_POLICY_MAX_PRIO	 (4<<16)
 #define STARPU_SCHED_CTX_POLICY_MAX_PRIO	 (4<<16)
 #define STARPU_SCHED_CTX_HIERARCHY_LEVEL         (5<<16)
 #define STARPU_SCHED_CTX_HIERARCHY_LEVEL         (5<<16)
+#define STARPU_SCHED_CTX_NESTED                  (6<<16)
 
 
 unsigned starpu_sched_ctx_create(int *workerids_ctx, int nworkers_ctx, const char *sched_ctx_name, ...);
 unsigned starpu_sched_ctx_create(int *workerids_ctx, int nworkers_ctx, const char *sched_ctx_name, ...);
 
 
@@ -127,6 +128,13 @@ int starpu_sched_ctx_book_workers_for_task(unsigned sched_ctx_id, int *workerids
 
 
 void starpu_sched_ctx_unbook_workers_for_task(unsigned sched_ctx_id, int master);
 void starpu_sched_ctx_unbook_workers_for_task(unsigned sched_ctx_id, int master);
 
 
+/* return the first context (child of sched_ctx_id) where the workerid is master */
+unsigned starpu_sched_ctx_worker_is_master_for_child_ctx(int workerid, unsigned sched_ctx_id);
+
+void starpu_sched_ctx_revert_task_counters(unsigned sched_ctx_id, double flops);
+
+void starpu_sched_ctx_move_task_to_ctx(struct starpu_task *task, unsigned sched_ctx);
+
 #ifdef STARPU_USE_SC_HYPERVISOR
 #ifdef STARPU_USE_SC_HYPERVISOR
 void starpu_sched_ctx_call_pushed_task_cb(int workerid, unsigned sched_ctx_id);
 void starpu_sched_ctx_call_pushed_task_cb(int workerid, unsigned sched_ctx_id);
 #endif /* STARPU_USE_SC_HYPERVISOR */
 #endif /* STARPU_USE_SC_HYPERVISOR */

+ 2 - 0
include/starpu_task.h

@@ -255,6 +255,8 @@ void starpu_task_destroy(struct starpu_task *task);
 int starpu_task_submit(struct starpu_task *task) STARPU_WARN_UNUSED_RESULT;
 int starpu_task_submit(struct starpu_task *task) STARPU_WARN_UNUSED_RESULT;
 int starpu_task_submit_to_ctx(struct starpu_task *task, unsigned sched_ctx_id);
 int starpu_task_submit_to_ctx(struct starpu_task *task, unsigned sched_ctx_id);
 
 
+int starpu_task_finished(struct starpu_task *task) STARPU_WARN_UNUSED_RESULT;
+
 int starpu_task_wait(struct starpu_task *task) STARPU_WARN_UNUSED_RESULT;
 int starpu_task_wait(struct starpu_task *task) STARPU_WARN_UNUSED_RESULT;
 
 
 int starpu_task_wait_for_all(void);
 int starpu_task_wait_for_all(void);

+ 6 - 0
include/starpu_thread.h

@@ -200,6 +200,11 @@ int starpu_pthread_rwlock_unlock(starpu_pthread_rwlock_t *rwlock);
 
 
 #if defined(STARPU_SIMGRID) || !defined(STARPU_HAVE_PTHREAD_BARRIER)
 #if defined(STARPU_SIMGRID) || !defined(STARPU_HAVE_PTHREAD_BARRIER)
 
 
+#if defined(STARPU_SIMGRID) && defined(STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT)
+typedef xbt_bar_t starpu_pthread_barrier_t;
+typedef int starpu_pthread_barrierattr_t;
+#define STARPU_PTHREAD_BARRIER_SERIAL_THREAD XBT_BARRIER_SERIAL_PROCESS
+#else
 typedef struct {
 typedef struct {
 	starpu_pthread_mutex_t mutex;
 	starpu_pthread_mutex_t mutex;
 	starpu_pthread_cond_t cond;
 	starpu_pthread_cond_t cond;
@@ -208,6 +213,7 @@ typedef struct {
 } starpu_pthread_barrier_t;
 } starpu_pthread_barrier_t;
 typedef int starpu_pthread_barrierattr_t;
 typedef int starpu_pthread_barrierattr_t;
 #define STARPU_PTHREAD_BARRIER_SERIAL_THREAD -1
 #define STARPU_PTHREAD_BARRIER_SERIAL_THREAD -1
+#endif
 
 
 int starpu_pthread_barrier_init(starpu_pthread_barrier_t *barrier, const starpu_pthread_barrierattr_t *attr, unsigned count);
 int starpu_pthread_barrier_init(starpu_pthread_barrier_t *barrier, const starpu_pthread_barrierattr_t *attr, unsigned count);
 int starpu_pthread_barrier_destroy(starpu_pthread_barrier_t *barrier);
 int starpu_pthread_barrier_destroy(starpu_pthread_barrier_t *barrier);

+ 7 - 0
include/starpu_worker.h

@@ -57,10 +57,15 @@ struct starpu_worker_collection
 {
 {
 	void *workerids;
 	void *workerids;
 	unsigned nworkers;
 	unsigned nworkers;
+	void *masters;
+	unsigned nmasters;
 	int present[STARPU_NMAXWORKERS];
 	int present[STARPU_NMAXWORKERS];
+	int is_master[STARPU_NMAXWORKERS];
 	enum starpu_worker_collection_type type;
 	enum starpu_worker_collection_type type;
 	unsigned (*has_next)(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it);
 	unsigned (*has_next)(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it);
 	int (*get_next)(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it);
 	int (*get_next)(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it);
+	unsigned (*has_next_master)(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it);
+	int (*get_next_master)(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it);
 	int (*add)(struct starpu_worker_collection *workers, int worker);
 	int (*add)(struct starpu_worker_collection *workers, int worker);
 	int (*remove)(struct starpu_worker_collection *workers, int worker);
 	int (*remove)(struct starpu_worker_collection *workers, int worker);
 	void (*init)(struct starpu_worker_collection *workers);
 	void (*init)(struct starpu_worker_collection *workers);
@@ -109,6 +114,8 @@ int starpu_worker_get_mp_nodeid(int id);
 struct starpu_tree* starpu_workers_get_tree(void);
 struct starpu_tree* starpu_workers_get_tree(void);
 
 
 unsigned starpu_worker_get_sched_ctx_list(int worker, unsigned **sched_ctx);
 unsigned starpu_worker_get_sched_ctx_list(int worker, unsigned **sched_ctx);
+
+unsigned starpu_worker_is_slave(int workerid);
 #ifdef __cplusplus
 #ifdef __cplusplus
 }
 }
 #endif
 #endif

+ 3 - 1
mpi/include/starpu_mpi.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2009-2012, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -71,6 +71,8 @@ void starpu_mpi_comm_amounts_retrieve(size_t *comm_amounts);
 void starpu_mpi_cache_flush(MPI_Comm comm, starpu_data_handle_t data_handle);
 void starpu_mpi_cache_flush(MPI_Comm comm, starpu_data_handle_t data_handle);
 void starpu_mpi_cache_flush_all_data(MPI_Comm comm);
 void starpu_mpi_cache_flush_all_data(MPI_Comm comm);
 
 
+int starpu_mpi_world_rank(void);
+
 int starpu_mpi_get_communication_tag(void);
 int starpu_mpi_get_communication_tag(void);
 void starpu_mpi_set_communication_tag(int tag);
 void starpu_mpi_set_communication_tag(int tag);
 
 

+ 61 - 45
mpi/src/starpu_mpi.c

@@ -30,7 +30,7 @@
 #include <datawizard/coherency.h>
 #include <datawizard/coherency.h>
 
 
 static void _starpu_mpi_add_sync_point_in_fxt(void);
 static void _starpu_mpi_add_sync_point_in_fxt(void);
-static void _starpu_mpi_submit_new_mpi_request(void *arg);
+static void _starpu_mpi_submit_ready_request(void *arg);
 static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req);
 static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req);
 #ifdef STARPU_VERBOSE
 #ifdef STARPU_VERBOSE
 static char *_starpu_mpi_request_type(enum _starpu_mpi_request_type request_type);
 static char *_starpu_mpi_request_type(enum _starpu_mpi_request_type request_type);
@@ -46,8 +46,8 @@ static struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t dat
 							ssize_t count);
 							ssize_t count);
 static void _starpu_mpi_handle_detached_request(struct _starpu_mpi_req *req);
 static void _starpu_mpi_handle_detached_request(struct _starpu_mpi_req *req);
 
 
-/* The list of requests that have been newly submitted by the application */
-static struct _starpu_mpi_req_list *new_requests;
+/* The list of ready requests */
+static struct _starpu_mpi_req_list *ready_requests;
 
 
 /* The list of detached requests that have already been submitted to MPI */
 /* The list of detached requests that have already been submitted to MPI */
 static struct _starpu_mpi_req_list *detached_requests;
 static struct _starpu_mpi_req_list *detached_requests;
@@ -61,7 +61,7 @@ static starpu_pthread_mutex_t mutex;
 static starpu_pthread_t progress_thread;
 static starpu_pthread_t progress_thread;
 static int running = 0;
 static int running = 0;
 
 
-/* Count requests posted by the application and not yet submitted to MPI, i.e pushed into the new_requests list */
+/* Count requests posted by the application and not yet submitted to MPI */
 static starpu_pthread_mutex_t mutex_posted_requests;
 static starpu_pthread_mutex_t mutex_posted_requests;
 static int posted_requests = 0, newer_requests, barrier_running = 0;
 static int posted_requests = 0, newer_requests, barrier_running = 0;
 
 
@@ -151,9 +151,9 @@ static void _starpu_mpi_request_init(struct _starpu_mpi_req **req)
 	req->count = count;
 	req->count = count;
 
 
 	/* Asynchronously request StarPU to fetch the data in main memory: when
 	/* Asynchronously request StarPU to fetch the data in main memory: when
-	 * it is available in main memory, _starpu_mpi_submit_new_mpi_request(req) is called and
+	 * it is available in main memory, _starpu_mpi_submit_ready_request(req) is called and
 	 * the request is actually submitted */
 	 * the request is actually submitted */
-	starpu_data_acquire_cb_sequential_consistency(data_handle, mode, _starpu_mpi_submit_new_mpi_request, (void *)req, sequential_consistency);
+	starpu_data_acquire_cb_sequential_consistency(data_handle, mode, _starpu_mpi_submit_ready_request, (void *)req, sequential_consistency);
 
 
 	_STARPU_MPI_LOG_OUT();
 	_STARPU_MPI_LOG_OUT();
 	return req;
 	return req;
@@ -447,7 +447,7 @@ int starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status)
 	waiting_req->func = _starpu_mpi_wait_func;
 	waiting_req->func = _starpu_mpi_wait_func;
 	waiting_req->request_type = WAIT_REQ;
 	waiting_req->request_type = WAIT_REQ;
 
 
-	_starpu_mpi_submit_new_mpi_request(waiting_req);
+	_starpu_mpi_submit_ready_request(waiting_req);
 
 
 	/* We wait for the MPI request to finish */
 	/* We wait for the MPI request to finish */
 	STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
 	STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
@@ -532,7 +532,7 @@ int starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 		testing_req->request_type = TEST_REQ;
 		testing_req->request_type = TEST_REQ;
 
 
 		_STARPU_MPI_INC_POSTED_REQUESTS(1);
 		_STARPU_MPI_INC_POSTED_REQUESTS(1);
-		_starpu_mpi_submit_new_mpi_request(testing_req);
+		_starpu_mpi_submit_ready_request(testing_req);
 
 
 		/* We wait for the test request to finish */
 		/* We wait for the test request to finish */
 		STARPU_PTHREAD_MUTEX_LOCK(&(testing_req->req_mutex));
 		STARPU_PTHREAD_MUTEX_LOCK(&(testing_req->req_mutex));
@@ -619,7 +619,7 @@ int starpu_mpi_barrier(MPI_Comm comm)
 	barrier_req->comm = comm;
 	barrier_req->comm = comm;
 
 
 	_STARPU_MPI_INC_POSTED_REQUESTS(1);
 	_STARPU_MPI_INC_POSTED_REQUESTS(1);
-	_starpu_mpi_submit_new_mpi_request(barrier_req);
+	_starpu_mpi_submit_ready_request(barrier_req);
 
 
 	/* We wait for the MPI request to finish */
 	/* We wait for the MPI request to finish */
 	STARPU_PTHREAD_MUTEX_LOCK(&barrier_req->req_mutex);
 	STARPU_PTHREAD_MUTEX_LOCK(&barrier_req->req_mutex);
@@ -785,24 +785,25 @@ static void _starpu_mpi_early_data_cb(void* arg)
 	free(args);
 	free(args);
 }
 }
 
 
-static void _starpu_mpi_submit_new_mpi_request(void *arg)
+static void _starpu_mpi_submit_ready_request(void *arg)
 {
 {
 	_STARPU_MPI_LOG_IN();
 	_STARPU_MPI_LOG_IN();
 	struct _starpu_mpi_req *req = arg;
 	struct _starpu_mpi_req *req = arg;
 
 
 	_STARPU_MPI_INC_POSTED_REQUESTS(-1);
 	_STARPU_MPI_INC_POSTED_REQUESTS(-1);
 
 
-	_STARPU_MPI_DEBUG(3, "calling _starpu_mpi_submit_new_mpi_request with req %p srcdst %d tag %d and type %s\n", req, req->srcdst, req->mpi_tag, _starpu_mpi_request_type(req->request_type));
+	_STARPU_MPI_DEBUG(3, "new req %p srcdst %d tag %d and type %s\n", req, req->srcdst, req->mpi_tag, _starpu_mpi_request_type(req->request_type));
 
 
 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 
 
 	if (req->request_type == RECV_REQ)
 	if (req->request_type == RECV_REQ)
 	{
 	{
-		/* Case : the request is the internal receive request submitted by StarPU-MPI to receive
-		 * incoming data without a matching pending receive already submitted by the application.
-		 * We immediately allocate the pointer associated to the data_handle, and pushing it into
-		 * the list of new_requests, so as the real MPI request can be submitted before the next
-		 * submission of the envelope-catching request. */
+		/* Case : the request is the internal receive request submitted
+		 * by StarPU-MPI to receive incoming data without a matching
+		 * early_request from the application. We immediately allocate the
+		 * pointer associated to the data_handle, and push it into the
+		 * ready_requests list, so that the real MPI request can be submitted
+		 * before the next submission of the envelope-catching request. */
 		if (req->is_internal_req)
 		if (req->is_internal_req)
 		{
 		{
 			_starpu_mpi_handle_allocate_datatype(req->data_handle, &req->datatype, &req->user_datatype);
 			_starpu_mpi_handle_allocate_datatype(req->data_handle, &req->datatype, &req->user_datatype);
@@ -818,10 +819,12 @@ static void _starpu_mpi_submit_new_mpi_request(void *arg)
 				STARPU_ASSERT_MSG(req->ptr, "cannot allocate message of size %ld\n", req->count);
 				STARPU_ASSERT_MSG(req->ptr, "cannot allocate message of size %ld\n", req->count);
 			}
 			}
 
 
-			_STARPU_MPI_DEBUG(3, "Pushing internal starpu_mpi_irecv request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n", req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr, _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
-			_starpu_mpi_req_list_push_front(new_requests, req);
+			_STARPU_MPI_DEBUG(3, "Pushing internal starpu_mpi_irecv request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n",
+					  req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr,
+					  _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
+			_starpu_mpi_req_list_push_front(ready_requests, req);
 
 
-			/* inform the starpu mpi thread that the request has beenbe pushed in the new_requests list */
+			/* inform the starpu mpi thread that the request has been pushed in the ready_requests list */
 			STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 			STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 			STARPU_PTHREAD_MUTEX_LOCK(&req->posted_mutex);
 			STARPU_PTHREAD_MUTEX_LOCK(&req->posted_mutex);
 			req->posted = 1;
 			req->posted = 1;
@@ -834,10 +837,10 @@ static void _starpu_mpi_submit_new_mpi_request(void *arg)
 			/* test whether the receive request has already been submitted internally by StarPU-MPI*/
 			/* test whether the receive request has already been submitted internally by StarPU-MPI*/
 			struct _starpu_mpi_early_data_handle *early_data_handle = _starpu_mpi_early_data_find(req->mpi_tag, req->srcdst);
 			struct _starpu_mpi_early_data_handle *early_data_handle = _starpu_mpi_early_data_find(req->mpi_tag, req->srcdst);
 
 
-			/* Case : the request has already been submitted internally by StarPU.
-			 * We'll asynchronously ask a Read permission over the temporary handle, so as when
-			 * the internal receive will be over, the _starpu_mpi_early_data_cb function will be called to
-			 * bring the data back to the original data handle associated to the request.*/
+			/* Case: a receive request for a data with the given tag and source has already been
+			 * posted by StarPU. Asynchronously requests a Read permission over the temporary handle,
+			 * so that when the internal receive is completed, the _starpu_mpi_early_data_cb function
+			 * will be called to bring the data back to the original data handle associated to the request.*/
 			if (early_data_handle)
 			if (early_data_handle)
 			{
 			{
 				STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 				STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
@@ -861,8 +864,7 @@ static void _starpu_mpi_submit_new_mpi_request(void *arg)
 				_STARPU_MPI_DEBUG(3, "Calling data_acquire_cb on starpu_mpi_copy_cb..\n");
 				_STARPU_MPI_DEBUG(3, "Calling data_acquire_cb on starpu_mpi_copy_cb..\n");
 				starpu_data_acquire_cb(early_data_handle->handle,STARPU_R,_starpu_mpi_early_data_cb,(void*) cb_args);
 				starpu_data_acquire_cb(early_data_handle->handle,STARPU_R,_starpu_mpi_early_data_cb,(void*) cb_args);
 			}
 			}
-			/* Case : a classic receive request with no send received earlier than expected.
-			 * We just add the pending receive request to the requests' hashmap. */
+			/* Case: no matching data has been received. Store the receive request as an early_request. */
 			else
 			else
 			{
 			{
 				_STARPU_MPI_DEBUG(3, "Adding the pending receive request %p (srcdst %d tag %d) into the request hashmap\n", req, req->srcdst, req->mpi_tag);
 				_STARPU_MPI_DEBUG(3, "Adding the pending receive request %p (srcdst %d tag %d) into the request hashmap\n", req, req->srcdst, req->mpi_tag);
@@ -872,7 +874,7 @@ static void _starpu_mpi_submit_new_mpi_request(void *arg)
 	}
 	}
 	else
 	else
 	{
 	{
-		_starpu_mpi_req_list_push_front(new_requests, req);
+		_starpu_mpi_req_list_push_front(ready_requests, req);
 		_STARPU_MPI_DEBUG(3, "Pushing new request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n",
 		_STARPU_MPI_DEBUG(3, "Pushing new request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n",
 				  req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr, _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
 				  req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr, _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
 	}
 	}
@@ -986,7 +988,7 @@ static void _starpu_mpi_handle_detached_request(struct _starpu_mpi_req *req)
 	}
 	}
 }
 }
 
 
-static void _starpu_mpi_handle_new_request(struct _starpu_mpi_req *req)
+static void _starpu_mpi_handle_ready_request(struct _starpu_mpi_req *req)
 {
 {
 	_STARPU_MPI_LOG_IN();
 	_STARPU_MPI_LOG_IN();
 	STARPU_ASSERT_MSG(req, "Invalid request");
 	STARPU_ASSERT_MSG(req, "Invalid request");
@@ -1080,10 +1082,10 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
 
  	int header_req_submitted = 0;
  	int header_req_submitted = 0;
 
 
-	while (running || posted_requests || !(_starpu_mpi_req_list_empty(new_requests)) || !(_starpu_mpi_req_list_empty(detached_requests)))
+	while (running || posted_requests || !(_starpu_mpi_req_list_empty(ready_requests)) || !(_starpu_mpi_req_list_empty(detached_requests)))
 	{
 	{
 		/* shall we block ? */
 		/* shall we block ? */
-		unsigned block = _starpu_mpi_req_list_empty(new_requests) && _starpu_mpi_early_request_count() == 0;
+		unsigned block = _starpu_mpi_req_list_empty(ready_requests) && _starpu_mpi_early_request_count() == 0;
 
 
 #ifndef STARPU_MPI_ACTIVITY
 #ifndef STARPU_MPI_ACTIVITY
 		STARPU_PTHREAD_MUTEX_LOCK(&detached_requests_mutex);
 		STARPU_PTHREAD_MUTEX_LOCK(&detached_requests_mutex);
@@ -1107,21 +1109,22 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
 
 		/* get one request */
 		/* get one request */
 		struct _starpu_mpi_req *req;
 		struct _starpu_mpi_req *req;
-		while (!_starpu_mpi_req_list_empty(new_requests))
+		while (!_starpu_mpi_req_list_empty(ready_requests))
 		{
 		{
-			req = _starpu_mpi_req_list_pop_back(new_requests);
+			req = _starpu_mpi_req_list_pop_back(ready_requests);
 
 
 			/* handling a request is likely to block for a while
 			/* handling a request is likely to block for a while
 			 * (on a sync_data_with_mem call), we want to let the
 			 * (on a sync_data_with_mem call), we want to let the
 			 * application submit requests in the meantime, so we
 			 * application submit requests in the meantime, so we
 			 * release the lock. */
 			 * release the lock. */
 			STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 			STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
-			_starpu_mpi_handle_new_request(req);
+			_starpu_mpi_handle_ready_request(req);
 			STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 			STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 		}
 		}
 
 
-		/* If there is no currently submitted header_req submitted to catch envelopes from senders, and there is some pending receive
-		 * requests in our side, we resubmit a header request. */
+		/* If there is no header_req currently submitted to
+                 * catch envelopes from senders, and there are some pending
+                 * receive requests on our side, we resubmit a header request. */
 		MPI_Request header_req;
 		MPI_Request header_req;
 		if ((_starpu_mpi_early_request_count() > 0) && (header_req_submitted == 0))// && (HASH_COUNT(_starpu_mpi_early_data_handle_hashmap) == 0))
 		if ((_starpu_mpi_early_request_count() > 0) && (header_req_submitted == 0))// && (HASH_COUNT(_starpu_mpi_early_data_handle_hashmap) == 0))
 		{
 		{
@@ -1151,11 +1154,14 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
 
 				struct _starpu_mpi_req *found_req = _starpu_mpi_early_request_find(recv_env->mpi_tag, status.MPI_SOURCE);
 				struct _starpu_mpi_req *found_req = _starpu_mpi_early_request_find(recv_env->mpi_tag, status.MPI_SOURCE);
 
 
-				/* Case : a data will arrive before the matching receive has been submitted in our side of the application.
-				 * We will allow a temporary handle to store the incoming data, by submitting a starpu_mpi_irecv_detached
-				 * on this handle, and register this so as the StarPU-MPI layer can remember it.*/
+				/* Case: a data will arrive before a matching receive is
+				 * posted by the application. Create a temporary handle to
+				 * store the incoming data, submit a starpu_mpi_irecv_detached
+				 * on this handle, and store it as an early_data
+				 */
 				if (!found_req)
 				if (!found_req)
 				{
 				{
+
 					_STARPU_MPI_DEBUG(3, "Request with tag %d and source %d not found, creating a early_handle to receive incoming data..\n", recv_env->mpi_tag, status.MPI_SOURCE);
 					_STARPU_MPI_DEBUG(3, "Request with tag %d and source %d not found, creating a early_handle to receive incoming data..\n", recv_env->mpi_tag, status.MPI_SOURCE);
 
 
 					starpu_data_handle_t data_handle = NULL;
 					starpu_data_handle_t data_handle = NULL;
@@ -1198,8 +1204,8 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 					STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 					STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 
 
 					// We wait until the request is pushed in the
 					// We wait until the request is pushed in the
-					// new_request list, that ensures that the next loop
-					// will call _starpu_mpi_handle_new_request
+					// ready_request list, that ensures that the next loop
+					// will call _starpu_mpi_handle_ready_request
 					// on the request and post the corresponding mpi_irecv,
 					// on the request and post the corresponding mpi_irecv,
 					// otherwise, it may lead to read data as envelop
 					// otherwise, it may lead to read data as envelop
 					STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 					STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
@@ -1214,8 +1220,11 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 					STARPU_PTHREAD_MUTEX_UNLOCK(&early_data_handle->req_mutex);
 					STARPU_PTHREAD_MUTEX_UNLOCK(&early_data_handle->req_mutex);
 					STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 					STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 				}
 				}
-				/* Case : a matching receive has been found for the incoming data, we handle the correct allocation of the pointer associated to
-				 * the data handle, then submit the corresponding receive with _starpu_mpi_handle_new_request. */
+				/* Case: a matching application request has been found for
+				 * the incoming data, we handle the correct allocation
+				 * of the pointer associated to the data handle, then
+				 * submit the corresponding receive with
+				 * _starpu_mpi_handle_ready_request. */
 				else
 				else
 				{
 				{
 					_STARPU_MPI_DEBUG(3, "A matching receive has been found for the incoming data with tag %d\n", recv_env->mpi_tag);
 					_STARPU_MPI_DEBUG(3, "A matching receive has been found for the incoming data with tag %d\n", recv_env->mpi_tag);
@@ -1242,7 +1251,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 					 * application submit requests in the meantime, so we
 					 * application submit requests in the meantime, so we
 					 * release the lock. */
 					 * release the lock. */
 					STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 					STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
-					_starpu_mpi_handle_new_request(found_req);
+					_starpu_mpi_handle_ready_request(found_req);
 					STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 					STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 				}
 				}
 				header_req_submitted = 0;
 				header_req_submitted = 0;
@@ -1255,7 +1264,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 	}
 	}
 
 
 	STARPU_ASSERT_MSG(_starpu_mpi_req_list_empty(detached_requests), "List of detached requests not empty");
 	STARPU_ASSERT_MSG(_starpu_mpi_req_list_empty(detached_requests), "List of detached requests not empty");
-	STARPU_ASSERT_MSG(_starpu_mpi_req_list_empty(new_requests), "List of new requests not empty");
+	STARPU_ASSERT_MSG(_starpu_mpi_req_list_empty(ready_requests), "List of ready requests not empty");
 	STARPU_ASSERT_MSG(posted_requests == 0, "Number of posted request is not zero");
 	STARPU_ASSERT_MSG(posted_requests == 0, "Number of posted request is not zero");
 	_starpu_mpi_early_request_check_termination();
 	_starpu_mpi_early_request_check_termination();
 	_starpu_mpi_early_data_check_termination();
 	_starpu_mpi_early_data_check_termination();
@@ -1326,7 +1335,7 @@ int _starpu_mpi_initialize(int *argc, char ***argv, int initialize_mpi)
 	STARPU_PTHREAD_MUTEX_INIT(&mutex, NULL);
 	STARPU_PTHREAD_MUTEX_INIT(&mutex, NULL);
 	STARPU_PTHREAD_COND_INIT(&cond_progression, NULL);
 	STARPU_PTHREAD_COND_INIT(&cond_progression, NULL);
 	STARPU_PTHREAD_COND_INIT(&cond_finished, NULL);
 	STARPU_PTHREAD_COND_INIT(&cond_finished, NULL);
-	new_requests = _starpu_mpi_req_list_new();
+	ready_requests = _starpu_mpi_req_list_new();
 
 
 	STARPU_PTHREAD_MUTEX_INIT(&detached_requests_mutex, NULL);
 	STARPU_PTHREAD_MUTEX_INIT(&detached_requests_mutex, NULL);
 	detached_requests = _starpu_mpi_req_list_new();
 	detached_requests = _starpu_mpi_req_list_new();
@@ -1402,7 +1411,7 @@ int starpu_mpi_shutdown(void)
 
 
 	/* free the request queues */
 	/* free the request queues */
 	_starpu_mpi_req_list_delete(detached_requests);
 	_starpu_mpi_req_list_delete(detached_requests);
-	_starpu_mpi_req_list_delete(new_requests);
+	_starpu_mpi_req_list_delete(ready_requests);
 
 
 	_starpu_mpi_comm_amounts_display(rank);
 	_starpu_mpi_comm_amounts_display(rank);
 	_starpu_mpi_comm_amounts_free();
 	_starpu_mpi_comm_amounts_free();
@@ -1423,3 +1432,10 @@ void starpu_mpi_data_register(starpu_data_handle_t data_handle, int tag, int ran
 	_starpu_data_set_unregister_hook(data_handle, _starpu_mpi_clear_cache);
 	_starpu_data_set_unregister_hook(data_handle, _starpu_mpi_clear_cache);
 
 
 }
 }
+
+int starpu_mpi_world_rank(void)
+{
+	int rank;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	return rank;
+}

+ 1 - 0
src/Makefile.am

@@ -181,6 +181,7 @@ libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES = 		\
 	sched_policies/eager_central_policy.c			\
 	sched_policies/eager_central_policy.c			\
 	sched_policies/eager_central_priority_policy.c		\
 	sched_policies/eager_central_priority_policy.c		\
 	sched_policies/work_stealing_policy.c			\
 	sched_policies/work_stealing_policy.c			\
+	sched_policies/locality_work_stealing_policy.c		\
 	sched_policies/deque_modeling_policy_data_aware.c	\
 	sched_policies/deque_modeling_policy_data_aware.c	\
 	sched_policies/random_policy.c				\
 	sched_policies/random_policy.c				\
 	sched_policies/stack_queues.c				\
 	sched_policies/stack_queues.c				\

+ 42 - 4
src/common/fxt.h

@@ -106,6 +106,9 @@
 #define _STARPU_FUT_TASK_WAIT_FOR_ALL	0x513b
 #define _STARPU_FUT_TASK_WAIT_FOR_ALL	0x513b
 
 
 #define _STARPU_FUT_EVENT	0x513c
 #define _STARPU_FUT_EVENT	0x513c
+#define _STARPU_FUT_THREAD_EVENT	0x513d
+
+#define	_STARPU_FUT_CODELET_DETAILS	0x513e
 
 
 #define _STARPU_FUT_LOCKING_MUTEX	0x5140	
 #define _STARPU_FUT_LOCKING_MUTEX	0x5140	
 #define _STARPU_FUT_MUTEX_LOCKED	0x5141	
 #define _STARPU_FUT_MUTEX_LOCKED	0x5141	
@@ -193,6 +196,31 @@ void _starpu_fxt_register_thread(unsigned);
 #define _STARPU_FUT_COMMIT(size) do { } while (0)
 #define _STARPU_FUT_COMMIT(size) do { } while (0)
 #endif
 #endif
 
 
+#ifdef FUT_DO_PROBE1STR
+#define _STARPU_FUT_DO_PROBE1STR(CODE, P1, str) FUT_DO_PROBE1STR(CODE, P1, str)
+#else
+/* Sometimes we need something a little more specific than the wrappers from
+ * FxT: these macro permit to put add an event with 3 (or 4) numbers followed
+ * by a string. */
+#define _STARPU_FUT_DO_PROBE1STR(CODE, P1, str)			\
+do {									\
+    if(fut_active) {							\
+	/* No more than FXT_MAX_PARAMS args are allowed */		\
+	/* we add a \0 just in case ... */				\
+	size_t len = STARPU_MIN(strlen(str)+1, (FXT_MAX_PARAMS - 1)*sizeof(unsigned long));\
+	unsigned nbargs_str = (len + sizeof(unsigned long) - 1)/(sizeof(unsigned long));\
+	unsigned nbargs = 1 + nbargs_str;				\
+	size_t total_len = FUT_SIZE(nbargs);				\
+	unsigned long *futargs =					\
+		fut_getstampedbuffer(FUT_CODE(CODE, nbargs), total_len);\
+	*(futargs++) = (unsigned long)(P1);				\
+	snprintf((char *)futargs, len, "%s", str);			\
+	((char *)futargs)[len - 1] = '\0';				\
+	_STARPU_FUT_COMMIT(total_len);					\
+    }									\
+} while (0);
+#endif
+
 #ifdef FUT_DO_PROBE2STR
 #ifdef FUT_DO_PROBE2STR
 #define _STARPU_FUT_DO_PROBE2STR(CODE, P1, P2, str) FUT_DO_PROBE2STR(CODE, P1, P2, str)
 #define _STARPU_FUT_DO_PROBE2STR(CODE, P1, P2, str) FUT_DO_PROBE2STR(CODE, P1, P2, str)
 #else
 #else
@@ -297,7 +325,7 @@ do {									\
 #ifdef FUT_DO_PROBE6STR
 #ifdef FUT_DO_PROBE6STR
 #define _STARPU_FUT_DO_PROBE6STR(CODE, P1, P2, P3, P4, P5, P6, str) FUT_DO_PROBE6STR(CODE, P1, P2, P3, P4, P5, P6, str)
 #define _STARPU_FUT_DO_PROBE6STR(CODE, P1, P2, P3, P4, P5, P6, str) FUT_DO_PROBE6STR(CODE, P1, P2, P3, P4, P5, P6, str)
 #else
 #else
-#define _STARPU_FUT_DO_PROBE5STR(CODE, P1, P2, P3, P4, P5, P6, str)	\
+#define _STARPU_FUT_DO_PROBE6STR(CODE, P1, P2, P3, P4, P5, P6, str)	\
 do {									\
 do {									\
     if(fut_active) {							\
     if(fut_active) {							\
 	/* No more than FXT_MAX_PARAMS args are allowed */		\
 	/* No more than FXT_MAX_PARAMS args are allowed */		\
@@ -324,7 +352,7 @@ do {									\
 #ifdef FUT_DO_PROBE7STR
 #ifdef FUT_DO_PROBE7STR
 #define _STARPU_FUT_DO_PROBE7STR(CODE, P1, P2, P3, P4, P5, P6, P7, str) FUT_DO_PROBE7STR(CODE, P1, P2, P3, P4, P5, P6, P7, str)
 #define _STARPU_FUT_DO_PROBE7STR(CODE, P1, P2, P3, P4, P5, P6, P7, str) FUT_DO_PROBE7STR(CODE, P1, P2, P3, P4, P5, P6, P7, str)
 #else
 #else
-#define _STARPU_FUT_DO_PROBE6STR(CODE, P1, P2, P3, P4, P5, P6, P7, str)	\
+#define _STARPU_FUT_DO_PROBE7STR(CODE, P1, P2, P3, P4, P5, P6, P7, str)	\
 do {									\
 do {									\
     if(fut_active) {							\
     if(fut_active) {							\
 	/* No more than FXT_MAX_PARAMS args are allowed */		\
 	/* No more than FXT_MAX_PARAMS args are allowed */		\
@@ -378,7 +406,7 @@ do {									\
 #define _STARPU_TRACE_WORKER_INIT_END(workerid)				\
 #define _STARPU_TRACE_WORKER_INIT_END(workerid)				\
 	FUT_DO_PROBE2(_STARPU_FUT_WORKER_INIT_END, _starpu_gettid(), (workerid));
 	FUT_DO_PROBE2(_STARPU_FUT_WORKER_INIT_END, _starpu_gettid(), (workerid));
 
 
-#define _STARPU_TRACE_START_CODELET_BODY(job)				\
+#define _STARPU_TRACE_START_CODELET_BODY(job, nimpl, archtype)				\
 do {									\
 do {									\
         const char *model_name = _starpu_job_get_model_name((job));         \
         const char *model_name = _starpu_job_get_model_name((job));         \
 	if (model_name)                                                 \
 	if (model_name)                                                 \
@@ -389,6 +417,11 @@ do {									\
 	else {                                                          \
 	else {                                                          \
 		FUT_DO_PROBE4(_STARPU_FUT_START_CODELET_BODY, (job), ((job)->task)->sched_ctx, _starpu_gettid(), 0); \
 		FUT_DO_PROBE4(_STARPU_FUT_START_CODELET_BODY, (job), ((job)->task)->sched_ctx, _starpu_gettid(), 0); \
 	}								\
 	}								\
+	{								\
+		const size_t __job_size = _starpu_job_get_data_size((job)->task->cl?(job)->task->cl->model:NULL, archtype, nimpl, (job));	\
+		const uint32_t __job_hash = _starpu_compute_buffers_footprint((job)->task->cl?(job)->task->cl->model:NULL, archtype, nimpl, (job));\
+		FUT_DO_PROBE6(_STARPU_FUT_CODELET_DETAILS, (job), ((job)->task)->sched_ctx, __job_size, __job_hash, (job)->task->tag_id, _starpu_gettid());	\
+	}								\
 } while(0);
 } while(0);
 
 
 #define _STARPU_TRACE_END_CODELET_BODY(job, nimpl, archtype)			\
 #define _STARPU_TRACE_END_CODELET_BODY(job, nimpl, archtype)			\
@@ -563,6 +596,9 @@ do {										\
 #define _STARPU_TRACE_EVENT(S)			\
 #define _STARPU_TRACE_EVENT(S)			\
 	FUT_DO_PROBESTR(_STARPU_FUT_EVENT,S)
 	FUT_DO_PROBESTR(_STARPU_FUT_EVENT,S)
 
 
+#define _STARPU_TRACE_THREAD_EVENT(S)			\
+	_STARPU_FUT_DO_PROBE1STR(_STARPU_FUT_THREAD_EVENT, _starpu_gettid(), S)
+
 #define _STARPU_TRACE_HYPERVISOR_BEGIN()  \
 #define _STARPU_TRACE_HYPERVISOR_BEGIN()  \
 	FUT_DO_PROBE1(_STARPU_FUT_HYPERVISOR_BEGIN, _starpu_gettid());
 	FUT_DO_PROBE1(_STARPU_FUT_HYPERVISOR_BEGIN, _starpu_gettid());
 
 
@@ -746,7 +782,7 @@ do {										\
 #define _STARPU_TRACE_NEW_MEM_NODE(nodeid)	do {} while(0)
 #define _STARPU_TRACE_NEW_MEM_NODE(nodeid)	do {} while(0)
 #define _STARPU_TRACE_WORKER_INIT_START(a,b,c)	do {} while(0)
 #define _STARPU_TRACE_WORKER_INIT_START(a,b,c)	do {} while(0)
 #define _STARPU_TRACE_WORKER_INIT_END(workerid)	do {} while(0)
 #define _STARPU_TRACE_WORKER_INIT_END(workerid)	do {} while(0)
-#define _STARPU_TRACE_START_CODELET_BODY(job)	do {} while(0)
+#define _STARPU_TRACE_START_CODELET_BODY(job, nimpl, archtype)	do {} while(0)
 #define _STARPU_TRACE_END_CODELET_BODY(job, nimpl, a)	do {} while(0)
 #define _STARPU_TRACE_END_CODELET_BODY(job, nimpl, a)	do {} while(0)
 #define _STARPU_TRACE_START_CALLBACK(job)	do {} while(0)
 #define _STARPU_TRACE_START_CALLBACK(job)	do {} while(0)
 #define _STARPU_TRACE_END_CALLBACK(job)		do {} while(0)
 #define _STARPU_TRACE_END_CALLBACK(job)		do {} while(0)
@@ -794,6 +830,8 @@ do {										\
 #define _STARPU_TRACE_USER_EVENT(code)		do {} while(0)
 #define _STARPU_TRACE_USER_EVENT(code)		do {} while(0)
 #define _STARPU_TRACE_SET_PROFILING(status)	do {} while(0)
 #define _STARPU_TRACE_SET_PROFILING(status)	do {} while(0)
 #define _STARPU_TRACE_TASK_WAIT_FOR_ALL		do {} while(0)
 #define _STARPU_TRACE_TASK_WAIT_FOR_ALL		do {} while(0)
+#define _STARPU_TRACE_EVENT(S)		do {} while(0)
+#define _STARPU_TRACE_THREAD_EVENT(S)		do {} while(0)
 #define _STARPU_TRACE_LOCKING_MUTEX()			do {} while(0)
 #define _STARPU_TRACE_LOCKING_MUTEX()			do {} while(0)
 #define _STARPU_TRACE_MUTEX_LOCKED()			do {} while(0)
 #define _STARPU_TRACE_MUTEX_LOCKED()			do {} while(0)
 #define _STARPU_TRACE_UNLOCKING_MUTEX()		do {} while(0)
 #define _STARPU_TRACE_UNLOCKING_MUTEX()		do {} while(0)

+ 27 - 1
src/common/thread.c

@@ -288,9 +288,35 @@ int starpu_pthread_rwlock_unlock(starpu_pthread_rwlock_t *rwlock)
 
 
 	return p_ret;
 	return p_ret;
 }
 }
+
+#if defined(STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT)
+int starpu_pthread_barrier_init(starpu_pthread_barrier_t *restrict barrier, const starpu_pthread_barrierattr_t *restrict attr, unsigned count)
+{
+	*barrier = xbt_barrier_init(count);
+	return 0;
+}
+
+int starpu_pthread_barrier_destroy(starpu_pthread_barrier_t *barrier)
+{
+	if (*barrier)
+		xbt_barrier_destroy(*barrier);
+	return 0;
+}
+
+int starpu_pthread_barrier_wait(starpu_pthread_barrier_t *barrier)
+{
+	_STARPU_TRACE_BARRIER_WAIT_BEGIN();
+
+	xbt_barrier_wait(*barrier);
+
+	_STARPU_TRACE_BARRIER_WAIT_END();
+	return 0;
+}
+#endif /* defined(STARPU_SIMGRID) */
+
 #endif /* STARPU_SIMGRID */
 #endif /* STARPU_SIMGRID */
 
 
-#if defined(STARPU_SIMGRID) || !defined(STARPU_HAVE_PTHREAD_BARRIER)
+#if (defined(STARPU_SIMGRID) && !defined(STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT)) || !defined(STARPU_HAVE_PTHREAD_BARRIER)
 int starpu_pthread_barrier_init(starpu_pthread_barrier_t *restrict barrier, const starpu_pthread_barrierattr_t *restrict attr, unsigned count)
 int starpu_pthread_barrier_init(starpu_pthread_barrier_t *restrict barrier, const starpu_pthread_barrierattr_t *restrict attr, unsigned count)
 {
 {
 	int ret = starpu_pthread_mutex_init(&barrier->mutex, NULL);
 	int ret = starpu_pthread_mutex_init(&barrier->mutex, NULL);

+ 9 - 0
src/core/jobs.c

@@ -116,6 +116,15 @@ void _starpu_job_destroy(struct _starpu_job *j)
 	_starpu_job_delete(j);
 	_starpu_job_delete(j);
 }
 }
 
 
+int _starpu_job_finished(struct _starpu_job *j)
+{
+	int ret;
+	STARPU_PTHREAD_MUTEX_LOCK(&j->sync_mutex);
+	ret = j->terminated == 2;
+	STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
+	return ret;
+}
+
 void _starpu_wait_job(struct _starpu_job *j)
 void _starpu_wait_job(struct _starpu_job *j)
 {
 {
 	STARPU_ASSERT(j->task);
 	STARPU_ASSERT(j->task);

+ 3 - 0
src/core/jobs.h

@@ -182,6 +182,9 @@ struct _starpu_job* STARPU_ATTRIBUTE_MALLOC _starpu_job_create(struct starpu_tas
 /* Destroy the data structure associated to the job structure */
 /* Destroy the data structure associated to the job structure */
 void _starpu_job_destroy(struct _starpu_job *j);
 void _starpu_job_destroy(struct _starpu_job *j);
 
 
+/* Test for the termination of the job */
+int _starpu_job_finished(struct _starpu_job *j);
+
 /* Wait for the termination of the job */
 /* Wait for the termination of the job */
 void _starpu_wait_job(struct _starpu_job *j);
 void _starpu_wait_job(struct _starpu_job *j);
 
 

+ 66 - 2
src/core/sched_ctx.c

@@ -60,7 +60,11 @@ void _starpu_worker_gets_out_of_ctx(unsigned sched_ctx_id, struct _starpu_worker
 	{
 	{
 		struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 		struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 		if(sched_ctx && sched_ctx->sched_policy && sched_ctx->sched_policy->remove_workers)
 		if(sched_ctx && sched_ctx->sched_policy && sched_ctx->sched_policy->remove_workers)
+		{
+			_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
 			sched_ctx->sched_policy->remove_workers(sched_ctx_id, &worker->workerid, 1);
 			sched_ctx->sched_policy->remove_workers(sched_ctx_id, &worker->workerid, 1);
+			_STARPU_TRACE_WORKER_SCHEDULING_POP;
+		}
 		_starpu_sched_ctx_list_remove(&worker->sched_ctx_list, sched_ctx_id);
 		_starpu_sched_ctx_list_remove(&worker->sched_ctx_list, sched_ctx_id);
 		worker->nsched_ctxs--;
 		worker->nsched_ctxs--;
 	}
 	}
@@ -185,6 +189,7 @@ static void _starpu_add_workers_to_sched_ctx(struct _starpu_sched_ctx *sched_ctx
 	}
 	}
 	else if(sched_ctx->sched_policy->add_workers)
 	else if(sched_ctx->sched_policy->add_workers)
 	{
 	{
+		_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
 		if(added_workers)
 		if(added_workers)
 		{
 		{
 			if(*n_added_workers > 0)
 			if(*n_added_workers > 0)
@@ -192,6 +197,7 @@ static void _starpu_add_workers_to_sched_ctx(struct _starpu_sched_ctx *sched_ctx
 		}
 		}
 		else
 		else
 			sched_ctx->sched_policy->add_workers(sched_ctx->id, workers_to_add, nworkers_to_add);
 			sched_ctx->sched_policy->add_workers(sched_ctx->id, workers_to_add, nworkers_to_add);
+		_STARPU_TRACE_WORKER_SCHEDULING_POP;
 	}
 	}
 	return;
 	return;
 }
 }
@@ -229,7 +235,11 @@ static void _starpu_sched_ctx_free_scheduling_data(struct _starpu_sched_ctx *sch
 	unsigned nworkers_ctx = starpu_sched_ctx_get_workers_list(sched_ctx->id, &workerids);
 	unsigned nworkers_ctx = starpu_sched_ctx_get_workers_list(sched_ctx->id, &workerids);
 
 
 	if(nworkers_ctx > 0 && sched_ctx->sched_policy->remove_workers)
 	if(nworkers_ctx > 0 && sched_ctx->sched_policy->remove_workers)
+	{
+		_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
 		sched_ctx->sched_policy->remove_workers(sched_ctx->id, workerids, nworkers_ctx);
 		sched_ctx->sched_policy->remove_workers(sched_ctx->id, workerids, nworkers_ctx);
+		_STARPU_TRACE_WORKER_SCHEDULING_POP;
+	}
 
 
 	free(workerids);
 	free(workerids);
 	return;
 	return;
@@ -523,6 +533,7 @@ unsigned starpu_sched_ctx_create(int *workerids, int nworkers, const char *sched
 	int max_prio = 0;
 	int max_prio = 0;
 	struct starpu_sched_policy *sched_policy = NULL;
 	struct starpu_sched_policy *sched_policy = NULL;
 	unsigned hierarchy_level = 0;
 	unsigned hierarchy_level = 0;
+	unsigned nesting_sched_ctx = STARPU_NMAX_SCHED_CTXS;
 
 
 	va_start(varg_list, sched_ctx_name);
 	va_start(varg_list, sched_ctx_name);
 	while ((arg_type = va_arg(varg_list, int)) != 0)
 	while ((arg_type = va_arg(varg_list, int)) != 0)
@@ -551,6 +562,10 @@ unsigned starpu_sched_ctx_create(int *workerids, int nworkers, const char *sched
 		{
 		{
 			hierarchy_level = va_arg(varg_list, unsigned);
 			hierarchy_level = va_arg(varg_list, unsigned);
 		}
 		}
+		else if (arg_type == STARPU_SCHED_CTX_NESTED)
+		{
+			nesting_sched_ctx = va_arg(varg_list, unsigned);
+		}
 		else
 		else
 		{
 		{
 			STARPU_ABORT_MSG("Unrecognized argument %d\n", arg_type);
 			STARPU_ABORT_MSG("Unrecognized argument %d\n", arg_type);
@@ -562,6 +577,7 @@ unsigned starpu_sched_ctx_create(int *workerids, int nworkers, const char *sched
 	struct _starpu_sched_ctx *sched_ctx = NULL;
 	struct _starpu_sched_ctx *sched_ctx = NULL;
 	sched_ctx = _starpu_create_sched_ctx(sched_policy, workerids, nworkers, 0, sched_ctx_name, min_prio_set, min_prio, max_prio_set, max_prio);
 	sched_ctx = _starpu_create_sched_ctx(sched_policy, workerids, nworkers, 0, sched_ctx_name, min_prio_set, min_prio, max_prio_set, max_prio);
 	sched_ctx->hierarchy_level = hierarchy_level;
 	sched_ctx->hierarchy_level = hierarchy_level;
+	sched_ctx->nesting_sched_ctx = nesting_sched_ctx;
 
 
 	_starpu_unlock_mutex_if_prev_locked();
 	_starpu_unlock_mutex_if_prev_locked();
 	int *added_workerids;
 	int *added_workerids;
@@ -1132,6 +1148,8 @@ struct starpu_worker_collection* starpu_sched_ctx_create_worker_collection(unsig
 	case STARPU_WORKER_TREE:
 	case STARPU_WORKER_TREE:
 		sched_ctx->workers->has_next = worker_tree.has_next;
 		sched_ctx->workers->has_next = worker_tree.has_next;
 		sched_ctx->workers->get_next = worker_tree.get_next;
 		sched_ctx->workers->get_next = worker_tree.get_next;
+		sched_ctx->workers->has_next_master = worker_tree.has_next_master;
+		sched_ctx->workers->get_next_master = worker_tree.get_next_master;
 		sched_ctx->workers->add = worker_tree.add;
 		sched_ctx->workers->add = worker_tree.add;
 		sched_ctx->workers->remove = worker_tree.remove;
 		sched_ctx->workers->remove = worker_tree.remove;
 		sched_ctx->workers->init = worker_tree.init;
 		sched_ctx->workers->init = worker_tree.init;
@@ -1144,6 +1162,8 @@ struct starpu_worker_collection* starpu_sched_ctx_create_worker_collection(unsig
 	default:
 	default:
 		sched_ctx->workers->has_next = worker_list.has_next;
 		sched_ctx->workers->has_next = worker_list.has_next;
 		sched_ctx->workers->get_next = worker_list.get_next;
 		sched_ctx->workers->get_next = worker_list.get_next;
+		sched_ctx->workers->has_next_master = worker_list.has_next_master;
+		sched_ctx->workers->get_next_master = worker_list.get_next_master;
 		sched_ctx->workers->add = worker_list.add;
 		sched_ctx->workers->add = worker_list.add;
 		sched_ctx->workers->remove = worker_list.remove;
 		sched_ctx->workers->remove = worker_list.remove;
 		sched_ctx->workers->init = worker_list.init;
 		sched_ctx->workers->init = worker_list.init;
@@ -1171,6 +1191,7 @@ void starpu_sched_ctx_display_workers(unsigned sched_ctx_id, FILE *f)
 		starpu_worker_get_name(workerids[i], name, 256);
 		starpu_worker_get_name(workerids[i], name, 256);
 		fprintf(f, "\t\t%s\n", name);
 		fprintf(f, "\t\t%s\n", name);
 	}
 	}
+	free(workerids);
 }
 }
 
 
 unsigned starpu_sched_ctx_get_workers_list(unsigned sched_ctx_id, int **workerids)
 unsigned starpu_sched_ctx_get_workers_list(unsigned sched_ctx_id, int **workerids)
@@ -1605,6 +1626,44 @@ void starpu_sched_ctx_bind_current_thread_to_cpuid(unsigned cpuid STARPU_ATTRIBU
 
 
 }
 }
 
 
+unsigned starpu_sched_ctx_worker_is_master_for_child_ctx(int workerid, unsigned sched_ctx_id)
+{
+	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
+	struct _starpu_sched_ctx_list *l = NULL;
+	struct _starpu_sched_ctx *sched_ctx = NULL;
+	for (l = worker->sched_ctx_list; l; l = l->next)
+	{ 
+		 sched_ctx = _starpu_get_sched_ctx_struct(l->sched_ctx);
+		if(sched_ctx-> main_master == workerid && sched_ctx->nesting_sched_ctx == sched_ctx_id)
+			return sched_ctx->id;
+	}
+	return STARPU_NMAX_SCHED_CTXS;
+
+}
+
+void starpu_sched_ctx_revert_task_counters(unsigned sched_ctx_id, double flops)
+{
+        _starpu_decrement_nsubmitted_tasks_of_sched_ctx(sched_ctx_id);
+        _starpu_decrement_nready_tasks_of_sched_ctx(sched_ctx_id, flops);
+}
+
+void starpu_sched_ctx_move_task_to_ctx(struct starpu_task *task, unsigned sched_ctx)
+{
+	int workerid = starpu_worker_get_id();
+	struct _starpu_worker *worker  = NULL;
+	if(workerid != -1)
+	{
+		worker = _starpu_get_worker_struct(workerid);
+		STARPU_PTHREAD_MUTEX_UNLOCK(&worker->sched_mutex);
+	}
+
+	task->sched_ctx = sched_ctx;
+	_starpu_task_submit_nodeps(task);
+
+	if(workerid != -1)
+		STARPU_PTHREAD_MUTEX_LOCK(&worker->sched_mutex);
+}
+
 static unsigned _worker_sleeping_in_other_ctx(unsigned sched_ctx_id, int workerid)
 static unsigned _worker_sleeping_in_other_ctx(unsigned sched_ctx_id, int workerid)
 {
 {
 	int s;
 	int s;
@@ -1620,6 +1679,7 @@ static unsigned _worker_sleeping_in_other_ctx(unsigned sched_ctx_id, int workeri
 	return 0;
 	return 0;
 
 
 }
 }
+
 static void _starpu_sched_ctx_get_workers_to_sleep(unsigned sched_ctx_id, int *workerids, int nworkers, int master)
 static void _starpu_sched_ctx_get_workers_to_sleep(unsigned sched_ctx_id, int *workerids, int nworkers, int master)
 {
 {
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
@@ -1643,7 +1703,6 @@ static void _starpu_sched_ctx_get_workers_to_sleep(unsigned sched_ctx_id, int *w
 		workerid = workerids[w];
 		workerid = workerids[w];
 		if((current_worker_id == -1 || workerid != current_worker_id) && !sleeping[w])
 		if((current_worker_id == -1 || workerid != current_worker_id) && !sleeping[w])
 		{
 		{
-			sched_ctx->sleeping[workerids[w]] = 1;
 			sem_wait(&sched_ctx->fall_asleep_sem[master]);
 			sem_wait(&sched_ctx->fall_asleep_sem[master]);
 		}
 		}
 	}
 	}
@@ -1652,7 +1711,10 @@ static void _starpu_sched_ctx_get_workers_to_sleep(unsigned sched_ctx_id, int *w
 
 
 void _starpu_sched_ctx_signal_worker_blocked(unsigned sched_ctx_id, int workerid)
 void _starpu_sched_ctx_signal_worker_blocked(unsigned sched_ctx_id, int workerid)
 {
 {
+	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
+	worker->slave = 1;
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	sched_ctx->sleeping[workerid] = 1;
 	int master = sched_ctx->master[workerid];
 	int master = sched_ctx->master[workerid];
 	sem_post(&sched_ctx->fall_asleep_sem[master]);
 	sem_post(&sched_ctx->fall_asleep_sem[master]);
 
 
@@ -1666,6 +1728,9 @@ void _starpu_sched_ctx_signal_worker_woke_up(unsigned sched_ctx_id, int workerid
 	sem_post(&sched_ctx->wake_up_sem[master]);
 	sem_post(&sched_ctx->wake_up_sem[master]);
 	sched_ctx->sleeping[workerid] = 0;
 	sched_ctx->sleeping[workerid] = 0;
 	sched_ctx->master[workerid] = -1;
 	sched_ctx->master[workerid] = -1;
+	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
+	worker->slave = 0;
+
 	return;
 	return;
 }
 }
 
 
@@ -1720,7 +1785,6 @@ void starpu_sched_ctx_get_available_cpuids(unsigned sched_ctx_id, int **cpuids,
 	int current_worker_id = starpu_worker_get_id();
 	int current_worker_id = starpu_worker_get_id();
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 	struct starpu_worker_collection *workers = sched_ctx->workers;
 	struct starpu_worker_collection *workers = sched_ctx->workers;
-
 	(*cpuids) = (int*)malloc(workers->nworkers*sizeof(int));
 	(*cpuids) = (int*)malloc(workers->nworkers*sizeof(int));
 	int w = 0;
 	int w = 0;
 
 

+ 3 - 0
src/core/sched_ctx.h

@@ -147,6 +147,9 @@ struct _starpu_sched_ctx
 	/* bool indicating if the workers is sleeping in this ctx */
 	/* bool indicating if the workers is sleeping in this ctx */
 	unsigned sleeping[STARPU_NMAXWORKERS];
 	unsigned sleeping[STARPU_NMAXWORKERS];
 
 
+	/* ctx nesting the current ctx */
+	unsigned nesting_sched_ctx;
+
 };
 };
 
 
 struct _starpu_machine_config;
 struct _starpu_machine_config;

+ 26 - 2
src/core/sched_policy.c

@@ -38,6 +38,7 @@ static struct starpu_sched_policy *predefined_policies[] =
 	&_starpu_sched_eager_policy,
 	&_starpu_sched_eager_policy,
 	&_starpu_sched_prio_policy,
 	&_starpu_sched_prio_policy,
 	&_starpu_sched_random_policy,
 	&_starpu_sched_random_policy,
+	&_starpu_sched_lws_policy,
 	&_starpu_sched_ws_policy,
 	&_starpu_sched_ws_policy,
 	&_starpu_sched_dm_policy,
 	&_starpu_sched_dm_policy,
 	&_starpu_sched_dmda_policy,
 	&_starpu_sched_dmda_policy,
@@ -174,14 +175,20 @@ void _starpu_init_sched_policy(struct _starpu_machine_config *config, struct _st
 
 
 	load_sched_policy(selected_policy, sched_ctx);
 	load_sched_policy(selected_policy, sched_ctx);
 
 
+	_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
 	sched_ctx->sched_policy->init_sched(sched_ctx->id);
 	sched_ctx->sched_policy->init_sched(sched_ctx->id);
+	_STARPU_TRACE_WORKER_SCHEDULING_POP;
 }
 }
 
 
 void _starpu_deinit_sched_policy(struct _starpu_sched_ctx *sched_ctx)
 void _starpu_deinit_sched_policy(struct _starpu_sched_ctx *sched_ctx)
 {
 {
 	struct starpu_sched_policy *policy = sched_ctx->sched_policy;
 	struct starpu_sched_policy *policy = sched_ctx->sched_policy;
 	if (policy->deinit_sched)
 	if (policy->deinit_sched)
+	{
+		_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
 		policy->deinit_sched(sched_ctx->id);
 		policy->deinit_sched(sched_ctx->id);
+		_STARPU_TRACE_WORKER_SCHEDULING_POP;
+	}
 }
 }
 
 
 static void _starpu_push_task_on_specific_worker_notify_sched(struct starpu_task *task, struct _starpu_worker *worker, int workerid, int perf_workerid)
 static void _starpu_push_task_on_specific_worker_notify_sched(struct starpu_task *task, struct _starpu_worker *worker, int workerid, int perf_workerid)
@@ -193,7 +200,11 @@ static void _starpu_push_task_on_specific_worker_notify_sched(struct starpu_task
         {
         {
 		sched_ctx = _starpu_get_sched_ctx_struct(l->sched_ctx);
 		sched_ctx = _starpu_get_sched_ctx_struct(l->sched_ctx);
 		if (sched_ctx->sched_policy != NULL && sched_ctx->sched_policy->push_task_notify)
 		if (sched_ctx->sched_policy != NULL && sched_ctx->sched_policy->push_task_notify)
+		{
+			_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
 			sched_ctx->sched_policy->push_task_notify(task, workerid, perf_workerid, sched_ctx->id);
 			sched_ctx->sched_policy->push_task_notify(task, workerid, perf_workerid, sched_ctx->id);
+			_STARPU_TRACE_WORKER_SCHEDULING_POP;
+		}
 	}
 	}
 }
 }
 
 
@@ -867,22 +878,31 @@ profiling:
 
 
 struct starpu_task *_starpu_pop_every_task(struct _starpu_sched_ctx *sched_ctx)
 struct starpu_task *_starpu_pop_every_task(struct _starpu_sched_ctx *sched_ctx)
 {
 {
+	struct starpu_task *task = NULL;
 	if(sched_ctx->sched_policy)
 	if(sched_ctx->sched_policy)
 	{
 	{
 		STARPU_ASSERT(sched_ctx->sched_policy->pop_every_task);
 		STARPU_ASSERT(sched_ctx->sched_policy->pop_every_task);
 		
 		
 		/* TODO set profiling info */
 		/* TODO set profiling info */
 		if(sched_ctx->sched_policy->pop_every_task)
 		if(sched_ctx->sched_policy->pop_every_task)
-			return sched_ctx->sched_policy->pop_every_task(sched_ctx->id);
+		{
+			_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
+			task = sched_ctx->sched_policy->pop_every_task(sched_ctx->id);
+			_STARPU_TRACE_WORKER_SCHEDULING_POP;
+		}
 	}
 	}
-	return NULL;
+	return task;
 }
 }
 
 
 void _starpu_sched_pre_exec_hook(struct starpu_task *task)
 void _starpu_sched_pre_exec_hook(struct starpu_task *task)
 {
 {
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(task->sched_ctx);
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(task->sched_ctx);
 	if (sched_ctx->sched_policy && sched_ctx->sched_policy->pre_exec_hook)
 	if (sched_ctx->sched_policy && sched_ctx->sched_policy->pre_exec_hook)
+	{
+		_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
 		sched_ctx->sched_policy->pre_exec_hook(task);
 		sched_ctx->sched_policy->pre_exec_hook(task);
+		_STARPU_TRACE_WORKER_SCHEDULING_POP;
+	}
 }
 }
 
 
 void _starpu_sched_post_exec_hook(struct starpu_task *task)
 void _starpu_sched_post_exec_hook(struct starpu_task *task)
@@ -890,7 +910,11 @@ void _starpu_sched_post_exec_hook(struct starpu_task *task)
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(task->sched_ctx);
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(task->sched_ctx);
 
 
 	if (sched_ctx->sched_policy && sched_ctx->sched_policy->post_exec_hook)
 	if (sched_ctx->sched_policy && sched_ctx->sched_policy->post_exec_hook)
+	{
+		_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
 		sched_ctx->sched_policy->post_exec_hook(task);
 		sched_ctx->sched_policy->post_exec_hook(task);
+		_STARPU_TRACE_WORKER_SCHEDULING_POP;
+	}
 }
 }
 
 
 void _starpu_wait_on_sched_event(void)
 void _starpu_wait_on_sched_event(void)

+ 1 - 0
src/core/sched_policy.h

@@ -58,6 +58,7 @@ void _starpu_print_idle_time();
 /*
 /*
  *	Predefined policies
  *	Predefined policies
  */
  */
+extern struct starpu_sched_policy _starpu_sched_lws_policy;
 extern struct starpu_sched_policy _starpu_sched_ws_policy;
 extern struct starpu_sched_policy _starpu_sched_ws_policy;
 extern struct starpu_sched_policy _starpu_sched_prio_policy;
 extern struct starpu_sched_policy _starpu_sched_prio_policy;
 extern struct starpu_sched_policy _starpu_sched_random_policy;
 extern struct starpu_sched_policy _starpu_sched_random_policy;

+ 21 - 3
src/core/simgrid.c

@@ -33,6 +33,8 @@ extern int starpu_main(int argc, char *argv[]);
 extern int smpi_main(int (*realmain) (int argc, char *argv[]), int argc, char *argv[]);
 extern int smpi_main(int (*realmain) (int argc, char *argv[]), int argc, char *argv[]);
 #pragma weak smpi_simulated_main_
 #pragma weak smpi_simulated_main_
 extern int smpi_simulated_main_(int argc, char *argv[]);
 extern int smpi_simulated_main_(int argc, char *argv[]);
+#pragma weak starpu_mpi_world_rank
+extern int starpu_mpi_world_rank(void);
 
 
 #define _starpu_simgrid_running_smpi() (getenv("SMPI_GLOBAL_SIZE") != NULL)
 #define _starpu_simgrid_running_smpi() (getenv("SMPI_GLOBAL_SIZE") != NULL)
 
 
@@ -48,6 +50,13 @@ int do_starpu_main(int argc STARPU_ATTRIBUTE_UNUSED, char *argv[] STARPU_ATTRIBU
 	return starpu_main(args->argc, args->argv);
 	return starpu_main(args->argc, args->argv);
 }
 }
 
 
+#ifdef HAVE_MSG_ENVIRONMENT_GET_ROUTING_ROOT
+#ifdef HAVE_MSG_GET_AS_BY_NAME
+static msg_as_t _starpu_simgrid_get_as_by_name(const char *name)
+{
+	return MSG_get_as_by_name(name);
+}
+#else /* HAVE_MSG_GET_AS_BY_NAME */
 static msg_as_t __starpu_simgrid_get_as_by_name(msg_as_t root, const char *name)
 static msg_as_t __starpu_simgrid_get_as_by_name(msg_as_t root, const char *name)
 {
 {
 	xbt_dict_t dict;
 	xbt_dict_t dict;
@@ -69,6 +78,8 @@ static msg_as_t _starpu_simgrid_get_as_by_name(const char *name)
 {
 {
 	return __starpu_simgrid_get_as_by_name(MSG_environment_get_routing_root(), name);
 	return __starpu_simgrid_get_as_by_name(MSG_environment_get_routing_root(), name);
 }
 }
+#endif /* HAVE_MSG_GET_AS_BY_NAME */
+#endif /* HAVE_MSG_ENVIRONMENT_GET_ROUTING_ROOT */
 
 
 int _starpu_simgrid_get_nbhosts(const char *prefix)
 int _starpu_simgrid_get_nbhosts(const char *prefix)
 {
 {
@@ -77,13 +88,16 @@ int _starpu_simgrid_get_nbhosts(const char *prefix)
 	unsigned i, nb;
 	unsigned i, nb;
 	unsigned len = strlen(prefix);
 	unsigned len = strlen(prefix);
 
 
+#ifdef HAVE_MSG_ENVIRONMENT_GET_ROUTING_ROOT
 	if (_starpu_simgrid_running_smpi())
 	if (_starpu_simgrid_running_smpi())
 	{
 	{
 		char name[16];
 		char name[16];
-		snprintf(name, sizeof(name), STARPU_MPI_AS_PREFIX"%u", smpi_current_rank);
+		STARPU_ASSERT(starpu_mpi_world_rank);
+		snprintf(name, sizeof(name), STARPU_MPI_AS_PREFIX"%u", starpu_mpi_world_rank());
 		hosts = MSG_environment_as_get_hosts(_starpu_simgrid_get_as_by_name(name));
 		hosts = MSG_environment_as_get_hosts(_starpu_simgrid_get_as_by_name(name));
 	}
 	}
 	else
 	else
+#endif /* HAVE_MSG_ENVIRONMENT_GET_ROUTING_ROOT */
 		hosts = MSG_hosts_as_dynar();
 		hosts = MSG_hosts_as_dynar();
 	nb = xbt_dynar_length(hosts);
 	nb = xbt_dynar_length(hosts);
 
 
@@ -125,7 +139,8 @@ msg_host_t _starpu_simgrid_get_host_by_name(const char *name)
 	if (_starpu_simgrid_running_smpi())
 	if (_starpu_simgrid_running_smpi())
 	{
 	{
 		char mpiname[16];
 		char mpiname[16];
-		snprintf(mpiname, sizeof(mpiname), "%d-%s", smpi_current_rank, name);
+		STARPU_ASSERT(starpu_mpi_world_rank);
+		snprintf(mpiname, sizeof(mpiname), "%d-%s", starpu_mpi_world_rank(), name);
 		return MSG_get_host_by_name(mpiname);
 		return MSG_get_host_by_name(mpiname);
 	}
 	}
 	else
 	else
@@ -178,6 +193,7 @@ void _starpu_simgrid_init()
 	xbt_dynar_t hosts;
 	xbt_dynar_t hosts;
 	int i;
 	int i;
 
 
+#ifdef HAVE_MSG_ENVIRONMENT_GET_ROUTING_ROOT
 	if (_starpu_simgrid_running_smpi())
 	if (_starpu_simgrid_running_smpi())
 	{
 	{
 		/* Take back hand to create the local platform for this MPI
 		/* Take back hand to create the local platform for this MPI
@@ -191,7 +207,8 @@ void _starpu_simgrid_init()
 		char template[] = "/tmp/"STARPU_MPI_AS_PREFIX"-platform-XXXXXX.xml";
 		char template[] = "/tmp/"STARPU_MPI_AS_PREFIX"-platform-XXXXXX.xml";
 		int ret;
 		int ret;
 
 
-		snprintf(asname, sizeof(asname), STARPU_MPI_AS_PREFIX"%u", smpi_current_rank);
+		STARPU_ASSERT(starpu_mpi_world_rank);
+		snprintf(asname, sizeof(asname), STARPU_MPI_AS_PREFIX"%u", starpu_mpi_world_rank());
 
 
 		/* Get XML platform */
 		/* Get XML platform */
 		_starpu_simgrid_get_platform_path(path, sizeof(path));
 		_starpu_simgrid_get_platform_path(path, sizeof(path));
@@ -212,6 +229,7 @@ void _starpu_simgrid_init()
 		hosts = MSG_environment_as_get_hosts(_starpu_simgrid_get_as_by_name(asname));
 		hosts = MSG_environment_as_get_hosts(_starpu_simgrid_get_as_by_name(asname));
 	}
 	}
 	else
 	else
+#endif /* HAVE_MSG_ENVIRONMENT_GET_ROUTING_ROOT */
 		hosts = MSG_hosts_as_dynar();
 		hosts = MSG_hosts_as_dynar();
 
 
 	int nb = xbt_dynar_length(hosts);
 	int nb = xbt_dynar_length(hosts);

+ 7 - 0
src/core/task.c

@@ -187,6 +187,13 @@ void starpu_task_destroy(struct starpu_task *task)
 	_starpu_task_destroy(task);
 	_starpu_task_destroy(task);
 }
 }
 
 
+int starpu_task_finished(struct starpu_task *task)
+{
+	STARPU_ASSERT(task);
+	STARPU_ASSERT_MSG(!task->detach, "starpu_task_finished can only be called on tasks with detach = 0");
+	return _starpu_job_finished(_starpu_get_job_associated_to_task(task));
+}
+
 int starpu_task_wait(struct starpu_task *task)
 int starpu_task_wait(struct starpu_task *task)
 {
 {
         _STARPU_LOG_IN();
         _STARPU_LOG_IN();

+ 54 - 38
src/core/workers.c

@@ -467,6 +467,7 @@ static void _starpu_worker_init(struct _starpu_worker *workerarg, struct _starpu
 	workerarg->reverse_phase[1] = 0;
 	workerarg->reverse_phase[1] = 0;
 	workerarg->pop_ctx_priority = 1;
 	workerarg->pop_ctx_priority = 1;
 	workerarg->sched_mutex_locked = 0;
 	workerarg->sched_mutex_locked = 0;
+	workerarg->slave = 0;
 
 
 	/* cpu_set/hwloc_cpu_set initialized in topology.c */
 	/* cpu_set/hwloc_cpu_set initialized in topology.c */
 }
 }
@@ -516,7 +517,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 
 
 	/* Launch workers asynchronously */
 	/* Launch workers asynchronously */
 	unsigned cpu = 0;
 	unsigned cpu = 0;
-	unsigned worker;
+	unsigned worker, i;
 
 
 #if defined(STARPU_PERF_DEBUG) && !defined(STARPU_SIMGRID)
 #if defined(STARPU_PERF_DEBUG) && !defined(STARPU_SIMGRID)
 	/* Get itimer of the main thread, to set it for the worker threads */
 	/* Get itimer of the main thread, to set it for the worker threads */
@@ -526,6 +527,16 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 #ifdef HAVE_AYUDAME_H
 #ifdef HAVE_AYUDAME_H
 	if (AYU_event) AYU_event(AYU_INIT, 0, NULL);
 	if (AYU_event) AYU_event(AYU_INIT, 0, NULL);
 #endif
 #endif
+
+#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
+	for (i = 0; i < sizeof(cuda_worker_set)/sizeof(cuda_worker_set[0]); i++)
+		cuda_worker_set[i].workers = NULL;
+#endif
+#ifdef STARPU_USE_MIC
+	for (i = 0; i < sizeof(mic_worker_set)/sizeof(mic_worker_set[0]); i++)
+		mic_worker_set[i].workers = NULL;
+#endif
+
 	for (worker = 0; worker < nworkers; worker++)
 	for (worker = 0; worker < nworkers; worker++)
 	{
 	{
 		struct _starpu_worker *workerarg = &pconfig->workers[worker];
 		struct _starpu_worker *workerarg = &pconfig->workers[worker];
@@ -575,44 +586,44 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
 			case STARPU_CUDA_WORKER:
 			case STARPU_CUDA_WORKER:
 				driver.id.cuda_id = workerarg->devid;
 				driver.id.cuda_id = workerarg->devid;
-				if (_starpu_may_launch_driver(pconfig->conf, &driver))
-				{
-					/* We spawn only one thread per CUDA device,
-					 * which will control all CUDA workers of this
-					 * device. (by using a worker set). */
-					if (cuda_worker_set[devid].started)
-						goto worker_set_initialized;
+				workerarg->set = &cuda_worker_set[devid];
 
 
-					cuda_worker_set[devid].nworkers = starpu_get_env_number_default("STARPU_NWORKER_PER_CUDA", 1);
-					cuda_worker_set[devid].workers = workerarg;
-					cuda_worker_set[devid].set_is_initialized = 0;
+				/* We spawn only one thread per CUDA device,
+				 * which will control all CUDA workers of this
+				 * device. (by using a worker set). */
+				if (cuda_worker_set[devid].workers)
+					break;
 
 
-					STARPU_PTHREAD_CREATE_ON(
-						workerarg->name,
-						&cuda_worker_set[devid].worker_thread,
-						NULL,
-						_starpu_cuda_worker,
-						&cuda_worker_set[devid],
-						worker+1);
-#ifdef STARPU_USE_FXT
-					STARPU_PTHREAD_MUTEX_LOCK(&workerarg->mutex);
-					while (!workerarg->worker_is_running)
-						STARPU_PTHREAD_COND_WAIT(&workerarg->started_cond, &workerarg->mutex);
-					STARPU_PTHREAD_MUTEX_UNLOCK(&workerarg->mutex);
-#endif
-					STARPU_PTHREAD_MUTEX_LOCK(&cuda_worker_set[devid].mutex);
-					while (!cuda_worker_set[devid].set_is_initialized)
-						STARPU_PTHREAD_COND_WAIT(&cuda_worker_set[devid].ready_cond,
-									 &cuda_worker_set[devid].mutex);
-					STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_worker_set[devid].mutex);
-					cuda_worker_set[devid].started = 1;
-		worker_set_initialized:
-					workerarg->set = &cuda_worker_set[devid];
-				}
-				else
+				cuda_worker_set[devid].nworkers = starpu_get_env_number_default("STARPU_NWORKER_PER_CUDA", 1);
+				cuda_worker_set[devid].workers = workerarg;
+				cuda_worker_set[devid].set_is_initialized = 0;
+
+				if (!_starpu_may_launch_driver(pconfig->conf, &driver))
 				{
 				{
 					workerarg->run_by_starpu = 0;
 					workerarg->run_by_starpu = 0;
+					break;
 				}
 				}
+
+				STARPU_PTHREAD_CREATE_ON(
+					workerarg->name,
+					&cuda_worker_set[devid].worker_thread,
+					NULL,
+					_starpu_cuda_worker,
+					&cuda_worker_set[devid],
+					worker+1);
+#ifdef STARPU_USE_FXT
+				STARPU_PTHREAD_MUTEX_LOCK(&workerarg->mutex);
+				while (!workerarg->worker_is_running)
+					STARPU_PTHREAD_COND_WAIT(&workerarg->started_cond, &workerarg->mutex);
+				STARPU_PTHREAD_MUTEX_UNLOCK(&workerarg->mutex);
+#endif
+				STARPU_PTHREAD_MUTEX_LOCK(&cuda_worker_set[devid].mutex);
+				while (!cuda_worker_set[devid].set_is_initialized)
+					STARPU_PTHREAD_COND_WAIT(&cuda_worker_set[devid].ready_cond,
+								 &cuda_worker_set[devid].mutex);
+				STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_worker_set[devid].mutex);
+				cuda_worker_set[devid].started = 1;
+
 				break;
 				break;
 #endif
 #endif
 #if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
 #if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
@@ -642,11 +653,13 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 #endif
 #endif
 #ifdef STARPU_USE_MIC
 #ifdef STARPU_USE_MIC
 			case STARPU_MIC_WORKER:
 			case STARPU_MIC_WORKER:
+				workerarg->set = &mic_worker_set[devid];
+
 				/* We spawn only one thread
 				/* We spawn only one thread
 				 * per MIC device, which will control all MIC
 				 * per MIC device, which will control all MIC
 				 * workers of this device. (by using a worker set). */
 				 * workers of this device. (by using a worker set). */
-				if (mic_worker_set[devid].started)
-					goto worker_set_initialized;
+				if (mic_worker_set[devid].workers)
+					break;
 
 
 				mic_worker_set[devid].nworkers = pconfig->topology.nmiccores[devid];
 				mic_worker_set[devid].nworkers = pconfig->topology.nmiccores[devid];
 
 
@@ -678,8 +691,6 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 				STARPU_PTHREAD_MUTEX_UNLOCK(&mic_worker_set[devid].mutex);
 				STARPU_PTHREAD_MUTEX_UNLOCK(&mic_worker_set[devid].mutex);
 
 
 				mic_worker_set[devid].started = 1;
 				mic_worker_set[devid].started = 1;
-		worker_set_initialized:
-				workerarg->set = &mic_worker_set[devid];
 
 
 				break;
 				break;
 #endif /* STARPU_USE_MIC */
 #endif /* STARPU_USE_MIC */
@@ -1374,6 +1385,11 @@ unsigned starpu_worker_get_count(void)
 	return config.topology.nworkers;
 	return config.topology.nworkers;
 }
 }
 
 
+unsigned starpu_worker_is_slave(int workerid)
+{
+	return config.workers[workerid].slave;
+}
+
 int starpu_worker_get_count_by_type(enum starpu_worker_archtype type)
 int starpu_worker_get_count_by_type(enum starpu_worker_archtype type)
 {
 {
 	switch (type)
 	switch (type)

+ 3 - 0
src/core/workers.h

@@ -112,6 +112,9 @@ LIST_TYPE(_starpu_worker,
 	/* flag to know if sched_mutex is locked or not */
 	/* flag to know if sched_mutex is locked or not */
 	unsigned sched_mutex_locked;
 	unsigned sched_mutex_locked;
 
 
+	/* bool to indicate if the worker is slave in a ctx */
+	unsigned slave;
+
 #ifdef __GLIBC__
 #ifdef __GLIBC__
 	cpu_set_t cpu_set;
 	cpu_set_t cpu_set;
 #endif /* __GLIBC__ */
 #endif /* __GLIBC__ */

+ 15 - 8
src/datawizard/coherency.c

@@ -151,7 +151,7 @@ void _starpu_update_data_state(starpu_data_handle_t handle,
 
 
 	/* the data is present now */
 	/* the data is present now */
 	unsigned requesting_node = requesting_replicate->memory_node;
 	unsigned requesting_node = requesting_replicate->memory_node;
-	requesting_replicate->requested[requesting_node] = 0;
+	requesting_replicate->requested &= ~(1UL << requesting_node);
 
 
 	if (mode & STARPU_W)
 	if (mode & STARPU_W)
 	{
 	{
@@ -656,18 +656,25 @@ void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_
 		_starpu_spin_unlock(&handle->header_lock);
 		_starpu_spin_unlock(&handle->header_lock);
 }
 }
 
 
-static void _starpu_set_data_requested_flag_if_needed(struct _starpu_data_replicate *replicate)
+static void _starpu_set_data_requested_flag_if_needed(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate)
 {
 {
-// XXX : this is just a hint, so we don't take the lock ...
-//	_starpu_spin_lock(&handle->header_lock);
+	unsigned local_node = _starpu_memory_node_get_local_key();
+	int cpt = 0;
+	while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
+	{
+		cpt++;
+		_starpu_datawizard_progress(local_node, 1);
+	}
+	if (cpt == STARPU_SPIN_MAXTRY)
+		_starpu_spin_lock(&handle->header_lock);
 
 
 	if (replicate->state == STARPU_INVALID)
 	if (replicate->state == STARPU_INVALID)
 	{
 	{
 		unsigned dst_node = replicate->memory_node;
 		unsigned dst_node = replicate->memory_node;
-		replicate->requested[dst_node] = 1;
+		replicate->requested |= 1UL << dst_node;
 	}
 	}
 
 
-//	_starpu_spin_unlock(&handle->header_lock);
+	_starpu_spin_unlock(&handle->header_lock);
 }
 }
 
 
 int starpu_prefetch_task_input_on_node(struct starpu_task *task, unsigned node)
 int starpu_prefetch_task_input_on_node(struct starpu_task *task, unsigned node)
@@ -686,7 +693,7 @@ int starpu_prefetch_task_input_on_node(struct starpu_task *task, unsigned node)
 		struct _starpu_data_replicate *replicate = &handle->per_node[node];
 		struct _starpu_data_replicate *replicate = &handle->per_node[node];
 		prefetch_data_on_node(handle, replicate, mode);
 		prefetch_data_on_node(handle, replicate, mode);
 
 
-		_starpu_set_data_requested_flag_if_needed(replicate);
+		_starpu_set_data_requested_flag_if_needed(handle, replicate);
 	}
 	}
 
 
 	return 0;
 	return 0;
@@ -880,7 +887,7 @@ unsigned _starpu_is_data_present_or_requested(starpu_data_handle_t handle, unsig
 
 
 		for (i = 0; i < nnodes; i++)
 		for (i = 0; i < nnodes; i++)
 		{
 		{
-			if (handle->per_node[node].requested[i] || handle->per_node[node].request[i])
+			if ((handle->per_node[node].requested & (1UL << i)) || handle->per_node[node].request[i])
 				ret = 1;
 				ret = 1;
 		}
 		}
 
 

+ 11 - 11
src/datawizard/coherency.h

@@ -48,26 +48,26 @@ LIST_TYPE(_starpu_data_replicate,
 
 
 	unsigned memory_node;
 	unsigned memory_node;
 
 
-	/* A buffer that is used for SCRATCH or reduction cannnot be used with
-	 * filters. */
-	unsigned relaxed_coherency;
-
-	/* We may need to initialize the replicate with some value before using it. */
-	unsigned initialized;
-
 	/* describes the state of the local data in term of coherency */
 	/* describes the state of the local data in term of coherency */
 	enum _starpu_cache_state	state;
 	enum _starpu_cache_state	state;
 
 
 	int refcnt;
 	int refcnt;
 
 
+	/* A buffer that is used for SCRATCH or reduction cannnot be used with
+	 * filters. */
+	unsigned relaxed_coherency:2;
+
+	/* We may need to initialize the replicate with some value before using it. */
+	unsigned initialized:1;
+
 	/* is the data locally allocated ? */
 	/* is the data locally allocated ? */
-	uint8_t allocated;
+	unsigned allocated:1;
 	/* was it automatically allocated ? (else it's the application-provided
 	/* was it automatically allocated ? (else it's the application-provided
 	 * buffer, don't ever try to free it!) */
 	 * buffer, don't ever try to free it!) */
 	/* perhaps the allocation was perform higher in the hiearchy
 	/* perhaps the allocation was perform higher in the hiearchy
 	 * for now this is just translated into !automatically_allocated
 	 * for now this is just translated into !automatically_allocated
 	 * */
 	 * */
-	uint8_t automatically_allocated;
+	unsigned automatically_allocated:1;
 
 
         /* Pointer to memchunk for LRU strategy */
         /* Pointer to memchunk for LRU strategy */
 	struct _starpu_mem_chunk * mc;
 	struct _starpu_mem_chunk * mc;
@@ -79,7 +79,7 @@ LIST_TYPE(_starpu_data_replicate,
 	   flag when it assigns a task to a queue, policies which do not
 	   flag when it assigns a task to a queue, policies which do not
 	   use this hint can simply ignore it.
 	   use this hint can simply ignore it.
 	 */
 	 */
-	uint8_t requested[STARPU_MAXNODES];
+	uint32_t requested;
 	struct _starpu_data_request *request[STARPU_MAXNODES];
 	struct _starpu_data_request *request[STARPU_MAXNODES];
 )
 )
 
 
@@ -207,7 +207,7 @@ struct _starpu_data_state
 	 * the end of the reduction. */
 	 * the end of the reduction. */
 	struct _starpu_data_requester_list *reduction_req_list;
 	struct _starpu_data_requester_list *reduction_req_list;
 
 
-	starpu_data_handle_t reduction_tmp_handles[STARPU_NMAXWORKERS];
+	starpu_data_handle_t *reduction_tmp_handles;
 
 
 	unsigned lazy_unregister;
 	unsigned lazy_unregister;
 
 

+ 2 - 1
src/datawizard/filters.c

@@ -176,6 +176,7 @@ void starpu_data_partition(starpu_data_handle_t initial_handle, struct starpu_da
 		/* initialize the chunk lock */
 		/* initialize the chunk lock */
 		child->req_list = _starpu_data_requester_list_new();
 		child->req_list = _starpu_data_requester_list_new();
 		child->reduction_req_list = _starpu_data_requester_list_new();
 		child->reduction_req_list = _starpu_data_requester_list_new();
+		child->reduction_tmp_handles = NULL;
 		child->refcnt = 0;
 		child->refcnt = 0;
 		child->busy_count = 0;
 		child->busy_count = 0;
 		child->busy_waiting = 0;
 		child->busy_waiting = 0;
@@ -240,10 +241,10 @@ void starpu_data_partition(starpu_data_handle_t initial_handle, struct starpu_da
 			child_replicate->automatically_allocated = 0;
 			child_replicate->automatically_allocated = 0;
 			child_replicate->refcnt = 0;
 			child_replicate->refcnt = 0;
 			child_replicate->memory_node = starpu_worker_get_memory_node(worker);
 			child_replicate->memory_node = starpu_worker_get_memory_node(worker);
+			child_replicate->requested = 0;
 
 
 			for (node = 0; node < STARPU_MAXNODES; node++)
 			for (node = 0; node < STARPU_MAXNODES; node++)
 			{
 			{
-				child_replicate->requested[node] = 0;
 				child_replicate->request[node] = NULL;
 				child_replicate->request[node] = NULL;
 			}
 			}
 
 

+ 2 - 1
src/datawizard/interfaces/data_interface.c

@@ -291,6 +291,7 @@ static void _starpu_register_new_data(starpu_data_handle_t handle,
 
 
 	handle->reduction_refcnt = 0;
 	handle->reduction_refcnt = 0;
 	handle->reduction_req_list = _starpu_data_requester_list_new();
 	handle->reduction_req_list = _starpu_data_requester_list_new();
+	handle->reduction_tmp_handles = NULL;
 
 
 #ifdef STARPU_USE_FXT
 #ifdef STARPU_USE_FXT
 	handle->last_submitted_ghost_sync_id_is_valid = 0;
 	handle->last_submitted_ghost_sync_id_is_valid = 0;
@@ -346,10 +347,10 @@ static void _starpu_register_new_data(starpu_data_handle_t handle,
 		replicate->state = STARPU_INVALID;
 		replicate->state = STARPU_INVALID;
 		replicate->refcnt = 0;
 		replicate->refcnt = 0;
 		replicate->handle = handle;
 		replicate->handle = handle;
+		replicate->requested = 0;
 
 
 		for (node = 0; node < STARPU_MAXNODES; node++)
 		for (node = 0; node < STARPU_MAXNODES; node++)
 		{
 		{
-			replicate->requested[node] = 0;
 			replicate->request[node] = NULL;
 			replicate->request[node] = NULL;
 		}
 		}
 
 

+ 5 - 1
src/datawizard/reduction.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -156,6 +156,8 @@ void _starpu_data_end_reduction_mode(starpu_data_handle_t handle)
 
 
 	/* Register all valid per-worker replicates */
 	/* Register all valid per-worker replicates */
 	unsigned nworkers = starpu_worker_get_count();
 	unsigned nworkers = starpu_worker_get_count();
+	STARPU_ASSERT(!handle->reduction_tmp_handles);
+	handle->reduction_tmp_handles = malloc(nworkers * sizeof(handle->reduction_tmp_handles[0]));
 	for (worker = 0; worker < nworkers; worker++)
 	for (worker = 0; worker < nworkers; worker++)
 	{
 	{
 		if (handle->per_worker[worker].initialized)
 		if (handle->per_worker[worker].initialized)
@@ -390,4 +392,6 @@ void _starpu_data_end_reduction_mode_terminate(starpu_data_handle_t handle)
 			/* TODO put in cache */
 			/* TODO put in cache */
 		}
 		}
 	}
 	}
+	free(handle->reduction_tmp_handles);
+	handle->reduction_tmp_handles = NULL;
 }
 }

+ 2 - 4
src/datawizard/user_interactions.c

@@ -519,9 +519,7 @@ void starpu_data_set_default_sequential_consistency_flag(unsigned flag)
 /* Query the status of the handle on the specified memory node. */
 /* Query the status of the handle on the specified memory node. */
 void starpu_data_query_status(starpu_data_handle_t handle, int memory_node, int *is_allocated, int *is_valid, int *is_requested)
 void starpu_data_query_status(starpu_data_handle_t handle, int memory_node, int *is_allocated, int *is_valid, int *is_requested)
 {
 {
-#ifdef STARPU_DEVEL
-#warning FIXME
-#endif
+// XXX : this is just a hint, so we don't take the lock ...
 //	_starpu_spin_lock(&handle->header_lock);
 //	_starpu_spin_lock(&handle->header_lock);
 
 
 	if (is_allocated)
 	if (is_allocated)
@@ -537,7 +535,7 @@ void starpu_data_query_status(starpu_data_handle_t handle, int memory_node, int
 		unsigned node;
 		unsigned node;
 		for (node = 0; node < STARPU_MAXNODES; node++)
 		for (node = 0; node < STARPU_MAXNODES; node++)
 		{
 		{
-			if (handle->per_node[memory_node].requested[node])
+			if (handle->per_node[memory_node].requested & (1UL << node))
 			{
 			{
 				requested = 1;
 				requested = 1;
 				break;
 				break;

+ 109 - 7
src/debug/traces/starpu_fxt.c

@@ -275,6 +275,18 @@ static void worker_set_state(double time, const char *prefix, long unsigned int
 #endif
 #endif
 }
 }
 
 
+static void worker_set_detailed_state(double time, const char *prefix, long unsigned int workerid, const char *name, unsigned long size, unsigned long footprint, unsigned long long tag)
+{
+#ifdef STARPU_HAVE_POTI
+	char container[STARPU_POTI_STR_LEN];
+	thread_container_alias(container, STARPU_POTI_STR_LEN, prefix, workerid);
+	/* TODO: set detailed state */
+	poti_SetState(time, container, "S", name);
+#else
+	fprintf(out_paje_file, "20	%.9f	%st%lu	S	%s	%lu	%08lx	%016llx\n", time, prefix, workerid, name, size, footprint, tag);
+#endif
+}
+
 static void worker_push_state(double time, const char *prefix, long unsigned int workerid, const char *name)
 static void worker_push_state(double time, const char *prefix, long unsigned int workerid, const char *name)
 {
 {
 #ifdef STARPU_HAVE_POTI
 #ifdef STARPU_HAVE_POTI
@@ -631,11 +643,8 @@ static void handle_start_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_op
 	int worker;
 	int worker;
 	worker = find_worker_id(ev->param[2]);
 	worker = find_worker_id(ev->param[2]);
 
 
-	unsigned sched_ctx = ev->param[1];
 	if (worker < 0) return;
 	if (worker < 0) return;
 
 
-	char *prefix = options->file_prefix;
-
 	unsigned long has_name = ev->param[3];
 	unsigned long has_name = ev->param[3];
 	char *name = has_name?(char *)&ev->param[4]:"unknown";
 	char *name = has_name?(char *)&ev->param[4]:"unknown";
 
 
@@ -646,8 +655,12 @@ static void handle_start_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_op
 
 
 	create_paje_state_if_not_found(name, options);
 	create_paje_state_if_not_found(name, options);
 
 
+#ifndef STARPU_ENABLE_PAJE_CODELET_DETAILS
 	if (out_paje_file)
 	if (out_paje_file)
 	{
 	{
+		char *prefix = options->file_prefix;
+		unsigned sched_ctx = ev->param[1];
+
 		worker_set_state(start_codelet_time, prefix, ev->param[2], name);
 		worker_set_state(start_codelet_time, prefix, ev->param[2], name);
 		if (sched_ctx != 0)
 		if (sched_ctx != 0)
 		{
 		{
@@ -662,9 +675,40 @@ static void handle_start_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_op
 #endif
 #endif
 		}
 		}
 	}
 	}
+#endif /* STARPU_ENABLE_PAJE_CODELET_DETAILS */
 
 
 }
 }
 
 
+static void handle_codelet_details(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
+{
+#ifdef STARPU_ENABLE_PAJE_CODELET_DETAILS
+	int worker;
+	worker = find_worker_id(ev->param[5]);
+
+	unsigned sched_ctx = ev->param[1];
+	if (worker < 0) return;
+
+	char *prefix = options->file_prefix;
+
+	if (out_paje_file)
+	{
+		worker_set_detailed_state(last_codelet_start[worker], prefix, ev->param[5], last_codelet_symbol[worker], ev->param[2], ev->param[3], ev->param[4]);
+		if (sched_ctx != 0)
+		{
+#ifdef STARPU_HAVE_POTI
+			char container[STARPU_POTI_STR_LEN];
+			char ctx[6];
+			snprintf(ctx, sizeof(ctx), "Ctx%d", sched_ctx);
+			thread_container_alias(container, STARPU_POTI_STR_LEN, prefix, ev->param[5]);
+			poti_SetState(last_codelet_start[worker], container, ctx, last_codelet_symbol[worker]);
+#else
+			fprintf(out_paje_file, "20	%.9f	%st%"PRIu64"	Ctx%d	%s	%08lx	%lu	%016llx\n", last_codelet_start[worker], prefix, ev->param[2], sched_ctx, last_codelet_symbol[worker], (unsigned long) ev->param[2], (unsigned long) ev->param[3], (unsigned long long) ev->param[4]);
+#endif
+		}
+	}
+#endif /* STARPU_ENABLE_PAJE_CODELET_DETAILS */
+}
+
 static long dumped_codelets_count;
 static long dumped_codelets_count;
 static struct starpu_fxt_codelet_event *dumped_codelets;
 static struct starpu_fxt_codelet_event *dumped_codelets;
 
 
@@ -727,7 +771,7 @@ static void handle_user_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *o
 #ifdef STARPU_HAVE_POTI
 #ifdef STARPU_HAVE_POTI
 			program_container_alias (container, STARPU_POTI_STR_LEN, prefix);
 			program_container_alias (container, STARPU_POTI_STR_LEN, prefix);
 #else
 #else
-			fprintf(out_paje_file, "9	%.9f	event	%sp	%lu\n", get_event_time_stamp(ev, options), prefix, code);
+			fprintf(out_paje_file, "9	%.9f	user_event	%sp	%lu\n", get_event_time_stamp(ev, options), prefix, code);
 #endif
 #endif
 	}
 	}
 	else
 	else
@@ -736,12 +780,12 @@ static void handle_user_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *o
 #ifdef STARPU_HAVE_POTI
 #ifdef STARPU_HAVE_POTI
 			thread_container_alias (container, STARPU_POTI_STR_LEN, prefix, ev->param[1]);
 			thread_container_alias (container, STARPU_POTI_STR_LEN, prefix, ev->param[1]);
 #else
 #else
-			fprintf(out_paje_file, "9	%.9f	event	%st%"PRIu64"	%lu\n", get_event_time_stamp(ev, options), prefix, ev->param[1], code);
+			fprintf(out_paje_file, "9	%.9f	user_event	%st%"PRIu64"	%lu\n", get_event_time_stamp(ev, options), prefix, ev->param[1], code);
 #endif
 #endif
 	}
 	}
 #ifdef STARPU_HAVE_POTI
 #ifdef STARPU_HAVE_POTI
 	if (out_paje_file)
 	if (out_paje_file)
-		poti_NewEvent(get_event_time_stamp(ev, options), container, "thread_event", paje_value);
+		poti_NewEvent(get_event_time_stamp(ev, options), container, "user_event", paje_value);
 #endif
 #endif
 }
 }
 
 
@@ -916,6 +960,40 @@ static void handle_start_driver_copy(struct fxt_ev_64 *ev, struct starpu_fxt_opt
 
 
 }
 }
 
 
+
+static void handle_work_stealing(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
+{
+	unsigned dst = ev->param[0];
+	unsigned src = ev->param[1];
+	unsigned size = 0;
+	unsigned comid = 0;
+	
+	char *prefix = options->file_prefix;
+
+	
+	if (out_paje_file)
+	{
+		double time = get_event_time_stamp(ev, options);
+#ifdef STARPU_HAVE_POTI
+		char paje_value[STARPU_POTI_STR_LEN], paje_key[STARPU_POTI_STR_LEN], src_worker_container[STARPU_POTI_STR_LEN], dst_worker_container[STARPU_POTI_STR_LEN];
+		char program_container[STARPU_POTI_STR_LEN];
+		snprintf(paje_value, STARPU_POTI_STR_LEN, "%u", size);
+		snprintf(paje_key, STARPU_POTI_STR_LEN, "steal_%u", comid);
+		program_container_alias(program_container, STARPU_POTI_STR_LEN, prefix);
+		worker_container_alias(src_worker_container, STARPU_POTI_STR_LEN, prefix, src);
+		worker_container_alias(dst_worker_container, STARPU_POTI_STR_LEN, prefix, dst);
+		poti_StartLink(time, program_container, "L", src_worker_container, paje_value, paje_key);
+		poti_EndLink(time+0.000000001, program_container, "L", dst_worker_container, paje_value, paje_key);
+#else
+
+		fprintf(out_paje_file, "18	%.9f	L	%sp	%u	%sw%d	steal_%u\n", time, prefix, size, prefix, src, comid);
+		fprintf(out_paje_file, "19	%.9f	L	%sp	%u	%sw%d	steal_%u\n", time+0.000000001, prefix, size, prefix, dst, comid);
+#endif
+	}
+
+}
+
+
 static void handle_end_driver_copy(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
 static void handle_end_driver_copy(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
 {
 {
 	unsigned dst = ev->param[1];
 	unsigned dst = ev->param[1];
@@ -1380,6 +1458,23 @@ static void handle_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *option
 	}
 	}
 }
 }
 
 
+static void handle_thread_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
+{
+	/* Add an event in the trace */
+	if (out_paje_file)
+	{
+		char *event = (char*)&ev->param[1];
+
+#ifdef STARPU_HAVE_POTI
+		char container[STARPU_POTI_STR_LEN];
+		thread_container_alias(container, STARPU_POTI_STR_LEN, options->file_prefix, ev->param[0]);
+		poti_NewEvent(get_event_time_stamp(ev, options), container, "thread_event", event);
+#else
+		fprintf(out_paje_file, "9	%.9f	thread_event	%st%"PRIu64"	%s\n", get_event_time_stamp(ev, options), options->file_prefix, ev->param[0], event);
+#endif
+	}
+}
+
 static
 static
 void _starpu_fxt_display_bandwidth(struct starpu_fxt_options *options)
 void _starpu_fxt_display_bandwidth(struct starpu_fxt_options *options)
 {
 {
@@ -1507,6 +1602,9 @@ void starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *opt
 			case _STARPU_FUT_START_CODELET_BODY:
 			case _STARPU_FUT_START_CODELET_BODY:
 				handle_start_codelet_body(&ev, options);
 				handle_start_codelet_body(&ev, options);
 				break;
 				break;
+			case _STARPU_FUT_CODELET_DETAILS:
+				handle_codelet_details(&ev, options);
+				break;
 			case _STARPU_FUT_END_CODELET_BODY:
 			case _STARPU_FUT_END_CODELET_BODY:
 				handle_end_codelet_body(&ev, options);
 				handle_end_codelet_body(&ev, options);
 				break;
 				break;
@@ -1624,7 +1722,7 @@ void starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *opt
 				break;
 				break;
 
 
 			case _STARPU_FUT_WORK_STEALING:
 			case _STARPU_FUT_WORK_STEALING:
-				/* XXX */
+				handle_work_stealing(&ev, options);
 				break;
 				break;
 
 
 			case _STARPU_FUT_WORKER_DEINIT_START:
 			case _STARPU_FUT_WORKER_DEINIT_START:
@@ -1797,6 +1895,10 @@ void starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *opt
 				handle_event(&ev, options);
 				handle_event(&ev, options);
 				break;
 				break;
 
 
+			case _STARPU_FUT_THREAD_EVENT:
+				handle_thread_event(&ev, options);
+				break;
+
 			case _STARPU_FUT_LOCKING_MUTEX:
 			case _STARPU_FUT_LOCKING_MUTEX:
 				break;
 				break;
 
 

+ 13 - 0
src/debug/traces/starpu_paje.c

@@ -130,6 +130,17 @@ void _starpu_fxt_write_paje_header(FILE *file)
 	fprintf(file, "%%	DestContainer	string\n");
 	fprintf(file, "%%	DestContainer	string\n");
 	fprintf(file, "%%	Key	string\n");
 	fprintf(file, "%%	Key	string\n");
 	fprintf(file, "%%EndEventDef\n");
 	fprintf(file, "%%EndEventDef\n");
+#ifdef STARPU_ENABLE_PAJE_CODELET_DETAILS
+	fprintf(file, "%%EventDef PajeSetState 20\n");
+	fprintf(file, "%%	Time	date\n");
+	fprintf(file, "%%	Container	string\n");
+	fprintf(file, "%%	Type	string\n");
+	fprintf(file, "%%	Value	string\n");
+	fprintf(file, "%%	Size	string\n");
+	fprintf(file, "%%	Footprint	string\n");
+	fprintf(file, "%%	Tag	string\n");
+	fprintf(file, "%%EndEventDef\n");
+#endif
 #endif
 #endif
 
 
 #ifdef STARPU_HAVE_POTI
 #ifdef STARPU_HAVE_POTI
@@ -156,6 +167,7 @@ void _starpu_fxt_write_paje_header(FILE *file)
 	poti_DefineEntityValue("No", "MS", "Nothing", ".0 .0 .0");
 	poti_DefineEntityValue("No", "MS", "Nothing", ".0 .0 .0");
 
 
 	/* Types for the Worker of the Memory Node */
 	/* Types for the Worker of the Memory Node */
+	poti_DefineEventType("user_event", "T", "user event type");
 	poti_DefineEventType("thread_event", "T", "thread event type");
 	poti_DefineEventType("thread_event", "T", "thread event type");
 	poti_DefineStateType("S", "T", "Thread State");
 	poti_DefineStateType("S", "T", "Thread State");
 	poti_DefineEntityValue("I", "S", "Initializing", "0.0 .7 1.0");
 	poti_DefineEntityValue("I", "S", "Initializing", "0.0 .7 1.0");
@@ -220,6 +232,7 @@ void _starpu_fxt_write_paje_header(FILE *file)
 1       MPICt   T       \"MPI Communication Thread\"              \n\
 1       MPICt   T       \"MPI Communication Thread\"              \n\
 1       Sc       P       \"Scheduler State\"                        \n\
 1       Sc       P       \"Scheduler State\"                        \n\
 2       prog_event   P       \"program event type\"				\n\
 2       prog_event   P       \"program event type\"				\n\
+2       user_event   T       \"user event type\"				\n\
 2       thread_event   T       \"thread event type\"				\n\
 2       thread_event   T       \"thread event type\"				\n\
 2       MPIev   MPICt    \"MPI event type\"			\n\
 2       MPIev   MPICt    \"MPI event type\"			\n\
 3       S       T       \"Thread State\"                        \n\
 3       S       T       \"Thread State\"                        \n\

+ 1 - 1
src/drivers/cpu/driver_cpu.c

@@ -89,7 +89,7 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_
 	}
 	}
 
 
 	/* Give profiling variable */
 	/* Give profiling variable */
-	_starpu_driver_start_job(cpu_args, j, &codelet_start, rank, profiling);
+	_starpu_driver_start_job(cpu_args, j, perf_arch, &codelet_start, rank, profiling);
 
 
 	/* In case this is a Fork-join parallel task, the worker does not
 	/* In case this is a Fork-join parallel task, the worker does not
 	 * execute the kernel at all. */
 	 * execute the kernel at all. */

+ 7 - 3
src/drivers/cuda/driver_cuda.c

@@ -396,7 +396,7 @@ static int start_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *args)
 		return -EAGAIN;
 		return -EAGAIN;
 	}
 	}
 
 
-	_starpu_driver_start_job(args, j, &j->cl_start, 0, profiling);
+	_starpu_driver_start_job(args, j, &args->perf_arch, &j->cl_start, 0, profiling);
 
 
 #if defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
 #if defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
 	/* We make sure we do manipulate the proper device */
 	/* We make sure we do manipulate the proper device */
@@ -517,7 +517,10 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 	unsigned memnode = worker0->memory_node;
 	unsigned memnode = worker0->memory_node;
 	struct starpu_task *tasks[worker_set->nworkers], *task;
 	struct starpu_task *tasks[worker_set->nworkers], *task;
 	struct _starpu_job *j;
 	struct _starpu_job *j;
-	int i, res, idle;
+	int i, res;
+
+#ifndef STARPU_SIMGRID
+	int idle;
 
 
 	/* First poll for completed jobs */
 	/* First poll for completed jobs */
 	idle = 0;
 	idle = 0;
@@ -540,13 +543,13 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 		if (cures != cudaSuccess)
 		if (cures != cudaSuccess)
 		{
 		{
 			STARPU_ASSERT(cures == cudaErrorNotReady);
 			STARPU_ASSERT(cures == cudaErrorNotReady);
-			idle++;
 		}
 		}
 		else
 		else
 		{
 		{
 			/* Asynchronous task completed! */
 			/* Asynchronous task completed! */
 			_starpu_set_local_worker_key(args);
 			_starpu_set_local_worker_key(args);
 			finish_job_on_cuda(_starpu_get_job_associated_to_task(task), args);
 			finish_job_on_cuda(_starpu_get_job_associated_to_task(task), args);
+			idle++;
 		}
 		}
 	}
 	}
 
 
@@ -556,6 +559,7 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 		__starpu_datawizard_progress(memnode, 1, 0);
 		__starpu_datawizard_progress(memnode, 1, 0);
 		return 0;
 		return 0;
 	}
 	}
+#endif /* STARPU_SIMGRID */
 
 
 	/* Something done, make some progress */
 	/* Something done, make some progress */
 	__starpu_datawizard_progress(memnode, 1, 1);
 	__starpu_datawizard_progress(memnode, 1, 1);

+ 4 - 2
src/drivers/driver_common/driver_common.c

@@ -34,7 +34,7 @@
 #define BACKOFF_MAX 32  /* TODO : use parameter to define them */
 #define BACKOFF_MAX 32  /* TODO : use parameter to define them */
 #define BACKOFF_MIN 1
 #define BACKOFF_MIN 1
 
 
-void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j, struct timespec *codelet_start, int rank, int profiling)
+void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch, struct timespec *codelet_start, int rank, int profiling)
 {
 {
 	struct starpu_task *task = j->task;
 	struct starpu_task *task = j->task;
 	struct starpu_codelet *cl = task->cl;
 	struct starpu_codelet *cl = task->cl;
@@ -74,7 +74,7 @@ void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j
 	if (starpu_top)
 	if (starpu_top)
 		_starpu_top_task_started(task,workerid,codelet_start);
 		_starpu_top_task_started(task,workerid,codelet_start);
 
 
-	_STARPU_TRACE_START_CODELET_BODY(j);
+	_STARPU_TRACE_START_CODELET_BODY(j, j->nimpl, perf_arch);
 }
 }
 
 
 void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch STARPU_ATTRIBUTE_UNUSED, struct timespec *codelet_end, int rank, int profiling)
 void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch STARPU_ATTRIBUTE_UNUSED, struct timespec *codelet_end, int rank, int profiling)
@@ -398,6 +398,7 @@ int _starpu_get_multi_worker_task(struct _starpu_worker *workers, struct starpu_
 		/*else try to pop a task*/
 		/*else try to pop a task*/
 		else
 		else
 		{
 		{
+			_starpu_worker_set_status_scheduling(workers[i].workerid);
 			STARPU_PTHREAD_MUTEX_LOCK(&workers[i].sched_mutex);
 			STARPU_PTHREAD_MUTEX_LOCK(&workers[i].sched_mutex);
 			_starpu_set_local_worker_key(&workers[i]);
 			_starpu_set_local_worker_key(&workers[i]);
 			tasks[i] = _starpu_pop_task(&workers[i]);
 			tasks[i] = _starpu_pop_task(&workers[i]);
@@ -427,6 +428,7 @@ int _starpu_get_multi_worker_task(struct _starpu_worker *workers, struct starpu_
 					workers[i].current_rank = 0;
 					workers[i].current_rank = 0;
 				}
 				}
 
 
+				_starpu_worker_set_status_scheduling_done(workers[i].workerid);
 				_starpu_worker_set_status_wakeup(workers[i].workerid);
 				_starpu_worker_set_status_wakeup(workers[i].workerid);
 			}
 			}
 			else
 			else

+ 2 - 2
src/drivers/driver_common/driver_common.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2012, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -23,7 +23,7 @@
 #include <core/jobs.h>
 #include <core/jobs.h>
 #include <common/utils.h>
 #include <common/utils.h>
 
 
-void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j,
+void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch,
 			      struct timespec *codelet_start, int rank, int profiling);
 			      struct timespec *codelet_start, int rank, int profiling);
 void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch,
 void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch,
 			    struct timespec *codelet_end, int rank, int profiling);
 			    struct timespec *codelet_end, int rank, int profiling);

+ 1 - 1
src/drivers/mp_common/source_common.c

@@ -421,7 +421,7 @@ static int _starpu_src_common_execute(struct _starpu_job *j,
 
 
 	void (*kernel)(void)  = node->get_kernel_from_job(node,j);
 	void (*kernel)(void)  = node->get_kernel_from_job(node,j);
 
 
-	_starpu_driver_start_job(worker, j, &j->cl_start, 0, profiling);
+	_starpu_driver_start_job(worker, j, &worker->perf_arch, &j->cl_start, 0, profiling);
 
 
 
 
 	//_STARPU_DEBUG("\nworkerid:%d, rank:%d, type:%d,	cb_workerid:%d, task_size:%d\n\n",worker->devid,worker->current_rank,task->cl->type,j->combined_workerid,j->task_size);
 	//_STARPU_DEBUG("\nworkerid:%d, rank:%d, type:%d,	cb_workerid:%d, task_size:%d\n\n",worker->devid,worker->current_rank,task->cl->type,j->combined_workerid,j->task_size);

+ 4 - 2
src/drivers/opencl/driver_opencl.c

@@ -619,6 +619,7 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *args)
 	struct starpu_task *task;
 	struct starpu_task *task;
 	int res;
 	int res;
 
 
+#ifndef STARPU_SIMGRID
 	task = starpu_task_get_current();
 	task = starpu_task_get_current();
 
 
 	if (task)
 	if (task)
@@ -642,6 +643,7 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *args)
 		/* Asynchronous task completed! */
 		/* Asynchronous task completed! */
 		_starpu_opencl_stop_job(_starpu_get_job_associated_to_task(task), args);
 		_starpu_opencl_stop_job(_starpu_get_job_associated_to_task(task), args);
 	}
 	}
+#endif /* STARPU_SIMGRID */
 
 
 	__starpu_datawizard_progress(memnode, 1, 1);
 	__starpu_datawizard_progress(memnode, 1, 1);
 
 
@@ -700,7 +702,7 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *args)
 	else
 	else
 #else
 #else
 #ifdef STARPU_DEVEL
 #ifdef STARPU_DEVEL
-#warning No CUDA asynchronous execution with simgrid yet.
+#warning No OpenCL asynchronous execution with simgrid yet.
 #endif
 #endif
 #endif
 #endif
 	/* Synchronous execution */
 	/* Synchronous execution */
@@ -823,7 +825,7 @@ static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker
 		return -EAGAIN;
 		return -EAGAIN;
 	}
 	}
 
 
-	_starpu_driver_start_job(args, j, &j->cl_start, 0, profiling);
+	_starpu_driver_start_job(args, j, &args->perf_arch, &j->cl_start, 0, profiling);
 
 
 	starpu_opencl_func_t func = _starpu_task_get_opencl_nth_implementation(cl, j->nimpl);
 	starpu_opencl_func_t func = _starpu_task_get_opencl_nth_implementation(cl, j->nimpl);
 	STARPU_ASSERT_MSG(func, "when STARPU_OPENCL is defined in 'where', opencl_func or opencl_funcs has to be defined");
 	STARPU_ASSERT_MSG(func, "when STARPU_OPENCL is defined in 'where', opencl_func or opencl_funcs has to be defined");

+ 17 - 10
src/sched_policies/deque_modeling_policy_data_aware.c

@@ -286,6 +286,13 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 	struct _starpu_dmda_data *dt = (struct _starpu_dmda_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
 	struct _starpu_dmda_data *dt = (struct _starpu_dmda_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
 	/* make sure someone coule execute that task ! */
 	/* make sure someone coule execute that task ! */
 	STARPU_ASSERT(best_workerid != -1);
 	STARPU_ASSERT(best_workerid != -1);
+	unsigned child_sched_ctx = starpu_sched_ctx_worker_is_master_for_child_ctx(best_workerid, sched_ctx_id);
+        if(child_sched_ctx != STARPU_NMAX_SCHED_CTXS)
+        {
+		starpu_sched_ctx_revert_task_counters(sched_ctx_id, task->flops);
+                starpu_sched_ctx_move_task_to_ctx(task, child_sched_ctx);
+                return 0;
+        }
 
 
 	struct _starpu_fifo_taskq *fifo = dt->queue_array[best_workerid];
 	struct _starpu_fifo_taskq *fifo = dt->queue_array[best_workerid];
 
 
@@ -405,9 +412,9 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio, unsigned sched
 	if(workers->init_iterator)
 	if(workers->init_iterator)
 		workers->init_iterator(workers, &it);
 		workers->init_iterator(workers, &it);
 
 
-	while(workers->has_next(workers, &it))
+	while(workers->has_next_master(workers, &it))
 	{
 	{
-		worker = workers->get_next(workers, &it);
+		worker = workers->get_next_master(workers, &it);
 		struct _starpu_fifo_taskq *fifo  = dt->queue_array[worker];
 		struct _starpu_fifo_taskq *fifo  = dt->queue_array[worker];
 		unsigned memory_node = starpu_worker_get_memory_node(worker);
 		unsigned memory_node = starpu_worker_get_memory_node(worker);
 		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker);
 		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker);
@@ -543,9 +550,9 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 	if(workers->init_iterator)
 	if(workers->init_iterator)
 		workers->init_iterator(workers, &it);
 		workers->init_iterator(workers, &it);
 
 
-	while(workers->has_next(workers, &it))
+	while(workers->has_next_master(workers, &it))
 	{
 	{
-		worker = workers->get_next(workers, &it);
+		worker = workers->get_next_master(workers, &it);
 
 
 		struct _starpu_fifo_taskq *fifo = dt->queue_array[worker];
 		struct _starpu_fifo_taskq *fifo = dt->queue_array[worker];
 		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker);
 		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker);
@@ -692,10 +699,6 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 
 
 	double fitness[nworkers_ctx][STARPU_MAXIMPLEMENTATIONS];
 	double fitness[nworkers_ctx][STARPU_MAXIMPLEMENTATIONS];
 
 
-	struct starpu_sched_ctx_iterator it;
-	if(workers->init_iterator)
-		workers->init_iterator(workers, &it);
-
 	compute_all_performance_predictions(task,
 	compute_all_performance_predictions(task,
 					    nworkers_ctx,
 					    nworkers_ctx,
 					    local_task_length,
 					    local_task_length,
@@ -712,9 +715,13 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 	unsigned nimpl;
 	unsigned nimpl;
 	if (forced_best == -1)
 	if (forced_best == -1)
 	{
 	{
-		while(workers->has_next(workers, &it))
+		struct starpu_sched_ctx_iterator it;
+		if(workers->init_iterator)
+			workers->init_iterator(workers, &it);
+
+		while(workers->has_next_master(workers, &it))
 		{
 		{
-			worker = workers->get_next(workers, &it);
+			worker = workers->get_next_master(workers, &it);
 			for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
 			for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
 			{
 			{
 				if (!starpu_worker_can_execute_task(worker, task, nimpl))
 				if (!starpu_worker_can_execute_task(worker, task, nimpl))

+ 13 - 2
src/sched_policies/eager_central_policy.c

@@ -94,9 +94,9 @@ static int push_task_eager_policy(struct starpu_task *task)
 	if(workers->init_iterator)
 	if(workers->init_iterator)
 		workers->init_iterator(workers, &it);
 		workers->init_iterator(workers, &it);
 	
 	
-	while(workers->has_next(workers, &it))
+	while(workers->has_next_master(workers, &it))
 	{
 	{
-		worker = workers->get_next(workers, &it);
+		worker = workers->get_next_master(workers, &it);
 
 
 #ifdef STARPU_NON_BLOCKING_DRIVERS
 #ifdef STARPU_NON_BLOCKING_DRIVERS
 		if (!starpu_bitmap_get(data->waiters, worker))
 		if (!starpu_bitmap_get(data->waiters, worker))
@@ -167,6 +167,17 @@ static struct starpu_task *pop_task_eager_policy(unsigned sched_ctx_id)
 
 
 	STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex);
 
 
+	if(task)
+	{
+		unsigned child_sched_ctx = starpu_sched_ctx_worker_is_master_for_child_ctx(workerid, sched_ctx_id);
+		if(child_sched_ctx != STARPU_NMAX_SCHED_CTXS)
+		{
+			starpu_sched_ctx_revert_task_counters(sched_ctx_id, task->flops);
+			starpu_sched_ctx_move_task_to_ctx(task, child_sched_ctx);
+			return NULL;
+		}
+	}
+
 	return task;
 	return task;
 }
 }
 
 

+ 373 - 0
src/sched_policies/locality_work_stealing_policy.c

@@ -0,0 +1,373 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012  INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/* Work stealing policy */
+
+#include <float.h>
+
+#include <core/workers.h>
+#include <sched_policies/fifo_queues.h>
+#include <core/debug.h>
+#include <starpu_bitmap.h>
+
+/* Per-scheduling-context data of the locality work-stealing policy */
+struct _starpu_lws_data
+{
+	struct _starpu_fifo_taskq **queue_array; /* one task FIFO per worker, indexed by workerid */
+	int **proxlist; /* per-worker proximity list of workerids; only allocated when hwloc is available */
+	unsigned last_pop_worker; /* next victim candidate for round-robin stealing (non-hwloc builds) */
+	unsigned last_push_worker; /* next round-robin target for pushes coming from the main thread */
+};
+
+
+#ifdef STARPU_HAVE_HWLOC
+
+/* Return a worker to steal a task from. The worker is selected
+ * according to the proximity list built using the info on the
+ * architecture provided by hwloc: the first neighbour with a
+ * non-empty queue wins, so closer workers are preferred. */
+static unsigned select_victim_neighborhood(unsigned sched_ctx_id, int workerid)
+{
+
+	struct _starpu_lws_data *ws = (struct _starpu_lws_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
+
+	int nworkers = starpu_sched_ctx_get_nworkers(sched_ctx_id);
+
+	int i;
+	int neighbor;
+	for(i=0; i<nworkers; i++){
+		neighbor = ws->proxlist[workerid][i];
+		/* ntasks is read without the victim's lock: an estimation is
+		 * enough here, stealing re-checks under the lock */
+		int ntasks = ws->queue_array[neighbor]->ntasks;
+		
+		if (ntasks)
+			return neighbor;
+	}
+
+	/* No neighbour has queued work: fall back to our own queue */
+	return workerid;
+}
+#else
+/* Return a worker to steal a task from. The worker is selected
+ * in a round-robin fashion, starting right after the previous
+ * victim, and stopping after one full turn with no work found. */
+static unsigned select_victim_round_robin(unsigned sched_ctx_id)
+{
+	struct _starpu_lws_data *ws = (struct _starpu_lws_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
+	unsigned worker = ws->last_pop_worker;
+	unsigned nworkers = starpu_sched_ctx_get_nworkers(sched_ctx_id);
+
+	/* If the worker's queue is empty, let's try
+	 * the next ones */
+	while (1)
+	{
+		unsigned ntasks;
+
+		/* ntasks is read without the victim's lock: an estimation is
+		 * enough here, the actual steal re-checks under the lock.
+		 * (The previous version also fetched the victim's sched
+		 * mutex/cond here without ever using them — dead work.) */
+		ntasks = ws->queue_array[worker]->ntasks;
+		if (ntasks)
+			break;
+
+		worker = (worker + 1) % nworkers;
+		if (worker == ws->last_pop_worker)
+		{
+			/* We got back to the first worker,
+			 * don't go in infinite loop */
+			break;
+		}
+	}
+
+	ws->last_pop_worker = (worker + 1) % nworkers;
+
+	return worker;
+}
+
+
+#endif
+
+
+/**
+ * Return a worker on whose queue a task should be pushed.
+ * Selection is done in a round-robin fashion over the context's workers.
+ */
+static unsigned select_worker_round_robin(unsigned sched_ctx_id)
+{
+	struct _starpu_lws_data *ws = (struct _starpu_lws_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
+	unsigned worker = ws->last_push_worker;
+	unsigned nworkers = starpu_sched_ctx_get_nworkers(sched_ctx_id);
+	/* TODO: use an atomic update operation for this
+	 * (concurrent pushes may race on last_push_worker) */
+	ws->last_push_worker = (ws->last_push_worker + 1) % nworkers;
+
+	return worker;
+}
+
+
+/**
+ * Return a worker from which a task can be stolen.
+ * Uses the hwloc proximity list when available, plain round-robin otherwise.
+ */
+static inline unsigned select_victim(unsigned sched_ctx_id, int workerid)
+{
+
+#ifdef STARPU_HAVE_HWLOC
+	return select_victim_neighborhood(sched_ctx_id, workerid);
+#else
+	return select_victim_round_robin(sched_ctx_id);
+#endif
+}
+
+/**
+ * Return a worker on whose queue a task can be pushed. This is only
+ * needed when the push is done by the master (i.e. a non-worker thread).
+ */
+static inline unsigned select_worker(unsigned sched_ctx_id)
+{
+	return select_worker_round_robin(sched_ctx_id);
+}
+
+
+/* Pop a task for the calling worker: first from its own queue, then by
+ * stealing from a selected victim, then by re-checking its own queue
+ * (a task may have been pushed to it while its mutex was released).
+ * Entered with the worker's sched mutex held; returns with it held. */
+static struct starpu_task *lws_pop_task(unsigned sched_ctx_id)
+{
+	struct _starpu_lws_data *ws = (struct _starpu_lws_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
+
+	struct starpu_task *task = NULL;
+
+	int workerid = starpu_worker_get_id();
+
+	/* only workers may pop; the main thread never calls this */
+	STARPU_ASSERT(workerid != -1);
+
+	task = _starpu_fifo_pop_task(ws->queue_array[workerid], workerid);
+	if (task)
+	{
+		/* there was a local task */
+		return task;
+	}
+	starpu_pthread_mutex_t *worker_sched_mutex;
+	starpu_pthread_cond_t *worker_sched_cond;
+	starpu_worker_get_sched_condition(workerid, &worker_sched_mutex, &worker_sched_cond);
+
+	/* Note: Releasing this mutex before taking the victim mutex, to avoid interlock*/
+	STARPU_PTHREAD_MUTEX_UNLOCK(worker_sched_mutex);
+       
+
+	/* we need to steal someone's job */
+	unsigned victim = select_victim(sched_ctx_id, workerid);
+
+	starpu_pthread_mutex_t *victim_sched_mutex;
+	starpu_pthread_cond_t *victim_sched_cond;
+
+	starpu_worker_get_sched_condition(victim, &victim_sched_mutex, &victim_sched_cond);
+	STARPU_PTHREAD_MUTEX_LOCK(victim_sched_mutex);
+
+	task = _starpu_fifo_pop_task(ws->queue_array[victim], workerid);
+	if (task)
+	{
+		/* record the steal in the trace for offline analysis */
+		_STARPU_TRACE_WORK_STEALING(workerid, victim);
+	}
+
+	STARPU_PTHREAD_MUTEX_UNLOCK(victim_sched_mutex);
+
+	/* re-take our own mutex before touching our queue / returning */
+	STARPU_PTHREAD_MUTEX_LOCK(worker_sched_mutex);
+	if(!task)
+	{
+		/* the steal failed: something may have landed on our own
+		 * queue while we did not hold our mutex */
+		task = _starpu_fifo_pop_task(ws->queue_array[workerid], workerid);
+		if (task)
+		{
+			/* there was a local task */
+			return task;
+		}
+	}
+
+	return task;
+}
+
+/* Push a task on the local queue of the calling worker (or, when called
+ * from a non-worker thread, on the queue of a worker chosen round-robin),
+ * then wake the context's workers so one of them can pick it up. */
+static int lws_push_task(struct starpu_task *task)
+{
+	unsigned sched_ctx_id = task->sched_ctx;
+	struct _starpu_lws_data *ws = (struct _starpu_lws_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
+
+	int workerid = starpu_worker_get_id();
+
+	/* If the current thread is not a worker but
+	 * the main thread (-1), we find the better one to
+	 * put task on its queue */
+	if (workerid == -1)
+		workerid = select_worker(sched_ctx_id);
+
+	starpu_pthread_mutex_t *sched_mutex;
+	starpu_pthread_cond_t *sched_cond;
+	starpu_worker_get_sched_condition(workerid, &sched_mutex, &sched_cond);
+
+	/* the per-worker queue is protected by that worker's sched mutex */
+	STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
+
+	_starpu_fifo_push_task(ws->queue_array[workerid], task);
+
+	starpu_push_task_end(task);
+
+	STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
+
+#ifndef STARPU_NON_BLOCKING_DRIVERS
+	/* Blocking drivers sleep on their sched condition: signal every
+	 * worker of the context so a sleeping one can come steal the task. */
+	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
+	struct starpu_sched_ctx_iterator it;
+	int worker;	/* BUGFIX: was used below without being declared */
+	if(workers->init_iterator)
+		workers->init_iterator(workers, &it);
+	while(workers->has_next(workers, &it))
+	{
+		worker = workers->get_next(workers, &it);
+		/* renamed so we do not shadow the outer sched_mutex/sched_cond */
+		starpu_pthread_mutex_t *worker_sched_mutex;
+		starpu_pthread_cond_t *worker_sched_cond;
+		starpu_worker_get_sched_condition(worker, &worker_sched_mutex, &worker_sched_cond);
+		STARPU_PTHREAD_COND_SIGNAL(worker_sched_cond);
+	}
+#endif
+
+	return 0;
+}
+
+/* Policy callback: register 'nworkers' workers (listed in 'workerids')
+ * with the context: create one FIFO per worker and, when hwloc is
+ * available, precompute each worker's proximity list once so steals do
+ * not have to walk the hwloc tree every time. */
+static void lws_add_workers(unsigned sched_ctx_id, int *workerids,unsigned nworkers)
+{
+	struct _starpu_lws_data *ws = (struct _starpu_lws_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
+
+	unsigned i;
+	int workerid;
+
+	for (i = 0; i < nworkers; i++)
+	{
+		workerid = workerids[i];
+		starpu_sched_ctx_worker_shares_tasks_lists(workerid, sched_ctx_id);
+		ws->queue_array[workerid] = _starpu_create_fifo();
+
+		/* Tell helgrind that we are fine with getting outdated values,
+		 * this is just an estimation */
+		STARPU_HG_DISABLE_CHECKING(ws->queue_array[workerid]->ntasks);
+
+		ws->queue_array[workerid]->nprocessed = 0;
+		ws->queue_array[workerid]->ntasks = 0;
+	}
+
+
+#ifdef STARPU_HAVE_HWLOC
+	/* Build a proximity list for every worker. It is cheaper to
+	 * build this once and then use it for popping tasks rather
+	 * than traversing the hwloc tree every time a task must be
+	 * stolen */
+	ws->proxlist = (int**)malloc(nworkers*sizeof(int*));
+	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
+	/* with STARPU_WORKER_TREE collections, workerids holds the hwloc tree */
+	struct starpu_tree *tree = (struct starpu_tree*)workers->workerids;
+	for (i = 0; i < nworkers; i++)
+	{
+		workerid = workerids[i];
+		ws->proxlist[workerid] = (int*)malloc(nworkers*sizeof(int));
+		int bindid;
+		
+		struct starpu_tree *neighbour = NULL;
+		struct starpu_sched_ctx_iterator it;
+		if(workers->init_iterator)
+			workers->init_iterator(workers, &it);
+	
+		/* start the walk at this worker's own binding point in the tree */
+		bindid   = starpu_worker_get_bindid(workerid);
+		it.value = starpu_tree_get(tree, bindid);
+		int cnt = 0;
+		for(;;)
+		{
+			neighbour = (struct starpu_tree*)it.value;
+			/* NOTE(review): the inner 'workerids'/'nworkers' below shadow
+			 * the function parameters of the same names */
+			int workerids[STARPU_NMAXWORKERS];
+			int nworkers = _starpu_worker_get_workerids(neighbour->id, workerids);
+			int w;
+			for(w = 0; w < nworkers; w++)
+			{
+				/* only record workers of this context, each exactly once */
+				if(!it.visited[workerids[w]] && workers->present[workerids[w]])
+				{
+					ws->proxlist[workerid][cnt++] = workerids[w];
+					it.visited[workerids[w]] = 1;
+				}
+			}
+			if(!workers->has_next(workers, &it))
+				break;
+			it.value = it.possible_value;
+			it.possible_value = NULL;
+		} 
+	}
+#endif	
+}
+
+/* Policy callback: unregister workers from the context, destroying their
+ * FIFOs and (hwloc builds) freeing their proximity lists. */
+static void lws_remove_workers(unsigned sched_ctx_id, int *workerids, unsigned nworkers)
+{
+	struct _starpu_lws_data *ws = (struct _starpu_lws_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
+
+	unsigned i;
+	int workerid;
+
+	for (i = 0; i < nworkers; i++)
+	{
+		workerid = workerids[i];
+		_starpu_destroy_fifo(ws->queue_array[workerid]);
+#ifdef STARPU_HAVE_HWLOC
+		free(ws->proxlist[workerid]);
+#endif
+	}
+}
+
+/* Policy callback: set up the context's worker collection (a tree when
+ * hwloc is available, a flat list otherwise) and allocate the policy's
+ * per-context data. */
+static void lws_initialize_policy(unsigned sched_ctx_id)
+{
+#ifdef STARPU_HAVE_HWLOC
+	starpu_sched_ctx_create_worker_collection(sched_ctx_id, STARPU_WORKER_TREE);
+#else
+	starpu_sched_ctx_create_worker_collection(sched_ctx_id, STARPU_WORKER_LIST);
+#endif
+
+	struct _starpu_lws_data *ws = (struct _starpu_lws_data*)malloc(sizeof(struct _starpu_lws_data));
+	starpu_sched_ctx_set_policy_data(sched_ctx_id, (void*)ws);
+
+	ws->last_pop_worker = 0;
+	ws->last_push_worker = 0;
+
+	/* size the queue array on the global worker count, so it can be
+	 * indexed directly by workerid even for workers added later */
+	unsigned nw = starpu_worker_get_count();
+	ws->queue_array = (struct _starpu_fifo_taskq**)malloc(nw*sizeof(struct _starpu_fifo_taskq*));
+
+}
+	
+/* Policy callback: release the per-context data allocated by
+ * lws_initialize_policy. The individual FIFOs and proxlist rows are
+ * freed by lws_remove_workers; only the top-level arrays remain here.
+ * NOTE(review): assumes remove_workers was called for every worker
+ * beforehand, otherwise rows of proxlist would leak — confirm. */
+static void lws_deinit_policy(unsigned sched_ctx_id)
+{
+	struct _starpu_lws_data *ws = (struct _starpu_lws_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
+
+	free(ws->queue_array);
+#ifdef STARPU_HAVE_HWLOC
+	free(ws->proxlist);
+#endif
+	free(ws);
+	starpu_sched_ctx_delete_worker_collection(sched_ctx_id);
+}
+
+/* Registration of the locality work-stealing policy.
+ * NOTE(review): registered under the name "nws" ("new work stealing")
+ * although the file is locality_work_stealing_policy.c — confirm the
+ * intended user-facing name. */
+struct starpu_sched_policy _starpu_sched_lws_policy =
+{
+	.init_sched = lws_initialize_policy,
+	.deinit_sched = lws_deinit_policy,
+	.add_workers = lws_add_workers,
+	.remove_workers = lws_remove_workers,
+	.push_task = lws_push_task,
+	.pop_task = lws_pop_task,
+	.pre_exec_hook = NULL,
+	.post_exec_hook = NULL,
+	.pop_every_task = NULL,
+	.policy_name = "nws",
+	.policy_description = "new work stealing"
+};

+ 2 - 2
src/starpu_parameters.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2011  Université de Bordeaux 1
+ * Copyright (C) 2011, 2014  Université de Bordeaux 1
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -22,7 +22,7 @@
 
 
 /* How many executions a codelet will have to be measured before we
 /* How many executions a codelet will have to be measured before we
  * consider that calibration will provide a value good enough for scheduling */
  * consider that calibration will provide a value good enough for scheduling */
-#define _STARPU_CALIBRATION_MINIMUM 10
+#define _STARPU_CALIBRATION_MINIMUM ((unsigned) starpu_get_env_number_default("STARPU_CALIBRATE_MINIMUM", 10))
 
 
 /* Assumed relative performance ratios */
 /* Assumed relative performance ratios */
 /* TODO: benchmark a bit instead */
 /* TODO: benchmark a bit instead */

+ 65 - 3
src/worker_collection/worker_list.c

@@ -42,6 +42,30 @@ static int list_get_next(struct starpu_worker_collection *workers, struct starpu
 	return ret;
 	return ret;
 }
 }
 
 
+/* Return non-zero while the iterator has not yet visited every master
+ * worker of the collection; resets the cursor to 0 once exhausted, so
+ * the same iterator can be reused for another pass. */
+static unsigned list_has_next_master(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it)
+{
+	int nworkers = workers->nmasters;
+	STARPU_ASSERT(it != NULL);
+
+	unsigned ret = it->cursor < nworkers ;
+
+	if(!ret) it->cursor = 0;
+
+	return ret;
+}
+
+/* Return the workerid of the next master worker and advance the cursor.
+ * Callers must check list_has_next_master first; the cursor bound is
+ * asserted here. */
+static int list_get_next_master(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it)
+{
+	int *workerids = (int *)workers->masters;
+	int nworkers = (int)workers->nmasters;
+
+	STARPU_ASSERT_MSG(it->cursor < nworkers, "cursor %d nworkers %d\n", it->cursor, nworkers);
+
+	int ret = workerids[it->cursor++];
+
+	return ret;
+}
+
 static unsigned _worker_belongs_to_ctx(struct starpu_worker_collection *workers, int workerid)
 static unsigned _worker_belongs_to_ctx(struct starpu_worker_collection *workers, int workerid)
 {
 {
 	int *workerids = (int *)workers->workerids;
 	int *workerids = (int *)workers->workerids;
@@ -108,9 +132,12 @@ static int list_remove(struct starpu_worker_collection *workers, int worker)
 {
 {
 	int *workerids = (int *)workers->workerids;
 	int *workerids = (int *)workers->workerids;
 	unsigned nworkers = workers->nworkers;
 	unsigned nworkers = workers->nworkers;
+
+	int *masters = (int *)workers->masters;
+	unsigned nmasters = workers->nmasters;
 	
 	
-	int found_worker = -1;
 	unsigned i;
 	unsigned i;
+	int found_worker = -1;
 	for(i = 0; i < nworkers; i++)
 	for(i = 0; i < nworkers; i++)
 	{
 	{
 		if(workerids[i] == worker)
 		if(workerids[i] == worker)
@@ -125,13 +152,29 @@ static int list_remove(struct starpu_worker_collection *workers, int worker)
 	if(found_worker != -1)
 	if(found_worker != -1)
 		workers->nworkers--;
 		workers->nworkers--;
 
 
+	int found_master = -1;
+	for(i = 0; i < nmasters; i++)
+	{
+		if(masters[i] == worker)
+		{
+			masters[i] = -1;
+			found_master = worker;
+			break;
+		}
+	}
+
+	_rearange_workerids(masters, nmasters);
+	if(found_master != -1)
+		workers->nmasters--;
+	printf("rem %d\n", found_worker);
 	return found_worker;
 	return found_worker;
 }
 }
 
 
 static void _init_workers(int *workerids)
 static void _init_workers(int *workerids)
 {
 {
 	unsigned i;
 	unsigned i;
-	for(i = 0; i < STARPU_NMAXWORKERS; i++)
+	int nworkers = starpu_worker_get_count();
+	for(i = 0; i < nworkers; i++)
 		workerids[i] = -1;
 		workerids[i] = -1;
 	return;
 	return;
 }
 }
@@ -139,10 +182,14 @@ static void _init_workers(int *workerids)
 static void list_init(struct starpu_worker_collection *workers)
 static void list_init(struct starpu_worker_collection *workers)
 {
 {
 	int *workerids = (int*)malloc(STARPU_NMAXWORKERS * sizeof(int));
 	int *workerids = (int*)malloc(STARPU_NMAXWORKERS * sizeof(int));
+	int *masters = (int*)malloc(STARPU_NMAXWORKERS * sizeof(int));
 	_init_workers(workerids);
 	_init_workers(workerids);
+	_init_workers(masters);
 
 
 	workers->workerids = (void*)workerids;
 	workers->workerids = (void*)workerids;
 	workers->nworkers = 0;
 	workers->nworkers = 0;
+	workers->masters = (void*)masters;
+	workers->nmasters = 0;
 
 
 	return;
 	return;
 }
 }
@@ -150,17 +197,32 @@ static void list_init(struct starpu_worker_collection *workers)
 static void list_deinit(struct starpu_worker_collection *workers)
 static void list_deinit(struct starpu_worker_collection *workers)
 {
 {
 	free(workers->workerids);
 	free(workers->workerids);
+	free(workers->masters);
 }
 }
 
 
-static void list_init_iterator(struct starpu_worker_collection *workers STARPU_ATTRIBUTE_UNUSED, struct starpu_sched_ctx_iterator *it)
+static void list_init_iterator(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it)
 {
 {
 	it->cursor = 0;
 	it->cursor = 0;
+
+	int *workerids = (int *)workers->workerids;
+	unsigned nworkers = workers->nworkers;
+	unsigned i;
+	int nm = 0;
+	for(i = 0;  i < nworkers; i++)
+	{
+		if(!starpu_worker_is_slave(workerids[i]))
+			((int*)workers->masters)[nm++] = workerids[i];
+	}
+	workers->nmasters = nm;
+
 }
 }
 
 
 struct starpu_worker_collection worker_list =
 struct starpu_worker_collection worker_list =
 {
 {
 	.has_next = list_has_next,
 	.has_next = list_has_next,
 	.get_next = list_get_next,
 	.get_next = list_get_next,
+	.has_next_master = list_has_next_master,
+	.get_next_master = list_get_next_master,
 	.add = list_add,
 	.add = list_add,
 	.remove = list_remove,
 	.remove = list_remove,
 	.init = list_init,
 	.init = list_init,

+ 84 - 4
src/worker_collection/worker_tree.c

@@ -89,6 +89,75 @@ static int tree_get_next(struct starpu_worker_collection *workers, struct starpu
 	return ret;
 	return ret;
 }
 }
 
 
+static unsigned tree_has_next_master(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it)
+{
+	STARPU_ASSERT(it != NULL);
+	if(workers->nworkers == 0)
+		return 0;
+
+	struct starpu_tree *tree = (struct starpu_tree*)workers->workerids;
+	struct starpu_tree *neighbour = starpu_tree_get_neighbour(tree, (struct starpu_tree*)it->value, it->visited, workers->is_master);
+	
+	if(!neighbour)
+	{
+		starpu_tree_reset_visited(tree, it->visited);
+		it->value = NULL;
+		it->possible_value = NULL;
+		return 0;
+	}
+	int id = -1;
+	int workerids[STARPU_NMAXWORKERS];
+	int nworkers = _starpu_worker_get_workerids(neighbour->id, workerids);
+	int w;
+	for(w = 0; w < nworkers; w++)
+	{
+		if(!it->visited[workerids[w]] && workers->is_master[workerids[w]])
+		{
+			id = workerids[w];
+			it->possible_value = neighbour;
+		}
+	}
+
+	STARPU_ASSERT_MSG(id != -1, "bind id (%d) for workerid (%d) not correct", neighbour->id, id);
+
+	return 1;
+}
+
+static int tree_get_next_master(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it)
+{
+	int ret = -1;
+	
+	struct starpu_tree *tree = (struct starpu_tree *)workers->workerids;
+	struct starpu_tree *neighbour = NULL;
+	if(it->possible_value)
+	{
+		neighbour = it->possible_value;
+		it->possible_value = NULL;
+	}
+	else
+		neighbour = starpu_tree_get_neighbour(tree, (struct starpu_tree*)it->value, it->visited, workers->is_master);
+	
+	STARPU_ASSERT_MSG(neighbour, "no element anymore");
+	
+	
+	int workerids[STARPU_NMAXWORKERS];
+	int nworkers = _starpu_worker_get_workerids(neighbour->id, workerids);
+	int w;
+	for(w = 0; w < nworkers; w++)
+	{
+		if(!it->visited[workerids[w]] && workers->is_master[workerids[w]])
+		{
+			ret = workerids[w];
+			it->visited[workerids[w]] = 1;
+			it->value = neighbour;
+		}
+	}
+	STARPU_ASSERT_MSG(ret != -1, "bind id not correct");
+
+	return ret;
+}
+
+
 static int tree_add(struct starpu_worker_collection *workers, int worker)
 static int tree_add(struct starpu_worker_collection *workers, int worker)
 {
 {
 	struct starpu_tree *tree = (struct starpu_tree *)workers->workerids;
 	struct starpu_tree *tree = (struct starpu_tree *)workers->workerids;
@@ -111,6 +180,7 @@ static int tree_remove(struct starpu_worker_collection *workers, int worker)
 	if(workers->present[worker])
 	if(workers->present[worker])
 	{
 	{
 		workers->present[worker] = 0;
 		workers->present[worker] = 0;
+		workers->is_master[worker] = 0;
 		workers->nworkers--;
 		workers->nworkers--;
 		return worker;
 		return worker;
 	}
 	}
@@ -122,10 +192,14 @@ static void tree_init(struct starpu_worker_collection *workers)
 {
 {
 	workers->workerids = (void*)starpu_workers_get_tree();
 	workers->workerids = (void*)starpu_workers_get_tree();
 	workers->nworkers = 0;
 	workers->nworkers = 0;
-	
+
 	int i;
 	int i;
-	for(i = 0; i < STARPU_NMAXWORKERS; i++)
+	int nworkers = starpu_worker_get_count();
+	for(i = 0; i < nworkers; i++)
+	{
 		workers->present[i] = 0;
 		workers->present[i] = 0;
+		workers->is_master[i] = 0;
+	}
 	
 	
 	return;
 	return;
 }
 }
@@ -135,19 +209,25 @@ static void tree_deinit(struct starpu_worker_collection *workers)
 //	free(workers->workerids);
 //	free(workers->workerids);
 }
 }
 
 
-static void tree_init_iterator(struct starpu_worker_collection *workers STARPU_ATTRIBUTE_UNUSED, struct starpu_sched_ctx_iterator *it)
+static void tree_init_iterator(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it)
 {
 {
 	it->value = NULL;
 	it->value = NULL;
 	it->possible_value = NULL;
 	it->possible_value = NULL;
 	int i;
 	int i;
-	for(i = 0; i < STARPU_NMAXWORKERS; i++)
+	int nworkers = starpu_worker_get_count();
+	for(i = 0; i < nworkers; i++)
+	{
+		workers->is_master[i] = (workers->present[i] && !starpu_worker_is_slave(i));
 		it->visited[i] = 0;
 		it->visited[i] = 0;
+	}
 }
 }
 
 
 struct starpu_worker_collection worker_tree =
 struct starpu_worker_collection worker_tree =
 {
 {
 	.has_next = tree_has_next,
 	.has_next = tree_has_next,
 	.get_next = tree_get_next,
 	.get_next = tree_get_next,
+	.has_next_master = tree_has_next_master,
+	.get_next_master = tree_get_next_master,
 	.add = tree_add,
 	.add = tree_add,
 	.remove = tree_remove,
 	.remove = tree_remove,
 	.init = tree_init,
 	.init = tree_init,

+ 1 - 0
tests/datawizard/commute.c

@@ -171,6 +171,7 @@ int main(int argc, char **argv)
 		test(STARPU_R, STARPU_RW, i);
 		test(STARPU_R, STARPU_RW, i);
 	}
 	}
 
 
+	starpu_data_unregister(x_handle);
 	starpu_shutdown();
 	starpu_shutdown();
 	STARPU_RETURN(0);
 	STARPU_RETURN(0);
 
 

+ 2 - 2
tests/heat/dmda.sh

@@ -2,7 +2,7 @@
 
 
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # 
 # 
-# Copyright (C) 2009, 2010  Université de Bordeaux 1
+# Copyright (C) 2009, 2010, 2014  Université de Bordeaux 1
 # Copyright (C) 2010  Centre National de la Recherche Scientifique
 # Copyright (C) 2010  Centre National de la Recherche Scientifique
 # 
 # 
 # StarPU is free software; you can redistribute it and/or modify
 # StarPU is free software; you can redistribute it and/or modify
@@ -52,7 +52,7 @@ export STARPU_PERF_MODEL_DIR=$SAMPLINGDIR
 mkdir -p $TIMINGDIR
 mkdir -p $TIMINGDIR
 mkdir -p $SAMPLINGDIR
 mkdir -p $SAMPLINGDIR
 
 
-#schedlist="ws no-prio greedy prio dm random"
+#schedlist="ws lws no-prio greedy prio dm random"
 #schedlist="random random random random"
 #schedlist="random random random random"
 
 
 export STARPU_NCUDA=3
 export STARPU_NCUDA=3

+ 5 - 3
tests/heat/gflops_sched.gp

@@ -3,7 +3,7 @@
 
 
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # 
 # 
-# Copyright (C) 2008, 2009  Université de Bordeaux 1
+# Copyright (C) 2008, 2009, 2014  Université de Bordeaux 1
 # Copyright (C) 2010  Centre National de la Recherche Scientifique
 # Copyright (C) 2010  Centre National de la Recherche Scientifique
 # 
 # 
 # StarPU is free software; you can redistribute it and/or modify
 # StarPU is free software; you can redistribute it and/or modify
@@ -30,7 +30,8 @@ set key right bottom
 set datafile missing 'x'
 set datafile missing 'x'
 plot "timings/gflops.merged.data" usi 1:(2*$1*$1*$1 / (3*$2* 1000000)) with linespoint title "greedy"  ,\
 plot "timings/gflops.merged.data" usi 1:(2*$1*$1*$1 / (3*$2* 1000000)) with linespoint title "greedy"  ,\
      "timings/gflops.merged.data" usi 1:(2*$1*$1*$1 / (3*$4* 1000000)) with linespoint title "prio" 	    ,\
      "timings/gflops.merged.data" usi 1:(2*$1*$1*$1 / (3*$4* 1000000)) with linespoint title "prio" 	    ,\
-     "timings/gflops.merged.data" usi 1:(2*$1*$1*$1 / (3*$6* 1000000)) with linespoint title "ws" 
+     "timings/gflops.merged.data" usi 1:(2*$1*$1*$1 / (3*$4* 1000000)) with linespoint title "ws" 	    ,\
+     "timings/gflops.merged.data" usi 1:(2*$1*$1*$1 / (3*$6* 1000000)) with linespoint title "lws" 
 
 
 set output "gflops_sched_gain.eps"
 set output "gflops_sched_gain.eps"
 set title "LU Decomposition : scheduling strategies : gain"
 set title "LU Decomposition : scheduling strategies : gain"
@@ -43,4 +44,5 @@ set logscale x
 set key right bottom
 set key right bottom
 set datafile missing 'x'
 set datafile missing 'x'
 plot "timings/gflops.merged.data" usi 1:(100*(($2 / $4)-1)) with linespoint title "gain prio"	,\
 plot "timings/gflops.merged.data" usi 1:(100*(($2 / $4)-1)) with linespoint title "gain prio"	,\
-	"timings/gflops.merged.data" usi 1:(100*(($2 / $6)-1)) with linespoint title "gain ws"    
+	"timings/gflops.merged.data" usi 1:(100*(($2 / $6)-1)) with linespoint title "gain ws"    ,\
+	"timings/gflops.merged.data" usi 1:(100*(($2 / $6)-1)) with linespoint title "gain lws"    

+ 10 - 1
tests/heat/gflops_sched.sh

@@ -2,7 +2,7 @@
 
 
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # 
 # 
-# Copyright (C) 2008, 2009, 2010  Université de Bordeaux 1
+# Copyright (C) 2008, 2009, 2010, 2014  Université de Bordeaux 1
 # Copyright (C) 2010  Centre National de la Recherche Scientifique
 # Copyright (C) 2010  Centre National de la Recherche Scientifique
 # 
 # 
 # StarPU is free software; you can redistribute it and/or modify
 # StarPU is free software; you can redistribute it and/or modify
@@ -137,6 +137,15 @@ do
 done
 done
 
 
 
 
+filename=$TIMINGDIR/gflops.lws.data
+policy=lws
+trace_header 
+for size in $sizelist
+do
+	trace_size $size;
+done
+
+
 filename=$TIMINGDIR/gflops.noprio.data
 filename=$TIMINGDIR/gflops.noprio.data
 policy=no-prio
 policy=no-prio
 trace_header 
 trace_header 

+ 2 - 2
tests/heat/granularity.r

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 #
-# Copyright (C) 2010  Université de Bordeaux 1
+# Copyright (C) 2010, 2014  Université de Bordeaux 1
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 #
 #
 # StarPU is free software; you can redistribute it and/or modify
 # StarPU is free software; you can redistribute it and/or modify
@@ -19,7 +19,7 @@ max <- 28
 maxy <- 400
 maxy <- 400
 
 
 sizelist <- seq(2048, max*1024, 64);
 sizelist <- seq(2048, max*1024, 64);
-#schedlist <- c("greedy", "prio", "dm", "random", "no-prio", "ws");
+#schedlist <- c("greedy", "prio", "dm", "random", "no-prio", "ws", "lws");
 #schedlist <- c("greedy", "prio", "dm", "random");
 #schedlist <- c("greedy", "prio", "dm", "random");
 # grainlist <- c(64, 128, 256, 512, 768, 1024, 1280, 1536, 2048);
 # grainlist <- c(64, 128, 256, 512, 768, 1024, 1280, 1536, 2048);
 grainlist <- c(256, 512, 1024, 2048);
 grainlist <- c(256, 512, 1024, 2048);

+ 2 - 2
tests/heat/granularity_model.r

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 #
-# Copyright (C) 2010  Université de Bordeaux 1
+# Copyright (C) 2010, 2014  Université de Bordeaux 1
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 #
 #
 # StarPU is free software; you can redistribute it and/or modify
 # StarPU is free software; you can redistribute it and/or modify
@@ -17,7 +17,7 @@
 max <- 30
 max <- 30
 
 
 sizelist <- seq(64, max*1024, 64);
 sizelist <- seq(64, max*1024, 64);
-#schedlist <- c("greedy", "prio", "dm", "random", "no-prio", "ws");
+#schedlist <- c("greedy", "prio", "dm", "random", "no-prio", "ws", "lws");
 #schedlist <- c("greedy", "prio", "dm", "random");
 #schedlist <- c("greedy", "prio", "dm", "random");
 #grainlist <- c(256, 512, 1024)
 #grainlist <- c(256, 512, 1024)
 grainlist <- c(512, 1024)
 grainlist <- c(512, 1024)

+ 2 - 2
tests/heat/model.r

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 #
-# Copyright (C) 2010  Université de Bordeaux 1
+# Copyright (C) 2010, 2014  Université de Bordeaux 1
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 #
 #
 # StarPU is free software; you can redistribute it and/or modify
 # StarPU is free software; you can redistribute it and/or modify
@@ -15,7 +15,7 @@
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
 
 sizelist <- seq(2048, 24576, 2048);
 sizelist <- seq(2048, 24576, 2048);
-#schedlist <- c("greedy", "prio", "dm", "random", "no-prio", "ws");
+#schedlist <- c("greedy", "prio", "dm", "random", "no-prio", "ws", "lws");
 schedlist <- c("prio", "dm", "random");
 schedlist <- c("prio", "dm", "random");
 
 
 print(schedlist);
 print(schedlist);

+ 4 - 3
tests/heat/random.r

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 #
-# Copyright (C) 2010  Université de Bordeaux 1
+# Copyright (C) 2010, 2014  Université de Bordeaux 1
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 #
 #
 # StarPU is free software; you can redistribute it and/or modify
 # StarPU is free software; you can redistribute it and/or modify
@@ -15,7 +15,7 @@
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
 
 sizelist <- seq(2048, 24576, 2048);
 sizelist <- seq(2048, 24576, 2048);
-#schedlist <- c("greedy", "prio", "dm", "random", "no-prio", "ws");
+#schedlist <- c("greedy", "prio", "dm", "random", "no-prio", "ws", "lws");
 schedlist <- c("prio","random");
 schedlist <- c("prio","random");
 
 
 print(schedlist);
 print(schedlist);
@@ -97,13 +97,14 @@ display_sched <- function()
 	trace_sched("prio", "red", 4);
 	trace_sched("prio", "red", 4);
 	#trace_sched("no-prio", "black");
 	#trace_sched("no-prio", "black");
 	#trace_sched("ws", "purple");
 	#trace_sched("ws", "purple");
+	#trace_sched("lws", "purple");
 
 
 	axis(1, at=sizelist)
 	axis(1, at=sizelist)
 	axis(2, at=seq(0, 100, 10), tck=1)
 	axis(2, at=seq(0, 100, 10), tck=1)
 #	axis(4, at=seq(0, 100, 10))
 #	axis(4, at=seq(0, 100, 10))
 	box(bty="u")
 	box(bty="u")
 
 
-        #labels <- c("greedy", "priority", "model", "random", "black", "ws")
+        #labels <- c("greedy", "priority", "model", "random", "black", "ws", "lws")
 #        labels <- c("greedy", "priority", "model", "random")
 #        labels <- c("greedy", "priority", "model", "random")
 	#labels <- c("model", "weighted random", "greedy", "priority")
 	#labels <- c("model", "weighted random", "greedy", "priority")
 	labels <- c("weighted random", "priority")
 	labels <- c("weighted random", "priority")

+ 2 - 2
tests/heat/sched.r

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 #
-# Copyright (C) 2010  Université de Bordeaux 1
+# Copyright (C) 2010, 2014  Université de Bordeaux 1
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 #
 #
 # StarPU is free software; you can redistribute it and/or modify
 # StarPU is free software; you can redistribute it and/or modify
@@ -15,7 +15,7 @@
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
 
 sizelist <- seq(2048, 24576, 2048);
 sizelist <- seq(2048, 24576, 2048);
-#schedlist <- c("greedy", "prio", "dm", "random", "no-prio", "ws");
+#schedlist <- c("greedy", "prio", "dm", "random", "no-prio", "ws", "lws");
 schedlist <- c("greedy", "prio", "dm", "random");
 schedlist <- c("greedy", "prio", "dm", "random");
 
 
 print(schedlist);
 print(schedlist);

+ 2 - 2
tests/heat/sched.sh

@@ -2,7 +2,7 @@
 
 
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # 
 # 
-# Copyright (C) 2008, 2009, 2010  Université de Bordeaux 1
+# Copyright (C) 2008, 2009, 2010, 2014  Université de Bordeaux 1
 # Copyright (C) 2010  Centre National de la Recherche Scientifique
 # Copyright (C) 2010  Centre National de la Recherche Scientifique
 # 
 # 
 # StarPU is free software; you can redistribute it and/or modify
 # StarPU is free software; you can redistribute it and/or modify
@@ -94,7 +94,7 @@ export STARPU_PERF_MODEL_DIR=$SAMPLINGDIR
 mkdir -p $TIMINGDIR
 mkdir -p $TIMINGDIR
 mkdir -p $SAMPLINGDIR
 mkdir -p $SAMPLINGDIR
 
 
-#schedlist="ws no-prio greedy prio dm random"
+#schedlist="ws lws no-prio greedy prio dm random"
 #schedlist="random random random random"
 #schedlist="random random random random"
 
 
 export STARPU_NCUDA=3
 export STARPU_NCUDA=3

+ 5 - 2
tests/main/driver_api/init_run_deinit.c

@@ -49,8 +49,11 @@ run(struct starpu_task *task, struct starpu_driver *d)
 	int ret;
 	int ret;
 	ret = starpu_task_submit(task);
 	ret = starpu_task_submit(task);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
-	ret = starpu_driver_run_once(d);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_driver_run_once");
+	while (!starpu_task_finished(task))
+	{
+		ret = starpu_driver_run_once(d);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_driver_run_once");
+	}
 	ret = starpu_task_wait(task);
 	ret = starpu_task_wait(task);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait");
 }
 }

+ 3 - 1
tests/main/subgraph_repeat.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010, 2012-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2012-2014  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -164,6 +164,7 @@ int main(int argc, char **argv)
 	STARPU_ASSERT(*check_cnt == (4*loop_cnt));
 	STARPU_ASSERT(*check_cnt == (4*loop_cnt));
 
 
 	starpu_free(check_cnt);
 	starpu_free(check_cnt);
+	starpu_data_unregister(check_data);
 
 
 	starpu_shutdown();
 	starpu_shutdown();
 
 
@@ -179,6 +180,7 @@ enodev:
 	fprintf(stderr, "WARNING: No one can execute this task\n");
 	fprintf(stderr, "WARNING: No one can execute this task\n");
 	/* yes, we do not perform the computation but we did detect that no one
 	/* yes, we do not perform the computation but we did detect that no one
  	 * could perform the kernel, so this is not an error from StarPU */
  	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_data_unregister(check_data);
 	starpu_shutdown();
 	starpu_shutdown();
 	return STARPU_TEST_SKIPPED;
 	return STARPU_TEST_SKIPPED;
 }
 }

+ 3 - 1
tests/main/subgraph_repeat_regenerate.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2010-2014  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -168,6 +168,7 @@ int main(int argc, char **argv)
 	STARPU_ASSERT(*check_cnt == (4*loop_cnt));
 	STARPU_ASSERT(*check_cnt == (4*loop_cnt));
 
 
 	starpu_free(check_cnt);
 	starpu_free(check_cnt);
+	starpu_data_unregister(check_data);
 
 
 	starpu_shutdown();
 	starpu_shutdown();
 
 
@@ -183,6 +184,7 @@ enodev:
 	fprintf(stderr, "WARNING: No one can execute this task\n");
 	fprintf(stderr, "WARNING: No one can execute this task\n");
 	/* yes, we do not perform the computation but we did detect that no one
 	/* yes, we do not perform the computation but we did detect that no one
  	 * could perform the kernel, so this is not an error from StarPU */
  	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_data_unregister(check_data);
 	starpu_shutdown();
 	starpu_shutdown();
 	return STARPU_TEST_SKIPPED;
 	return STARPU_TEST_SKIPPED;
 }
 }

+ 2 - 1
tests/main/subgraph_repeat_regenerate_tag.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2010-2014  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -198,6 +198,7 @@ enodev:
 	fprintf(stderr, "WARNING: No one can execute this task\n");
 	fprintf(stderr, "WARNING: No one can execute this task\n");
 	/* yes, we do not perform the computation but we did detect that no one
 	/* yes, we do not perform the computation but we did detect that no one
  	 * could perform the kernel, so this is not an error from StarPU */
  	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_data_unregister(check_data);
 	starpu_shutdown();
 	starpu_shutdown();
 	return STARPU_TEST_SKIPPED;
 	return STARPU_TEST_SKIPPED;
 }
 }

+ 2 - 1
tests/main/subgraph_repeat_tag.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2010-2014  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -182,6 +182,7 @@ enodev:
 	fprintf(stderr, "WARNING: No one can execute this task\n");
 	fprintf(stderr, "WARNING: No one can execute this task\n");
 	/* yes, we do not perform the computation but we did detect that no one
 	/* yes, we do not perform the computation but we did detect that no one
  	 * could perform the kernel, so this is not an error from StarPU */
  	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_data_unregister(check_data);
 	starpu_shutdown();
 	starpu_shutdown();
 	return STARPU_TEST_SKIPPED;
 	return STARPU_TEST_SKIPPED;
 }
 }

+ 5 - 2
tests/perfmodels/feed.c

@@ -50,8 +50,11 @@ int main(int argc, char **argv)
 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 
-	 if(starpu_worker_get_count_by_type(STARPU_CUDA_WORKER) < 2)
+	 if (starpu_worker_get_count_by_type(STARPU_CUDA_WORKER) < 2)
+	 {
+		 starpu_shutdown();
 		 return STARPU_TEST_SKIPPED;
 		 return STARPU_TEST_SKIPPED;
+	 }
 
 
 	starpu_task_init(&task);
 	starpu_task_init(&task);
 	task.cl = &cl;
 	task.cl = &cl;
@@ -76,7 +79,7 @@ int main(int argc, char **argv)
 		arch.devid = 0;
 		arch.devid = 0;
 		starpu_perfmodel_update_history(&model, &task, &arch, 0, 0, measured_fast);
 		starpu_perfmodel_update_history(&model, &task, &arch, 0, 0, measured_fast);
 		starpu_perfmodel_update_history(&nl_model, &task, &arch, 0, 0, measured_fast);
 		starpu_perfmodel_update_history(&nl_model, &task, &arch, 0, 0, measured_fast);
-		
+
 		/* Simulate Slow GPU */
 		/* Simulate Slow GPU */
 		arch.devid = 1;
 		arch.devid = 1;
 		starpu_perfmodel_update_history(&model, &task, &arch, 0, 0, measured_slow);
 		starpu_perfmodel_update_history(&model, &task, &arch, 0, 0, measured_slow);

+ 3 - 1
tests/regression/profiles.in

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 #
-# Copyright (C) 2010  Université de Bordeaux 1
+# Copyright (C) 2010, 2014  Université de Bordeaux 1
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 #
 #
 # StarPU is free software; you can redistribute it and/or modify
 # StarPU is free software; you can redistribute it and/or modify
@@ -32,6 +32,8 @@ STARPU_NCUDA=1
 # Execution configuration
 # Execution configuration
 STARPU_SCHED=ws
 STARPU_SCHED=ws
 # Execution configuration
 # Execution configuration
+STARPU_SCHED=lws
+# Execution configuration
 STARPU_SCHED=prio
 STARPU_SCHED=prio
 # Execution configuration
 # Execution configuration
 STARPU_SCHED=no-prio
 STARPU_SCHED=no-prio

+ 5 - 1
tests/regression/regression_test.sh

@@ -2,7 +2,7 @@
 
 
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # 
 # 
-# Copyright (C) 2008, 2009, 2010  Université de Bordeaux 1
+# Copyright (C) 2008, 2009, 2010, 2014  Université de Bordeaux 1
 # Copyright (C) 2010  Centre National de la Recherche Scientifique
 # Copyright (C) 2010  Centre National de la Recherche Scientifique
 # 
 # 
 # StarPU is free software; you can redistribute it and/or modify
 # StarPU is free software; you can redistribute it and/or modify
@@ -65,6 +65,10 @@ echo "heat.ws.8k.v2"
 timing=`STARPU_SCHED="ws" $ROOTDIR/examples/heat/heat -ntheta 66 -nthick 130 -nblocks 8 -pin -v2 2> log`
 timing=`STARPU_SCHED="ws" $ROOTDIR/examples/heat/heat -ntheta 66 -nthick 130 -nblocks 8 -pin -v2 2> log`
 save_cov "heat.ws.8k.v2";
 save_cov "heat.ws.8k.v2";
 
 
+echo "heat.lws.8k.v2"
+timing=`STARPU_SCHED="lws" $ROOTDIR/examples/heat/heat -ntheta 66 -nthick 130 -nblocks 8 -pin -v2 2> log`
+save_cov "heat.lws.8k.v2";
+
 echo "heat.greedy.8k.v2"
 echo "heat.greedy.8k.v2"
 timing=`STARPU_SCHED="greedy" $ROOTDIR/examples/heat/heat -ntheta 66 -nthick 130 -nblocks 8 -pin -v2 2> log`
 timing=`STARPU_SCHED="greedy" $ROOTDIR/examples/heat/heat -ntheta 66 -nthick 130 -nblocks 8 -pin -v2 2> log`
 save_cov "heat.greedy.8k.v2";
 save_cov "heat.greedy.8k.v2";