Browse Source

- merge trunk

Olivier Aumage 11 years ago
parent
commit
f281168e46
73 changed files with 1526 additions and 292 deletions
  1. 1 0
      AUTHORS
  2. 71 47
      ChangeLog
  3. 11 1
      configure.ac
  4. 5 0
      doc/doxygen/chapters/08scheduling.doxy
  5. 2 2
      doc/doxygen/chapters/12online_performance_tools.doxy
  6. 3 0
      doc/doxygen/chapters/13offline_performance_tools.doxy
  7. 46 42
      doc/doxygen/chapters/16mpi_support.doxy
  8. 8 0
      doc/doxygen/chapters/40environment_variables.doxy
  9. 8 0
      doc/doxygen/chapters/41configure_options.doxy
  10. 5 0
      examples/Makefile.am
  11. 1 0
      examples/binary/binary.c
  12. 3 1
      examples/cpp/incrementer_cpp.cpp
  13. 212 0
      examples/sched_ctx/nested_sched_ctxs.c
  14. 6 9
      examples/sched_ctx/sched_ctx_without_sched_policy.c
  15. 1 0
      examples/worker_collections/worker_list_example.c
  16. 1 0
      include/starpu_config.h.in
  17. 8 0
      include/starpu_sched_ctx.h
  18. 2 0
      include/starpu_task.h
  19. 6 0
      include/starpu_thread.h
  20. 7 0
      include/starpu_worker.h
  21. 3 1
      mpi/include/starpu_mpi.h
  22. 61 45
      mpi/src/starpu_mpi.c
  23. 1 0
      src/Makefile.am
  24. 42 4
      src/common/fxt.h
  25. 27 1
      src/common/thread.c
  26. 9 0
      src/core/jobs.c
  27. 3 0
      src/core/jobs.h
  28. 66 2
      src/core/sched_ctx.c
  29. 3 0
      src/core/sched_ctx.h
  30. 26 2
      src/core/sched_policy.c
  31. 1 0
      src/core/sched_policy.h
  32. 21 3
      src/core/simgrid.c
  33. 7 0
      src/core/task.c
  34. 54 38
      src/core/workers.c
  35. 3 0
      src/core/workers.h
  36. 15 8
      src/datawizard/coherency.c
  37. 11 11
      src/datawizard/coherency.h
  38. 2 1
      src/datawizard/filters.c
  39. 2 1
      src/datawizard/interfaces/data_interface.c
  40. 5 1
      src/datawizard/reduction.c
  41. 2 4
      src/datawizard/user_interactions.c
  42. 109 7
      src/debug/traces/starpu_fxt.c
  43. 13 0
      src/debug/traces/starpu_paje.c
  44. 1 1
      src/drivers/cpu/driver_cpu.c
  45. 7 3
      src/drivers/cuda/driver_cuda.c
  46. 4 2
      src/drivers/driver_common/driver_common.c
  47. 2 2
      src/drivers/driver_common/driver_common.h
  48. 1 1
      src/drivers/mp_common/source_common.c
  49. 4 2
      src/drivers/opencl/driver_opencl.c
  50. 17 10
      src/sched_policies/deque_modeling_policy_data_aware.c
  51. 13 2
      src/sched_policies/eager_central_policy.c
  52. 373 0
      src/sched_policies/locality_work_stealing_policy.c
  53. 2 2
      src/starpu_parameters.h
  54. 65 3
      src/worker_collection/worker_list.c
  55. 84 4
      src/worker_collection/worker_tree.c
  56. 1 0
      tests/datawizard/commute.c
  57. 2 2
      tests/heat/dmda.sh
  58. 5 3
      tests/heat/gflops_sched.gp
  59. 10 1
      tests/heat/gflops_sched.sh
  60. 2 2
      tests/heat/granularity.r
  61. 2 2
      tests/heat/granularity_model.r
  62. 2 2
      tests/heat/model.r
  63. 4 3
      tests/heat/random.r
  64. 2 2
      tests/heat/sched.r
  65. 2 2
      tests/heat/sched.sh
  66. 5 2
      tests/main/driver_api/init_run_deinit.c
  67. 3 1
      tests/main/subgraph_repeat.c
  68. 3 1
      tests/main/subgraph_repeat_regenerate.c
  69. 2 1
      tests/main/subgraph_repeat_regenerate_tag.c
  70. 2 1
      tests/main/subgraph_repeat_tag.c
  71. 5 2
      tests/perfmodels/feed.c
  72. 3 1
      tests/regression/profiles.in
  73. 5 1
      tests/regression/regression_test.sh

+ 1 - 0
AUTHORS

@@ -1,6 +1,7 @@
 Simon Archipoff <simon.archipoff@etu.u-bordeaux1.fr>
 Cédric Augonnet <cedric.augonnet@inria.fr>
 William Braik <wbraik@gmail.com>
+Alfredo Buttari <alfredo.buttari@enseeiht.fr>
 Jérôme Clet-Ortega <jerome.clet-ortega@labri.fr>
 Nicolas Collin <nicolas.collin@inria.fr>
 Ludovic Courtès <ludovic.courtes@inria.fr>

+ 71 - 47
ChangeLog

@@ -17,24 +17,6 @@
 StarPU 1.2.0 (svn revision xxxx)
 ==============================================
 
-Small features:
-  * New function starpu_sched_ctx_display_workers() to display worker
-    information belonging to a given scheduler context
-  * The option --enable-verbose can be called with
-    --enable-verbose=extra to increase the verbosity
-
-StarPU 1.1.2 (svn revision xxxx)
-==============================================
-The scheduling context release
-
-New features:
-  * The reduction init codelet is automatically used to initialize temporary
-    buffers.
-
-StarPU 1.1.1 (svn revision 12638)
-==============================================
-The scheduling context release
-
 New features:
   * Xeon Phi support
   * SCC support
@@ -48,46 +30,88 @@ New features:
 	  before the corresponding data, which allows the receiver to
 	  allocate data correctly, and to submit the matching receive of
 	  the envelope.
+        - New function
+   	  starpu_mpi_irecv_detached_sequential_consistency which
+	  allows to enable or disable the sequential consistency for
+	  the given data handle (sequential consistency will be
+	  enabled or disabled based on the value of the function
+	  parameter and the value of the sequential consistency
+	  defined for the given data)
+        - New functions starpu_mpi_task_build() and
+  	  starpu_mpi_task_post_build()
   * New STARPU_COMMUTE flag which can be passed along STARPU_W or STARPU_RW to
     let starpu commute write accesses.
   * Out-of-core support, through registration of disk areas as additional memory
     nodes.
-  * StarPU-MPI: new function
-    starpu_mpi_irecv_detached_sequential_consistency which allows to
-    enable or disable the sequential consistency for the given data
-    handle (sequential consistency will be enabled or disabled based
-    on the value of the function parameter and the value of the
-    sequential consistency defined for the given data)
-  * New functions starpu_mpi_task_build() and starpu_mpi_task_post_build()
-  * New functions starpu_pause() and starpu_resume()
-  * New codelet specific_nodes field to specify explicit target nodes for data.
-  * Use streams for all CUDA transfers, even initiated by CPUs.
   * Add STARPU_CUDA_ASYNC and STARPU_OPENCL_ASYNC flags to allow asynchronous
     CUDA and OpenCL kernel execution.
-  * Add paje traces statistics tools.
   * Add CUDA concurrent kernel execution support through
     the STARPU_NWORKER_PER_CUDA environment variable.
-  * Use streams for GPUA->GPUB and GPUB->GPUA transfers.
 
 Small features:
+  * Tasks can now have a name (via the field const char *name of
+    struct starpu_task)
   * New functions starpu_data_acquire_cb_sequential_consistency() and
     starpu_data_acquire_on_node_cb_sequential_consistency() which allows
     to enable or disable sequential consistency
   * New configure option --enable-fxt-lock which enables additional
     trace events focused on locks behaviour during the execution
-  * New function starpu_perfmodel_directory() to print directory
-    storing performance models. Available through the new option -d of
-    the tool starpu_perfmodel_display
-  * New batch files to execute StarPU applications under Microsoft
-    Visual Studio (They are installed in path_to_starpu/bin/msvc)/
   * Functions starpu_insert_task and starpu_mpi_insert_task are
     renamed in starpu_task_insert and starpu_mpi_task_insert. Old
     names are kept to avoid breaking old codes.
   * New configure option --enable-calibration-heuristic which allows
     the user to set the maximum authorized deviation of the
     history-based calibrator.
-  * Tasks can now have a name (via the field const char *name of
-    struct starpu_task)
+  * Allow application to provide the task footprint itself.
+  * New function starpu_sched_ctx_display_workers() to display worker
+    information belonging to a given scheduler context
+  * The option --enable-verbose can be called with
+    --enable-verbose=extra to increase the verbosity
+  * Add codelet size, footprint and tag id in the paje trace.
+
+Changes:
+  * Data interfaces (variable, vector, matrix and block) now define
+    pack and unpack functions
+  * StarPU-MPI: Fix for being able to receive data which have not yet
+    been registered by the application (i.e it did not call
+    starpu_data_set_tag(), data are received as a raw memory)
+  * StarPU-MPI: Fix for being able to receive data with the same tag
+    from several nodes (see mpi/tests/gather.c)
+
+Small changes:
+  * Rename function starpu_trace_user_event() as
+    starpu_fxt_trace_user_event()
+
+StarPU 1.1.2 (svn revision xxx)
+==============================================
+The scheduling context release
+
+New features:
+  * The reduction init codelet is automatically used to initialize temporary
+    buffers.
+  * Traces now include a "scheduling" state, to show the overhead of the
+    scheduler.
+  * Add STARPU_CALIBRATE_MINIMUM environment variable to specify the minimum
+    number of calibration measurements.
+
+StarPU 1.1.1 (svn revision 12638)
+==============================================
+The scheduling context release
+
+New features:
+  * MPI:
+        - New variable STARPU_MPI_CACHE_STATS to print statistics on
+   	  cache holding received data.
+        - New function starpu_mpi_data_register() which sets the rank
+  	  and tag of a data, and also allows to automatically clear
+	  the MPI communication cache when unregistering the data. It
+	  should be called instead of both calling
+	  starpu_data_set_tag() and starpu_data_set_rank()
+  * Use streams for all CUDA transfers, even initiated by CPUs.
+  * Add paje traces statistics tools.
+  * Use streams for GPUA->GPUB and GPUB->GPUA transfers.
+
+Small features:
   * New STARPU_EXECUTE_ON_WORKER flag to specify the worker on which
     to execute the task.
   * New STARPU_DISABLE_PINNING environment variable to disable host memory
@@ -97,23 +121,23 @@ Small features:
   * New starpu_memory_get_total function to get the size of a memory node.
   * New starpu_parallel_task_barrier_init_n function to let a scheduler decide
     a set of workers without going through combined workers.
-  * Allow application to provide the task footprint itself.
 
 Changes:
-  * Data interfaces (variable, vector, matrix and block) now define
-    pack und unpack functions
-  * StarPU-MPI: Fix for being able to receive data which have not yet
-    been registered by the application (i.e it did not call
-    starpu_data_set_tag(), data are received as a raw memory)
-  * StarPU-MPI: Fix for being able to receive data with the same tag
-    from several nodes (see mpi/tests/gather.c)
+  * Fix simgrid execution.
+  * Rename starpu_get_nready_tasks_of_sched_ctx to starpu_sched_ctx_get_nready_tasks
+  * Rename starpu_get_nready_flops_of_sched_ctx to starpu_sched_ctx_get_nready_flops
+  * New functions starpu_pause() and starpu_resume()
+  * New codelet specific_nodes field to specify explicit target nodes for data.
   * StarPU-MPI: Fix overzealous allocation of memory.
   * Interfaces: Allow interface implementation to change pointers at will, in
     unpack notably.
 
 Small changes:
-  * Rename function starpu_trace_user_event() as
-    starpu_fxt_trace_user_event()
+  * Use big fat abortions when one tries to make a task or callback
+    sleep, instead of just returning EDEADLCK which few people will test
+  * By default, StarPU FFT examples are not compiled and checked, the
+    configure option --enable-starpufft-examples needs to be specified
+    to change this behaviour.
 
 StarPU 1.1.0 (svn revision 11960)
 ==============================================

+ 11 - 1
configure.ac

@@ -975,16 +975,19 @@ AC_ARG_ENABLE(simgrid, [AS_HELP_STRING([--enable-simgrid],
 if test x$enable_simgrid = xyes ; then
    	if test -n "$SIMGRID_CFLAGS" ; then
 	   	CFLAGS="$SIMGRID_CFLAGS $CFLAGS"
+	   	CXXFLAGS="$SIMGRID_CFLAGS $CXXFLAGS"
 	fi
 	if test -n "$SIMGRID_LIBS" ; then
 		LDFLAGS="$SIMGRID_LIBS $LDFLAGS"
 	fi
 	if test "$simgrid_dir" != "no" ; then
 	   	CFLAGS="-I$simgrid_dir/include $CFLAGS"
+	   	CXXFLAGS="-I$simgrid_dir/include $CXXFLAGS"
 	   	LDFLAGS="-L$simgrid_dir/lib $LDFLAGS"
 	fi
 	if test "$simgrid_include_dir" != "no" ; then
 	   	CFLAGS="-I$simgrid_include_dir $CFLAGS"
+	   	CXXFLAGS="-I$simgrid_include_dir $CXXFLAGS"
 	fi
 	if test "$simgrid_lib_dir" != "no" ; then
 	   	LDFLAGS="-L$simgrid_lib_dir $LDFLAGS"
@@ -994,7 +997,8 @@ if test x$enable_simgrid = xyes ; then
 			AC_MSG_ERROR(Simgrid support needs simgrid installed)
 		]
 	)
-   	AC_CHECK_FUNCS([MSG_process_join])
+   	AC_CHECK_FUNCS([MSG_process_join MSG_get_as_by_name MSG_environment_get_routing_root])
+	AC_CHECK_FUNCS([xbt_barrier_init], [AC_DEFINE([STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT], [1], [Define to 1 if you have the `xbt_barrier_init' function.])])
 	AC_COMPILE_IFELSE([AC_LANG_PROGRAM(
 		    		[[#include <msg/msg.h>]],
 				[[msg_host_t foo; ]]
@@ -1478,6 +1482,12 @@ if test x$use_fxt = xyes; then
 	AC_CHECK_DECLS([fut_set_filename])
 	CFLAGS="$save_CFLAGS"
 
+        AC_ARG_ENABLE(paje-codelet-details, [AS_HELP_STRING([--enable-paje-codelet-details],
+			[enable details about codelets in the paje trace])],
+			enable_paje_codelet_details=$enableval, enable_paje_codelet_details=no)
+        if  test x$enable_paje_codelet_details = xyes; then
+        	AC_DEFINE(STARPU_ENABLE_PAJE_CODELET_DETAILS, [1], [enable details about codelets in the paje trace])
+        fi
 	##########################################
 	# Poti is a library to generate paje trace files
 	##########################################

+ 5 - 0
doc/doxygen/chapters/08scheduling.doxy

@@ -45,6 +45,11 @@ a task on the worker which released it by
 default. When a worker becomes idle, it steals a task from the most loaded
 worker.
 
+The <b>lws</b> (locality work stealing) scheduler uses a queue per worker, and schedules
+a task on the worker which released it by
+default. When a worker becomes idle, it steals a task from neighbour workers. It
+also takes into account priorities.
+
 The <b>dm</b> (deque model) scheduler uses task execution performance models into account to
 perform a HEFT-similar scheduling strategy: it schedules tasks where their
 termination time will be minimal. The difference with HEFT is that <b>dm</b>

+ 2 - 2
doc/doxygen/chapters/12online_performance_tools.doxy

@@ -389,11 +389,11 @@ parameters through starpu_hash_crc32c_be for instance.
 StarPU will automatically determine when the performance model is calibrated,
 or rather, it will assume the performance model is calibrated until the
 application submits a task for which the performance can not be predicted. For
-::STARPU_HISTORY_BASED, StarPU will require 10 (_STARPU_CALIBRATION_MINIMUM)
+::STARPU_HISTORY_BASED, StarPU will require 10 (STARPU_CALIBRATE_MINIMUM)
 measurements for a given size before estimating that an average can be taken as
 estimation for further executions with the same size. For
 ::STARPU_REGRESSION_BASED and ::STARPU_NL_REGRESSION_BASED, StarPU will require
-10 (_STARPU_CALIBRATION_MINIMUM) measurements, and that the minimum measured
+10 (STARPU_CALIBRATE_MINIMUM) measurements, and that the minimum measured
 data size is smaller than 90% of the maximum measured data size (i.e. the
 measurement interval is large enough for a regression to have a meaning).
 Calibration can also be forced by setting the \ref STARPU_CALIBRATE environment

+ 3 - 0
doc/doxygen/chapters/13offline_performance_tools.doxy

@@ -118,6 +118,9 @@ $ vite paje.trace
 
 To get names of tasks instead of "unknown", fill the optional
 starpu_codelet::name, or use a performance model for them.
+Details of the codelet execution can be obtained by passing
+<c>--enable-paje-codelet-details</c> and using a recent enough version of ViTE
+(at least r1430).
 
 In the MPI execution case, collect the trace files from the MPI nodes, and
 specify them all on the command <c>starpu_fxt_tool</c>, for instance:

+ 46 - 42
doc/doxygen/chapters/16mpi_support.doxy

@@ -121,49 +121,53 @@ automatically released. This mechanism is similar to the pthread
 detach state attribute which determines whether a thread will be
 created in a joinable or a detached state.
 
-For any communication, the call of the function will result in the
-creation of a StarPU-MPI request, the function
-starpu_data_acquire_cb() is then called to asynchronously request
-StarPU to fetch the data in main memory; when the data is available in
-main memory, a StarPU-MPI function is called to put the new request in
-the list of the ready requests if it is a send request, or in an
-hashmap if it is a receive request.
-
-Internally, all MPI communications submitted by StarPU uses a unique
-tag which has a default value, and can be accessed with the functions
+Internally, each communication is divided into two messages: a first
+message is used to exchange an envelope describing the data (i.e. its
+tag and its size), the data itself is sent in a second message. All
+MPI communications submitted by StarPU uses a unique tag which has a
+default value, and can be accessed with the functions
 starpu_mpi_get_communication_tag() and
-starpu_mpi_set_communication_tag().
-
-The matching of tags with corresponding requests is done into StarPU-MPI.
-To handle this, any communication is a double-communication based on a
-envelope + data system. Every data which will be sent needs to send an
-envelope which describes the data (particularly its tag) before sending
-the data, so the receiver can get the matching pending receive request
-from the hashmap, and submit it to recieve the data correctly.
-
-To this aim, the StarPU-MPI progression thread has a permanent-submitted
-request destined to receive incoming envelopes from all sources.
-
-The StarPU-MPI progression thread regularly polls this list of ready
-requests. For each new ready request, the appropriate function is
-called to post the corresponding MPI call. For example, calling
-starpu_mpi_isend() will result in posting <c>MPI_Isend</c>. If
-the request is marked as detached, the request will be put in the list
-of detached requests.
-
-The StarPU-MPI progression thread also polls the list of detached
-requests. For each detached request, it regularly tests the completion
-of the MPI request by calling <c>MPI_Test</c>. On completion, the data
-handle is released, and if a callback was defined, it is called.
-
-Finally, the StarPU-MPI progression thread checks if an envelope has
-arrived. If it is, it'll check if the corresponding receive has already
-been submitted by the application. If it is, it'll submit the request
-just as like as it does with those on the list of ready requests.
-If it is not, it'll allocate a temporary handle to store the data that
-will arrive just after, so as when the corresponding receive request
-will be submitted by the application, it'll copy this temporary handle
-into its one instead of submitting a new StarPU-MPI request.
+starpu_mpi_set_communication_tag(). The matching of tags with
+corresponding requests is done within StarPU-MPI.
+
+For any userland communication, the call of the corresponding function
+(e.g. starpu_mpi_isend()) will result in the creation of a StarPU-MPI
+request, the function starpu_data_acquire_cb() is then called to
+asynchronously request StarPU to fetch the data in main memory; when
+the data is ready and the corresponding buffer has already been
+received by MPI, it will be copied in the memory of the data,
+otherwise the request is stored in the <em>early requests list</em>. Sending
+requests are stored in the <em>ready requests list</em>.
+
+While requests need to be processed, the StarPU-MPI progression thread
+does the following:
+
+<ol>
+<li> it polls the <em>ready requests list</em>. For all the ready
+requests, the appropriate function is called to post the corresponding
+MPI call. For example, an initial call to starpu_mpi_isend() will
+result in a call to <c>MPI_Isend</c>. If the request is marked as
+detached, the request will then be added in the <em>detached requests
+list</em>.
+</li>
+<li> it posts a <c>MPI_Irecv()</c> to retrieve a data envelope.
+</li>
+<li> it polls the <em>detached requests list</em>. For all the detached
+requests, it tests the completion of the MPI request by calling
+<c>MPI_Test</c>. On completion, the data handle is released, and if a
+callback was defined, it is called.
+</li>
+<li> finally, it checks if a data envelope has been received. If so,
+if the data envelope matches a request in the <em>early requests list</em> (i.e
+the request has already been posted by the application), the
+corresponding MPI call is posted (similarly to the first step above).
+
+If the data envelope does not match any application request, a
+temporary handle is created to receive the data, a StarPU-MPI request
+is created and added into the <em>ready requests list</em>, and thus will be
+processed in the first step of the next loop.
+</li>
+</ol>
 
 \ref MPIPtpCommunication "Communication" gives the list of all the
 point to point communications defined in StarPU-MPI.

+ 8 - 0
doc/doxygen/chapters/40environment_variables.doxy

@@ -314,6 +314,14 @@ is the default behaviour.
 Note: this currently only applies to <c>dm</c> and <c>dmda</c> scheduling policies.
 </dd>
 
+<dt>STARPU_CALIBRATE_MINIMUM</dt>
+<dd>
+\anchor STARPU_CALIBRATE_MINIMUM
+\addindex __env__STARPU_CALIBRATE_MINIMUM
+This defines the minimum number of calibration measurements that will be made
+before considering that the performance model is calibrated. The default value is 10.
+</dd>
+
 <dt>STARPU_BUS_CALIBRATE</dt>
 <dd>
 \anchor STARPU_BUS_CALIBRATE

+ 8 - 0
doc/doxygen/chapters/41configure_options.doxy

@@ -372,6 +372,14 @@ Enable performance debugging through gprof.
 Enable performance model debugging.
 </dd>
 
+<dt>--enable-paje-codelet-details</dt>
+<dd>
+\anchor enable-paje-codelet-details
+\addindex __configure__--enable-paje-codelet-details
+Enable details about codelets in the paje trace. This requires a recent enough
+version of ViTE (at least r1430).
+</dd>
+
 <dt>--enable-fxt-lock</dt>
 <dd>
 \anchor enable-fxt-lock

+ 5 - 0
examples/Makefile.am

@@ -190,6 +190,7 @@ examplebin_PROGRAMS +=				\
 	sched_ctx/dummy_sched_with_ctx		\
 	sched_ctx/prio				\
 	sched_ctx/sched_ctx_without_sched_policy\
+	sched_ctx/nested_sched_ctxs		\
 	worker_collections/worker_tree_example  \
 	worker_collections/worker_list_example  \
 	reductions/dot_product			\
@@ -270,6 +271,7 @@ STARPU_EXAMPLES +=				\
 	sched_ctx/prio				\
 	sched_ctx/dummy_sched_with_ctx		\
 	sched_ctx/sched_ctx_without_sched_policy\
+	sched_ctx/nested_sched_ctxs		\
 	worker_collections/worker_tree_example  \
 	worker_collections/worker_list_example  \
 	reductions/dot_product			\
@@ -925,6 +927,9 @@ sched_ctx_parallel_code_CFLAGS = \
 sched_ctx_sched_ctx_without_sched_policy_CFLAGS = \
 	$(AM_CFLAGS) -fopenmp
 
+sched_ctx_nested_sched_ctxs_CFLAGS = \
+	$(AM_CFLAGS) -fopenmp
+
 endif
 
 showcheck:

+ 1 - 0
examples/binary/binary.c

@@ -29,6 +29,7 @@ struct starpu_codelet cl =
 {
 #ifdef STARPU_USE_OPENCL
 	.opencl_funcs = {opencl_codelet, NULL},
+	.opencl_flags = {STARPU_OPENCL_ASYNC},
 #endif
 	.nbuffers = 1,
 	.modes = {STARPU_RW}

+ 3 - 1
examples/cpp/incrementer_cpp.cpp

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010-2011, 2013  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2012 inria
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -61,9 +61,11 @@ int main(int argc, char **argv)
         cl.cpu_funcs[0] = cpu_codelet;
 #ifdef STARPU_USE_CUDA
         cl.cuda_funcs[0] = cuda_codelet;
+	cl.cuda_flags[0] = STARPU_CUDA_ASYNC;
 #endif
 #ifdef STARPU_USE_OPENCL
 	cl.opencl_funcs[0] = opencl_codelet;
+	cl.opencl_flags[0] = STARPU_OPENCL_ASYNC;
 #endif
         cl.nbuffers = 1;
         cl.modes[0] = STARPU_RW;

+ 212 - 0
examples/sched_ctx/nested_sched_ctxs.c

@@ -0,0 +1,212 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <omp.h>
+
+#ifdef STARPU_QUICK_CHECK
+#define NTASKS 64
+#else
+#define NTASKS 100
+#endif
+
+int tasks_executed[2];
+starpu_pthread_mutex_t mut;
+
+int parallel_code(int sched_ctx)
+{
+	int i;
+	int t = 0;
+	int *cpuids = NULL;
+	int ncpuids = 0;
+	starpu_sched_ctx_get_available_cpuids(sched_ctx, &cpuids, &ncpuids);
+
+//	printf("execute task of %d threads \n", ncpuids);
+#pragma omp parallel num_threads(ncpuids)
+	{
+		starpu_sched_ctx_bind_current_thread_to_cpuid(cpuids[omp_get_thread_num()]);
+// 			printf("cpu = %d ctx%d nth = %d\n", sched_getcpu(), sched_ctx, omp_get_num_threads());
+#pragma omp for
+		for(i = 0; i < NTASKS; i++)
+			t++;
+	}
+
+	free(cpuids);
+	return t;
+}
+
+static void sched_ctx_func(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg)
+{
+	int w = starpu_worker_get_id();
+	unsigned sched_ctx = (unsigned)arg;
+	int n = parallel_code(sched_ctx);
+//	printf("w %d executed %d it \n", w, n);
+}
+
+
+static struct starpu_codelet sched_ctx_codelet =
+{
+	.cpu_funcs = {sched_ctx_func, NULL},
+	.cuda_funcs = {NULL},
+	.opencl_funcs = {NULL},
+	.model = NULL,
+	.nbuffers = 0,
+	.name = "sched_ctx"
+};
+
+int main(int argc, char **argv)
+{
+	tasks_executed[0] = 0;
+	tasks_executed[1] = 0;
+	int ntasks = NTASKS;
+	int ret, j, k;
+	unsigned ncpus = 0;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	starpu_pthread_mutex_init(&mut, NULL);
+	int nprocs1 = 1;
+	int nprocs2 = 1;
+	int *procs1, *procs2;
+
+#ifdef STARPU_USE_CPU
+	ncpus =  starpu_cpu_worker_get_count();
+	procs1 = (int*)malloc(ncpus*sizeof(int));
+	starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, procs1, ncpus);
+
+	if (ncpus > 1)
+	{
+		nprocs1 = ncpus/2;
+		nprocs2 =  nprocs1;
+		k = 0;
+		procs2 = (int*)malloc(nprocs2*sizeof(int));
+		for(j = nprocs1; j < nprocs1+nprocs2; j++)
+			procs2[k++] = procs1[j];
+	}
+	else
+	{
+		procs2 = (int*)malloc(nprocs2*sizeof(int));
+		procs2[0] = procs1[0];
+	}
+#endif
+
+	if (ncpus == 0)
+	{
+#ifdef STARPU_USE_CPU
+		free(procs1);
+		free(procs2);
+#endif
+		starpu_shutdown();
+		return 77;
+	}
+
+	/*create contexts however you want*/
+	unsigned sched_ctx1 = starpu_sched_ctx_create(procs1, nprocs1, "ctx1", STARPU_SCHED_CTX_POLICY_NAME, "eager", 0);
+	unsigned sched_ctx2 = starpu_sched_ctx_create(procs2, nprocs2, "ctx2", STARPU_SCHED_CTX_POLICY_NAME, "dmda", 0);
+
+	/*indicate what to do with the resources when context 2 finishes (it depends on your application)*/
+//	starpu_sched_ctx_set_inheritor(sched_ctx2, sched_ctx1);
+
+	int nprocs3 = nprocs1/2;
+	int nprocs4 = nprocs1/2;
+	int nprocs5 = nprocs2/2;
+	int nprocs6 = nprocs2/2;
+	int procs3[nprocs3];
+	int procs4[nprocs4];
+	int procs5[nprocs5];
+	int procs6[nprocs6];
+
+	k = 0;
+	for(j = 0; j < nprocs3; j++)
+		procs3[k++] = procs1[j];
+	k = 0;
+	for(j = nprocs3; j < nprocs3+nprocs4; j++)
+		procs4[k++] = procs1[j];
+
+	k = 0;
+	for(j = 0; j < nprocs5; j++)
+		procs5[k++] = procs2[j];
+	k = 0;
+	for(j = nprocs5; j < nprocs5+nprocs6; j++)
+		procs6[k++] = procs2[j];
+
+	unsigned sched_ctx3 = starpu_sched_ctx_create(procs3, nprocs3, "ctx3", STARPU_SCHED_CTX_NESTED, sched_ctx1, 0);
+	unsigned sched_ctx4 = starpu_sched_ctx_create(procs4, nprocs4, "ctx4", STARPU_SCHED_CTX_NESTED, sched_ctx1, 0);
+
+	unsigned sched_ctx5 = starpu_sched_ctx_create(procs5, nprocs5, "ctx5", STARPU_SCHED_CTX_NESTED, sched_ctx2, 0);
+	unsigned sched_ctx6 = starpu_sched_ctx_create(procs6, nprocs6, "ctx6", STARPU_SCHED_CTX_NESTED, sched_ctx2, 0);
+
+
+	int i;
+	for (i = 0; i < ntasks; i++)
+	{
+		struct starpu_task *task = starpu_task_create();
+
+		task->cl = &sched_ctx_codelet;
+		task->cl_arg = sched_ctx1;
+
+		/*submit tasks to context*/
+		ret = starpu_task_submit_to_ctx(task,sched_ctx1);
+
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	for (i = 0; i < ntasks; i++)
+	{
+		struct starpu_task *task = starpu_task_create();
+
+		task->cl = &sched_ctx_codelet;
+		task->cl_arg = sched_ctx2;
+
+		/*submit tasks to context*/
+		ret = starpu_task_submit_to_ctx(task,sched_ctx2);
+
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+
+	/* tell starpu when you finished submitting tasks to this context
+	   in order to allow moving resources from this context to the inheritor one
+	   when its corresponding tasks finished executing */
+
+
+
+	/* wait for all tasks at the end*/
+	starpu_task_wait_for_all();
+
+	starpu_sched_ctx_delete(sched_ctx3);
+	starpu_sched_ctx_delete(sched_ctx4);
+
+	starpu_sched_ctx_delete(sched_ctx5);
+	starpu_sched_ctx_delete(sched_ctx6);
+
+	starpu_sched_ctx_delete(sched_ctx1);
+	starpu_sched_ctx_delete(sched_ctx2);
+
+	printf("ctx%d: tasks starpu executed %d out of %d\n", sched_ctx1, tasks_executed[0], NTASKS);
+	printf("ctx%d: tasks starpu executed %d out of %d\n", sched_ctx2, tasks_executed[1], NTASKS);
+
+#ifdef STARPU_USE_CPU
+	free(procs1);
+	free(procs2);
+#endif
+	starpu_shutdown();
+	return 0;
+}

+ 6 - 9
examples/sched_ctx/sched_ctx_without_sched_policy.c

@@ -88,7 +88,6 @@ int main(int argc, char **argv)
 #ifdef STARPU_USE_CPU
 	ncpus = starpu_cpu_worker_get_count();
 	procs1 = (int*)malloc(ncpus*sizeof(int));
-	procs2 = (int*)malloc(ncpus*sizeof(int));
 	starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, procs1, ncpus);
 
 	if(ncpus > 1)
@@ -96,22 +95,16 @@ int main(int argc, char **argv)
 		nprocs1 = ncpus/2;
 		nprocs2 =  ncpus-nprocs1;
 		k = 0;
+		procs2 = (int*)malloc(nprocs2*sizeof(int));
 		for(j = nprocs1; j < nprocs1+nprocs2; j++)
 			procs2[k++] = procs1[j];
 	}
 	else
 	{
-		procs1 = (int*)malloc(nprocs1*sizeof(int));
 		procs2 = (int*)malloc(nprocs2*sizeof(int));
-		procs1[0] = 0;
-		procs2[0] = 0;
+		procs2[0] = procs1[0];
 
 	}
-#else
-	procs1 = (int*)malloc(nprocs1*sizeof(int));
-	procs2 = (int*)malloc(nprocs2*sizeof(int));
-	procs1[0] = 0;
-	procs2[0] = 0;
 #endif
 
 	if (ncpus == 0) goto enodev;
@@ -163,6 +156,10 @@ int main(int argc, char **argv)
 	printf("ctx%d: tasks starpu executed %d out of %d\n", sched_ctx2, tasks_executed[1], NTASKS*NTASKS);
 
 enodev:
+#ifdef STARPU_USE_CPU
+	free(procs1);
+	free(procs2);
+#endif
 	starpu_shutdown();
 	return ncpus == 0 ? 77 : 0;
 }

+ 1 - 0
examples/worker_collections/worker_list_example.c

@@ -85,6 +85,7 @@ int main()
 
 	FPRINTF(stderr, "timing init = %lf \n", timing);
 	co->deinit(co);
+	free(co);
 	starpu_shutdown();
 
 	return 0;

+ 1 - 0
include/starpu_config.h.in

@@ -32,6 +32,7 @@
 #undef STARPU_OPENMP
 
 #undef STARPU_SIMGRID
+#undef STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT
 
 #undef STARPU_HAVE_ICC
 

+ 8 - 0
include/starpu_sched_ctx.h

@@ -29,6 +29,7 @@ extern "C"
 #define STARPU_SCHED_CTX_POLICY_MIN_PRIO	 (3<<16)
 #define STARPU_SCHED_CTX_POLICY_MAX_PRIO	 (4<<16)
 #define STARPU_SCHED_CTX_HIERARCHY_LEVEL         (5<<16)
+#define STARPU_SCHED_CTX_NESTED                  (6<<16)
 
 unsigned starpu_sched_ctx_create(int *workerids_ctx, int nworkers_ctx, const char *sched_ctx_name, ...);
 
@@ -127,6 +128,13 @@ int starpu_sched_ctx_book_workers_for_task(unsigned sched_ctx_id, int *workerids
 
 void starpu_sched_ctx_unbook_workers_for_task(unsigned sched_ctx_id, int master);
 
+/* return the first context (child of sched_ctx_id) where the workerid is master */
+unsigned starpu_sched_ctx_worker_is_master_for_child_ctx(int workerid, unsigned sched_ctx_id);
+
+void starpu_sched_ctx_revert_task_counters(unsigned sched_ctx_id, double flops);
+
+void starpu_sched_ctx_move_task_to_ctx(struct starpu_task *task, unsigned sched_ctx);
+
 #ifdef STARPU_USE_SC_HYPERVISOR
 void starpu_sched_ctx_call_pushed_task_cb(int workerid, unsigned sched_ctx_id);
 #endif /* STARPU_USE_SC_HYPERVISOR */

+ 2 - 0
include/starpu_task.h

@@ -255,6 +255,8 @@ void starpu_task_destroy(struct starpu_task *task);
 int starpu_task_submit(struct starpu_task *task) STARPU_WARN_UNUSED_RESULT;
 int starpu_task_submit_to_ctx(struct starpu_task *task, unsigned sched_ctx_id);
 
+int starpu_task_finished(struct starpu_task *task) STARPU_WARN_UNUSED_RESULT;
+
 int starpu_task_wait(struct starpu_task *task) STARPU_WARN_UNUSED_RESULT;
 
 int starpu_task_wait_for_all(void);

+ 6 - 0
include/starpu_thread.h

@@ -200,6 +200,11 @@ int starpu_pthread_rwlock_unlock(starpu_pthread_rwlock_t *rwlock);
 
 #if defined(STARPU_SIMGRID) || !defined(STARPU_HAVE_PTHREAD_BARRIER)
 
+#if defined(STARPU_SIMGRID) && defined(STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT)
+typedef xbt_bar_t starpu_pthread_barrier_t;
+typedef int starpu_pthread_barrierattr_t;
+#define STARPU_PTHREAD_BARRIER_SERIAL_THREAD XBT_BARRIER_SERIAL_PROCESS
+#else
 typedef struct {
 	starpu_pthread_mutex_t mutex;
 	starpu_pthread_cond_t cond;
@@ -208,6 +213,7 @@ typedef struct {
 } starpu_pthread_barrier_t;
 typedef int starpu_pthread_barrierattr_t;
 #define STARPU_PTHREAD_BARRIER_SERIAL_THREAD -1
+#endif
 
 int starpu_pthread_barrier_init(starpu_pthread_barrier_t *barrier, const starpu_pthread_barrierattr_t *attr, unsigned count);
 int starpu_pthread_barrier_destroy(starpu_pthread_barrier_t *barrier);

+ 7 - 0
include/starpu_worker.h

@@ -57,10 +57,15 @@ struct starpu_worker_collection
 {
 	void *workerids;
 	unsigned nworkers;
+	void *masters;
+	unsigned nmasters;
 	int present[STARPU_NMAXWORKERS];
+	int is_master[STARPU_NMAXWORKERS];
 	enum starpu_worker_collection_type type;
 	unsigned (*has_next)(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it);
 	int (*get_next)(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it);
+	unsigned (*has_next_master)(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it);
+	int (*get_next_master)(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it);
 	int (*add)(struct starpu_worker_collection *workers, int worker);
 	int (*remove)(struct starpu_worker_collection *workers, int worker);
 	void (*init)(struct starpu_worker_collection *workers);
@@ -109,6 +114,8 @@ int starpu_worker_get_mp_nodeid(int id);
 struct starpu_tree* starpu_workers_get_tree(void);
 
 unsigned starpu_worker_get_sched_ctx_list(int worker, unsigned **sched_ctx);
+
+unsigned starpu_worker_is_slave(int workerid);
 #ifdef __cplusplus
 }
 #endif

+ 3 - 1
mpi/include/starpu_mpi.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2009-2012, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -71,6 +71,8 @@ void starpu_mpi_comm_amounts_retrieve(size_t *comm_amounts);
 void starpu_mpi_cache_flush(MPI_Comm comm, starpu_data_handle_t data_handle);
 void starpu_mpi_cache_flush_all_data(MPI_Comm comm);
 
+int starpu_mpi_world_rank(void);
+
 int starpu_mpi_get_communication_tag(void);
 void starpu_mpi_set_communication_tag(int tag);
 

+ 61 - 45
mpi/src/starpu_mpi.c

@@ -30,7 +30,7 @@
 #include <datawizard/coherency.h>
 
 static void _starpu_mpi_add_sync_point_in_fxt(void);
-static void _starpu_mpi_submit_new_mpi_request(void *arg);
+static void _starpu_mpi_submit_ready_request(void *arg);
 static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req);
 #ifdef STARPU_VERBOSE
 static char *_starpu_mpi_request_type(enum _starpu_mpi_request_type request_type);
@@ -46,8 +46,8 @@ static struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t dat
 							ssize_t count);
 static void _starpu_mpi_handle_detached_request(struct _starpu_mpi_req *req);
 
-/* The list of requests that have been newly submitted by the application */
-static struct _starpu_mpi_req_list *new_requests;
+/* The list of ready requests */
+static struct _starpu_mpi_req_list *ready_requests;
 
 /* The list of detached requests that have already been submitted to MPI */
 static struct _starpu_mpi_req_list *detached_requests;
@@ -61,7 +61,7 @@ static starpu_pthread_mutex_t mutex;
 static starpu_pthread_t progress_thread;
 static int running = 0;
 
-/* Count requests posted by the application and not yet submitted to MPI, i.e pushed into the new_requests list */
+/* Count requests posted by the application and not yet submitted to MPI */
 static starpu_pthread_mutex_t mutex_posted_requests;
 static int posted_requests = 0, newer_requests, barrier_running = 0;
 
@@ -151,9 +151,9 @@ static void _starpu_mpi_request_init(struct _starpu_mpi_req **req)
 	req->count = count;
 
 	/* Asynchronously request StarPU to fetch the data in main memory: when
-	 * it is available in main memory, _starpu_mpi_submit_new_mpi_request(req) is called and
+	 * it is available in main memory, _starpu_mpi_submit_ready_request(req) is called and
 	 * the request is actually submitted */
-	starpu_data_acquire_cb_sequential_consistency(data_handle, mode, _starpu_mpi_submit_new_mpi_request, (void *)req, sequential_consistency);
+	starpu_data_acquire_cb_sequential_consistency(data_handle, mode, _starpu_mpi_submit_ready_request, (void *)req, sequential_consistency);
 
 	_STARPU_MPI_LOG_OUT();
 	return req;
@@ -447,7 +447,7 @@ int starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status)
 	waiting_req->func = _starpu_mpi_wait_func;
 	waiting_req->request_type = WAIT_REQ;
 
-	_starpu_mpi_submit_new_mpi_request(waiting_req);
+	_starpu_mpi_submit_ready_request(waiting_req);
 
 	/* We wait for the MPI request to finish */
 	STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
@@ -532,7 +532,7 @@ int starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 		testing_req->request_type = TEST_REQ;
 
 		_STARPU_MPI_INC_POSTED_REQUESTS(1);
-		_starpu_mpi_submit_new_mpi_request(testing_req);
+		_starpu_mpi_submit_ready_request(testing_req);
 
 		/* We wait for the test request to finish */
 		STARPU_PTHREAD_MUTEX_LOCK(&(testing_req->req_mutex));
@@ -619,7 +619,7 @@ int starpu_mpi_barrier(MPI_Comm comm)
 	barrier_req->comm = comm;
 
 	_STARPU_MPI_INC_POSTED_REQUESTS(1);
-	_starpu_mpi_submit_new_mpi_request(barrier_req);
+	_starpu_mpi_submit_ready_request(barrier_req);
 
 	/* We wait for the MPI request to finish */
 	STARPU_PTHREAD_MUTEX_LOCK(&barrier_req->req_mutex);
@@ -785,24 +785,25 @@ static void _starpu_mpi_early_data_cb(void* arg)
 	free(args);
 }
 
-static void _starpu_mpi_submit_new_mpi_request(void *arg)
+static void _starpu_mpi_submit_ready_request(void *arg)
 {
 	_STARPU_MPI_LOG_IN();
 	struct _starpu_mpi_req *req = arg;
 
 	_STARPU_MPI_INC_POSTED_REQUESTS(-1);
 
-	_STARPU_MPI_DEBUG(3, "calling _starpu_mpi_submit_new_mpi_request with req %p srcdst %d tag %d and type %s\n", req, req->srcdst, req->mpi_tag, _starpu_mpi_request_type(req->request_type));
+	_STARPU_MPI_DEBUG(3, "new req %p srcdst %d tag %d and type %s\n", req, req->srcdst, req->mpi_tag, _starpu_mpi_request_type(req->request_type));
 
 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 
 	if (req->request_type == RECV_REQ)
 	{
-		/* Case : the request is the internal receive request submitted by StarPU-MPI to receive
-		 * incoming data without a matching pending receive already submitted by the application.
-		 * We immediately allocate the pointer associated to the data_handle, and pushing it into
-		 * the list of new_requests, so as the real MPI request can be submitted before the next
-		 * submission of the envelope-catching request. */
+		/* Case : the request is the internal receive request submitted
+		 * by StarPU-MPI to receive incoming data without a matching
+		 * early_request from the application. We immediately allocate the
+		 * pointer associated to the data_handle, and push it into the
+		 * ready_requests list, so as the real MPI request can be submitted
+		 * before the next submission of the envelope-catching request. */
 		if (req->is_internal_req)
 		{
 			_starpu_mpi_handle_allocate_datatype(req->data_handle, &req->datatype, &req->user_datatype);
@@ -818,10 +819,12 @@ static void _starpu_mpi_submit_new_mpi_request(void *arg)
 				STARPU_ASSERT_MSG(req->ptr, "cannot allocate message of size %ld\n", req->count);
 			}
 
-			_STARPU_MPI_DEBUG(3, "Pushing internal starpu_mpi_irecv request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n", req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr, _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
-			_starpu_mpi_req_list_push_front(new_requests, req);
+			_STARPU_MPI_DEBUG(3, "Pushing internal starpu_mpi_irecv request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n",
+					  req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr,
+					  _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
+			_starpu_mpi_req_list_push_front(ready_requests, req);
 
-			/* inform the starpu mpi thread that the request has beenbe pushed in the new_requests list */
+			/* inform the starpu mpi thread that the request has been pushed in the ready_requests list */
 			STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 			STARPU_PTHREAD_MUTEX_LOCK(&req->posted_mutex);
 			req->posted = 1;
@@ -834,10 +837,10 @@ static void _starpu_mpi_submit_new_mpi_request(void *arg)
 			/* test whether the receive request has already been submitted internally by StarPU-MPI*/
 			struct _starpu_mpi_early_data_handle *early_data_handle = _starpu_mpi_early_data_find(req->mpi_tag, req->srcdst);
 
-			/* Case : the request has already been submitted internally by StarPU.
-			 * We'll asynchronously ask a Read permission over the temporary handle, so as when
-			 * the internal receive will be over, the _starpu_mpi_early_data_cb function will be called to
-			 * bring the data back to the original data handle associated to the request.*/
+			/* Case: a receive request for a data with the given tag and source has already been
+			 * posted by StarPU. Asynchronously request a Read permission over the temporary handle,
+			 * so as when the internal receive is completed, the _starpu_mpi_early_data_cb function
+			 * will be called to bring the data back to the original data handle associated to the request.*/
 			if (early_data_handle)
 			{
 				STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
@@ -861,8 +864,7 @@ static void _starpu_mpi_submit_new_mpi_request(void *arg)
 				_STARPU_MPI_DEBUG(3, "Calling data_acquire_cb on starpu_mpi_copy_cb..\n");
 				starpu_data_acquire_cb(early_data_handle->handle,STARPU_R,_starpu_mpi_early_data_cb,(void*) cb_args);
 			}
-			/* Case : a classic receive request with no send received earlier than expected.
-			 * We just add the pending receive request to the requests' hashmap. */
+			/* Case: no matching data has been received. Store the receive request as an early_request. */
 			else
 			{
 				_STARPU_MPI_DEBUG(3, "Adding the pending receive request %p (srcdst %d tag %d) into the request hashmap\n", req, req->srcdst, req->mpi_tag);
@@ -872,7 +874,7 @@ static void _starpu_mpi_submit_new_mpi_request(void *arg)
 	}
 	else
 	{
-		_starpu_mpi_req_list_push_front(new_requests, req);
+		_starpu_mpi_req_list_push_front(ready_requests, req);
 		_STARPU_MPI_DEBUG(3, "Pushing new request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n",
 				  req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr, _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
 	}
@@ -986,7 +988,7 @@ static void _starpu_mpi_handle_detached_request(struct _starpu_mpi_req *req)
 	}
 }
 
-static void _starpu_mpi_handle_new_request(struct _starpu_mpi_req *req)
+static void _starpu_mpi_handle_ready_request(struct _starpu_mpi_req *req)
 {
 	_STARPU_MPI_LOG_IN();
 	STARPU_ASSERT_MSG(req, "Invalid request");
@@ -1080,10 +1082,10 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
  	int header_req_submitted = 0;
 
-	while (running || posted_requests || !(_starpu_mpi_req_list_empty(new_requests)) || !(_starpu_mpi_req_list_empty(detached_requests)))
+	while (running || posted_requests || !(_starpu_mpi_req_list_empty(ready_requests)) || !(_starpu_mpi_req_list_empty(detached_requests)))
 	{
 		/* shall we block ? */
-		unsigned block = _starpu_mpi_req_list_empty(new_requests) && _starpu_mpi_early_request_count() == 0;
+		unsigned block = _starpu_mpi_req_list_empty(ready_requests) && _starpu_mpi_early_request_count() == 0;
 
 #ifndef STARPU_MPI_ACTIVITY
 		STARPU_PTHREAD_MUTEX_LOCK(&detached_requests_mutex);
@@ -1107,21 +1109,22 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
 		/* get one request */
 		struct _starpu_mpi_req *req;
-		while (!_starpu_mpi_req_list_empty(new_requests))
+		while (!_starpu_mpi_req_list_empty(ready_requests))
 		{
-			req = _starpu_mpi_req_list_pop_back(new_requests);
+			req = _starpu_mpi_req_list_pop_back(ready_requests);
 
 			/* handling a request is likely to block for a while
 			 * (on a sync_data_with_mem call), we want to let the
 			 * application submit requests in the meantime, so we
 			 * release the lock. */
 			STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
-			_starpu_mpi_handle_new_request(req);
+			_starpu_mpi_handle_ready_request(req);
 			STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 		}
 
-		/* If there is no currently submitted header_req submitted to catch envelopes from senders, and there is some pending receive
-		 * requests in our side, we resubmit a header request. */
+		/* If there is no header_req currently submitted to catch
+                 * envelopes from senders, and there are some pending
+                 * receive requests on our side, we resubmit a header request. */
 		MPI_Request header_req;
 		if ((_starpu_mpi_early_request_count() > 0) && (header_req_submitted == 0))// && (HASH_COUNT(_starpu_mpi_early_data_handle_hashmap) == 0))
 		{
@@ -1151,11 +1154,14 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
 				struct _starpu_mpi_req *found_req = _starpu_mpi_early_request_find(recv_env->mpi_tag, status.MPI_SOURCE);
 
-				/* Case : a data will arrive before the matching receive has been submitted in our side of the application.
-				 * We will allow a temporary handle to store the incoming data, by submitting a starpu_mpi_irecv_detached
-				 * on this handle, and register this so as the StarPU-MPI layer can remember it.*/
+				/* Case: a data will arrive before a matching receive is
+				 * posted by the application. Create a temporary handle to
+				 * store the incoming data, submit a starpu_mpi_irecv_detached
+				 * on this handle, and store it as an early_data
+				 */
 				if (!found_req)
 				{
+
 					_STARPU_MPI_DEBUG(3, "Request with tag %d and source %d not found, creating a early_handle to receive incoming data..\n", recv_env->mpi_tag, status.MPI_SOURCE);
 
 					starpu_data_handle_t data_handle = NULL;
@@ -1198,8 +1204,8 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 					STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 
 					// We wait until the request is pushed in the
-					// new_request list, that ensures that the next loop
-					// will call _starpu_mpi_handle_new_request
+					// ready_request list, that ensures that the next loop
+					// will call _starpu_mpi_handle_ready_request
 					// on the request and post the corresponding mpi_irecv,
 					// otherwise, it may lead to read data as envelop
 					STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
@@ -1214,8 +1220,11 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 					STARPU_PTHREAD_MUTEX_UNLOCK(&early_data_handle->req_mutex);
 					STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 				}
-				/* Case : a matching receive has been found for the incoming data, we handle the correct allocation of the pointer associated to
-				 * the data handle, then submit the corresponding receive with _starpu_mpi_handle_new_request. */
+				/* Case: a matching application request has been found for
+				 * the incoming data, we handle the correct allocation
+				 * of the pointer associated to the data handle, then
+				 * submit the corresponding receive with
+				 * _starpu_mpi_handle_ready_request. */
 				else
 				{
 					_STARPU_MPI_DEBUG(3, "A matching receive has been found for the incoming data with tag %d\n", recv_env->mpi_tag);
@@ -1242,7 +1251,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 					 * application submit requests in the meantime, so we
 					 * release the lock. */
 					STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
-					_starpu_mpi_handle_new_request(found_req);
+					_starpu_mpi_handle_ready_request(found_req);
 					STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 				}
 				header_req_submitted = 0;
@@ -1255,7 +1264,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 	}
 
 	STARPU_ASSERT_MSG(_starpu_mpi_req_list_empty(detached_requests), "List of detached requests not empty");
-	STARPU_ASSERT_MSG(_starpu_mpi_req_list_empty(new_requests), "List of new requests not empty");
+	STARPU_ASSERT_MSG(_starpu_mpi_req_list_empty(ready_requests), "List of ready requests not empty");
 	STARPU_ASSERT_MSG(posted_requests == 0, "Number of posted request is not zero");
 	_starpu_mpi_early_request_check_termination();
 	_starpu_mpi_early_data_check_termination();
@@ -1326,7 +1335,7 @@ int _starpu_mpi_initialize(int *argc, char ***argv, int initialize_mpi)
 	STARPU_PTHREAD_MUTEX_INIT(&mutex, NULL);
 	STARPU_PTHREAD_COND_INIT(&cond_progression, NULL);
 	STARPU_PTHREAD_COND_INIT(&cond_finished, NULL);
-	new_requests = _starpu_mpi_req_list_new();
+	ready_requests = _starpu_mpi_req_list_new();
 
 	STARPU_PTHREAD_MUTEX_INIT(&detached_requests_mutex, NULL);
 	detached_requests = _starpu_mpi_req_list_new();
@@ -1402,7 +1411,7 @@ int starpu_mpi_shutdown(void)
 
 	/* free the request queues */
 	_starpu_mpi_req_list_delete(detached_requests);
-	_starpu_mpi_req_list_delete(new_requests);
+	_starpu_mpi_req_list_delete(ready_requests);
 
 	_starpu_mpi_comm_amounts_display(rank);
 	_starpu_mpi_comm_amounts_free();
@@ -1423,3 +1432,10 @@ void starpu_mpi_data_register(starpu_data_handle_t data_handle, int tag, int ran
 	_starpu_data_set_unregister_hook(data_handle, _starpu_mpi_clear_cache);
 
 }
+
+int starpu_mpi_world_rank(void)
+{
+	int rank;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	return rank;
+}

+ 1 - 0
src/Makefile.am

@@ -181,6 +181,7 @@ libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES = 		\
 	sched_policies/eager_central_policy.c			\
 	sched_policies/eager_central_priority_policy.c		\
 	sched_policies/work_stealing_policy.c			\
+	sched_policies/locality_work_stealing_policy.c		\
 	sched_policies/deque_modeling_policy_data_aware.c	\
 	sched_policies/random_policy.c				\
 	sched_policies/stack_queues.c				\

+ 42 - 4
src/common/fxt.h

@@ -106,6 +106,9 @@
 #define _STARPU_FUT_TASK_WAIT_FOR_ALL	0x513b
 
 #define _STARPU_FUT_EVENT	0x513c
+#define _STARPU_FUT_THREAD_EVENT	0x513d
+
+#define	_STARPU_FUT_CODELET_DETAILS	0x513e
 
 #define _STARPU_FUT_LOCKING_MUTEX	0x5140	
 #define _STARPU_FUT_MUTEX_LOCKED	0x5141	
@@ -193,6 +196,31 @@ void _starpu_fxt_register_thread(unsigned);
 #define _STARPU_FUT_COMMIT(size) do { } while (0)
 #endif
 
+#ifdef FUT_DO_PROBE1STR
+#define _STARPU_FUT_DO_PROBE1STR(CODE, P1, str) FUT_DO_PROBE1STR(CODE, P1, str)
+#else
+/* Sometimes we need something a little more specific than the wrappers from
+ * FxT: these macros make it possible to add an event with a few numbers
+ * followed by a string. */
+#define _STARPU_FUT_DO_PROBE1STR(CODE, P1, str)			\
+do {									\
+    if(fut_active) {							\
+	/* No more than FXT_MAX_PARAMS args are allowed */		\
+	/* we add a \0 just in case ... */				\
+	size_t len = STARPU_MIN(strlen(str)+1, (FXT_MAX_PARAMS - 1)*sizeof(unsigned long));\
+	unsigned nbargs_str = (len + sizeof(unsigned long) - 1)/(sizeof(unsigned long));\
+	unsigned nbargs = 1 + nbargs_str;				\
+	size_t total_len = FUT_SIZE(nbargs);				\
+	unsigned long *futargs =					\
+		fut_getstampedbuffer(FUT_CODE(CODE, nbargs), total_len);\
+	*(futargs++) = (unsigned long)(P1);				\
+	snprintf((char *)futargs, len, "%s", str);			\
+	((char *)futargs)[len - 1] = '\0';				\
+	_STARPU_FUT_COMMIT(total_len);					\
+    }									\
+} while (0);
+#endif
+
 #ifdef FUT_DO_PROBE2STR
 #define _STARPU_FUT_DO_PROBE2STR(CODE, P1, P2, str) FUT_DO_PROBE2STR(CODE, P1, P2, str)
 #else
@@ -297,7 +325,7 @@ do {									\
 #ifdef FUT_DO_PROBE6STR
 #define _STARPU_FUT_DO_PROBE6STR(CODE, P1, P2, P3, P4, P5, P6, str) FUT_DO_PROBE6STR(CODE, P1, P2, P3, P4, P5, P6, str)
 #else
-#define _STARPU_FUT_DO_PROBE5STR(CODE, P1, P2, P3, P4, P5, P6, str)	\
+#define _STARPU_FUT_DO_PROBE6STR(CODE, P1, P2, P3, P4, P5, P6, str)	\
 do {									\
     if(fut_active) {							\
 	/* No more than FXT_MAX_PARAMS args are allowed */		\
@@ -324,7 +352,7 @@ do {									\
 #ifdef FUT_DO_PROBE7STR
 #define _STARPU_FUT_DO_PROBE7STR(CODE, P1, P2, P3, P4, P5, P6, P7, str) FUT_DO_PROBE7STR(CODE, P1, P2, P3, P4, P5, P6, P7, str)
 #else
-#define _STARPU_FUT_DO_PROBE6STR(CODE, P1, P2, P3, P4, P5, P6, P7, str)	\
+#define _STARPU_FUT_DO_PROBE7STR(CODE, P1, P2, P3, P4, P5, P6, P7, str)	\
 do {									\
     if(fut_active) {							\
 	/* No more than FXT_MAX_PARAMS args are allowed */		\
@@ -378,7 +406,7 @@ do {									\
 #define _STARPU_TRACE_WORKER_INIT_END(workerid)				\
 	FUT_DO_PROBE2(_STARPU_FUT_WORKER_INIT_END, _starpu_gettid(), (workerid));
 
-#define _STARPU_TRACE_START_CODELET_BODY(job)				\
+#define _STARPU_TRACE_START_CODELET_BODY(job, nimpl, archtype)				\
 do {									\
         const char *model_name = _starpu_job_get_model_name((job));         \
 	if (model_name)                                                 \
@@ -389,6 +417,11 @@ do {									\
 	else {                                                          \
 		FUT_DO_PROBE4(_STARPU_FUT_START_CODELET_BODY, (job), ((job)->task)->sched_ctx, _starpu_gettid(), 0); \
 	}								\
+	{								\
+		const size_t __job_size = _starpu_job_get_data_size((job)->task->cl?(job)->task->cl->model:NULL, archtype, nimpl, (job));	\
+		const uint32_t __job_hash = _starpu_compute_buffers_footprint((job)->task->cl?(job)->task->cl->model:NULL, archtype, nimpl, (job));\
+		FUT_DO_PROBE6(_STARPU_FUT_CODELET_DETAILS, (job), ((job)->task)->sched_ctx, __job_size, __job_hash, (job)->task->tag_id, _starpu_gettid());	\
+	}								\
 } while(0);
 
 #define _STARPU_TRACE_END_CODELET_BODY(job, nimpl, archtype)			\
@@ -563,6 +596,9 @@ do {										\
 #define _STARPU_TRACE_EVENT(S)			\
 	FUT_DO_PROBESTR(_STARPU_FUT_EVENT,S)
 
+#define _STARPU_TRACE_THREAD_EVENT(S)			\
+	_STARPU_FUT_DO_PROBE1STR(_STARPU_FUT_THREAD_EVENT, _starpu_gettid(), S)
+
 #define _STARPU_TRACE_HYPERVISOR_BEGIN()  \
 	FUT_DO_PROBE1(_STARPU_FUT_HYPERVISOR_BEGIN, _starpu_gettid());
 
@@ -746,7 +782,7 @@ do {										\
 #define _STARPU_TRACE_NEW_MEM_NODE(nodeid)	do {} while(0)
 #define _STARPU_TRACE_WORKER_INIT_START(a,b,c)	do {} while(0)
 #define _STARPU_TRACE_WORKER_INIT_END(workerid)	do {} while(0)
-#define _STARPU_TRACE_START_CODELET_BODY(job)	do {} while(0)
+#define _STARPU_TRACE_START_CODELET_BODY(job, nimpl, archtype)	do {} while(0)
 #define _STARPU_TRACE_END_CODELET_BODY(job, nimpl, a)	do {} while(0)
 #define _STARPU_TRACE_START_CALLBACK(job)	do {} while(0)
 #define _STARPU_TRACE_END_CALLBACK(job)		do {} while(0)
@@ -794,6 +830,8 @@ do {										\
 #define _STARPU_TRACE_USER_EVENT(code)		do {} while(0)
 #define _STARPU_TRACE_SET_PROFILING(status)	do {} while(0)
 #define _STARPU_TRACE_TASK_WAIT_FOR_ALL		do {} while(0)
+#define _STARPU_TRACE_EVENT(S)		do {} while(0)
+#define _STARPU_TRACE_THREAD_EVENT(S)		do {} while(0)
 #define _STARPU_TRACE_LOCKING_MUTEX()			do {} while(0)
 #define _STARPU_TRACE_MUTEX_LOCKED()			do {} while(0)
 #define _STARPU_TRACE_UNLOCKING_MUTEX()		do {} while(0)

+ 27 - 1
src/common/thread.c

@@ -288,9 +288,35 @@ int starpu_pthread_rwlock_unlock(starpu_pthread_rwlock_t *rwlock)
 
 	return p_ret;
 }
+
+#if defined(STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT)
+int starpu_pthread_barrier_init(starpu_pthread_barrier_t *restrict barrier, const starpu_pthread_barrierattr_t *restrict attr, unsigned count)
+{
+	*barrier = xbt_barrier_init(count);
+	return 0;
+}
+
+int starpu_pthread_barrier_destroy(starpu_pthread_barrier_t *barrier)
+{
+	if (*barrier)
+		xbt_barrier_destroy(*barrier);
+	return 0;
+}
+
+int starpu_pthread_barrier_wait(starpu_pthread_barrier_t *barrier)
+{
+	_STARPU_TRACE_BARRIER_WAIT_BEGIN();
+
+	xbt_barrier_wait(*barrier);
+
+	_STARPU_TRACE_BARRIER_WAIT_END();
+	return 0;
+}
+#endif /* defined(STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT) */
+
 #endif /* STARPU_SIMGRID */
 
-#if defined(STARPU_SIMGRID) || !defined(STARPU_HAVE_PTHREAD_BARRIER)
+#if (defined(STARPU_SIMGRID) && !defined(STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT)) || !defined(STARPU_HAVE_PTHREAD_BARRIER)
 int starpu_pthread_barrier_init(starpu_pthread_barrier_t *restrict barrier, const starpu_pthread_barrierattr_t *restrict attr, unsigned count)
 {
 	int ret = starpu_pthread_mutex_init(&barrier->mutex, NULL);

+ 9 - 0
src/core/jobs.c

@@ -116,6 +116,15 @@ void _starpu_job_destroy(struct _starpu_job *j)
 	_starpu_job_delete(j);
 }
 
+int _starpu_job_finished(struct _starpu_job *j)
+{
+	int ret;
+	STARPU_PTHREAD_MUTEX_LOCK(&j->sync_mutex);
+	ret = j->terminated == 2;
+	STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
+	return ret;
+}
+
 void _starpu_wait_job(struct _starpu_job *j)
 {
 	STARPU_ASSERT(j->task);

+ 3 - 0
src/core/jobs.h

@@ -182,6 +182,9 @@ struct _starpu_job* STARPU_ATTRIBUTE_MALLOC _starpu_job_create(struct starpu_tas
 /* Destroy the data structure associated to the job structure */
 void _starpu_job_destroy(struct _starpu_job *j);
 
+/* Test for the termination of the job */
+int _starpu_job_finished(struct _starpu_job *j);
+
 /* Wait for the termination of the job */
 void _starpu_wait_job(struct _starpu_job *j);
 

+ 66 - 2
src/core/sched_ctx.c

@@ -60,7 +60,11 @@ void _starpu_worker_gets_out_of_ctx(unsigned sched_ctx_id, struct _starpu_worker
 	{
 		struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 		if(sched_ctx && sched_ctx->sched_policy && sched_ctx->sched_policy->remove_workers)
+		{
+			_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
 			sched_ctx->sched_policy->remove_workers(sched_ctx_id, &worker->workerid, 1);
+			_STARPU_TRACE_WORKER_SCHEDULING_POP;
+		}
 		_starpu_sched_ctx_list_remove(&worker->sched_ctx_list, sched_ctx_id);
 		worker->nsched_ctxs--;
 	}
@@ -185,6 +189,7 @@ static void _starpu_add_workers_to_sched_ctx(struct _starpu_sched_ctx *sched_ctx
 	}
 	else if(sched_ctx->sched_policy->add_workers)
 	{
+		_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
 		if(added_workers)
 		{
 			if(*n_added_workers > 0)
@@ -192,6 +197,7 @@ static void _starpu_add_workers_to_sched_ctx(struct _starpu_sched_ctx *sched_ctx
 		}
 		else
 			sched_ctx->sched_policy->add_workers(sched_ctx->id, workers_to_add, nworkers_to_add);
+		_STARPU_TRACE_WORKER_SCHEDULING_POP;
 	}
 	return;
 }
@@ -229,7 +235,11 @@ static void _starpu_sched_ctx_free_scheduling_data(struct _starpu_sched_ctx *sch
 	unsigned nworkers_ctx = starpu_sched_ctx_get_workers_list(sched_ctx->id, &workerids);
 
 	if(nworkers_ctx > 0 && sched_ctx->sched_policy->remove_workers)
+	{
+		_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
 		sched_ctx->sched_policy->remove_workers(sched_ctx->id, workerids, nworkers_ctx);
+		_STARPU_TRACE_WORKER_SCHEDULING_POP;
+	}
 
 	free(workerids);
 	return;
@@ -523,6 +533,7 @@ unsigned starpu_sched_ctx_create(int *workerids, int nworkers, const char *sched
 	int max_prio = 0;
 	struct starpu_sched_policy *sched_policy = NULL;
 	unsigned hierarchy_level = 0;
+	unsigned nesting_sched_ctx = STARPU_NMAX_SCHED_CTXS;
 
 	va_start(varg_list, sched_ctx_name);
 	while ((arg_type = va_arg(varg_list, int)) != 0)
@@ -551,6 +562,10 @@ unsigned starpu_sched_ctx_create(int *workerids, int nworkers, const char *sched
 		{
 			hierarchy_level = va_arg(varg_list, unsigned);
 		}
+		else if (arg_type == STARPU_SCHED_CTX_NESTED)
+		{
+			nesting_sched_ctx = va_arg(varg_list, unsigned);
+		}
 		else
 		{
 			STARPU_ABORT_MSG("Unrecognized argument %d\n", arg_type);
@@ -562,6 +577,7 @@ unsigned starpu_sched_ctx_create(int *workerids, int nworkers, const char *sched
 	struct _starpu_sched_ctx *sched_ctx = NULL;
 	sched_ctx = _starpu_create_sched_ctx(sched_policy, workerids, nworkers, 0, sched_ctx_name, min_prio_set, min_prio, max_prio_set, max_prio);
 	sched_ctx->hierarchy_level = hierarchy_level;
+	sched_ctx->nesting_sched_ctx = nesting_sched_ctx;
 
 	_starpu_unlock_mutex_if_prev_locked();
 	int *added_workerids;
@@ -1132,6 +1148,8 @@ struct starpu_worker_collection* starpu_sched_ctx_create_worker_collection(unsig
 	case STARPU_WORKER_TREE:
 		sched_ctx->workers->has_next = worker_tree.has_next;
 		sched_ctx->workers->get_next = worker_tree.get_next;
+		sched_ctx->workers->has_next_master = worker_tree.has_next_master;
+		sched_ctx->workers->get_next_master = worker_tree.get_next_master;
 		sched_ctx->workers->add = worker_tree.add;
 		sched_ctx->workers->remove = worker_tree.remove;
 		sched_ctx->workers->init = worker_tree.init;
@@ -1144,6 +1162,8 @@ struct starpu_worker_collection* starpu_sched_ctx_create_worker_collection(unsig
 	default:
 		sched_ctx->workers->has_next = worker_list.has_next;
 		sched_ctx->workers->get_next = worker_list.get_next;
+		sched_ctx->workers->has_next_master = worker_list.has_next_master;
+		sched_ctx->workers->get_next_master = worker_list.get_next_master;
 		sched_ctx->workers->add = worker_list.add;
 		sched_ctx->workers->remove = worker_list.remove;
 		sched_ctx->workers->init = worker_list.init;
@@ -1171,6 +1191,7 @@ void starpu_sched_ctx_display_workers(unsigned sched_ctx_id, FILE *f)
 		starpu_worker_get_name(workerids[i], name, 256);
 		fprintf(f, "\t\t%s\n", name);
 	}
+	free(workerids);
 }
 
 unsigned starpu_sched_ctx_get_workers_list(unsigned sched_ctx_id, int **workerids)
@@ -1605,6 +1626,44 @@ void starpu_sched_ctx_bind_current_thread_to_cpuid(unsigned cpuid STARPU_ATTRIBU
 
 }
 
+unsigned starpu_sched_ctx_worker_is_master_for_child_ctx(int workerid, unsigned sched_ctx_id)
+{
+	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
+	struct _starpu_sched_ctx_list *l = NULL;
+	struct _starpu_sched_ctx *sched_ctx = NULL;
+	for (l = worker->sched_ctx_list; l; l = l->next)
+	{ 
+		 sched_ctx = _starpu_get_sched_ctx_struct(l->sched_ctx);
+		if(sched_ctx-> main_master == workerid && sched_ctx->nesting_sched_ctx == sched_ctx_id)
+			return sched_ctx->id;
+	}
+	return STARPU_NMAX_SCHED_CTXS;
+
+}
+
+void starpu_sched_ctx_revert_task_counters(unsigned sched_ctx_id, double flops)
+{
+        _starpu_decrement_nsubmitted_tasks_of_sched_ctx(sched_ctx_id);
+        _starpu_decrement_nready_tasks_of_sched_ctx(sched_ctx_id, flops);
+}
+
+void starpu_sched_ctx_move_task_to_ctx(struct starpu_task *task, unsigned sched_ctx)
+{
+	int workerid = starpu_worker_get_id();
+	struct _starpu_worker *worker  = NULL;
+	if(workerid != -1)
+	{
+		worker = _starpu_get_worker_struct(workerid);
+		STARPU_PTHREAD_MUTEX_UNLOCK(&worker->sched_mutex);
+	}
+
+	task->sched_ctx = sched_ctx;
+	_starpu_task_submit_nodeps(task);
+
+	if(workerid != -1)
+		STARPU_PTHREAD_MUTEX_LOCK(&worker->sched_mutex);
+}
+
 static unsigned _worker_sleeping_in_other_ctx(unsigned sched_ctx_id, int workerid)
 {
 	int s;
@@ -1620,6 +1679,7 @@ static unsigned _worker_sleeping_in_other_ctx(unsigned sched_ctx_id, int workeri
 	return 0;
 
 }
+
 static void _starpu_sched_ctx_get_workers_to_sleep(unsigned sched_ctx_id, int *workerids, int nworkers, int master)
 {
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
@@ -1643,7 +1703,6 @@ static void _starpu_sched_ctx_get_workers_to_sleep(unsigned sched_ctx_id, int *w
 		workerid = workerids[w];
 		if((current_worker_id == -1 || workerid != current_worker_id) && !sleeping[w])
 		{
-			sched_ctx->sleeping[workerids[w]] = 1;
 			sem_wait(&sched_ctx->fall_asleep_sem[master]);
 		}
 	}
@@ -1652,7 +1711,10 @@ static void _starpu_sched_ctx_get_workers_to_sleep(unsigned sched_ctx_id, int *w
 
 void _starpu_sched_ctx_signal_worker_blocked(unsigned sched_ctx_id, int workerid)
 {
+	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
+	worker->slave = 1;
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	sched_ctx->sleeping[workerid] = 1;
 	int master = sched_ctx->master[workerid];
 	sem_post(&sched_ctx->fall_asleep_sem[master]);
 
@@ -1666,6 +1728,9 @@ void _starpu_sched_ctx_signal_worker_woke_up(unsigned sched_ctx_id, int workerid
 	sem_post(&sched_ctx->wake_up_sem[master]);
 	sched_ctx->sleeping[workerid] = 0;
 	sched_ctx->master[workerid] = -1;
+	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
+	worker->slave = 0;
+
 	return;
 }
 
@@ -1720,7 +1785,6 @@ void starpu_sched_ctx_get_available_cpuids(unsigned sched_ctx_id, int **cpuids,
 	int current_worker_id = starpu_worker_get_id();
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 	struct starpu_worker_collection *workers = sched_ctx->workers;
-
 	(*cpuids) = (int*)malloc(workers->nworkers*sizeof(int));
 	int w = 0;
 

+ 3 - 0
src/core/sched_ctx.h

@@ -147,6 +147,9 @@ struct _starpu_sched_ctx
 	/* bool indicating if the workers is sleeping in this ctx */
 	unsigned sleeping[STARPU_NMAXWORKERS];
 
+	/* ctx nesting the current ctx */
+	unsigned nesting_sched_ctx;
+
 };
 
 struct _starpu_machine_config;

+ 26 - 2
src/core/sched_policy.c

@@ -38,6 +38,7 @@ static struct starpu_sched_policy *predefined_policies[] =
 	&_starpu_sched_eager_policy,
 	&_starpu_sched_prio_policy,
 	&_starpu_sched_random_policy,
+	&_starpu_sched_lws_policy,
 	&_starpu_sched_ws_policy,
 	&_starpu_sched_dm_policy,
 	&_starpu_sched_dmda_policy,
@@ -174,14 +175,20 @@ void _starpu_init_sched_policy(struct _starpu_machine_config *config, struct _st
 
 	load_sched_policy(selected_policy, sched_ctx);
 
+	_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
 	sched_ctx->sched_policy->init_sched(sched_ctx->id);
+	_STARPU_TRACE_WORKER_SCHEDULING_POP;
 }
 
 void _starpu_deinit_sched_policy(struct _starpu_sched_ctx *sched_ctx)
 {
 	struct starpu_sched_policy *policy = sched_ctx->sched_policy;
 	if (policy->deinit_sched)
+	{
+		_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
 		policy->deinit_sched(sched_ctx->id);
+		_STARPU_TRACE_WORKER_SCHEDULING_POP;
+	}
 }
 
 static void _starpu_push_task_on_specific_worker_notify_sched(struct starpu_task *task, struct _starpu_worker *worker, int workerid, int perf_workerid)
@@ -193,7 +200,11 @@ static void _starpu_push_task_on_specific_worker_notify_sched(struct starpu_task
         {
 		sched_ctx = _starpu_get_sched_ctx_struct(l->sched_ctx);
 		if (sched_ctx->sched_policy != NULL && sched_ctx->sched_policy->push_task_notify)
+		{
+			_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
 			sched_ctx->sched_policy->push_task_notify(task, workerid, perf_workerid, sched_ctx->id);
+			_STARPU_TRACE_WORKER_SCHEDULING_POP;
+		}
 	}
 }
 
@@ -867,22 +878,31 @@ profiling:
 
 struct starpu_task *_starpu_pop_every_task(struct _starpu_sched_ctx *sched_ctx)
 {
+	struct starpu_task *task = NULL;
 	if(sched_ctx->sched_policy)
 	{
 		STARPU_ASSERT(sched_ctx->sched_policy->pop_every_task);
 		
 		/* TODO set profiling info */
 		if(sched_ctx->sched_policy->pop_every_task)
-			return sched_ctx->sched_policy->pop_every_task(sched_ctx->id);
+		{
+			_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
+			task = sched_ctx->sched_policy->pop_every_task(sched_ctx->id);
+			_STARPU_TRACE_WORKER_SCHEDULING_POP;
+		}
 	}
-	return NULL;
+	return task;
 }
 
 void _starpu_sched_pre_exec_hook(struct starpu_task *task)
 {
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(task->sched_ctx);
 	if (sched_ctx->sched_policy && sched_ctx->sched_policy->pre_exec_hook)
+	{
+		_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
 		sched_ctx->sched_policy->pre_exec_hook(task);
+		_STARPU_TRACE_WORKER_SCHEDULING_POP;
+	}
 }
 
 void _starpu_sched_post_exec_hook(struct starpu_task *task)
@@ -890,7 +910,11 @@ void _starpu_sched_post_exec_hook(struct starpu_task *task)
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(task->sched_ctx);
 
 	if (sched_ctx->sched_policy && sched_ctx->sched_policy->post_exec_hook)
+	{
+		_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
 		sched_ctx->sched_policy->post_exec_hook(task);
+		_STARPU_TRACE_WORKER_SCHEDULING_POP;
+	}
 }
 
 void _starpu_wait_on_sched_event(void)

+ 1 - 0
src/core/sched_policy.h

@@ -58,6 +58,7 @@ void _starpu_print_idle_time();
 /*
  *	Predefined policies
  */
+extern struct starpu_sched_policy _starpu_sched_lws_policy;
 extern struct starpu_sched_policy _starpu_sched_ws_policy;
 extern struct starpu_sched_policy _starpu_sched_prio_policy;
 extern struct starpu_sched_policy _starpu_sched_random_policy;

+ 21 - 3
src/core/simgrid.c

@@ -33,6 +33,8 @@ extern int starpu_main(int argc, char *argv[]);
 extern int smpi_main(int (*realmain) (int argc, char *argv[]), int argc, char *argv[]);
 #pragma weak smpi_simulated_main_
 extern int smpi_simulated_main_(int argc, char *argv[]);
+#pragma weak starpu_mpi_world_rank
+extern int starpu_mpi_world_rank(void);
 
 #define _starpu_simgrid_running_smpi() (getenv("SMPI_GLOBAL_SIZE") != NULL)
 
@@ -48,6 +50,13 @@ int do_starpu_main(int argc STARPU_ATTRIBUTE_UNUSED, char *argv[] STARPU_ATTRIBU
 	return starpu_main(args->argc, args->argv);
 }
 
+#ifdef HAVE_MSG_ENVIRONMENT_GET_ROUTING_ROOT
+#ifdef HAVE_MSG_GET_AS_BY_NAME
+static msg_as_t _starpu_simgrid_get_as_by_name(const char *name)
+{
+	return MSG_get_as_by_name(name);
+}
+#else /* HAVE_MSG_GET_AS_BY_NAME */
 static msg_as_t __starpu_simgrid_get_as_by_name(msg_as_t root, const char *name)
 {
 	xbt_dict_t dict;
@@ -69,6 +78,8 @@ static msg_as_t _starpu_simgrid_get_as_by_name(const char *name)
 {
 	return __starpu_simgrid_get_as_by_name(MSG_environment_get_routing_root(), name);
 }
+#endif /* HAVE_MSG_GET_AS_BY_NAME */
+#endif /* HAVE_MSG_ENVIRONMENT_GET_ROUTING_ROOT */
 
 int _starpu_simgrid_get_nbhosts(const char *prefix)
 {
@@ -77,13 +88,16 @@ int _starpu_simgrid_get_nbhosts(const char *prefix)
 	unsigned i, nb;
 	unsigned len = strlen(prefix);
 
+#ifdef HAVE_MSG_ENVIRONMENT_GET_ROUTING_ROOT
 	if (_starpu_simgrid_running_smpi())
 	{
 		char name[16];
-		snprintf(name, sizeof(name), STARPU_MPI_AS_PREFIX"%u", smpi_current_rank);
+		STARPU_ASSERT(starpu_mpi_world_rank);
+		snprintf(name, sizeof(name), STARPU_MPI_AS_PREFIX"%u", starpu_mpi_world_rank());
 		hosts = MSG_environment_as_get_hosts(_starpu_simgrid_get_as_by_name(name));
 	}
 	else
+#endif /* HAVE_MSG_ENVIRONMENT_GET_ROUTING_ROOT */
 		hosts = MSG_hosts_as_dynar();
 	nb = xbt_dynar_length(hosts);
 
@@ -125,7 +139,8 @@ msg_host_t _starpu_simgrid_get_host_by_name(const char *name)
 	if (_starpu_simgrid_running_smpi())
 	{
 		char mpiname[16];
-		snprintf(mpiname, sizeof(mpiname), "%d-%s", smpi_current_rank, name);
+		STARPU_ASSERT(starpu_mpi_world_rank);
+		snprintf(mpiname, sizeof(mpiname), "%d-%s", starpu_mpi_world_rank(), name);
 		return MSG_get_host_by_name(mpiname);
 	}
 	else
@@ -178,6 +193,7 @@ void _starpu_simgrid_init()
 	xbt_dynar_t hosts;
 	int i;
 
+#ifdef HAVE_MSG_ENVIRONMENT_GET_ROUTING_ROOT
 	if (_starpu_simgrid_running_smpi())
 	{
 		/* Take back hand to create the local platform for this MPI
@@ -191,7 +207,8 @@ void _starpu_simgrid_init()
 		char template[] = "/tmp/"STARPU_MPI_AS_PREFIX"-platform-XXXXXX.xml";
 		int ret;
 
-		snprintf(asname, sizeof(asname), STARPU_MPI_AS_PREFIX"%u", smpi_current_rank);
+		STARPU_ASSERT(starpu_mpi_world_rank);
+		snprintf(asname, sizeof(asname), STARPU_MPI_AS_PREFIX"%u", starpu_mpi_world_rank());
 
 		/* Get XML platform */
 		_starpu_simgrid_get_platform_path(path, sizeof(path));
@@ -212,6 +229,7 @@ void _starpu_simgrid_init()
 		hosts = MSG_environment_as_get_hosts(_starpu_simgrid_get_as_by_name(asname));
 	}
 	else
+#endif /* HAVE_MSG_ENVIRONMENT_GET_ROUTING_ROOT */
 		hosts = MSG_hosts_as_dynar();
 
 	int nb = xbt_dynar_length(hosts);

+ 7 - 0
src/core/task.c

@@ -187,6 +187,13 @@ void starpu_task_destroy(struct starpu_task *task)
 	_starpu_task_destroy(task);
 }
 
+int starpu_task_finished(struct starpu_task *task)
+{
+	STARPU_ASSERT(task);
+	STARPU_ASSERT_MSG(!task->detach, "starpu_task_finished can only be called on tasks with detach = 0");
+	return _starpu_job_finished(_starpu_get_job_associated_to_task(task));
+}
+
 int starpu_task_wait(struct starpu_task *task)
 {
         _STARPU_LOG_IN();

+ 54 - 38
src/core/workers.c

@@ -467,6 +467,7 @@ static void _starpu_worker_init(struct _starpu_worker *workerarg, struct _starpu
 	workerarg->reverse_phase[1] = 0;
 	workerarg->pop_ctx_priority = 1;
 	workerarg->sched_mutex_locked = 0;
+	workerarg->slave = 0;
 
 	/* cpu_set/hwloc_cpu_set initialized in topology.c */
 }
@@ -516,7 +517,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 
 	/* Launch workers asynchronously */
 	unsigned cpu = 0;
-	unsigned worker;
+	unsigned worker, i;
 
 #if defined(STARPU_PERF_DEBUG) && !defined(STARPU_SIMGRID)
 	/* Get itimer of the main thread, to set it for the worker threads */
@@ -526,6 +527,16 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 #ifdef HAVE_AYUDAME_H
 	if (AYU_event) AYU_event(AYU_INIT, 0, NULL);
 #endif
+
+#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
+	for (i = 0; i < sizeof(cuda_worker_set)/sizeof(cuda_worker_set[0]); i++)
+		cuda_worker_set[i].workers = NULL;
+#endif
+#ifdef STARPU_USE_MIC
+	for (i = 0; i < sizeof(mic_worker_set)/sizeof(mic_worker_set[0]); i++)
+		mic_worker_set[i].workers = NULL;
+#endif
+
 	for (worker = 0; worker < nworkers; worker++)
 	{
 		struct _starpu_worker *workerarg = &pconfig->workers[worker];
@@ -575,44 +586,44 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
 			case STARPU_CUDA_WORKER:
 				driver.id.cuda_id = workerarg->devid;
-				if (_starpu_may_launch_driver(pconfig->conf, &driver))
-				{
-					/* We spawn only one thread per CUDA device,
-					 * which will control all CUDA workers of this
-					 * device. (by using a worker set). */
-					if (cuda_worker_set[devid].started)
-						goto worker_set_initialized;
+				workerarg->set = &cuda_worker_set[devid];
 
-					cuda_worker_set[devid].nworkers = starpu_get_env_number_default("STARPU_NWORKER_PER_CUDA", 1);
-					cuda_worker_set[devid].workers = workerarg;
-					cuda_worker_set[devid].set_is_initialized = 0;
+				/* We spawn only one thread per CUDA device,
+				 * which will control all CUDA workers of this
+				 * device. (by using a worker set). */
+				if (cuda_worker_set[devid].workers)
+					break;
 
-					STARPU_PTHREAD_CREATE_ON(
-						workerarg->name,
-						&cuda_worker_set[devid].worker_thread,
-						NULL,
-						_starpu_cuda_worker,
-						&cuda_worker_set[devid],
-						worker+1);
-#ifdef STARPU_USE_FXT
-					STARPU_PTHREAD_MUTEX_LOCK(&workerarg->mutex);
-					while (!workerarg->worker_is_running)
-						STARPU_PTHREAD_COND_WAIT(&workerarg->started_cond, &workerarg->mutex);
-					STARPU_PTHREAD_MUTEX_UNLOCK(&workerarg->mutex);
-#endif
-					STARPU_PTHREAD_MUTEX_LOCK(&cuda_worker_set[devid].mutex);
-					while (!cuda_worker_set[devid].set_is_initialized)
-						STARPU_PTHREAD_COND_WAIT(&cuda_worker_set[devid].ready_cond,
-									 &cuda_worker_set[devid].mutex);
-					STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_worker_set[devid].mutex);
-					cuda_worker_set[devid].started = 1;
-		worker_set_initialized:
-					workerarg->set = &cuda_worker_set[devid];
-				}
-				else
+				cuda_worker_set[devid].nworkers = starpu_get_env_number_default("STARPU_NWORKER_PER_CUDA", 1);
+				cuda_worker_set[devid].workers = workerarg;
+				cuda_worker_set[devid].set_is_initialized = 0;
+
+				if (!_starpu_may_launch_driver(pconfig->conf, &driver))
 				{
 					workerarg->run_by_starpu = 0;
+					break;
 				}
+
+				STARPU_PTHREAD_CREATE_ON(
+					workerarg->name,
+					&cuda_worker_set[devid].worker_thread,
+					NULL,
+					_starpu_cuda_worker,
+					&cuda_worker_set[devid],
+					worker+1);
+#ifdef STARPU_USE_FXT
+				STARPU_PTHREAD_MUTEX_LOCK(&workerarg->mutex);
+				while (!workerarg->worker_is_running)
+					STARPU_PTHREAD_COND_WAIT(&workerarg->started_cond, &workerarg->mutex);
+				STARPU_PTHREAD_MUTEX_UNLOCK(&workerarg->mutex);
+#endif
+				STARPU_PTHREAD_MUTEX_LOCK(&cuda_worker_set[devid].mutex);
+				while (!cuda_worker_set[devid].set_is_initialized)
+					STARPU_PTHREAD_COND_WAIT(&cuda_worker_set[devid].ready_cond,
+								 &cuda_worker_set[devid].mutex);
+				STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_worker_set[devid].mutex);
+				cuda_worker_set[devid].started = 1;
+
 				break;
 #endif
 #if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
@@ -642,11 +653,13 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 #endif
 #ifdef STARPU_USE_MIC
 			case STARPU_MIC_WORKER:
+				workerarg->set = &mic_worker_set[devid];
+
 				/* We spawn only one thread
 				 * per MIC device, which will control all MIC
 				 * workers of this device. (by using a worker set). */
-				if (mic_worker_set[devid].started)
-					goto worker_set_initialized;
+				if (mic_worker_set[devid].workers)
+					break;
 
 				mic_worker_set[devid].nworkers = pconfig->topology.nmiccores[devid];
 
@@ -678,8 +691,6 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 				STARPU_PTHREAD_MUTEX_UNLOCK(&mic_worker_set[devid].mutex);
 
 				mic_worker_set[devid].started = 1;
-		worker_set_initialized:
-				workerarg->set = &mic_worker_set[devid];
 
 				break;
 #endif /* STARPU_USE_MIC */
@@ -1374,6 +1385,11 @@ unsigned starpu_worker_get_count(void)
 	return config.topology.nworkers;
 }
 
+unsigned starpu_worker_is_slave(int workerid)
+{
+	return config.workers[workerid].slave;
+}
+
 int starpu_worker_get_count_by_type(enum starpu_worker_archtype type)
 {
 	switch (type)

+ 3 - 0
src/core/workers.h

@@ -112,6 +112,9 @@ LIST_TYPE(_starpu_worker,
 	/* flag to know if sched_mutex is locked or not */
 	unsigned sched_mutex_locked;
 
+	/* bool to indicate if the worker is slave in a ctx */
+	unsigned slave;
+
 #ifdef __GLIBC__
 	cpu_set_t cpu_set;
 #endif /* __GLIBC__ */

+ 15 - 8
src/datawizard/coherency.c

@@ -151,7 +151,7 @@ void _starpu_update_data_state(starpu_data_handle_t handle,
 
 	/* the data is present now */
 	unsigned requesting_node = requesting_replicate->memory_node;
-	requesting_replicate->requested[requesting_node] = 0;
+	requesting_replicate->requested &= ~(1UL << requesting_node);
 
 	if (mode & STARPU_W)
 	{
@@ -656,18 +656,25 @@ void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_
 		_starpu_spin_unlock(&handle->header_lock);
 }
 
-static void _starpu_set_data_requested_flag_if_needed(struct _starpu_data_replicate *replicate)
+static void _starpu_set_data_requested_flag_if_needed(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate)
 {
-// XXX : this is just a hint, so we don't take the lock ...
-//	_starpu_spin_lock(&handle->header_lock);
+	unsigned local_node = _starpu_memory_node_get_local_key();
+	int cpt = 0;
+	while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
+	{
+		cpt++;
+		_starpu_datawizard_progress(local_node, 1);
+	}
+	if (cpt == STARPU_SPIN_MAXTRY)
+		_starpu_spin_lock(&handle->header_lock);
 
 	if (replicate->state == STARPU_INVALID)
 	{
 		unsigned dst_node = replicate->memory_node;
-		replicate->requested[dst_node] = 1;
+		replicate->requested |= 1UL << dst_node;
 	}
 
-//	_starpu_spin_unlock(&handle->header_lock);
+	_starpu_spin_unlock(&handle->header_lock);
 }
 
 int starpu_prefetch_task_input_on_node(struct starpu_task *task, unsigned node)
@@ -686,7 +693,7 @@ int starpu_prefetch_task_input_on_node(struct starpu_task *task, unsigned node)
 		struct _starpu_data_replicate *replicate = &handle->per_node[node];
 		prefetch_data_on_node(handle, replicate, mode);
 
-		_starpu_set_data_requested_flag_if_needed(replicate);
+		_starpu_set_data_requested_flag_if_needed(handle, replicate);
 	}
 
 	return 0;
@@ -880,7 +887,7 @@ unsigned _starpu_is_data_present_or_requested(starpu_data_handle_t handle, unsig
 
 		for (i = 0; i < nnodes; i++)
 		{
-			if (handle->per_node[node].requested[i] || handle->per_node[node].request[i])
+			if ((handle->per_node[node].requested & (1UL << i)) || handle->per_node[node].request[i])
 				ret = 1;
 		}
 

+ 11 - 11
src/datawizard/coherency.h

@@ -48,26 +48,26 @@ LIST_TYPE(_starpu_data_replicate,
 
 	unsigned memory_node;
 
-	/* A buffer that is used for SCRATCH or reduction cannnot be used with
-	 * filters. */
-	unsigned relaxed_coherency;
-
-	/* We may need to initialize the replicate with some value before using it. */
-	unsigned initialized;
-
 	/* describes the state of the local data in term of coherency */
 	enum _starpu_cache_state	state;
 
 	int refcnt;
 
+	/* A buffer that is used for SCRATCH or reduction cannot be used with
+	 * filters. */
+	unsigned relaxed_coherency:2;
+
+	/* We may need to initialize the replicate with some value before using it. */
+	unsigned initialized:1;
+
 	/* is the data locally allocated ? */
-	uint8_t allocated;
+	unsigned allocated:1;
 	/* was it automatically allocated ? (else it's the application-provided
 	 * buffer, don't ever try to free it!) */
 	/* perhaps the allocation was perform higher in the hiearchy
 	 * for now this is just translated into !automatically_allocated
 	 * */
-	uint8_t automatically_allocated;
+	unsigned automatically_allocated:1;
 
         /* Pointer to memchunk for LRU strategy */
 	struct _starpu_mem_chunk * mc;
@@ -79,7 +79,7 @@ LIST_TYPE(_starpu_data_replicate,
 	   flag when it assigns a task to a queue, policies which do not
 	   use this hint can simply ignore it.
 	 */
-	uint8_t requested[STARPU_MAXNODES];
+	uint32_t requested;
 	struct _starpu_data_request *request[STARPU_MAXNODES];
 )
 
@@ -207,7 +207,7 @@ struct _starpu_data_state
 	 * the end of the reduction. */
 	struct _starpu_data_requester_list *reduction_req_list;
 
-	starpu_data_handle_t reduction_tmp_handles[STARPU_NMAXWORKERS];
+	starpu_data_handle_t *reduction_tmp_handles;
 
 	unsigned lazy_unregister;
 

+ 2 - 1
src/datawizard/filters.c

@@ -176,6 +176,7 @@ void starpu_data_partition(starpu_data_handle_t initial_handle, struct starpu_da
 		/* initialize the chunk lock */
 		child->req_list = _starpu_data_requester_list_new();
 		child->reduction_req_list = _starpu_data_requester_list_new();
+		child->reduction_tmp_handles = NULL;
 		child->refcnt = 0;
 		child->busy_count = 0;
 		child->busy_waiting = 0;
@@ -240,10 +241,10 @@ void starpu_data_partition(starpu_data_handle_t initial_handle, struct starpu_da
 			child_replicate->automatically_allocated = 0;
 			child_replicate->refcnt = 0;
 			child_replicate->memory_node = starpu_worker_get_memory_node(worker);
+			child_replicate->requested = 0;
 
 			for (node = 0; node < STARPU_MAXNODES; node++)
 			{
-				child_replicate->requested[node] = 0;
 				child_replicate->request[node] = NULL;
 			}
 

+ 2 - 1
src/datawizard/interfaces/data_interface.c

@@ -291,6 +291,7 @@ static void _starpu_register_new_data(starpu_data_handle_t handle,
 
 	handle->reduction_refcnt = 0;
 	handle->reduction_req_list = _starpu_data_requester_list_new();
+	handle->reduction_tmp_handles = NULL;
 
 #ifdef STARPU_USE_FXT
 	handle->last_submitted_ghost_sync_id_is_valid = 0;
@@ -346,10 +347,10 @@ static void _starpu_register_new_data(starpu_data_handle_t handle,
 		replicate->state = STARPU_INVALID;
 		replicate->refcnt = 0;
 		replicate->handle = handle;
+		replicate->requested = 0;
 
 		for (node = 0; node < STARPU_MAXNODES; node++)
 		{
-			replicate->requested[node] = 0;
 			replicate->request[node] = NULL;
 		}
 

+ 5 - 1
src/datawizard/reduction.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -156,6 +156,8 @@ void _starpu_data_end_reduction_mode(starpu_data_handle_t handle)
 
 	/* Register all valid per-worker replicates */
 	unsigned nworkers = starpu_worker_get_count();
+	STARPU_ASSERT(!handle->reduction_tmp_handles);
+	handle->reduction_tmp_handles = malloc(nworkers * sizeof(handle->reduction_tmp_handles[0]));
 	for (worker = 0; worker < nworkers; worker++)
 	{
 		if (handle->per_worker[worker].initialized)
@@ -390,4 +392,6 @@ void _starpu_data_end_reduction_mode_terminate(starpu_data_handle_t handle)
 			/* TODO put in cache */
 		}
 	}
+	free(handle->reduction_tmp_handles);
+	handle->reduction_tmp_handles = NULL;
 }

+ 2 - 4
src/datawizard/user_interactions.c

@@ -519,9 +519,7 @@ void starpu_data_set_default_sequential_consistency_flag(unsigned flag)
 /* Query the status of the handle on the specified memory node. */
 void starpu_data_query_status(starpu_data_handle_t handle, int memory_node, int *is_allocated, int *is_valid, int *is_requested)
 {
-#ifdef STARPU_DEVEL
-#warning FIXME
-#endif
+// XXX : this is just a hint, so we don't take the lock ...
 //	_starpu_spin_lock(&handle->header_lock);
 
 	if (is_allocated)
@@ -537,7 +535,7 @@ void starpu_data_query_status(starpu_data_handle_t handle, int memory_node, int
 		unsigned node;
 		for (node = 0; node < STARPU_MAXNODES; node++)
 		{
-			if (handle->per_node[memory_node].requested[node])
+			if (handle->per_node[memory_node].requested & (1UL << node))
 			{
 				requested = 1;
 				break;

+ 109 - 7
src/debug/traces/starpu_fxt.c

@@ -275,6 +275,18 @@ static void worker_set_state(double time, const char *prefix, long unsigned int
 #endif
 }
 
+static void worker_set_detailed_state(double time, const char *prefix, long unsigned int workerid, const char *name, unsigned long size, unsigned long footprint, unsigned long long tag)
+{
+#ifdef STARPU_HAVE_POTI
+	char container[STARPU_POTI_STR_LEN];
+	thread_container_alias(container, STARPU_POTI_STR_LEN, prefix, workerid);
+	/* TODO: set detailed state */
+	poti_SetState(time, container, "S", name);
+#else
+	fprintf(out_paje_file, "20	%.9f	%st%lu	S	%s	%lu	%08lx	%016llx\n", time, prefix, workerid, name, size, footprint, tag);
+#endif
+}
+
 static void worker_push_state(double time, const char *prefix, long unsigned int workerid, const char *name)
 {
 #ifdef STARPU_HAVE_POTI
@@ -631,11 +643,8 @@ static void handle_start_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_op
 	int worker;
 	worker = find_worker_id(ev->param[2]);
 
-	unsigned sched_ctx = ev->param[1];
 	if (worker < 0) return;
 
-	char *prefix = options->file_prefix;
-
 	unsigned long has_name = ev->param[3];
 	char *name = has_name?(char *)&ev->param[4]:"unknown";
 
@@ -646,8 +655,12 @@ static void handle_start_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_op
 
 	create_paje_state_if_not_found(name, options);
 
+#ifndef STARPU_ENABLE_PAJE_CODELET_DETAILS
 	if (out_paje_file)
 	{
+		char *prefix = options->file_prefix;
+		unsigned sched_ctx = ev->param[1];
+
 		worker_set_state(start_codelet_time, prefix, ev->param[2], name);
 		if (sched_ctx != 0)
 		{
@@ -662,9 +675,40 @@ static void handle_start_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_op
 #endif
 		}
 	}
+#endif /* STARPU_ENABLE_PAJE_CODELET_DETAILS */
 
 }
 
+static void handle_codelet_details(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
+{
+#ifdef STARPU_ENABLE_PAJE_CODELET_DETAILS
+	int worker;
+	worker = find_worker_id(ev->param[5]);
+
+	unsigned sched_ctx = ev->param[1];
+	if (worker < 0) return;
+
+	char *prefix = options->file_prefix;
+
+	if (out_paje_file)
+	{
+		worker_set_detailed_state(last_codelet_start[worker], prefix, ev->param[5], last_codelet_symbol[worker], ev->param[2], ev->param[3], ev->param[4]);
+		if (sched_ctx != 0)
+		{
+#ifdef STARPU_HAVE_POTI
+			char container[STARPU_POTI_STR_LEN];
+			char ctx[6];
+			snprintf(ctx, sizeof(ctx), "Ctx%d", sched_ctx);
+			thread_container_alias(container, STARPU_POTI_STR_LEN, prefix, ev->param[5]);
+			poti_SetState(last_codelet_start[worker], container, ctx, last_codelet_symbol[worker]);
+#else
+			fprintf(out_paje_file, "20	%.9f	%st%"PRIu64"	Ctx%d	%s	%08lx	%lu	%016llx\n", last_codelet_start[worker], prefix, ev->param[2], sched_ctx, last_codelet_symbol[worker], (unsigned long) ev->param[2], (unsigned long) ev->param[3], (unsigned long long) ev->param[4]);
+#endif
+		}
+	}
+#endif /* STARPU_ENABLE_PAJE_CODELET_DETAILS */
+}
+
 static long dumped_codelets_count;
 static struct starpu_fxt_codelet_event *dumped_codelets;
 
@@ -727,7 +771,7 @@ static void handle_user_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *o
 #ifdef STARPU_HAVE_POTI
 			program_container_alias (container, STARPU_POTI_STR_LEN, prefix);
 #else
-			fprintf(out_paje_file, "9	%.9f	event	%sp	%lu\n", get_event_time_stamp(ev, options), prefix, code);
+			fprintf(out_paje_file, "9	%.9f	user_event	%sp	%lu\n", get_event_time_stamp(ev, options), prefix, code);
 #endif
 	}
 	else
@@ -736,12 +780,12 @@ static void handle_user_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *o
 #ifdef STARPU_HAVE_POTI
 			thread_container_alias (container, STARPU_POTI_STR_LEN, prefix, ev->param[1]);
 #else
-			fprintf(out_paje_file, "9	%.9f	event	%st%"PRIu64"	%lu\n", get_event_time_stamp(ev, options), prefix, ev->param[1], code);
+			fprintf(out_paje_file, "9	%.9f	user_event	%st%"PRIu64"	%lu\n", get_event_time_stamp(ev, options), prefix, ev->param[1], code);
 #endif
 	}
 #ifdef STARPU_HAVE_POTI
 	if (out_paje_file)
-		poti_NewEvent(get_event_time_stamp(ev, options), container, "thread_event", paje_value);
+		poti_NewEvent(get_event_time_stamp(ev, options), container, "user_event", paje_value);
 #endif
 }
 
@@ -916,6 +960,40 @@ static void handle_start_driver_copy(struct fxt_ev_64 *ev, struct starpu_fxt_opt
 
 }
 
+
+static void handle_work_stealing(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
+{
+	unsigned dst = ev->param[0];
+	unsigned src = ev->param[1];
+	unsigned size = 0;
+	unsigned comid = 0;
+	
+	char *prefix = options->file_prefix;
+
+	
+	if (out_paje_file)
+	{
+		double time = get_event_time_stamp(ev, options);
+#ifdef STARPU_HAVE_POTI
+		char paje_value[STARPU_POTI_STR_LEN], paje_key[STARPU_POTI_STR_LEN], src_worker_container[STARPU_POTI_STR_LEN], dst_worker_container[STARPU_POTI_STR_LEN];
+		char program_container[STARPU_POTI_STR_LEN];
+		snprintf(paje_value, STARPU_POTI_STR_LEN, "%u", size);
+		snprintf(paje_key, STARPU_POTI_STR_LEN, "steal_%u", comid);
+		program_container_alias(program_container, STARPU_POTI_STR_LEN, prefix);
+		worker_container_alias(src_worker_container, STARPU_POTI_STR_LEN, prefix, src);
+		worker_container_alias(dst_worker_container, STARPU_POTI_STR_LEN, prefix, dst);
+		poti_StartLink(time, program_container, "L", src_worker_container, paje_value, paje_key);
+		poti_EndLink(time+0.000000001, program_container, "L", dst_worker_container, paje_value, paje_key);
+#else
+
+		fprintf(out_paje_file, "18	%.9f	L	%sp	%u	%sw%d	steal_%u\n", time, prefix, size, prefix, src, comid);
+		fprintf(out_paje_file, "19	%.9f	L	%sp	%u	%sw%d	steal_%u\n", time+0.000000001, prefix, size, prefix, dst, comid);
+#endif
+	}
+
+}
+
+
 static void handle_end_driver_copy(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
 {
 	unsigned dst = ev->param[1];
@@ -1380,6 +1458,23 @@ static void handle_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *option
 	}
 }
 
+static void handle_thread_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
+{
+	/* Add an event in the trace */
+	if (out_paje_file)
+	{
+		char *event = (char*)&ev->param[1];
+
+#ifdef STARPU_HAVE_POTI
+		char container[STARPU_POTI_STR_LEN];
+		thread_container_alias(container, STARPU_POTI_STR_LEN, options->file_prefix, ev->param[0]);
+		poti_NewEvent(get_event_time_stamp(ev, options), container, "thread_event", event);
+#else
+		fprintf(out_paje_file, "9	%.9f	thread_event	%st%"PRIu64"	%s\n", get_event_time_stamp(ev, options), options->file_prefix, ev->param[0], event);
+#endif
+	}
+}
+
 static
 void _starpu_fxt_display_bandwidth(struct starpu_fxt_options *options)
 {
@@ -1507,6 +1602,9 @@ void starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *opt
 			case _STARPU_FUT_START_CODELET_BODY:
 				handle_start_codelet_body(&ev, options);
 				break;
+			case _STARPU_FUT_CODELET_DETAILS:
+				handle_codelet_details(&ev, options);
+				break;
 			case _STARPU_FUT_END_CODELET_BODY:
 				handle_end_codelet_body(&ev, options);
 				break;
@@ -1624,7 +1722,7 @@ void starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *opt
 				break;
 
 			case _STARPU_FUT_WORK_STEALING:
-				/* XXX */
+				handle_work_stealing(&ev, options);
 				break;
 
 			case _STARPU_FUT_WORKER_DEINIT_START:
@@ -1797,6 +1895,10 @@ void starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *opt
 				handle_event(&ev, options);
 				break;
 
+			case _STARPU_FUT_THREAD_EVENT:
+				handle_thread_event(&ev, options);
+				break;
+
 			case _STARPU_FUT_LOCKING_MUTEX:
 				break;
 

+ 13 - 0
src/debug/traces/starpu_paje.c

@@ -130,6 +130,17 @@ void _starpu_fxt_write_paje_header(FILE *file)
 	fprintf(file, "%%	DestContainer	string\n");
 	fprintf(file, "%%	Key	string\n");
 	fprintf(file, "%%EndEventDef\n");
+#ifdef STARPU_ENABLE_PAJE_CODELET_DETAILS
+	fprintf(file, "%%EventDef PajeSetState 20\n");
+	fprintf(file, "%%	Time	date\n");
+	fprintf(file, "%%	Container	string\n");
+	fprintf(file, "%%	Type	string\n");
+	fprintf(file, "%%	Value	string\n");
+	fprintf(file, "%%	Size	string\n");
+	fprintf(file, "%%	Footprint	string\n");
+	fprintf(file, "%%	Tag	string\n");
+	fprintf(file, "%%EndEventDef\n");
+#endif
 #endif
 
 #ifdef STARPU_HAVE_POTI
@@ -156,6 +167,7 @@ void _starpu_fxt_write_paje_header(FILE *file)
 	poti_DefineEntityValue("No", "MS", "Nothing", ".0 .0 .0");
 
 	/* Types for the Worker of the Memory Node */
+	poti_DefineEventType("user_event", "T", "user event type");
 	poti_DefineEventType("thread_event", "T", "thread event type");
 	poti_DefineStateType("S", "T", "Thread State");
 	poti_DefineEntityValue("I", "S", "Initializing", "0.0 .7 1.0");
@@ -220,6 +232,7 @@ void _starpu_fxt_write_paje_header(FILE *file)
 1       MPICt   T       \"MPI Communication Thread\"              \n\
 1       Sc       P       \"Scheduler State\"                        \n\
 2       prog_event   P       \"program event type\"				\n\
+2       user_event   T       \"user event type\"				\n\
 2       thread_event   T       \"thread event type\"				\n\
 2       MPIev   MPICt    \"MPI event type\"			\n\
 3       S       T       \"Thread State\"                        \n\

+ 1 - 1
src/drivers/cpu/driver_cpu.c

@@ -89,7 +89,7 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_
 	}
 
 	/* Give profiling variable */
-	_starpu_driver_start_job(cpu_args, j, &codelet_start, rank, profiling);
+	_starpu_driver_start_job(cpu_args, j, perf_arch, &codelet_start, rank, profiling);
 
 	/* In case this is a Fork-join parallel task, the worker does not
 	 * execute the kernel at all. */

+ 7 - 3
src/drivers/cuda/driver_cuda.c

@@ -396,7 +396,7 @@ static int start_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *args)
 		return -EAGAIN;
 	}
 
-	_starpu_driver_start_job(args, j, &j->cl_start, 0, profiling);
+	_starpu_driver_start_job(args, j, &args->perf_arch, &j->cl_start, 0, profiling);
 
 #if defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
 	/* We make sure we do manipulate the proper device */
@@ -517,7 +517,10 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 	unsigned memnode = worker0->memory_node;
 	struct starpu_task *tasks[worker_set->nworkers], *task;
 	struct _starpu_job *j;
-	int i, res, idle;
+	int i, res;
+
+#ifndef STARPU_SIMGRID
+	int idle;
 
 	/* First poll for completed jobs */
 	idle = 0;
@@ -540,13 +543,13 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 		if (cures != cudaSuccess)
 		{
 			STARPU_ASSERT(cures == cudaErrorNotReady);
-			idle++;
 		}
 		else
 		{
 			/* Asynchronous task completed! */
 			_starpu_set_local_worker_key(args);
 			finish_job_on_cuda(_starpu_get_job_associated_to_task(task), args);
+			idle++;
 		}
 	}
 
@@ -556,6 +559,7 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 		__starpu_datawizard_progress(memnode, 1, 0);
 		return 0;
 	}
+#endif /* STARPU_SIMGRID */
 
 	/* Something done, make some progress */
 	__starpu_datawizard_progress(memnode, 1, 1);

+ 4 - 2
src/drivers/driver_common/driver_common.c

@@ -34,7 +34,7 @@
 #define BACKOFF_MAX 32  /* TODO : use parameter to define them */
 #define BACKOFF_MIN 1
 
-void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j, struct timespec *codelet_start, int rank, int profiling)
+void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch, struct timespec *codelet_start, int rank, int profiling)
 {
 	struct starpu_task *task = j->task;
 	struct starpu_codelet *cl = task->cl;
@@ -74,7 +74,7 @@ void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j
 	if (starpu_top)
 		_starpu_top_task_started(task,workerid,codelet_start);
 
-	_STARPU_TRACE_START_CODELET_BODY(j);
+	_STARPU_TRACE_START_CODELET_BODY(j, j->nimpl, perf_arch);
 }
 
 void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch STARPU_ATTRIBUTE_UNUSED, struct timespec *codelet_end, int rank, int profiling)
@@ -398,6 +398,7 @@ int _starpu_get_multi_worker_task(struct _starpu_worker *workers, struct starpu_
 		/*else try to pop a task*/
 		else
 		{
+			_starpu_worker_set_status_scheduling(workers[i].workerid);
 			STARPU_PTHREAD_MUTEX_LOCK(&workers[i].sched_mutex);
 			_starpu_set_local_worker_key(&workers[i]);
 			tasks[i] = _starpu_pop_task(&workers[i]);
@@ -427,6 +428,7 @@ int _starpu_get_multi_worker_task(struct _starpu_worker *workers, struct starpu_
 					workers[i].current_rank = 0;
 				}
 
+				_starpu_worker_set_status_scheduling_done(workers[i].workerid);
 				_starpu_worker_set_status_wakeup(workers[i].workerid);
 			}
 			else

+ 2 - 2
src/drivers/driver_common/driver_common.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2012, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -23,7 +23,7 @@
 #include <core/jobs.h>
 #include <common/utils.h>
 
-void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j,
+void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch,
 			      struct timespec *codelet_start, int rank, int profiling);
 void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch,
 			    struct timespec *codelet_end, int rank, int profiling);

+ 1 - 1
src/drivers/mp_common/source_common.c

@@ -421,7 +421,7 @@ static int _starpu_src_common_execute(struct _starpu_job *j,
 
 	void (*kernel)(void)  = node->get_kernel_from_job(node,j);
 
-	_starpu_driver_start_job(worker, j, &j->cl_start, 0, profiling);
+	_starpu_driver_start_job(worker, j, &worker->perf_arch, &j->cl_start, 0, profiling);
 
 
 	//_STARPU_DEBUG("\nworkerid:%d, rank:%d, type:%d,	cb_workerid:%d, task_size:%d\n\n",worker->devid,worker->current_rank,task->cl->type,j->combined_workerid,j->task_size);

+ 4 - 2
src/drivers/opencl/driver_opencl.c

@@ -619,6 +619,7 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *args)
 	struct starpu_task *task;
 	int res;
 
+#ifndef STARPU_SIMGRID
 	task = starpu_task_get_current();
 
 	if (task)
@@ -642,6 +643,7 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *args)
 		/* Asynchronous task completed! */
 		_starpu_opencl_stop_job(_starpu_get_job_associated_to_task(task), args);
 	}
+#endif /* STARPU_SIMGRID */
 
 	__starpu_datawizard_progress(memnode, 1, 1);
 
@@ -700,7 +702,7 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *args)
 	else
 #else
 #ifdef STARPU_DEVEL
-#warning No CUDA asynchronous execution with simgrid yet.
+#warning No OpenCL asynchronous execution with simgrid yet.
 #endif
 #endif
 	/* Synchronous execution */
@@ -823,7 +825,7 @@ static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker
 		return -EAGAIN;
 	}
 
-	_starpu_driver_start_job(args, j, &j->cl_start, 0, profiling);
+	_starpu_driver_start_job(args, j, &args->perf_arch, &j->cl_start, 0, profiling);
 
 	starpu_opencl_func_t func = _starpu_task_get_opencl_nth_implementation(cl, j->nimpl);
 	STARPU_ASSERT_MSG(func, "when STARPU_OPENCL is defined in 'where', opencl_func or opencl_funcs has to be defined");

+ 17 - 10
src/sched_policies/deque_modeling_policy_data_aware.c

@@ -286,6 +286,13 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 	struct _starpu_dmda_data *dt = (struct _starpu_dmda_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
 	/* make sure someone coule execute that task ! */
 	STARPU_ASSERT(best_workerid != -1);
+	unsigned child_sched_ctx = starpu_sched_ctx_worker_is_master_for_child_ctx(best_workerid, sched_ctx_id);
+        if(child_sched_ctx != STARPU_NMAX_SCHED_CTXS)
+        {
+		starpu_sched_ctx_revert_task_counters(sched_ctx_id, task->flops);
+                starpu_sched_ctx_move_task_to_ctx(task, child_sched_ctx);
+                return 0;
+        }
 
 	struct _starpu_fifo_taskq *fifo = dt->queue_array[best_workerid];
 
@@ -405,9 +412,9 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio, unsigned sched
 	if(workers->init_iterator)
 		workers->init_iterator(workers, &it);
 
-	while(workers->has_next(workers, &it))
+	while(workers->has_next_master(workers, &it))
 	{
-		worker = workers->get_next(workers, &it);
+		worker = workers->get_next_master(workers, &it);
 		struct _starpu_fifo_taskq *fifo  = dt->queue_array[worker];
 		unsigned memory_node = starpu_worker_get_memory_node(worker);
 		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker);
@@ -543,9 +550,9 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 	if(workers->init_iterator)
 		workers->init_iterator(workers, &it);
 
-	while(workers->has_next(workers, &it))
+	while(workers->has_next_master(workers, &it))
 	{
-		worker = workers->get_next(workers, &it);
+		worker = workers->get_next_master(workers, &it);
 
 		struct _starpu_fifo_taskq *fifo = dt->queue_array[worker];
 		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker);
@@ -692,10 +699,6 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 
 	double fitness[nworkers_ctx][STARPU_MAXIMPLEMENTATIONS];
 
-	struct starpu_sched_ctx_iterator it;
-	if(workers->init_iterator)
-		workers->init_iterator(workers, &it);
-
 	compute_all_performance_predictions(task,
 					    nworkers_ctx,
 					    local_task_length,
@@ -712,9 +715,13 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 	unsigned nimpl;
 	if (forced_best == -1)
 	{
-		while(workers->has_next(workers, &it))
+		struct starpu_sched_ctx_iterator it;
+		if(workers->init_iterator)
+			workers->init_iterator(workers, &it);
+
+		while(workers->has_next_master(workers, &it))
 		{
-			worker = workers->get_next(workers, &it);
+			worker = workers->get_next_master(workers, &it);
 			for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
 			{
 				if (!starpu_worker_can_execute_task(worker, task, nimpl))

+ 13 - 2
src/sched_policies/eager_central_policy.c

@@ -94,9 +94,9 @@ static int push_task_eager_policy(struct starpu_task *task)
 	if(workers->init_iterator)
 		workers->init_iterator(workers, &it);
 	
-	while(workers->has_next(workers, &it))
+	while(workers->has_next_master(workers, &it))
 	{
-		worker = workers->get_next(workers, &it);
+		worker = workers->get_next_master(workers, &it);
 
 #ifdef STARPU_NON_BLOCKING_DRIVERS
 		if (!starpu_bitmap_get(data->waiters, worker))
@@ -167,6 +167,17 @@ static struct starpu_task *pop_task_eager_policy(unsigned sched_ctx_id)
 
 	STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex);
 
+	if(task)
+	{
+		unsigned child_sched_ctx = starpu_sched_ctx_worker_is_master_for_child_ctx(workerid, sched_ctx_id);
+		if(child_sched_ctx != STARPU_NMAX_SCHED_CTXS)
+		{
+			starpu_sched_ctx_revert_task_counters(sched_ctx_id, task->flops);
+			starpu_sched_ctx_move_task_to_ctx(task, child_sched_ctx);
+			return NULL;
+		}
+	}
+
 	return task;
 }
 

+ 373 - 0
src/sched_policies/locality_work_stealing_policy.c

@@ -0,0 +1,373 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012  INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/* Work stealing policy */
+
+#include <float.h>
+
+#include <core/workers.h>
+#include <sched_policies/fifo_queues.h>
+#include <core/debug.h>
+#include <starpu_bitmap.h>
+
+struct _starpu_lws_data
+{
+	struct _starpu_fifo_taskq **queue_array;
+	int **proxlist;
+	unsigned last_pop_worker;
+	unsigned last_push_worker;
+};
+
+
+#ifdef STARPU_HAVE_HWLOC
+
+/* Return a worker to steal a task from. The worker is selected
+ * according to the proximity list built using the info on the
+ * architecture provided by hwloc */
+static unsigned select_victim_neighborhood(unsigned sched_ctx_id, int workerid)
+{
+
+	struct _starpu_lws_data *ws = (struct _starpu_lws_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
+
+	int nworkers = starpu_sched_ctx_get_nworkers(sched_ctx_id);
+
+	int i;
+	int neighbor;
+	for(i=0; i<nworkers; i++){
+		neighbor = ws->proxlist[workerid][i];
+		int ntasks = ws->queue_array[neighbor]->ntasks;
+		
+		if (ntasks)
+			return neighbor;
+	}
+
+	return workerid;
+}
+#else
+/* Return a worker to steal a task from. The worker is selected
+ * in a round-robin fashion */
+static unsigned select_victim_round_robin(unsigned sched_ctx_id)
+{
+	struct _starpu_lws_data *ws = (struct _starpu_lws_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
+	unsigned worker = ws->last_pop_worker;
+	unsigned nworkers = starpu_sched_ctx_get_nworkers(sched_ctx_id);
+
+	starpu_pthread_mutex_t *victim_sched_mutex;
+	starpu_pthread_cond_t *victim_sched_cond;
+
+	/* If the worker's queue is empty, let's try
+	 * the next ones */
+	while (1)
+	{
+		unsigned ntasks;
+
+		starpu_worker_get_sched_condition(worker, &victim_sched_mutex, &victim_sched_cond);
+		ntasks = ws->queue_array[worker]->ntasks;
+		if (ntasks)
+			break;
+
+		worker = (worker + 1) % nworkers;
+		if (worker == ws->last_pop_worker)
+		{
+			/* We got back to the first worker,
+			 * don't go in infinite loop */
+			break;
+		}
+	}
+
+	ws->last_pop_worker = (worker + 1) % nworkers;
+
+	return worker;
+}
+
+
+#endif
+
+
+/**
+ * Return a worker to whom a task can be assigned.
+ * Selecting a worker is done in a round-robin fashion.
+ */
+static unsigned select_worker_round_robin(unsigned sched_ctx_id)
+{
+	struct _starpu_lws_data *ws = (struct _starpu_lws_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
+	unsigned worker = ws->last_push_worker;
+	unsigned nworkers = starpu_sched_ctx_get_nworkers(sched_ctx_id);
+	/* TODO: use an atomic update operation for this */
+	ws->last_push_worker = (ws->last_push_worker + 1) % nworkers;
+
+	return worker;
+}
+
+
+/**
+ * Return a worker from which a task can be stolen.
+ */
+static inline unsigned select_victim(unsigned sched_ctx_id, int workerid)
+{
+
+#ifdef STARPU_HAVE_HWLOC
+	return select_victim_neighborhood(sched_ctx_id, workerid);
+#else
+	return select_victim_round_robin(sched_ctx_id);
+#endif
+}
+
+/**
+ * Return a worker on whose queue a task can be pushed. This is only
+ * needed when the push is done by the master
+ */
+static inline unsigned select_worker(unsigned sched_ctx_id)
+{
+	return select_worker_round_robin(sched_ctx_id);
+}
+
+
+static struct starpu_task *lws_pop_task(unsigned sched_ctx_id)
+{
+	struct _starpu_lws_data *ws = (struct _starpu_lws_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
+
+	struct starpu_task *task = NULL;
+
+	int workerid = starpu_worker_get_id();
+
+	STARPU_ASSERT(workerid != -1);
+
+	task = _starpu_fifo_pop_task(ws->queue_array[workerid], workerid);
+	if (task)
+	{
+		/* there was a local task */
+		/* printf("Own    task!%d\n",workerid); */
+		return task;
+	}
+	starpu_pthread_mutex_t *worker_sched_mutex;
+	starpu_pthread_cond_t *worker_sched_cond;
+	starpu_worker_get_sched_condition(workerid, &worker_sched_mutex, &worker_sched_cond);
+
+	/* Note: Releasing this mutex before taking the victim mutex, to avoid deadlock */
+	STARPU_PTHREAD_MUTEX_UNLOCK(worker_sched_mutex);
+       
+
+	/* we need to steal someone's job */
+	unsigned victim = select_victim(sched_ctx_id, workerid);
+
+	starpu_pthread_mutex_t *victim_sched_mutex;
+	starpu_pthread_cond_t *victim_sched_cond;
+
+	starpu_worker_get_sched_condition(victim, &victim_sched_mutex, &victim_sched_cond);
+	STARPU_PTHREAD_MUTEX_LOCK(victim_sched_mutex);
+
+	task = _starpu_fifo_pop_task(ws->queue_array[victim], workerid);
+	if (task)
+	{
+		_STARPU_TRACE_WORK_STEALING(workerid, victim);
+	}
+
+	STARPU_PTHREAD_MUTEX_UNLOCK(victim_sched_mutex);
+
+	STARPU_PTHREAD_MUTEX_LOCK(worker_sched_mutex);
+	if(!task)
+	{
+		task = _starpu_fifo_pop_task(ws->queue_array[workerid], workerid);
+		if (task)
+		{
+			/* there was a local task */
+			return task;
+		}
+	}
+
+	return task;
+}
+
+static int lws_push_task(struct starpu_task *task)
+{
+	unsigned sched_ctx_id = task->sched_ctx;
+	struct _starpu_lws_data *ws = (struct _starpu_lws_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
+
+	int workerid = starpu_worker_get_id();
+
+	/* If the current thread is not a worker but
+	 * the main thread (-1), we find the better one to
+	 * put task on its queue */
+	if (workerid == -1)
+		workerid = select_worker(sched_ctx_id);
+
+	/* int workerid = starpu_worker_get_id(); */
+	/* print_neighborhood(sched_ctx_id, 0); */
+	
+	starpu_pthread_mutex_t *sched_mutex;
+	starpu_pthread_cond_t *sched_cond;
+	starpu_worker_get_sched_condition(workerid, &sched_mutex, &sched_cond);
+	STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
+
+	_starpu_fifo_push_task(ws->queue_array[workerid], task);
+	
+	starpu_push_task_end(task);
+
+	STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
+
+#ifndef STARPU_NON_BLOCKING_DRIVERS
+	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
+	struct starpu_sched_ctx_iterator it;
+	if(workers->init_iterator)
+		workers->init_iterator(workers, &it);
+	while(workers->has_next(workers, &it))
+	{
+		worker = workers->get_next(workers, &it);
+		starpu_pthread_mutex_t *sched_mutex;
+		starpu_pthread_cond_t *sched_cond;
+		starpu_worker_get_sched_condition(worker, &sched_mutex, &sched_cond);
+		STARPU_PTHREAD_COND_SIGNAL(sched_cond);
+	}
+#endif
+
+
+	
+	return 0;
+}
+
+static void lws_add_workers(unsigned sched_ctx_id, int *workerids,unsigned nworkers)
+{
+	struct _starpu_lws_data *ws = (struct _starpu_lws_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
+
+	unsigned i;
+	int workerid;
+
+	for (i = 0; i < nworkers; i++)
+	{
+		workerid = workerids[i];
+		starpu_sched_ctx_worker_shares_tasks_lists(workerid, sched_ctx_id);
+		ws->queue_array[workerid] = _starpu_create_fifo();
+
+		/* Tell helgrind that we are fine with getting outdated values,
+		 * this is just an estimation */
+		STARPU_HG_DISABLE_CHECKING(ws->queue_array[workerid]->ntasks);
+
+		ws->queue_array[workerid]->nprocessed = 0;
+		ws->queue_array[workerid]->ntasks = 0;
+	}
+
+
+#ifdef STARPU_HAVE_HWLOC
+	/* Build a proximity list for every worker. It is cheaper to
+	 * build this once and then use it for popping tasks rather
+	 * than traversing the hwloc tree every time a task must be
+	 * stolen */
+	ws->proxlist = (int**)malloc(nworkers*sizeof(int*));
+	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
+	struct starpu_tree *tree = (struct starpu_tree*)workers->workerids;
+	for (i = 0; i < nworkers; i++)
+	{
+		workerid = workerids[i];
+		ws->proxlist[workerid] = (int*)malloc(nworkers*sizeof(int));
+		int bindid;
+		
+		struct starpu_tree *neighbour = NULL;
+		struct starpu_sched_ctx_iterator it;
+		if(workers->init_iterator)
+			workers->init_iterator(workers, &it);
+	
+		bindid   = starpu_worker_get_bindid(workerid);
+		it.value = starpu_tree_get(tree, bindid);
+		int cnt = 0;
+		for(;;)
+		{
+			neighbour = (struct starpu_tree*)it.value;
+			int workerids[STARPU_NMAXWORKERS];
+			int nworkers = _starpu_worker_get_workerids(neighbour->id, workerids);
+			int w;
+			for(w = 0; w < nworkers; w++)
+			{
+				if(!it.visited[workerids[w]] && workers->present[workerids[w]])
+				{
+					ws->proxlist[workerid][cnt++] = workerids[w];
+					it.visited[workerids[w]] = 1;
+				}
+			}
+			if(!workers->has_next(workers, &it))
+				break;
+			it.value = it.possible_value;
+			it.possible_value = NULL;
+		} 
+	}
+#endif	
+}
+
+static void lws_remove_workers(unsigned sched_ctx_id, int *workerids, unsigned nworkers)
+{
+	struct _starpu_lws_data *ws = (struct _starpu_lws_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
+
+	unsigned i;
+	int workerid;
+
+	for (i = 0; i < nworkers; i++)
+	{
+		workerid = workerids[i];
+		_starpu_destroy_fifo(ws->queue_array[workerid]);
+#ifdef STARPU_HAVE_HWLOC
+		free(ws->proxlist[workerid]);
+#endif
+	}
+}
+
+static void lws_initialize_policy(unsigned sched_ctx_id)
+{
+#ifdef STARPU_HAVE_HWLOC
+	starpu_sched_ctx_create_worker_collection(sched_ctx_id, STARPU_WORKER_TREE);
+#else
+	starpu_sched_ctx_create_worker_collection(sched_ctx_id, STARPU_WORKER_LIST);
+#endif
+
+	struct _starpu_lws_data *ws = (struct _starpu_lws_data*)malloc(sizeof(struct _starpu_lws_data));
+	starpu_sched_ctx_set_policy_data(sched_ctx_id, (void*)ws);
+
+	ws->last_pop_worker = 0;
+	ws->last_push_worker = 0;
+
+	/* unsigned nw = starpu_sched_ctx_get_nworkers(sched_ctx_id); */
+	unsigned nw = starpu_worker_get_count();
+	ws->queue_array = (struct _starpu_fifo_taskq**)malloc(nw*sizeof(struct _starpu_fifo_taskq*));
+
+}
+	
+static void lws_deinit_policy(unsigned sched_ctx_id)
+{
+	struct _starpu_lws_data *ws = (struct _starpu_lws_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
+
+	free(ws->queue_array);
+#ifdef STARPU_HAVE_HWLOC
+	free(ws->proxlist);
+#endif
+	free(ws);
+	starpu_sched_ctx_delete_worker_collection(sched_ctx_id);
+}
+
+struct starpu_sched_policy _starpu_sched_lws_policy =
+{
+	.init_sched = lws_initialize_policy,
+	.deinit_sched = lws_deinit_policy,
+	.add_workers = lws_add_workers,
+	.remove_workers = lws_remove_workers,
+	.push_task = lws_push_task,
+	.pop_task = lws_pop_task,
+	.pre_exec_hook = NULL,
+	.post_exec_hook = NULL,
+	.pop_every_task = NULL,
+	.policy_name = "nws",
+	.policy_description = "new work stealing"
+};

+ 2 - 2
src/starpu_parameters.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011  Université de Bordeaux 1
+ * Copyright (C) 2011, 2014  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -22,7 +22,7 @@
 
 /* How many executions a codelet will have to be measured before we
  * consider that calibration will provide a value good enough for scheduling */
-#define _STARPU_CALIBRATION_MINIMUM 10
+#define _STARPU_CALIBRATION_MINIMUM ((unsigned) starpu_get_env_number_default("STARPU_CALIBRATE_MINIMUM", 10))
 
 /* Assumed relative performance ratios */
 /* TODO: benchmark a bit instead */

+ 65 - 3
src/worker_collection/worker_list.c

@@ -42,6 +42,30 @@ static int list_get_next(struct starpu_worker_collection *workers, struct starpu
 	return ret;
 }
 
+static unsigned list_has_next_master(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it)
+{
+	int nworkers = workers->nmasters;
+	STARPU_ASSERT(it != NULL);
+
+	unsigned ret = it->cursor < nworkers ;
+
+	if(!ret) it->cursor = 0;
+
+	return ret;
+}
+
+static int list_get_next_master(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it)
+{
+	int *workerids = (int *)workers->masters;
+	int nworkers = (int)workers->nmasters;
+
+	STARPU_ASSERT_MSG(it->cursor < nworkers, "cursor %d nworkers %d\n", it->cursor, nworkers);
+
+	int ret = workerids[it->cursor++];
+
+	return ret;
+}
+
 static unsigned _worker_belongs_to_ctx(struct starpu_worker_collection *workers, int workerid)
 {
 	int *workerids = (int *)workers->workerids;
@@ -108,9 +132,12 @@ static int list_remove(struct starpu_worker_collection *workers, int worker)
 {
 	int *workerids = (int *)workers->workerids;
 	unsigned nworkers = workers->nworkers;
+
+	int *masters = (int *)workers->masters;
+	unsigned nmasters = workers->nmasters;
 	
-	int found_worker = -1;
 	unsigned i;
+	int found_worker = -1;
 	for(i = 0; i < nworkers; i++)
 	{
 		if(workerids[i] == worker)
@@ -125,13 +152,29 @@ static int list_remove(struct starpu_worker_collection *workers, int worker)
 	if(found_worker != -1)
 		workers->nworkers--;
 
+	int found_master = -1;
+	for(i = 0; i < nmasters; i++)
+	{
+		if(masters[i] == worker)
+		{
+			masters[i] = -1;
+			found_master = worker;
+			break;
+		}
+	}
+
+	_rearange_workerids(masters, nmasters);
+	if(found_master != -1)
+		workers->nmasters--;
+	printf("rem %d\n", found_worker);
 	return found_worker;
 }
 
 static void _init_workers(int *workerids)
 {
 	unsigned i;
-	for(i = 0; i < STARPU_NMAXWORKERS; i++)
+	int nworkers = starpu_worker_get_count();
+	for(i = 0; i < nworkers; i++)
 		workerids[i] = -1;
 	return;
 }
@@ -139,10 +182,14 @@ static void _init_workers(int *workerids)
 static void list_init(struct starpu_worker_collection *workers)
 {
 	int *workerids = (int*)malloc(STARPU_NMAXWORKERS * sizeof(int));
+	int *masters = (int*)malloc(STARPU_NMAXWORKERS * sizeof(int));
 	_init_workers(workerids);
+	_init_workers(masters);
 
 	workers->workerids = (void*)workerids;
 	workers->nworkers = 0;
+	workers->masters = (void*)masters;
+	workers->nmasters = 0;
 
 	return;
 }
@@ -150,17 +197,32 @@ static void list_init(struct starpu_worker_collection *workers)
 static void list_deinit(struct starpu_worker_collection *workers)
 {
 	free(workers->workerids);
+	free(workers->masters);
 }
 
-static void list_init_iterator(struct starpu_worker_collection *workers STARPU_ATTRIBUTE_UNUSED, struct starpu_sched_ctx_iterator *it)
+static void list_init_iterator(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it)
 {
 	it->cursor = 0;
+
+	int *workerids = (int *)workers->workerids;
+	unsigned nworkers = workers->nworkers;
+	unsigned i;
+	int nm = 0;
+	for(i = 0;  i < nworkers; i++)
+	{
+		if(!starpu_worker_is_slave(workerids[i]))
+			((int*)workers->masters)[nm++] = workerids[i];
+	}
+	workers->nmasters = nm;
+
 }
 
 struct starpu_worker_collection worker_list =
 {
 	.has_next = list_has_next,
 	.get_next = list_get_next,
+	.has_next_master = list_has_next_master,
+	.get_next_master = list_get_next_master,
 	.add = list_add,
 	.remove = list_remove,
 	.init = list_init,

+ 84 - 4
src/worker_collection/worker_tree.c

@@ -89,6 +89,75 @@ static int tree_get_next(struct starpu_worker_collection *workers, struct starpu
 	return ret;
 }
 
+static unsigned tree_has_next_master(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it)
+{
+	STARPU_ASSERT(it != NULL);
+	if(workers->nworkers == 0)
+		return 0;
+
+	struct starpu_tree *tree = (struct starpu_tree*)workers->workerids;
+	struct starpu_tree *neighbour = starpu_tree_get_neighbour(tree, (struct starpu_tree*)it->value, it->visited, workers->is_master);
+	
+	if(!neighbour)
+	{
+		starpu_tree_reset_visited(tree, it->visited);
+		it->value = NULL;
+		it->possible_value = NULL;
+		return 0;
+	}
+	int id = -1;
+	int workerids[STARPU_NMAXWORKERS];
+	int nworkers = _starpu_worker_get_workerids(neighbour->id, workerids);
+	int w;
+	for(w = 0; w < nworkers; w++)
+	{
+		if(!it->visited[workerids[w]] && workers->is_master[workerids[w]])
+		{
+			id = workerids[w];
+			it->possible_value = neighbour;
+		}
+	}
+
+	STARPU_ASSERT_MSG(id != -1, "bind id (%d) for workerid (%d) not correct", neighbour->id, id);
+
+	return 1;
+}
+
+static int tree_get_next_master(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it)
+{
+	int ret = -1;
+	
+	struct starpu_tree *tree = (struct starpu_tree *)workers->workerids;
+	struct starpu_tree *neighbour = NULL;
+	if(it->possible_value)
+	{
+		neighbour = it->possible_value;
+		it->possible_value = NULL;
+	}
+	else
+		neighbour = starpu_tree_get_neighbour(tree, (struct starpu_tree*)it->value, it->visited, workers->is_master);
+	
+	STARPU_ASSERT_MSG(neighbour, "no element anymore");
+	
+	
+	int workerids[STARPU_NMAXWORKERS];
+	int nworkers = _starpu_worker_get_workerids(neighbour->id, workerids);
+	int w;
+	for(w = 0; w < nworkers; w++)
+	{
+		if(!it->visited[workerids[w]] && workers->is_master[workerids[w]])
+		{
+			ret = workerids[w];
+			it->visited[workerids[w]] = 1;
+			it->value = neighbour;
+		}
+	}
+	STARPU_ASSERT_MSG(ret != -1, "bind id not correct");
+
+	return ret;
+}
+
+
 static int tree_add(struct starpu_worker_collection *workers, int worker)
 {
 	struct starpu_tree *tree = (struct starpu_tree *)workers->workerids;
@@ -111,6 +180,7 @@ static int tree_remove(struct starpu_worker_collection *workers, int worker)
 	if(workers->present[worker])
 	{
 		workers->present[worker] = 0;
+		workers->is_master[worker] = 0;
 		workers->nworkers--;
 		return worker;
 	}
@@ -122,10 +192,14 @@ static void tree_init(struct starpu_worker_collection *workers)
 {
 	workers->workerids = (void*)starpu_workers_get_tree();
 	workers->nworkers = 0;
-	
+
 	int i;
-	for(i = 0; i < STARPU_NMAXWORKERS; i++)
+	int nworkers = starpu_worker_get_count();
+	for(i = 0; i < nworkers; i++)
+	{
 		workers->present[i] = 0;
+		workers->is_master[i] = 0;
+	}
 	
 	return;
 }
@@ -135,19 +209,25 @@ static void tree_deinit(struct starpu_worker_collection *workers)
 //	free(workers->workerids);
 }
 
-static void tree_init_iterator(struct starpu_worker_collection *workers STARPU_ATTRIBUTE_UNUSED, struct starpu_sched_ctx_iterator *it)
+static void tree_init_iterator(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it)
 {
 	it->value = NULL;
 	it->possible_value = NULL;
 	int i;
-	for(i = 0; i < STARPU_NMAXWORKERS; i++)
+	int nworkers = starpu_worker_get_count();
+	for(i = 0; i < nworkers; i++)
+	{
+		workers->is_master[i] = (workers->present[i] && !starpu_worker_is_slave(i));
 		it->visited[i] = 0;
+	}
 }
 
 struct starpu_worker_collection worker_tree =
 {
 	.has_next = tree_has_next,
 	.get_next = tree_get_next,
+	.has_next_master = tree_has_next_master,
+	.get_next_master = tree_get_next_master,
 	.add = tree_add,
 	.remove = tree_remove,
 	.init = tree_init,

+ 1 - 0
tests/datawizard/commute.c

@@ -171,6 +171,7 @@ int main(int argc, char **argv)
 		test(STARPU_R, STARPU_RW, i);
 	}
 
+	starpu_data_unregister(x_handle);
 	starpu_shutdown();
 	STARPU_RETURN(0);
 

+ 2 - 2
tests/heat/dmda.sh

@@ -2,7 +2,7 @@
 
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # 
-# Copyright (C) 2009, 2010  Université de Bordeaux 1
+# Copyright (C) 2009, 2010, 2014  Université de Bordeaux 1
 # Copyright (C) 2010  Centre National de la Recherche Scientifique
 # 
 # StarPU is free software; you can redistribute it and/or modify
@@ -52,7 +52,7 @@ export STARPU_PERF_MODEL_DIR=$SAMPLINGDIR
 mkdir -p $TIMINGDIR
 mkdir -p $SAMPLINGDIR
 
-#schedlist="ws no-prio greedy prio dm random"
+#schedlist="ws lws no-prio greedy prio dm random"
 #schedlist="random random random random"
 
 export STARPU_NCUDA=3

+ 5 - 3
tests/heat/gflops_sched.gp

@@ -3,7 +3,7 @@
 
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # 
-# Copyright (C) 2008, 2009  Université de Bordeaux 1
+# Copyright (C) 2008, 2009, 2014  Université de Bordeaux 1
 # Copyright (C) 2010  Centre National de la Recherche Scientifique
 # 
 # StarPU is free software; you can redistribute it and/or modify
@@ -30,7 +30,8 @@ set key right bottom
 set datafile missing 'x'
 plot "timings/gflops.merged.data" usi 1:(2*$1*$1*$1 / (3*$2* 1000000)) with linespoint title "greedy"  ,\
      "timings/gflops.merged.data" usi 1:(2*$1*$1*$1 / (3*$4* 1000000)) with linespoint title "prio" 	    ,\
-     "timings/gflops.merged.data" usi 1:(2*$1*$1*$1 / (3*$6* 1000000)) with linespoint title "ws" 
+     "timings/gflops.merged.data" usi 1:(2*$1*$1*$1 / (3*$4* 1000000)) with linespoint title "ws" 	    ,\
+     "timings/gflops.merged.data" usi 1:(2*$1*$1*$1 / (3*$6* 1000000)) with linespoint title "lws" 
 
 set output "gflops_sched_gain.eps"
 set title "LU Decomposition : scheduling strategies : gain"
@@ -43,4 +44,5 @@ set logscale x
 set key right bottom
 set datafile missing 'x'
 plot "timings/gflops.merged.data" usi 1:(100*(($2 / $4)-1)) with linespoint title "gain prio"	,\
-	"timings/gflops.merged.data" usi 1:(100*(($2 / $6)-1)) with linespoint title "gain ws"    
+	"timings/gflops.merged.data" usi 1:(100*(($2 / $6)-1)) with linespoint title "gain ws"    ,\
+	"timings/gflops.merged.data" usi 1:(100*(($2 / $6)-1)) with linespoint title "gain lws"    

+ 10 - 1
tests/heat/gflops_sched.sh

@@ -2,7 +2,7 @@
 
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # 
-# Copyright (C) 2008, 2009, 2010  Université de Bordeaux 1
+# Copyright (C) 2008, 2009, 2010, 2014  Université de Bordeaux 1
 # Copyright (C) 2010  Centre National de la Recherche Scientifique
 # 
 # StarPU is free software; you can redistribute it and/or modify
@@ -137,6 +137,15 @@ do
 done
 
 
+filename=$TIMINGDIR/gflops.lws.data
+policy=lws
+trace_header 
+for size in $sizelist
+do
+	trace_size $size;
+done
+
+
 filename=$TIMINGDIR/gflops.noprio.data
 policy=no-prio
 trace_header 

+ 2 - 2
tests/heat/granularity.r

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2010  Université de Bordeaux 1
+# Copyright (C) 2010, 2014  Université de Bordeaux 1
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -19,7 +19,7 @@ max <- 28
 maxy <- 400
 
 sizelist <- seq(2048, max*1024, 64);
-#schedlist <- c("greedy", "prio", "dm", "random", "no-prio", "ws");
+#schedlist <- c("greedy", "prio", "dm", "random", "no-prio", "ws", "lws");
 #schedlist <- c("greedy", "prio", "dm", "random");
 # grainlist <- c(64, 128, 256, 512, 768, 1024, 1280, 1536, 2048);
 grainlist <- c(256, 512, 1024, 2048);

+ 2 - 2
tests/heat/granularity_model.r

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2010  Université de Bordeaux 1
+# Copyright (C) 2010, 2014  Université de Bordeaux 1
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -17,7 +17,7 @@
 max <- 30
 
 sizelist <- seq(64, max*1024, 64);
-#schedlist <- c("greedy", "prio", "dm", "random", "no-prio", "ws");
+#schedlist <- c("greedy", "prio", "dm", "random", "no-prio", "ws", "lws");
 #schedlist <- c("greedy", "prio", "dm", "random");
 #grainlist <- c(256, 512, 1024)
 grainlist <- c(512, 1024)

+ 2 - 2
tests/heat/model.r

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2010  Université de Bordeaux 1
+# Copyright (C) 2010, 2014  Université de Bordeaux 1
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -15,7 +15,7 @@
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
 sizelist <- seq(2048, 24576, 2048);
-#schedlist <- c("greedy", "prio", "dm", "random", "no-prio", "ws");
+#schedlist <- c("greedy", "prio", "dm", "random", "no-prio", "ws", "lws");
 schedlist <- c("prio", "dm", "random");
 
 print(schedlist);

+ 4 - 3
tests/heat/random.r

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2010  Université de Bordeaux 1
+# Copyright (C) 2010, 2014  Université de Bordeaux 1
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -15,7 +15,7 @@
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
 sizelist <- seq(2048, 24576, 2048);
-#schedlist <- c("greedy", "prio", "dm", "random", "no-prio", "ws");
+#schedlist <- c("greedy", "prio", "dm", "random", "no-prio", "ws", "lws");
 schedlist <- c("prio","random");
 
 print(schedlist);
@@ -97,13 +97,14 @@ display_sched <- function()
 	trace_sched("prio", "red", 4);
 	#trace_sched("no-prio", "black");
 	#trace_sched("ws", "purple");
+	#trace_sched("lws", "purple");
 
 	axis(1, at=sizelist)
 	axis(2, at=seq(0, 100, 10), tck=1)
 #	axis(4, at=seq(0, 100, 10))
 	box(bty="u")
 
-        #labels <- c("greedy", "priority", "model", "random", "black", "ws")
+        #labels <- c("greedy", "priority", "model", "random", "black", "ws", "lws")
 #        labels <- c("greedy", "priority", "model", "random")
 	#labels <- c("model", "weighted random", "greedy", "priority")
 	labels <- c("weighted random", "priority")

+ 2 - 2
tests/heat/sched.r

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2010  Université de Bordeaux 1
+# Copyright (C) 2010, 2014  Université de Bordeaux 1
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -15,7 +15,7 @@
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
 sizelist <- seq(2048, 24576, 2048);
-#schedlist <- c("greedy", "prio", "dm", "random", "no-prio", "ws");
+#schedlist <- c("greedy", "prio", "dm", "random", "no-prio", "ws", "lws");
 schedlist <- c("greedy", "prio", "dm", "random");
 
 print(schedlist);

+ 2 - 2
tests/heat/sched.sh

@@ -2,7 +2,7 @@
 
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # 
-# Copyright (C) 2008, 2009, 2010  Université de Bordeaux 1
+# Copyright (C) 2008, 2009, 2010, 2014  Université de Bordeaux 1
 # Copyright (C) 2010  Centre National de la Recherche Scientifique
 # 
 # StarPU is free software; you can redistribute it and/or modify
@@ -94,7 +94,7 @@ export STARPU_PERF_MODEL_DIR=$SAMPLINGDIR
 mkdir -p $TIMINGDIR
 mkdir -p $SAMPLINGDIR
 
-#schedlist="ws no-prio greedy prio dm random"
+#schedlist="ws lws no-prio greedy prio dm random"
 #schedlist="random random random random"
 
 export STARPU_NCUDA=3

+ 5 - 2
tests/main/driver_api/init_run_deinit.c

@@ -49,8 +49,11 @@ run(struct starpu_task *task, struct starpu_driver *d)
 	int ret;
 	ret = starpu_task_submit(task);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
-	ret = starpu_driver_run_once(d);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_driver_run_once");
+	while (!starpu_task_finished(task))
+	{
+		ret = starpu_driver_run_once(d);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_driver_run_once");
+	}
 	ret = starpu_task_wait(task);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait");
 }

+ 3 - 1
tests/main/subgraph_repeat.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2012-2014  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -164,6 +164,7 @@ int main(int argc, char **argv)
 	STARPU_ASSERT(*check_cnt == (4*loop_cnt));
 
 	starpu_free(check_cnt);
+	starpu_data_unregister(check_data);
 
 	starpu_shutdown();
 
@@ -179,6 +180,7 @@ enodev:
 	fprintf(stderr, "WARNING: No one can execute this task\n");
 	/* yes, we do not perform the computation but we did detect that no one
  	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_data_unregister(check_data);
 	starpu_shutdown();
 	return STARPU_TEST_SKIPPED;
 }

+ 3 - 1
tests/main/subgraph_repeat_regenerate.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2014  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -168,6 +168,7 @@ int main(int argc, char **argv)
 	STARPU_ASSERT(*check_cnt == (4*loop_cnt));
 
 	starpu_free(check_cnt);
+	starpu_data_unregister(check_data);
 
 	starpu_shutdown();
 
@@ -183,6 +184,7 @@ enodev:
 	fprintf(stderr, "WARNING: No one can execute this task\n");
 	/* yes, we do not perform the computation but we did detect that no one
  	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_data_unregister(check_data);
 	starpu_shutdown();
 	return STARPU_TEST_SKIPPED;
 }

+ 2 - 1
tests/main/subgraph_repeat_regenerate_tag.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2014  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -198,6 +198,7 @@ enodev:
 	fprintf(stderr, "WARNING: No one can execute this task\n");
 	/* yes, we do not perform the computation but we did detect that no one
  	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_data_unregister(check_data);
 	starpu_shutdown();
 	return STARPU_TEST_SKIPPED;
 }

+ 2 - 1
tests/main/subgraph_repeat_tag.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2014  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -182,6 +182,7 @@ enodev:
 	fprintf(stderr, "WARNING: No one can execute this task\n");
 	/* yes, we do not perform the computation but we did detect that no one
  	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_data_unregister(check_data);
 	starpu_shutdown();
 	return STARPU_TEST_SKIPPED;
 }

+ 5 - 2
tests/perfmodels/feed.c

@@ -50,8 +50,11 @@ int main(int argc, char **argv)
 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-	 if(starpu_worker_get_count_by_type(STARPU_CUDA_WORKER) < 2)
+	 if (starpu_worker_get_count_by_type(STARPU_CUDA_WORKER) < 2)
+	 {
+		 starpu_shutdown();
 		 return STARPU_TEST_SKIPPED;
+	 }
 
 	starpu_task_init(&task);
 	task.cl = &cl;
@@ -76,7 +79,7 @@ int main(int argc, char **argv)
 		arch.devid = 0;
 		starpu_perfmodel_update_history(&model, &task, &arch, 0, 0, measured_fast);
 		starpu_perfmodel_update_history(&nl_model, &task, &arch, 0, 0, measured_fast);
-		
+
 		/* Simulate Slow GPU */
 		arch.devid = 1;
 		starpu_perfmodel_update_history(&model, &task, &arch, 0, 0, measured_slow);

+ 3 - 1
tests/regression/profiles.in

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2010  Université de Bordeaux 1
+# Copyright (C) 2010, 2014  Université de Bordeaux 1
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -32,6 +32,8 @@ STARPU_NCUDA=1
 # Execution configuration
 STARPU_SCHED=ws
 # Execution configuration
+STARPU_SCHED=lws
+# Execution configuration
 STARPU_SCHED=prio
 # Execution configuration
 STARPU_SCHED=no-prio

+ 5 - 1
tests/regression/regression_test.sh

@@ -2,7 +2,7 @@
 
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # 
-# Copyright (C) 2008, 2009, 2010  Université de Bordeaux 1
+# Copyright (C) 2008, 2009, 2010, 2014  Université de Bordeaux 1
 # Copyright (C) 2010  Centre National de la Recherche Scientifique
 # 
 # StarPU is free software; you can redistribute it and/or modify
@@ -65,6 +65,10 @@ echo "heat.ws.8k.v2"
 timing=`STARPU_SCHED="ws" $ROOTDIR/examples/heat/heat -ntheta 66 -nthick 130 -nblocks 8 -pin -v2 2> log`
 save_cov "heat.ws.8k.v2";
 
+echo "heat.lws.8k.v2"
+timing=`STARPU_SCHED="lws" $ROOTDIR/examples/heat/heat -ntheta 66 -nthick 130 -nblocks 8 -pin -v2 2> log`
+save_cov "heat.lws.8k.v2";
+
 echo "heat.greedy.8k.v2"
 timing=`STARPU_SCHED="greedy" $ROOTDIR/examples/heat/heat -ntheta 66 -nthick 130 -nblocks 8 -pin -v2 2> log`
 save_cov "heat.greedy.8k.v2";