Samuel Thibault 11 роки тому
батько
коміт
03b796bdcb
82 змінених файлів з 1502 додано та 371 видалено
  1. 1 0
      AUTHORS
  2. 74 51
      ChangeLog
  3. 9 1
      configure.ac
  4. 5 0
      doc/doxygen/chapters/08scheduling.doxy
  5. 8 0
      doc/doxygen/chapters/13offline_performance_tools.doxy
  6. 46 42
      doc/doxygen/chapters/16mpi_support.doxy
  7. 8 0
      doc/doxygen/chapters/41configure_options.doxy
  8. 6 1
      doc/doxygen/chapters/api/insert_task.doxy
  9. 5 0
      examples/Makefile.am
  10. 2 0
      examples/axpy/axpy.c
  11. 2 2
      examples/binary/binary.c
  12. 5 0
      examples/cg/cg_kernels.c
  13. 3 0
      examples/cholesky/cholesky_implicit.c
  14. 2 1
      examples/cpp/incrementer_cpp.cpp
  15. 1 10
      examples/lu/xlu.c
  16. 12 1
      examples/lu/xlu.h
  17. 9 1
      examples/lu/xlu_implicit.c
  18. 15 1
      examples/lu/xlu_implicit_pivot.c
  19. 1 12
      examples/lu/xlu_pivot.c
  20. 1 0
      examples/mandelbrot/mandelbrot.c
  21. 5 1
      examples/pipeline/pipeline.c
  22. 212 0
      examples/sched_ctx/nested_sched_ctxs.c
  23. 5 1
      examples/sched_ctx/sched_ctx_without_sched_policy.c
  24. 2 2
      examples/scheduler/schedulers.sh
  25. 1 0
      examples/worker_collections/worker_list_example.c
  26. 8 0
      include/starpu_sched_ctx.h
  27. 31 27
      include/starpu_task.h
  28. 2 1
      include/starpu_task_util.h
  29. 7 0
      include/starpu_worker.h
  30. 3 1
      mpi/include/starpu_mpi.h
  31. 61 45
      mpi/src/starpu_mpi.c
  32. 12 0
      mpi/src/starpu_mpi_task_insert.c
  33. 1 0
      src/Makefile.am
  34. 11 4
      src/common/fxt.h
  35. 16 16
      src/core/jobs.h
  36. 56 2
      src/core/sched_ctx.c
  37. 3 0
      src/core/sched_ctx.h
  38. 1 0
      src/core/sched_policy.c
  39. 1 0
      src/core/sched_policy.h
  40. 22 3
      src/core/simgrid.c
  41. 10 2
      src/core/tree.c
  42. 54 38
      src/core/workers.c
  43. 3 0
      src/core/workers.h
  44. 15 8
      src/datawizard/coherency.c
  45. 11 11
      src/datawizard/coherency.h
  46. 1 1
      src/datawizard/datawizard.c
  47. 2 1
      src/datawizard/filters.c
  48. 2 1
      src/datawizard/interfaces/data_interface.c
  49. 5 1
      src/datawizard/reduction.c
  50. 2 4
      src/datawizard/user_interactions.c
  51. 86 5
      src/debug/traces/starpu_fxt.c
  52. 11 0
      src/debug/traces/starpu_paje.c
  53. 1 1
      src/drivers/cpu/driver_cpu.c
  54. 1 1
      src/drivers/cuda/driver_cuda.c
  55. 2 2
      src/drivers/driver_common/driver_common.c
  56. 2 2
      src/drivers/driver_common/driver_common.h
  57. 1 1
      src/drivers/mp_common/source_common.c
  58. 1 1
      src/drivers/opencl/driver_opencl.c
  59. 17 10
      src/sched_policies/deque_modeling_policy_data_aware.c
  60. 13 2
      src/sched_policies/eager_central_policy.c
  61. 373 0
      src/sched_policies/locality_work_stealing_policy.c
  62. 7 3
      src/util/starpu_task_insert_utils.c
  63. 65 3
      src/worker_collection/worker_list.c
  64. 84 4
      src/worker_collection/worker_tree.c
  65. 1 0
      tests/datawizard/commute.c
  66. 3 16
      tests/datawizard/increment_init.c
  67. 2 2
      tests/heat/dmda.sh
  68. 5 3
      tests/heat/gflops_sched.gp
  69. 10 1
      tests/heat/gflops_sched.sh
  70. 2 2
      tests/heat/granularity.r
  71. 2 2
      tests/heat/granularity_model.r
  72. 2 2
      tests/heat/model.r
  73. 4 3
      tests/heat/random.r
  74. 2 2
      tests/heat/sched.r
  75. 2 2
      tests/heat/sched.sh
  76. 3 1
      tests/main/subgraph_repeat.c
  77. 3 1
      tests/main/subgraph_repeat_regenerate.c
  78. 2 1
      tests/main/subgraph_repeat_regenerate_tag.c
  79. 2 1
      tests/main/subgraph_repeat_tag.c
  80. 5 2
      tests/perfmodels/feed.c
  81. 3 1
      tests/regression/profiles.in
  82. 5 1
      tests/regression/regression_test.sh

+ 1 - 0
AUTHORS

@@ -1,6 +1,7 @@
 Simon Archipoff <simon.archipoff@etu.u-bordeaux1.fr>
 Cédric Augonnet <cedric.augonnet@inria.fr>
 William Braik <wbraik@gmail.com>
+Alfredo Buttari <alfredo.buttari@enseeiht.fr>
 Jérôme Clet-Ortega <jerome.clet-ortega@labri.fr>
 Nicolas Collin <nicolas.collin@inria.fr>
 Ludovic Courtès <ludovic.courtes@inria.fr>

+ 74 - 51
ChangeLog

@@ -17,28 +17,6 @@
 StarPU 1.2.0 (svn revision xxxx)
 ==============================================
 
-Small features:
-  * New function starpu_sched_ctx_display_workers() to display worker
-    information belonging to a given scheduler context
-  * The option --enable-verbose can be called with
-    --enable-verbose=extra to increase the verbosity
-
-StarPU 1.1.2 (svn revision xxxx)
-==============================================
-The scheduling context release
-
-New features:
-  * The reduction init codelet is automatically used to initialize temporary
-    buffers.
-  * Traces now include a "scheduling" state, to show the overhead of the
-    scheduler.
-  * Add STARPU_CALIBRATE_MINIMUM environment variable to specify the minimum
-    number of calibration measurements.
-
-StarPU 1.1.1 (svn revision 12638)
-==============================================
-The scheduling context release
-
 New features:
   * Xeon Phi support
   * SCC support
@@ -52,50 +30,95 @@ New features:
 	  before the corresponding data, which allows the receiver to
 	  allocate data correctly, and to submit the matching receive of
 	  the envelope.
+        - New function
+   	  starpu_mpi_irecv_detached_sequential_consistency which
+	  allows to enable or disable the sequential consistency for
+	  the given data handle (sequential consistency will be
+	  enabled or disabled based on the value of the function
+	  parameter and the value of the sequential consistency
+	  defined for the given data)
+        - New functions starpu_mpi_task_build() and
+  	  starpu_mpi_task_post_build()
   * New STARPU_COMMUTE flag which can be passed along STARPU_W or STARPU_RW to
     let starpu commute write accesses.
   * Out-of-core support, through registration of disk areas as additional memory
     nodes.
-  * StarPU-MPI: new function
-    starpu_mpi_irecv_detached_sequential_consistency which allows to
-    enable or disable the sequential consistency for the given data
-    handle (sequential consistency will be enabled or disabled based
-    on the value of the function parameter and the value of the
-    sequential consistency defined for the given data)
   * New hierarchical schedulers which allow the user to easily build
     its own scheduler, by coding itself each "box" it wants, or by
     combining existing boxes in StarPU to build it. Hierarchical
     schedulers have very interesting scalability properties.
-  * New functions starpu_mpi_task_build() and starpu_mpi_task_post_build()
-  * New functions starpu_pause() and starpu_resume()
-  * New codelet specific_nodes field to specify explicit target nodes for data.
-  * Use streams for all CUDA transfers, even initiated by CPUs.
   * Add STARPU_CUDA_ASYNC and STARPU_OPENCL_ASYNC flags to allow asynchronous
     CUDA and OpenCL kernel execution.
-  * Add paje traces statistics tools.
   * Add CUDA concurrent kernel execution support through
     the STARPU_NWORKER_PER_CUDA environment variable.
-  * Use streams for GPUA->GPUB and GPUB->GPUA transfers.
+  * New locality work stealing scheduler (lws).
 
 Small features:
+  * Tasks can now have a name (via the field const char *name of
+    struct starpu_task)
   * New functions starpu_data_acquire_cb_sequential_consistency() and
     starpu_data_acquire_on_node_cb_sequential_consistency() which allows
     to enable or disable sequential consistency
   * New configure option --enable-fxt-lock which enables additional
     trace events focused on locks behaviour during the execution
-  * New function starpu_perfmodel_directory() to print directory
-    storing performance models. Available through the new option -d of
-    the tool starpu_perfmodel_display
-  * New batch files to execute StarPU applications under Microsoft
-    Visual Studio (They are installed in path_to_starpu/bin/msvc)/
   * Functions starpu_insert_task and starpu_mpi_insert_task are
     renamed in starpu_task_insert and starpu_mpi_task_insert. Old
     names are kept to avoid breaking old codes.
   * New configure option --enable-calibration-heuristic which allows
     the user to set the maximum authorized deviation of the
     history-based calibrator.
-  * Tasks can now have a name (via the field const char *name of
-    struct starpu_task)
+  * Allow application to provide the task footprint itself.
+  * New function starpu_sched_ctx_display_workers() to display worker
+    information belonging to a given scheduler context
+  * The option --enable-verbose can be called with
+    --enable-verbose=extra to increase the verbosity
+  * Add codelet size, footprint and tag id in the paje trace.
+  * Add STARPU_TAG_ONLY, to specify a tag for traces without making StarPU
+    manage the tag.
+
+Changes:
+  * Data interfaces (variable, vector, matrix and block) now define
+    pack and unpack functions
+  * StarPU-MPI: Fix for being able to receive data which have not yet
+    been registered by the application (i.e. it did not call
+    starpu_data_set_tag(); data are then received as raw memory)
+  * StarPU-MPI: Fix for being able to receive data with the same tag
+    from several nodes (see mpi/tests/gather.c)
+
+Small changes:
+  * Rename function starpu_trace_user_event() as
+    starpu_fxt_trace_user_event()
+
+StarPU 1.1.2 (svn revision xxx)
+==============================================
+The scheduling context release
+
+New features:
+  * The reduction init codelet is automatically used to initialize temporary
+    buffers.
+  * Traces now include a "scheduling" state, to show the overhead of the
+    scheduler.
+  * Add STARPU_CALIBRATE_MINIMUM environment variable to specify the minimum
+    number of calibration measurements.
+
+StarPU 1.1.1 (svn revision 12638)
+==============================================
+The scheduling context release
+
+New features:
+  * MPI:
+        - New variable STARPU_MPI_CACHE_STATS to print statistics on
+   	  cache holding received data.
+        - New function starpu_mpi_data_register() which sets the rank
+  	  and tag of a data, and also allows to automatically clear
+	  the MPI communication cache when unregistering the data. It
+	  should be called instead of both calling
+	  starpu_data_set_tag() and starpu_data_set_rank()
+  * Use streams for all CUDA transfers, even initiated by CPUs.
+  * Add paje traces statistics tools.
+  * Use streams for GPUA->GPUB and GPUB->GPUA transfers.
+
+Small features:
   * New STARPU_EXECUTE_ON_WORKER flag to specify the worker on which
     to execute the task.
   * New STARPU_DISABLE_PINNING environment variable to disable host memory
@@ -105,23 +128,23 @@ Small features:
   * New starpu_memory_get_total function to get the size of a memory node.
   * New starpu_parallel_task_barrier_init_n function to let a scheduler decide
     a set of workers without going through combined workers.
-  * Allow application to provide the task footprint itself.
 
 Changes:
-  * Data interfaces (variable, vector, matrix and block) now define
-    pack und unpack functions
-  * StarPU-MPI: Fix for being able to receive data which have not yet
-    been registered by the application (i.e it did not call
-    starpu_data_set_tag(), data are received as a raw memory)
-  * StarPU-MPI: Fix for being able to receive data with the same tag
-    from several nodes (see mpi/tests/gather.c)
+  * Fix simgrid execution.
+  * Rename starpu_get_nready_tasks_of_sched_ctx to starpu_sched_ctx_get_nready_tasks
+  * Rename starpu_get_nready_flops_of_sched_ctx to starpu_sched_ctx_get_nready_flops
+  * New functions starpu_pause() and starpu_resume()
+  * New codelet specific_nodes field to specify explicit target nodes for data.
   * StarPU-MPI: Fix overzealous allocation of memory.
   * Interfaces: Allow interface implementation to change pointers at will, in
     unpack notably.
 
 Small changes:
-  * Rename function starpu_trace_user_event() as
-    starpu_fxt_trace_user_event()
+  * Use big fat abortions when one tries to make a task or callback
+    sleep, instead of just returning EDEADLK which few people will test
+  * By default, StarPU FFT examples are not compiled and checked, the
+    configure option --enable-starpufft-examples needs to be specified
+    to change this behaviour.
 
 StarPU 1.1.0 (svn revision 11960)
 ==============================================

+ 9 - 1
configure.ac

@@ -278,6 +278,8 @@ AC_CHECK_FUNC([sched_yield], [AC_DEFINE([STARPU_HAVE_SCHED_YIELD], [1], [Define
 
 AC_CHECK_HEADERS([aio.h])
 
+AC_CHECK_FUNCS([mkstemps])
+
 # This defines HAVE_SYNC_VAL_COMPARE_AND_SWAP
 STARPU_CHECK_SYNC_VAL_COMPARE_AND_SWAP
 
@@ -997,7 +999,7 @@ if test x$enable_simgrid = xyes ; then
 			AC_MSG_ERROR(Simgrid support needs simgrid installed)
 		]
 	)
-   	AC_CHECK_FUNCS([MSG_process_join MSG_get_as_by_name])
+   	AC_CHECK_FUNCS([MSG_process_join MSG_get_as_by_name MSG_environment_get_routing_root])
 	AC_CHECK_FUNCS([xbt_barrier_init], [AC_DEFINE([STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT], [1], [Define to 1 if you have the `xbt_barrier_init' function.])])
 	AC_COMPILE_IFELSE([AC_LANG_PROGRAM(
 		    		[[#include <msg/msg.h>]],
@@ -1482,6 +1484,12 @@ if test x$use_fxt = xyes; then
 	AC_CHECK_DECLS([fut_set_filename])
 	CFLAGS="$save_CFLAGS"
 
+        AC_ARG_ENABLE(paje-codelet-details, [AS_HELP_STRING([--enable-paje-codelet-details],
+			[enable details about codelets in the paje trace])],
+			enable_paje_codelet_details=$enableval, enable_paje_codelet_details=no)
+        if  test x$enable_paje_codelet_details = xyes; then
+        	AC_DEFINE(STARPU_ENABLE_PAJE_CODELET_DETAILS, [1], [enable details about codelets in the paje trace])
+        fi
 	##########################################
 	# Poti is a library to generate paje trace files
 	##########################################

+ 5 - 0
doc/doxygen/chapters/08scheduling.doxy

@@ -45,6 +45,11 @@ a task on the worker which released it by
 default. When a worker becomes idle, it steals a task from the most loaded
 worker.
 
+The <b>lws</b> (locality work stealing) scheduler uses a queue per worker, and schedules
+a task on the worker which released it by
+default. When a worker becomes idle, it steals a task from neighbour workers. It
+also takes into account priorities.
+
 The <b>dm</b> (deque model) scheduler takes task execution performance models into account to
 perform a HEFT-similar scheduling strategy: it schedules tasks where their
 termination time will be minimal. The difference with HEFT is that <b>dm</b>

+ 8 - 0
doc/doxygen/chapters/13offline_performance_tools.doxy

@@ -118,6 +118,9 @@ $ vite paje.trace
 
 To get names of tasks instead of "unknown", fill the optional
 starpu_codelet::name, or use a performance model for them.
+Details of the codelet execution can be obtained by passing
+<c>--enable-paje-codelet-details</c> and using a recent enough version of ViTE
+(at least r1430).
 
 In the MPI execution case, collect the trace files from the MPI nodes, and
 specify them all on the command <c>starpu_fxt_tool</c>, for instance:
@@ -129,6 +132,11 @@ $ starpu_fxt_tool -i filename1 -i filename2
 By default, all tasks are displayed using a green color. To display tasks with
 varying colors, pass option <c>-c</c> to <c>starpu_fxt_tool</c>.
 
+To identify tasks precisely, the application can set the ::tag_id field of the
+tasks (or use STARPU_TAG_ONLY when using starpu_task_insert), and with a recent
+enough version of vite (>= r1430) and the <c>--enable-paje-codelet-details</c>
+configure option, the value of the tag will show up in the trace.
+
 Traces can also be inspected by hand by using the tool <c>fxt_print</c>, for instance:
 
 \verbatim

+ 46 - 42
doc/doxygen/chapters/16mpi_support.doxy

@@ -121,49 +121,53 @@ automatically released. This mechanism is similar to the pthread
 detach state attribute which determines whether a thread will be
 created in a joinable or a detached state.
 
-For any communication, the call of the function will result in the
-creation of a StarPU-MPI request, the function
-starpu_data_acquire_cb() is then called to asynchronously request
-StarPU to fetch the data in main memory; when the data is available in
-main memory, a StarPU-MPI function is called to put the new request in
-the list of the ready requests if it is a send request, or in an
-hashmap if it is a receive request.
-
-Internally, all MPI communications submitted by StarPU uses a unique
-tag which has a default value, and can be accessed with the functions
+Internally, every communication is divided into 2 communications: a first
+message is used to exchange an envelope describing the data (i.e. its
+tag and its size), and the data itself is sent in a second message. All
+MPI communications submitted by StarPU use a unique tag which has a
+default value, and can be accessed with the functions
 starpu_mpi_get_communication_tag() and
-starpu_mpi_set_communication_tag().
-
-The matching of tags with corresponding requests is done into StarPU-MPI.
-To handle this, any communication is a double-communication based on a
-envelope + data system. Every data which will be sent needs to send an
-envelope which describes the data (particularly its tag) before sending
-the data, so the receiver can get the matching pending receive request
-from the hashmap, and submit it to recieve the data correctly.
-
-To this aim, the StarPU-MPI progression thread has a permanent-submitted
-request destined to receive incoming envelopes from all sources.
-
-The StarPU-MPI progression thread regularly polls this list of ready
-requests. For each new ready request, the appropriate function is
-called to post the corresponding MPI call. For example, calling
-starpu_mpi_isend() will result in posting <c>MPI_Isend</c>. If
-the request is marked as detached, the request will be put in the list
-of detached requests.
-
-The StarPU-MPI progression thread also polls the list of detached
-requests. For each detached request, it regularly tests the completion
-of the MPI request by calling <c>MPI_Test</c>. On completion, the data
-handle is released, and if a callback was defined, it is called.
-
-Finally, the StarPU-MPI progression thread checks if an envelope has
-arrived. If it is, it'll check if the corresponding receive has already
-been submitted by the application. If it is, it'll submit the request
-just as like as it does with those on the list of ready requests.
-If it is not, it'll allocate a temporary handle to store the data that
-will arrive just after, so as when the corresponding receive request
-will be submitted by the application, it'll copy this temporary handle
-into its one instead of submitting a new StarPU-MPI request.
+starpu_mpi_set_communication_tag(). The matching of tags with
+corresponding requests is done within StarPU-MPI.
+
+For any userland communication, the call of the corresponding function
+(e.g starpu_mpi_isend()) will result in the creation of a StarPU-MPI
+request, the function starpu_data_acquire_cb() is then called to
+asynchronously request StarPU to fetch the data in main memory; when
+the data is ready and the corresponding buffer has already been
+received by MPI, it will be copied in the memory of the data,
+otherwise the request is stored in the <em>early requests list</em>. Sending
+requests are stored in the <em>ready requests list</em>.
+
+While requests need to be processed, the StarPU-MPI progression thread
+does the following:
+
+<ol>
+<li> it polls the <em>ready requests list</em>. For all the ready
+requests, the appropriate function is called to post the corresponding
+MPI call. For example, an initial call to starpu_mpi_isend() will
+result in a call to <c>MPI_Isend</c>. If the request is marked as
+detached, the request will then be added in the <em>detached requests
+list</em>.
+</li>
+<li> it posts a <c>MPI_Irecv()</c> to retrieve a data envelope.
+</li>
+<li> it polls the <em>detached requests list</em>. For each detached
+request, it tests the completion of the MPI request by calling
+<c>MPI_Test</c>. On completion, the data handle is released, and if a
+callback was defined, it is called.
+</li>
+<li> finally, it checks if a data envelope has been received. If so,
+if the data envelope matches a request in the <em>early requests list</em> (i.e
+the request has already been posted by the application), the
+corresponding MPI call is posted (similarly to the first step above).
+
+If the data envelope does not match any application request, a
+temporary handle is created to receive the data, a StarPU-MPI request
+is created and added into the <em>ready requests list</em>, and thus will be
+processed in the first step of the next loop.
+</li>
+</ol>
 
 \ref MPIPtpCommunication "Communication" gives the list of all the
 point to point communications defined in StarPU-MPI.

+ 8 - 0
doc/doxygen/chapters/41configure_options.doxy

@@ -372,6 +372,14 @@ Enable performance debugging through gprof.
 Enable performance model debugging.
 </dd>
 
+<dt>--enable-paje-codelet-details</dt>
+<dd>
+\anchor enable-paje-codelet-details
+\addindex __configure__--enable-paje-codelet-details
+Enable details about codelets in the paje trace. This requires a recent enough
+version of ViTE (at least r1430).
+</dd>
+
 <dt>--enable-fxt-lock</dt>
 <dd>
 \anchor enable-fxt-lock

+ 6 - 1
doc/doxygen/chapters/api/insert_task.doxy

@@ -28,7 +28,7 @@ specifying the worker on which to execute the task (as specified by
 starpu_task::execute_on_a_specific_worker)
 <li> the specific values ::STARPU_VALUE, ::STARPU_CALLBACK,
 ::STARPU_CALLBACK_ARG, ::STARPU_CALLBACK_WITH_ARG, ::STARPU_PRIORITY,
-::STARPU_TAG, ::STARPU_FLOPS, ::STARPU_SCHED_CTX followed by the
+::STARPU_TAG, ::STARPU_TAG_ONLY, ::STARPU_FLOPS, ::STARPU_SCHED_CTX followed by the
 appropriated objects as defined elsewhere.
 </ul>
 
@@ -84,6 +84,11 @@ the task (as specified by starpu_task::execute_on_a_specific_worker)
 \ingroup API_Insert_Task
 this macro is used when calling starpu_task_insert(), and must be followed by a tag.
 
+\def STARPU_TAG_ONLY
+\ingroup API_Insert_Task
+this macro is used when calling starpu_task_insert(), and must be followed by a tag.
+It sets ::tag_id, but leaves ::use_tag as 0.
+
 \def STARPU_FLOPS
 \ingroup API_Insert_Task
 this macro is used when calling starpu_task_insert(), and must

+ 5 - 0
examples/Makefile.am

@@ -190,6 +190,7 @@ examplebin_PROGRAMS +=				\
 	sched_ctx/dummy_sched_with_ctx		\
 	sched_ctx/prio				\
 	sched_ctx/sched_ctx_without_sched_policy\
+	sched_ctx/nested_sched_ctxs		\
 	worker_collections/worker_tree_example  \
 	worker_collections/worker_list_example  \
 	reductions/dot_product			\
@@ -270,6 +271,7 @@ STARPU_EXAMPLES +=				\
 	sched_ctx/prio				\
 	sched_ctx/dummy_sched_with_ctx		\
 	sched_ctx/sched_ctx_without_sched_policy\
+	sched_ctx/nested_sched_ctxs		\
 	worker_collections/worker_tree_example  \
 	worker_collections/worker_list_example  \
 	reductions/dot_product			\
@@ -925,6 +927,9 @@ sched_ctx_parallel_code_CFLAGS = \
 sched_ctx_sched_ctx_without_sched_policy_CFLAGS = \
 	$(AM_CFLAGS) -fopenmp
 
+sched_ctx_nested_sched_ctxs_CFLAGS = \
+	$(AM_CFLAGS) -fopenmp
+
 endif
 
 showcheck:

+ 2 - 0
examples/axpy/axpy.c

@@ -183,6 +183,8 @@ int main(int argc, char **argv)
 		task->handles[0] = starpu_data_get_sub_data(_handle_x, 1, b);
 		task->handles[1] = starpu_data_get_sub_data(_handle_y, 1, b);
 
+		task->tag_id = b;
+
 		ret = starpu_task_submit(task);
 		if (ret == -ENODEV)
 		{

+ 2 - 2
examples/binary/binary.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010-2011, 2013  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010-2011, 2013-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -68,7 +68,7 @@ int compute(char *file_name, int load_as_file)
 
 	for (i = 0; i < niter; i++)
 	{
-		ret = starpu_task_insert(&cl, STARPU_RW, float_array_handle, 0);
+		ret = starpu_task_insert(&cl, STARPU_RW, float_array_handle, STARPU_TAG_ONLY, (starpu_tag_t) i, 0);
 		if (STARPU_UNLIKELY(ret == -ENODEV))
 		{
 			FPRINTF(stderr, "No worker may execute this task\n");

+ 5 - 0
examples/cg/cg_kernels.c

@@ -298,6 +298,7 @@ int dot_kernel(starpu_data_handle_t v1,
 					 use_reduction?STARPU_REDUX:STARPU_RW, s,
 					 STARPU_R, starpu_data_get_sub_data(v1, 1, b),
 					 STARPU_R, starpu_data_get_sub_data(v2, 1, b),
+					 STARPU_TAG_ONLY, (starpu_tag_t) b,
 					 0);
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 	}
@@ -443,6 +444,7 @@ int gemv_kernel(starpu_data_handle_t v1,
 		ret = starpu_task_insert(&scal_kernel_cl,
 					 STARPU_RW, starpu_data_get_sub_data(v1, 1, b2),
 					 STARPU_VALUE, &p1, sizeof(p1),
+					 STARPU_TAG_ONLY, (starpu_tag_t) b2,
 					 0);
 		if (ret == -ENODEV) return ret;
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
@@ -459,6 +461,7 @@ int gemv_kernel(starpu_data_handle_t v1,
 						 STARPU_R,	starpu_data_get_sub_data(v2, 1, b1),
 						 STARPU_VALUE,	&one,	sizeof(one),
 						 STARPU_VALUE,	&p2,	sizeof(p2),
+						 STARPU_TAG_ONLY, (starpu_tag_t) (b2 * nblocks + b1),
 						 0);
 			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 		}
@@ -538,6 +541,7 @@ int scal_axpy_kernel(starpu_data_handle_t v1, TYPE p1,
 					 STARPU_R,  starpu_data_get_sub_data(v2, 1, b),
 					 STARPU_VALUE, &p1, sizeof(p1),
 					 STARPU_VALUE, &p2, sizeof(p2),
+					 STARPU_TAG_ONLY, (starpu_tag_t) b,
 					 0);
 		if (ret == -ENODEV) return ret;
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
@@ -611,6 +615,7 @@ int axpy_kernel(starpu_data_handle_t v1,
 					 STARPU_RW, starpu_data_get_sub_data(v1, 1, b),
 					 STARPU_R,  starpu_data_get_sub_data(v2, 1, b),
 					 STARPU_VALUE, &p1, sizeof(p1),
+					 STARPU_TAG_ONLY, (starpu_tag_t) b,
 					 0);
 		if (ret == -ENODEV) return ret;
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");

+ 3 - 0
examples/cholesky/cholesky_implicit.c

@@ -57,6 +57,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 					 STARPU_RW, sdatakk,
 					 STARPU_CALLBACK, (k == 3*nblocks/4)?callback_turn_spmd_on:NULL,
 					 STARPU_FLOPS, (double) FLOPS_SPOTRF(nn),
+					 STARPU_TAG_ONLY, TAG11(k),
 					 0);
 		if (ret == -ENODEV) return 77;
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
@@ -70,6 +71,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 						 STARPU_R, sdatakk,
 						 STARPU_RW, sdatakj,
 						 STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
+						 STARPU_TAG_ONLY, TAG21(k,j),
 						 0);
 			if (ret == -ENODEV) return 77;
 			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
@@ -87,6 +89,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 								 STARPU_R, sdatakj,
 								 STARPU_RW | STARPU_COMMUTE, sdataij,
 								 STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
+								 STARPU_TAG_ONLY, TAG22(k,i,j),
 								 0);
 					if (ret == -ENODEV) return 77;
 					STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");

+ 2 - 1
examples/cpp/incrementer_cpp.cpp

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010-2011, 2013  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010-2011, 2013-2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2012 inria
  *
@@ -75,6 +75,7 @@ int main(int argc, char **argv)
 	{
 		ret = starpu_task_insert(&cl,
 					 STARPU_RW, float_array_handle,
+					 STARPU_TAG_ONLY, (starpu_tag_t) i,
 					 0);
                 if (STARPU_UNLIKELY(ret == -ENODEV))
                 {

+ 1 - 10
examples/lu/xlu.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010-2011, 2014  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
@@ -19,15 +19,6 @@
 #include "xlu.h"
 #include "xlu_kernels.h"
 
-#define TAG11(k)	((starpu_tag_t)( (1ULL<<60) | (unsigned long long)(k)))
-#define TAG12(k,i)	((starpu_tag_t)(((2ULL<<60) | (((unsigned long long)(k))<<32)	\
-					| (unsigned long long)(i))))
-#define TAG21(k,j)	((starpu_tag_t)(((3ULL<<60) | (((unsigned long long)(k))<<32)	\
-					| (unsigned long long)(j))))
-#define TAG22(k,i,j)	((starpu_tag_t)(((4ULL<<60) | ((unsigned long long)(k)<<32) 	\
-					| ((unsigned long long)(i)<<16)	\
-					| (unsigned long long)(j))))
-
 static unsigned no_prio = 0;
 
 

+ 12 - 1
examples/lu/xlu.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010-2011, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -22,6 +22,17 @@
 #include <starpu.h>
 #include <common/blas.h>
 
+#define TAG11(k)	((starpu_tag_t)( (1ULL<<60) | (unsigned long long)(k)))
+#define TAG12(k,i)	((starpu_tag_t)(((2ULL<<60) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(i))))
+#define TAG21(k,j)	((starpu_tag_t)(((3ULL<<60) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(j))))
+#define TAG22(k,i,j)	((starpu_tag_t)(((4ULL<<60) | ((unsigned long long)(k)<<32) 	\
+					| ((unsigned long long)(i)<<16)	\
+					| (unsigned long long)(j))))
+#define PIVOT(k,i)	((starpu_tag_t)(((5ULL<<60) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(i))))
+
 #define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
 
 #define BLAS3_FLOP(n1,n2,n3)    \

+ 9 - 1
examples/lu/xlu_implicit.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2011, 2014  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
@@ -30,6 +30,8 @@ static int create_task_11(starpu_data_handle_t dataA, unsigned k)
 	/* which sub-data is manipulated ? */
 	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, k);
 
+	task->tag_id = TAG11(k);
+
 	/* this is an important task */
 	if (!no_prio)
 		task->priority = STARPU_MAX_PRIO;
@@ -49,6 +51,8 @@ static int create_task_12(starpu_data_handle_t dataA, unsigned k, unsigned j)
 	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, k);
 	task->handles[1] = starpu_data_get_sub_data(dataA, 2, j, k);
 
+	task->tag_id = TAG12(k,j);
+
 	if (!no_prio && (j == k+1))
 		task->priority = STARPU_MAX_PRIO;
 
@@ -68,6 +72,8 @@ static int create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned i)
 	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, k);
 	task->handles[1] = starpu_data_get_sub_data(dataA, 2, k, i);
 
+	task->tag_id = TAG21(k,i);
+
 	if (!no_prio && (i == k+1))
 		task->priority = STARPU_MAX_PRIO;
 
@@ -88,6 +94,8 @@ static int create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned i, un
 	task->handles[1] = starpu_data_get_sub_data(dataA, 2, j, k);
 	task->handles[2] = starpu_data_get_sub_data(dataA, 2, j, i);
 
+	task->tag_id = TAG22(k,i,j);
+
 	if (!no_prio &&  (i == k + 1) && (j == k +1) )
 		task->priority = STARPU_MAX_PRIO;
 

+ 15 - 1
examples/lu/xlu_implicit_pivot.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2012, 2014  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
@@ -39,6 +39,8 @@ static int create_task_pivot(starpu_data_handle_t *dataAp, unsigned nblocks,
 	/* which sub-data is manipulated ? */
 	task->handles[0] = get_block(dataAp, nblocks, k, i);
 
+	task->tag_id = PIVOT(k, i);
+
 	task->cl_arg = &piv_description[k];
 
 	/* this is an important task */
@@ -65,6 +67,8 @@ static int create_task_11_pivot(starpu_data_handle_t *dataAp, unsigned nblocks,
 	/* which sub-data is manipulated ? */
 	task->handles[0] = get_block(dataAp, nblocks, k, k);
 
+	task->tag_id = TAG11(k);
+
 	/* this is an important task */
 	if (!no_prio)
 		task->priority = STARPU_MAX_PRIO;
@@ -86,6 +90,8 @@ static int create_task_12(starpu_data_handle_t *dataAp, unsigned nblocks, unsign
 	task->handles[0] = get_block(dataAp, nblocks, k, k);
 	task->handles[1] = get_block(dataAp, nblocks, j, k);
 
+	task->tag_id = TAG12(k,j);
+
 	if (!no_prio && (j == k+1))
 		task->priority = STARPU_MAX_PRIO;
 
@@ -106,6 +112,8 @@ static int create_task_21(starpu_data_handle_t *dataAp, unsigned nblocks, unsign
 	task->handles[0] = get_block(dataAp, nblocks, k, k);
 	task->handles[1] = get_block(dataAp, nblocks, k, i);
 
+	task->tag_id = TAG21(k,i);
+
 	if (!no_prio && (i == k+1))
 		task->priority = STARPU_MAX_PRIO;
 
@@ -127,6 +135,8 @@ static int create_task_22(starpu_data_handle_t *dataAp, unsigned nblocks, unsign
 	task->handles[1] = get_block(dataAp, nblocks, j, k);
 	task->handles[2] = get_block(dataAp, nblocks, j, i);
 
+	task->tag_id = TAG22(k,i,j);
+
 	if (!no_prio &&  (i == k + 1) && (j == k +1) )
 		task->priority = STARPU_MAX_PRIO;
 
@@ -237,6 +247,8 @@ int STARPU_LU(lu_decomposition_pivot)(TYPE *matA, unsigned *ipiv, unsigned size,
 
 	double timing;
 	int ret = dw_codelet_facto_pivot(&dataA, piv_description, nblocks, get_block_with_striding, &timing);
+	if (ret)
+		return ret;
 
 	FPRINTF(stderr, "Computation took (in ms)\n");
 	FPRINTF(stderr, "%2.2f\n", timing/1000);
@@ -290,6 +302,8 @@ int STARPU_LU(lu_decomposition_pivot_no_stride)(TYPE **matA, unsigned *ipiv, uns
 
 	double timing;
 	int ret = dw_codelet_facto_pivot(dataAp, piv_description, nblocks, get_block_with_no_striding, &timing);
+	if (ret)
+		return ret;
 
 	FPRINTF(stderr, "Computation took (in ms)\n");
 	FPRINTF(stderr, "%2.2f\n", timing/1000);

+ 1 - 12
examples/lu/xlu_pivot.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2009-2012, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -18,17 +18,6 @@
 #include "xlu.h"
 #include "xlu_kernels.h"
 
-#define TAG11(k)	((starpu_tag_t)( (1ULL<<60) | (unsigned long long)(k)))
-#define TAG12(k,i)	((starpu_tag_t)(((2ULL<<60) | (((unsigned long long)(k))<<32)	\
-					| (unsigned long long)(i))))
-#define TAG21(k,j)	((starpu_tag_t)(((3ULL<<60) | (((unsigned long long)(k))<<32)	\
-					| (unsigned long long)(j))))
-#define TAG22(k,i,j)	((starpu_tag_t)(((4ULL<<60) | ((unsigned long long)(k)<<32) 	\
-					| ((unsigned long long)(i)<<16)	\
-					| (unsigned long long)(j))))
-#define PIVOT(k,i)	((starpu_tag_t)(((5ULL<<60) | (((unsigned long long)(k))<<32)	\
-					| (unsigned long long)(i))))
-
 static unsigned no_prio = 0;
 
 /*

+ 1 - 0
examples/mandelbrot/mandelbrot.c

@@ -535,6 +535,7 @@ int main(int argc, char **argv)
 						 STARPU_VALUE, &stepY, sizeof(stepY),
 						 STARPU_W, block_handles[iby],
 						 STARPU_VALUE, &pcnt, sizeof(int *),
+						 STARPU_TAG_ONLY, (starpu_tag_t) (niter*nblocks + iby),
 						 0);
 			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 		}

+ 5 - 1
examples/pipeline/pipeline.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2012, 2013, 2014  Centre National de la Recherche Scientifique
- * Copyright (C) 2012  Université de Bordeaux 1
+ * Copyright (C) 2012, 2014  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -205,6 +205,7 @@ int main(void)
 		ret = starpu_task_insert(&pipeline_codelet_x,
 				STARPU_W, buffersX[l%K],
 				STARPU_VALUE, &x, sizeof(x),
+				STARPU_TAG_ONLY, (starpu_tag_t) (100*l),
 				0);
 		if (ret == -ENODEV) goto enodev;
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert x");
@@ -212,6 +213,7 @@ int main(void)
 		ret = starpu_task_insert(&pipeline_codelet_x,
 				STARPU_W, buffersY[l%K],
 				STARPU_VALUE, &y, sizeof(y),
+				STARPU_TAG_ONLY, (starpu_tag_t) (100*l+1),
 				0);
 		if (ret == -ENODEV) goto enodev;
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert y");
@@ -219,6 +221,7 @@ int main(void)
 		ret = starpu_task_insert(&pipeline_codelet_axpy,
 				STARPU_R, buffersX[l%K],
 				STARPU_RW, buffersY[l%K],
+				STARPU_TAG_ONLY, (starpu_tag_t) l,
 				0);
 		if (ret == -ENODEV) goto enodev;
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert axpy");
@@ -226,6 +229,7 @@ int main(void)
 		ret = starpu_task_insert(&pipeline_codelet_sum,
 				STARPU_R, buffersY[l%K],
 				STARPU_CALLBACK_WITH_ARG, (void (*)(void*))sem_post, &sems[l%C],
+				STARPU_TAG_ONLY, (starpu_tag_t) l,
 				0);
 		if (ret == -ENODEV) goto enodev;
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert sum");

+ 212 - 0
examples/sched_ctx/nested_sched_ctxs.c

@@ -0,0 +1,212 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdint.h>
+#include <starpu.h>
+#include <omp.h>
+
+#ifdef STARPU_QUICK_CHECK
+#define NTASKS 64
+#else
+#define NTASKS 100
+#endif
+
+int tasks_executed[2];
+starpu_pthread_mutex_t mut;
+
+/*
+ * Run an OpenMP parallel loop of NTASKS iterations on the CPUs that
+ * starpu_sched_ctx_get_available_cpuids() reports for sched_ctx, binding
+ * each OpenMP thread to one of these CPUs.
+ *
+ * Returns the number of iterations performed.  The cpuids array filled in
+ * by StarPU is released with free() before returning.
+ */
+int parallel_code(int sched_ctx)
+{
+	int i;
+	int t = 0;
+	int *cpuids = NULL;
+	int ncpuids = 0;
+	starpu_sched_ctx_get_available_cpuids(sched_ctx, &cpuids, &ncpuids);
+
+#pragma omp parallel num_threads(ncpuids)
+	{
+		starpu_sched_ctx_bind_current_thread_to_cpuid(cpuids[omp_get_thread_num()]);
+		/* t is shared by the whole team: accumulate it through a
+		 * reduction, a plain t++ from several threads is a data race */
+#pragma omp for reduction(+:t)
+		for(i = 0; i < NTASKS; i++)
+			t++;
+	}
+
+	free(cpuids);
+	return t;
+}
+
+/*
+ * CPU implementation of the codelet.  arg does not point to anything: the
+ * scheduling context id is stored directly in the pointer value.
+ */
+static void sched_ctx_func(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg)
+{
+	/* convert through uintptr_t: casting a pointer straight to a
+	 * narrower integer type is not portable */
+	unsigned sched_ctx = (unsigned)(uintptr_t)arg;
+	(void)parallel_code(sched_ctx);
+}
+
+
+/* Codelet with a single CPU implementation (sched_ctx_func), no CUDA or
+ * OpenCL implementation, no data buffer and no performance model. */
+static struct starpu_codelet sched_ctx_codelet =
+{
+	.name = "sched_ctx",
+	.nbuffers = 0,
+	.model = NULL,
+	.cpu_funcs = {sched_ctx_func, NULL},
+	.cuda_funcs = {NULL},
+	.opencl_funcs = {NULL}
+};
+
+/*
+ * Example entry point: split the CPU workers between two top-level
+ * scheduling contexts (ctx1, ctx2), create two nested contexts under each
+ * of them, submit NTASKS tasks to each top-level context, wait for all
+ * tasks and delete the contexts.
+ *
+ * Returns 77 when starpu_init() reports -ENODEV or when no CPU worker is
+ * available, 0 otherwise.
+ */
+int main(int argc, char **argv)
+{
+	tasks_executed[0] = 0;
+	tasks_executed[1] = 0;
+	int ntasks = NTASKS;
+	int ret, j, k;
+	unsigned ncpus = 0;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	starpu_pthread_mutex_init(&mut, NULL);
+	int nprocs1 = 1;
+	int nprocs2 = 1;
+	/* NULL-initialised so that the pointers are never indeterminate when
+	 * STARPU_USE_CPU is not defined */
+	int *procs1 = NULL, *procs2 = NULL;
+
+#ifdef STARPU_USE_CPU
+	ncpus =  starpu_cpu_worker_get_count();
+#endif
+
+	/* bail out before touching the worker arrays: without any CPU
+	 * worker procs1 would be empty and procs1[0] must not be read */
+	if (ncpus == 0)
+	{
+		starpu_shutdown();
+		return 77;
+	}
+
+#ifdef STARPU_USE_CPU
+	procs1 = (int*)malloc(ncpus*sizeof(int));
+	starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, procs1, ncpus);
+
+	if (ncpus > 1)
+	{
+		/* two halves of equal size: ctx1 gets procs1[0..nprocs1),
+		 * ctx2 gets the following nprocs2 entries */
+		nprocs1 = ncpus/2;
+		nprocs2 =  nprocs1;
+		k = 0;
+		procs2 = (int*)malloc(nprocs2*sizeof(int));
+		for(j = nprocs1; j < nprocs1+nprocs2; j++)
+			procs2[k++] = procs1[j];
+	}
+	else
+	{
+		/* a single CPU: both contexts are given the same worker */
+		procs2 = (int*)malloc(nprocs2*sizeof(int));
+		procs2[0] = procs1[0];
+	}
+#endif
+
+	/*create contexts however you want*/
+	unsigned sched_ctx1 = starpu_sched_ctx_create(procs1, nprocs1, "ctx1", STARPU_SCHED_CTX_POLICY_NAME, "eager", 0);
+	unsigned sched_ctx2 = starpu_sched_ctx_create(procs2, nprocs2, "ctx2", STARPU_SCHED_CTX_POLICY_NAME, "dmda", 0);
+
+	/*indicate what to do with the resources when context 2 finishes (it depends on your application)*/
+//	starpu_sched_ctx_set_inheritor(sched_ctx2, sched_ctx1);
+
+	int nprocs3 = nprocs1/2;
+	int nprocs4 = nprocs1/2;
+	int nprocs5 = nprocs2/2;
+	int nprocs6 = nprocs2/2;
+	/* a variable-length array must have at least one element, which
+	 * nprocsN/2 does not guarantee when a half holds a single CPU */
+	int procs3[nprocs3 > 0 ? nprocs3 : 1];
+	int procs4[nprocs4 > 0 ? nprocs4 : 1];
+	int procs5[nprocs5 > 0 ? nprocs5 : 1];
+	int procs6[nprocs6 > 0 ? nprocs6 : 1];
+
+	k = 0;
+	for(j = 0; j < nprocs3; j++)
+		procs3[k++] = procs1[j];
+	k = 0;
+	for(j = nprocs3; j < nprocs3+nprocs4; j++)
+		procs4[k++] = procs1[j];
+
+	k = 0;
+	for(j = 0; j < nprocs5; j++)
+		procs5[k++] = procs2[j];
+	k = 0;
+	for(j = nprocs5; j < nprocs5+nprocs6; j++)
+		procs6[k++] = procs2[j];
+
+	unsigned sched_ctx3 = starpu_sched_ctx_create(procs3, nprocs3, "ctx3", STARPU_SCHED_CTX_NESTED, sched_ctx1, 0);
+	unsigned sched_ctx4 = starpu_sched_ctx_create(procs4, nprocs4, "ctx4", STARPU_SCHED_CTX_NESTED, sched_ctx1, 0);
+
+	unsigned sched_ctx5 = starpu_sched_ctx_create(procs5, nprocs5, "ctx5", STARPU_SCHED_CTX_NESTED, sched_ctx2, 0);
+	unsigned sched_ctx6 = starpu_sched_ctx_create(procs6, nprocs6, "ctx6", STARPU_SCHED_CTX_NESTED, sched_ctx2, 0);
+
+
+	int i;
+	for (i = 0; i < ntasks; i++)
+	{
+		struct starpu_task *task = starpu_task_create();
+
+		task->cl = &sched_ctx_codelet;
+		/* the context id travels in the pointer value itself; an
+		 * integer cannot be assigned to a void * without a cast */
+		task->cl_arg = (void*)(uintptr_t)sched_ctx1;
+
+		/*submit tasks to context*/
+		ret = starpu_task_submit_to_ctx(task,sched_ctx1);
+
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	for (i = 0; i < ntasks; i++)
+	{
+		struct starpu_task *task = starpu_task_create();
+
+		task->cl = &sched_ctx_codelet;
+		task->cl_arg = (void*)(uintptr_t)sched_ctx2;
+
+		/*submit tasks to context*/
+		ret = starpu_task_submit_to_ctx(task,sched_ctx2);
+
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+
+	/* tell starpu when you finished submitting tasks to this context
+	   in order to allow moving resources from this context to the inheritor one
+	   when its corresponding tasks finished executing */
+
+
+
+	/* wait for all tasks at the end*/
+	starpu_task_wait_for_all();
+
+	/* nested contexts are deleted before the contexts they were created under */
+	starpu_sched_ctx_delete(sched_ctx3);
+	starpu_sched_ctx_delete(sched_ctx4);
+
+	starpu_sched_ctx_delete(sched_ctx5);
+	starpu_sched_ctx_delete(sched_ctx6);
+
+	starpu_sched_ctx_delete(sched_ctx1);
+	starpu_sched_ctx_delete(sched_ctx2);
+
+	/* the context ids are unsigned, hence %u */
+	printf("ctx%u: tasks starpu executed %d out of %d\n", sched_ctx1, tasks_executed[0], NTASKS);
+	printf("ctx%u: tasks starpu executed %d out of %d\n", sched_ctx2, tasks_executed[1], NTASKS);
+
+#ifdef STARPU_USE_CPU
+	free(procs1);
+	free(procs2);
+#endif
+	starpu_shutdown();
+	return 0;
+}

+ 5 - 1
examples/sched_ctx/sched_ctx_without_sched_policy.c

@@ -88,7 +88,6 @@ int main(int argc, char **argv)
 #ifdef STARPU_USE_CPU
 	ncpus = starpu_cpu_worker_get_count();
 	procs1 = (int*)malloc(ncpus*sizeof(int));
-	procs2 = (int*)malloc(ncpus*sizeof(int));
 	starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, procs1, ncpus);
 
 	if(ncpus > 1)
@@ -96,6 +95,7 @@ int main(int argc, char **argv)
 		nprocs1 = ncpus/2;
 		nprocs2 =  ncpus-nprocs1;
 		k = 0;
+		procs2 = (int*)malloc(nprocs2*sizeof(int));
 		for(j = nprocs1; j < nprocs1+nprocs2; j++)
 			procs2[k++] = procs1[j];
 	}
@@ -156,6 +156,10 @@ int main(int argc, char **argv)
 	printf("ctx%d: tasks starpu executed %d out of %d\n", sched_ctx2, tasks_executed[1], NTASKS*NTASKS);
 
 enodev:
+#ifdef STARPU_USE_CPU
+	free(procs1);
+	free(procs2);
+#endif
 	starpu_shutdown();
 	return ncpus == 0 ? 77 : 0;
 }

+ 2 - 2
examples/scheduler/schedulers.sh

@@ -2,7 +2,7 @@
 
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2012  Centre National de la Recherche Scientifique
+# Copyright (C) 2012, 2014  Centre National de la Recherche Scientifique
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -25,7 +25,7 @@ check_success()
 
 [ -x ./cholesky/cholesky_tag ] || exit 77
 
-SCHEDULERS=`STARPU_SCHED="help" ./basic_examples/hello_world 2>&1 | awk '/->/ {print $1}'`
+SCHEDULERS=`STARPU_SCHED="help" ./basic_examples/hello_world 2>&1 | awk '/\t->/ {print $1}'`
 
 for sched in $SCHEDULERS
 do

+ 1 - 0
examples/worker_collections/worker_list_example.c

@@ -85,6 +85,7 @@ int main()
 
 	FPRINTF(stderr, "timing init = %lf \n", timing);
 	co->deinit(co);
+	free(co);
 	starpu_shutdown();
 
 	return 0;

+ 8 - 0
include/starpu_sched_ctx.h

@@ -29,6 +29,7 @@ extern "C"
 #define STARPU_SCHED_CTX_POLICY_MIN_PRIO	 (3<<16)
 #define STARPU_SCHED_CTX_POLICY_MAX_PRIO	 (4<<16)
 #define STARPU_SCHED_CTX_HIERARCHY_LEVEL         (5<<16)
+#define STARPU_SCHED_CTX_NESTED                  (6<<16)
 
 unsigned starpu_sched_ctx_create(int *workerids_ctx, int nworkers_ctx, const char *sched_ctx_name, ...);
 
@@ -127,6 +128,13 @@ int starpu_sched_ctx_book_workers_for_task(unsigned sched_ctx_id, int *workerids
 
 void starpu_sched_ctx_unbook_workers_for_task(unsigned sched_ctx_id, int master);
 
+/* return the first context (child of sched_ctx_id) where the workerid is master */
+unsigned starpu_sched_ctx_worker_is_master_for_child_ctx(int workerid, unsigned sched_ctx_id);
+
+void starpu_sched_ctx_revert_task_counters(unsigned sched_ctx_id, double flops);
+
+void starpu_sched_ctx_move_task_to_ctx(struct starpu_task *task, unsigned sched_ctx);
+
 #ifdef STARPU_USE_SC_HYPERVISOR
 void starpu_sched_ctx_call_pushed_task_cb(int workerid, unsigned sched_ctx_id);
 #endif /* STARPU_USE_SC_HYPERVISOR */

+ 31 - 27
include/starpu_task.h

@@ -119,9 +119,13 @@ struct starpu_codelet
 
 struct starpu_task
 {
+	const char *name;
+
 	struct starpu_codelet *cl;
 
+	/* TODO: remove someday, this is costly */
 	struct starpu_data_descr buffers[STARPU_NMAXBUFS] STARPU_DEPRECATED;
+
 	starpu_data_handle_t handles[STARPU_NMAXBUFS];
 	void *interfaces[STARPU_NMAXBUFS];
 
@@ -130,61 +134,61 @@ struct starpu_task
 
 	void *cl_arg;
 	size_t cl_arg_size;
-	unsigned cl_arg_free;
 
 	void (*callback_func)(void *);
 	void *callback_arg;
 	/* must StarPU release callback_arg ? - 0 by default */
-	unsigned callback_arg_free;
 
 	void (*prologue_callback_func)(void *);
 	void *prologue_callback_arg;
-	/* must StarPU release prologue_callback_arg ? - 0 by default */
-	unsigned prologue_callback_arg_free;
 
 	void (*prologue_callback_pop_func)(void *);
 	void *prologue_callback_pop_arg;
-	/* must StarPU release prologue_callback_pop_arg ? - 0 by default */
-	unsigned prologue_callback_pop_arg_free;
 
-	unsigned use_tag;
 	starpu_tag_t tag_id;
 
-	unsigned sequential_consistency;
+	unsigned cl_arg_free:1;
+	unsigned callback_arg_free:1;
+	/* must StarPU release prologue_callback_arg ? - 0 by default */
+	unsigned prologue_callback_arg_free:1;
+	/* must StarPU release prologue_callback_pop_arg ? - 0 by default */
+	unsigned prologue_callback_pop_arg_free:1;
 
-	unsigned synchronous;
-	int priority;
+	unsigned use_tag:1;
+	unsigned sequential_consistency:1;
+	unsigned synchronous:1;
+	unsigned execute_on_a_specific_worker:1;
 
-	unsigned execute_on_a_specific_worker;
-	unsigned workerid;
+	unsigned detach:1;
+	unsigned destroy:1;
+	unsigned regenerate:1;
 
-	starpu_task_bundle_t bundle;
+	unsigned scheduled:1;
+
+	unsigned int mf_skip:1;
+
+	unsigned workerid;
 
-	int detach;
-	int destroy;
-	int regenerate;
+	int priority;
 
 	enum starpu_task_status status;
 
+	int magic;
+
+	unsigned sched_ctx;
+	int hypervisor_tag;
+
+	starpu_task_bundle_t bundle;
+
 	struct starpu_profiling_task_info *profiling_info;
 
+	double flops;
 	double predicted;
 	double predicted_transfer;
 
-	unsigned int mf_skip;
-
 	struct starpu_task *prev;
 	struct starpu_task *next;
 	void *starpu_private;
-	int magic;
-
-	const char *name;
-
-	unsigned sched_ctx;
-	int hypervisor_tag;
-	double flops;
-
-	unsigned scheduled;
 	unsigned prefetched;
 };
 

+ 2 - 1
include/starpu_task_util.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2010-2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2014       INRIA
  *
@@ -49,6 +49,7 @@ void starpu_create_sync_task(starpu_tag_t sync_tag, unsigned ndeps, starpu_tag_t
 #define STARPU_PROLOGUE_CALLBACK_POP   (15<<18)
 #define STARPU_PROLOGUE_CALLBACK_POP_ARG (16<<18)
 #define STARPU_EXECUTE_ON_WORKER (17<<18)
+#define STARPU_TAG_ONLY          (18<<18)
 
 struct starpu_task *starpu_task_build(struct starpu_codelet *cl, ...);
 int starpu_task_insert(struct starpu_codelet *cl, ...);

+ 7 - 0
include/starpu_worker.h

@@ -57,10 +57,15 @@ struct starpu_worker_collection
 {
 	void *workerids;
 	unsigned nworkers;
+	void *masters;
+	unsigned nmasters;
 	int present[STARPU_NMAXWORKERS];
+	int is_master[STARPU_NMAXWORKERS];
 	enum starpu_worker_collection_type type;
 	unsigned (*has_next)(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it);
 	int (*get_next)(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it);
+	unsigned (*has_next_master)(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it);
+	int (*get_next_master)(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it);
 	int (*add)(struct starpu_worker_collection *workers, int worker);
 	int (*remove)(struct starpu_worker_collection *workers, int worker);
 	void (*init)(struct starpu_worker_collection *workers);
@@ -109,6 +114,8 @@ int starpu_worker_get_mp_nodeid(int id);
 struct starpu_tree* starpu_workers_get_tree(void);
 
 unsigned starpu_worker_get_sched_ctx_list(int worker, unsigned **sched_ctx);
+
+unsigned starpu_worker_is_slave(int workerid);
 #ifdef __cplusplus
 }
 #endif

+ 3 - 1
mpi/include/starpu_mpi.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2009-2012, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -71,6 +71,8 @@ void starpu_mpi_comm_amounts_retrieve(size_t *comm_amounts);
 void starpu_mpi_cache_flush(MPI_Comm comm, starpu_data_handle_t data_handle);
 void starpu_mpi_cache_flush_all_data(MPI_Comm comm);
 
+int starpu_mpi_world_rank(void);
+
 int starpu_mpi_get_communication_tag(void);
 void starpu_mpi_set_communication_tag(int tag);
 

+ 61 - 45
mpi/src/starpu_mpi.c

@@ -30,7 +30,7 @@
 #include <datawizard/coherency.h>
 
 static void _starpu_mpi_add_sync_point_in_fxt(void);
-static void _starpu_mpi_submit_new_mpi_request(void *arg);
+static void _starpu_mpi_submit_ready_request(void *arg);
 static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req);
 #ifdef STARPU_VERBOSE
 static char *_starpu_mpi_request_type(enum _starpu_mpi_request_type request_type);
@@ -46,8 +46,8 @@ static struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t dat
 							ssize_t count);
 static void _starpu_mpi_handle_detached_request(struct _starpu_mpi_req *req);
 
-/* The list of requests that have been newly submitted by the application */
-static struct _starpu_mpi_req_list *new_requests;
+/* The list of ready requests */
+static struct _starpu_mpi_req_list *ready_requests;
 
 /* The list of detached requests that have already been submitted to MPI */
 static struct _starpu_mpi_req_list *detached_requests;
@@ -61,7 +61,7 @@ static starpu_pthread_mutex_t mutex;
 static starpu_pthread_t progress_thread;
 static int running = 0;
 
-/* Count requests posted by the application and not yet submitted to MPI, i.e pushed into the new_requests list */
+/* Count requests posted by the application and not yet submitted to MPI */
 static starpu_pthread_mutex_t mutex_posted_requests;
 static int posted_requests = 0, newer_requests, barrier_running = 0;
 
@@ -151,9 +151,9 @@ static void _starpu_mpi_request_init(struct _starpu_mpi_req **req)
 	req->count = count;
 
 	/* Asynchronously request StarPU to fetch the data in main memory: when
-	 * it is available in main memory, _starpu_mpi_submit_new_mpi_request(req) is called and
+	 * it is available in main memory, _starpu_mpi_submit_ready_request(req) is called and
 	 * the request is actually submitted */
-	starpu_data_acquire_cb_sequential_consistency(data_handle, mode, _starpu_mpi_submit_new_mpi_request, (void *)req, sequential_consistency);
+	starpu_data_acquire_cb_sequential_consistency(data_handle, mode, _starpu_mpi_submit_ready_request, (void *)req, sequential_consistency);
 
 	_STARPU_MPI_LOG_OUT();
 	return req;
@@ -447,7 +447,7 @@ int starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status)
 	waiting_req->func = _starpu_mpi_wait_func;
 	waiting_req->request_type = WAIT_REQ;
 
-	_starpu_mpi_submit_new_mpi_request(waiting_req);
+	_starpu_mpi_submit_ready_request(waiting_req);
 
 	/* We wait for the MPI request to finish */
 	STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
@@ -532,7 +532,7 @@ int starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 		testing_req->request_type = TEST_REQ;
 
 		_STARPU_MPI_INC_POSTED_REQUESTS(1);
-		_starpu_mpi_submit_new_mpi_request(testing_req);
+		_starpu_mpi_submit_ready_request(testing_req);
 
 		/* We wait for the test request to finish */
 		STARPU_PTHREAD_MUTEX_LOCK(&(testing_req->req_mutex));
@@ -619,7 +619,7 @@ int starpu_mpi_barrier(MPI_Comm comm)
 	barrier_req->comm = comm;
 
 	_STARPU_MPI_INC_POSTED_REQUESTS(1);
-	_starpu_mpi_submit_new_mpi_request(barrier_req);
+	_starpu_mpi_submit_ready_request(barrier_req);
 
 	/* We wait for the MPI request to finish */
 	STARPU_PTHREAD_MUTEX_LOCK(&barrier_req->req_mutex);
@@ -785,24 +785,25 @@ static void _starpu_mpi_early_data_cb(void* arg)
 	free(args);
 }
 
-static void _starpu_mpi_submit_new_mpi_request(void *arg)
+static void _starpu_mpi_submit_ready_request(void *arg)
 {
 	_STARPU_MPI_LOG_IN();
 	struct _starpu_mpi_req *req = arg;
 
 	_STARPU_MPI_INC_POSTED_REQUESTS(-1);
 
-	_STARPU_MPI_DEBUG(3, "calling _starpu_mpi_submit_new_mpi_request with req %p srcdst %d tag %d and type %s\n", req, req->srcdst, req->mpi_tag, _starpu_mpi_request_type(req->request_type));
+	_STARPU_MPI_DEBUG(3, "new req %p srcdst %d tag %d and type %s\n", req, req->srcdst, req->mpi_tag, _starpu_mpi_request_type(req->request_type));
 
 	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 
 	if (req->request_type == RECV_REQ)
 	{
-		/* Case : the request is the internal receive request submitted by StarPU-MPI to receive
-		 * incoming data without a matching pending receive already submitted by the application.
-		 * We immediately allocate the pointer associated to the data_handle, and pushing it into
-		 * the list of new_requests, so as the real MPI request can be submitted before the next
-		 * submission of the envelope-catching request. */
+		/* Case : the request is the internal receive request submitted
+		 * by StarPU-MPI to receive incoming data without a matching
+		 * early_request from the application. We immediately allocate the
+		 * pointer associated to the data_handle, and push it into the
+		 * ready_requests list, so as the real MPI request can be submitted
+		 * before the next submission of the envelope-catching request. */
 		if (req->is_internal_req)
 		{
 			_starpu_mpi_handle_allocate_datatype(req->data_handle, &req->datatype, &req->user_datatype);
@@ -818,10 +819,12 @@ static void _starpu_mpi_submit_new_mpi_request(void *arg)
 				STARPU_ASSERT_MSG(req->ptr, "cannot allocate message of size %ld\n", req->count);
 			}
 
-			_STARPU_MPI_DEBUG(3, "Pushing internal starpu_mpi_irecv request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n", req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr, _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
-			_starpu_mpi_req_list_push_front(new_requests, req);
+			_STARPU_MPI_DEBUG(3, "Pushing internal starpu_mpi_irecv request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n",
+					  req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr,
+					  _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
+			_starpu_mpi_req_list_push_front(ready_requests, req);
 
-			/* inform the starpu mpi thread that the request has beenbe pushed in the new_requests list */
+			/* inform the starpu mpi thread that the request has been pushed in the ready_requests list */
 			STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 			STARPU_PTHREAD_MUTEX_LOCK(&req->posted_mutex);
 			req->posted = 1;
@@ -834,10 +837,10 @@ static void _starpu_mpi_submit_new_mpi_request(void *arg)
 			/* test whether the receive request has already been submitted internally by StarPU-MPI*/
 			struct _starpu_mpi_early_data_handle *early_data_handle = _starpu_mpi_early_data_find(req->mpi_tag, req->srcdst);
 
-			/* Case : the request has already been submitted internally by StarPU.
-			 * We'll asynchronously ask a Read permission over the temporary handle, so as when
-			 * the internal receive will be over, the _starpu_mpi_early_data_cb function will be called to
-			 * bring the data back to the original data handle associated to the request.*/
+			/* Case: a receive request for data with the given tag and source has already been
+			 * posted by StarPU. Asynchronously request Read permission over the temporary handle,
+			 * so that when the internal receive is completed, the _starpu_mpi_early_data_cb function
+			 * is called to bring the data back to the original data handle associated with the request. */
 			if (early_data_handle)
 			{
 				STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
@@ -861,8 +864,7 @@ static void _starpu_mpi_submit_new_mpi_request(void *arg)
 				_STARPU_MPI_DEBUG(3, "Calling data_acquire_cb on starpu_mpi_copy_cb..\n");
 				starpu_data_acquire_cb(early_data_handle->handle,STARPU_R,_starpu_mpi_early_data_cb,(void*) cb_args);
 			}
-			/* Case : a classic receive request with no send received earlier than expected.
-			 * We just add the pending receive request to the requests' hashmap. */
+			/* Case: no matching data has been received. Store the receive request as an early_request. */
 			else
 			{
 				_STARPU_MPI_DEBUG(3, "Adding the pending receive request %p (srcdst %d tag %d) into the request hashmap\n", req, req->srcdst, req->mpi_tag);
@@ -872,7 +874,7 @@ static void _starpu_mpi_submit_new_mpi_request(void *arg)
 	}
 	else
 	{
-		_starpu_mpi_req_list_push_front(new_requests, req);
+		_starpu_mpi_req_list_push_front(ready_requests, req);
 		_STARPU_MPI_DEBUG(3, "Pushing new request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n",
 				  req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr, _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
 	}
@@ -986,7 +988,7 @@ static void _starpu_mpi_handle_detached_request(struct _starpu_mpi_req *req)
 	}
 }
 
-static void _starpu_mpi_handle_new_request(struct _starpu_mpi_req *req)
+static void _starpu_mpi_handle_ready_request(struct _starpu_mpi_req *req)
 {
 	_STARPU_MPI_LOG_IN();
 	STARPU_ASSERT_MSG(req, "Invalid request");
@@ -1080,10 +1082,10 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
  	int header_req_submitted = 0;
 
-	while (running || posted_requests || !(_starpu_mpi_req_list_empty(new_requests)) || !(_starpu_mpi_req_list_empty(detached_requests)))
+	while (running || posted_requests || !(_starpu_mpi_req_list_empty(ready_requests)) || !(_starpu_mpi_req_list_empty(detached_requests)))
 	{
 		/* shall we block ? */
-		unsigned block = _starpu_mpi_req_list_empty(new_requests) && _starpu_mpi_early_request_count() == 0;
+		unsigned block = _starpu_mpi_req_list_empty(ready_requests) && _starpu_mpi_early_request_count() == 0;
 
 #ifndef STARPU_MPI_ACTIVITY
 		STARPU_PTHREAD_MUTEX_LOCK(&detached_requests_mutex);
@@ -1107,21 +1109,22 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
 		/* get one request */
 		struct _starpu_mpi_req *req;
-		while (!_starpu_mpi_req_list_empty(new_requests))
+		while (!_starpu_mpi_req_list_empty(ready_requests))
 		{
-			req = _starpu_mpi_req_list_pop_back(new_requests);
+			req = _starpu_mpi_req_list_pop_back(ready_requests);
 
 			/* handling a request is likely to block for a while
 			 * (on a sync_data_with_mem call), we want to let the
 			 * application submit requests in the meantime, so we
 			 * release the lock. */
 			STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
-			_starpu_mpi_handle_new_request(req);
+			_starpu_mpi_handle_ready_request(req);
 			STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 		}
 
-		/* If there is no currently submitted header_req submitted to catch envelopes from senders, and there is some pending receive
-		 * requests in our side, we resubmit a header request. */
+		/* If no header_req is currently submitted to catch envelopes
+                 * from senders, and there are pending receive requests on
+                 * our side, we resubmit a header request. */
 		MPI_Request header_req;
 		if ((_starpu_mpi_early_request_count() > 0) && (header_req_submitted == 0))// && (HASH_COUNT(_starpu_mpi_early_data_handle_hashmap) == 0))
 		{
@@ -1151,11 +1154,14 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
 				struct _starpu_mpi_req *found_req = _starpu_mpi_early_request_find(recv_env->mpi_tag, status.MPI_SOURCE);
 
-				/* Case : a data will arrive before the matching receive has been submitted in our side of the application.
-				 * We will allow a temporary handle to store the incoming data, by submitting a starpu_mpi_irecv_detached
-				 * on this handle, and register this so as the StarPU-MPI layer can remember it.*/
+				/* Case: a data will arrive before a matching receive is
+				 * posted by the application. Create a temporary handle to
+				 * store the incoming data, submit a starpu_mpi_irecv_detached
+				 * on this handle, and store it as an early_data
+				 */
 				if (!found_req)
 				{
+
 					_STARPU_MPI_DEBUG(3, "Request with tag %d and source %d not found, creating a early_handle to receive incoming data..\n", recv_env->mpi_tag, status.MPI_SOURCE);
 
 					starpu_data_handle_t data_handle = NULL;
@@ -1198,8 +1204,8 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 					STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 
 					// We wait until the request is pushed in the
-					// new_request list, that ensures that the next loop
-					// will call _starpu_mpi_handle_new_request
+					// ready_request list, that ensures that the next loop
+					// will call _starpu_mpi_handle_ready_request
 					// on the request and post the corresponding mpi_irecv,
 					// otherwise, it may lead to read data as envelop
 					STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
@@ -1214,8 +1220,11 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 					STARPU_PTHREAD_MUTEX_UNLOCK(&early_data_handle->req_mutex);
 					STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 				}
-				/* Case : a matching receive has been found for the incoming data, we handle the correct allocation of the pointer associated to
-				 * the data handle, then submit the corresponding receive with _starpu_mpi_handle_new_request. */
+				/* Case: a matching application request has been found for
+				 * the incoming data, we handle the correct allocation
+				 * of the pointer associated to the data handle, then
+				 * submit the corresponding receive with
+				 * _starpu_mpi_handle_ready_request. */
 				else
 				{
 					_STARPU_MPI_DEBUG(3, "A matching receive has been found for the incoming data with tag %d\n", recv_env->mpi_tag);
@@ -1242,7 +1251,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 					 * application submit requests in the meantime, so we
 					 * release the lock. */
 					STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
-					_starpu_mpi_handle_new_request(found_req);
+					_starpu_mpi_handle_ready_request(found_req);
 					STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 				}
 				header_req_submitted = 0;
@@ -1255,7 +1264,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 	}
 
 	STARPU_ASSERT_MSG(_starpu_mpi_req_list_empty(detached_requests), "List of detached requests not empty");
-	STARPU_ASSERT_MSG(_starpu_mpi_req_list_empty(new_requests), "List of new requests not empty");
+	STARPU_ASSERT_MSG(_starpu_mpi_req_list_empty(ready_requests), "List of ready requests not empty");
 	STARPU_ASSERT_MSG(posted_requests == 0, "Number of posted request is not zero");
 	_starpu_mpi_early_request_check_termination();
 	_starpu_mpi_early_data_check_termination();
@@ -1326,7 +1335,7 @@ int _starpu_mpi_initialize(int *argc, char ***argv, int initialize_mpi)
 	STARPU_PTHREAD_MUTEX_INIT(&mutex, NULL);
 	STARPU_PTHREAD_COND_INIT(&cond_progression, NULL);
 	STARPU_PTHREAD_COND_INIT(&cond_finished, NULL);
-	new_requests = _starpu_mpi_req_list_new();
+	ready_requests = _starpu_mpi_req_list_new();
 
 	STARPU_PTHREAD_MUTEX_INIT(&detached_requests_mutex, NULL);
 	detached_requests = _starpu_mpi_req_list_new();
@@ -1402,7 +1411,7 @@ int starpu_mpi_shutdown(void)
 
 	/* free the request queues */
 	_starpu_mpi_req_list_delete(detached_requests);
-	_starpu_mpi_req_list_delete(new_requests);
+	_starpu_mpi_req_list_delete(ready_requests);
 
 	_starpu_mpi_comm_amounts_display(rank);
 	_starpu_mpi_comm_amounts_free();
@@ -1423,3 +1432,10 @@ void starpu_mpi_data_register(starpu_data_handle_t data_handle, int tag, int ran
 	_starpu_data_set_unregister_hook(data_handle, _starpu_mpi_clear_cache);
 
 }
+
+int starpu_mpi_world_rank(void) /* returns the caller's rank in MPI_COMM_WORLD */
+{
+	int rank;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank); /* the MPI return code is not checked here */
+	return rank;
+}

+ 12 - 0
mpi/src/starpu_mpi_task_insert.c

@@ -309,6 +309,10 @@ int _starpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_nod
 		{
 			(void)va_arg(varg_list_copy, double);
 		}
+		else if (arg_type==STARPU_TAG_ONLY)
+		{
+			(void)va_arg(varg_list_copy, starpu_tag_t); /* skip the tag on varg_list_copy, the list the sibling branches consume from */
+		}
 		else if (arg_type==STARPU_TAG)
 		{
 			STARPU_ASSERT_MSG(0, "STARPU_TAG is not supported in MPI mode\n");
@@ -471,6 +475,10 @@ int _starpu_mpi_task_build_v(MPI_Comm comm, struct starpu_codelet *codelet, stru
 		{
 			(void)va_arg(varg_list_copy, double);
 		}
+		else if (arg_type==STARPU_TAG_ONLY)
+		{
+			(void)va_arg(varg_list_copy, starpu_tag_t); /* skip the tag on varg_list_copy, the list the sibling branches consume from */
+		}
 		else if (arg_type==STARPU_TAG)
 		{
 			STARPU_ASSERT_MSG(0, "STARPU_TAG is not supported in MPI mode\n");
@@ -609,6 +617,10 @@ int _starpu_mpi_task_postbuild_v(MPI_Comm comm, struct starpu_codelet *codelet,
 		{
 			(void)va_arg(varg_list_copy, double);
 		}
+		else if (arg_type==STARPU_TAG_ONLY)
+		{
+			(void)va_arg(varg_list_copy, starpu_tag_t); /* skip the tag on varg_list_copy, the list the sibling branches consume from */
+		}
 		else if (arg_type==STARPU_TAG)
 		{
 			STARPU_ASSERT_MSG(0, "STARPU_TAG is not supported in MPI mode\n");

+ 1 - 0
src/Makefile.am

@@ -183,6 +183,7 @@ libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES = 		\
 	sched_policies/eager_central_policy.c			\
 	sched_policies/eager_central_priority_policy.c		\
 	sched_policies/work_stealing_policy.c			\
+	sched_policies/locality_work_stealing_policy.c		\
 	sched_policies/deque_modeling_policy_data_aware.c	\
 	sched_policies/random_policy.c				\
 	sched_policies/stack_queues.c				\

+ 11 - 4
src/common/fxt.h

@@ -108,6 +108,8 @@
 #define _STARPU_FUT_EVENT	0x513c
 #define _STARPU_FUT_THREAD_EVENT	0x513d
 
+#define	_STARPU_FUT_CODELET_DETAILS	0x513e
+
 #define _STARPU_FUT_LOCKING_MUTEX	0x5140	
 #define _STARPU_FUT_MUTEX_LOCKED	0x5141	
 
@@ -326,7 +328,7 @@ do {									\
 #ifdef FUT_DO_PROBE6STR
 #define _STARPU_FUT_DO_PROBE6STR(CODE, P1, P2, P3, P4, P5, P6, str) FUT_DO_PROBE6STR(CODE, P1, P2, P3, P4, P5, P6, str)
 #else
-#define _STARPU_FUT_DO_PROBE5STR(CODE, P1, P2, P3, P4, P5, P6, str)	\
+#define _STARPU_FUT_DO_PROBE6STR(CODE, P1, P2, P3, P4, P5, P6, str)	\
 do {									\
     if(fut_active) {							\
 	/* No more than FXT_MAX_PARAMS args are allowed */		\
@@ -353,7 +355,7 @@ do {									\
 #ifdef FUT_DO_PROBE7STR
 #define _STARPU_FUT_DO_PROBE7STR(CODE, P1, P2, P3, P4, P5, P6, P7, str) FUT_DO_PROBE7STR(CODE, P1, P2, P3, P4, P5, P6, P7, str)
 #else
-#define _STARPU_FUT_DO_PROBE6STR(CODE, P1, P2, P3, P4, P5, P6, P7, str)	\
+#define _STARPU_FUT_DO_PROBE7STR(CODE, P1, P2, P3, P4, P5, P6, P7, str)	\
 do {									\
     if(fut_active) {							\
 	/* No more than FXT_MAX_PARAMS args are allowed */		\
@@ -407,7 +409,7 @@ do {									\
 #define _STARPU_TRACE_WORKER_INIT_END(workerid)				\
 	FUT_DO_PROBE2(_STARPU_FUT_WORKER_INIT_END, _starpu_gettid(), (workerid));
 
-#define _STARPU_TRACE_START_CODELET_BODY(job)				\
+#define _STARPU_TRACE_START_CODELET_BODY(job, nimpl, archtype)				\
 do {									\
         const char *model_name = _starpu_job_get_model_name((job));         \
 	if (model_name)                                                 \
@@ -418,6 +420,11 @@ do {									\
 	else {                                                          \
 		FUT_DO_PROBE4(_STARPU_FUT_START_CODELET_BODY, (job), ((job)->task)->sched_ctx, _starpu_gettid(), 0); \
 	}								\
+	{								\
+		const size_t __job_size = _starpu_job_get_data_size((job)->task->cl?(job)->task->cl->model:NULL, archtype, nimpl, (job));	\
+		const uint32_t __job_hash = _starpu_compute_buffers_footprint((job)->task->cl?(job)->task->cl->model:NULL, archtype, nimpl, (job));\
+		FUT_DO_PROBE6(_STARPU_FUT_CODELET_DETAILS, (job), ((job)->task)->sched_ctx, __job_size, __job_hash, (job)->task->tag_id, _starpu_gettid());	\
+	}								\
 } while(0);
 
 #define _STARPU_TRACE_END_CODELET_BODY(job, nimpl, archtype)			\
@@ -784,7 +791,7 @@ do {										\
 #define _STARPU_TRACE_NEW_MEM_NODE(nodeid)	do {} while(0)
 #define _STARPU_TRACE_WORKER_INIT_START(a,b,c)	do {} while(0)
 #define _STARPU_TRACE_WORKER_INIT_END(workerid)	do {} while(0)
-#define _STARPU_TRACE_START_CODELET_BODY(job)	do {} while(0)
+#define _STARPU_TRACE_START_CODELET_BODY(job, nimpl, archtype)	do {} while(0)
 #define _STARPU_TRACE_END_CODELET_BODY(job, nimpl, a)	do {} while(0)
 #define _STARPU_TRACE_START_CALLBACK(job)	do {} while(0)
 #define _STARPU_TRACE_END_CALLBACK(job)		do {} while(0)

+ 16 - 16
src/core/jobs.h

@@ -62,8 +62,8 @@ struct _starpu_data_descr {
 /* A job is the internal representation of a task. */
 LIST_TYPE(_starpu_job,
 
-	/* The implementation associated to the job */
-	unsigned nimpl;
+	/* Each job is attributed a unique id. */
+	unsigned long job_id;
 
 	/* The task associated to that job */
 	struct starpu_task *task;
@@ -94,40 +94,34 @@ LIST_TYPE(_starpu_job,
 
 	/* The value of the footprint that identifies the job may be stored in
 	 * this structure. */
-	unsigned footprint_is_computed;
 	uint32_t footprint;
+	unsigned footprint_is_computed:1;
 
 	/* Indicates whether the task associated to that job has already been
 	 * submitted to StarPU (1) or not (0) (using starpu_task_submit).
 	 * Becomes and stays 2 when the task is submitted several times.
 	 */
-	unsigned submitted;
+	unsigned submitted:2;
 
 	/* Indicates whether the task associated to this job is terminated or
 	 * not. */
-	unsigned terminated;
+	unsigned terminated:2;
 
 	/* Should that task appear in the debug tools ? (eg. the DAG generated
 	 * with dot) */
-        unsigned exclude_from_dag;
+	unsigned exclude_from_dag:1;
 
 	/* Is that task internal to StarPU? */
-	unsigned internal;
-
-	/* Each job is attributed a unique id. */
-	unsigned long job_id;
+	unsigned internal:1;
 
 	/* During the reduction of a handle, StarPU may have to submit tasks to
 	 * perform the reduction itself: those task should not be stalled while
 	 * other tasks are blocked until the handle has been properly reduced,
 	 * so we need a flag to differentiate them from "normal" tasks. */
-	unsigned reduction_task;
-
-	/* Used to record codelet start time instead of using a
-	 * local variable */
-	struct timespec cl_start;
+	unsigned reduction_task:1;
 
-	struct bound_task *bound_task;
+	/* The implementation associated to the job */
+	unsigned nimpl;
 
 	/* Number of workers executing that task (>1 if the task is parallel)
 	 * */
@@ -140,6 +134,12 @@ LIST_TYPE(_starpu_job,
 	 * parallel tasks only). */
 	int active_task_alias_count;
 
+	/* Used to record codelet start time instead of using a
+	 * local variable */
+	struct timespec cl_start;
+
+	struct bound_task *bound_task;
+
 	/* Parallel workers may have to synchronize before/after the execution of a parallel task. */
 	starpu_pthread_barrier_t before_work_barrier;
 	starpu_pthread_barrier_t after_work_barrier;

+ 56 - 2
src/core/sched_ctx.c

@@ -533,6 +533,7 @@ unsigned starpu_sched_ctx_create(int *workerids, int nworkers, const char *sched
 	int max_prio = 0;
 	struct starpu_sched_policy *sched_policy = NULL;
 	unsigned hierarchy_level = 0;
+	unsigned nesting_sched_ctx = STARPU_NMAX_SCHED_CTXS;
 
 	va_start(varg_list, sched_ctx_name);
 	while ((arg_type = va_arg(varg_list, int)) != 0)
@@ -561,6 +562,10 @@ unsigned starpu_sched_ctx_create(int *workerids, int nworkers, const char *sched
 		{
 			hierarchy_level = va_arg(varg_list, unsigned);
 		}
+		else if (arg_type == STARPU_SCHED_CTX_NESTED)
+		{
+			nesting_sched_ctx = va_arg(varg_list, unsigned);
+		}
 		else
 		{
 			STARPU_ABORT_MSG("Unrecognized argument %d\n", arg_type);
@@ -572,6 +577,7 @@ unsigned starpu_sched_ctx_create(int *workerids, int nworkers, const char *sched
 	struct _starpu_sched_ctx *sched_ctx = NULL;
 	sched_ctx = _starpu_create_sched_ctx(sched_policy, workerids, nworkers, 0, sched_ctx_name, min_prio_set, min_prio, max_prio_set, max_prio);
 	sched_ctx->hierarchy_level = hierarchy_level;
+	sched_ctx->nesting_sched_ctx = nesting_sched_ctx;
 
 	_starpu_unlock_mutex_if_prev_locked();
 	int *added_workerids;
@@ -1142,6 +1148,8 @@ struct starpu_worker_collection* starpu_sched_ctx_create_worker_collection(unsig
 	case STARPU_WORKER_TREE:
 		sched_ctx->workers->has_next = worker_tree.has_next;
 		sched_ctx->workers->get_next = worker_tree.get_next;
+		sched_ctx->workers->has_next_master = worker_tree.has_next_master;
+		sched_ctx->workers->get_next_master = worker_tree.get_next_master;
 		sched_ctx->workers->add = worker_tree.add;
 		sched_ctx->workers->remove = worker_tree.remove;
 		sched_ctx->workers->init = worker_tree.init;
@@ -1154,6 +1162,8 @@ struct starpu_worker_collection* starpu_sched_ctx_create_worker_collection(unsig
 	default:
 		sched_ctx->workers->has_next = worker_list.has_next;
 		sched_ctx->workers->get_next = worker_list.get_next;
+		sched_ctx->workers->has_next_master = worker_list.has_next_master;
+		sched_ctx->workers->get_next_master = worker_list.get_next_master;
 		sched_ctx->workers->add = worker_list.add;
 		sched_ctx->workers->remove = worker_list.remove;
 		sched_ctx->workers->init = worker_list.init;
@@ -1181,6 +1191,7 @@ void starpu_sched_ctx_display_workers(unsigned sched_ctx_id, FILE *f)
 		starpu_worker_get_name(workerids[i], name, 256);
 		fprintf(f, "\t\t%s\n", name);
 	}
+	free(workerids);
 }
 
 unsigned starpu_sched_ctx_get_workers_list(unsigned sched_ctx_id, int **workerids)
@@ -1615,6 +1626,44 @@ void starpu_sched_ctx_bind_current_thread_to_cpuid(unsigned cpuid STARPU_ATTRIBU
 
 }
 
+unsigned starpu_sched_ctx_worker_is_master_for_child_ctx(int workerid, unsigned sched_ctx_id) /* id of a ctx nested in sched_ctx_id whose main_master is workerid */
+{
+	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
+	struct _starpu_sched_ctx_list *l = NULL;
+	struct _starpu_sched_ctx *sched_ctx = NULL;
+	for (l = worker->sched_ctx_list; l; l = l->next) /* walk the worker's sched_ctx_list */
+	{ 
+		 sched_ctx = _starpu_get_sched_ctx_struct(l->sched_ctx);
+		if(sched_ctx-> main_master == workerid && sched_ctx->nesting_sched_ctx == sched_ctx_id)
+			return sched_ctx->id; /* first match wins */
+	}
+	return STARPU_NMAX_SCHED_CTXS; /* no entry of the list matched */
+
+}
+
+void starpu_sched_ctx_revert_task_counters(unsigned sched_ctx_id, double flops) /* decrements the submitted and ready task counters of sched_ctx_id */
+{
+        _starpu_decrement_nsubmitted_tasks_of_sched_ctx(sched_ctx_id);
+        _starpu_decrement_nready_tasks_of_sched_ctx(sched_ctx_id, flops); /* flops is forwarded to the ready-task decrement only */
+}
+
+void starpu_sched_ctx_move_task_to_ctx(struct starpu_task *task, unsigned sched_ctx) /* retargets task to sched_ctx and submits it again */
+{
+	int workerid = starpu_worker_get_id();
+	struct _starpu_worker *worker  = NULL;
+	if(workerid != -1) /* -1: starpu_worker_get_id() gave no worker id for the calling thread */
+	{
+		worker = _starpu_get_worker_struct(workerid);
+		STARPU_PTHREAD_MUTEX_UNLOCK(&worker->sched_mutex); /* NOTE(review): presumes the caller holds sched_mutex - confirm */
+	}
+
+	task->sched_ctx = sched_ctx;
+	_starpu_task_submit_nodeps(task); /* runs with the worker's sched_mutex released */
+
+	if(workerid != -1)
+		STARPU_PTHREAD_MUTEX_LOCK(&worker->sched_mutex); /* take the mutex back before returning */
+}
+
 static unsigned _worker_sleeping_in_other_ctx(unsigned sched_ctx_id, int workerid)
 {
 	int s;
@@ -1630,6 +1679,7 @@ static unsigned _worker_sleeping_in_other_ctx(unsigned sched_ctx_id, int workeri
 	return 0;
 
 }
+
 static void _starpu_sched_ctx_get_workers_to_sleep(unsigned sched_ctx_id, int *workerids, int nworkers, int master)
 {
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
@@ -1653,7 +1703,6 @@ static void _starpu_sched_ctx_get_workers_to_sleep(unsigned sched_ctx_id, int *w
 		workerid = workerids[w];
 		if((current_worker_id == -1 || workerid != current_worker_id) && !sleeping[w])
 		{
-			sched_ctx->sleeping[workerids[w]] = 1;
 			sem_wait(&sched_ctx->fall_asleep_sem[master]);
 		}
 	}
@@ -1662,7 +1711,10 @@ static void _starpu_sched_ctx_get_workers_to_sleep(unsigned sched_ctx_id, int *w
 
 void _starpu_sched_ctx_signal_worker_blocked(unsigned sched_ctx_id, int workerid)
 {
+	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
+	worker->slave = 1;
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	sched_ctx->sleeping[workerid] = 1;
 	int master = sched_ctx->master[workerid];
 	sem_post(&sched_ctx->fall_asleep_sem[master]);
 
@@ -1676,6 +1728,9 @@ void _starpu_sched_ctx_signal_worker_woke_up(unsigned sched_ctx_id, int workerid
 	sem_post(&sched_ctx->wake_up_sem[master]);
 	sched_ctx->sleeping[workerid] = 0;
 	sched_ctx->master[workerid] = -1;
+	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
+	worker->slave = 0;
+
 	return;
 }
 
@@ -1730,7 +1785,6 @@ void starpu_sched_ctx_get_available_cpuids(unsigned sched_ctx_id, int **cpuids,
 	int current_worker_id = starpu_worker_get_id();
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 	struct starpu_worker_collection *workers = sched_ctx->workers;
-
 	(*cpuids) = (int*)malloc(workers->nworkers*sizeof(int));
 	int w = 0;
 

+ 3 - 0
src/core/sched_ctx.h

@@ -147,6 +147,9 @@ struct _starpu_sched_ctx
 	/* bool indicating if the workers is sleeping in this ctx */
 	unsigned sleeping[STARPU_NMAXWORKERS];
 
+	/* ctx nesting the current ctx */
+	unsigned nesting_sched_ctx;
+
 };
 
 struct _starpu_machine_config;

+ 1 - 0
src/core/sched_policy.c

@@ -49,6 +49,7 @@ static struct starpu_sched_policy *predefined_policies[] =
 	&_starpu_sched_eager_policy,
 	&_starpu_sched_prio_policy,
 	&_starpu_sched_random_policy,
+	&_starpu_sched_lws_policy,
 	&_starpu_sched_ws_policy,
 	&_starpu_sched_dm_policy,
 	&_starpu_sched_dmda_policy,

+ 1 - 0
src/core/sched_policy.h

@@ -58,6 +58,7 @@ void _starpu_print_idle_time();
 /*
  *	Predefined policies
  */
+extern struct starpu_sched_policy _starpu_sched_lws_policy;
 extern struct starpu_sched_policy _starpu_sched_ws_policy;
 extern struct starpu_sched_policy _starpu_sched_prio_policy;
 extern struct starpu_sched_policy _starpu_sched_random_policy;

+ 22 - 3
src/core/simgrid.c

@@ -33,6 +33,8 @@ extern int starpu_main(int argc, char *argv[]);
 extern int smpi_main(int (*realmain) (int argc, char *argv[]), int argc, char *argv[]);
 #pragma weak smpi_simulated_main_
 extern int smpi_simulated_main_(int argc, char *argv[]);
+#pragma weak starpu_mpi_world_rank
+extern int starpu_mpi_world_rank(void);
 
 #define _starpu_simgrid_running_smpi() (getenv("SMPI_GLOBAL_SIZE") != NULL)
 
@@ -48,6 +50,7 @@ int do_starpu_main(int argc STARPU_ATTRIBUTE_UNUSED, char *argv[] STARPU_ATTRIBU
 	return starpu_main(args->argc, args->argv);
 }
 
+#ifdef HAVE_MSG_ENVIRONMENT_GET_ROUTING_ROOT
 #ifdef HAVE_MSG_GET_AS_BY_NAME
 static msg_as_t _starpu_simgrid_get_as_by_name(const char *name)
 {
@@ -76,6 +79,7 @@ static msg_as_t _starpu_simgrid_get_as_by_name(const char *name)
 	return __starpu_simgrid_get_as_by_name(MSG_environment_get_routing_root(), name);
 }
 #endif /* HAVE_MSG_GET_AS_BY_NAME */
+#endif /* HAVE_MSG_ENVIRONMENT_GET_ROUTING_ROOT */
 
 int _starpu_simgrid_get_nbhosts(const char *prefix)
 {
@@ -84,13 +88,16 @@ int _starpu_simgrid_get_nbhosts(const char *prefix)
 	unsigned i, nb;
 	unsigned len = strlen(prefix);
 
+#ifdef HAVE_MSG_ENVIRONMENT_GET_ROUTING_ROOT
 	if (_starpu_simgrid_running_smpi())
 	{
 		char name[16];
-		snprintf(name, sizeof(name), STARPU_MPI_AS_PREFIX"%u", smpi_current_rank);
+		STARPU_ASSERT(starpu_mpi_world_rank);
+		snprintf(name, sizeof(name), STARPU_MPI_AS_PREFIX"%u", starpu_mpi_world_rank());
 		hosts = MSG_environment_as_get_hosts(_starpu_simgrid_get_as_by_name(name));
 	}
 	else
+#endif /* HAVE_MSG_ENVIRONMENT_GET_ROUTING_ROOT */
 		hosts = MSG_hosts_as_dynar();
 	nb = xbt_dynar_length(hosts);
 
@@ -132,7 +139,8 @@ msg_host_t _starpu_simgrid_get_host_by_name(const char *name)
 	if (_starpu_simgrid_running_smpi())
 	{
 		char mpiname[16];
-		snprintf(mpiname, sizeof(mpiname), "%d-%s", smpi_current_rank, name);
+		STARPU_ASSERT(starpu_mpi_world_rank);
+		snprintf(mpiname, sizeof(mpiname), "%d-%s", starpu_mpi_world_rank(), name);
 		return MSG_get_host_by_name(mpiname);
 	}
 	else
@@ -185,6 +193,7 @@ void _starpu_simgrid_init()
 	xbt_dynar_t hosts;
 	int i;
 
+#ifdef HAVE_MSG_ENVIRONMENT_GET_ROUTING_ROOT
 	if (_starpu_simgrid_running_smpi())
 	{
 		/* Take back hand to create the local platform for this MPI
@@ -195,16 +204,25 @@ void _starpu_simgrid_init()
 		char cmdline[1024];
 		FILE *in;
 		int out;
+#ifdef HAVE_MKSTEMPS
 		char template[] = "/tmp/"STARPU_MPI_AS_PREFIX"-platform-XXXXXX.xml";
+#else
+		char template[] = "/tmp/"STARPU_MPI_AS_PREFIX"-platform-XXXXXX";
+#endif
 		int ret;
 
-		snprintf(asname, sizeof(asname), STARPU_MPI_AS_PREFIX"%u", smpi_current_rank);
+		STARPU_ASSERT(starpu_mpi_world_rank);
+		snprintf(asname, sizeof(asname), STARPU_MPI_AS_PREFIX"%u", starpu_mpi_world_rank());
 
 		/* Get XML platform */
 		_starpu_simgrid_get_platform_path(path, sizeof(path));
 		in = fopen(path, "r");
 		STARPU_ASSERT_MSG(in, "Could not open platform file %s", path);
+#ifdef HAVE_MKSTEMPS
 		out = mkstemps(template, strlen(".xml"));
+#else
+		out = mkstemp(template);
+#endif
 
 		/* Generate modified XML platform */
 		STARPU_ASSERT_MSG(out >= 0, "Could not create temporary file like %s", template);
@@ -219,6 +237,7 @@ void _starpu_simgrid_init()
 		hosts = MSG_environment_as_get_hosts(_starpu_simgrid_get_as_by_name(asname));
 	}
 	else
+#endif /* HAVE_MSG_ENVIRONMENT_GET_ROUTING_ROOT */
 		hosts = MSG_hosts_as_dynar();
 
 	int nb = xbt_dynar_length(hosts);

+ 10 - 2
src/core/tree.c

@@ -104,9 +104,17 @@ struct starpu_tree* starpu_tree_get_neighbour(struct starpu_tree *tree, struct s
 {
 	struct starpu_tree *father = node == NULL ? tree : node->father;
 	
-	int i;
-	for(i = 0; i < father->arity; i++)
+	int i, st, n;
+
+	for(st = 0; st < father->arity; st++)
+	{
+		if(father->nodes[st] == node)
+			break;
+	}
+
+	for(n = 0; n < father->arity; n++)
 	{
+		i = (st+n)%father->arity;
 		if(father->nodes[i] != node)
 		{
 			if(father->nodes[i]->arity == 0)

+ 54 - 38
src/core/workers.c

@@ -467,6 +467,7 @@ static void _starpu_worker_init(struct _starpu_worker *workerarg, struct _starpu
 	workerarg->reverse_phase[1] = 0;
 	workerarg->pop_ctx_priority = 1;
 	workerarg->sched_mutex_locked = 0;
+	workerarg->slave = 0;
 
 	/* cpu_set/hwloc_cpu_set initialized in topology.c */
 }
@@ -516,7 +517,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 
 	/* Launch workers asynchronously */
 	unsigned cpu = 0;
-	unsigned worker;
+	unsigned worker, i;
 
 #if defined(STARPU_PERF_DEBUG) && !defined(STARPU_SIMGRID)
 	/* Get itimer of the main thread, to set it for the worker threads */
@@ -526,6 +527,16 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 #ifdef HAVE_AYUDAME_H
 	if (AYU_event) AYU_event(AYU_INIT, 0, NULL);
 #endif
+
+#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
+	for (i = 0; i < sizeof(cuda_worker_set)/sizeof(cuda_worker_set[0]); i++)
+		cuda_worker_set[i].workers = NULL;
+#endif
+#ifdef STARPU_USE_MIC
+	for (i = 0; i < sizeof(mic_worker_set)/sizeof(mic_worker_set[0]); i++)
+		mic_worker_set[i].workers = NULL;
+#endif
+
 	for (worker = 0; worker < nworkers; worker++)
 	{
 		struct _starpu_worker *workerarg = &pconfig->workers[worker];
@@ -575,44 +586,44 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
 			case STARPU_CUDA_WORKER:
 				driver.id.cuda_id = workerarg->devid;
-				if (_starpu_may_launch_driver(pconfig->conf, &driver))
-				{
-					/* We spawn only one thread per CUDA device,
-					 * which will control all CUDA workers of this
-					 * device. (by using a worker set). */
-					if (cuda_worker_set[devid].started)
-						goto worker_set_initialized;
+				workerarg->set = &cuda_worker_set[devid];
 
-					cuda_worker_set[devid].nworkers = starpu_get_env_number_default("STARPU_NWORKER_PER_CUDA", 1);
-					cuda_worker_set[devid].workers = workerarg;
-					cuda_worker_set[devid].set_is_initialized = 0;
+				/* We spawn only one thread per CUDA device,
+				 * which will control all CUDA workers of this
+				 * device. (by using a worker set). */
+				if (cuda_worker_set[devid].workers)
+					break;
 
-					STARPU_PTHREAD_CREATE_ON(
-						workerarg->name,
-						&cuda_worker_set[devid].worker_thread,
-						NULL,
-						_starpu_cuda_worker,
-						&cuda_worker_set[devid],
-						worker+1);
-#ifdef STARPU_USE_FXT
-					STARPU_PTHREAD_MUTEX_LOCK(&workerarg->mutex);
-					while (!workerarg->worker_is_running)
-						STARPU_PTHREAD_COND_WAIT(&workerarg->started_cond, &workerarg->mutex);
-					STARPU_PTHREAD_MUTEX_UNLOCK(&workerarg->mutex);
-#endif
-					STARPU_PTHREAD_MUTEX_LOCK(&cuda_worker_set[devid].mutex);
-					while (!cuda_worker_set[devid].set_is_initialized)
-						STARPU_PTHREAD_COND_WAIT(&cuda_worker_set[devid].ready_cond,
-									 &cuda_worker_set[devid].mutex);
-					STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_worker_set[devid].mutex);
-					cuda_worker_set[devid].started = 1;
-		worker_set_initialized:
-					workerarg->set = &cuda_worker_set[devid];
-				}
-				else
+				cuda_worker_set[devid].nworkers = starpu_get_env_number_default("STARPU_NWORKER_PER_CUDA", 1);
+				cuda_worker_set[devid].workers = workerarg;
+				cuda_worker_set[devid].set_is_initialized = 0;
+
+				if (!_starpu_may_launch_driver(pconfig->conf, &driver))
 				{
 					workerarg->run_by_starpu = 0;
+					break;
 				}
+
+				STARPU_PTHREAD_CREATE_ON(
+					workerarg->name,
+					&cuda_worker_set[devid].worker_thread,
+					NULL,
+					_starpu_cuda_worker,
+					&cuda_worker_set[devid],
+					worker+1);
+#ifdef STARPU_USE_FXT
+				STARPU_PTHREAD_MUTEX_LOCK(&workerarg->mutex);
+				while (!workerarg->worker_is_running)
+					STARPU_PTHREAD_COND_WAIT(&workerarg->started_cond, &workerarg->mutex);
+				STARPU_PTHREAD_MUTEX_UNLOCK(&workerarg->mutex);
+#endif
+				STARPU_PTHREAD_MUTEX_LOCK(&cuda_worker_set[devid].mutex);
+				while (!cuda_worker_set[devid].set_is_initialized)
+					STARPU_PTHREAD_COND_WAIT(&cuda_worker_set[devid].ready_cond,
+								 &cuda_worker_set[devid].mutex);
+				STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_worker_set[devid].mutex);
+				cuda_worker_set[devid].started = 1;
+
 				break;
 #endif
 #if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
@@ -642,11 +653,13 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 #endif
 #ifdef STARPU_USE_MIC
 			case STARPU_MIC_WORKER:
+				workerarg->set = &mic_worker_set[devid];
+
 				/* We spawn only one thread
 				 * per MIC device, which will control all MIC
 				 * workers of this device. (by using a worker set). */
-				if (mic_worker_set[devid].started)
-					goto worker_set_initialized;
+				if (mic_worker_set[devid].workers)
+					break;
 
 				mic_worker_set[devid].nworkers = pconfig->topology.nmiccores[devid];
 
@@ -678,8 +691,6 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 				STARPU_PTHREAD_MUTEX_UNLOCK(&mic_worker_set[devid].mutex);
 
 				mic_worker_set[devid].started = 1;
-		worker_set_initialized:
-				workerarg->set = &mic_worker_set[devid];
 
 				break;
 #endif /* STARPU_USE_MIC */
@@ -1374,6 +1385,11 @@ unsigned starpu_worker_get_count(void)
 	return config.topology.nworkers;
 }
 
+unsigned starpu_worker_is_slave(int workerid) /* returns the slave flag stored for worker workerid */
+{
+	return config.workers[workerid].slave; /* workerid indexes config.workers without a range check */
+}
+
 int starpu_worker_get_count_by_type(enum starpu_worker_archtype type)
 {
 	switch (type)

+ 3 - 0
src/core/workers.h

@@ -112,6 +112,9 @@ LIST_TYPE(_starpu_worker,
 	/* flag to know if sched_mutex is locked or not */
 	unsigned sched_mutex_locked;
 
+	/* bool to indicate if the worker is slave in a ctx */
+	unsigned slave;
+
 #ifdef __GLIBC__
 	cpu_set_t cpu_set;
 #endif /* __GLIBC__ */

+ 15 - 8
src/datawizard/coherency.c

@@ -150,7 +150,7 @@ void _starpu_update_data_state(starpu_data_handle_t handle,
 
 	/* the data is present now */
 	unsigned requesting_node = requesting_replicate->memory_node;
-	requesting_replicate->requested[requesting_node] = 0;
+	requesting_replicate->requested &= ~(1UL << requesting_node);
 
 	if (mode & STARPU_W)
 	{
@@ -655,18 +655,25 @@ void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_
 		_starpu_spin_unlock(&handle->header_lock);
 }
 
-static void _starpu_set_data_requested_flag_if_needed(struct _starpu_data_replicate *replicate)
+static void _starpu_set_data_requested_flag_if_needed(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate)
 {
-// XXX : this is just a hint, so we don't take the lock ...
-//	_starpu_spin_lock(&handle->header_lock);
+	unsigned local_node = _starpu_memory_node_get_local_key();
+	int cpt = 0;
+	while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock)) /* bounded trylock attempts on the header lock */
+	{
+		cpt++;
+		_starpu_datawizard_progress(local_node, 1); /* call the datawizard progress routine between attempts */
+	}
+	if (cpt == STARPU_SPIN_MAXTRY) /* every attempt failed: fall back to a blocking lock */
+		_starpu_spin_lock(&handle->header_lock);
 
 	if (replicate->state == STARPU_INVALID)
 	{
 		unsigned dst_node = replicate->memory_node;
-		replicate->requested[dst_node] = 1;
+		replicate->requested |= 1UL << dst_node; /* set bit dst_node of the requested mask */
 	}
 
-//	_starpu_spin_unlock(&handle->header_lock);
+	_starpu_spin_unlock(&handle->header_lock);
 }
 
 int starpu_prefetch_task_input_on_node(struct starpu_task *task, unsigned node)
@@ -687,7 +694,7 @@ int starpu_prefetch_task_input_on_node(struct starpu_task *task, unsigned node)
 		
 		prefetch_data_on_node(handle, replicate, mode);
 
-		_starpu_set_data_requested_flag_if_needed(replicate);
+		_starpu_set_data_requested_flag_if_needed(handle, replicate);
 	}
 
 	return 0;
@@ -878,7 +885,7 @@ unsigned _starpu_is_data_present_or_requested(starpu_data_handle_t handle, unsig
 
 		for (i = 0; i < nnodes; i++)
 		{
-			if (handle->per_node[node].requested[i] || handle->per_node[node].request[i])
+			if ((handle->per_node[node].requested & (1UL << i)) || handle->per_node[node].request[i])
 				ret = 1;
 		}
 

+ 11 - 11
src/datawizard/coherency.h

@@ -47,26 +47,26 @@ LIST_TYPE(_starpu_data_replicate,
 
 	unsigned memory_node;
 
-	/* A buffer that is used for SCRATCH or reduction cannnot be used with
-	 * filters. */
-	unsigned relaxed_coherency;
-
-	/* We may need to initialize the replicate with some value before using it. */
-	unsigned initialized;
-
 	/* describes the state of the local data in term of coherency */
 	enum _starpu_cache_state	state;
 
 	int refcnt;
 
+	/* A buffer that is used for SCRATCH or reduction cannot be used with
+	 * filters. */
+	unsigned relaxed_coherency:2;
+
+	/* We may need to initialize the replicate with some value before using it. */
+	unsigned initialized:1;
+
 	/* is the data locally allocated ? */
-	uint8_t allocated;
+	unsigned allocated:1;
 	/* was it automatically allocated ? (else it's the application-provided
 	 * buffer, don't ever try to free it!) */
 	/* perhaps the allocation was perform higher in the hiearchy
 	 * for now this is just translated into !automatically_allocated
 	 * */
-	uint8_t automatically_allocated;
+	unsigned automatically_allocated:1;
 
         /* Pointer to memchunk for LRU strategy */
 	struct _starpu_mem_chunk * mc;
@@ -78,7 +78,7 @@ LIST_TYPE(_starpu_data_replicate,
 	   flag when it assigns a task to a queue, policies which do not
 	   use this hint can simply ignore it.
 	 */
-	uint8_t requested[STARPU_MAXNODES];
+	uint32_t requested;
 	struct _starpu_data_request *request[STARPU_MAXNODES];
 )
 
@@ -206,7 +206,7 @@ struct _starpu_data_state
 	 * the end of the reduction. */
 	struct _starpu_data_requester_list *reduction_req_list;
 
-	starpu_data_handle_t reduction_tmp_handles[STARPU_NMAXWORKERS];
+	starpu_data_handle_t *reduction_tmp_handles;
 
 	unsigned lazy_unregister;
 

+ 1 - 1
src/datawizard/datawizard.c

@@ -39,7 +39,7 @@ int __starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc, unsig
 	/* in case some other driver requested data */
 	if (_starpu_handle_pending_node_data_requests(memory_node))
 		ret = 1;
-	if (push_requests)
+	if (ret || push_requests)
 	{
 		unsigned pushed;
 		if (_starpu_handle_node_data_requests(memory_node, may_alloc, &pushed) == 0)

+ 2 - 1
src/datawizard/filters.c

@@ -176,6 +176,7 @@ void starpu_data_partition(starpu_data_handle_t initial_handle, struct starpu_da
 		/* initialize the chunk lock */
 		child->req_list = _starpu_data_requester_list_new();
 		child->reduction_req_list = _starpu_data_requester_list_new();
+		child->reduction_tmp_handles = NULL;
 		child->refcnt = 0;
 		child->busy_count = 0;
 		child->busy_waiting = 0;
@@ -240,10 +241,10 @@ void starpu_data_partition(starpu_data_handle_t initial_handle, struct starpu_da
 			child_replicate->automatically_allocated = 0;
 			child_replicate->refcnt = 0;
 			child_replicate->memory_node = starpu_worker_get_memory_node(worker);
+			child_replicate->requested = 0;
 
 			for (node = 0; node < STARPU_MAXNODES; node++)
 			{
-				child_replicate->requested[node] = 0;
 				child_replicate->request[node] = NULL;
 			}
 

+ 2 - 1
src/datawizard/interfaces/data_interface.c

@@ -213,6 +213,7 @@ static void _starpu_register_new_data(starpu_data_handle_t handle,
 
 	handle->reduction_refcnt = 0;
 	handle->reduction_req_list = _starpu_data_requester_list_new();
+	handle->reduction_tmp_handles = NULL;
 
 #ifdef STARPU_USE_FXT
 	handle->last_submitted_ghost_sync_id_is_valid = 0;
@@ -268,10 +269,10 @@ static void _starpu_register_new_data(starpu_data_handle_t handle,
 		replicate->state = STARPU_INVALID;
 		replicate->refcnt = 0;
 		replicate->handle = handle;
+		replicate->requested = 0;
 
 		for (node = 0; node < STARPU_MAXNODES; node++)
 		{
-			replicate->requested[node] = 0;
 			replicate->request[node] = NULL;
 		}
 

+ 5 - 1
src/datawizard/reduction.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
  * Copyright (C) 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -156,6 +156,8 @@ void _starpu_data_end_reduction_mode(starpu_data_handle_t handle)
 
 	/* Register all valid per-worker replicates */
 	unsigned nworkers = starpu_worker_get_count();
+	STARPU_ASSERT(!handle->reduction_tmp_handles);
+	handle->reduction_tmp_handles = malloc(nworkers * sizeof(handle->reduction_tmp_handles[0]));
 	for (worker = 0; worker < nworkers; worker++)
 	{
 		if (handle->per_worker[worker].initialized)
@@ -390,4 +392,6 @@ void _starpu_data_end_reduction_mode_terminate(starpu_data_handle_t handle)
 			/* TODO put in cache */
 		}
 	}
+	free(handle->reduction_tmp_handles);
+	handle->reduction_tmp_handles = NULL;
 }

+ 2 - 4
src/datawizard/user_interactions.c

@@ -519,9 +519,7 @@ void starpu_data_set_default_sequential_consistency_flag(unsigned flag)
 /* Query the status of the handle on the specified memory node. */
 void starpu_data_query_status(starpu_data_handle_t handle, int memory_node, int *is_allocated, int *is_valid, int *is_requested)
 {
-#ifdef STARPU_DEVEL
-#warning FIXME
-#endif
+// XXX : this is just a hint, so we don't take the lock ...
 //	_starpu_spin_lock(&handle->header_lock);
 
 	if (is_allocated)
@@ -537,7 +535,7 @@ void starpu_data_query_status(starpu_data_handle_t handle, int memory_node, int
 		unsigned node;
 		for (node = 0; node < STARPU_MAXNODES; node++)
 		{
-			if (handle->per_node[memory_node].requested[node])
+			if (handle->per_node[memory_node].requested & (1UL << node))
 			{
 				requested = 1;
 				break;

+ 86 - 5
src/debug/traces/starpu_fxt.c

@@ -275,6 +275,18 @@ static void worker_set_state(double time, const char *prefix, long unsigned int
 #endif
 }
 
+/* Emit a "set state" record for the thread container of `workerid` at date
+ * `time`, with value `name`. Without poti, the record uses Paje event 20 and
+ * additionally carries `size`, `footprint` and `tag`. */
+static void worker_set_detailed_state(double time, const char *prefix, long unsigned int workerid, const char *name, unsigned long size, unsigned long footprint, unsigned long long tag)
+{
+#ifdef STARPU_HAVE_POTI
+	char thread_alias[STARPU_POTI_STR_LEN];
+
+	thread_container_alias(thread_alias, STARPU_POTI_STR_LEN, prefix, workerid);
+	/* TODO: size, footprint and tag are not emitted on this path yet */
+	poti_SetState(time, thread_alias, "S", name);
+#else
+	fprintf(out_paje_file,
+		"20	%.9f	%st%lu	S	%s	%lu	%08lx	%016llx\n",
+		time, prefix, workerid, name, size, footprint, tag);
+#endif
+}
+
 static void worker_push_state(double time, const char *prefix, long unsigned int workerid, const char *name)
 {
 #ifdef STARPU_HAVE_POTI
@@ -631,11 +643,8 @@ static void handle_start_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_op
 	int worker;
 	worker = find_worker_id(ev->param[2]);
 
-	unsigned sched_ctx = ev->param[1];
 	if (worker < 0) return;
 
-	char *prefix = options->file_prefix;
-
 	unsigned long has_name = ev->param[3];
 	char *name = has_name?(char *)&ev->param[4]:"unknown";
 
@@ -646,8 +655,12 @@ static void handle_start_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_op
 
 	create_paje_state_if_not_found(name, options);
 
+#ifndef STARPU_ENABLE_PAJE_CODELET_DETAILS
 	if (out_paje_file)
 	{
+		char *prefix = options->file_prefix;
+		unsigned sched_ctx = ev->param[1];
+
 		worker_set_state(start_codelet_time, prefix, ev->param[2], name);
 		if (sched_ctx != 0)
 		{
@@ -662,9 +675,40 @@ static void handle_start_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_op
 #endif
 		}
 	}
+#endif /* STARPU_ENABLE_PAJE_CODELET_DETAILS */
 
 }
 
+/* Handle a _STARPU_FUT_CODELET_DETAILS event.
+ *
+ * Only active when STARPU_ENABLE_PAJE_CODELET_DETAILS is defined. The event
+ * is read as follows: param[1] is the scheduling context, param[2], param[3]
+ * and param[4] are forwarded as size, footprint and tag, and param[5] is the
+ * thread id that find_worker_id() maps to a worker. The state is dated at
+ * last_codelet_start[worker] and named after last_codelet_symbol[worker]. */
+static void handle_codelet_details(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
+{
+#ifdef STARPU_ENABLE_PAJE_CODELET_DETAILS
+	int worker;
+	worker = find_worker_id(ev->param[5]);
+
+	unsigned sched_ctx = ev->param[1];
+	if (worker < 0) return;
+
+	char *prefix = options->file_prefix;
+
+	if (out_paje_file)
+	{
+		worker_set_detailed_state(last_codelet_start[worker], prefix, ev->param[5], last_codelet_symbol[worker], ev->param[2], ev->param[3], ev->param[4]);
+		if (sched_ctx != 0)
+		{
+#ifdef STARPU_HAVE_POTI
+			char container[STARPU_POTI_STR_LEN];
+			char ctx[6];
+			snprintf(ctx, sizeof(ctx), "Ctx%u", sched_ctx);
+			thread_container_alias(container, STARPU_POTI_STR_LEN, prefix, ev->param[5]);
+			poti_SetState(last_codelet_start[worker], container, ctx, last_codelet_symbol[worker]);
+#else
+			/* The container is the thread (param[5]), as in the
+			 * poti branch above; size is printed in decimal and
+			 * footprint in hexadecimal, matching
+			 * worker_set_detailed_state(). */
+			fprintf(out_paje_file, "20	%.9f	%st%"PRIu64"	Ctx%u	%s	%lu	%08lx	%016llx\n", last_codelet_start[worker], prefix, ev->param[5], sched_ctx, last_codelet_symbol[worker], (unsigned long) ev->param[2], (unsigned long) ev->param[3], (unsigned long long) ev->param[4]);
+#endif
+		}
+	}
+#endif /* STARPU_ENABLE_PAJE_CODELET_DETAILS */
+}
+
 static long dumped_codelets_count;
 static struct starpu_fxt_codelet_event *dumped_codelets;
 
@@ -916,6 +960,40 @@ static void handle_start_driver_copy(struct fxt_ev_64 *ev, struct starpu_fxt_opt
 
 }
 
+
+/* Handle a _STARPU_FUT_WORK_STEALING event by drawing a link between two
+ * worker containers in the Paje trace.
+ *
+ * param[0] is used as the link destination and param[1] as its source
+ * (presumably the thief and the victim respectively -- confirm against the
+ * trace macro). The link value and the key suffix are currently always 0,
+ * and the link ends one nanosecond after it starts. */
+static void handle_work_stealing(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
+{
+	unsigned dst = ev->param[0];
+	unsigned src = ev->param[1];
+	unsigned size = 0;
+	unsigned comid = 0;
+
+	char *prefix = options->file_prefix;
+
+	if (out_paje_file)
+	{
+		double time = get_event_time_stamp(ev, options);
+#ifdef STARPU_HAVE_POTI
+		char paje_value[STARPU_POTI_STR_LEN], paje_key[STARPU_POTI_STR_LEN], src_worker_container[STARPU_POTI_STR_LEN], dst_worker_container[STARPU_POTI_STR_LEN];
+		char program_container[STARPU_POTI_STR_LEN];
+		snprintf(paje_value, STARPU_POTI_STR_LEN, "%u", size);
+		snprintf(paje_key, STARPU_POTI_STR_LEN, "steal_%u", comid);
+		program_container_alias(program_container, STARPU_POTI_STR_LEN, prefix);
+		worker_container_alias(src_worker_container, STARPU_POTI_STR_LEN, prefix, src);
+		worker_container_alias(dst_worker_container, STARPU_POTI_STR_LEN, prefix, dst);
+		poti_StartLink(time, program_container, "L", src_worker_container, paje_value, paje_key);
+		poti_EndLink(time+0.000000001, program_container, "L", dst_worker_container, paje_value, paje_key);
+#else
+		/* src and dst are unsigned, so they are printed with %u */
+		fprintf(out_paje_file, "18	%.9f	L	%sp	%u	%sw%u	steal_%u\n", time, prefix, size, prefix, src, comid);
+		fprintf(out_paje_file, "19	%.9f	L	%sp	%u	%sw%u	steal_%u\n", time+0.000000001, prefix, size, prefix, dst, comid);
+#endif
+	}
+
+}
+
+
 static void handle_end_driver_copy(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
 {
 	unsigned dst = ev->param[1];
@@ -1389,7 +1467,7 @@ static void handle_thread_event(struct fxt_ev_64 *ev, struct starpu_fxt_options
 
 #ifdef STARPU_HAVE_POTI
 		char container[STARPU_POTI_STR_LEN];
-		thread_container_alias(container, STARPU_POTI_STR_LEN, prefix);
+		thread_container_alias(container, STARPU_POTI_STR_LEN, options->file_prefix, ev->param[0]);
 		poti_NewEvent(get_event_time_stamp(ev, options), container, "thread_event", event);
 #else
 		fprintf(out_paje_file, "9	%.9f	thread_event	%st%"PRIu64"	%s\n", get_event_time_stamp(ev, options), options->file_prefix, ev->param[0], event);
@@ -1524,6 +1602,9 @@ void starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *opt
 			case _STARPU_FUT_START_CODELET_BODY:
 				handle_start_codelet_body(&ev, options);
 				break;
+			case _STARPU_FUT_CODELET_DETAILS:
+				handle_codelet_details(&ev, options);
+				break;
 			case _STARPU_FUT_END_CODELET_BODY:
 				handle_end_codelet_body(&ev, options);
 				break;
@@ -1641,7 +1722,7 @@ void starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *opt
 				break;
 
 			case _STARPU_FUT_WORK_STEALING:
-				/* XXX */
+				handle_work_stealing(&ev, options);
 				break;
 
 			case _STARPU_FUT_WORKER_DEINIT_START:

+ 11 - 0
src/debug/traces/starpu_paje.c

@@ -130,6 +130,17 @@ void _starpu_fxt_write_paje_header(FILE *file)
 	fprintf(file, "%%	DestContainer	string\n");
 	fprintf(file, "%%	Key	string\n");
 	fprintf(file, "%%EndEventDef\n");
+#ifdef STARPU_ENABLE_PAJE_CODELET_DETAILS
+	fprintf(file, "%%EventDef PajeSetState 20\n");
+	fprintf(file, "%%	Time	date\n");
+	fprintf(file, "%%	Container	string\n");
+	fprintf(file, "%%	Type	string\n");
+	fprintf(file, "%%	Value	string\n");
+	fprintf(file, "%%	Size	string\n");
+	fprintf(file, "%%	Footprint	string\n");
+	fprintf(file, "%%	Tag	string\n");
+	fprintf(file, "%%EndEventDef\n");
+#endif
 #endif
 
 #ifdef STARPU_HAVE_POTI

+ 1 - 1
src/drivers/cpu/driver_cpu.c

@@ -81,7 +81,7 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_
 	}
 
 	/* Give profiling variable */
-	_starpu_driver_start_job(cpu_args, j, &codelet_start, rank, profiling);
+	_starpu_driver_start_job(cpu_args, j, perf_arch, &codelet_start, rank, profiling);
 
 	/* In case this is a Fork-join parallel task, the worker does not
 	 * execute the kernel at all. */

+ 1 - 1
src/drivers/cuda/driver_cuda.c

@@ -396,7 +396,7 @@ static int start_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *args)
 		return -EAGAIN;
 	}
 
-	_starpu_driver_start_job(args, j, &j->cl_start, 0, profiling);
+	_starpu_driver_start_job(args, j, &args->perf_arch, &j->cl_start, 0, profiling);
 
 #if defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
 	/* We make sure we do manipulate the proper device */

+ 2 - 2
src/drivers/driver_common/driver_common.c

@@ -33,7 +33,7 @@
 #define BACKOFF_MAX 32  /* TODO : use parameter to define them */
 #define BACKOFF_MIN 1
 
-void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j, struct timespec *codelet_start, int rank, int profiling)
+void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch, struct timespec *codelet_start, int rank, int profiling)
 {
 	struct starpu_task *task = j->task;
 	struct starpu_codelet *cl = task->cl;
@@ -73,7 +73,7 @@ void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j
 	if (starpu_top)
 		_starpu_top_task_started(task,workerid,codelet_start);
 
-	_STARPU_TRACE_START_CODELET_BODY(j);
+	_STARPU_TRACE_START_CODELET_BODY(j, j->nimpl, perf_arch);
 }
 
 void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch STARPU_ATTRIBUTE_UNUSED, struct timespec *codelet_end, int rank, int profiling)

+ 2 - 2
src/drivers/driver_common/driver_common.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2012, 2014  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -23,7 +23,7 @@
 #include <core/jobs.h>
 #include <common/utils.h>
 
-void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j,
+void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch,
 			      struct timespec *codelet_start, int rank, int profiling);
 void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch,
 			    struct timespec *codelet_end, int rank, int profiling);

+ 1 - 1
src/drivers/mp_common/source_common.c

@@ -421,7 +421,7 @@ static int _starpu_src_common_execute(struct _starpu_job *j,
 
 	void (*kernel)(void)  = node->get_kernel_from_job(node,j);
 
-	_starpu_driver_start_job(worker, j, &j->cl_start, 0, profiling);
+	_starpu_driver_start_job(worker, j, &worker->perf_arch, &j->cl_start, 0, profiling);
 
 
 	//_STARPU_DEBUG("\nworkerid:%d, rank:%d, type:%d,	cb_workerid:%d, task_size:%d\n\n",worker->devid,worker->current_rank,task->cl->type,j->combined_workerid,j->task_size);

+ 1 - 1
src/drivers/opencl/driver_opencl.c

@@ -825,7 +825,7 @@ static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker
 		return -EAGAIN;
 	}
 
-	_starpu_driver_start_job(args, j, &j->cl_start, 0, profiling);
+	_starpu_driver_start_job(args, j, &args->perf_arch, &j->cl_start, 0, profiling);
 
 	starpu_opencl_func_t func = _starpu_task_get_opencl_nth_implementation(cl, j->nimpl);
 	STARPU_ASSERT_MSG(func, "when STARPU_OPENCL is defined in 'where', opencl_func or opencl_funcs has to be defined");

+ 17 - 10
src/sched_policies/deque_modeling_policy_data_aware.c

@@ -286,6 +286,13 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 	struct _starpu_dmda_data *dt = (struct _starpu_dmda_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
 	/* make sure someone coule execute that task ! */
 	STARPU_ASSERT(best_workerid != -1);
+	unsigned child_sched_ctx = starpu_sched_ctx_worker_is_master_for_child_ctx(best_workerid, sched_ctx_id);
+        if(child_sched_ctx != STARPU_NMAX_SCHED_CTXS)
+        {
+		starpu_sched_ctx_revert_task_counters(sched_ctx_id, task->flops);
+                starpu_sched_ctx_move_task_to_ctx(task, child_sched_ctx);
+                return 0;
+        }
 
 	struct _starpu_fifo_taskq *fifo = dt->queue_array[best_workerid];
 
@@ -405,9 +412,9 @@ static int _dm_push_task(struct starpu_task *task, unsigned prio, unsigned sched
 	if(workers->init_iterator)
 		workers->init_iterator(workers, &it);
 
-	while(workers->has_next(workers, &it))
+	while(workers->has_next_master(workers, &it))
 	{
-		worker = workers->get_next(workers, &it);
+		worker = workers->get_next_master(workers, &it);
 		struct _starpu_fifo_taskq *fifo  = dt->queue_array[worker];
 		unsigned memory_node = starpu_worker_get_memory_node(worker);
 		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker);
@@ -543,9 +550,9 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 	if(workers->init_iterator)
 		workers->init_iterator(workers, &it);
 
-	while(workers->has_next(workers, &it))
+	while(workers->has_next_master(workers, &it))
 	{
-		worker = workers->get_next(workers, &it);
+		worker = workers->get_next_master(workers, &it);
 
 		struct _starpu_fifo_taskq *fifo = dt->queue_array[worker];
 		struct starpu_perfmodel_arch* perf_arch = starpu_worker_get_perf_archtype(worker);
@@ -692,10 +699,6 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 
 	double fitness[nworkers_ctx][STARPU_MAXIMPLEMENTATIONS];
 
-	struct starpu_sched_ctx_iterator it;
-	if(workers->init_iterator)
-		workers->init_iterator(workers, &it);
-
 	compute_all_performance_predictions(task,
 					    nworkers_ctx,
 					    local_task_length,
@@ -712,9 +715,13 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, unsigned sch
 	unsigned nimpl;
 	if (forced_best == -1)
 	{
-		while(workers->has_next(workers, &it))
+		struct starpu_sched_ctx_iterator it;
+		if(workers->init_iterator)
+			workers->init_iterator(workers, &it);
+
+		while(workers->has_next_master(workers, &it))
 		{
-			worker = workers->get_next(workers, &it);
+			worker = workers->get_next_master(workers, &it);
 			for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
 			{
 				if (!starpu_worker_can_execute_task(worker, task, nimpl))

+ 13 - 2
src/sched_policies/eager_central_policy.c

@@ -94,9 +94,9 @@ static int push_task_eager_policy(struct starpu_task *task)
 	if(workers->init_iterator)
 		workers->init_iterator(workers, &it);
 	
-	while(workers->has_next(workers, &it))
+	while(workers->has_next_master(workers, &it))
 	{
-		worker = workers->get_next(workers, &it);
+		worker = workers->get_next_master(workers, &it);
 
 #ifdef STARPU_NON_BLOCKING_DRIVERS
 		if (!starpu_bitmap_get(data->waiters, worker))
@@ -167,6 +167,17 @@ static struct starpu_task *pop_task_eager_policy(unsigned sched_ctx_id)
 
 	STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex);
 
+	if(task)
+	{
+		unsigned child_sched_ctx = starpu_sched_ctx_worker_is_master_for_child_ctx(workerid, sched_ctx_id);
+		if(child_sched_ctx != STARPU_NMAX_SCHED_CTXS)
+		{
+			starpu_sched_ctx_revert_task_counters(sched_ctx_id, task->flops);
+			starpu_sched_ctx_move_task_to_ctx(task, child_sched_ctx);
+			return NULL;
+		}
+	}
+
 	return task;
 }
 

+ 373 - 0
src/sched_policies/locality_work_stealing_policy.c

@@ -0,0 +1,373 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2014  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012  INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/* Locality work stealing policy */
+
+#include <float.h>
+
+#include <core/workers.h>
+#include <sched_policies/fifo_queues.h>
+#include <core/debug.h>
+#include <starpu_bitmap.h>
+
+struct _starpu_lws_data	/* policy data attached to a scheduling context by this policy */
+{
+	struct _starpu_fifo_taskq **queue_array;	/* one task queue per worker, indexed by worker id */
+	int **proxlist;	/* proxlist[workerid][i]: i-th worker to try stealing from (filled only with hwloc) */
+	unsigned last_pop_worker;	/* round-robin cursor used when selecting a victim without hwloc */
+	unsigned last_push_worker;	/* round-robin cursor used when pushing from a non-worker thread */
+};
+
+
+#ifdef STARPU_HAVE_HWLOC
+
+/* Pick the worker to steal from: walk the proximity list of `workerid` (built
+ * from the architecture information provided by hwloc) and return the first
+ * entry whose queue is not empty. When every visited queue is empty,
+ * `workerid` itself is returned. */
+static unsigned select_victim_neighborhood(unsigned sched_ctx_id, int workerid)
+{
+	struct _starpu_lws_data *ws = (struct _starpu_lws_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
+	int nworkers = starpu_sched_ctx_get_nworkers(sched_ctx_id);
+	int rank;
+
+	for (rank = 0; rank < nworkers; rank++)
+	{
+		int candidate = ws->proxlist[workerid][rank];
+
+		if (ws->queue_array[candidate]->ntasks)
+			return candidate;
+	}
+
+	return workerid;
+}
+#else
+/* Pick the worker to steal from in a round-robin fashion: start at
+ * last_pop_worker and advance until a non-empty queue is found or the scan
+ * wraps around. last_pop_worker is then moved just past the returned worker. */
+static unsigned select_victim_round_robin(unsigned sched_ctx_id)
+{
+	struct _starpu_lws_data *ws = (struct _starpu_lws_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
+	unsigned candidate = ws->last_pop_worker;
+	unsigned nworkers = starpu_sched_ctx_get_nworkers(sched_ctx_id);
+
+	starpu_pthread_mutex_t *victim_sched_mutex;
+	starpu_pthread_cond_t *victim_sched_cond;
+
+	for (;;)
+	{
+		starpu_worker_get_sched_condition(candidate, &victim_sched_mutex, &victim_sched_cond);
+
+		/* A queue with pending tasks ends the search */
+		if (ws->queue_array[candidate]->ntasks)
+			break;
+
+		candidate = (candidate + 1) % nworkers;
+
+		/* Back at the starting point: stop instead of looping forever */
+		if (candidate == ws->last_pop_worker)
+			break;
+	}
+
+	ws->last_pop_worker = (candidate + 1) % nworkers;
+
+	return candidate;
+}
+
+
+#endif
+
+
+/**
+ * Choose the worker that receives a pushed task.
+ * Returns the current value of last_push_worker and advances that cursor by
+ * one, modulo the number of workers of the context.
+ */
+static unsigned select_worker_round_robin(unsigned sched_ctx_id)
+{
+	struct _starpu_lws_data *ws = (struct _starpu_lws_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
+	unsigned chosen = ws->last_push_worker;
+	unsigned nworkers = starpu_sched_ctx_get_nworkers(sched_ctx_id);
+
+	/* TODO: use an atomic update operation for this */
+	ws->last_push_worker = (ws->last_push_worker + 1) % nworkers;
+
+	return chosen;
+}
+
+
+/**
+ * Select the worker to steal a task from: the proximity-list search when
+ * hwloc is available, the round-robin search otherwise.
+ */
+static inline unsigned select_victim(unsigned sched_ctx_id, int workerid)
+{
+	unsigned victim;
+
+#ifdef STARPU_HAVE_HWLOC
+	victim = select_victim_neighborhood(sched_ctx_id, workerid);
+#else
+	victim = select_victim_round_robin(sched_ctx_id);
+#endif
+
+	return victim;
+}
+
+/**
+ * Select the worker whose queue receives a pushed task. Only needed when the
+ * push is done by the master; currently always round-robin.
+ */
+static inline unsigned select_worker(unsigned sched_ctx_id)
+{
+	unsigned target = select_worker_round_robin(sched_ctx_id);
+
+	return target;
+}
+
+
+/* pop_task hook: return a task for the calling worker, or NULL.
+ *
+ * The worker's own queue is tried first. If it is empty, a victim is chosen
+ * with select_victim() and one task is popped from the victim's queue under
+ * the victim's scheduling mutex. The calling worker's scheduling mutex is
+ * released around the steal and re-acquired before returning, so the function
+ * is presumably entered with that mutex held. If the steal fails, the own
+ * queue is tried one last time. */
+static struct starpu_task *lws_pop_task(unsigned sched_ctx_id)
+{
+	struct _starpu_lws_data *ws = (struct _starpu_lws_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
+	int workerid = starpu_worker_get_id();
+	struct starpu_task *task;
+
+	STARPU_ASSERT(workerid != -1);
+
+	/* Fast path: a task is waiting in our own queue */
+	task = _starpu_fifo_pop_task(ws->queue_array[workerid], workerid);
+	if (task)
+		return task;
+
+	starpu_pthread_mutex_t *own_mutex;
+	starpu_pthread_cond_t *own_cond;
+	starpu_worker_get_sched_condition(workerid, &own_mutex, &own_cond);
+
+	/* Drop our own mutex before taking the victim's one, to avoid interlock */
+	STARPU_PTHREAD_MUTEX_UNLOCK(own_mutex);
+
+	/* Nothing local: try to steal someone else's task */
+	unsigned victim = select_victim(sched_ctx_id, workerid);
+
+	starpu_pthread_mutex_t *victim_mutex;
+	starpu_pthread_cond_t *victim_cond;
+	starpu_worker_get_sched_condition(victim, &victim_mutex, &victim_cond);
+
+	STARPU_PTHREAD_MUTEX_LOCK(victim_mutex);
+	task = _starpu_fifo_pop_task(ws->queue_array[victim], workerid);
+	if (task)
+	{
+		_STARPU_TRACE_WORK_STEALING(workerid, victim);
+	}
+	STARPU_PTHREAD_MUTEX_UNLOCK(victim_mutex);
+
+	STARPU_PTHREAD_MUTEX_LOCK(own_mutex);
+
+	/* The steal failed: a task may have reached our own queue meanwhile */
+	if (!task)
+		task = _starpu_fifo_pop_task(ws->queue_array[workerid], workerid);
+
+	return task;
+}
+
+/* push_task hook: queue `task` on the calling worker's own queue.
+ *
+ * When the caller is not a worker (starpu_worker_get_id() returns -1), the
+ * target queue is chosen by select_worker(). The push is done under the
+ * target worker's scheduling mutex. With blocking drivers, every worker of
+ * the context is then signalled. Always returns 0. */
+static int lws_push_task(struct starpu_task *task)
+{
+	unsigned sched_ctx_id = task->sched_ctx;
+	struct _starpu_lws_data *ws = (struct _starpu_lws_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
+
+	int workerid = starpu_worker_get_id();
+
+	/* If the current thread is not a worker but
+	 * the main thread (-1), we find the better one to
+	 * put task on its queue */
+	if (workerid == -1)
+		workerid = select_worker(sched_ctx_id);
+
+	starpu_pthread_mutex_t *sched_mutex;
+	starpu_pthread_cond_t *sched_cond;
+	starpu_worker_get_sched_condition(workerid, &sched_mutex, &sched_cond);
+	STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
+
+	_starpu_fifo_push_task(ws->queue_array[workerid], task);
+
+	starpu_push_task_end(task);
+
+	STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
+
+#ifndef STARPU_NON_BLOCKING_DRIVERS
+	/* Signal every worker of the context so that a sleeping one can
+	 * come and pick up (or steal) the new task */
+	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
+	struct starpu_sched_ctx_iterator it;
+	if(workers->init_iterator)
+		workers->init_iterator(workers, &it);
+	while(workers->has_next(workers, &it))
+	{
+		int worker = workers->get_next(workers, &it);
+		starpu_pthread_mutex_t *worker_mutex;
+		starpu_pthread_cond_t *worker_cond;
+		starpu_worker_get_sched_condition(worker, &worker_mutex, &worker_cond);
+		STARPU_PTHREAD_COND_SIGNAL(worker_cond);
+	}
+#endif
+
+	return 0;
+}
+
+/* add_workers hook: create an empty task queue for each added worker and,
+ * with hwloc, build the proximity list each of them uses to pick a victim.
+ *
+ * Both levels of proxlist are sized by the total number of workers: the outer
+ * array is indexed by worker id, and each inner list can receive every worker
+ * present in the context. */
+static void lws_add_workers(unsigned sched_ctx_id, int *workerids,unsigned nworkers)
+{
+	struct _starpu_lws_data *ws = (struct _starpu_lws_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
+
+	unsigned i;
+	int workerid;
+
+	for (i = 0; i < nworkers; i++)
+	{
+		workerid = workerids[i];
+		starpu_sched_ctx_worker_shares_tasks_lists(workerid, sched_ctx_id);
+		ws->queue_array[workerid] = _starpu_create_fifo();
+
+		/* Tell helgrind that we are fine with getting outdated values,
+		 * this is just an estimation */
+		STARPU_HG_DISABLE_CHECKING(ws->queue_array[workerid]->ntasks);
+
+		ws->queue_array[workerid]->nprocessed = 0;
+		ws->queue_array[workerid]->ntasks = 0;
+	}
+
+
+#ifdef STARPU_HAVE_HWLOC
+	/* Build a proximity list for every worker. It is cheaper to
+	 * build this once and then use it for popping tasks rather
+	 * than traversing the hwloc tree every time a task must be
+	 * stolen */
+	unsigned nw_total = starpu_worker_get_count();
+
+	/* NOTE(review): the outer array is re-created on every call, so lists
+	 * built by an earlier call are no longer reachable -- confirm that
+	 * workers are only added once per context. */
+	ws->proxlist = (int**)malloc(nw_total*sizeof(int*));
+	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
+	struct starpu_tree *tree = (struct starpu_tree*)workers->workerids;
+	for (i = 0; i < nworkers; i++)
+	{
+		workerid = workerids[i];
+		ws->proxlist[workerid] = (int*)malloc(nw_total*sizeof(int));
+		int bindid;
+
+		struct starpu_tree *neighbour = NULL;
+		struct starpu_sched_ctx_iterator it;
+		if(workers->init_iterator)
+			workers->init_iterator(workers, &it);
+
+		bindid   = starpu_worker_get_bindid(workerid);
+		it.value = starpu_tree_get(tree, bindid);
+		unsigned cnt = 0;
+		for(;;)
+		{
+			neighbour = (struct starpu_tree*)it.value;
+			int node_workerids[STARPU_NMAXWORKERS];
+			int node_nworkers = _starpu_worker_get_workerids(neighbour->id, node_workerids);
+			int w;
+			for(w = 0; w < node_nworkers; w++)
+			{
+				if(!it.visited[node_workerids[w]] && workers->present[node_workerids[w]])
+				{
+					ws->proxlist[workerid][cnt++] = node_workerids[w];
+					it.visited[node_workerids[w]] = 1;
+				}
+			}
+			if(!workers->has_next(workers, &it))
+				break;
+			it.value = it.possible_value;
+			it.possible_value = NULL;
+		}
+
+		/* Pad the unused tail with the worker itself so that the
+		 * victim search never reads an uninitialised entry */
+		while (cnt < nw_total)
+			ws->proxlist[workerid][cnt++] = workerid;
+	}
+#endif
+}
+
+/* remove_workers hook: destroy the task queue of each removed worker and,
+ * with hwloc, release its proximity list. */
+static void lws_remove_workers(unsigned sched_ctx_id, int *workerids, unsigned nworkers)
+{
+	struct _starpu_lws_data *ws = (struct _starpu_lws_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
+	unsigned idx;
+
+	for (idx = 0; idx < nworkers; idx++)
+	{
+		int removed = workerids[idx];
+
+		_starpu_destroy_fifo(ws->queue_array[removed]);
+#ifdef STARPU_HAVE_HWLOC
+		free(ws->proxlist[removed]);
+#endif
+	}
+}
+
+/* init_sched hook: create the worker collection of the context (a tree with
+ * hwloc, a list otherwise), then allocate the policy data and attach it to
+ * the context.
+ *
+ * queue_array gets one slot per worker known to StarPU, since it is indexed
+ * by worker id. proxlist starts out NULL so that lws_deinit_policy() can free
+ * it even if no worker was ever added. */
+static void lws_initialize_policy(unsigned sched_ctx_id)
+{
+#ifdef STARPU_HAVE_HWLOC
+	starpu_sched_ctx_create_worker_collection(sched_ctx_id, STARPU_WORKER_TREE);
+#else
+	starpu_sched_ctx_create_worker_collection(sched_ctx_id, STARPU_WORKER_LIST);
+#endif
+
+	struct _starpu_lws_data *ws = (struct _starpu_lws_data*)malloc(sizeof(struct _starpu_lws_data));
+	starpu_sched_ctx_set_policy_data(sched_ctx_id, (void*)ws);
+
+	ws->last_pop_worker = 0;
+	ws->last_push_worker = 0;
+	ws->proxlist = NULL;
+
+	/* unsigned nw = starpu_sched_ctx_get_nworkers(sched_ctx_id); */
+	unsigned nw = starpu_worker_get_count();
+	ws->queue_array = (struct _starpu_fifo_taskq**)malloc(nw*sizeof(struct _starpu_fifo_taskq*));
+
+}
+	
+/* deinit_sched hook: release the policy data of the context, then delete its
+ * worker collection. */
+static void lws_deinit_policy(unsigned sched_ctx_id)
+{
+	struct _starpu_lws_data *policy_data = (struct _starpu_lws_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
+
+	/* The arrays go first, then the structure that holds them */
+	free(policy_data->queue_array);
+#ifdef STARPU_HAVE_HWLOC
+	free(policy_data->proxlist);
+#endif
+	free(policy_data);
+
+	starpu_sched_ctx_delete_worker_collection(sched_ctx_id);
+}
+
+struct starpu_sched_policy _starpu_sched_lws_policy =	/* hook table of the locality work stealing policy */
+{
+	.init_sched = lws_initialize_policy,
+	.deinit_sched = lws_deinit_policy,
+	.add_workers = lws_add_workers,
+	.remove_workers = lws_remove_workers,
+	.push_task = lws_push_task,
+	.pop_task = lws_pop_task,
+	.pre_exec_hook = NULL,	/* no hook around task execution */
+	.post_exec_hook = NULL,
+	.pop_every_task = NULL,
+	.policy_name = "nws",	/* NOTE(review): file and symbol say "lws" -- confirm the intended name */
+	.policy_description = "new work stealing"
+};

+ 7 - 3
src/util/starpu_task_insert_utils.c

@@ -126,8 +126,7 @@ size_t _starpu_task_insert_get_arg_size(va_list varg_list)
 		{
 			(void)va_arg(varg_list, double);
 		}
-
-		else if (arg_type==STARPU_TAG)
+		else if (arg_type==STARPU_TAG || arg_type==STARPU_TAG_ONLY)
 		{
 			(void)va_arg(varg_list, starpu_tag_t);
 		}
@@ -237,7 +236,7 @@ int _starpu_codelet_pack_args(void **arg_buffer, size_t arg_buffer_size, va_list
 		{
 			(void)va_arg(varg_list, double);
 		}
-		else if (arg_type==STARPU_TAG)
+		else if (arg_type==STARPU_TAG || arg_type==STARPU_TAG_ONLY)
 		{
 			(void)va_arg(varg_list, starpu_tag_t);
 		}
@@ -416,6 +415,11 @@ void _starpu_task_insert_create(void *arg_buffer, size_t arg_buffer_size, struct
 			(*task)->tag_id = tag;
 			(*task)->use_tag = 1;
 		}
+		else if (arg_type==STARPU_TAG_ONLY)
+		{
+			starpu_tag_t tag = va_arg(varg_list, starpu_tag_t);
+			(*task)->tag_id = tag;
+		}
 		else
 		{
 			STARPU_ABORT_MSG("Unrecognized argument %d\n", arg_type);

+ 65 - 3
src/worker_collection/worker_list.c

@@ -42,6 +42,30 @@ static int list_get_next(struct starpu_worker_collection *workers, struct starpu
 	return ret;
 }
 
+/* Tell whether the iterator still has a master to return. When the end of
+ * the master list is reached, the cursor is rewound to 0 and 0 is returned. */
+static unsigned list_has_next_master(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it)
+{
+	int nworkers = workers->nmasters;
+	STARPU_ASSERT(it != NULL);
+
+	if (it->cursor < nworkers)
+		return 1;
+
+	/* Exhausted: rewind so that the iterator can be used again */
+	it->cursor = 0;
+	return 0;
+}
+
+/* Return the master at the iterator's cursor and advance the cursor by one.
+ * The cursor must be within the master list. */
+static int list_get_next_master(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it)
+{
+	int nworkers = (int)workers->nmasters;
+	int *masters = (int *)workers->masters;
+
+	STARPU_ASSERT_MSG(it->cursor < nworkers, "cursor %d nworkers %d\n", it->cursor, nworkers);
+
+	return masters[it->cursor++];
+}
+
 static unsigned _worker_belongs_to_ctx(struct starpu_worker_collection *workers, int workerid)
 {
 	int *workerids = (int *)workers->workerids;
@@ -108,9 +132,12 @@ static int list_remove(struct starpu_worker_collection *workers, int worker)
 {
 	int *workerids = (int *)workers->workerids;
 	unsigned nworkers = workers->nworkers;
+
+	int *masters = (int *)workers->masters;
+	unsigned nmasters = workers->nmasters;
 	
-	int found_worker = -1;
 	unsigned i;
+	int found_worker = -1;
 	for(i = 0; i < nworkers; i++)
 	{
 		if(workerids[i] == worker)
@@ -125,13 +152,29 @@ static int list_remove(struct starpu_worker_collection *workers, int worker)
 	if(found_worker != -1)
 		workers->nworkers--;
 
+	int found_master = -1;
+	for(i = 0; i < nmasters; i++)
+	{
+		if(masters[i] == worker)
+		{
+			masters[i] = -1;
+			found_master = worker;
+			break;
+		}
+	}
+
+	_rearange_workerids(masters, nmasters);
+	if(found_master != -1)
+		workers->nmasters--;
+	printf("rem %d\n", found_worker);
 	return found_worker;
 }
 
 static void _init_workers(int *workerids)
 {
 	unsigned i;
-	for(i = 0; i < STARPU_NMAXWORKERS; i++)
+	int nworkers = starpu_worker_get_count();
+	for(i = 0; i < nworkers; i++)
 		workerids[i] = -1;
 	return;
 }
@@ -139,10 +182,14 @@ static void _init_workers(int *workerids)
 static void list_init(struct starpu_worker_collection *workers)
 {
 	int *workerids = (int*)malloc(STARPU_NMAXWORKERS * sizeof(int));
+	int *masters = (int*)malloc(STARPU_NMAXWORKERS * sizeof(int));
 	_init_workers(workerids);
+	_init_workers(masters);
 
 	workers->workerids = (void*)workerids;
 	workers->nworkers = 0;
+	workers->masters = (void*)masters;
+	workers->nmasters = 0;
 
 	return;
 }
@@ -150,17 +197,32 @@ static void list_init(struct starpu_worker_collection *workers)
 static void list_deinit(struct starpu_worker_collection *workers)
 {
 	free(workers->workerids);
+	free(workers->masters);
 }
 
-static void list_init_iterator(struct starpu_worker_collection *workers STARPU_ATTRIBUTE_UNUSED, struct starpu_sched_ctx_iterator *it)
+static void list_init_iterator(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it)
 {
 	it->cursor = 0;
+
+	int *workerids = (int *)workers->workerids;
+	unsigned nworkers = workers->nworkers;
+	unsigned i;
+	int nm = 0;
+	for(i = 0;  i < nworkers; i++)
+	{
+		if(!starpu_worker_is_slave(workerids[i]))
+			((int*)workers->masters)[nm++] = workerids[i];
+	}
+	workers->nmasters = nm;
+
 }
 
 struct starpu_worker_collection worker_list =
 {
 	.has_next = list_has_next,
 	.get_next = list_get_next,
+	.has_next_master = list_has_next_master,
+	.get_next_master = list_get_next_master,
 	.add = list_add,
 	.remove = list_remove,
 	.init = list_init,

+ 84 - 4
src/worker_collection/worker_tree.c

@@ -89,6 +89,75 @@ static int tree_get_next(struct starpu_worker_collection *workers, struct starpu
 	return ret;
 }
 
+static unsigned tree_has_next_master(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it)
+{ /* Return 1 when starpu_tree_get_neighbour still yields a node for this iterator, 0 otherwise. */
+	STARPU_ASSERT(it != NULL);
+	if(workers->nworkers == 0)
+		return 0;
+
+	struct starpu_tree *tree = (struct starpu_tree*)workers->workerids; /* for this collection type workerids stores a starpu_tree pointer */
+	struct starpu_tree *neighbour = starpu_tree_get_neighbour(tree, (struct starpu_tree*)it->value, it->visited, workers->is_master);
+	
+	if(!neighbour)
+	{ /* no neighbour returned: reset the iterator fields before reporting "no more" */
+		starpu_tree_reset_visited(tree, it->visited);
+		it->value = NULL;
+		it->possible_value = NULL;
+		return 0;
+	}
+	int id = -1;
+	int workerids[STARPU_NMAXWORKERS];
+	int nworkers = _starpu_worker_get_workerids(neighbour->id, workerids); /* presumably the workers attached to this tree node -- TODO confirm */
+	int w;
+	for(w = 0; w < nworkers; w++)
+	{
+		if(!it->visited[workerids[w]] && workers->is_master[workerids[w]])
+		{ /* no break here: the last matching worker is the one kept in id */
+			id = workerids[w];
+			it->possible_value = neighbour; /* remembered so tree_get_next_master can reuse it instead of searching again */
+		}
+	}
+
+	STARPU_ASSERT_MSG(id != -1, "bind id (%d) for workerid (%d) not correct", neighbour->id, id); /* NOTE(review): id is always -1 whenever this message is printed */
+
+	return 1;
+}
+
+static int tree_get_next_master(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it)
+{ /* Return the id of an unvisited master worker and mark it visited; asserts when none can be found. */
+	int ret = -1;
+	
+	struct starpu_tree *tree = (struct starpu_tree *)workers->workerids;
+	struct starpu_tree *neighbour = NULL;
+	if(it->possible_value)
+	{ /* consume the node cached in possible_value instead of searching the tree again */
+		neighbour = it->possible_value;
+		it->possible_value = NULL;
+	}
+	else
+		neighbour = starpu_tree_get_neighbour(tree, (struct starpu_tree*)it->value, it->visited, workers->is_master);
+	
+	STARPU_ASSERT_MSG(neighbour, "no element anymore");
+	
+	
+	int workerids[STARPU_NMAXWORKERS];
+	int nworkers = _starpu_worker_get_workerids(neighbour->id, workerids);
+	int w;
+	for(w = 0; w < nworkers; w++)
+	{
+		if(!it->visited[workerids[w]] && workers->is_master[workerids[w]])
+		{ /* no break here: every matching worker is marked visited and the last one is returned */
+			ret = workerids[w];
+			it->visited[workerids[w]] = 1;
+			it->value = neighbour; /* becomes the starting point of the next neighbour search */
+		}
+	}
+	STARPU_ASSERT_MSG(ret != -1, "bind id not correct");
+
+	return ret;
+}
+
+
 static int tree_add(struct starpu_worker_collection *workers, int worker)
 {
 	struct starpu_tree *tree = (struct starpu_tree *)workers->workerids;
@@ -111,6 +180,7 @@ static int tree_remove(struct starpu_worker_collection *workers, int worker)
 	if(workers->present[worker])
 	{
 		workers->present[worker] = 0;
+		workers->is_master[worker] = 0;
 		workers->nworkers--;
 		return worker;
 	}
@@ -122,10 +192,14 @@ static void tree_init(struct starpu_worker_collection *workers)
 {
 	workers->workerids = (void*)starpu_workers_get_tree();
 	workers->nworkers = 0;
-	
+
 	int i;
-	for(i = 0; i < STARPU_NMAXWORKERS; i++)
+	int nworkers = starpu_worker_get_count();
+	for(i = 0; i < nworkers; i++)
+	{
 		workers->present[i] = 0;
+		workers->is_master[i] = 0;
+	}
 	
 	return;
 }
@@ -135,19 +209,25 @@ static void tree_deinit(struct starpu_worker_collection *workers)
 //	free(workers->workerids);
 }
 
-static void tree_init_iterator(struct starpu_worker_collection *workers STARPU_ATTRIBUTE_UNUSED, struct starpu_sched_ctx_iterator *it)
+static void tree_init_iterator(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it)
 { /* Clear the iterator position and its visited flags; the added lines also recompute is_master. */
 	it->value = NULL;
 	it->possible_value = NULL;
 	int i;
-	for(i = 0; i < STARPU_NMAXWORKERS; i++)
+	int nworkers = starpu_worker_get_count(); /* only indices below this count are reset */
+	for(i = 0; i < nworkers; i++)
+	{
+		workers->is_master[i] = (workers->present[i] && !starpu_worker_is_slave(i)); /* master = present in the collection and not a slave */
 		it->visited[i] = 0;
+	}
 }
 
 struct starpu_worker_collection worker_tree =
 {
 	.has_next = tree_has_next,
 	.get_next = tree_get_next,
+	.has_next_master = tree_has_next_master,
+	.get_next_master = tree_get_next_master,
 	.add = tree_add,
 	.remove = tree_remove,
 	.init = tree_init,

+ 1 - 0
tests/datawizard/commute.c

@@ -171,6 +171,7 @@ int main(int argc, char **argv)
 		test(STARPU_R, STARPU_RW, i);
 	}
 
+	starpu_data_unregister(x_handle);
 	starpu_shutdown();
 	STARPU_RETURN(0);
 

+ 3 - 16
tests/datawizard/increment_init.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2012-2014  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -28,8 +28,6 @@ static starpu_data_handle_t handle;
 #ifdef STARPU_USE_CUDA
 static void neutral_cuda_kernel(void *descr[], void *arg)
 {
-	STARPU_SKIP_IF_VALGRIND;
-
 	unsigned *dst = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 
 	/* This is a dummy technique of course */
@@ -42,8 +40,6 @@ static void neutral_cuda_kernel(void *descr[], void *arg)
 #ifdef STARPU_USE_OPENCL
 static void neutral_opencl_kernel(void *descr[], void *arg)
 {
-	STARPU_SKIP_IF_VALGRIND;
-
 	unsigned h_dst = 0;
 	cl_mem d_dst = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[0]);
 
@@ -59,8 +55,6 @@ static void neutral_opencl_kernel(void *descr[], void *arg)
 
 static void neutral_cpu_kernel(void *descr[], void *arg)
 { /* CPU implementation: store 0 in the unsigned variable reached through descr[0]. */
-	STARPU_SKIP_IF_VALGRIND;
-
 	unsigned *dst = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	*dst = 0; /* arg is not used */
 }
@@ -86,8 +80,6 @@ static struct starpu_codelet neutral_cl =
 /* dummy OpenCL implementation */
 static void increment_opencl_kernel(void *descr[], void *cl_arg STARPU_ATTRIBUTE_UNUSED)
 {
-	STARPU_SKIP_IF_VALGRIND;
-
 	cl_mem d_token = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[0]);
 	unsigned h_token;
 
@@ -105,8 +97,6 @@ static void increment_opencl_kernel(void *descr[], void *cl_arg STARPU_ATTRIBUTE
 #ifdef STARPU_USE_CUDA
 static void increment_cuda_kernel(void *descr[], void *arg)
 {
-	STARPU_SKIP_IF_VALGRIND;
-
 	unsigned *tokenptr = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	unsigned host_token;
 
@@ -123,8 +113,6 @@ static void increment_cuda_kernel(void *descr[], void *arg)
 
 static void increment_cpu_kernel(void *descr[], void *arg)
 { /* CPU implementation: add 1 to the unsigned variable reached through descr[0]. */
-	STARPU_SKIP_IF_VALGRIND;
-
 	unsigned *tokenptr = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	*tokenptr = *tokenptr + 1; /* arg is not used */
 }
@@ -144,7 +132,7 @@ static struct starpu_codelet increment_cl =
 
 int main(int argc, char **argv)
 {
-	unsigned *pvar;
+	unsigned *pvar = NULL;
 	int ret;
 
 	ret = starpu_init(NULL);
@@ -209,6 +197,5 @@ enodev:
 
 err:
 	starpu_shutdown();
-	STARPU_RETURN(EXIT_FAILURE);
-
+	return EXIT_FAILURE;
 }

+ 2 - 2
tests/heat/dmda.sh

@@ -2,7 +2,7 @@
 
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # 
-# Copyright (C) 2009, 2010  Université de Bordeaux 1
+# Copyright (C) 2009, 2010, 2014  Université de Bordeaux 1
 # Copyright (C) 2010  Centre National de la Recherche Scientifique
 # 
 # StarPU is free software; you can redistribute it and/or modify
@@ -52,7 +52,7 @@ export STARPU_PERF_MODEL_DIR=$SAMPLINGDIR
 mkdir -p $TIMINGDIR
 mkdir -p $SAMPLINGDIR
 
-#schedlist="ws no-prio greedy prio dm random"
+#schedlist="ws lws no-prio greedy prio dm random"
 #schedlist="random random random random"
 
 export STARPU_NCUDA=3

+ 5 - 3
tests/heat/gflops_sched.gp

@@ -3,7 +3,7 @@
 
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # 
-# Copyright (C) 2008, 2009  Université de Bordeaux 1
+# Copyright (C) 2008, 2009, 2014  Université de Bordeaux 1
 # Copyright (C) 2010  Centre National de la Recherche Scientifique
 # 
 # StarPU is free software; you can redistribute it and/or modify
@@ -30,7 +30,8 @@ set key right bottom
 set datafile missing 'x'
 plot "timings/gflops.merged.data" usi 1:(2*$1*$1*$1 / (3*$2* 1000000)) with linespoint title "greedy"  ,\
      "timings/gflops.merged.data" usi 1:(2*$1*$1*$1 / (3*$4* 1000000)) with linespoint title "prio" 	    ,\
-     "timings/gflops.merged.data" usi 1:(2*$1*$1*$1 / (3*$6* 1000000)) with linespoint title "ws" 
+     "timings/gflops.merged.data" usi 1:(2*$1*$1*$1 / (3*$4* 1000000)) with linespoint title "ws" 	    ,\
+     "timings/gflops.merged.data" usi 1:(2*$1*$1*$1 / (3*$6* 1000000)) with linespoint title "lws" 
 
 set output "gflops_sched_gain.eps"
 set title "LU Decomposition : scheduling strategies : gain"
@@ -43,4 +44,5 @@ set logscale x
 set key right bottom
 set datafile missing 'x'
 plot "timings/gflops.merged.data" usi 1:(100*(($2 / $4)-1)) with linespoint title "gain prio"	,\
-	"timings/gflops.merged.data" usi 1:(100*(($2 / $6)-1)) with linespoint title "gain ws"    
+	"timings/gflops.merged.data" usi 1:(100*(($2 / $6)-1)) with linespoint title "gain ws"    ,\
+	"timings/gflops.merged.data" usi 1:(100*(($2 / $6)-1)) with linespoint title "gain lws"    

+ 10 - 1
tests/heat/gflops_sched.sh

@@ -2,7 +2,7 @@
 
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # 
-# Copyright (C) 2008, 2009, 2010  Université de Bordeaux 1
+# Copyright (C) 2008, 2009, 2010, 2014  Université de Bordeaux 1
 # Copyright (C) 2010  Centre National de la Recherche Scientifique
 # 
 # StarPU is free software; you can redistribute it and/or modify
@@ -137,6 +137,15 @@ do
 done
 
 
+filename=$TIMINGDIR/gflops.lws.data
+policy=lws
+trace_header 
+for size in $sizelist
+do
+	trace_size $size;
+done
+
+
 filename=$TIMINGDIR/gflops.noprio.data
 policy=no-prio
 trace_header 

+ 2 - 2
tests/heat/granularity.r

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2010  Université de Bordeaux 1
+# Copyright (C) 2010, 2014  Université de Bordeaux 1
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -19,7 +19,7 @@ max <- 28
 maxy <- 400
 
 sizelist <- seq(2048, max*1024, 64);
-#schedlist <- c("greedy", "prio", "dm", "random", "no-prio", "ws");
+#schedlist <- c("greedy", "prio", "dm", "random", "no-prio", "ws", "lws");
 #schedlist <- c("greedy", "prio", "dm", "random");
 # grainlist <- c(64, 128, 256, 512, 768, 1024, 1280, 1536, 2048);
 grainlist <- c(256, 512, 1024, 2048);

+ 2 - 2
tests/heat/granularity_model.r

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2010  Université de Bordeaux 1
+# Copyright (C) 2010, 2014  Université de Bordeaux 1
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -17,7 +17,7 @@
 max <- 30
 
 sizelist <- seq(64, max*1024, 64);
-#schedlist <- c("greedy", "prio", "dm", "random", "no-prio", "ws");
+#schedlist <- c("greedy", "prio", "dm", "random", "no-prio", "ws", "lws");
 #schedlist <- c("greedy", "prio", "dm", "random");
 #grainlist <- c(256, 512, 1024)
 grainlist <- c(512, 1024)

+ 2 - 2
tests/heat/model.r

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2010  Université de Bordeaux 1
+# Copyright (C) 2010, 2014  Université de Bordeaux 1
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -15,7 +15,7 @@
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
 sizelist <- seq(2048, 24576, 2048);
-#schedlist <- c("greedy", "prio", "dm", "random", "no-prio", "ws");
+#schedlist <- c("greedy", "prio", "dm", "random", "no-prio", "ws", "lws");
 schedlist <- c("prio", "dm", "random");
 
 print(schedlist);

+ 4 - 3
tests/heat/random.r

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2010  Université de Bordeaux 1
+# Copyright (C) 2010, 2014  Université de Bordeaux 1
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -15,7 +15,7 @@
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
 sizelist <- seq(2048, 24576, 2048);
-#schedlist <- c("greedy", "prio", "dm", "random", "no-prio", "ws");
+#schedlist <- c("greedy", "prio", "dm", "random", "no-prio", "ws", "lws");
 schedlist <- c("prio","random");
 
 print(schedlist);
@@ -97,13 +97,14 @@ display_sched <- function()
 	trace_sched("prio", "red", 4);
 	#trace_sched("no-prio", "black");
 	#trace_sched("ws", "purple");
+	#trace_sched("lws", "purple");
 
 	axis(1, at=sizelist)
 	axis(2, at=seq(0, 100, 10), tck=1)
 #	axis(4, at=seq(0, 100, 10))
 	box(bty="u")
 
-        #labels <- c("greedy", "priority", "model", "random", "black", "ws")
+        #labels <- c("greedy", "priority", "model", "random", "black", "ws", "lws")
 #        labels <- c("greedy", "priority", "model", "random")
 	#labels <- c("model", "weighted random", "greedy", "priority")
 	labels <- c("weighted random", "priority")

+ 2 - 2
tests/heat/sched.r

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2010  Université de Bordeaux 1
+# Copyright (C) 2010, 2014  Université de Bordeaux 1
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -15,7 +15,7 @@
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
 sizelist <- seq(2048, 24576, 2048);
-#schedlist <- c("greedy", "prio", "dm", "random", "no-prio", "ws");
+#schedlist <- c("greedy", "prio", "dm", "random", "no-prio", "ws", "lws");
 schedlist <- c("greedy", "prio", "dm", "random");
 
 print(schedlist);

+ 2 - 2
tests/heat/sched.sh

@@ -2,7 +2,7 @@
 
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # 
-# Copyright (C) 2008, 2009, 2010  Université de Bordeaux 1
+# Copyright (C) 2008, 2009, 2010, 2014  Université de Bordeaux 1
 # Copyright (C) 2010  Centre National de la Recherche Scientifique
 # 
 # StarPU is free software; you can redistribute it and/or modify
@@ -94,7 +94,7 @@ export STARPU_PERF_MODEL_DIR=$SAMPLINGDIR
 mkdir -p $TIMINGDIR
 mkdir -p $SAMPLINGDIR
 
-#schedlist="ws no-prio greedy prio dm random"
+#schedlist="ws lws no-prio greedy prio dm random"
 #schedlist="random random random random"
 
 export STARPU_NCUDA=3

+ 3 - 1
tests/main/subgraph_repeat.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2012-2014  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -164,6 +164,7 @@ int main(int argc, char **argv)
 	STARPU_ASSERT(*check_cnt == (4*loop_cnt));
 
 	starpu_free(check_cnt);
+	starpu_data_unregister(check_data);
 
 	starpu_shutdown();
 
@@ -179,6 +180,7 @@ enodev:
 	fprintf(stderr, "WARNING: No one can execute this task\n");
 	/* yes, we do not perform the computation but we did detect that no one
  	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_data_unregister(check_data);
 	starpu_shutdown();
 	return STARPU_TEST_SKIPPED;
 }

+ 3 - 1
tests/main/subgraph_repeat_regenerate.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2014  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -168,6 +168,7 @@ int main(int argc, char **argv)
 	STARPU_ASSERT(*check_cnt == (4*loop_cnt));
 
 	starpu_free(check_cnt);
+	starpu_data_unregister(check_data);
 
 	starpu_shutdown();
 
@@ -183,6 +184,7 @@ enodev:
 	fprintf(stderr, "WARNING: No one can execute this task\n");
 	/* yes, we do not perform the computation but we did detect that no one
  	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_data_unregister(check_data);
 	starpu_shutdown();
 	return STARPU_TEST_SKIPPED;
 }

+ 2 - 1
tests/main/subgraph_repeat_regenerate_tag.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2014  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -198,6 +198,7 @@ enodev:
 	fprintf(stderr, "WARNING: No one can execute this task\n");
 	/* yes, we do not perform the computation but we did detect that no one
  	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_data_unregister(check_data);
 	starpu_shutdown();
 	return STARPU_TEST_SKIPPED;
 }

+ 2 - 1
tests/main/subgraph_repeat_tag.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2014  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -182,6 +182,7 @@ enodev:
 	fprintf(stderr, "WARNING: No one can execute this task\n");
 	/* yes, we do not perform the computation but we did detect that no one
  	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_data_unregister(check_data);
 	starpu_shutdown();
 	return STARPU_TEST_SKIPPED;
 }

+ 5 - 2
tests/perfmodels/feed.c

@@ -50,8 +50,11 @@ int main(int argc, char **argv)
 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-	 if(starpu_worker_get_count_by_type(STARPU_CUDA_WORKER) < 2)
+	 if (starpu_worker_get_count_by_type(STARPU_CUDA_WORKER) < 2)
+	 {
+		 starpu_shutdown();
 		 return STARPU_TEST_SKIPPED;
+	 }
 
 	starpu_task_init(&task);
 	task.cl = &cl;
@@ -76,7 +79,7 @@ int main(int argc, char **argv)
 		arch.devid = 0;
 		starpu_perfmodel_update_history(&model, &task, &arch, 0, 0, measured_fast);
 		starpu_perfmodel_update_history(&nl_model, &task, &arch, 0, 0, measured_fast);
-		
+
 		/* Simulate Slow GPU */
 		arch.devid = 1;
 		starpu_perfmodel_update_history(&model, &task, &arch, 0, 0, measured_slow);

+ 3 - 1
tests/regression/profiles.in

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2010  Université de Bordeaux 1
+# Copyright (C) 2010, 2014  Université de Bordeaux 1
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -32,6 +32,8 @@ STARPU_NCUDA=1
 # Execution configuration
 STARPU_SCHED=ws
 # Execution configuration
+STARPU_SCHED=lws
+# Execution configuration
 STARPU_SCHED=prio
 # Execution configuration
 STARPU_SCHED=no-prio

+ 5 - 1
tests/regression/regression_test.sh

@@ -2,7 +2,7 @@
 
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # 
-# Copyright (C) 2008, 2009, 2010  Université de Bordeaux 1
+# Copyright (C) 2008, 2009, 2010, 2014  Université de Bordeaux 1
 # Copyright (C) 2010  Centre National de la Recherche Scientifique
 # 
 # StarPU is free software; you can redistribute it and/or modify
@@ -65,6 +65,10 @@ echo "heat.ws.8k.v2"
 timing=`STARPU_SCHED="ws" $ROOTDIR/examples/heat/heat -ntheta 66 -nthick 130 -nblocks 8 -pin -v2 2> log`
 save_cov "heat.ws.8k.v2";
 
+echo "heat.lws.8k.v2"
+timing=`STARPU_SCHED="lws" $ROOTDIR/examples/heat/heat -ntheta 66 -nthick 130 -nblocks 8 -pin -v2 2> log`
+save_cov "heat.lws.8k.v2";
+
 echo "heat.greedy.8k.v2"
 timing=`STARPU_SCHED="greedy" $ROOTDIR/examples/heat/heat -ntheta 66 -nthick 130 -nblocks 8 -pin -v2 2> log`
 save_cov "heat.greedy.8k.v2";