Merge branch 'master' into starpurm

Nathalie Furmento, 7 years ago
parent commit 38fc69d444
63 changed files with 698 additions and 359 deletions
  1. AUTHORS (+1 -0)
  2. ChangeLog (+10 -0)
  3. doc/doxygen/chapters/110_basic_examples.doxy (+13 -26)
  4. doc/doxygen/chapters/210_check_list_performance.doxy (+6 -3)
  5. doc/doxygen/chapters/301_tasks.doxy (+4 -6)
  6. doc/doxygen/chapters/310_data_management.doxy (+10 -26)
  7. doc/doxygen/chapters/330_scheduling_contexts.doxy (+12 -12)
  8. doc/doxygen/chapters/350_modularized_scheduler.doxy (+15 -31)
  9. doc/doxygen/chapters/370_online_performance_tools.doxy (+5 -8)
  10. doc/doxygen/chapters/390_faq.doxy (+2 -3)
  11. doc/doxygen/chapters/401_out_of_core.doxy (+3 -3)
  12. doc/doxygen/chapters/410_mpi_support.doxy (+73 -78)
  13. doc/doxygen/chapters/440_c_extensions.doxy (+10 -25)
  14. doc/doxygen/chapters/470_simgrid.doxy (+9 -1)
  15. doc/doxygen/chapters/490_clustering_a_machine.doxy (+4 -3)
  16. doc/doxygen/chapters/501_environment_variables.doxy (+10 -8)
  17. doc/doxygen/chapters/510_configure_options.doxy (+2 -1)
  18. doc/doxygen/chapters/api/data_interfaces.doxy (+16 -1)
  19. doc/doxygen/chapters/api/insert_task.doxy (+15 -1)
  20. doc/doxygen/chapters/api/opencl_extensions.doxy (+2 -5)
  21. doc/doxygen/chapters/code/complex.c (+4 -7)
  22. doc/doxygen/chapters/code/disk_compute.c (+1 -4)
  23. doc/doxygen/chapters/code/matmul_pragma2.c (+2 -3)
  24. doc/doxygen/chapters/code/scal_pragma.cu (+3 -5)
  25. doc/doxygen/chapters/code/vector_scal_c.c (+3 -5)
  26. doc/doxygen/chapters/code/vector_scal_cuda.cu (+3 -5)
  27. doc/doxygen/chapters/code/vector_scal_opencl.c (+4 -7)
  28. examples/basic_examples/vector_scal.c (+3 -1)
  29. examples/filters/frecursive.c (+2 -2)
  30. include/fstarpu_mod.f90 (+2 -1)
  31. include/starpu_data_interfaces.h (+3 -1)
  32. include/starpu_sched_component.h (+2 -0)
  33. include/starpu_sched_ctx.h (+2 -0)
  34. include/starpu_task_util.h (+3 -1)
  35. mpi/src/starpu_mpi_task_insert.c (+5 -1)
  36. mpi/src/starpu_mpi_task_insert_fortran.c (+6 -1)
  37. src/common/list.h (+2 -2)
  38. src/common/utils.h (+14 -5)
  39. src/core/dependencies/implicit_data_deps.c (+6 -6)
  40. src/core/disk.c (+13 -9)
  41. src/core/disk_ops/unistd/disk_unistd_global.c (+2 -2)
  42. src/core/perfmodel/multiple_regression.c (+3 -2)
  43. src/core/perfmodel/perfmodel.c (+1 -1)
  44. src/core/sched_ctx.c (+7 -0)
  45. src/core/simgrid.c (+5 -2)
  46. src/core/simgrid.h (+2 -2)
  47. src/core/task.h (+2 -2)
  48. src/core/topology.c (+2 -2)
  49. src/core/workers.c (+9 -0)
  50. src/datawizard/copy_driver.c (+27 -1)
  51. src/datawizard/interfaces/block_interface.c (+7 -5)
  52. src/datawizard/interfaces/matrix_interface.c (+7 -5)
  53. src/datawizard/interfaces/multiformat_interface.c (+4 -3)
  54. src/drivers/cuda/driver_cuda.c (+3 -2)
  55. src/drivers/opencl/driver_opencl.c (+15 -10)
  56. src/util/fstarpu.c (+3 -1)
  57. src/util/starpu_task_insert.c (+16 -7)
  58. src/util/starpu_task_insert_utils.c (+13 -0)
  59. tests/Makefile.am (+2 -1)
  60. tests/parallel_tasks/parallel_kernels_trivial.c (+132 -0)
  61. tools/dev/cppcheck/suppressions.txt (+2 -2)
  62. tools/dev/valgrind/hwloc.suppr (+115 -1)
  63. tools/dev/valgrind/libc.suppr (+14 -1)

+ 1 - 0
AUTHORS

@@ -6,6 +6,7 @@ William Braik <wbraik@gmail.com>
 Berenger Bramas <berenger.bramas@inria.fr>
 Alfredo Buttari <alfredo.buttari@enseeiht.fr>
 Adrien Cassagne <adrien.cassagne@inria.fr>
+Arthur Chevalier <arthur.chevalier@inria.fr>
 Jérôme Clet-Ortega <jerome.clet-ortega@inria.fr>
 Terry Cojean <terry.cojean@inria.fr>
 Nicolas Collin <nicolas.collin@inria.fr>

+ 10 - 0
ChangeLog

@@ -80,6 +80,8 @@ Small features:
     the output graph
   * New environment variable STARPU_GENERATE_TRACE_OPTIONS to specify
     fxt options (to be used with STARPU_GENERATE_TRACE)
+  * New function starpu_task_set() similar to starpu_task_build() but
+    with a task object given as the first parameter
 
 Changes:
   * Vastly improve simgrid simulation time.
@@ -92,6 +94,13 @@ Small changes:
     scheduler context
   * Fonction starpu_is_initialized() is moved to the public API.
 
+StarPU 1.2.5 (git revision xxx)
+==============================================
+
+Small features:
+  * Add a new value STARPU_TASK_COLOR to be used in
+    starpu_task_insert() to pick the color of a task in dag.dot
+
 StarPU 1.2.4 (git revision 255cf98175ef462749780f30bfed21452b74b594)
 ==============================================
 
@@ -109,6 +118,7 @@ Small features:
      acquisitions too.
    * Add a way to choose the dag.dot colors.
 
+
 StarPU 1.2.3 (svn revision 22444)
 ==============================================
 

+ 13 - 26
doc/doxygen/chapters/110_basic_examples.doxy

@@ -307,7 +307,7 @@ struct starpu_codelet cl =
     .where = STARPU_CPU,
     .cpu_funcs = { cpu_func },
     .cpu_funcs_name = { "cpu_func" },
-     .nbuffers = 0
+    .nbuffers = 0
 };
 \endcode
 
@@ -338,13 +338,10 @@ has to be defined:
 
 \code{.c}
 /* Declare the `vector_scal' task.  */
-static void vector_scal (unsigned size, float vector[size],
-                         float factor)
-  __attribute__ ((task));
+static void vector_scal (unsigned size, float vector[size], float factor) __attribute__ ((task));
 
 /* Define the standard CPU implementation.  */
-static void
-vector_scal (unsigned size, float vector[size], float factor)
+static void vector_scal (unsigned size, float vector[size], float factor)
 {
   unsigned i;
   for (i = 0; i < size; i++)
@@ -415,12 +412,10 @@ in our C file like this:
 /* The OpenCL programs, loaded from 'main' (see below). */
 static struct starpu_opencl_program cl_programs;
 
-static void vector_scal_opencl (unsigned size, float vector[size],
-                                float factor)
+static void vector_scal_opencl (unsigned size, float vector[size], float factor)
   __attribute__ ((task_implementation ("opencl", vector_scal)));
 
-static void
-vector_scal_opencl (unsigned size, float vector[size], float factor)
+static void vector_scal_opencl (unsigned size, float vector[size], float factor)
 {
   int id, devid, err;
   cl_kernel kernel;
@@ -434,22 +429,17 @@ vector_scal_opencl (unsigned size, float vector[size], float factor)
   devid = starpu_worker_get_devid (id);
 
   /* Prepare to invoke the kernel.  In the future, this will be largely automated.  */
-  err = starpu_opencl_load_kernel (&kernel, &queue, &cl_programs,
-                                   "vector_mult_opencl", devid);
-  if (err != CL_SUCCESS)
-    STARPU_OPENCL_REPORT_ERROR (err);
+  err = starpu_opencl_load_kernel (&kernel, &queue, &cl_programs, "vector_mult_opencl", devid);
+  if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR (err);
 
   err = clSetKernelArg (kernel, 0, sizeof (size), &size);
   err |= clSetKernelArg (kernel, 1, sizeof (val), &val);
   err |= clSetKernelArg (kernel, 2, sizeof (factor), &factor);
-  if (err)
-    STARPU_OPENCL_REPORT_ERROR (err);
+  if (err) STARPU_OPENCL_REPORT_ERROR (err);
 
   size_t global = 1, local = 1;
-  err = clEnqueueNDRangeKernel (queue, kernel, 1, NULL, &global,
-                                &local, 0, NULL, &event);
-  if (err != CL_SUCCESS)
-    STARPU_OPENCL_REPORT_ERROR (err);
+  err = clEnqueueNDRangeKernel (queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
+  if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR (err);
 
   clFinish (queue);
   starpu_opencl_collect_stats (event);
@@ -464,8 +454,7 @@ The OpenCL kernel itself must be loaded from <c>main</c>, sometime after
 the pragma <c>initialize</c>:
 
 \code{.c}
-starpu_opencl_load_opencl_from_file ("vector_scal_opencl_kernel.cl",
-                                       &cl_programs, "");
+starpu_opencl_load_opencl_from_file ("vector_scal_opencl_kernel.cl", &cl_programs, "");
 \endcode
 
 And that's it.  The task <c>vector_scal</c> now has an additional
@@ -482,8 +471,7 @@ with <c>nvcc</c>.  Thus, the C file only needs to contain an external
 declaration for the task implementation:
 
 \code{.c}
-extern void vector_scal_cuda (unsigned size, float vector[size],
-                              float factor)
+extern void vector_scal_cuda (unsigned size, float vector[size], float factor)
   __attribute__ ((task_implementation ("cuda", vector_scal)));
 \endcode
 
@@ -532,8 +520,7 @@ The following lines show how to declare an array of <c>NX</c> elements of type
 float vector[NX];
 
 starpu_data_handle_t vector_handle;
-starpu_vector_data_register(&vector_handle, STARPU_MAIN_RAM, (uintptr_t)vector, NX,
-                            sizeof(vector[0]));
+starpu_vector_data_register(&vector_handle, STARPU_MAIN_RAM, (uintptr_t)vector, NX, sizeof(vector[0]));
 \endcode
 
 The first argument, called the <b>data handle</b>, is an opaque pointer which

+ 6 - 3
doc/doxygen/chapters/210_check_list_performance.doxy

@@ -79,6 +79,11 @@ link to \ref StaticScheduling
 
 \section CUDA-specificOptimizations CUDA-specific Optimizations
 
+For proper overlapping of asynchronous GPU data transfers, data has to be pinned
+by CUDA. Data allocated with starpu_malloc() is always properly pinned. If the
+application is registering to StarPU some data which has not been allocated with
+starpu_malloc(), it should use starpu_memory_pin() to pin it.
+
 Due to CUDA limitations, StarPU will have a hard time overlapping its own
 communications and the codelet computations if the application does not use a
 dedicated CUDA stream for its computations instead of the default stream,
@@ -408,9 +413,7 @@ void feed(void)
         starpu_data_handle_t handle;
 	starpu_vector_data_register(&handle, -1, 0, measure->size, sizeof(float));
 	task.handles[0] = handle;
-	starpu_perfmodel_update_history(&perf_model, &task,
-	                                STARPU_CUDA_DEFAULT + measure->cudadev, 0,
-	                                measure->implementation, measure->time);
+	starpu_perfmodel_update_history(&perf_model, &task, STARPU_CUDA_DEFAULT + measure->cudadev, 0, measure->implementation, measure->time);
 	starpu_task_clean(&task);
 	starpu_data_unregister(handle);
     }
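
A minimal sketch of the pinning advice added above, mirroring the
starpu_memory_pin()/starpu_memory_unpin() calls this commit adds to
examples/basic_examples/vector_scal.c further down; NX and the task submission
are placeholders, not part of the patch:

\code{.c}
#define NX 2048

float vector[NX];              /* not obtained from starpu_malloc(), hence not pinned yet */
starpu_data_handle_t handle;

starpu_memory_pin(vector, sizeof(vector));   /* pin so that CUDA transfers can overlap */
starpu_vector_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)vector, NX, sizeof(vector[0]));

/* ... submit tasks working on handle ... */

starpu_data_unregister(handle);
starpu_memory_unpin(vector, sizeof(vector));
\endcode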

+ 4 - 6
doc/doxygen/chapters/301_tasks.doxy

@@ -98,9 +98,9 @@ for(i=0 ; i<dummy_big_cl.nbuffers ; i++)
 	handles[i] = handle;
 }
 starpu_task_insert(&dummy_big_cl,
-        	 STARPU_VALUE, &dummy_big_cl.nbuffers, sizeof(dummy_big_cl.nbuffers),
-		 STARPU_DATA_ARRAY, handles, dummy_big_cl.nbuffers,
-		 0);
+         	  STARPU_VALUE, &dummy_big_cl.nbuffers, sizeof(dummy_big_cl.nbuffers),
+		  STARPU_DATA_ARRAY, handles, dummy_big_cl.nbuffers,
+		  0);
 \endcode
 
 The whole code for this complex data interface is available in the
@@ -355,9 +355,7 @@ starpu_task_insert(&which_index, STARPU_W, i_handle, 0);
 
 /* And submit the corresponding task */
 STARPU_DATA_ACQUIRE_CB(i_handle, STARPU_R,
-                       starpu_task_insert(&work,
-		                          STARPU_RW, A_handle[i],
-					  0));
+                       starpu_task_insert(&work, STARPU_RW, A_handle[i], 0));
 \endcode
 
 The macro ::STARPU_DATA_ACQUIRE_CB submits an asynchronous request for

+ 10 - 26
doc/doxygen/chapters/310_data_management.doxy

@@ -261,8 +261,7 @@ int vector[NX];
 starpu_data_handle_t handle;
 
 /* Declare data to StarPU */
-starpu_vector_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)vector,
-                            NX, sizeof(vector[0]));
+starpu_vector_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)vector, NX, sizeof(vector[0]));
 
 /* Partition the vector in PARTS sub-vectors */
 struct starpu_data_filter f =
@@ -428,8 +427,8 @@ struct starpu_codelet cl_switch =
 };
 
 ret = starpu_task_insert(&cl_switch, STARPU_RW, handle,
-			STARPU_W, vert_handle[0], 
-			STARPU_W, vert_handle[1], 
+			STARPU_W, vert_handle[0],
+			STARPU_W, vert_handle[1],
 			0);
 \endcode
 
@@ -510,8 +509,7 @@ and attaches them as reduction methods for its handle <c>dtq</c>:
 
 \code{.c}
 starpu_variable_data_register(&dtq_handle, -1, NULL, sizeof(type));
-starpu_data_set_reduction_methods(dtq_handle,
-        &accumulate_variable_cl, &bzero_variable_cl);
+starpu_data_set_reduction_methods(dtq_handle, &accumulate_variable_cl, &bzero_variable_cl);
 \endcode
 
 and <c>dtq_handle</c> can now be used in mode ::STARPU_REDUX for the
@@ -554,8 +552,7 @@ with a new reduction:
 for (i = 0; i < 100; i++)
 {
     starpu_mpi_task_insert(MPI_COMM_WORLD, &init_res, STARPU_W, res, 0);
-    starpu_mpi_task_insert(MPI_COMM_WORLD, &work, STARPU_RW, A,
-               STARPU_R, B, STARPU_REDUX, res, 0);
+    starpu_mpi_task_insert(MPI_COMM_WORLD, &work, STARPU_RW, A, STARPU_R, B, STARPU_REDUX, res, 0);
     starpu_mpi_redux_data(MPI_COMM_WORLD, res);
     starpu_mpi_task_insert(MPI_COMM_WORLD, &work2, STARPU_RW, B, STARPU_R, res, 0);
 }
@@ -573,22 +570,10 @@ the ::STARPU_COMMUTE data access flag. Accesses without this flag will however
 properly be serialized against accesses with this flag. For instance:
 
 \code{.c}
-    starpu_task_insert(&cl1,
-        STARPU_R, h,
-        STARPU_RW, handle,
-        0);
-    starpu_task_insert(&cl2,
-        STARPU_R, handle1,
-        STARPU_RW|STARPU_COMMUTE, handle,
-        0);
-    starpu_task_insert(&cl2,
-        STARPU_R, handle2,
-        STARPU_RW|STARPU_COMMUTE, handle,
-        0);
-    starpu_task_insert(&cl3,
-        STARPU_R, g,
-        STARPU_RW, handle,
-        0);
+    starpu_task_insert(&cl1, STARPU_R, h, STARPU_RW, handle, 0);
+    starpu_task_insert(&cl2, STARPU_R, handle1, STARPU_RW|STARPU_COMMUTE, handle, 0);
+    starpu_task_insert(&cl2, STARPU_R, handle2, STARPU_RW|STARPU_COMMUTE, handle, 0);
+    starpu_task_insert(&cl3, STARPU_R, g, STARPU_RW, handle, 0);
 \endcode
 
 The two tasks running <c>cl2</c> will be able to commute: depending on whether the
@@ -680,8 +665,7 @@ memory for it, and StarPU will allocate it on demand at task execution.
 \code{.c}
 starpu_vector_data_register(&workspace, -1, 0, sizeof(float));
 for (i = 0; i < N; i++)
-    starpu_task_insert(&compute, STARPU_R, input[i],
-                       STARPU_SCRATCH, workspace, STARPU_W, output[i], 0);
+    starpu_task_insert(&compute, STARPU_R, input[i], STARPU_SCRATCH, workspace, STARPU_W, output[i], 0);
 \endcode
 
 StarPU will make sure that the buffer is allocated before executing the task,

+ 12 - 12
doc/doxygen/chapters/330_scheduling_contexts.doxy

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011-2013,2016                           Inria
- * Copyright (C) 2010-2017                                CNRS
+ * Copyright (C) 2010-2018                                CNRS
  * Copyright (C) 2009-2011,2014                           Université de Bordeaux
  * Copyright (C) 2016                                     Uppsala University
  *
@@ -113,7 +113,7 @@ The contexts can also be used to group set of SMs of an NVIDIA GPU in order to i
 the parallel kernels and allow them to coexecution on a specified partiton of the GPU.
 
 Each context will be mapped to a stream and the user can indicate the number of SMs.
-The context can be added to a larger context already grouping CPU cores. 
+The context can be added to a larger context already grouping CPU cores.
 This larger context can use a scheduling policy that assigns tasks to both CPUs and contexts (partitions of the GPU)
 based on performance models adjusted to the number of SMs.
 
@@ -132,7 +132,7 @@ int workers[ncpus+nstreams];
 workers[ncpus+0] = stream_workerids[0];
 workers[ncpus+1] = stream_workerids[1];
 
-big_sched_ctx = starpu_sched_ctx_create(workers, ncpus+nstreams, "ctx1", STARPU_SCHED_CTX_SUB_CTXS, sched_ctxs, nstreams, STARPU_SCHED_CTX_POLICY_NAME, "dmdas", 0); 
+big_sched_ctx = starpu_sched_ctx_create(workers, ncpus+nstreams, "ctx1", STARPU_SCHED_CTX_SUB_CTXS, sched_ctxs, nstreams, STARPU_SCHED_CTX_POLICY_NAME, "dmdas", 0);
 
 starpu_task_submit_to_ctx(task, big_sched_ctx);
 
@@ -160,25 +160,25 @@ starpu_sched_ctx_remove_workers(workerids, 3, sched_ctx1);
 \endcode
 
 \section SubmittingTasksToAContext Submitting Tasks To A Context
-The application may submit tasks to several contexts either 
+The application may submit tasks to several contexts either
 simultaneously or sequnetially. If several threads of submission
 are used the function starpu_sched_ctx_set_context() may be called just
-before starpu_task_submit(). Thus StarPU considers that 
+before starpu_task_submit(). Thus StarPU considers that
 the current thread will submit tasks to the coresponding context.
- 
+
 When the application may not assign a thread of submission to each
 context, the id of the context must be indicated by using the
-function starpu_task_submit_to_ctx() or the field \ref STARPU_SCHED_CTX 
+function starpu_task_submit_to_ctx() or the field \ref STARPU_SCHED_CTX
 for starpu_task_insert().
 
 \section DeletingAContext Deleting A Context
 
 When a context is no longer needed it must be deleted. The application
 can indicate which context should keep the resources of a deleted one.
-All the tasks of the context should be executed before doing this. 
-Thus, the programmer may use either a barrier and then delete the context 
+All the tasks of the context should be executed before doing this.
+Thus, the programmer may use either a barrier and then delete the context
 directly, or just indicate
-that other tasks will not be submitted later on to the context (such that when 
+that other tasks will not be submitted later on to the context (such that when
 the last task is executed its workers will be moved to the inheritor)
 and delete the context at the end of the execution (when a barrier will
 be used eventually).
@@ -212,8 +212,8 @@ A context may have no resources at the begining or at a certain
 moment of the execution. Task can still be submitted to these contexts
 and they will be executed as soon as the contexts will have resources. A list
 of tasks pending to be executed is kept and when workers are added to
-the contexts these tasks start being submitted. However, if resources 
-are never allocated to the context the program will not terminate. 
+the contexts these tasks start being submitted. However, if resources
+are never allocated to the context the program will not terminate.
 If these tasks have low
 priority the programmer can forbid the application to submit them
 by calling the function starpu_sched_ctx_stop_task_submission().

+ 15 - 31
doc/doxygen/chapters/350_modularized_scheduler.doxy

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2013                                     Inria
- * Copyright (C) 2014,2016-2017                           CNRS
+ * Copyright (C) 2014,2016-2018                           CNRS
  * Copyright (C) 2014,2017                                Université de Bordeaux
  * Copyright (C) 2013                                     Simon Archipoff
  *
@@ -204,23 +204,19 @@ static void initialize_eager_prefetching_center_policy(unsigned sched_ctx_id)
     (sched_ctx_id, STARPU_WORKER_LIST);
 
   /* Create the Scheduling Tree */
-  struct starpu_sched_tree * t =
-    starpu_sched_tree_create(sched_ctx_id);
+  struct starpu_sched_tree * t = starpu_sched_tree_create(sched_ctx_id);
 
   /* The Root Component is a Flow-control Fifo Component */
    t->root = starpu_sched_component_fifo_create(NULL);
 
   /* The Resource-mapping Component of the strategy is an Eager Component
    */
-  struct starpu_sched_component * eager_component =
-    starpu_sched_component_eager_create(NULL);
+  struct starpu_sched_component *eager_component = starpu_sched_component_eager_create(NULL);
 
   /* Create links between Components : the Eager Component is the child
    * of the Root Component */
-  t->root->add_child
-    (t->root, eager_component);
-  eager_component->add_father
-    (eager_component, t->root);
+  t->root->add_child(t->root, eager_component);
+  eager_component->add_father(eager_component, t->root);
 
   /* A task threshold is set for the Flow-control Components which will
    * be connected to Worker Components. By doing so, this Modularized
@@ -233,44 +229,32 @@ static void initialize_eager_prefetching_center_policy(unsigned sched_ctx_id)
   };
 
   unsigned i;
-  for(i = 0;
-    i < starpu_worker_get_count() +
-    starpu_combined_worker_get_count();
-    i++)
+  for(i = 0; i < starpu_worker_get_count() + starpu_combined_worker_get_count(); i++)
   {
     /* Each Worker Component has a Flow-control Fifo Component as
      * father */
-    struct starpu_sched_component * worker_component =
-	  starpu_sched_component_worker_new(i);
-    struct starpu_sched_component * fifo_component =
-	  starpu_sched_component_fifo_create(&fifo_data);
-    fifo_component->add_child
-      (fifo_component, worker_component);
-    worker_component->add_father
-      (worker_component, fifo_component);
+    struct starpu_sched_component * worker_component = starpu_sched_component_worker_new(i);
+    struct starpu_sched_component * fifo_component = starpu_sched_component_fifo_create(&fifo_data);
+    fifo_component->add_child(fifo_component, worker_component);
+    worker_component->add_father(worker_component, fifo_component);
 
     /* Each Flow-control Fifo Component associated to a Worker
      * Component is linked to the Eager Component as one of its
      * children */
-    eager_component->add_child
-      (eager_component, fifo_component);
-    fifo_component->add_father
-      (fifo_component, eager_component);
+    eager_component->add_child(eager_component, fifo_component);
+    fifo_component->add_father(fifo_component, eager_component);
   }
 
   starpu_sched_tree_update_workers(t);
-  starpu_sched_ctx_set_policy_data
-    (sched_ctx_id, (void*)t);
+  starpu_sched_ctx_set_policy_data(sched_ctx_id, (void*)t);
 }
 
 /* Properly destroy the Scheduling Tree and all its Components */
 static void deinitialize_eager_prefetching_center_policy(unsigned sched_ctx_id)
 {
-  struct starpu_sched_tree * tree =
-  	(struct starpu_sched_tree*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
+  struct starpu_sched_tree * tree = (struct starpu_sched_tree*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
   starpu_sched_tree_destroy(tree);
-  starpu_sched_ctx_delete_worker_collection
-    (sched_ctx_id);
+  starpu_sched_ctx_delete_worker_collection(sched_ctx_id);
 }
 
 /* Initializing the starpu_sched_policy struct associated to the Modularized

+ 5 - 8
doc/doxygen/chapters/370_online_performance_tools.doxy

@@ -1,8 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2017                                CNRS
+ * Copyright (C) 2010-2018                                CNRS
  * Copyright (C) 2011-2012,2016                           Inria
- * Copyright (C) 2009-2011,2014,2016, 2018                      Université de Bordeaux
+ * Copyright (C) 2009-2011,2014,2016, 2018                Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -313,12 +313,9 @@ for (worker = 0; worker < starpu_worker_get_count(); worker++)
         starpu_worker_get_name(worker, workername, 128);
         fprintf(stderr, "Worker %s:\n", workername);
         fprintf(stderr, "\ttotal time: %.2lf ms\n", total_time*1e-3);
-        fprintf(stderr, "\texec time: %.2lf ms (%.2f %%)\n",
-                executing_time*1e-3, executing_ratio);
-        fprintf(stderr, "\tblocked time: %.2lf ms (%.2f %%)\n",
-                sleeping_time*1e-3, sleeping_ratio);
-        fprintf(stderr, "\toverhead time: %.2lf ms (%.2f %%)\n",
-                overhead_time*1e-3, overhead_ratio);
+        fprintf(stderr, "\texec time: %.2lf ms (%.2f %%)\n", executing_time*1e-3, executing_ratio);
+        fprintf(stderr, "\tblocked time: %.2lf ms (%.2f %%)\n", sleeping_time*1e-3, sleeping_ratio);
+        fprintf(stderr, "\toverhead time: %.2lf ms (%.2f %%)\n", overhead_time*1e-3, overhead_ratio);
 }
 \endcode
 

+ 2 - 3
doc/doxygen/chapters/390_faq.doxy

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2017                                CNRS
+ * Copyright (C) 2010-2018                                CNRS
  * Copyright (C) 2009-2011,2014,2016-2017                 Université de Bordeaux
  * Copyright (C) 2011-2012                                Inria
  *
@@ -159,8 +159,7 @@ for (workerid = 0; workerid < starpu_worker_get_count(); workerid++)
 cudaGraphicsResourceGetMappedPointer((void**)&output, &num_bytes, resource);
 
 /* And register it to StarPU */
-starpu_vector_data_register(&handle, starpu_worker_get_memory_node(workerid),
-                            output, num_bytes / sizeof(float4), sizeof(float4));
+starpu_vector_data_register(&handle, starpu_worker_get_memory_node(workerid), output, num_bytes / sizeof(float4), sizeof(float4));
 
 /* The handle can now be used as usual */
 starpu_task_insert(&cl, STARPU_RW, handle, 0);

+ 3 - 3
doc/doxygen/chapters/401_out_of_core.doxy

@@ -1,8 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2013-2014,2016-2017                      CNRS
+ * Copyright (C) 2013-2014,2016-2018                      CNRS
  * Copyright (C) 2013                                     Inria
- * Copyright (C) 2013-2014,2017-2018                           Université de Bordeaux
+ * Copyright (C) 2013-2014,2017-2018                      Université de Bordeaux
  * Copyright (C) 2013                                     Corentin Salingue
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -51,7 +51,7 @@ automatically read and write data as appropriate.
 To use a disk memory node, you have to register it with this function:
 
 \code{.c}
-	int new_dd = starpu_disk_register(&starpu_disk_unistd_ops, (void *) "/tmp/", 1024*1024*200);
+int new_dd = starpu_disk_register(&starpu_disk_unistd_ops, (void *) "/tmp/", 1024*1024*200);
 \endcode
 
 Here, we use the unistd library to realize the read/write operations, i.e.

+ 73 - 78
doc/doxygen/chapters/410_mpi_support.doxy

@@ -38,31 +38,31 @@ initializes a token on node 0, and the token is passed from node to node,
 incremented by one on each step. The code is not using StarPU yet.
 
 \code{.c}
-    for (loop = 0; loop < nloops; loop++)
-    {
-        int tag = loop*size + rank;
+for (loop = 0; loop < nloops; loop++)
+{
+    int tag = loop*size + rank;
 
-        if (loop == 0 && rank == 0)
-        {
-            token = 0;
-            fprintf(stdout, "Start with token value %d\n", token);
-        }
-        else
-        {
-            MPI_Recv(&token, 1, MPI_INT, (rank+size-1)%size, tag, MPI_COMM_WORLD);
-        }
+    if (loop == 0 && rank == 0)
+    {
+        token = 0;
+        fprintf(stdout, "Start with token value %d\n", token);
+    }
+    else
+    {
+        MPI_Recv(&token, 1, MPI_INT, (rank+size-1)%size, tag, MPI_COMM_WORLD);
+    }
 
-        token++;
+    token++;
 
-        if (loop == last_loop && rank == last_rank)
-        {
-            fprintf(stdout, "Finished: token value %d\n", token);
-        }
-        else
-        {
-            MPI_Send(&token, 1, MPI_INT, (rank+1)%size, tag+1, MPI_COMM_WORLD);
-        }
+    if (loop == last_loop && rank == last_rank)
+    {
+        fprintf(stdout, "Finished: token value %d\n", token);
     }
+    else
+    {
+        MPI_Send(&token, 1, MPI_INT, (rank+1)%size, tag+1, MPI_COMM_WORLD);
+    }
+}
 \endcode
 
 \section NotUsingMPISupport About not using the MPI support
@@ -73,39 +73,39 @@ execution to StarPU.  This is possible by just using starpu_data_acquire(), for
 instance:
 
 \code{.c}
-    for (loop = 0; loop < nloops; loop++)
-    {
-        int tag = loop*size + rank;
+for (loop = 0; loop < nloops; loop++)
+{
+    int tag = loop*size + rank;
 
-	/* Acquire the data to be able to write to it */
-	starpu_data_acquire(token_handle, STARPU_W);
-        if (loop == 0 && rank == 0)
-        {
-            token = 0;
-            fprintf(stdout, "Start with token value %d\n", token);
-        }
-        else
-        {
-            MPI_Recv(&token, 1, MPI_INT, (rank+size-1)%size, tag, MPI_COMM_WORLD);
-        }
+    /* Acquire the data to be able to write to it */
+    starpu_data_acquire(token_handle, STARPU_W);
+    if (loop == 0 && rank == 0)
+    {
+        token = 0;
+        fprintf(stdout, "Start with token value %d\n", token);
+    }
+    else
+    {
+        MPI_Recv(&token, 1, MPI_INT, (rank+size-1)%size, tag, MPI_COMM_WORLD);
+    }
 	starpu_data_release(token_handle);
 
-        /* Task delegation to StarPU to increment the token. The execution might
-         * be performed on a CPU, a GPU, etc. */
-        increment_token();
+    /* Task delegation to StarPU to increment the token. The execution might
+     * be performed on a CPU, a GPU, etc. */
+    increment_token();
 
-	/* Acquire the update data to be able to read from it */
-	starpu_data_acquire(token_handle, STARPU_R);
-        if (loop == last_loop && rank == last_rank)
-        {
-            fprintf(stdout, "Finished: token value %d\n", token);
-        }
-        else
-        {
-            MPI_Send(&token, 1, MPI_INT, (rank+1)%size, tag+1, MPI_COMM_WORLD);
-        }
-	starpu_data_release(token_handle);
+    /* Acquire the update data to be able to read from it */
+    starpu_data_acquire(token_handle, STARPU_R);
+    if (loop == last_loop && rank == last_rank)
+    {
+        fprintf(stdout, "Finished: token value %d\n", token);
     }
+    else
+    {
+        MPI_Send(&token, 1, MPI_INT, (rank+1)%size, tag+1, MPI_COMM_WORLD);
+    }
+	starpu_data_release(token_handle);
+}
 \endcode
 
 In that case, <c>libstarpumpi</c> is not needed. One can also use <c>MPI_Isend()</c> and
@@ -167,8 +167,7 @@ int main(int argc, char **argv)
         }
         else
         {
-            starpu_mpi_irecv_detached(token_handle, (rank+size-1)%size, tag,
-                    MPI_COMM_WORLD, NULL, NULL);
+            starpu_mpi_irecv_detached(token_handle, (rank+size-1)%size, tag, MPI_COMM_WORLD, NULL, NULL);
         }
 
         increment_token();
@@ -181,8 +180,7 @@ int main(int argc, char **argv)
         }
         else
         {
-            starpu_mpi_isend_detached(token_handle, (rank+1)%size, tag+1,
-                    MPI_COMM_WORLD, NULL, NULL);
+            starpu_mpi_isend_detached(token_handle, (rank+1)%size, tag+1, MPI_COMM_WORLD, NULL, NULL);
         }
     }
 
@@ -316,14 +314,12 @@ static int complex_pack_data(starpu_data_handle_t handle, unsigned node, void **
 {
   STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
 
-  struct starpu_complex_interface *complex_interface =
-    (struct starpu_complex_interface *) starpu_data_get_interface_on_node(handle, node);
+  struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *) starpu_data_get_interface_on_node(handle, node);
 
   *count = complex_get_size(handle);
   starpu_malloc_flags(ptr, *count, 0);
   memcpy(*ptr, complex_interface->real, complex_interface->nx*sizeof(double));
-  memcpy(*ptr+complex_interface->nx*sizeof(double), complex_interface->imaginary,
-         complex_interface->nx*sizeof(double));
+  memcpy(*ptr+complex_interface->nx*sizeof(double), complex_interface->imaginary, complex_interface->nx*sizeof(double));
 
   return 0;
 }
@@ -332,12 +328,10 @@ static int complex_unpack_data(starpu_data_handle_t handle, unsigned node, void
 {
   STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
 
-  struct starpu_complex_interface *complex_interface =
-    (struct starpu_complex_interface *)	starpu_data_get_interface_on_node(handle, node);
+  struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *) starpu_data_get_interface_on_node(handle, node);
 
   memcpy(complex_interface->real, ptr, complex_interface->nx*sizeof(double));
-  memcpy(complex_interface->imaginary, ptr+complex_interface->nx*sizeof(double),
-         complex_interface->nx*sizeof(double));
+  memcpy(complex_interface->imaginary, ptr+complex_interface->nx*sizeof(double), complex_interface->nx*sizeof(double));
 
   return 0;
 }
@@ -369,8 +363,7 @@ void starpu_complex_interface_datatype_allocate(starpu_data_handle_t handle, MPI
 	MPI_Aint displacements[2];
 	MPI_Datatype types[2] = {MPI_DOUBLE, MPI_DOUBLE};
 
-	struct starpu_complex_interface *complex_interface =
-          (struct starpu_complex_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+	struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 
 	MPI_Address(complex_interface, displacements);
 	MPI_Address(&complex_interface->imaginary, displacements+1);
@@ -461,13 +454,11 @@ data which will be needed by the tasks that we will execute.
             int mpi_rank = my_distrib(x, y, size);
             if (mpi_rank == my_rank)
                 /* Owning data */
-                starpu_variable_data_register(&data_handles[x][y], STARPU_MAIN_RAM,
-                                              (uintptr_t)&(matrix[x][y]), sizeof(unsigned));
+                starpu_variable_data_register(&data_handles[x][y], STARPU_MAIN_RAM, (uintptr_t)&(matrix[x][y]), sizeof(unsigned));
             else if (my_rank == my_distrib(x+1, y, size) || my_rank == my_distrib(x-1, y, size)
                   || my_rank == my_distrib(x, y+1, size) || my_rank == my_distrib(x, y-1, size))
                 /* I don't own this index, but will need it for my computations */
-                starpu_variable_data_register(&data_handles[x][y], -1,
-                                              (uintptr_t)NULL, sizeof(unsigned));
+                starpu_variable_data_register(&data_handles[x][y], -1, (uintptr_t)NULL, sizeof(unsigned));
             else
                 /* I know it's useless to allocate anything for this */
                 data_handles[x][y] = NULL;
@@ -600,7 +591,8 @@ can just pass NULL to starpu_mpi_task_insert():
 
 \code{.c}
 starpu_data_handle_t data0 = NULL;
-if (rank == 0) {
+if (rank == 0)
+{
 	starpu_variable_data_register(&data0, STARPU_MAIN_RAM, (uintptr_t) &val0, sizeof(val0));
 	starpu_mpi_data_register(data0, 0, rank);
 }
@@ -615,12 +607,15 @@ data1 to node 0, which owns data and thus will need the value of data1 to execut
 
 \code{.c}
 starpu_data_handle_t data0 = NULL, data1, data;
-if (rank == 0) {
+if (rank == 0)
+{
 	starpu_variable_data_register(&data0, STARPU_MAIN_RAM, (uintptr_t) &val0, sizeof(val0));
 	starpu_mpi_data_register(data0, -1, rank);
 	starpu_variable_data_register(&data1, -1, 0, sizeof(val1));
 	starpu_variable_data_register(&data, STARPU_MAIN_RAM, (uintptr_t) &val, sizeof(val));
-} else if (rank == 1) {
+}
+else if (rank == 1)
+{
 	starpu_variable_data_register(&data1, STARPU_MAIN_RAM, (uintptr_t) &val1, sizeof(val1));
 	starpu_variable_data_register(&data, -1, 0, sizeof(val));
 }
@@ -641,10 +636,13 @@ starpu_variable_data_register(&pernode, -1, 0, sizeof(val));
 starpu_mpi_data_register(pernode, -1, STARPU_MPI_PER_NODE);
 
 /* Normal data: one on node0, one on node1 */
-if (rank == 0) {
+if (rank == 0)
+{
 	starpu_variable_data_register(&data0, STARPU_MAIN_RAM, (uintptr_t) &val0, sizeof(val0));
 	starpu_variable_data_register(&data1, -1, 0, sizeof(val1));
-} else if (rank == 1) {
+}
+else if (rank == 1)
+{
 	starpu_variable_data_register(&data0, -1, 0, sizeof(val1));
 	starpu_variable_data_register(&data1, STARPU_MAIN_RAM, (uintptr_t) &val1, sizeof(val1));
 }
@@ -744,8 +742,7 @@ migrate the data, and register the new location.
                   || my_rank == my_distrib(x+1, y, size) || my_rank == my_distrib(x-1, y, size)
                   || my_rank == my_distrib(x, y+1, size) || my_rank == my_distrib(x, y-1, size)))
                 /* Register newly-needed data */
-                starpu_variable_data_register(&data_handles[x][y], -1,
-                                              (uintptr_t)NULL, sizeof(unsigned));
+                starpu_variable_data_register(&data_handles[x][y], -1, (uintptr_t)NULL, sizeof(unsigned));
             if (data_handles[x][y])
 	    {
                 /* Migrate the data */
@@ -808,14 +805,12 @@ for(x = 0; x < nblocks ;  x++)
     int mpi_rank = my_distrib(x, nodes);
     if (rank == root)
     {
-        starpu_vector_data_register(&data_handles[x], STARPU_MAIN_RAM, (uintptr_t)vector[x],
-                                    blocks_size, sizeof(float));
+        starpu_vector_data_register(&data_handles[x], STARPU_MAIN_RAM, (uintptr_t)vector[x], blocks_size, sizeof(float));
     }
     else if ((mpi_rank == rank) || ((rank == mpi_rank+1 || rank == mpi_rank-1)))
     {
         /* I own this index, or i will need it for my computations */
-        starpu_vector_data_register(&data_handles[x], -1, (uintptr_t)NULL,
-                                   block_size, sizeof(float));
+        starpu_vector_data_register(&data_handles[x], -1, (uintptr_t)NULL, block_size, sizeof(float));
     }
     else
     {

+ 10 - 25
doc/doxygen/chapters/440_c_extensions.doxy

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2017                                CNRS
+ * Copyright (C) 2010-2018                                CNRS
  * Copyright (C) 2009-2011,2014-2015                      Université de Bordeaux
  * Copyright (C) 2011-2012                                Inria
  *
@@ -126,20 +126,15 @@ Here is an example:
 \code{.c}
 #define __output  __attribute__ ((output))
 
-static void matmul (const float *A, const float *B,
-                    __output float *C,
-                    unsigned nx, unsigned ny, unsigned nz)
+static void matmul (const float *A, const float *B, __output float *C, unsigned nx, unsigned ny, unsigned nz)
   __attribute__ ((task));
 
-static void matmul_cpu (const float *A, const float *B,
-                        __output float *C,
-                        unsigned nx, unsigned ny, unsigned nz)
+static void matmul_cpu (const float *A, const float *B, __output float *C, unsigned nx, unsigned ny, unsigned nz)
   __attribute__ ((task_implementation ("cpu", matmul)));
 
 
 static void
-matmul_cpu (const float *A, const float *B, __output float *C,
-            unsigned nx, unsigned ny, unsigned nz)
+matmul_cpu (const float *A, const float *B, __output float *C, unsigned nx, unsigned ny, unsigned nz)
 {
   unsigned i, j, k;
 
@@ -165,16 +160,12 @@ implementation.  Thus, the above snippet can be simplified like this:
 \code{.c}
 #define __output  __attribute__ ((output))
 
-static void matmul (const float *A, const float *B,
-                    __output float *C,
-                    unsigned nx, unsigned ny, unsigned nz)
+static void matmul (const float *A, const float *B, __output float *C, unsigned nx, unsigned ny, unsigned nz)
   __attribute__ ((task));
 
 /* Implicit definition of the CPU implementation of the
    `matmul' task.  */
-static void
-matmul (const float *A, const float *B, __output float *C,
-        unsigned nx, unsigned ny, unsigned nz)
+static void matmul (const float *A, const float *B, __output float *C, unsigned nx, unsigned ny, unsigned nz)
 {
   unsigned i, j, k;
 
@@ -194,12 +185,10 @@ the code is valid sequential code when StarPU's GCC plug-in is not used
 CUDA and OpenCL implementations can be declared in a similar way:
 
 \code{.c}
-static void matmul_cuda (const float *A, const float *B, float *C,
-                         unsigned nx, unsigned ny, unsigned nz)
+static void matmul_cuda (const float *A, const float *B, float *C, unsigned nx, unsigned ny, unsigned nz)
   __attribute__ ((task_implementation ("cuda", matmul)));
 
-static void matmul_opencl (const float *A, const float *B, float *C,
-                           unsigned nx, unsigned ny, unsigned nz)
+static void matmul_opencl (const float *A, const float *B, float *C, unsigned nx, unsigned ny, unsigned nz)
   __attribute__ ((task_implementation ("opencl", matmul)));
 \endcode
 
@@ -209,13 +198,9 @@ written in CUDA or OpenCL (for similar code, \ref CUDAKernel, and
 OpenCL under the hood, such as CUBLAS functions:
 
 \code{.c}
-static void
-matmul_cuda (const float *A, const float *B, float *C,
-             unsigned nx, unsigned ny, unsigned nz)
+static void matmul_cuda (const float *A, const float *B, float *C, unsigned nx, unsigned ny, unsigned nz)
 {
-  cublasSgemm ('n', 'n', nx, ny, nz,
-               1.0f, A, 0, B, 0,
-               0.0f, C, 0);
+  cublasSgemm ('n', 'n', nx, ny, nz, 1.0f, A, 0, B, 0, 0.0f, C, 0);
   cudaStreamSynchronize (starpu_cuda_get_local_stream ());
 }
 \endcode

+ 9 - 1
doc/doxygen/chapters/470_simgrid.doxy

@@ -19,7 +19,7 @@
 /*! \page SimGridSupport SimGrid Support
 
 StarPU can use Simgrid in order to simulate execution on an arbitrary
-platform. This was tested with simgrid from 3.11 to 3.16, and 3.18 to 3.19.
+platform. This was tested with simgrid from 3.11 to 3.16, and 3.18 to 3.20.
 Other versions may have compatibility issues. 3.17 notably does not build at
 all.
 
@@ -180,6 +180,14 @@ application running with simgrid, pass the <c>--cfg=contexts/factory:thread</c>
 option to the application, to make simgrid use system threads, which gdb will be
 able to manipulate as usual.
 
+It is also worth noting Simgrid 3.21's new parameter
+<c>--cfg=simix/breakpoint</c>, which makes it possible to set a breakpoint at a
+precise (deterministic!) timing of the execution. If for instance in an
+execution trace we see that something odd is happening at time 19000ms, we can
+use <c>--cfg=simix/breakpoint:19.000</c> and SIGTRAP will be raised at that
+point, which will thus interrupt execution within gdb, allowing one to inspect
+e.g. the scheduler state.
+
 \section SimulationMemoryUsage Memory Usage
 
 Since kernels are not actually run and data transfers are not actually

+ 4 - 3
doc/doxygen/chapters/490_clustering_a_machine.doxy

@@ -1,8 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2015-2017                                CNRS
+ * Copyright (C) 2015-2018                                CNRS
  * Copyright (C) 2015-2016                                Inria
- * Copyright (C) 2015, 2018                                     Université de Bordeaux
+ * Copyright (C) 2015, 2018                               Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -64,7 +64,7 @@ struct starpu_cluster_machine *clusters;
 clusters = starpu_cluster_machine(HWLOC_OBJ_SOCKET, 0);
 starpu_cluster_print(clusters);
 
-//... submit some tasks with OpenMP computations 
+//... submit some tasks with OpenMP computations
 
 starpu_uncluster_machine(clusters);
 //... we are back in the default starpu state
@@ -111,6 +111,7 @@ and create the aforementioned OpenMP threads constrained in the cluster's
 resources set:
 \code{.c}
 void starpu_openmp_prologue(void * sched_ctx_id)
+{
   int sched_ctx = *(int*)sched_ctx_id;
   int *cpuids = NULL;
   int ncpuids = 0;

+ 10 - 8
doc/doxygen/chapters/501_environment_variables.doxy

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2011-2013,2015-2017                      Inria
  * Copyright (C) 2010-2018                                CNRS
- * Copyright (C) 2009-2011,2013-2017                      Université de Bordeaux
+ * Copyright (C) 2009-2011,2013-2018                      Université de Bordeaux
  * Copyright (C) 2016                                     Uppsala University
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -66,19 +66,21 @@ which will be concurrently running on the devices. The default value is 1.
 <dd>
 \anchor STARPU_CUDA_THREAD_PER_WORKER
 \addindex __env__STARPU_CUDA_THREAD_PER_WORKER
-Specify if the cuda driver should provide a thread per stream or a single thread
-dealing with all the streams. 0 if one thread per stream, 1 otherwise. The default
-value is 0. Setting it to 1 is contradictory with setting STARPU_CUDA_THREAD_PER_DEV to 1.
+Specify whether the cuda driver should use one thread per stream (1) or use
+a single thread to drive all the streams of the device or all devices (0);
+STARPU_CUDA_THREAD_PER_DEV then determines whether it is one thread per device or one
+thread for all devices. The default value is 0. Setting it to 1 is contradictory
+with setting STARPU_CUDA_THREAD_PER_DEV.
 </dd>
 
 <dt>STARPU_CUDA_THREAD_PER_DEV</dt>
 <dd>
 \anchor STARPU_CUDA_THREAD_PER_DEV
 \addindex __env__STARPU_CUDA_THREAD_PER_DEV
-Specify if the cuda driver should provide a thread per device or a single thread
-dealing with all the devices. 0 if one thread per device, 1 otherwise. The default
-value is 1, unless STARPU_CUDA_THREAD_PER_WORKER is set to 1. Setting it to 1 is
-contradictory with setting STARPU_CUDA_THREAD_PER_WORKER to 1.
+Specify whether the cuda driver should use one thread per device (1) or use a
+single thread to drive all the devices (0). The default value is 1. It does not
+make sense to set this variable if STARPU_CUDA_THREAD_PER_WORKER is set to 1
+(since STARPU_CUDA_THREAD_PER_DEV is then meaningless).
 </dd>
 
 <dt>STARPU_CUDA_PIPELINE</dt>
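
A small sketch (not part of this commit) of one way to pick the
thread-per-device mode described above from within the program, by setting the
variables before starpu_init(); exporting them in the shell is equivalent:

\code{.c}
#include <stdlib.h>
#include <starpu.h>

int main(void)
{
	/* One thread drives all the streams of each CUDA device... */
	setenv("STARPU_CUDA_THREAD_PER_DEV", "1", 1);
	/* ...and no dedicated thread per stream. */
	setenv("STARPU_CUDA_THREAD_PER_WORKER", "0", 1);

	if (starpu_init(NULL) != 0)
		return 1;
	/* ... submit work ... */
	starpu_shutdown();
	return 0;
}
\endcode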

+ 2 - 1
doc/doxygen/chapters/510_configure_options.doxy

@@ -505,7 +505,8 @@ Enable performance model debugging.
 <dd>
 \anchor enable-fxt-lock
 \addindex __configure__--enable-fxt-lock
-Enable additional trace events which describes locks behaviour.
+Enable additional trace events which describe lock behaviour. This is however
+extremely heavy and should only be enabled when debugging the internals of StarPU.
 </dd>
 
 <dt>--enable-maxbuffers</dt>

+ 16 - 1
doc/doxygen/chapters/api/data_interfaces.doxy

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2010-2017                                CNRS
  * Copyright (C) 2011-2014,2017                           Inria
- * Copyright (C) 2009-2011,2014-2017                      Université de Bordeaux
+ * Copyright (C) 2009-2011,2014-2018                      Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -1212,6 +1212,21 @@ the starpu_data_copy_methods::any_to_any copy method, which is provided with \p
 be passed to starpu_interface_copy(). this returns <c>-EAGAIN</c> if the
 transfer is still ongoing, or 0 if the transfer is already completed.
 
+\fn void starpu_interface_start_driver_copy_async(unsigned src_node, unsigned dst_node, double *start)
+\ingroup API_Data_Interfaces
+When an asynchronous implementation of the data transfer is provided, the
+underlying CUDA, OpenCL, etc. call should be surrounded
+by calls to starpu_interface_start_driver_copy_async() and
+starpu_interface_end_driver_copy_async(), so that it is recorded in offline
+execution traces, and the timing of the submission is checked. \p start must
+point to a variable whose value will be passed unchanged to
+starpu_interface_end_driver_copy_async().
+
+\fn void starpu_interface_end_driver_copy_async(unsigned src_node, unsigned dst_node, double start)
+\ingroup API_Data_Interfaces
+See starpu_interface_start_driver_copy_async().
+
+
 \fn uint32_t starpu_hash_crc32c_be_n(const void *input, size_t n, uint32_t inputcrc)
 \ingroup API_Data_Interfaces
 Compute the CRC of a byte buffer seeded by the \p inputcrc
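
To illustrate the pair of functions documented above, a hedged sketch of how an
asynchronous copy method of a custom data interface might wrap its driver call;
copy_custom_async() is a hypothetical helper standing in for the actual CUDA or
OpenCL asynchronous copy:

\code{.c}
static int copy_any_to_any(void *src_interface, unsigned src_node,
                           void *dst_interface, unsigned dst_node, void *async_data)
{
	double start;

	starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
	copy_custom_async(src_interface, dst_interface, async_data); /* hypothetical driver call */
	starpu_interface_end_driver_copy_async(src_node, dst_node, start);

	return -EAGAIN; /* transfer still ongoing (EAGAIN from <errno.h>) */
}
\endcode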

+ 15 - 1
doc/doxygen/chapters/api/insert_task.doxy

@@ -42,7 +42,7 @@ starpu_task::execute_on_a_specific_worker)
 <li> the specific values ::STARPU_VALUE, ::STARPU_CALLBACK,
 ::STARPU_CALLBACK_ARG, ::STARPU_CALLBACK_WITH_ARG, ::STARPU_PRIORITY,
 ::STARPU_TAG, ::STARPU_TAG_ONLY, ::STARPU_FLOPS, ::STARPU_SCHED_CTX, ::STARPU_CL_ARGS, ::STARPU_CL_ARGS_NFREE,
-::STARPU_TASK_DEPS_ARRAY,
+::STARPU_TASK_DEPS_ARRAY, ::STARPU_TASK_COLOR
 followed by the appropriated objects as defined elsewhere.
 </ul>
 
@@ -156,6 +156,11 @@ be followed by a number of tasks, and an array containing these tasks.
 The function starpu_task_declare_deps_array() will be called with the
 given values.
 
+\def STARPU_TASK_COLOR
+\ingroup API_Insert_Task
+Used when calling starpu_task_insert(), must be followed by an integer
+representing a color
+
 \fn void starpu_task_insert_data_make_room(struct starpu_codelet *cl, struct starpu_task *task, int *allocated_buffers, int current_buffer, int room)
 \ingroup API_Insert_Task
 Assuming that there are already \p current_buffer data handles passed to
@@ -230,4 +235,13 @@ starpu_task_insert().
 If some arguments of type ::STARPU_VALUE are given, the parameter
 starpu_task::cl_arg_free will be set to 1.
 
+\fn int starpu_task_set(struct starpu_task *task, struct starpu_codelet *cl, ...)
+\ingroup API_Insert_Task
+Set the given \p task corresponding to \p cl with the following arguments.
+The argument list must be zero-terminated. The arguments
+following the codelet are the same as the ones for the function
+starpu_task_insert().
+If some arguments of type ::STARPU_VALUE are given, the parameter
+starpu_task::cl_arg_free will be set to 1.
+
 */

+ 2 - 5
doc/doxygen/chapters/api/opencl_extensions.doxy

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2017                                CNRS
+ * Copyright (C) 2010-2018                                CNRS
  * Copyright (C) 2009-2011,2014,2016                      Université de Bordeaux
  * Copyright (C) 2011-2012                                Inria
  *
@@ -82,10 +82,7 @@ Here an example:
 int n;
 cl_int err;
 cl_kernel kernel;
-n = starpu_opencl_set_kernel_args(&err, 2, &kernel,
-                                  sizeof(foo), &foo,
-                                  sizeof(bar), &bar,
-                                  0);
+n = starpu_opencl_set_kernel_args(&err, 2, &kernel, sizeof(foo), &foo, sizeof(bar), &bar, 0);
 if (n != 2)
    fprintf(stderr, "Error : %d\n", err);
 \endcode

+ 4 - 7
doc/doxygen/chapters/code/complex.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013,2015,2017                      CNRS
+ * Copyright (C) 2010-2013,2015,2017,2018                 CNRS
  * Copyright (C) 2010-2014                                Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -16,10 +16,7 @@
  */
 
 //! [To be included. You should update doxygen if you see this text.]
-#define STARPU_COMPLEX_GET_REAL(interface)	\
-        (((struct starpu_complex_interface *)(interface))->real)
-#define STARPU_COMPLEX_GET_IMAGINARY(interface)	\
-        (((struct starpu_complex_interface *)(interface))->imaginary)
-#define STARPU_COMPLEX_GET_NX(interface)	\
-        (((struct starpu_complex_interface *)(interface))->nx)
+#define STARPU_COMPLEX_GET_REAL(interface)	(((struct starpu_complex_interface *)(interface))->real)
+#define STARPU_COMPLEX_GET_IMAGINARY(interface)	(((struct starpu_complex_interface *)(interface))->imaginary)
+#define STARPU_COMPLEX_GET_NX(interface)	(((struct starpu_complex_interface *)(interface))->nx)
 //! [To be included. You should update doxygen if you see this text.]

+ 1 - 4
doc/doxygen/chapters/code/disk_compute.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2013,2017                                CNRS
+ * Copyright (C) 2013,2017,2018                           CNRS
  * Copyright (C) 2013                                     Inria
  * Copyright (C) 2014                                     Université de Bordeaux
  * Copyright (C) 2013                                     Corentin Salingue
@@ -79,9 +79,6 @@ int main(int argc, char **argv)
 		C[j] = 0;
 	}
 
-
-
-
 	/* you create a file to store the vector ON the disk */
 	FILE * f = fopen(path_file_start, "wb+");
 	if (f == NULL)

+ 2 - 3
doc/doxygen/chapters/code/matmul_pragma2.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013,2015,2017                      CNRS
+ * Copyright (C) 2010-2013,2015,2017,2018                 CNRS
  * Copyright (C) 2010-2014                                Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -24,6 +24,5 @@
 # define __task
 #endif
 
-static void matmul (const float *A, const float *B, float *C,
-                    unsigned nx, unsigned ny, unsigned nz) __task;
+static void matmul (const float *A, const float *B, float *C, unsigned nx, unsigned ny, unsigned nz) __task;
 //! [To be included. You should update doxygen if you see this text.]

+ 3 - 5
doc/doxygen/chapters/code/scal_pragma.cu

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013,2015,2017                      CNRS
+ * Copyright (C) 2010-2013,2015,2017,2018                 CNRS
  * Copyright (C) 2010-2014,2016                           Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -21,8 +21,7 @@
 #include <starpu.h>
 #include <stdlib.h>
 
-static __global__ void
-vector_mult_cuda (unsigned n, float *val, float factor)
+static __global__ void vector_mult_cuda (unsigned n, float *val, float factor)
 {
   unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
 
@@ -36,8 +35,7 @@ extern "C" void vector_scal_cuda (size_t size, float vector[], float factor)
   unsigned threads_per_block = 64;
   unsigned nblocks = (size + threads_per_block - 1) / threads_per_block;
 
-  vector_mult_cuda <<< nblocks, threads_per_block, 0,
-    starpu_cuda_get_local_stream () >>> (size, vector, factor);
+  vector_mult_cuda <<< nblocks, threads_per_block, 0, starpu_cuda_get_local_stream () >>> (size, vector, factor);
 
   cudaStreamSynchronize (starpu_cuda_get_local_stream ());
 }

+ 3 - 5
doc/doxygen/chapters/code/vector_scal_c.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013,2015,2017                      CNRS
+ * Copyright (C) 2010-2013,2015,2017,2018                 CNRS
  * Copyright (C) 2013                                     Inria
  * Copyright (C) 2010-2014                                Université de Bordeaux
  *
@@ -70,8 +70,7 @@ int main(int argc, char **argv)
     starpu_init(NULL);
 
 #ifdef STARPU_USE_OPENCL
-        starpu_opencl_load_opencl_from_file(
-               "examples/basic_examples/vector_scal_opencl_kernel.cl", &programs, NULL);
+    starpu_opencl_load_opencl_from_file("examples/basic_examples/vector_scal_opencl_kernel.cl", &programs, NULL);
 #endif
 
     /* Tell StaPU to associate the "vector" vector with the "vector_handle"
@@ -88,8 +87,7 @@ int main(int argc, char **argv)
      *  - the fifth argument is the size of each element.
      */
     starpu_data_handle_t vector_handle;
-    starpu_vector_data_register(&vector_handle, STARPU_MAIN_RAM, (uintptr_t)vector,
-                                NX, sizeof(vector[0]));
+    starpu_vector_data_register(&vector_handle, STARPU_MAIN_RAM, (uintptr_t)vector, NX, sizeof(vector[0]));
 
     float factor = 3.14;
 

+ 3 - 5
doc/doxygen/chapters/code/vector_scal_cuda.cu

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013,2015,2017                      CNRS
+ * Copyright (C) 2010-2013,2015,2017,2018                      CNRS
  * Copyright (C) 2010,2014                                Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -17,8 +17,7 @@
 //! [To be included. You should update doxygen if you see this text.]
 #include <starpu.h>
 
-static __global__ void vector_mult_cuda(unsigned n, float *val,
-                                        float factor)
+static __global__ void vector_mult_cuda(unsigned n, float *val, float factor)
 {
         unsigned i =  blockIdx.x*blockDim.x + threadIdx.x;
         if (i < n)
@@ -36,8 +35,7 @@ extern "C" void scal_cuda_func(void *buffers[], void *_args)
         unsigned threads_per_block = 64;
         unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
-        vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>
-	                (n, val, *factor);
+        vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>(n, val, *factor);
 
         cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 4 - 7
doc/doxygen/chapters/code/vector_scal_opencl.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010,2012-2013,2015,2017                 CNRS
+ * Copyright (C) 2010,2012-2013,2015,2017,2018            CNRS
  * Copyright (C) 2011,2014                                Université de Bordeaux
  * Copyright (C) 2010                                     Inria
  *
@@ -38,8 +38,7 @@ void scal_opencl_func(void *buffers[], void *_args)
 	 id = starpu_worker_get_id();
 	 devid = starpu_worker_get_devid(id);
 
-	 err = starpu_opencl_load_kernel(&kernel, &queue,
-					 &programs,
+	 err = starpu_opencl_load_kernel(&kernel, &queue, &programs,
 					 "vector_mult_opencl", /* Name of the codelet */
 					 devid);
 	 if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
@@ -57,13 +56,11 @@ void scal_opencl_func(void *buffers[], void *_args)
         cl_device_id device;
 
         starpu_opencl_get_device(devid, &device);
-        err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE,
-                                        sizeof(local), &local, &s);
+        err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
         if (local > global) local=global;
 
-        err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0,
-                                     NULL, &event);
+        err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
     }
 

+ 3 - 1
examples/basic_examples/vector_scal.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011-2013                                Inria
- * Copyright (C) 2009-2016                                Université de Bordeaux
+ * Copyright (C) 2009-2016,2018                           Université de Bordeaux
  * Copyright (C) 2010-2013,2015,2017                      CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -151,6 +151,7 @@ int main(void)
 	 *  - the fifth argument is the size of each element.
 	 */
 	starpu_data_handle_t vector_handle;
+	starpu_memory_pin(vector, sizeof(vector));
 	starpu_vector_data_register(&vector_handle, STARPU_MAIN_RAM, (uintptr_t)vector, NX, sizeof(vector[0]));
 
 	float factor = 3.14;
@@ -178,6 +179,7 @@ int main(void)
 	/* StarPU does not need to manipulate the array anymore so we can stop
  	 * monitoring it */
 	starpu_data_unregister(vector_handle);
+	starpu_memory_unpin(vector, sizeof(vector));
 
 #ifdef STARPU_USE_OPENCL
         ret = starpu_opencl_unload_opencl(&opencl_program);
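
The hunk above pairs starpu_memory_pin()/starpu_memory_unpin() around the data registration so asynchronous transfers can overlap with computation. A minimal stand-alone sketch of the same pattern (NX and the omitted task submissions are placeholders, not part of the commit):

#include <starpu.h>

#define NX 1024
static float vector[NX];

int main(void)
{
	int ret = starpu_init(NULL);
	if (ret == -ENODEV) return 77; /* skipped: no worker available */

	/* Pin the buffer so the CUDA/OpenCL drivers can transfer it asynchronously */
	starpu_memory_pin(vector, sizeof(vector));

	starpu_data_handle_t vector_handle;
	starpu_vector_data_register(&vector_handle, STARPU_MAIN_RAM, (uintptr_t)vector, NX, sizeof(vector[0]));

	/* ... submit tasks working on vector_handle ... */

	starpu_data_unregister(vector_handle);
	/* Unpin only once the handle is unregistered */
	starpu_memory_unpin(vector, sizeof(vector));

	starpu_shutdown();
	return 0;
}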

+ 2 - 2
examples/filters/frecursive.c

@@ -31,7 +31,7 @@ void cpu_codelet(void *buffers[], void *cl_arg)
         /* local copy of the matrix pointer */
         int *val = (int *)STARPU_MATRIX_GET_PTR(buffers[0]);
 
-	FPRINTF(stderr, "computing on matrix with nx=%d, ny=%d, ld=%d\n", nx, ny, ld);
+	FPRINTF(stderr, "computing on matrix with nx=%u, ny=%u, ld=%u\n", nx, ny, ld);
         for(j=0; j<ny ; j++)
 	{
                 for(i=0; i<nx ; i++)
@@ -58,7 +58,7 @@ int main(void)
 	starpu_data_handle_t subhandle_l1[PARTS];
 	starpu_data_handle_t subhandle_l2[PARTS][PARTS];
 	starpu_data_handle_t subhandle_l3[PARTS][PARTS][PARTS];
-	int ret, submit;
+	int ret;
 
 	int factor = 12;
 	int n=1;

+ 2 - 1
include/fstarpu_mod.f90

@@ -1,6 +1,6 @@
 ! StarPU --- Runtime system for heterogeneous multicore architectures.
 !
-! Copyright (C) 2017                                     CNRS
+! Copyright (C) 2017, 2018                                     CNRS
 ! Copyright (C) 2016-2017                                Inria
 ! Copyright (C) 2016-2017                                Université de Bordeaux
 !
@@ -55,6 +55,7 @@ module fstarpu_mod
         type(c_ptr), bind(C) :: FSTARPU_TAG
         type(c_ptr), bind(C) :: FSTARPU_TAG_ONLY
         type(c_ptr), bind(C) :: FSTARPU_NAME
+        type(c_ptr), bind(C) :: FSTARPU_TASK_COLOR
         type(c_ptr), bind(C) :: FSTARPU_NODE_SELECTION_POLICY
 
         type(c_ptr), bind(C) :: FSTARPU_VALUE

+ 3 - 1
include/starpu_data_interfaces.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011-2014,2016-2017                      Inria
- * Copyright (C) 2009-2016                                Université de Bordeaux
+ * Copyright (C) 2009-2016,2018                           Université de Bordeaux
  * Copyright (C) 2010-2015,2017                           CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -94,6 +94,8 @@ struct starpu_data_copy_methods
 };
 
 int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, void *async_data);
+void starpu_interface_start_driver_copy_async(unsigned src_node, unsigned dst_node, double *start);
+void starpu_interface_end_driver_copy_async(unsigned src_node, unsigned dst_node, double start);
 uintptr_t starpu_malloc_on_node_flags(unsigned dst_node, size_t size, int flags);
 uintptr_t starpu_malloc_on_node(unsigned dst_node, size_t size);
 void starpu_free_on_node_flags(unsigned dst_node, uintptr_t addr, size_t size, int flags);

+ 2 - 0
include/starpu_sched_component.h

@@ -63,6 +63,8 @@ struct starpu_sched_component
 	int (*can_push)(struct starpu_sched_component *from, struct starpu_sched_component *to);
 	int (*can_pull)(struct starpu_sched_component *component);
 
+	int (*notify)(struct starpu_sched_component* component, int message_ID, void* arg);
+
 	double (*estimated_load)(struct starpu_sched_component *component);
 	double (*estimated_end)(struct starpu_sched_component *component);
 

+ 2 - 0
include/starpu_sched_ctx.h

@@ -111,6 +111,8 @@ int starpu_sched_ctx_max_priority_is_set(unsigned sched_ctx_id);
 
 void *starpu_sched_ctx_get_user_data(unsigned sched_ctx_id);
 
+void starpu_sched_ctx_set_user_data(unsigned sched_ctx_id, void* user_data);
+
 struct starpu_worker_collection *starpu_sched_ctx_create_worker_collection(unsigned sched_ctx_id, enum starpu_worker_collection_type type) STARPU_ATTRIBUTE_MALLOC;
 
 void starpu_sched_ctx_delete_worker_collection(unsigned sched_ctx_id);
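
For illustration, a hedged sketch pairing the new starpu_sched_ctx_set_user_data() with the existing starpu_sched_ctx_get_user_data(); the per-context structure and function names are hypothetical:

#include <starpu.h>

struct my_ctx_data
{
	int tasks_seen;
};

/* Attach caller-owned data to an existing scheduling context */
void attach_ctx_data(unsigned sched_ctx_id, struct my_ctx_data *data)
{
	starpu_sched_ctx_set_user_data(sched_ctx_id, data);
}

/* Retrieve it later, e.g. from a scheduling policy hook */
struct my_ctx_data *get_ctx_data(unsigned sched_ctx_id)
{
	return (struct my_ctx_data *) starpu_sched_ctx_get_user_data(sched_ctx_id);
}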

+ 3 - 1
include/starpu_task_util.h

@@ -63,8 +63,10 @@ void starpu_create_sync_task(starpu_tag_t sync_tag, unsigned ndeps, starpu_tag_t
 #define STARPU_CL_ARGS		(25<<STARPU_MODE_SHIFT)
 #define STARPU_CL_ARGS_NFREE	(26<<STARPU_MODE_SHIFT)
 #define STARPU_TASK_DEPS_ARRAY	(27<<STARPU_MODE_SHIFT)
-#define STARPU_SHIFTED_MODE_MAX (28<<STARPU_MODE_SHIFT)
+#define STARPU_TASK_COLOR       (28<<STARPU_MODE_SHIFT)
+#define STARPU_SHIFTED_MODE_MAX (29<<STARPU_MODE_SHIFT)
 
+int starpu_task_set(struct starpu_task *task, struct starpu_codelet *cl, ...);
 struct starpu_task *starpu_task_build(struct starpu_codelet *cl, ...);
 int starpu_task_insert(struct starpu_codelet *cl, ...);
 /* the function starpu_insert_task has the same semantics as starpu_task_insert, it is kept to avoid breaking old codes */
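
A hedged usage sketch for the new STARPU_TASK_COLOR argument: it is decoded as a plain int and stored in task->color (see the starpu_task_insert_utils.c hunk further down). The codelet and handle are assumed to already exist:

#include <starpu.h>

/* assuming 'cl' is a codelet and 'handle' a registered data handle */
int submit_colored_task(struct starpu_codelet *cl, starpu_data_handle_t handle)
{
	return starpu_task_insert(cl,
				  STARPU_R, handle,
				  STARPU_TASK_COLOR, 2, /* plain int, copied into task->color */
				  0);
}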

+ 5 - 1
mpi/src/starpu_mpi_task_insert.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2012,2014,2016-2017                      Inria
- * Copyright (C) 2011-2017                                CNRS
+ * Copyright (C) 2011-2018                                CNRS
  * Copyright (C) 2011-2018                                Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -448,6 +448,10 @@ int _starpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_nod
 		{
 			select_node_policy = va_arg(varg_list_copy, int);
 		}
+		else if (arg_type==STARPU_TASK_COLOR)
+                {
+                        (void)va_arg(varg_list_copy, int);
+                }
 		else
 		{
 			STARPU_ABORT_MSG("Unrecognized argument %d, did you perhaps forget to end arguments with 0?\n", arg_type);

+ 6 - 1
mpi/src/starpu_mpi_task_insert_fortran.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2016-2017                                CNRS
+ * Copyright (C) 2016-2018                                CNRS
  * Copyright (C) 2017-2018                                     Université de Bordeaux
  * Copyright (C) 2016                                     Inria
  *
@@ -293,6 +293,11 @@ int _fstarpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_no
 			arg_i++;
 			/* int* */
 		}
+		else if (arg_type==STARPU_TASK_COLOR)
+		{
+			arg_i++;
+			/* int* */
+		}
 		else
 		{
 			STARPU_ABORT_MSG("Unrecognized argument %d, did you perhaps forget to end arguments with 0?\n", arg_type);

+ 2 - 2
src/common/list.h

@@ -198,7 +198,7 @@
     struct ENAME *_tail; /**< @internal tail of the list */ \
   }; \
   /** @internal */LIST_INLINE struct ENAME *ENAME##_new(void) \
-    { struct ENAME *e; _STARPU_MALLOC_CAST(e, sizeof(struct ENAME), struct ENAME *); \
+    { struct ENAME *e; _STARPU_MALLOC(e, sizeof(struct ENAME)); \
       e->_next = NULL; e->_prev = NULL; return e; } \
   /** @internal */LIST_INLINE void ENAME##_delete(struct ENAME *e) \
     { free(e); } \
@@ -227,7 +227,7 @@
   /** @internal */LIST_INLINE void ENAME##_list_init(struct ENAME##_list *l) \
     { l->_head=NULL; l->_tail=l->_head; } \
   /** @internal */LIST_INLINE struct ENAME##_list *ENAME##_list_new(void) \
-    { struct ENAME##_list *l; _STARPU_MALLOC_CAST(l, sizeof(struct ENAME##_list), struct ENAME##_list *); \
+    { struct ENAME##_list *l; _STARPU_MALLOC(l, sizeof(struct ENAME##_list)); \
       ENAME##_list_init(l); return l; } \
   /** @internal */LIST_INLINE int ENAME##_list_empty(const struct ENAME##_list *l) \
     { return (l->_head == NULL); } \

+ 14 - 5
src/common/utils.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2012,2015,2017                           Inria
- * Copyright (C) 2010-2017                                Université de Bordeaux
+ * Copyright (C) 2010-2018                                Université de Bordeaux
  * Copyright (C) 2010-2018                                CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -125,10 +125,19 @@
 	} while (0)
 
 
-#define _STARPU_MALLOC(ptr, size) do { ptr = malloc(size); STARPU_ASSERT_MSG(ptr != NULL, "Cannot allocate %ld bytes\n", (long) (size)); } while (0)
-#define _STARPU_CALLOC(ptr, nmemb, size) do { ptr = calloc(nmemb, size); STARPU_ASSERT_MSG(ptr != NULL, "Cannot allocate %ld bytes\n", (long) (nmemb*size)); } while (0)
-#define _STARPU_REALLOC(ptr, size) do { void *_new_ptr = realloc(ptr, size); STARPU_ASSERT_MSG(_new_ptr != NULL, "Cannot reallocate %ld bytes\n", (long) (size)); ptr = _new_ptr;} while (0)
-#define _STARPU_MALLOC_CAST(ptr, size, type) do { ptr = (type) malloc(size); STARPU_ASSERT_MSG(ptr != NULL, "Cannot allocate %ld bytes\n", (long) (size)); } while (0)
+#ifdef _MSC_VER
+#  if defined(__cplusplus)
+#    define _STARPU_DECLTYPE(x) (decltype(x))
+#  else
+#    define _STARPU_DECLTYPE(x)
+#  endif
+#else
+#  define _STARPU_DECLTYPE(x) (__typeof(x))
+#endif
+
+#define _STARPU_MALLOC(ptr, size) do { ptr = _STARPU_DECLTYPE(ptr) malloc(size); STARPU_ASSERT_MSG(ptr != NULL, "Cannot allocate %ld bytes\n", (long) (size)); } while (0)
+#define _STARPU_CALLOC(ptr, nmemb, size) do { ptr = _STARPU_DECLTYPE(ptr) calloc(nmemb, size); STARPU_ASSERT_MSG(ptr != NULL, "Cannot allocate %ld bytes\n", (long) (nmemb*size)); } while (0)
+#define _STARPU_REALLOC(ptr, size) do { void *_new_ptr = realloc(ptr, size); STARPU_ASSERT_MSG(_new_ptr != NULL, "Cannot reallocate %ld bytes\n", (long) (size)); ptr = _STARPU_DECLTYPE(ptr) _new_ptr;} while (0)
 
 #ifdef _MSC_VER
 #define _STARPU_IS_ZERO(a) (a == 0.0)
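
The cast produced by _STARPU_DECLTYPE matters because C++ (and hence MSVC in C++ mode) rejects the implicit void*-to-object-pointer conversion that C accepts. A minimal stand-alone sketch of the same trick, independent of StarPU and relying on the GNU __typeof extension (decltype plays that role under a C++ compiler):

#include <stdlib.h>

/* Cast malloc()'s void* to the destination pointer type without naming it */
#define MY_MALLOC(ptr, size) \
	do { ptr = (__typeof(ptr)) malloc(size); } while (0)

int example(void)
{
	double *buf;
	MY_MALLOC(buf, 128 * sizeof(*buf)); /* also compiles as C++ */
	free(buf);
	return 0;
}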

+ 6 - 6
src/core/dependencies/implicit_data_deps.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2012,2016                                Inria
  * Copyright (C) 2010-2018                                Université de Bordeaux
- * Copyright (C) 2010-2013,2015-2017                      CNRS
+ * Copyright (C) 2010-2013,2015-2018                      CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -37,14 +37,14 @@ void _starpu_implicit_data_deps_write_hook(void (*func)(starpu_data_handle_t))
 	write_hook = func;
 }
 
-static void _starpu_add_ghost_dependency(starpu_data_handle_t handle STARPU_ATTRIBUTE_UNUSED, unsigned long previous STARPU_ATTRIBUTE_UNUSED, struct starpu_task *next STARPU_ATTRIBUTE_UNUSED)
+static void _starpu_add_ghost_dependency(starpu_data_handle_t handle, unsigned long previous, struct starpu_task *next)
 {
 	struct _starpu_job *next_job = _starpu_get_job_associated_to_task(next);
 	_starpu_bound_job_id_dep(handle, next_job, previous);
 	STARPU_AYU_ADDDEPENDENCY(previous, handle, next_job->job_id);
 }
 
-static void _starpu_add_dependency(starpu_data_handle_t handle STARPU_ATTRIBUTE_UNUSED, struct starpu_task *previous STARPU_ATTRIBUTE_UNUSED, struct starpu_task *next STARPU_ATTRIBUTE_UNUSED)
+static void _starpu_add_dependency(starpu_data_handle_t handle, struct starpu_task *previous, struct starpu_task *next)
 {
 	_starpu_add_ghost_dependency(handle, _starpu_get_job_associated_to_task(previous)->job_id, next);
 }
@@ -296,11 +296,11 @@ struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_
 					struct starpu_task *sync_task = starpu_task_create();
 					STARPU_ASSERT(sync_task);
 					if (previous_mode == STARPU_REDUX)
-						sync_task->name = "sync_task_redux";
+						sync_task->name = "_starpu_sync_task_redux";
 					else if (mode ==  STARPU_COMMUTE || previous_mode == STARPU_COMMUTE)
-						sync_task->name = "sync_task_commute";
+						sync_task->name = "_starpu_sync_task_commute";
 					else
-						sync_task->name = "sync_task";
+						sync_task->name = "_starpu_sync_task";
 					sync_task->cl = NULL;
 					sync_task->type = post_sync_task->type;
 

+ 13 - 9
src/core/disk.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2015-2017                                CNRS
  * Copyright (C) 2013,2017                                Inria
- * Copyright (C) 2013-2015,2017                           Université de Bordeaux
+ * Copyright (C) 2013-2015,2017-2018                      Université de Bordeaux
  * Copyright (C) 2013                                     Corentin Salingue
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -181,11 +181,12 @@ int _starpu_disk_read(unsigned src_node, unsigned dst_node STARPU_ATTRIBUTE_UNUS
 			channel = NULL;
 		else
 		{
+			double start;
 			channel->event.disk_event.memory_node = src_node;
 
-			_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
+			starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
 			event = disk_register_list[src_node]->functions->async_read(disk_register_list[src_node]->base, obj, buf, offset, size);
-			_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
+			starpu_interface_end_driver_copy_async(src_node, dst_node, start);
 
                         add_async_event(channel, event);
 		}
@@ -210,11 +211,12 @@ int _starpu_disk_write(unsigned src_node STARPU_ATTRIBUTE_UNUSED, unsigned dst_n
 			channel = NULL;
 		else
                 {
+			double start;
 			channel->event.disk_event.memory_node = dst_node;
 
-			_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
+			starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
 			event = disk_register_list[dst_node]->functions->async_write(disk_register_list[dst_node]->base, obj, buf, offset, size);
-        		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
+			starpu_interface_end_driver_copy_async(src_node, dst_node, start);
 
                         add_async_event(channel, event);
 		}
@@ -276,11 +278,12 @@ int _starpu_disk_full_read(unsigned src_node, unsigned dst_node, void *obj, void
 			channel = NULL;
 		else
 		{
+			double start;
 			channel->event.disk_event.memory_node = src_node;
 
-			_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
+			starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
 			event = disk_register_list[src_node]->functions->async_full_read(disk_register_list[src_node]->base, obj, ptr, size, dst_node);
-			_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
+			starpu_interface_end_driver_copy_async(src_node, dst_node, start);
 
                         add_async_event(channel, event);
 		}
@@ -304,11 +307,12 @@ int _starpu_disk_full_write(unsigned src_node STARPU_ATTRIBUTE_UNUSED, unsigned
 			channel = NULL;
 		else
 		{
+			double start;
 			channel->event.disk_event.memory_node = dst_node;
 
-			_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
+			starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
 			event = disk_register_list[dst_node]->functions->async_full_write(disk_register_list[dst_node]->base, obj, ptr, size);
-			_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
+			starpu_interface_end_driver_copy_async(src_node, dst_node, start);
 
                         add_async_event(channel, event);
 		}

+ 2 - 2
src/core/disk_ops/unistd/disk_unistd_global.c

@@ -619,10 +619,10 @@ static void * starpu_unistd_internal_thread(void * arg)
 				starpu_malloc(&buf, work->len);
 				ret = pread(work->fd_src, buf, work->len, work->off_src);
 				STARPU_ASSERT_MSG(ret >= 0, "Reading failed (errno %d)", errno);
-				STARPU_ASSERT_MSG(ret == work->len, "Reading failed (value %ld instead of %ld)", (long)ret, (long)work->len);
+				STARPU_ASSERT_MSG((size_t) ret == work->len, "Reading failed (value %ld instead of %ld)", (long)ret, (long)work->len);
 				ret = pwrite(work->fd_dst, buf, work->len, work->off_dst);
 				STARPU_ASSERT_MSG(ret >= 0, "Writing failed (errno %d)", errno);
-				STARPU_ASSERT_MSG(ret == work->len, "Writing failed (value %ld instead of %ld)", (long)ret, (long)work->len);
+				STARPU_ASSERT_MSG((size_t) ret == work->len, "Writing failed (value %ld instead of %ld)", (long)ret, (long)work->len);
 				starpu_free(buf);
 			}
 			else

+ 3 - 2
src/core/perfmodel/multiple_regression.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2010-2011,2016-2017                      CNRS
  * Copyright (C) 2016-2017                                Inria
- * Copyright (C) 2009-2011,2015-2017                      Université de Bordeaux
+ * Copyright (C) 2009-2011,2015-2018                      Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -67,13 +67,14 @@ static void load_old_calibration(double *mx, double *my, unsigned nparameters, c
 
 	FILE *f=NULL;
 	f = fopen(filepath, "a+");
-	STARPU_ASSERT_MSG(f, "Could not save performance model into the file %s\n", filepath);
+	STARPU_ASSERT_MSG(f, "Could not load performance model from file %s\n", filepath);
 
 	line = fgets(buffer,sizeof(buffer),f);//skipping first line
 	STARPU_ASSERT(line);
 	while((line=fgets(buffer,sizeof(buffer),f))!=NULL)
 	{
 		char *record = strtok(line,",");
+		STARPU_ASSERT_MSG(record, "Could not load performance model from file %s\n", filepath);
 		my[i] = atof(record);
 		record = strtok(NULL,",");
 		int j=0;

+ 1 - 1
src/core/perfmodel/perfmodel.c

@@ -342,7 +342,7 @@ double starpu_task_expected_data_transfer_time(unsigned memory_node, struct star
 	{
 		starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, buffer);
 		enum starpu_data_access_mode mode = STARPU_TASK_GET_MODE(task, buffer);
-		unsigned node = -1;
+		int node = -1;
 		if (task->cl->specific_nodes)
 			node = STARPU_CODELET_GET_NODE(task->cl, buffer);
 		if (node == -1)

+ 7 - 0
src/core/sched_ctx.c

@@ -2649,6 +2649,13 @@ void *starpu_sched_ctx_get_user_data(unsigned sched_ctx_id)
 	return sched_ctx->user_data;
 }
 
+void starpu_sched_ctx_set_user_data(unsigned sched_ctx_id, void* user_data)
+{
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	STARPU_ASSERT(sched_ctx != NULL);
+	sched_ctx->user_data = user_data;
+}
+
 void _starpu_worker_apply_deferred_ctx_changes(void)
 {
 	int workerid = starpu_worker_get_id_check();

+ 5 - 2
src/core/simgrid.c

@@ -562,6 +562,8 @@ void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *j, struct star
 		STARPU_ASSERT_MSG(!_STARPU_IS_ZERO(length) && !isnan(length),
 				"Codelet %s does not have a perfmodel, or is not calibrated enough, please re-run in non-simgrid mode until it is calibrated",
 			_starpu_job_get_model_name(j));
+                /* TODO: option to add variance according to performance model,
+                 * to be able to easily check scheduling robustness */
 	}
 
 	simgrid_task = MSG_task_create(_starpu_job_get_task_name(j),
@@ -910,6 +912,7 @@ int _starpu_simgrid_transfer(size_t size, unsigned src_node, unsigned dst_node,
 	double *computation;
 	double *communication;
 	union _starpu_async_channel_event *event, myevent;
+	double start = 0.;
 
 	_STARPU_CALLOC(hosts, 2, sizeof(*hosts));
 	_STARPU_CALLOC(computation, 2, sizeof(*computation));
@@ -945,7 +948,7 @@ int _starpu_simgrid_transfer(size_t size, unsigned src_node, unsigned dst_node,
 	transfer->next = NULL;
 
 	if (req)
-		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
+		starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
 
 	/* Sleep 10µs for the GPU transfer queueing */
 	if (_starpu_simgrid_queue_malloc_cost())
@@ -955,7 +958,7 @@ int _starpu_simgrid_transfer(size_t size, unsigned src_node, unsigned dst_node,
 
 	if (req)
 	{
-		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
+		starpu_interface_end_driver_copy_async(src_node, dst_node, start);
 		_STARPU_TRACE_DATA_COPY(src_node, dst_node, size);
 		return -EAGAIN;
 	}

+ 2 - 2
src/core/simgrid.h

@@ -72,8 +72,8 @@ extern int starpu_mpi_world_rank(void);
 #pragma weak _starpu_mpi_simgrid_init
 int _starpu_mpi_simgrid_init(int argc, char *argv[]);
 
-starpu_pthread_queue_t _starpu_simgrid_transfer_queue[STARPU_MAXNODES];
-starpu_pthread_queue_t _starpu_simgrid_task_queue[STARPU_NMAXWORKERS];
+extern starpu_pthread_queue_t _starpu_simgrid_transfer_queue[STARPU_MAXNODES];
+extern starpu_pthread_queue_t _starpu_simgrid_task_queue[STARPU_NMAXWORKERS];
 
 #define _starpu_simgrid_cuda_malloc_cost() starpu_get_env_number_default("STARPU_SIMGRID_CUDA_MALLOC_COST", 1)
 #define _starpu_simgrid_queue_malloc_cost() starpu_get_env_number_default("STARPU_SIMGRID_QUEUE_MALLOC_COST", 1)
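
The extern change above is the usual header fix: without it, every translation unit including simgrid.h carries its own definition of the queue arrays. A minimal illustration of the declaration/definition split (file names are hypothetical):

/* queues.h -- a declaration only, safe to include from many .c files */
extern int transfer_queue[8];

/* queues.c -- the single definition */
int transfer_queue[8];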

+ 2 - 2
src/core/task.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011,2013-2014,2017                      Inria
- * Copyright (C) 2009-2017                                Université de Bordeaux
+ * Copyright (C) 2009-2018                                Université de Bordeaux
  * Copyright (C) 2010-2017                                CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -52,7 +52,7 @@ void _starpu_task_declare_deps_array(struct starpu_task *task, unsigned ndeps, s
 static inline struct _starpu_job *_starpu_get_job_associated_to_task(struct starpu_task *task)
 {
 	STARPU_ASSERT(task);
-	struct _starpu_job *job = task->starpu_private;
+	struct _starpu_job *job = (struct _starpu_job *) task->starpu_private;
 
 	if (STARPU_UNLIKELY(!job))
 	{

+ 2 - 2
src/core/topology.c

@@ -1437,6 +1437,8 @@ _starpu_init_machine_config(struct _starpu_machine_config *config, int no_mp_con
 	topology->cuda_th_per_stream = starpu_get_env_number_default("STARPU_CUDA_THREAD_PER_WORKER", -1);
 	topology->cuda_th_per_dev = starpu_get_env_number_default("STARPU_CUDA_THREAD_PER_DEV", -1);
 
+	STARPU_ASSERT_MSG(!(topology->cuda_th_per_stream == 1 && topology->cuda_th_per_dev != -1), "It does not make sense to set both STARPU_CUDA_THREAD_PER_WORKER to 1 and to set STARPU_CUDA_THREAD_PER_DEV, please choose either per worker or per device or none");
+
 	/* per device by default */
 	if (topology->cuda_th_per_dev == -1)
 	{
@@ -1451,8 +1453,6 @@ _starpu_init_machine_config(struct _starpu_machine_config *config, int no_mp_con
 		topology->cuda_th_per_stream = 0;
 	}
 
-	STARPU_ASSERT_MSG(topology->cuda_th_per_dev != 1 || topology->cuda_th_per_stream != 1, "It does not make sense to set both STARPU_CUDA_THREAD_PER_WORKER and STARPU_CUDA_THREAD_PER_DEV to 1, please choose either per worker or per device or none");
-
 	if (!topology->cuda_th_per_dev)
 	{
 		cuda_worker_set[0].workers = &config->workers[topology->nworkers];

+ 9 - 0
src/core/workers.c

@@ -1153,6 +1153,7 @@ static void _starpu_build_tree(void)
 
 static void (*act_sigint)(int);
 static void (*act_sigsegv)(int);
+static void (*act_sigtrap)(int);
 
 void _starpu_handler(int sig)
 {
@@ -1170,6 +1171,10 @@ void _starpu_handler(int sig)
 	{
 		signal(SIGSEGV, act_sigsegv);
 	}
+	if (sig == SIGTRAP)
+	{
+		signal(SIGTRAP, act_sigtrap);
+	}
 #ifdef STARPU_VERBOSE
 	_STARPU_MSG("Rearming signal '%d'\n", sig);
 #endif
@@ -1180,6 +1185,7 @@ void _starpu_catch_signals(void)
 {
 	act_sigint  = signal(SIGINT, _starpu_handler);
 	act_sigsegv = signal(SIGSEGV, _starpu_handler);
+	act_sigtrap = signal(SIGTRAP, _starpu_handler);
 }
 
 int starpu_init(struct starpu_conf *user_conf)
@@ -1282,6 +1288,9 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 #ifdef STARPU_USE_FXT
 	_STARPU_DISP("Warning: StarPU was configured with --with-fxt, which slows down a bit, limits scalability and makes worker initialization sequential\n");
 #endif
+#ifdef STARPU_FXT_LOCK_TRACES
+	_STARPU_DISP("Warning: StarPU was configured with --enable-fxt-lock, which slows things down a lot, and is really only meant for debugging StarPU internals. Did you really want to enable that?\n");
+#endif
 #ifdef STARPU_PERF_DEBUG
 	_STARPU_DISP("Warning: StarPU was configured with --enable-perf-debug, which slows down a bit\n");
 #endif

+ 27 - 1
src/datawizard/copy_driver.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011-2013,2016-2017                      Inria
- * Copyright (C) 2008-2017                                Université de Bordeaux
+ * Copyright (C) 2008-2018                                Université de Bordeaux
  * Copyright (C) 2010-2011,2013,2015-2018                 CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -689,6 +689,32 @@ int STARPU_ATTRIBUTE_WARN_UNUSED_RESULT _starpu_driver_copy_data_1_to_1(starpu_d
 	return 0;
 }
 
+void starpu_interface_start_driver_copy_async(unsigned src_node, unsigned dst_node, double *start)
+{
+	*start = starpu_timing_now();
+	_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
+}
+
+void starpu_interface_end_driver_copy_async(unsigned src_node, unsigned dst_node, double start)
+{
+	double end = starpu_timing_now();
+	double elapsed = end - start;
+	if (elapsed > 300)
+	{
+		static int warned = 0;
+		if (!warned)
+		{
+			char src_name[16], dst_name[16];
+			warned = 1;
+			_starpu_memory_node_get_name(src_node, src_name, sizeof(src_name));
+			_starpu_memory_node_get_name(dst_node, dst_name, sizeof(dst_name));
+
+			_STARPU_DISP("Warning: the submission of asynchronous transfer from %s to %s took a very long time (%f ms)\nFor proper asynchronous transfer overlapping, data registered to StarPU must be allocated with starpu_malloc() or pinned with starpu_memory_pin()\n", src_name, dst_name, elapsed / 1000.);
+		}
+	}
+	_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
+}
+
 /* This can be used by interfaces to easily transfer a piece of data without
  * caring about the particular transfer methods.  */
 

+ 7 - 5
src/datawizard/interfaces/block_interface.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011-2012,2017                           Inria
- * Copyright (C) 2009-2017                                Université de Bordeaux
+ * Copyright (C) 2009-2018                                Université de Bordeaux
  * Copyright (C) 2010-2017                                CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -528,12 +528,13 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_
 		}
 		else
 		{
+			double start;
 			/* Are all plans contiguous */
-			_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
+			starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
 			cures = cudaMemcpy2DAsync((char *)dst_block->ptr, dst_block->ldz*elemsize,
 					(char *)src_block->ptr, src_block->ldz*elemsize,
 					nx*ny*elemsize, nz, kind, stream);
-			_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
+			starpu_interface_end_driver_copy_async(src_node, dst_node, start);
 			if (STARPU_UNLIKELY(cures))
 			{
 				cures = cudaMemcpy2D((char *)dst_block->ptr, dst_block->ldz*elemsize,
@@ -559,12 +560,13 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_
 		{
 			uint8_t *src_ptr = ((uint8_t *)src_block->ptr) + layer*src_block->ldz*src_block->elemsize;
 			uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) + layer*dst_block->ldz*dst_block->elemsize;
+			double start;
 
-			_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
+			starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
 			cures = cudaMemcpy2DAsync((char *)dst_ptr, dst_block->ldy*elemsize,
                                                   (char *)src_ptr, src_block->ldy*elemsize,
                                                   nx*elemsize, ny, kind, stream);
-			_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
+			starpu_interface_end_driver_copy_async(src_node, dst_node, start);
 
 			if (STARPU_UNLIKELY(cures))
 			{

+ 7 - 5
src/datawizard/interfaces/matrix_interface.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011-2012,2017                           Inria
- * Copyright (C) 2008-2017                                Université de Bordeaux
+ * Copyright (C) 2008-2018                                Université de Bordeaux
  * Copyright (C) 2010-2017                                CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -418,11 +418,12 @@ static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIB
 
 	if (is_async)
 	{
-		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
+		double start;
+		starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
 		cures = cudaMemcpy2DAsync((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
 			(char *)src_matrix->ptr, src_matrix->ld*elemsize,
 			src_matrix->nx*elemsize, src_matrix->ny, kind, stream);
-		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
+		starpu_interface_end_driver_copy_async(src_node, dst_node, start);
 		if (!cures)
 			return -EAGAIN;
 	}
@@ -466,9 +467,10 @@ static int copy_cuda_peer(void *src_interface, unsigned src_node STARPU_ATTRIBUT
 
 	if (is_async)
 	{
-		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
+		double start;
+		starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
 		cures = cudaMemcpy3DPeerAsync(&p, stream);
-		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
+		starpu_interface_end_driver_copy_async(src_node, dst_node, start);
 		if (!cures)
 			return -EAGAIN;
 	}

+ 4 - 3
src/datawizard/interfaces/multiformat_interface.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2011-2012                                Inria
  * Copyright (C) 2012-2017                                CNRS
- * Copyright (C) 2013,2015-2016                           Université de Bordeaux
+ * Copyright (C) 2013,2015-2016,2018                      Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -528,11 +528,12 @@ static int copy_cuda_peer_common(void *src_interface, unsigned src_node,
 
 	if (stream)
 	{
-		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
+		double start;
+		starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
 		status = cudaMemcpyPeerAsync(dst_multiformat->cuda_ptr, dst_dev,
 					     src_multiformat->cuda_ptr, src_dev,
 					     size, stream);
-		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
+		starpu_interface_end_driver_copy_async(src_node, dst_node, start);
 		/* All good ! Still, returning -EAGAIN, because we will need to
                   check the transfer completion later */
 		if (status == cudaSuccess)

+ 3 - 2
src/drivers/cuda/driver_cuda.c

@@ -1140,7 +1140,8 @@ starpu_cuda_copy_async_sync(void *src_ptr, unsigned src_node,
 
 	if (stream)
 	{
-		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
+		double start;
+		starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
 #ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
 		if (peer_copy)
 		{
@@ -1153,7 +1154,7 @@ starpu_cuda_copy_async_sync(void *src_ptr, unsigned src_node,
 		{
 			cures = cudaMemcpyAsync((char *)dst_ptr, (char *)src_ptr, ssize, kind, stream);
 		}
-		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
+		starpu_interface_end_driver_copy_async(src_node, dst_node, start);
 	}
 
 	/* Test if the asynchronous copy has failed or if the caller only asked for a synchronous copy */

+ 15 - 10
src/drivers/opencl/driver_opencl.c

@@ -297,15 +297,16 @@ cl_int starpu_opencl_copy_ram_to_opencl(void *ptr, unsigned src_node STARPU_ATTR
 {
 	cl_int err;
 	struct _starpu_worker *worker = _starpu_get_local_worker_key();
+	double start = 0.;
 
 	if (event)
-		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
+		starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
 
 	cl_event ev;
 	err = clEnqueueWriteBuffer(in_transfer_queues[worker->devid], buffer, CL_FALSE, offset, size, ptr, 0, NULL, &ev);
 
 	if (event)
-		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
+		starpu_interface_end_driver_copy_async(src_node, dst_node, start);
 
 	if (STARPU_LIKELY(err == CL_SUCCESS))
 	{
@@ -336,13 +337,14 @@ cl_int starpu_opencl_copy_opencl_to_ram(cl_mem buffer, unsigned src_node STARPU_
 {
 	cl_int err;
 	struct _starpu_worker *worker = _starpu_get_local_worker_key();
+	double start = 0.;
 
 	if (event)
-		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
+		starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
 	cl_event ev;
 	err = clEnqueueReadBuffer(out_transfer_queues[worker->devid], buffer, CL_FALSE, offset, size, ptr, 0, NULL, &ev);
 	if (event)
-		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
+		starpu_interface_end_driver_copy_async(src_node, dst_node, start);
 	if (STARPU_LIKELY(err == CL_SUCCESS))
 	{
 		if (event == NULL)
@@ -372,13 +374,14 @@ cl_int starpu_opencl_copy_opencl_to_opencl(cl_mem src, unsigned src_node STARPU_
 {
 	cl_int err;
 	struct _starpu_worker *worker = _starpu_get_local_worker_key();
+	double start = 0.;
 
 	if (event)
-		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
+		starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
 	cl_event ev;
 	err = clEnqueueCopyBuffer(peer_transfer_queues[worker->devid], src, dst, src_offset, dst_offset, size, 0, NULL, &ev);
 	if (event)
-		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
+		starpu_interface_end_driver_copy_async(src_node, dst_node, start);
 	if (STARPU_LIKELY(err == CL_SUCCESS))
 	{
 		if (event == NULL)
@@ -451,15 +454,16 @@ cl_int _starpu_opencl_copy_rect_opencl_to_ram(cl_mem buffer, unsigned src_node S
         cl_int err;
         struct _starpu_worker *worker = _starpu_get_local_worker_key();
         cl_bool blocking;
+	double start = 0.;
 
         blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
         if (event)
-                _STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
+                starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
         err = clEnqueueReadBufferRect(out_transfer_queues[worker->devid], buffer, blocking, buffer_origin, host_origin, region, buffer_row_pitch,
                                       buffer_slice_pitch, host_row_pitch, host_slice_pitch, ptr, 0, NULL, event);
 	clFlush(out_transfer_queues[worker->devid]);
         if (event)
-                _STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
+                starpu_interface_end_driver_copy_async(src_node, dst_node, start);
 	_STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
 
         return CL_SUCCESS;
@@ -472,15 +476,16 @@ cl_int _starpu_opencl_copy_rect_ram_to_opencl(void *ptr, unsigned src_node STARP
         cl_int err;
         struct _starpu_worker *worker = _starpu_get_local_worker_key();
         cl_bool blocking;
+	double start = 0.;
 
         blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
         if (event)
-                _STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
+                starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
         err = clEnqueueWriteBufferRect(in_transfer_queues[worker->devid], buffer, blocking, buffer_origin, host_origin, region, buffer_row_pitch,
                                        buffer_slice_pitch, host_row_pitch, host_slice_pitch, ptr, 0, NULL, event);
 	clFlush(in_transfer_queues[worker->devid]);
         if (event)
-                _STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
+                starpu_interface_end_driver_copy_async(src_node, dst_node, start);
 	_STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
 
         return CL_SUCCESS;

+ 3 - 1
src/util/fstarpu.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2016-2017                                CNRS
+ * Copyright (C) 2016-2018                                CNRS
  * Copyright (C) 2016-2017                                Inria
  * Copyright (C) 2016-2017                                Université de Bordeaux
  *
@@ -57,6 +57,7 @@ static const intptr_t fstarpu_flops	= STARPU_FLOPS;
 static const intptr_t fstarpu_tag	= STARPU_TAG;
 static const intptr_t fstarpu_tag_only	= STARPU_TAG_ONLY;
 static const intptr_t fstarpu_name	= STARPU_NAME;
+static const intptr_t fstarpu_task_color	= STARPU_TASK_COLOR;
 static const intptr_t fstarpu_node_selection_policy	= STARPU_NODE_SELECTION_POLICY;
 
 static const intptr_t fstarpu_value = STARPU_VALUE;
@@ -133,6 +134,7 @@ intptr_t fstarpu_get_constant(char *s)
 	else if	(!strcmp(s, "FSTARPU_NODE_SELECTION_POLICY"))	{ return fstarpu_node_selection_policy; }
 	else if (!strcmp(s, "FSTARPU_VALUE"))	{ return fstarpu_value; }
 	else if (!strcmp(s, "FSTARPU_SCHED_CTX"))	{ return fstarpu_sched_ctx; }
+	else if (!strcmp(s, "FSTARPU_TASK_COLOR"))	{ return fstarpu_task_color; }
 
 	else if (!strcmp(s, "FSTARPU_CPU_WORKER"))	{ return fstarpu_cpu_worker; }
 	else if (!strcmp(s, "FSTARPU_CUDA_WORKER"))	{ return fstarpu_cuda_worker; }

+ 16 - 7
src/util/starpu_task_insert.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2011-2012,2016                           Inria
  * Copyright (C) 2010-2018                                Université de Bordeaux
- * Copyright (C) 2011-2017                                CNRS
+ * Copyright (C) 2011-2018                                CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -111,13 +111,13 @@ void starpu_codelet_unpack_args(void *_cl_arg, ...)
 }
 
 static
-struct starpu_task *_starpu_task_build_v(struct starpu_codelet *cl, const char* task_name, int cl_arg_free, va_list varg_list)
+struct starpu_task *_starpu_task_build_v(struct starpu_task *ptask, struct starpu_codelet *cl, const char* task_name, int cl_arg_free, va_list varg_list)
 {
 	va_list varg_list_copy;
 	int ret;
 
-	struct starpu_task *task = starpu_task_create();
-	task->name = task_name;
+	struct starpu_task *task = ptask ? ptask : starpu_task_create();
+	task->name = task_name ? task_name : task->name;
 	task->cl_arg_free = cl_arg_free;
 
 	va_copy(varg_list_copy, varg_list);
@@ -132,13 +132,12 @@ struct starpu_task *_starpu_task_build_v(struct starpu_codelet *cl, const char*
 	return (ret == 0) ? task : NULL;
 }
 
-static
 int _starpu_task_insert_v(struct starpu_codelet *cl, va_list varg_list)
 {
 	struct starpu_task *task;
 	int ret;
 
-	task = _starpu_task_build_v(cl, NULL, 1, varg_list);
+	task = _starpu_task_build_v(NULL, cl, NULL, 1, varg_list);
 	ret = starpu_task_submit(task);
 
 	if (STARPU_UNLIKELY(ret == -ENODEV))
@@ -155,6 +154,16 @@ int _starpu_task_insert_v(struct starpu_codelet *cl, va_list varg_list)
 	return ret;
 }
 
+int starpu_task_set(struct starpu_task *task, struct starpu_codelet *cl, ...)
+{
+	va_list varg_list;
+
+	va_start(varg_list, cl);
+	task = _starpu_task_build_v(task, cl, NULL, 1, varg_list);
+	va_end(varg_list);
+	return 0;
+}
+
 int starpu_task_insert(struct starpu_codelet *cl, ...)
 {
 	va_list varg_list;
@@ -183,7 +192,7 @@ struct starpu_task *starpu_task_build(struct starpu_codelet *cl, ...)
 	va_list varg_list;
 
 	va_start(varg_list, cl);
-	task = _starpu_task_build_v(cl, "task_build", 0, varg_list);
+	task = _starpu_task_build_v(NULL, cl, "task_build", 0, varg_list);
 	if (task && task->cl_arg)
 	{
 		task->cl_arg_free = 1;

+ 13 - 0
src/util/starpu_task_insert_utils.c

@@ -192,6 +192,10 @@ int _starpu_codelet_pack_args(void **arg_buffer, size_t *arg_buffer_size, va_lis
 		{
 			(void)va_arg(varg_list, int);
 		}
+		else if (arg_type==STARPU_TASK_COLOR)
+                {
+                        (void)va_arg(varg_list, int);
+                }
 		else
 		{
 			STARPU_ABORT_MSG("Unrecognized argument %d, did you perhaps forget to end arguments with 0?\n", arg_type);
@@ -482,6 +486,10 @@ int _starpu_task_insert_create(struct starpu_codelet *cl, struct starpu_task *ta
 		{
 			(void)va_arg(varg_list, int);
 		}
+		else if (arg_type==STARPU_TASK_COLOR)
+                {
+                        task->color = va_arg(varg_list, int);
+                }
 		else
 		{
 			STARPU_ABORT_MSG("Unrecognized argument %d, did you perhaps forget to end arguments with 0?\n", arg_type);
@@ -716,6 +724,11 @@ int _fstarpu_task_insert_create(struct starpu_codelet *cl, struct starpu_task *t
 			arg_i++;
 			(void)arglist[arg_i];
 		}
+		else if (arg_type == STARPU_TASK_COLOR)
+                {
+                        arg_i++;
+                        task->color = *(int *)arglist[arg_i];
+                }
 		else
 		{
 			STARPU_ABORT_MSG("unknown/unsupported argument %d, did you perhaps forget to end arguments with 0?", arg_type);

+ 2 - 1
tests/Makefile.am

@@ -1,7 +1,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 # Copyright (C) 2010-2017                                Inria
-# Copyright (C) 2009-2017                                Université de Bordeaux
+# Copyright (C) 2009-2018                                Université de Bordeaux
 # Copyright (C) 2010-2017                                CNRS
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -343,6 +343,7 @@ myPROGRAMS +=				\
 	overlap/gpu_concurrency			\
 	parallel_tasks/explicit_combined_worker	\
 	parallel_tasks/parallel_kernels		\
+	parallel_tasks/parallel_kernels_trivial	\
 	parallel_tasks/parallel_kernels_spmd	\
 	parallel_tasks/spmd_peager		\
 	parallel_tasks/cuda_only		\

+ 132 - 0
tests/parallel_tasks/parallel_kernels_trivial.c

@@ -0,0 +1,132 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012-2013                                Inria
+ * Copyright (C) 2010-2016,2018                           Université de Bordeaux
+ * Copyright (C) 2010-2013,2015,2017                      CNRS
+ * Copyright (C) 2013                                     Thibaut Lambert
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <limits.h>
+#include <unistd.h>
+#include "../helper.h"
+
+/*
+ * Submit a simple testcase for parallel tasks.
+ */
+
+#define VECTORSIZE	1024
+
+void codelet_null(void *descr[], void *_args)
+{
+	(void)descr;
+	(void)_args;
+
+	STARPU_SKIP_IF_VALGRIND;
+
+	int worker_size = starpu_combined_worker_get_size();
+	STARPU_ASSERT(worker_size > 0);
+	usleep(1000/worker_size);
+#if 0
+	int id = starpu_worker_get_id();
+	int combined_id = starpu_combined_worker_get_id();
+	FPRINTF(stderr, "worker id %d - combined id %d - worker size %d\n", id, combined_id, worker_size);
+#endif
+}
+
+struct starpu_perfmodel model =
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "parallel_kernel_test"
+};
+
+static struct starpu_codelet cl =
+{
+	.type = STARPU_FORKJOIN,
+	.max_parallelism = INT_MAX,
+	.cpu_funcs = {codelet_null},
+	.cuda_funcs = {codelet_null},
+	.cpu_funcs_name = {"codelet_null"},
+        .opencl_funcs = {codelet_null},
+	.model = &model,
+	.nbuffers = 1,
+	.modes = {STARPU_R}
+};
+
+static struct starpu_codelet cl_seq =
+{
+	.cpu_funcs = {codelet_null},
+	.cuda_funcs = {codelet_null},
+	.cpu_funcs_name = {"codelet_null_seq"},
+        .opencl_funcs = {codelet_null},
+	.model = &model,
+	.nbuffers = 1,
+	.modes = {STARPU_R}
+};
+
+int main(void)
+{
+	int ret;
+	starpu_data_handle_t v_handle;
+	unsigned *v;
+
+        struct starpu_conf conf;
+	starpu_conf_init(&conf);
+	conf.ncpus = 2;
+	conf.sched_policy_name = "pheft";
+	conf.calibrate = 1;
+
+	ret = starpu_init(&conf);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	starpu_malloc((void **)&v, VECTORSIZE*sizeof(unsigned));
+	starpu_vector_data_register(&v_handle, STARPU_MAIN_RAM, (uintptr_t)v, VECTORSIZE, sizeof(unsigned));
+
+	unsigned nworker = starpu_worker_get_count() + starpu_combined_worker_get_count();
+
+	/* First submit a sequential task */
+	ret = starpu_task_insert(&cl_seq, STARPU_R, v_handle, 0);
+	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+	/* Then a parallel task, which is not interesting to run in parallel when we have only two cpus */
+	ret = starpu_task_insert(&cl, STARPU_R, v_handle, 0);
+	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+        /* Then another parallel task, which is interesting to run in parallel
+        since the two cpus are now finishing at the same time. */
+	ret = starpu_task_insert(&cl, STARPU_R, v_handle, 0);
+	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+	ret = starpu_task_wait_for_all();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
+
+	starpu_data_unregister(v_handle);
+	starpu_free(v);
+	starpu_shutdown();
+
+	STARPU_RETURN(EXIT_SUCCESS);
+
+enodev:
+	starpu_data_unregister(v_handle);
+	starpu_free(v);
+	fprintf(stderr, "WARNING: No one can execute this task\n");
+	/* yes, we do not perform the computation but we did detect that no one
+ 	 * could perform the kernel, so this is not an error from StarPU */
+	starpu_shutdown();
+	STARPU_RETURN(STARPU_TEST_SKIPPED);
+}

+ 2 - 2
tools/dev/cppcheck/suppressions.txt

@@ -2,7 +2,7 @@
 //
 // Copyright (C) 2017, 2018                               CNRS
 // Copyright (C) 2017                                     Inria
-// Copyright (C) 2017                                     Université de Bordeaux
+// Copyright (C) 2017-2018                                Université de Bordeaux
 //
 // StarPU is free software; you can redistribute it and/or modify
 // it under the terms of the GNU Lesser General Public License as published by
@@ -57,7 +57,7 @@ unusedStructMember:src/core/perfmodel/perfmodel_bus.c:65
 unusedStructMember:src/core/perfmodel/perfmodel_bus.c:66
 unusedStructMember:src/core/simgrid.c:225
 unusedStructMember:src/core/simgrid.c:226
-wrongPrintfScanfArgNum:src/core/simgrid.c:1008
+wrongPrintfScanfArgNum:src/core/simgrid.c:1016
 duplicateExpression:src/util/starpu_task_insert.c:52
 
 nullPointerRedundantCheck:src/common/rbtree.c

+ 115 - 1
tools/dev/valgrind/hwloc.suppr

@@ -1,7 +1,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 # Copyright (C) 2012                                     Inria
-# Copyright (C) 2012,2016-2017                           CNRS
+# Copyright (C) 2012,2016-2018                           CNRS
 # Copyright (C) 2015                                     Université de Bordeaux
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -2457,3 +2457,117 @@
    fun:MAIN__
    fun:main
 }
+{
+   <insert_a_suppression_name_here>
+   Memcheck:Leak
+   match-leak-kinds: indirect
+   fun:realloc
+   obj:*
+   obj:*
+   obj:*
+   obj:*
+   fun:hwloc_topology_load
+   fun:_starpu_init_topology
+   fun:_starpu_topology_get_nnumanodes
+   fun:_starpu_load_bus_performance_files
+   fun:starpu_initialize
+   fun:starpu_init
+   fun:main
+}
+{
+   <insert_a_suppression_name_here>
+   Memcheck:Leak
+   match-leak-kinds: definite
+   fun:calloc
+   obj:*
+   obj:*
+   obj:*
+   fun:hwloc_topology_load
+   fun:_starpu_init_topology
+   fun:_starpu_topology_get_nnumanodes
+   fun:_starpu_load_bus_performance_files
+   fun:starpu_initialize
+   fun:starpu_init
+   fun:main
+}
+{
+   <insert_a_suppression_name_here>
+   Memcheck:Leak
+   match-leak-kinds: indirect
+   fun:malloc
+   fun:strdup
+   obj:*
+   obj:*
+   obj:*
+   obj:*
+   fun:hwloc_topology_load
+   fun:_starpu_init_topology
+   fun:_starpu_topology_get_nnumanodes
+   fun:_starpu_load_bus_performance_files
+   fun:starpu_initialize
+   fun:starpu_init
+   fun:main
+}
+{
+   <insert_a_suppression_name_here>
+   Memcheck:Leak
+   match-leak-kinds: indirect
+   fun:malloc
+   fun:strdup
+   obj:*
+   obj:*
+   obj:*
+   obj:*
+   fun:hwloc_topology_load
+   fun:_starpu_init_topology
+   fun:_starpu_topology_get_nnumanodes
+   fun:_starpu_load_bus_performance_files
+   fun:starpu_initialize
+   fun:starpu_init
+}
+{
+   <insert_a_suppression_name_here>
+   Memcheck:Leak
+   match-leak-kinds: indirect
+   fun:calloc
+   obj:*
+   obj:*
+   obj:*
+   fun:hwloc_topology_load
+   fun:_starpu_init_topology
+   fun:_starpu_topology_get_nnumanodes
+   fun:_starpu_load_bus_performance_files
+   fun:starpu_initialize
+   fun:starpu_init
+}
+{
+   <insert_a_suppression_name_here>
+   Memcheck:Leak
+   match-leak-kinds: definite
+   fun:calloc
+   obj:*
+   obj:*
+   obj:*
+   fun:hwloc_topology_load
+   fun:_starpu_init_topology
+   fun:_starpu_topology_get_nnumanodes
+   fun:_starpu_load_bus_performance_files
+   fun:starpu_initialize
+   fun:starpu_init
+}
+{
+   <insert_a_suppression_name_here>
+   Memcheck:Leak
+   match-leak-kinds: indirect
+   fun:realloc
+   obj:*
+   obj:*
+   obj:*
+   obj:*
+   fun:hwloc_topology_load
+   fun:_starpu_init_topology
+   fun:_starpu_topology_get_nnumanodes
+   fun:_starpu_load_bus_performance_files
+   fun:starpu_initialize
+   fun:starpu_init
+}

+ 14 - 1
tools/dev/valgrind/libc.suppr

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2014,2016-2017                           CNRS
+# Copyright (C) 2014,2016-2018                           CNRS
 # Copyright (C) 2014-2016                                Université de Bordeaux
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -134,3 +134,16 @@
    fun:dlopen_doit
    fun:_dl_catch_error
 }
+{
+   <insert_a_suppression_name_here>
+   Memcheck:Leak
+   match-leak-kinds: reachable
+   fun:malloc
+   obj:/usr/lib/x86_64-linux-gnu/libgomp.so.1.0.0
+   obj:/usr/lib/x86_64-linux-gnu/libgomp.so.1.0.0
+   obj:/usr/lib/x86_64-linux-gnu/libgomp.so.1.0.0
+   fun:call_init.part.0
+   fun:call_init
+   fun:_dl_init
+   obj:/lib/x86_64-linux-gnu/ld-2.26.so
+}