Browse Source

Merge branch 'fpga' of gitlab.inria.fr:starpu/starpu into fpga

Samuel Thibault 4 years ago
parent
commit
9fad43a535
63 changed files with 1133 additions and 326 deletions
  1. 2 0
      .gitignore
  2. 2 0
      ChangeLog
  3. 7 4
      configure.ac
  4. 90 10
      doc/doxygen/chapters/320_scheduling.doxy
  5. 37 8
      doc/doxygen/chapters/410_mpi_support.doxy
  6. 1 1
      doc/doxygen/chapters/510_configure_options.doxy
  7. 1 2
      examples/gl_interop/gl_interop.c
  8. 1 2
      examples/gl_interop/gl_interop_idle.c
  9. 1 3
      examples/matvecmult/matvecmult.c
  10. 1 4
      examples/perf_steering/perf_knobs_03.c
  11. 9 9
      examples/scheduler/heteroprio_test.c
  12. 1 0
      examples/stencil/implicit-stencil-tasks.c
  13. 4 30
      include/schedulers/starpu_heteroprio.h
  14. 11 0
      include/starpu.h
  15. 1 1
      include/starpu_fxt.h
  16. 41 2
      include/starpu_perfmodel.h
  17. 32 6
      include/starpu_task.h
  18. 34 1
      include/starpu_task_util.h
  19. 13 2
      include/starpu_worker.h
  20. 12 0
      mpi/include/starpu_mpi.h
  21. 16 1
      mpi/src/starpu_mpi.c
  22. 5 1
      mpi/src/starpu_mpi_coop_sends.c
  23. 8 0
      mpi/src/starpu_mpi_private.h
  24. 23 0
      mpi/src/starpu_mpi_task_insert.c
  25. 10 0
      mpi/src/starpu_mpi_task_insert_fortran.c
  26. 1 0
      src/Makefile.am
  27. 3 1
      src/common/fxt.c
  28. 9 0
      src/common/fxt.h
  29. 277 0
      src/core/perfmodel/energy_model.c
  30. 5 10
      src/core/perfmodel/perfmodel.h
  31. 17 12
      src/core/perfmodel/perfmodel_history.c
  32. 4 0
      src/core/sched_policy.c
  33. 2 0
      src/core/task.c
  34. 1 1
      src/core/topology.c
  35. 35 8
      src/core/workers.c
  36. 13 0
      src/datawizard/memory_nodes.c
  37. 53 40
      src/debug/traces/starpu_fxt.c
  38. 1 0
      src/debug/traces/starpu_fxt.h
  39. 6 0
      src/debug/traces/starpu_fxt_dag.c
  40. 2 2
      src/drivers/driver_common/driver_common.c
  41. 1 0
      src/drivers/max/driver_fpga.c
  42. 34 35
      src/profiling/profiling.c
  43. 34 76
      src/sched_policies/heteroprio.c
  44. 4 0
      src/util/fstarpu.c
  45. 5 0
      src/util/starpu_task_insert.c
  46. 27 0
      src/util/starpu_task_insert_utils.c
  47. 3 3
      tests/datawizard/bcsr.c
  48. 1 4
      tests/datawizard/noreclaim.c
  49. 2 3
      tests/disk/disk_copy.c
  50. 3 3
      tests/disk/disk_copy_unpack.c
  51. 3 3
      tests/disk/disk_pack.c
  52. 3 3
      tests/disk/mem_reclaim.c
  53. 2 2
      tests/energy/energy_efficiency.c
  54. 2 2
      tests/errorcheck/invalid_tasks.c
  55. 1 5
      tests/errorcheck/starpu_init_noworker.c
  56. 2 4
      tests/errorcheck/workers_cpuid.c
  57. 0 1
      tests/fpga/MyTasksMuxManager.maxj
  58. 3 6
      tests/main/driver_api/init_run_deinit.c
  59. 3 5
      tests/main/driver_api/run_driver.c
  60. 3 8
      tests/microbenchs/bandwidth.c
  61. 2 2
      tests/microbenchs/tasks_size_overhead.c
  62. 142 0
      tests/perfmodels/regression_based_memset.c
  63. 61 0
      tools/dev/valgrind/fxt.suppr

+ 2 - 0
.gitignore

@@ -20,6 +20,8 @@
 ,*
 .libs
 .deps
+*.orig
+*.rej
 *.o
 *.lo
 *.la

+ 2 - 0
ChangeLog

@@ -45,6 +45,8 @@ New features:
   * Add starpu_data_release_to() and starpu_data_release_to_on_node().
   * Add profiling based on papi performance counters.
   * Add an experimental python interface (not actually parallel yet)
+  * Add task submission file+line in traces.
+  * Add papi- and nvml-based energy measurement.
 
 Small changes:
   * Add a synthetic energy efficiency testcase.

+ 7 - 4
configure.ac

@@ -2221,10 +2221,6 @@ if test x$use_fxt = xyes; then
 		FXT_LIBS="$(pkg-config --variable=libdir fxt)/libfxt.a -Wl,--as-needed $(pkg-config --libs --static fxt) -Wl,--no-as-needed"
 	fi
 
-	AC_CHECK_LIB([papi], [PAPI_library_init],
-		     [AC_DEFINE([STARPU_PAPI], [1], [Define to 1 if you have the libpapi library])
-		      PAPI_LIBS=-lpapi])
-
 	##########################################
 	# Poti is a library to generate paje trace files
 	##########################################
@@ -2252,6 +2248,13 @@ if  test x$enable_fxt_lock = xyes; then
 	AC_DEFINE(STARPU_FXT_LOCK_TRACES, [1], [enable additional locking systems FxT traces])
 fi
 
+AC_CHECK_LIB([papi], [PAPI_library_init],
+	     [AC_DEFINE([STARPU_PAPI], [1], [Define to 1 if you have the libpapi library])
+	      PAPI_LIBS=-lpapi])
+AC_SUBST(PAPI_LIBS)
+
+AM_CONDITIONAL([STARPU_USE_PAPI], [test "x$PAPI_LIBS" != "x"])
+
 AC_MSG_CHECKING(whether performance debugging should be enabled)
 AC_ARG_ENABLE(perf-debug, [AS_HELP_STRING([--enable-perf-debug],
 			[enable performance debugging through gprof])],

File diff suppressed because it is too large
+ 90 - 10
doc/doxygen/chapters/320_scheduling.doxy


+ 37 - 8
doc/doxygen/chapters/410_mpi_support.doxy

@@ -1010,17 +1010,46 @@ data transfers and supports data matrices which do not fit in memory (out-of-cor
 </li>
 </ul>
 
-\section MPIImplementation Notes about the Implementation
 
-StarPU-MPI is implemented directly on top of MPI.
+\section Nmad Using the NewMadeleine communication library
+
+NewMadeleine (see http://pm2.gforge.inria.fr/newmadeleine/, part of the PM2
+project) is an optimizing communication library for high-performance networks.
+NewMadeleine provides its own interface, but also an MPI interface (called
+MadMPI). Thus there are two possibilities to use NewMadeleine with StarPU:
+
+<ul>
+<li>
+using NewMadeleine's native interface. StarPU supports this interface since
+release 1.3.0; enable it with the \c configure option \ref enable-nmad
+"--enable-nmad". In this case, StarPU relies directly on NewMadeleine to make
+communications progress and NewMadeleine has to be built with the profile
+<c>pukabi+madmpi.conf</c>.
+</li>
+<li>
+using NewMadeleine's MPI interface (MadMPI). StarPU will use the standard
+MPI API and NewMadeleine will handle the calls to the MPI API. In this case,
+StarPU makes communications progress and thus communication progress has to be
+disabled in NewMadeleine by compiling it with the profile
+<c>pukabi+madmpi-mini.conf</c>.
+</li>
+</ul>
+
+To build NewMadeleine, download the latest release from the website (or,
+better, check out the Git repository to get the most recent version), then run:
+\code{.sh}
+cd pm2/scripts
+./pm2-build-packages ./<the profile you chose> --prefix=<installation prefix>
+\endcode
+
+With Guix, NewMadeleine's native interface can be used by setting the
+parameter \c \-\-with-input=openmpi=nmad and MadMPI can be used with \c
+\-\-with-input=openmpi=nmad-mini.
+
+Whichever implementation (NewMadeleine or MadMPI) is used by StarPU, the public
+MPI interface of StarPU (described in \ref API_MPI_Support) is the same.
 
-Since the release 1.3.0, an implementation on top of NewMadeleine, an
-optimizing communication library for high-performance networks, is
-also provided. To use it, one needs to install NewMadeleine (see
-http://pm2.gforge.inria.fr/newmadeleine/) and enable the \c configure
-option \ref enable-nmad "--enable-nmad".
 
-Both implementations provide the same public API.
 
 \section MPIMasterSlave MPI Master Slave Support
 

+ 1 - 1
doc/doxygen/chapters/510_configure_options.doxy

@@ -457,7 +457,7 @@ $ STARPU_SILENT=1 mpirun -np 2 ./insert_task
 <dd>
 \anchor enable-nmad
 \addindex __configure__--enable-nmad
-Enable the NewMadeleine implementation for StarPU-MPI.
+Enable the NewMadeleine implementation for StarPU-MPI. See \ref Nmad for more details.
 </dd>
 
 <dt>--disable-fortran</dt>

+ 1 - 2
examples/gl_interop/gl_interop.c

@@ -106,9 +106,8 @@ int main(int argc, char **argv)
 
 	/* Enable OpenGL interoperability */
 	starpu_conf_init(&conf);
+	starpu_conf_noworker(&conf);
 	conf.ncuda = 1;
-	conf.ncpus = 0;
-	conf.nopencl = 0;
 	conf.cuda_opengl_interoperability = cuda_devices;
 	conf.n_cuda_opengl_interoperability = sizeof(cuda_devices) / sizeof(*cuda_devices);
 	conf.not_launched_drivers = drivers;

+ 1 - 2
examples/gl_interop/gl_interop_idle.c

@@ -122,9 +122,8 @@ int main(int argc, char **argv)
 
 	/* Enable OpenGL interoperability */
 	starpu_conf_init(&conf);
+	starpu_conf_noworker(&conf);
 	conf.ncuda = 1;
-	conf.ncpus = 0;
-	conf.nopencl = 0;
 	conf.cuda_opengl_interoperability = cuda_devices;
 	conf.n_cuda_opengl_interoperability = sizeof(cuda_devices) / sizeof(*cuda_devices);
 	conf.not_launched_drivers = drivers;

+ 1 - 3
examples/matvecmult/matvecmult.c

@@ -142,9 +142,7 @@ int main(void)
 	struct starpu_conf conf;
 
 	starpu_conf_init(&conf);
-	conf.ncpus = 0;
-	conf.ncuda = 0;
-	conf.nmic = 0;
+	starpu_conf_noworker(&conf);
 	conf.nopencl = 1;
 
         /* int width=1100; */

+ 1 - 4
examples/perf_steering/perf_knobs_03.c

@@ -37,11 +37,8 @@ int main(int argc, char **argv)
 
 	struct starpu_conf conf;
 	starpu_conf_init(&conf);
+	starpu_conf_noworker(&conf);
 	conf.ncpus = 2;
-	conf.ncuda = 0;
-	conf.nopencl = 0;
-	conf.nmic = 0;
-	conf.nmpi_ms = 0;
 	{
 		const char *sched_pol_name = starpu_getenv("STARPU_SCHED");
 		if (sched_pol_name != NULL && strcmp(sched_pol_name, "prio") != 0)

+ 9 - 9
examples/scheduler/heteroprio_test.c

@@ -30,33 +30,33 @@ void initSchedulerCallback(unsigned sched_ctx)
 #ifdef STARPU_USE_CPU
 	if (starpu_cpu_worker_get_count())
 	{
-		starpu_heteroprio_set_nb_prios(0, STARPU_CPU_IDX, 3);
+		starpu_heteroprio_set_nb_prios(0, STARPU_CPU_WORKER, 3);
 		// It uses direct mapping idx => idx
 		unsigned idx;
 		for(idx = 0; idx < 3; ++idx)
 		{
-			starpu_heteroprio_set_mapping(sched_ctx, STARPU_CPU_IDX, idx, idx);
-			starpu_heteroprio_set_faster_arch(sched_ctx, STARPU_CPU_IDX, idx);
+			starpu_heteroprio_set_mapping(sched_ctx, STARPU_CPU_WORKER, idx, idx);
+			starpu_heteroprio_set_faster_arch(sched_ctx, STARPU_CPU_WORKER, idx);
 		}
 	}
 #endif
 #ifdef STARPU_USE_OPENCL
 	// OpenCL is enabled and uses 2 buckets
-	starpu_heteroprio_set_nb_prios(sched_ctx, STARPU_OPENCL_IDX, 2);
+	starpu_heteroprio_set_nb_prios(sched_ctx, STARPU_OPENCL_WORKER, 2);
 	// OpenCL will first look to priority 2
 	int prio2 = starpu_cpu_worker_get_count() ? 2 : 1;
-	starpu_heteroprio_set_mapping(sched_ctx, STARPU_OPENCL_IDX, 0, prio2);
+	starpu_heteroprio_set_mapping(sched_ctx, STARPU_OPENCL_WORKER, 0, prio2);
 	// For this bucket OpenCL is the fastest
-	starpu_heteroprio_set_faster_arch(sched_ctx, STARPU_OPENCL_IDX, prio2);
+	starpu_heteroprio_set_faster_arch(sched_ctx, STARPU_OPENCL_WORKER, prio2);
 	// And CPU is 4 times slower
 #ifdef STARPU_USE_CPU
-	starpu_heteroprio_set_arch_slow_factor(sched_ctx, STARPU_CPU_IDX, 2, 4.0f);
+	starpu_heteroprio_set_arch_slow_factor(sched_ctx, STARPU_CPU_WORKER, 2, 4.0f);
 #endif
 
 	int prio1 = starpu_cpu_worker_get_count() ? 1 : 0;
-	starpu_heteroprio_set_mapping(sched_ctx, STARPU_OPENCL_IDX, 1, prio1);
+	starpu_heteroprio_set_mapping(sched_ctx, STARPU_OPENCL_WORKER, 1, prio1);
 	// We let the CPU as the fastest and tell that OpenCL is 1.7 times slower
-	starpu_heteroprio_set_arch_slow_factor(sched_ctx, STARPU_OPENCL_IDX, prio1, 1.7f);
+	starpu_heteroprio_set_arch_slow_factor(sched_ctx, STARPU_OPENCL_WORKER, prio1, 1.7f);
 #endif
 }
 

+ 1 - 0
examples/stencil/implicit-stencil-tasks.c

@@ -36,6 +36,7 @@
 
 #if defined(STARPU_USE_MPI) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
 #include <starpu_mpi.h>
+#undef starpu_insert_task
 #define starpu_insert_task(...) starpu_mpi_insert_task(MPI_COMM_WORLD, __VA_ARGS__)
 #endif
 

+ 4 - 30
include/schedulers/starpu_heteroprio.h

@@ -25,49 +25,23 @@ extern "C"
 #endif
 
 #define STARPU_HETEROPRIO_MAX_PRIO 100
-/* #define STARPU_NB_TYPES 3 */
-/* #define STARPU_CPU_IDX 0 */
-/* #define STARPU_CUDA_IDX 1 */
-/* #define STARPU_OPENCL_IDX 2 */
 
 #define STARPU_HETEROPRIO_MAX_PREFETCH 2
 #if STARPU_HETEROPRIO_MAX_PREFETCH <= 0
 #error STARPU_HETEROPRIO_MAX_PREFETCH == 1 means no prefetch so STARPU_HETEROPRIO_MAX_PREFETCH must >= 1
 #endif
 
-enum starpu_heteroprio_types
-{
-// First will be zero
-	STARPU_CPU_IDX, // = 0
-	STARPU_CUDA_IDX,
-	STARPU_OPENCL_IDX,
-	STARPU_MIC_IDX,
-	STARPU_MPI_MS_IDX,
-// This will be the number of archs
-	STARPU_NB_TYPES
-};
-
-static const unsigned starpu_heteroprio_types_to_arch[STARPU_NB_TYPES+1] =
-{
-	STARPU_CPU,
-	STARPU_CUDA,
-	STARPU_OPENCL,
-	STARPU_MIC,
-        STARPU_MPI_MS,
-	0
-};
-
 /** Tell how many priorities there are for a given arch */
-void starpu_heteroprio_set_nb_prios(unsigned sched_ctx_id, enum starpu_heteroprio_types arch, unsigned max_prio);
+void starpu_heteroprio_set_nb_prios(unsigned sched_ctx_id, enum starpu_worker_archtype arch, unsigned max_prio);
 
 /** Set the mapping for a given arch prio=>bucket */
-void starpu_heteroprio_set_mapping(unsigned sched_ctx_id, enum starpu_heteroprio_types arch, unsigned source_prio, unsigned dest_bucket_id);
+void starpu_heteroprio_set_mapping(unsigned sched_ctx_id, enum starpu_worker_archtype arch, unsigned source_prio, unsigned dest_bucket_id);
 
 /** Tell which arch is the fastest for the tasks of a bucket (optional) */
-void starpu_heteroprio_set_faster_arch(unsigned sched_ctx_id, enum starpu_heteroprio_types arch, unsigned bucket_id);
+void starpu_heteroprio_set_faster_arch(unsigned sched_ctx_id, enum starpu_worker_archtype arch, unsigned bucket_id);
 
 /** Tell how slow an arch is for the tasks of a bucket (optional) */
-void starpu_heteroprio_set_arch_slow_factor(unsigned sched_ctx_id, enum starpu_heteroprio_types arch, unsigned bucket_id, float slow_factor);
+void starpu_heteroprio_set_arch_slow_factor(unsigned sched_ctx_id, enum starpu_worker_archtype arch, unsigned bucket_id, float slow_factor);
 
 #ifdef __cplusplus
 }

+ 11 - 0
include/starpu.h

@@ -542,6 +542,17 @@ struct starpu_conf
 int starpu_conf_init(struct starpu_conf *conf);
 
 /**
+   Set fields of \p conf so that no worker is enabled, i.e. set
+   starpu_conf::ncpus = 0, starpu_conf::ncuda = 0, etc.
+
+   This makes it possible to portably enable only a given type of worker:
+
+   starpu_conf_noworker(&conf);
+   conf.ncpus = -1;
+*/
+int starpu_conf_noworker(struct starpu_conf *conf);
+
+/**
    StarPU initialization method, must be called prior to any other
    StarPU call. It is possible to specify StarPU’s configuration (e.g.
    scheduling policy, number of cores, ...) by passing a

+ 1 - 1
include/starpu_fxt.h

@@ -35,7 +35,7 @@ extern "C"
 
 struct starpu_fxt_codelet_event
 {
-	char symbol[256];
+	char symbol[2048];
 	int workerid;
 	char perfmodel_archname[256];
 	uint32_t hash;

+ 41 - 2
include/starpu_perfmodel.h

@@ -319,6 +319,29 @@ void starpu_perfmodel_init(struct starpu_perfmodel *model);
 int starpu_perfmodel_deinit(struct starpu_perfmodel *model);
 
 /**
+   starpu_energy_start - start counting hardware events in an event set
+
+   - \p workerid is the worker on which calibration is to be performed (relevant for GPUs; use -1 for CPUs)
+   - \p archi is the type of architecture on which calibration will be run
+*/
+
+int starpu_energy_start(int workerid, enum starpu_worker_archtype archi);
+
+/**
+   starpu_energy_stop - stop counting hardware events in an event set
+
+   - \p model is the energy performance model to be filled with the result
+   - \p task is a task specimen, so the performance model folds the result according to the parameter sizes of the task.
+   - \p nimpl is the implementation number run during calibration
+   - \p ntasks is the number of tasks run during calibration
+   - \p workerid is the worker on which calibration was performed (relevant for GPUs; use -1 for CPUs)
+   - \p archi is the type of architecture on which calibration was run
+*/
+
+int starpu_energy_stop(struct starpu_perfmodel *model, struct starpu_task *task, unsigned nimpl, unsigned ntasks, int workerid, enum starpu_worker_archtype archi);
+
+
+/**
    Load the performance model found in the file named \p filename. \p model has to be
    completely zero, and will be filled with the information stored in the given file.
 */
@@ -414,16 +437,32 @@ int starpu_perfmodel_print_estimations(struct starpu_perfmodel *model, uint32_t
 int starpu_perfmodel_list_combs(FILE *output, struct starpu_perfmodel *model);
 
 /**
-   Feed the performance model model with an explicit
-   measurement measured (in µs), in addition to measurements done by StarPU
+   Feed the performance model \p model with one explicit
+   measurement (in µs or J), in addition to measurements done by StarPU
    itself. This can be useful when the application already has an
    existing set of measurements done in good conditions, that StarPU
    could benefit from instead of doing on-line measurements. An example
    of use can be seen in \ref PerformanceModelExample.
+
+   Note that this records only one measurement, and StarPU would ignore
+   the first measurement (since it is usually disturbed by library loading
+   etc.). Make sure to call this function several times to record all your
+   measurements.
+
+   You can also call starpu_perfmodel_update_history_n() to directly provide an
+   average computed over several tasks.
 */
 void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned cpuid, unsigned nimpl, double measured);
 
 /**
+   Feed the performance model \p model with an explicit average measurement (in µs or J).
+
+   This is similar to starpu_perfmodel_update_history(), but records a batch of
+   \p number measurements provided as the average of the measurements \p average_measured.
+*/
+void starpu_perfmodel_update_history_n(struct starpu_perfmodel *model, struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned cpuid, unsigned nimpl, double average_measured, unsigned number);
+
+/**
    Print the directory name storing performance models on \p output
 */
 void starpu_perfmodel_directory(FILE *output);

+ 32 - 6
include/starpu_task.h

@@ -48,46 +48,50 @@ extern "C"
 #define STARPU_NOWHERE	((1ULL)<<0)
 
 /**
+  Convert an enum starpu_worker_archtype value to the worker type mask used in "where" fields: bit 0 is STARPU_NOWHERE, so archtype N maps to bit N+1
+  */
+#define STARPU_WORKER_TO_MASK(worker_archtype) (1ULL << ((worker_archtype) + 1))
+/**
    To be used when setting the field starpu_codelet::where (or
    starpu_task::where) to specify the codelet (or the task) may be
    executed on a CPU processing unit.
 */
-#define STARPU_CPU	((1ULL)<<1)
+#define STARPU_CPU	STARPU_WORKER_TO_MASK(STARPU_CPU_WORKER)
 
 /**
    To be used when setting the field starpu_codelet::where (or
    starpu_task::where) to specify the codelet (or the task) may be
    executed on a CUDA processing unit.
 */
-#define STARPU_CUDA	((1ULL)<<3)
+#define STARPU_CUDA	STARPU_WORKER_TO_MASK(STARPU_CUDA_WORKER)
 
 /**
    To be used when setting the field starpu_codelet::where (or
    starpu_task::where) to specify the codelet (or the task) may be
    executed on a MAX FPGA.
 */
-#define STARPU_FPGA	((1ULL)<<4)
+#define STARPU_FPGA	STARPU_WORKER_TO_MASK(STARPU_FPGA_WORKER)
 
 /**
    To be used when setting the field starpu_codelet::where (or
    starpu_task::where) to specify the codelet (or the task) may be
    executed on a OpenCL processing unit.
 */
-#define STARPU_OPENCL	((1ULL)<<6)
+#define STARPU_OPENCL	STARPU_WORKER_TO_MASK(STARPU_OPENCL_WORKER)
 
 /**
    To be used when setting the field starpu_codelet::where (or
    starpu_task::where) to specify the codelet (or the task) may be
    executed on a MIC processing unit.
 */
-#define STARPU_MIC	((1ULL)<<7)
+#define STARPU_MIC	STARPU_WORKER_TO_MASK(STARPU_MIC_WORKER)
 
 /**
    To be used when setting the field starpu_codelet::where (or
    starpu_task::where) to specify the codelet (or the task) may be
    executed on a MPI Slave processing unit.
 */
-#define STARPU_MPI_MS	((1ULL)<<9)
+#define STARPU_MPI_MS	STARPU_WORKER_TO_MASK(STARPU_MPI_MS_WORKER)
 
 /**
    Value to be set in starpu_codelet::flags to execute the codelet
@@ -653,6 +657,18 @@ struct starpu_task
 	const char *name;
 
 	/**
+	   Optional file name where the task was submitted. This can be useful
+	   for debugging purposes.
+	*/
+	const char *file;
+
+	/**
+	  Optional line number where the task was submitted. This can be useful
+	   for debugging purposes.
+	*/
+	int line;
+
+	/**
 	   Pointer to the corresponding structure starpu_codelet. This
 	   describes where the kernel should be executed, and supplies
 	   the appropriate implementations. When set to <c>NULL</c>,
@@ -1487,6 +1503,16 @@ void starpu_task_destroy(struct starpu_task *task);
 */
 int starpu_task_submit(struct starpu_task *task) STARPU_WARN_UNUSED_RESULT;
 
+#ifdef STARPU_USE_FXT
+static inline int starpu_task_submit_line(struct starpu_task *task, const char *file, int line)
+{
+	task->file = file;
+	task->line = line;
+	return starpu_task_submit(task);
+}
+#define starpu_task_submit(task) starpu_task_submit_line((task), __FILE__, __LINE__)
+#endif
+
 /**
    Submit \p task to StarPU with dependency bypass.
 

+ 34 - 1
include/starpu_task_util.h

@@ -327,7 +327,23 @@ extern "C"
  */
 #define STARPU_TASK_SCHED_DATA (41<<STARPU_MODE_SHIFT)
 
-#define STARPU_SHIFTED_MODE_MAX (42<<STARPU_MODE_SHIFT)
+/**
+   Used when calling starpu_task_insert(), must be followed by a
+   char * stored in starpu_task::file.
+
+   This is automatically set when FXT is enabled.
+*/
+#define STARPU_TASK_FILE	 (42<<STARPU_MODE_SHIFT)
+
+/**
+   Used when calling starpu_task_insert(), must be followed by an
+   int stored in starpu_task::line.
+
+   This is automatically set when FXT is enabled.
+*/
+#define STARPU_TASK_LINE	 (43<<STARPU_MODE_SHIFT)
+
+#define STARPU_SHIFTED_MODE_MAX (44<<STARPU_MODE_SHIFT)
 
 /**
    Set the given \p task corresponding to \p cl with the following arguments.
@@ -338,6 +354,11 @@ extern "C"
    starpu_task::cl_arg_free will be set to 1.
 */
 int starpu_task_set(struct starpu_task *task, struct starpu_codelet *cl, ...);
+#ifdef STARPU_USE_FXT
+#define starpu_task_set(task, cl, ...) \
+	starpu_task_set((task), (cl), STARPU_TASK_FILE, __FILE__, STARPU_TASK_LINE, __LINE__, ##__VA_ARGS__)
+#endif
+
 
 /**
    Create a task corresponding to \p cl with the following arguments.
@@ -348,6 +369,10 @@ int starpu_task_set(struct starpu_task *task, struct starpu_codelet *cl, ...);
    starpu_task::cl_arg_free will be set to 1.
 */
 struct starpu_task *starpu_task_build(struct starpu_codelet *cl, ...);
+#ifdef STARPU_USE_FXT
+#define starpu_task_build(cl, ...) \
+	starpu_task_build((cl), STARPU_TASK_FILE, __FILE__, STARPU_TASK_LINE, __LINE__, ##__VA_ARGS__)
+#endif
 
 /**
    Create and submit a task corresponding to \p cl with the following
@@ -386,11 +411,19 @@ struct starpu_task *starpu_task_build(struct starpu_codelet *cl, ...);
    starpu_codelet_unpack_args() must be called within the codelet implementation to retrieve them.
 */
 int starpu_task_insert(struct starpu_codelet *cl, ...);
+#ifdef STARPU_USE_FXT
+#define starpu_task_insert(cl, ...) \
+	starpu_task_insert((cl), STARPU_TASK_FILE, __FILE__, STARPU_TASK_LINE, __LINE__, ##__VA_ARGS__)
+#endif
 
 /**
    Similar to starpu_task_insert(). Kept to avoid breaking old codes.
 */
 int starpu_insert_task(struct starpu_codelet *cl, ...);
+#ifdef STARPU_USE_FXT
+#define starpu_insert_task(cl, ...) \
+	starpu_insert_task((cl), STARPU_TASK_FILE, __FILE__, STARPU_TASK_LINE, __LINE__, ##__VA_ARGS__)
+#endif
 
 /**
    Assuming that there are already \p current_buffer data handles

+ 13 - 2
include/starpu_worker.h

@@ -68,7 +68,8 @@ enum starpu_worker_archtype
 	STARPU_MIC_WORKER=3,        /**< Intel MIC device */
 	STARPU_FPGA_WORKER=4,       /**< FPGA device */
 	STARPU_MPI_MS_WORKER=5,     /**< MPI Slave device */
-	STARPU_ANY_WORKER=6         /**< any worker, used in the hypervisor */
+	STARPU_MAX_WORKER=5,        /**< maximum value of STARPU_*_WORKER */
+	STARPU_ANY_WORKER=255       /**< any worker, used in the hypervisor */
 };
 
 /**
@@ -314,7 +315,12 @@ unsigned starpu_worker_is_slave_somewhere(int workerid);
 /**
    Return worker \p type as a string.
 */
-char *starpu_worker_get_type_as_string(enum starpu_worker_archtype type);
+const char *starpu_worker_get_type_as_string(enum starpu_worker_archtype type);
+
+/**
+   Return worker \p type as a short string (CPU, CUDA, etc.)
+*/
+const char *starpu_worker_get_type_as_short_string(enum starpu_worker_archtype type);
 
 int starpu_bindid_get_workerids(int bindid, int **workerids);
 
@@ -377,6 +383,11 @@ int starpu_memory_nodes_numa_devid_to_id(unsigned id);
 enum starpu_node_kind starpu_node_get_kind(unsigned node);
 
 /**
+   Return the type of worker which operates on memory node kind \p node_kind
+  */
+enum starpu_worker_archtype starpu_memory_node_get_worker_archtype(enum starpu_node_kind node_kind);
+
+/**
    @name Scheduling operations
    @{
 */

+ 12 - 0
mpi/include/starpu_mpi.h

@@ -597,11 +597,19 @@ starpu_mpi_tag_t starpu_mpi_data_get_tag(starpu_data_handle_t handle);
    STARPU_MPI_CACHE).
 */
 int starpu_mpi_task_insert(MPI_Comm comm, struct starpu_codelet *codelet, ...);
+#ifdef STARPU_USE_FXT
+#define starpu_mpi_task_insert(comm, cl, ...) \
+	starpu_mpi_task_insert((comm), (cl), STARPU_TASK_FILE, __FILE__, STARPU_TASK_LINE, __LINE__, ##__VA_ARGS__)
+#endif
 
 /**
    Call starpu_mpi_task_insert(). Symbol kept for backward compatibility.
 */
 int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...);
+#ifdef STARPU_USE_FXT
+#define starpu_mpi_insert_task(comm, cl, ...) \
+	starpu_mpi_insert_task((comm), (cl), STARPU_TASK_FILE, __FILE__, STARPU_TASK_LINE, __LINE__, ##__VA_ARGS__)
+#endif
 
 /**
    Create a task corresponding to \p codelet with the following given
@@ -617,6 +625,10 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...);
    creates it, with the SAME list of arguments.
 */
 struct starpu_task *starpu_mpi_task_build(MPI_Comm comm, struct starpu_codelet *codelet, ...);
+#ifdef STARPU_USE_FXT
+#define starpu_mpi_task_build(comm, cl, ...) \
+	starpu_mpi_task_build((comm), (cl), STARPU_TASK_FILE, __FILE__, STARPU_TASK_LINE, __LINE__, ##__VA_ARGS__)
+#endif
 
 /**
    MUST be called after a call to starpu_mpi_task_build(),

+ 16 - 1
mpi/src/starpu_mpi.c

@@ -53,7 +53,15 @@ static void _starpu_mpi_isend_irecv_common(struct _starpu_mpi_req *req, enum sta
 		}
 	}
 
-	starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(req->data_handle, STARPU_MAIN_RAM, mode, _starpu_mpi_submit_ready_request, (void *)req, sequential_consistency, 1, &req->pre_sync_jobid, &req->post_sync_jobid);
+	if (sequential_consistency)
+	{
+		starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(req->data_handle, STARPU_MAIN_RAM, mode, _starpu_mpi_submit_ready_request, (void *)req, 1 /*sequential consistency*/, 1, &req->pre_sync_jobid, &req->post_sync_jobid);
+	}
+	else
+	{
+		/* post_sync_jobid has already been filled */
+		starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(req->data_handle, STARPU_MAIN_RAM, mode, _starpu_mpi_submit_ready_request, (void *)req, 0 /*sequential consistency*/, 1, &req->pre_sync_jobid, NULL);
+	}
 }
 
 static struct _starpu_mpi_req *_starpu_mpi_isend_common(starpu_data_handle_t data_handle, int dest, starpu_mpi_tag_t data_tag, MPI_Comm comm, unsigned detached, unsigned sync, int prio, void (*callback)(void *), void *arg, int sequential_consistency)
@@ -185,6 +193,13 @@ struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handl
 
 	struct _starpu_mpi_req *req = _starpu_mpi_request_fill(data_handle, source, data_tag, comm, detached, sync, 0, callback, arg, RECV_REQ, _mpi_backend._starpu_mpi_backend_irecv_size_func, sequential_consistency, is_internal_req, count);
 	_starpu_mpi_req_willpost(req);
+
+	if (sequential_consistency == 0)
+	{
+		/* Synchronization task jobid from redux is used */
+		_starpu_mpi_redux_fill_post_sync_jobid(arg, &(req->post_sync_jobid));
+	}
+
 	_starpu_mpi_isend_irecv_common(req, STARPU_W, sequential_consistency);
 	return req;
 }

+ 5 - 1
mpi/src/starpu_mpi_coop_sends.c

@@ -268,7 +268,11 @@ void _starpu_mpi_coop_send(starpu_data_handle_t data_handle, struct _starpu_mpi_
 	if (first)
 	{
 		/* We were first, we are responsible for acquiring the data for everybody */
-		starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(req->data_handle, STARPU_MAIN_RAM, mode, _starpu_mpi_coop_sends_data_ready, coop_sends, sequential_consistency, 0, &req->pre_sync_jobid, NULL);
+		long pre_sync_jobid;
+		starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(req->data_handle, STARPU_MAIN_RAM, mode, _starpu_mpi_coop_sends_data_ready, coop_sends, sequential_consistency, 0, &pre_sync_jobid, NULL);
+		coop_sends->pre_sync_jobid = pre_sync_jobid;
 	}
+	else
+		req->pre_sync_jobid = coop_sends->pre_sync_jobid;
 }
 

+ 8 - 0
mpi/src/starpu_mpi_private.h

@@ -196,6 +196,9 @@ struct _starpu_mpi_coop_sends
 	struct _starpu_mpi_req **reqs_array;
 	unsigned n;
 	unsigned redirects_sent;
+
+	/* Used to trace dependencies */
+	long pre_sync_jobid;
 };
 
 /** Initialized in starpu_mpi_data_register_comm */
@@ -299,6 +302,11 @@ void _starpu_mpi_coop_send(starpu_data_handle_t data_handle, struct _starpu_mpi_
  */
 void _starpu_mpi_submit_coop_sends(struct _starpu_mpi_coop_sends *coop_sends, int submit_control, int submit_data);
 
+/*
+ * Fills post_sync_jobid with the reduction synchronization task jobid
+ */
+void _starpu_mpi_redux_fill_post_sync_jobid(const void * const redux_data_args, long * const post_sync_jobid);
+
 void _starpu_mpi_submit_ready_request_inc(struct _starpu_mpi_req *req);
 void _starpu_mpi_request_init(struct _starpu_mpi_req **req);
 struct _starpu_mpi_req * _starpu_mpi_request_fill(starpu_data_handle_t data_handle,

+ 23 - 0
mpi/src/starpu_mpi_task_insert.c

@@ -505,6 +505,14 @@ int _starpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_nod
 		{
 			(void)va_arg(varg_list_copy, void *);
 		}
+		else if (arg_type==STARPU_TASK_FILE)
+		{
+			(void)va_arg(varg_list_copy, const char *);
+		}
+		else if (arg_type==STARPU_TASK_LINE)
+		{
+			(void)va_arg(varg_list_copy, int);
+		}
 		else
 		{
 			STARPU_ABORT_MSG("Unrecognized argument %d, did you perhaps forget to end arguments with 0?\n", arg_type);
@@ -661,6 +669,7 @@ int _starpu_mpi_task_insert_v(MPI_Comm comm, struct starpu_codelet *codelet, va_
 	return val;
 }
 
+#undef starpu_mpi_task_insert
 int starpu_mpi_task_insert(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 {
 	va_list varg_list;
@@ -672,6 +681,7 @@ int starpu_mpi_task_insert(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 	return ret;
 }
 
+#undef starpu_mpi_insert_task
 int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 {
 	va_list varg_list;
@@ -683,6 +693,7 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 	return ret;
 }
 
+#undef starpu_mpi_task_build
 struct starpu_task *starpu_mpi_task_build(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 {
 	va_list varg_list;
@@ -726,6 +737,7 @@ struct _starpu_mpi_redux_data_args
 	int node;
 	MPI_Comm comm;
 	struct starpu_task *taskB;
+	long taskC_jobid;
 };
 
 void _starpu_mpi_redux_data_dummy_func(void *buffers[], void *cl_arg)
@@ -792,6 +804,13 @@ void _starpu_mpi_redux_data_recv_callback(void *callback_arg)
 	starpu_mpi_irecv_detached_sequential_consistency(args->new_handle, args->node, args->data_tag, args->comm, _starpu_mpi_redux_data_detached_callback, args, 0);
 }
 
+
+void _starpu_mpi_redux_fill_post_sync_jobid(const void * const redux_data_args, long * const post_sync_jobid)
+{
+	*post_sync_jobid = ((const struct _starpu_mpi_redux_data_args *) redux_data_args)->taskC_jobid;
+}
+
+
 /* TODO: this should rather be implicitly called by starpu_mpi_task_insert when
  * a data previously accessed in REDUX mode gets accessed in R mode. */
 void starpu_mpi_redux_data_prio(MPI_Comm comm, starpu_data_handle_t data_handle, int prio)
@@ -821,7 +840,9 @@ void starpu_mpi_redux_data_prio(MPI_Comm comm, starpu_data_handle_t data_handle,
 		int i;
 
 		// taskC depends on all taskBs created
+		// Creating synchronization task and use its jobid for tracing
 		struct starpu_task *taskC = starpu_task_create();
+		const long taskC_jobid = starpu_task_get_job_id(taskC);
 		taskC->cl = &_starpu_mpi_redux_data_readwrite_cl;
 		STARPU_TASK_SET_HANDLE(taskC, data_handle, 0);
 
@@ -855,6 +876,8 @@ void starpu_mpi_redux_data_prio(MPI_Comm comm, starpu_data_handle_t data_handle,
 				args->node = i;
 				args->comm = comm;
 
+				args->taskC_jobid = taskC_jobid;
+
 				// We need to create taskB early as
 				// taskC declares a dependancy on it
 				args->taskB = starpu_task_create();

+ 10 - 0
mpi/src/starpu_mpi_task_insert_fortran.c

@@ -367,6 +367,16 @@ int _fstarpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_no
 			arg_i++;
 			/* void * */
 		}
+		else if (arg_type==STARPU_TASK_FILE)
+		{
+			arg_i++;
+			/* char* */
+		}
+		else if (arg_type==STARPU_TASK_LINE)
+		{
+			arg_i++;
+			/* int */
+		}
 		else
 		{
 			STARPU_ABORT_MSG("Unrecognized argument %d, did you perhaps forget to end arguments with 0?\n", arg_type);

+ 1 - 0
src/Makefile.am

@@ -197,6 +197,7 @@ libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES = 		\
 	core/disk_ops/disk_unistd.c                             \
 	core/disk_ops/unistd/disk_unistd_global.c		\
 	core/perfmodel/perfmodel_history.c			\
+        core/perfmodel/energy_model.c                           \
 	core/perfmodel/perfmodel_bus.c				\
 	core/perfmodel/perfmodel.c				\
 	core/perfmodel/perfmodel_print.c			\

+ 3 - 1
src/common/fxt.c

@@ -279,12 +279,14 @@ static void _starpu_generate_paje_trace(char *input_fxt_filename, char *output_p
 
 	options.ninputfiles = 1;
 	options.filenames[0] = input_fxt_filename;
-	options.out_paje_path = output_paje_filename;
+	free(options.out_paje_path);
+	options.out_paje_path = strdup(output_paje_filename);
 	options.file_prefix = "";
 	options.file_rank = -1;
 	options.dir = dirname;
 
 	starpu_fxt_generate_trace(&options);
+	starpu_fxt_options_shutdown(&options);
 }
 
 void _starpu_fxt_dump_file(void)

+ 9 - 0
src/common/fxt.h

@@ -111,6 +111,8 @@
 
 #define _STARPU_FUT_DATA_DOING_WONT_USE	0x512e
 
+#define _STARPU_FUT_TASK_LINE	0x512f
+
 #define	_STARPU_FUT_START_MEMRECLAIM	0x5131
 #define	_STARPU_FUT_END_MEMRECLAIM	0x5132
 
@@ -840,6 +842,12 @@ do {									\
 	FUT_FULL_PROBE2(_STARPU_FUT_KEYMASK_TASK, _STARPU_FUT_TASK_EXCLUDE_FROM_DAG, (job)->job_id, (long unsigned)exclude_from_dag); \
 } while(0)
 
+/* Emit a _STARPU_FUT_TASK_LINE probe carrying the job id, the line number and
+ * the file name stored in the job's task. Nothing is emitted when the task has
+ * no file set. */
+#define _STARPU_TRACE_TASK_LINE(job)					\
+	do {								\
+		if ((job)->task->file)					\
+			_STARPU_FUT_FULL_PROBE2STR(_STARPU_FUT_KEYMASK_TASK, _STARPU_FUT_TASK_LINE, (job)->job_id, (job)->task->line, (job)->task->file); \
+} while(0)
+
 #define _STARPU_TRACE_TASK_NAME(job)					\
 	do {								\
         const char *model_name = _starpu_job_get_model_name((job));		\
@@ -1328,6 +1336,7 @@ do {										\
 #define _STARPU_TRACE_GHOST_TASK_DEPS(a, b)	do {(void)(a); (void)(b);} while(0)
 #define _STARPU_TRACE_TASK_EXCLUDE_FROM_DAG(a)	do {(void)(a);} while(0)
 #define _STARPU_TRACE_TASK_NAME(a)		do {(void)(a);} while(0)
+#define _STARPU_TRACE_TASK_LINE(a)		do {(void)(a);} while(0)
 #define _STARPU_TRACE_TASK_COLOR(a)		do {(void)(a);} while(0)
 #define _STARPU_TRACE_TASK_DONE(a)		do {(void)(a);} while(0)
 #define _STARPU_TRACE_TAG_DONE(a)		do {(void)(a);} while(0)

+ 277 - 0
src/core/perfmodel/energy_model.c

@@ -0,0 +1,277 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2008-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#ifdef STARPU_PAPI
+#include <papi.h>
+#endif
+#ifdef STARPU_HAVE_HWLOC
+#include <hwloc.h>
+#endif
+#include <starpu_perfmodel.h>
+#include <starpu_profiling.h>
+#include <common/config.h>
+#include <common/utils.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#include <sys/stat.h>
+#include <core/perfmodel/perfmodel.h>
+#include <core/jobs.h>
+#include <core/workers.h>
+#include <datawizard/datawizard.h>
+#include <core/task.h>
+
+#ifdef STARPU_USE_CUDA
+#ifdef HAVE_NVMLDEVICEGETTOTALENERGYCONSUMPTION
+#include <nvml.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#endif
+#endif
+
+#define ERROR_RETURN(retval) do { fprintf(stderr, "Error %d %s:line %d: \n", retval,__FILE__,__LINE__);  return(retval); } while (0)
+
+#if 0
+#define debug(fmt, ...) printf(fmt, ## __VA_ARGS__)
+#else
+#define debug(fmt, ...)
+#endif
+
+#ifdef STARPU_PAPI
+static const int N_EVTS = 2;
+
+static int nsockets;
+
+static const char* event_names[] = { "rapl::RAPL_ENERGY_PKG:cpu=%d",
+				     "rapl::RAPL_ENERGY_DRAM:cpu=%d"};
+
+static int add_event(int EventSet, int socket);
+
+/* PAPI variables*/
+
+/*must be initialized to PAPI_NULL before calling PAPI_create_event*/
+static int EventSet = PAPI_NULL;
+
+/*This is where we store the values we read from the eventset */
+static long long *values;
+
+#endif
+
+static double t1;
+
+#ifdef STARPU_USE_CUDA
+#ifdef HAVE_NVMLDEVICEGETTOTALENERGYCONSUMPTION
+static unsigned long long energy_begin, energy_end;
+static nvmlDevice_t device;
+#endif
+#endif
+
+/* Start an energy measurement for the given type of worker.
+ *
+ * The current time is recorded in t1. For STARPU_CPU_WORKER (PAPI and hwloc
+ * builds only) a PAPI event set holding the events of event_names[] for every
+ * hwloc package is created and started; WORKERID must be -1. For
+ * STARPU_CUDA_WORKER (NVML builds only) the total energy counter of the
+ * device of WORKERID is read into energy_begin; WORKERID must not be -1.
+ *
+ * Returns 0 (PAPI_OK) on success, the PAPI error code when a PAPI call fails,
+ * and -1 on NVML failure or for any other worker type. */
+int starpu_energy_start(int workerid, enum starpu_worker_archtype archi)
+{
+	t1 = starpu_timing_now();
+
+	switch (archi)
+	{
+#ifdef STARPU_PAPI
+#ifdef STARPU_HAVE_HWLOC
+	case STARPU_CPU_WORKER:
+	{
+		STARPU_ASSERT_MSG(workerid == -1, "For CPUs we cannot measure each worker separately, use where = STARPU_CPU and leave workerid as -1\n");
+
+		int retval, number;
+
+		struct _starpu_machine_config *config = _starpu_get_machine_config();
+		hwloc_topology_t topology = config->topology.hwtopology;
+
+		nsockets = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PACKAGE);
+
+		if ((retval = PAPI_library_init(PAPI_VER_CURRENT)) != PAPI_VER_CURRENT)
+			ERROR_RETURN(retval);
+
+		/* Creating the eventset */
+		if ((retval = PAPI_create_eventset(&EventSet)) != PAPI_OK)
+			ERROR_RETURN(retval);
+
+		int i;
+		for (i = 0 ; i < nsockets ; i ++ )
+		{
+			/* return the index of socket */
+			hwloc_obj_t obj = hwloc_get_obj_by_type(topology, HWLOC_OBJ_PACKAGE, i);
+			if ( (retval = add_event(EventSet, obj->os_index)) != PAPI_OK)
+				ERROR_RETURN(retval);
+		}
+
+		/* get the number of events in the event set */
+		number = 0;
+		if ( (retval = PAPI_list_events(EventSet, NULL, &number)) != PAPI_OK)
+			ERROR_RETURN(retval);
+
+		debug("There are %d events in the event set\n", number);
+
+		/* Allocate the result buffer only once every setup step that can
+		 * bail out has succeeded, so that early error returns do not
+		 * leak it */
+		values=calloc(nsockets * N_EVTS,sizeof(long long));
+		STARPU_ASSERT(values);
+
+		/* Start counting */
+		if ( (retval = PAPI_start(EventSet)) != PAPI_OK)
+		{
+			free(values);
+			values = NULL;
+			ERROR_RETURN(retval);
+		}
+
+		return retval;
+	}
+#endif
+#endif
+
+#ifdef HAVE_NVMLDEVICEGETTOTALENERGYCONSUMPTION
+	case STARPU_CUDA_WORKER:
+	{
+		STARPU_ASSERT_MSG(workerid != -1, "For CUDA GPUs we measure each GPU separately, please specify a worker\n");
+		int devid = starpu_worker_get_devid(workerid);
+		int ret = nvmlDeviceGetHandleByIndex_v2 (devid,  &device);
+		if (ret != NVML_SUCCESS)
+		{
+			_STARPU_DISP("Could not get CUDA device %d from nvml\n", devid);
+			return -1;
+		}
+		ret = nvmlDeviceGetTotalEnergyConsumption ( device, &energy_begin );
+		if (ret != NVML_SUCCESS)
+		{
+			_STARPU_DISP("Could not measure energy used by CUDA device %d\n", devid);
+			return -1;
+		}
+		return 0;
+	}
+	break;
+#endif
+
+	default:
+		printf("Error: worker is not supported ! \n");
+		return -1;
+	}
+}
+
+/* Stop the energy measurement started by starpu_energy_start() and record
+ * the per-task energy in the history of MODEL.
+ *
+ * NTASKS is the number of tasks that ran during the measurement and must be
+ * strictly positive; the measured energy is divided by it before being
+ * recorded for TASK / NIMPL. WORKERID and ARCHI must be the values given to
+ * starpu_energy_start().
+ *
+ * Returns 0 on success, the PAPI error code when a PAPI call fails, and -1 on
+ * NVML failure or for an unsupported worker type. */
+int starpu_energy_stop(struct starpu_perfmodel *model, struct starpu_task *task, unsigned nimpl, unsigned ntasks, int workerid, enum starpu_worker_archtype archi)
+{
+	double energy = 0.;
+	/* Only the PAPI path assigns this; it must be defined on the other
+	 * paths too since it is returned at the end */
+	int retval = 0;
+	unsigned cpuid = 0;
+	double t2 = starpu_timing_now();
+	double t STARPU_ATTRIBUTE_UNUSED = t2 - t1;
+
+	STARPU_ASSERT_MSG(ntasks > 0, "The number of measured tasks must be strictly positive\n");
+
+	switch (archi)
+	{
+#ifdef STARPU_PAPI
+#ifdef STARPU_HAVE_HWLOC
+	case STARPU_CPU_WORKER:
+	{
+		STARPU_ASSERT_MSG(workerid == -1, "For CPUs we cannot measure each worker separately, use where = STARPU_CPU and leave workerid as -1\n");
+
+		/* Stop counting and store the values into the array */
+		if ( (retval = PAPI_stop(EventSet, values)) != PAPI_OK)
+			ERROR_RETURN(retval);
+
+		int k,s;
+
+		for( s = 0 ; s < nsockets ; s ++)
+		{
+			for(k = 0 ; k < N_EVTS; k++)
+			{
+				/* The counter-to-Joule scaling is applied here,
+				 * once per counter; the sum must not be scaled
+				 * again afterwards */
+				double delta = values[s * N_EVTS + k]*0.23/1.0e9;
+				energy += delta;
+
+				debug("%-40s%12.6f J\t(for %f us, Average Power %.1fW)\n",
+				      event_names[k],
+				      delta, t, delta/(t*1.0E-6));
+			}
+		}
+		free(values);
+		/* Do not keep a dangling pointer in the file-scope variable */
+		values = NULL;
+
+		/*removes all events from a PAPI event set */
+		if ( (retval = PAPI_cleanup_eventset(EventSet)) != PAPI_OK)
+			ERROR_RETURN(retval);
+
+		/*deallocates the memory associated with an empty PAPI EventSet*/
+		if ( (retval = PAPI_destroy_eventset(&EventSet)) != PAPI_OK)
+			ERROR_RETURN(retval);
+
+		break;
+	}
+#endif
+#endif
+
+#ifdef HAVE_NVMLDEVICEGETTOTALENERGYCONSUMPTION
+	case STARPU_CUDA_WORKER:
+	{
+		STARPU_ASSERT_MSG(workerid != -1, "For CUDA GPUs we measure each GPU separately, please specify a worker\n");
+		int ret = nvmlDeviceGetTotalEnergyConsumption(device, &energy_end );
+		if (ret != NVML_SUCCESS)
+			return -1;
+		energy = (energy_end - energy_begin) / 1000.;
+		debug("energy consumption on device %d is %f mJ (for %f us, Average power %0.1fW)\n", 0, energy * 1000., t, energy / (t*1.0E-6));
+		break;
+	}
+#endif
+
+	default:
+	{
+		printf("Error: worker type %d is not supported! \n", archi);
+		return -1;
+		break;
+	}
+	}
+
+
+	struct starpu_perfmodel_arch *arch;
+	if (workerid == -1)
+		/* Just take one of them */
+		workerid = starpu_worker_get_by_type(archi, 0);
+
+	arch = starpu_worker_get_perf_archtype(workerid, STARPU_NMAX_SCHED_CTXS);
+
+	/* The measurement covers ntasks tasks whatever the worker type: record
+	 * the per-task energy */
+	starpu_perfmodel_update_history(model, task, arch, cpuid, nimpl, energy / ntasks);
+
+	return retval;
+}
+
+#ifdef STARPU_PAPI
+#ifdef STARPU_HAVE_HWLOC
+/* Add every event of event_names[] to the PAPI event set EVENTSET, with the
+ * %d placeholder of each name filled with SOCKET.
+ * Returns PAPI_OK, or the PAPI error code of the first event that could not
+ * be added. */
+static int add_event(int eventSet, int socket)
+{
+	int evt = 0;
+
+	while (evt < N_EVTS)
+	{
+		char event_name[255];
+		int status;
+
+		snprintf(event_name, sizeof(event_name), event_names[evt], socket);
+
+		/* printf("Activating multiplex\n"); */
+		/* retval = PAPI_set_multiplex(eventSet); */
+		/* if(retval != PAPI_OK) { */
+		/*      _STARPU_DISP("cannot set multiplex\n"); */
+		/*      return retval; */
+		/* } */
+		status = PAPI_add_named_event(eventSet, event_name);
+		if (status != PAPI_OK)
+		{
+			_STARPU_DISP("cannot add event '%s': %d\n", event_name, status);
+			return status;
+		}
+
+		evt++;
+	}
+
+	return PAPI_OK;
+}
+#endif
+#endif

+ 5 - 10
src/core/perfmodel/perfmodel.h

@@ -74,14 +74,10 @@ void _starpu_initialize_registered_performance_models(void);
 void _starpu_deinitialize_registered_performance_models(void);
 void _starpu_deinitialize_performance_model(struct starpu_perfmodel *model);
 
-double _starpu_regression_based_job_expected_perf(struct starpu_perfmodel *model,
-					struct starpu_perfmodel_arch* arch, struct _starpu_job *j, unsigned nimpl);
-double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfmodel *model,
-					struct starpu_perfmodel_arch* arch, struct _starpu_job *j, unsigned nimpl);
-double _starpu_multiple_regression_based_job_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch,
-					struct _starpu_job *j, unsigned nimpl);
-void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfmodel *model, struct starpu_perfmodel_arch * arch,
-				unsigned cpuid, double measured, unsigned nimpl);
+double _starpu_regression_based_job_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, struct _starpu_job *j, unsigned nimpl);
+double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, struct _starpu_job *j, unsigned nimpl);
+double _starpu_multiple_regression_based_job_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, struct _starpu_job *j, unsigned nimpl);
+void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfmodel *model, struct starpu_perfmodel_arch * arch, unsigned cpuid, double measured, unsigned nimpl, unsigned number);
 int _starpu_perfmodel_create_comb_if_needed(struct starpu_perfmodel_arch* arch);
 
 void _starpu_create_sampling_directory_if_needed(void);
@@ -102,8 +98,7 @@ unsigned *_starpu_get_cuda_affinity_vector(unsigned gpuid);
 unsigned *_starpu_get_opencl_affinity_vector(unsigned gpuid);
 #endif
 
-void _starpu_save_bandwidth_and_latency_disk(double bandwidth_write, double bandwidth_read,
-					     double latency_write, double latency_read, unsigned node, const char *name);
+void _starpu_save_bandwidth_and_latency_disk(double bandwidth_write, double bandwidth_read, double latency_write, double latency_read, unsigned node, const char *name);
 
 void _starpu_write_double(FILE *f, const char *format, double val);
 int _starpu_read_double(FILE *f, char *format, double *val);

+ 17 - 12
src/core/perfmodel/perfmodel_history.c

@@ -804,7 +804,7 @@ static void dump_per_arch_model_file(FILE *f, struct starpu_perfmodel *model, in
 	/* Dump the history into the model file in case it is necessary */
        if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED || model->type == STARPU_REGRESSION_BASED)
 	{
-		fprintf(f, "# hash\t\tsize\t\tflops\t\tmean (us)\tdev (us)\tsum\t\tsum2\t\tn\n");
+		fprintf(f, "# hash\t\tsize\t\tflops\t\tmean (us or J)\tdev (us or J)\tsum\t\tsum2\t\tn\n");
 		ptr = per_arch_model->list;
 		while (ptr)
 		{
@@ -1839,7 +1839,7 @@ int _starpu_perfmodel_create_comb_if_needed(struct starpu_perfmodel_arch* arch)
 	return comb;
 }
 
-void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, unsigned cpuid STARPU_ATTRIBUTE_UNUSED, double measured, unsigned impl)
+void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, unsigned cpuid STARPU_ATTRIBUTE_UNUSED, double measured, unsigned impl, unsigned number)
 {
 	STARPU_ASSERT_MSG(measured >= 0, "measured=%lf\n", measured);
 	if (model)
@@ -1909,11 +1909,11 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 
 				/* For history-based, do not take the first measurement into account, it is very often quite bogus */
 				/* TODO: it'd be good to use a better estimation heuristic, like the median, or latest n values, etc. */
-				if (model->type != STARPU_HISTORY_BASED)
+				if (number != 1 || model->type != STARPU_HISTORY_BASED)
 				{
-					entry->sum = measured;
-					entry->sum2 = measured*measured;
-					entry->nsample = 1;
+					entry->sum = measured * number;
+					entry->sum2 = measured*measured * number;
+					entry->nsample = number;
 					entry->mean = measured;
 				}
 
@@ -1934,7 +1934,7 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 					(100 * local_deviation > (100 + historymaxerror)
 					 || (100 / local_deviation > (100 + historymaxerror))))
 				{
-					entry->nerror++;
+					entry->nerror+=number;
 
 					/* More errors than measurements, we're most probably completely wrong, we flush out all the entries */
 					if (entry->nerror >= entry->nsample)
@@ -1952,9 +1952,9 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 				}
 				else
 				{
-					entry->sum += measured;
-					entry->sum2 += measured*measured;
-					entry->nsample++;
+					entry->sum += measured * number;
+					entry->sum2 += measured*measured * number;
+					entry->nsample += number;
 
 					unsigned n = entry->nsample;
 					entry->mean = entry->sum / n;
@@ -2070,7 +2070,7 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 	}
 }
 
-void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct starpu_task *task, struct starpu_perfmodel_arch * arch, unsigned cpuid, unsigned nimpl, double measured)
+void starpu_perfmodel_update_history_n(struct starpu_perfmodel *model, struct starpu_task *task, struct starpu_perfmodel_arch * arch, unsigned cpuid, unsigned nimpl, double measured, unsigned number)
 {
 	struct _starpu_job *job = _starpu_get_job_associated_to_task(task);
 
@@ -2080,11 +2080,16 @@ void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct star
 
 	_starpu_init_and_load_perfmodel(model);
 	/* Record measurement */
-	_starpu_update_perfmodel_history(job, model, arch, cpuid, measured, nimpl);
+	_starpu_update_perfmodel_history(job, model, arch, cpuid, measured, nimpl, number);
 	/* and save perfmodel on termination */
 	_starpu_set_calibrate_flag(1);
 }
 
+/* Record a single measurement MEASURED for TASK in MODEL: forwards to
+ * starpu_perfmodel_update_history_n() with a sample count of 1. */
+void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct starpu_task *task, struct starpu_perfmodel_arch * arch, unsigned cpuid, unsigned nimpl, double measured)
+{
+	starpu_perfmodel_update_history_n(model, task, arch, cpuid, nimpl, measured, 1);
+}
+
 int starpu_perfmodel_list_combs(FILE *output, struct starpu_perfmodel *model)
 {
 	int comb;

+ 4 - 0
src/core/sched_policy.c

@@ -417,6 +417,10 @@ static int _starpu_push_task_on_specific_worker(struct starpu_task *task, int wo
 
 int _starpu_push_task(struct _starpu_job *j)
 {
+#ifdef STARPU_SIMGRID
+	//if (_starpu_simgrid_task_push_cost())
+		starpu_sleep(0.000001);
+#endif
 	if(j->task->prologue_callback_func)
 	{
 		_starpu_set_current_task(j->task);

+ 2 - 0
src/core/task.c

@@ -942,6 +942,7 @@ int _starpu_task_submit(struct starpu_task *task, int nodeps)
 			_starpu_get_sched_ctx_struct(task->sched_ctx)->iterations[0],
 			_starpu_get_sched_ctx_struct(task->sched_ctx)->iterations[1]);
 		_STARPU_TRACE_TASK_NAME(j);
+		_STARPU_TRACE_TASK_LINE(j);
 	}
 
 	/* If this is a continuation, we don't modify the implicit data dependencies detected earlier. */
@@ -1012,6 +1013,7 @@ int _starpu_task_submit(struct starpu_task *task, int nodeps)
 	return ret;
 }
 
+#undef starpu_task_submit
 int starpu_task_submit(struct starpu_task *task)
 {
 	return _starpu_task_submit(task, 0);

+ 1 - 1
src/core/topology.c

@@ -881,7 +881,7 @@ static void _starpu_init_topology(struct _starpu_machine_config *config)
 #ifdef HAVE_HWLOC_CPUKINDS_GET_NR
 	int nr_kinds = hwloc_cpukinds_get_nr(topology->hwtopology, 0);
 	if (nr_kinds > 1)
-		_STARPU_DISP("Warning: there are several kinds of CPU on this system. For now StarPU assumes all CPU are equal\n", strerror(errno));
+		_STARPU_DISP("Warning: there are several kinds of CPU on this system. For now StarPU assumes all CPU are equal\n");
 #endif
 
 	if (starpu_get_env_number_default("STARPU_WORKERS_GETBIND", 0))

+ 35 - 8
src/core/workers.c

@@ -1192,6 +1192,16 @@ int starpu_conf_init(struct starpu_conf *conf)
 	return 0;
 }
 
+/* Set all the worker counts of CONF (CPU, CUDA, OpenCL, FPGA, MIC and MPI
+ * master-slave) to 0. CONF must not be NULL. Returns 0. */
+int starpu_conf_noworker(struct starpu_conf *conf)
+{
+	conf->ncpus = 0;
+	conf->ncuda = 0;
+	conf->nopencl = 0;
+	conf->nfpga = 0;
+	conf->nmic = 0;
+	conf->nmpi_ms = 0;
+
+	/* The function is declared as returning int: callers reading the
+	 * result must get a defined value */
+	return 0;
+}
+
 static void _starpu_conf_set_value_against_environment(char *name, int *value, int precedence_over_env)
 {
 	if (precedence_over_env == 0)
@@ -2602,15 +2612,32 @@ unsigned starpu_worker_get_sched_ctx_list(int workerid, unsigned **sched_ctxs)
 	return nsched_ctxs;
 }
 
-char *starpu_worker_get_type_as_string(enum starpu_worker_archtype type)
+/* Return the name of the enum constant matching the worker type TYPE, or
+ * "STARPU_unknown_WORKER" when TYPE is none of the handled values. The
+ * returned string is a literal and must not be freed. */
+const char *starpu_worker_get_type_as_string(enum starpu_worker_archtype type)
+{
+	const char *name = "STARPU_unknown_WORKER";
+
+	switch (type)
+	{
+	case STARPU_CPU_WORKER:
+		name = "STARPU_CPU_WORKER";
+		break;
+	case STARPU_CUDA_WORKER:
+		name = "STARPU_CUDA_WORKER";
+		break;
+	case STARPU_OPENCL_WORKER:
+		name = "STARPU_OPENCL_WORKER";
+		break;
+	case STARPU_FPGA_WORKER:
+		name = "STARPU_FPGA_WORKER";
+		break;
+	case STARPU_MIC_WORKER:
+		name = "STARPU_MIC_WORKER";
+		break;
+	case STARPU_MPI_MS_WORKER:
+		name = "STARPU_MPI_MS_WORKER";
+		break;
+	case STARPU_ANY_WORKER:
+		name = "STARPU_ANY_WORKER";
+		break;
+	default:
+		break;
+	}
+
+	return name;
+}
+
+const char *starpu_worker_get_type_as_short_string(enum starpu_worker_archtype type)
 {
-	if (type == STARPU_CPU_WORKER) return "STARPU_CPU_WORKER";
-	if (type == STARPU_CUDA_WORKER) return "STARPU_CUDA_WORKER";
-	if (type == STARPU_OPENCL_WORKER) return "STARPU_OPENCL_WORKER";
-	if (type == STARPU_MIC_WORKER) return "STARPU_MIC_WORKER";
-        if (type == STARPU_MPI_MS_WORKER) return "STARPU_MPI_MS_WORKER";
-	if (type == STARPU_ANY_WORKER) return "STARPU_ANY_WORKER";
-	return "STARPU_unknown_WORKER";
+	switch (type) {
+		case STARPU_CPU_WORKER: return "CPU";
+		case STARPU_CUDA_WORKER: return "CUDA";
+		case STARPU_OPENCL_WORKER: return "OPENCL";
+		case STARPU_FPGA_WORKER: return "FPGA";
+		case STARPU_MIC_WORKER: return "MIC";
+		case STARPU_MPI_MS_WORKER: return "MPI_MS";
+		case STARPU_ANY_WORKER: return "ANY";
+		default: return "STARPU_unknown_WORKER";
+	}
 }
 
 void _starpu_worker_set_stream_ctx(unsigned workerid, struct _starpu_sched_ctx *sched_ctx)

+ 13 - 0
src/datawizard/memory_nodes.c

@@ -179,3 +179,16 @@ int starpu_memory_node_get_devid(unsigned node)
 {
 	return _starpu_descr.devid[node];
 }
+
+/* Map a memory node kind to a worker type: CPU, CUDA, OpenCL, MIC and MPI
+ * master-slave RAM map to the worker type of the same name. Any other kind
+ * triggers an assertion failure; should the assertion not abort,
+ * (enum starpu_worker_archtype) -1 is returned. */
+enum starpu_worker_archtype starpu_memory_node_get_worker_archtype(enum starpu_node_kind node_kind)
+{
+	switch (node_kind)
+	{
+		// case STARPU_UNUSED:
+		case STARPU_CPU_RAM: return STARPU_CPU_WORKER;
+		case STARPU_CUDA_RAM: return STARPU_CUDA_WORKER;
+		case STARPU_OPENCL_RAM: return STARPU_OPENCL_WORKER;
+		// case STARPU_DISK_RAM:
+		case STARPU_MIC_RAM: return STARPU_MIC_WORKER;
+		case STARPU_MPI_MS_RAM: return STARPU_MPI_MS_WORKER;
+		default: break;
+	}
+
+	STARPU_ASSERT_MSG(0, "ambiguous memory node kind %d", node_kind);
+
+	/* Reached only if the assertion above did not abort: never fall off
+	 * the end of a non-void function */
+	return (enum starpu_worker_archtype) -1;
+}

+ 53 - 40
src/debug/traces/starpu_fxt.c

@@ -105,6 +105,8 @@ struct task_info
 	UT_hash_handle hh;
 	char *model_name;
 	char *name;
+	char *file;
+	int line;
 	int exclude_from_dag;
 	int show;
 	unsigned type;
@@ -143,6 +145,8 @@ static struct task_info *get_task(unsigned long job_id, int mpi_rank)
 		_STARPU_MALLOC(task, sizeof(*task));
 		task->model_name = NULL;
 		task->name = NULL;
+		task->file = NULL;
+		task->line = -1;
 		task->exclude_from_dag = 0;
 		task->show = 0;
 		task->type = 0;
@@ -201,6 +205,10 @@ static void task_dump(struct task_info *task, struct starpu_fxt_options *options
 		fprintf(tasks_file, "Name: %s\n", task->name);
 	if (task->model_name)
 		fprintf(tasks_file, "Model: %s\n", task->model_name);
+	if (task->file) {
+		fprintf(tasks_file, "File: %s\n", task->file);
+		fprintf(tasks_file, "Line: %d\n", task->line);
+	}
 	fprintf(tasks_file, "JobId: %s%lu\n", prefix, task->job_id);
 	if (task->submit_order)
 		fprintf(tasks_file, "SubmitOrder: %lu\n", task->submit_order);
@@ -272,6 +280,7 @@ static void task_dump(struct task_info *task, struct starpu_fxt_options *options
 out:
 	free(task->name);
 	free(task->model_name);
+	free(task->file);
 	free(task->dependencies);
 	if (task->dep_labels)
 	{
@@ -2906,6 +2915,21 @@ static void handle_task_name(struct fxt_ev_64 *ev, struct starpu_fxt_options *op
 		_starpu_fxt_dag_set_task_name(options->file_prefix, job_id, task->name, color);
 }
 
+/* Handle a _STARPU_FUT_TASK_LINE event: param[0] is the job id, param[1] the
+ * line number, and the file name is the string starting at parameter index 2.
+ * The location is stored in the task_info of the job and, when the task is
+ * part of the displayed DAG, forwarded to the DAG output. */
+static void handle_task_line(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
+{
+	unsigned long job_id = ev->param[0];
+	int line = ev->param[1];
+	char *file = get_fxt_string(ev,2);
+
+	struct task_info *task = get_task(job_id, options->file_rank);
+	/* get_task() initializes file to NULL, so this is a no-op the first
+	 * time and avoids leaking the previous copy if the event shows up
+	 * again for the same job */
+	free(task->file);
+	task->file = strdup(file);
+	task->line = line;
+
+	if (!task->exclude_from_dag && show_task(task, options))
+		_starpu_fxt_dag_set_task_line(options->file_prefix, job_id, task->file, line);
+}
+
+
 static void handle_task_done(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
 {
         /* Ideally, we would be able to dump tasks as they terminate, to save
@@ -3715,6 +3739,10 @@ void _starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *op
 				handle_task_name(&ev, options);
 				break;
 
+			case _STARPU_FUT_TASK_LINE:
+				handle_task_line(&ev, options);
+				break;
+
 			case _STARPU_FUT_TASK_COLOR:
 				handle_task_color(&ev, options);
 				break;
@@ -4286,31 +4314,18 @@ void _starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *op
 /* Initialize FxT options to default values */
 void starpu_fxt_options_init(struct starpu_fxt_options *options)
 {
-	options->per_task_colour = 0;
-	options->no_events = 0;
-	options->no_counter = 0;
-	options->no_bus = 0;
-	options->no_flops = 0;
-	options->no_smooth = 0;
-	options->no_acquire = 0;
-	options->memory_states = 0;
-	options->internal = 0;
-	options->label_deps = 0;
-	options->ninputfiles = 0;
-	options->out_paje_path = "paje.trace";
-	options->dag_path = "dag.dot";
-	options->tasks_path = "tasks.rec";
-	options->comms_path = "comms.rec";
-	options->number_events_path = NULL;
-	options->data_path = "data.rec";
-	options->papi_path = "papi.rec";
-	options->anim_path = "trace.html";
-	options->states_path = "trace.rec";
-	options->distrib_time_path = "distrib.data";
-	options->dumped_codelets = NULL;
-	options->activity_path = "activity.data";
-	options->sched_tasks_path = "sched_tasks.rec";
-	options->dir = NULL;
+	memset(options, 0, sizeof(struct starpu_fxt_options));
+	options->out_paje_path = strdup("paje.trace");
+	options->dag_path = strdup("dag.dot");
+	options->tasks_path = strdup("tasks.rec");
+	options->comms_path = strdup("comms.rec");
+	options->data_path = strdup("data.rec");
+	options->papi_path = strdup("papi.rec");
+	options->anim_path = strdup("trace.html");
+	options->states_path = strdup("trace.rec");
+	options->distrib_time_path = strdup("distrib.data");
+	options->activity_path = strdup("activity.data");
+	options->sched_tasks_path = strdup("sched_tasks.rec");
 }
 
 static
@@ -4319,6 +4334,7 @@ void _set_dir(char *dir, char **option)
 	if (*option)
 	{
 		char *tmp = strdup(*option);
+		free(*option);
 		_STARPU_MALLOC(*option, 256);
 		snprintf(*option, 256, "%s/%s", dir, tmp);
 		free(tmp);
@@ -4348,21 +4364,18 @@ void _starpu_fxt_options_set_dir(struct starpu_fxt_options *options)
 
 void starpu_fxt_options_shutdown(struct starpu_fxt_options *options)
 {
-	if (options->dir)
-	{
-		free(options->out_paje_path);
-		free(options->dag_path);
-		free(options->tasks_path);
-		free(options->comms_path);
-		free(options->number_events_path);
-		free(options->data_path);
-		free(options->papi_path);
-		free(options->anim_path);
-		free(options->states_path);
-		free(options->distrib_time_path);
-		free(options->activity_path);
-		free(options->sched_tasks_path);
-	}
+	free(options->out_paje_path);
+	free(options->dag_path);
+	free(options->tasks_path);
+	free(options->comms_path);
+	free(options->number_events_path);
+	free(options->data_path);
+	free(options->papi_path);
+	free(options->anim_path);
+	free(options->states_path);
+	free(options->distrib_time_path);
+	free(options->activity_path);
+	free(options->sched_tasks_path);
 }
 
 static

+ 1 - 0
src/debug/traces/starpu_fxt.h

@@ -50,6 +50,7 @@ void _starpu_fxt_dag_add_tag_deps(const char *prefix, uint64_t child, uint64_t f
 void _starpu_fxt_dag_set_tag_done(const char *prefix, uint64_t tag, const char *color);
 void _starpu_fxt_dag_add_task_deps(const char *prefix, unsigned long dep_prev, unsigned long dep_succ, const char *label);
 void _starpu_fxt_dag_set_task_name(const char *prefix, unsigned long job_id, const char *label, const char *color);
+void _starpu_fxt_dag_set_task_line(const char *prefix, unsigned long job_id, const char *file, int line);
 void _starpu_fxt_dag_add_send(int src, unsigned long dep_prev, unsigned long tag, unsigned long id);
 void _starpu_fxt_dag_add_receive(int dst, unsigned long dep_prev, unsigned long tag, unsigned long id);
 void _starpu_fxt_dag_add_sync_point(void);

+ 6 - 0
src/debug/traces/starpu_fxt_dag.c

@@ -110,6 +110,12 @@ void _starpu_fxt_dag_set_task_name(const char *prefix, unsigned long job_id, con
 		fprintf(out_file, "\t \"task_%s%lu\" [ style=filled, label=\"%s\", fillcolor=\"%s\"]\n", prefix, job_id, label, color);
 }
 
+/* Write an href attribute of the form "FILE#LINE" for the node
+ * "task_<PREFIX><JOB_ID>" to out_file. Does nothing when out_file is not
+ * set. */
+void _starpu_fxt_dag_set_task_line(const char *prefix, unsigned long job_id, const char *file, int line)
+{
+	if (!out_file)
+		return;
+
+	fprintf(out_file, "\t \"task_%s%lu\" [ href=\"%s#%d\" ]\n", prefix, job_id, file, line);
+}
+
 void _starpu_fxt_dag_add_send(int src, unsigned long dep_prev, unsigned long tag, unsigned long id)
 {
 	if (out_file)

+ 2 - 2
src/drivers/driver_common/driver_common.c

@@ -276,7 +276,7 @@ void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_wo
 				do_update_time_model = 0;
 			if (do_update_time_model)
 			{
-				_starpu_update_perfmodel_history(j, j->task->cl->model, perf_arch, worker->devid, time_consumed, j->nimpl);
+				_starpu_update_perfmodel_history(j, j->task->cl->model, perf_arch, worker->devid, time_consumed, j->nimpl, 1);
 			}
 		}
 	}
@@ -312,7 +312,7 @@ void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_wo
 			do_update_energy_model = 0;
 		if (do_update_energy_model)
 		{
-			_starpu_update_perfmodel_history(j, j->task->cl->energy_model, perf_arch, worker->devid, energy_consumed, j->nimpl);
+			_starpu_update_perfmodel_history(j, j->task->cl->energy_model, perf_arch, worker->devid, energy_consumed, j->nimpl, 1);
 		}
 	}
 }

+ 1 - 0
src/drivers/max/driver_fpga.c

@@ -167,6 +167,7 @@ static void init_device_context(unsigned devid)
 
 	/* 0 would be seen as NULL, i.e. allocation failed... */
 	// FIXME: Maxeler FPGAs want 192-byte alignment
+	// TODO: use int 	max_get_burst_size (max_file_t *maxfile, const char *name)
 	current_address[devid] = (fpga_mem) (8192*192);
 	global_mem[devid] = 128ULL*1024*1024*1024;
 

+ 34 - 35
src/profiling/profiling.c

@@ -146,43 +146,42 @@ void _starpu_profiling_init(void)
 	}
 
 #ifdef STARPU_PAPI
-		STARPU_PTHREAD_MUTEX_LOCK(&papi_mutex);
-		int retval = PAPI_library_init(PAPI_VER_CURRENT);
-		if (retval != PAPI_VER_CURRENT)
-		{
-			 _STARPU_MSG("Failed init PAPI, error: %s.\n", PAPI_strerror(retval));
-		}
-		retval = PAPI_thread_init(pthread_self);
-		if (retval != PAPI_OK)
-		{
-			 _STARPU_MSG("Failed init PAPI thread, error: %s.\n", PAPI_strerror(retval));
-		}
+	STARPU_PTHREAD_MUTEX_LOCK(&papi_mutex);
+	int retval = PAPI_library_init(PAPI_VER_CURRENT);
+	if (retval != PAPI_VER_CURRENT)
+	{
+		_STARPU_MSG("Failed init PAPI, error: %s.\n", PAPI_strerror(retval));
+	}
+	retval = PAPI_thread_init(pthread_self);
+	if (retval != PAPI_OK)
+	{
+		_STARPU_MSG("Failed init PAPI thread, error: %s.\n", PAPI_strerror(retval));
+	}
 
-		char *conf_papi_events;
-		char *papi_event_name;
-		conf_papi_events = starpu_getenv("STARPU_PROF_PAPI_EVENTS");
-		papi_nevents = 0;
-		if (conf_papi_events != NULL)
+	char *conf_papi_events;
+	char *papi_event_name;
+	conf_papi_events = starpu_getenv("STARPU_PROF_PAPI_EVENTS");
+	papi_nevents = 0;
+	if (conf_papi_events != NULL)
+	{
+		while ((papi_event_name = strtok_r(conf_papi_events, " ,", &conf_papi_events)))
 		{
-			while ((papi_event_name = strtok_r(conf_papi_events, " ,", &conf_papi_events)))
+			if (papi_nevents == PAPI_MAX_HWCTRS)
 			{
-				if (papi_nevents == PAPI_MAX_HWCTRS)
-				{
-				      _STARPU_MSG("Too many requested papi counters, ignoring %s\n", papi_event_name);
-				      continue;
-				}
-
-				_STARPU_DEBUG("Loading PAPI Event: %s\n", papi_event_name);
-				retval = PAPI_event_name_to_code ((char*)papi_event_name, &papi_events[papi_nevents]);
-				if (retval != PAPI_OK)
-				      _STARPU_MSG("Failed to codify papi event [%s], error: %s.\n", papi_event_name, PAPI_strerror(retval));
-				else
-					papi_nevents++;
+				_STARPU_MSG("Too many requested papi counters, ignoring %s\n", papi_event_name);
+				continue;
 			}
+
+			_STARPU_DEBUG("Loading PAPI Event: %s\n", papi_event_name);
+			retval = PAPI_event_name_to_code ((char*)papi_event_name, &papi_events[papi_nevents]);
+			if (retval != PAPI_OK)
+				_STARPU_MSG("Failed to codify papi event [%s], error: %s.\n", papi_event_name, PAPI_strerror(retval));
+			else
+				papi_nevents++;
 		}
-		STARPU_PTHREAD_MUTEX_UNLOCK(&papi_mutex);
+	}
+	STARPU_PTHREAD_MUTEX_UNLOCK(&papi_mutex);
 #endif
-
 }
 
 #ifdef STARPU_PAPI
@@ -195,10 +194,11 @@ void _starpu_profiling_papi_task_start_counters(struct starpu_task *task)
 	profiling_info = task->profiling_info;
 	if (profiling_info && papi_nevents)
 	{
+		int i;
 		profiling_info->papi_event_set = PAPI_NULL;
 		STARPU_PTHREAD_MUTEX_LOCK(&papi_mutex);
 		PAPI_create_eventset(&profiling_info->papi_event_set);
-		for(int i=0; i<papi_nevents; i++)
+		for(i=0; i<papi_nevents; i++)
 		{
 			int ret = PAPI_add_event(profiling_info->papi_event_set, papi_events[i]);
 			if (ret == PAPI_ECMP_DISABLED && !warned_component_unavailable)
@@ -224,9 +224,10 @@ void _starpu_profiling_papi_task_stop_counters(struct starpu_task *task)
 
 	if (profiling_info && papi_nevents)
 	{
+		int i;
 		STARPU_PTHREAD_MUTEX_LOCK(&papi_mutex);
 		PAPI_stop(profiling_info->papi_event_set, profiling_info->papi_values);
-		for(int i=0; i<papi_nevents; i++)
+		for(i=0; i<papi_nevents; i++)
 		{
 			_STARPU_TRACE_PAPI_TASK_EVENT(papi_events[i], task, profiling_info->papi_values[i]);
 		}
@@ -266,7 +267,6 @@ void _starpu_profiling_terminate(void)
 /*
  *	Task profiling
  */
-
 struct starpu_profiling_task_info *_starpu_allocate_profiling_info_if_needed(struct starpu_task *task)
 {
 	struct starpu_profiling_task_info *info = NULL;
@@ -283,7 +283,6 @@ struct starpu_profiling_task_info *_starpu_allocate_profiling_info_if_needed(str
 /*
  *	Worker profiling
  */
-
 static void _starpu_worker_reset_profiling_info_with_lock(int workerid)
 {
 	_starpu_clock_gettime(&worker_info[workerid].start_time);

+ 34 - 76
src/sched_policies/heteroprio.c

@@ -37,6 +37,8 @@
 #define DBL_MAX __DBL_MAX__
 #endif
 
+#define STARPU_NB_TYPES (STARPU_MAX_WORKER+1)
+
 /* A bucket corresponds to a Pair of priorities
  * When a task is pushed with a priority X, it will be stored
  * into the bucket X.
@@ -107,8 +109,16 @@ struct _starpu_heteroprio_data
 	unsigned nb_workers_per_arch_index[STARPU_NB_TYPES];
 };
 
+
+/** Convert a worker architecture type into the bit mask used in a bucket's
+ * valid_archs field (as computed by STARPU_WORKER_TO_MASK).
+ * Values above STARPU_MAX_WORKER yield 0, i.e. no architecture bit. */
+static int starpu_heteroprio_types_to_arch(enum starpu_worker_archtype arch)
+{
+	if (arch > STARPU_MAX_WORKER)
+		return 0;
+	return STARPU_WORKER_TO_MASK(arch);
+}
+
 /** Tell how many prio there are for a given arch */
-void starpu_heteroprio_set_nb_prios(unsigned sched_ctx_id, enum starpu_heteroprio_types arch, unsigned max_prio)
+void starpu_heteroprio_set_nb_prios(unsigned sched_ctx_id, enum starpu_worker_archtype arch, unsigned max_prio)
 {
 	struct _starpu_heteroprio_data *hp = (struct _starpu_heteroprio_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
 
@@ -118,7 +128,7 @@ void starpu_heteroprio_set_nb_prios(unsigned sched_ctx_id, enum starpu_heteropri
 }
 
 /** Set the mapping for a given arch prio=>bucket */
-inline void starpu_heteroprio_set_mapping(unsigned sched_ctx_id, enum starpu_heteroprio_types arch, unsigned source_prio, unsigned dest_bucket_id)
+inline void starpu_heteroprio_set_mapping(unsigned sched_ctx_id, enum starpu_worker_archtype arch, unsigned source_prio, unsigned dest_bucket_id)
 {
 	STARPU_ASSERT(dest_bucket_id < STARPU_HETEROPRIO_MAX_PRIO);
 
@@ -126,12 +136,12 @@ inline void starpu_heteroprio_set_mapping(unsigned sched_ctx_id, enum starpu_het
 
 	hp->prio_mapping_per_arch_index[arch][source_prio] = dest_bucket_id;
 
-	hp->buckets[dest_bucket_id].valid_archs |= starpu_heteroprio_types_to_arch[arch];
+	hp->buckets[dest_bucket_id].valid_archs |= starpu_heteroprio_types_to_arch(arch);
 	_STARPU_DEBUG("Adding arch %d to bucket %u\n", arch, dest_bucket_id);
 }
 
 /** Tell which arch is the faster for the tasks of a bucket (optional) */
-inline void starpu_heteroprio_set_faster_arch(unsigned sched_ctx_id, enum starpu_heteroprio_types arch, unsigned bucket_id)
+inline void starpu_heteroprio_set_faster_arch(unsigned sched_ctx_id, enum starpu_worker_archtype arch, unsigned bucket_id)
 {
 	STARPU_ASSERT(bucket_id < STARPU_HETEROPRIO_MAX_PRIO);
 
@@ -143,7 +153,7 @@ inline void starpu_heteroprio_set_faster_arch(unsigned sched_ctx_id, enum starpu
 }
 
 /** Tell how slow an arch is for the tasks of a bucket (optional) */
-inline void starpu_heteroprio_set_arch_slow_factor(unsigned sched_ctx_id, enum starpu_heteroprio_types arch, unsigned bucket_id, float slow_factor)
+inline void starpu_heteroprio_set_arch_slow_factor(unsigned sched_ctx_id, enum starpu_worker_archtype arch, unsigned bucket_id, float slow_factor)
 {
 	STARPU_ASSERT(bucket_id < STARPU_HETEROPRIO_MAX_PRIO);
 
@@ -160,52 +170,22 @@ static inline void default_init_sched(unsigned sched_ctx_id)
 	int max_prio = starpu_sched_ctx_get_max_priority(sched_ctx_id);
 	STARPU_ASSERT(min_prio >= 0);
 	STARPU_ASSERT(max_prio >= 0);
+
+	enum starpu_worker_archtype type;
+
 	// By default each type of devices uses 1 bucket and no slow factor
-#ifdef STARPU_USE_CPU
-	if (starpu_cpu_worker_get_count() > 0)
-		starpu_heteroprio_set_nb_prios(sched_ctx_id, STARPU_CPU_IDX, max_prio-min_prio+1);
-#endif
-#ifdef STARPU_USE_CUDA
-	if (starpu_cuda_worker_get_count() > 0)
-		starpu_heteroprio_set_nb_prios(sched_ctx_id, STARPU_CUDA_IDX, max_prio-min_prio+1);
-#endif
-#ifdef STARPU_USE_OPENCL
-	if (starpu_opencl_worker_get_count() > 0)
-		starpu_heteroprio_set_nb_prios(sched_ctx_id, STARPU_OPENCL_IDX, max_prio-min_prio+1);
-#endif
-#ifdef STARPU_USE_MIC
-	if (starpu_mic_worker_get_count() > 0)
-		starpu_heteroprio_set_nb_prios(sched_ctx_id, STARPU_MIC_IDX, max_prio-min_prio+1);
-#endif
-#ifdef STARPU_USE_MPI_MASTER_SLAVE
-	if (starpu_mpi_ms_worker_get_count() > 0)
-		starpu_heteroprio_set_nb_prios(sched_ctx_id, STARPU_MPI_MS_IDX, max_prio-min_prio+1);
-#endif
+	for (type = 0; type <= STARPU_MAX_WORKER; type++)
+		if (starpu_worker_get_count_by_type(type) > 0)
+			starpu_heteroprio_set_nb_prios(sched_ctx_id, type, max_prio-min_prio+1);
 
 	// Direct mapping
 	int prio;
 	for(prio=min_prio ; prio<=max_prio ; prio++)
 	{
-#ifdef STARPU_USE_CPU
-		if (starpu_cpu_worker_get_count() > 0)
-			starpu_heteroprio_set_mapping(sched_ctx_id, STARPU_CPU_IDX, prio, prio);
-#endif
-#ifdef STARPU_USE_CUDA
-		if (starpu_cuda_worker_get_count() > 0)
-			starpu_heteroprio_set_mapping(sched_ctx_id, STARPU_CUDA_IDX, prio, prio);
-#endif
-#ifdef STARPU_USE_OPENCL
-		if (starpu_opencl_worker_get_count() > 0)
-			starpu_heteroprio_set_mapping(sched_ctx_id, STARPU_OPENCL_IDX, prio, prio);
-#endif
-#ifdef STARPU_USE_MIC
-		if (starpu_mic_worker_get_count() > 0)
-			starpu_heteroprio_set_mapping(sched_ctx_id, STARPU_MIC_IDX, prio, prio);
-#endif
-#ifdef STARPU_USE_MPI_MASTER_SLAVE
-		if (starpu_mpi_ms_worker_get_count() > 0)
-			starpu_heteroprio_set_mapping(sched_ctx_id, STARPU_MPI_MS_IDX, prio, prio);
-#endif
+		// Map priority prio to bucket prio for every worker type that has at least one worker
+		for (type = 0; type <= STARPU_MAX_WORKER; type++)
+			if (starpu_worker_get_count_by_type(type) > 0)
+				starpu_heteroprio_set_mapping(sched_ctx_id, type, prio, prio);
 	}
 }
 
@@ -249,7 +229,7 @@ static void initialize_heteroprio_policy(unsigned sched_ctx_id)
 			const unsigned mapped_prio = hp->prio_mapping_per_arch_index[arch_index][idx_prio];
 			STARPU_ASSERT(mapped_prio <= STARPU_HETEROPRIO_MAX_PRIO);
 			STARPU_ASSERT(hp->buckets[mapped_prio].slow_factors_per_index[arch_index] >= 0.0);
-			STARPU_ASSERT(hp->buckets[mapped_prio].valid_archs & starpu_heteroprio_types_to_arch[arch_index]);
+			STARPU_ASSERT(hp->buckets[mapped_prio].valid_archs & starpu_heteroprio_types_to_arch(arch_index));
 			check_archs[mapped_prio]      = 1;
 			check_all_archs[mapped_prio] += 1;
 		}
@@ -257,7 +237,7 @@ static void initialize_heteroprio_policy(unsigned sched_ctx_id)
 		{
 			/* Ensure the current arch use a bucket or someone else can use it */
 			STARPU_ASSERT(check_archs[idx_prio] == 1 || hp->buckets[idx_prio].valid_archs == 0
-				      || (hp->buckets[idx_prio].valid_archs & ~starpu_heteroprio_types_to_arch[arch_index]) != 0);
+				      || (hp->buckets[idx_prio].valid_archs & ~starpu_heteroprio_types_to_arch(arch_index)) != 0);
 		}
 	}
 	/* Ensure that if a valid_archs = (STARPU_CPU|STARPU_CUDA) then check_all_archs[] = 2 for example */
@@ -267,7 +247,7 @@ static void initialize_heteroprio_policy(unsigned sched_ctx_id)
 		unsigned nb_arch_on_bucket = 0;
 		for(arch_index = 0; arch_index < STARPU_NB_TYPES; ++arch_index)
 		{
-			if(hp->buckets[idx_prio].valid_archs & starpu_heteroprio_types_to_arch[arch_index])
+			if(hp->buckets[idx_prio].valid_archs & starpu_heteroprio_types_to_arch(arch_index))
 			{
 				nb_arch_on_bucket += 1;
 			}
@@ -310,32 +290,10 @@ static void add_workers_heteroprio_policy(unsigned sched_ctx_id, int *workerids,
 		memset(&hp->workers_heteroprio[workerid], 0, sizeof(hp->workers_heteroprio[workerid]));
 		/* if the worker has already belonged to this context
 		   the queue and the synchronization variables have been already initialized */
-			_starpu_prio_deque_init(&hp->workers_heteroprio[workerid].tasks_queue);
-			switch(starpu_worker_get_type(workerid))
-			{
-			case STARPU_CPU_WORKER:
-				hp->workers_heteroprio[workerid].arch_type = STARPU_CPU;
-				hp->workers_heteroprio[workerid].arch_index = STARPU_CPU_IDX;
-				break;
-			case STARPU_CUDA_WORKER:
-				hp->workers_heteroprio[workerid].arch_type = STARPU_CUDA;
-				hp->workers_heteroprio[workerid].arch_index = STARPU_CUDA_IDX;
-				break;
-			case STARPU_OPENCL_WORKER:
-				hp->workers_heteroprio[workerid].arch_type = STARPU_OPENCL;
-				hp->workers_heteroprio[workerid].arch_index = STARPU_OPENCL_IDX;
-				break;
-			case STARPU_MIC_WORKER:
-				hp->workers_heteroprio[workerid].arch_type = STARPU_MIC;
-				hp->workers_heteroprio[workerid].arch_index = STARPU_MIC_IDX;
-				break;
-			case STARPU_MPI_MS_WORKER:
-				hp->workers_heteroprio[workerid].arch_type = STARPU_MPI_MS;
-				hp->workers_heteroprio[workerid].arch_index = STARPU_MPI_MS_IDX;
-				break;
-			default:
-				STARPU_ASSERT(0);
-			}
+		enum starpu_worker_archtype arch_index = starpu_worker_get_type(workerid);
+		_starpu_prio_deque_init(&hp->workers_heteroprio[workerid].tasks_queue);
+		hp->workers_heteroprio[workerid].arch_index = arch_index;
+		hp->workers_heteroprio[workerid].arch_type = starpu_heteroprio_types_to_arch(arch_index);
 		hp->nb_workers_per_arch_index[hp->workers_heteroprio[workerid].arch_index]++;
 
 	}
@@ -379,7 +337,7 @@ static int push_task_heteroprio_policy(struct starpu_task *task)
 	for(arch_index = 0; arch_index < STARPU_NB_TYPES; ++arch_index)
 	{
 		/* We test the archs on the bucket and not on task->where since it is restrictive */
-		if(bucket->valid_archs & starpu_heteroprio_types_to_arch[arch_index])
+		if(bucket->valid_archs & starpu_heteroprio_types_to_arch(arch_index))
 			hp->nb_remaining_tasks_per_arch_index[arch_index] += 1;
 	}
 
@@ -512,7 +470,7 @@ static struct starpu_task *pop_task_heteroprio_policy(unsigned sched_ctx_id)
 				for(arch_index = 0; arch_index < STARPU_NB_TYPES; ++arch_index)
 				{
 					/* We test the archs on the bucket and not on task->where since it is restrictive */
-					if(bucket->valid_archs & starpu_heteroprio_types_to_arch[arch_index])
+					if(bucket->valid_archs & starpu_heteroprio_types_to_arch(arch_index))
 					{
 						hp->nb_remaining_tasks_per_arch_index[arch_index] -= 1;
 					}

+ 4 - 0
src/util/fstarpu.c

@@ -70,6 +70,8 @@ static const intptr_t fstarpu_sequential_consistency = STARPU_SEQUENTIAL_CONSIST
 static const intptr_t fstarpu_task_profiling_info = STARPU_TASK_PROFILING_INFO;
 static const intptr_t fstarpu_task_no_submitorder = STARPU_TASK_NO_SUBMITORDER;
 static const intptr_t fstarpu_task_sched_data = STARPU_TASK_SCHED_DATA;
+static const intptr_t fstarpu_task_file = STARPU_TASK_FILE;
+static const intptr_t fstarpu_task_line = STARPU_TASK_LINE;
 
 static const intptr_t fstarpu_value = STARPU_VALUE;
 static const intptr_t fstarpu_sched_ctx = STARPU_SCHED_CTX;
@@ -164,6 +166,8 @@ intptr_t fstarpu_get_constant(char *s)
 	else if (!strcmp(s, "FSTARPU_TASK_PROFILING_INFO"))	{ return fstarpu_task_profiling_info; }
 	else if (!strcmp(s, "FSTARPU_TASK_NO_SUBMITORDER"))	{ return fstarpu_task_no_submitorder; }
 	else if	(!strcmp(s, "FSTARPU_TASK_SCHED_DATA"))	{ return fstarpu_task_sched_data; }
+	else if	(!strcmp(s, "FSTARPU_TASK_FILE"))	{ return fstarpu_task_file; }
+	else if	(!strcmp(s, "FSTARPU_TASK_LINE"))	{ return fstarpu_task_line; }
 
 	else if (!strcmp(s, "FSTARPU_CPU_WORKER"))	{ return fstarpu_cpu_worker; }
 	else if (!strcmp(s, "FSTARPU_CUDA_WORKER"))	{ return fstarpu_cuda_worker; }

+ 5 - 0
src/util/starpu_task_insert.c

@@ -130,6 +130,7 @@ struct starpu_task *_starpu_task_build_v(struct starpu_task *ptask, struct starp
 	return (ret == 0) ? task : NULL;
 }
 
+#undef starpu_task_submit
 int _starpu_task_insert_v(struct starpu_codelet *cl, va_list varg_list)
 {
 	struct starpu_task *task;
@@ -152,6 +153,7 @@ int _starpu_task_insert_v(struct starpu_codelet *cl, va_list varg_list)
 	return ret;
 }
 
+#undef starpu_task_set
 int starpu_task_set(struct starpu_task *task, struct starpu_codelet *cl, ...)
 {
 	va_list varg_list;
@@ -162,6 +164,7 @@ int starpu_task_set(struct starpu_task *task, struct starpu_codelet *cl, ...)
 	return 0;
 }
 
+#undef starpu_task_insert
 int starpu_task_insert(struct starpu_codelet *cl, ...)
 {
 	va_list varg_list;
@@ -173,6 +176,7 @@ int starpu_task_insert(struct starpu_codelet *cl, ...)
 	return ret;
 }
 
+#undef starpu_insert_task
 int starpu_insert_task(struct starpu_codelet *cl, ...)
 {
 	va_list varg_list;
@@ -184,6 +188,7 @@ int starpu_insert_task(struct starpu_codelet *cl, ...)
 	return ret;
 }
 
+#undef starpu_task_build
 struct starpu_task *starpu_task_build(struct starpu_codelet *cl, ...)
 {
 	struct starpu_task *task;

+ 27 - 0
src/util/starpu_task_insert_utils.c

@@ -249,6 +249,14 @@ int _starpu_codelet_pack_args(void **arg_buffer, size_t *arg_buffer_size, va_lis
 		{
 			(void)va_arg(varg_list, void *);
 		}
+		else if (arg_type==STARPU_TASK_FILE)
+		{
+			(void)va_arg(varg_list, const char *);
+		}
+		else if (arg_type==STARPU_TASK_LINE)
+		{
+			(void)va_arg(varg_list, int);
+		}
 		else
 		{
 			STARPU_ABORT_MSG("Unrecognized argument %d, did you perhaps forget to end arguments with 0?\n", arg_type);
@@ -610,6 +618,14 @@ int _starpu_task_insert_create(struct starpu_codelet *cl, struct starpu_task *ta
 		{
 			task->sched_data = va_arg(varg_list, void *);
 		}
+		else if (arg_type==STARPU_TASK_FILE)
+		{
+			task->file = va_arg(varg_list, const char *);
+		}
+		else if (arg_type==STARPU_TASK_LINE)
+		{
+			task->line = va_arg(varg_list, int);
+		}
 		else
 		{
 			STARPU_ABORT_MSG("Unrecognized argument %d, did you perhaps forget to end arguments with 0?\n", arg_type);
@@ -935,6 +951,16 @@ int _fstarpu_task_insert_create(struct starpu_codelet *cl, struct starpu_task *t
 			arg_i++;
 			task->sched_data = (void*)arglist[arg_i];
 		}
+		else if (arg_type == STARPU_TASK_FILE)
+		{
+			arg_i++;
+			task->file = arglist[arg_i];
+		}
+		else if (arg_type == STARPU_TASK_LINE)
+		{
+			arg_i++;
+			task->line = *(int *)arglist[arg_i];
+		}
 		else
 		{
 			STARPU_ABORT_MSG("unknown/unsupported argument %d, did you perhaps forget to end arguments with 0?", arg_type);
@@ -981,6 +1007,7 @@ int _fstarpu_task_insert_create(struct starpu_codelet *cl, struct starpu_task *t
 }
 
 /* Fortran interface to task_insert */
+#undef starpu_task_submit
 void fstarpu_task_insert(void **arglist)
 {
 	struct starpu_codelet *cl = arglist[0];

+ 3 - 3
tests/datawizard/bcsr.c

@@ -116,9 +116,9 @@ int main(int argc, char **argv)
 	starpu_conf_init(&conf);
 
 	conf.precedence_over_environment_variables = 1;
-	conf.ncuda = 0;
-	conf.nopencl = 0;
-	conf.nmic = 0;
+	starpu_conf_noworker(&conf);
+	conf.ncpus = -1;
+	conf.nmpi_ms = -1;
 
 	if (starpu_initialize(&conf, &argc, &argv) == -ENODEV)
 		return STARPU_TEST_SKIPPED;

+ 1 - 4
tests/datawizard/noreclaim.c

@@ -84,11 +84,8 @@ int main(int argc, char **argv)
 	setenv("STARPU_LIMIT_CPU_NUMA_MEM", TOTAL, 1);
 
 	starpu_conf_init(&conf);
+	starpu_conf_noworker(&conf);
 	conf.ncpus = 1;
-	conf.ncuda = 0;
-	conf.nopencl = 0;
-	conf.nmic = 0;
-	conf.nmpi_ms = 0;
 
         ret = starpu_initialize(&conf, &argc, &argv);
 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;

+ 2 - 3
tests/disk/disk_copy.c

@@ -66,10 +66,9 @@ int dotest(struct starpu_disk_ops *ops, void *param)
 	if (ret == -EINVAL)
 		return EXIT_FAILURE;
 	conf.precedence_over_environment_variables = 1;
+	starpu_conf_noworker(&conf);
 	conf.ncpus = 1;
-	conf.ncuda = 0;
-	conf.nopencl = 0;
-	conf.nmic = 0;
+	conf.nmpi_ms = -1;
 	ret = starpu_init(&conf);
 	if (ret == -ENODEV) goto enodev;
 

+ 3 - 3
tests/disk/disk_copy_unpack.c

@@ -54,9 +54,9 @@ int dotest(struct starpu_disk_ops *ops, void *param)
 	if (ret == -EINVAL)
 		return EXIT_FAILURE;
 	conf.precedence_over_environment_variables = 1;
-	conf.ncuda = 0;
-	conf.nopencl = 0;
-	conf.nmic = 0;
+	starpu_conf_noworker(&conf);
+	conf.ncpus = -1;
+	conf.nmpi_ms = -1;
 	ret = starpu_init(&conf);
 	if (ret == -ENODEV) goto enodev;
 

+ 3 - 3
tests/disk/disk_pack.c

@@ -66,9 +66,9 @@ int dotest(struct starpu_disk_ops *ops, char *base)
 	if (ret == -EINVAL)
 		return EXIT_FAILURE;
 	conf.precedence_over_environment_variables = 1;
-	conf.ncuda = 0;
-	conf.nopencl = 0;
-	conf.nmic = 0;
+	starpu_conf_noworker(&conf);
+	conf.ncpus = -1;
+	conf.nmpi_ms = -1;
 	ret = starpu_init(&conf);
 	if (ret == -ENODEV) goto enodev;
 

+ 3 - 3
tests/disk/mem_reclaim.c

@@ -159,9 +159,9 @@ int dotest(struct starpu_disk_ops *ops, char *base, void (*vector_data_register)
 	if (ret == -EINVAL)
 		return EXIT_FAILURE;
 	conf.precedence_over_environment_variables = 1;
-	conf.ncuda = 0;
-	conf.nopencl = 0;
-	conf.nmic = 0;
+	starpu_conf_noworker(&conf);
+	conf.ncpus = -1;
+	conf.nmpi_ms = -1;
 	ret = starpu_init(&conf);
 	if (ret == -ENODEV) goto enodev;
 

+ 2 - 2
tests/energy/energy_efficiency.c

@@ -308,9 +308,9 @@ int main(int argc, char *argv[])
 	/* Initialize StarPU */
 	struct starpu_conf conf;
 	starpu_conf_init(&conf);
+	starpu_conf_noworker(&conf);
+	conf.ncpus = -1;
 
-	conf.ncuda = 0;
-	conf.nopencl = 0;
 	if (!getenv("STARPU_SCHED"))
 		conf.sched_policy_name = "dmdas";
 

+ 2 - 2
tests/errorcheck/invalid_tasks.c

@@ -51,9 +51,9 @@ int main(void)
 	struct starpu_conf conf;
 	starpu_conf_init(&conf);
 	conf.precedence_over_environment_variables = 1;
+	starpu_conf_noworker(&conf);
 	conf.ncpus = 1;
-	conf.nopencl = 0;
-	conf.ncuda = 0;
+	conf.nmpi_ms = -1;
 
 	ret = starpu_init(&conf);
 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;

+ 1 - 5
tests/errorcheck/starpu_init_noworker.c

@@ -33,11 +33,7 @@ int main(int argc, char **argv)
 	struct starpu_conf conf;
 	starpu_conf_init(&conf);
 	conf.precedence_over_environment_variables = 1;
-	conf.ncpus = 0;
-	conf.ncuda = 0;
-	conf.nopencl = 0;
-	conf.nmic = 0;
-        conf.nmpi_ms = 0;
+	starpu_conf_noworker(&conf);
 
 	/* starpu_init should return -ENODEV */
         ret = starpu_initialize(&conf, &argc, &argv);

+ 2 - 4
tests/errorcheck/workers_cpuid.c

@@ -102,10 +102,8 @@ static int test_combination(long *combination, unsigned n)
 	struct starpu_conf conf;
 	starpu_conf_init(&conf);
 	conf.precedence_over_environment_variables = 1;
-	conf.ncuda = 0;
-	conf.nopencl = 0;
-	conf.nmic = 0;
-	conf.nmpi_ms = 0;
+	starpu_conf_noworker(&conf);
+	conf.ncpus = -1;
 
 	ret = starpu_init(&conf);
 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;

+ 0 - 1
tests/fpga/MyTasksMuxManager.maxj

@@ -8,7 +8,6 @@ import com.maxeler.maxcompiler.v2.managers.custom.blocks.Mux;
 import com.maxeler.maxcompiler.v2.managers.custom.blocks.Demux;
 import com.maxeler.maxcompiler.v2.managers.custom.stdlib.LMemCommandGroup;
 import com.maxeler.maxcompiler.v2.managers.custom.stdlib.LMemInterface;
-import com.maxeler.maxcompiler.v2.managers.engine_interfaces.InterfaceParam;
 import com.maxeler.platform.max5.manager.MAX5CManager;
 
 public class MyTasksMuxManager extends MAX5CManager

+ 3 - 6
tests/main/driver_api/init_run_deinit.c

@@ -85,9 +85,8 @@ static int test_cpu(void)
 	};
 
 	conf.precedence_over_environment_variables = 1;
+	starpu_conf_noworker(&conf);
 	conf.ncpus = 1;
-	conf.ncuda = 0;
-	conf.nopencl = 0;
 	conf.not_launched_drivers = &d;
 	conf.n_not_launched_drivers = 1;
 
@@ -145,9 +144,8 @@ static int test_cuda(void)
 	};
 
 	conf.precedence_over_environment_variables = 1;
-	conf.ncpus = 0;
+	starpu_conf_noworker(&conf);
 	conf.ncuda = 1;
-	conf.nopencl = 0;
 	conf.not_launched_drivers = &d;
 	conf.n_not_launched_drivers = 1;
 
@@ -231,8 +229,7 @@ static int test_opencl(void)
 	};
 
 	conf.precedence_over_environment_variables = 1;
-	conf.ncpus = 0;
-	conf.ncuda = 0;
+	starpu_conf_noworker(&conf);
 	conf.nopencl = 1;
 	conf.not_launched_drivers = &d;
 	conf.n_not_launched_drivers = 1;

+ 3 - 5
tests/main/driver_api/run_driver.c

@@ -73,9 +73,8 @@ static int test_cpu(void)
 	conf.precedence_over_environment_variables = 1;
 	conf.n_not_launched_drivers = 1;
 	conf.not_launched_drivers = &d;
+	starpu_conf_noworker(&conf);
 	conf.ncpus = 1;
-	conf.ncuda = 0;
-	conf.nopencl = 0;
 	ret = starpu_init(&conf);
 	if (ret == -ENODEV || starpu_cpu_worker_get_count() == 0)
 	{
@@ -135,9 +134,8 @@ static int test_cuda(void)
 	conf.precedence_over_environment_variables = 1;
 	conf.n_not_launched_drivers = 1;
 	conf.not_launched_drivers = &d;
-	conf.ncpus = 0;
+	starpu_conf_noworker(&conf);
 	conf.ncuda = 1;
-	conf.nopencl = 0;
 	ret = starpu_init(&conf);
 	if (ret == -ENODEV || starpu_cuda_worker_get_count() == 0)
 	{
@@ -225,8 +223,8 @@ static int test_opencl(void)
 	conf.precedence_over_environment_variables = 1;
 	conf.n_not_launched_drivers = 1;
 	conf.not_launched_drivers = &d;
+	starpu_conf_noworker(&conf);
 	conf.ncpus = 1;
-	conf.ncuda = 0;
 	conf.nopencl = 1;
 	ret = starpu_init(&conf);
 	if (ret == -ENODEV || starpu_opencl_worker_get_count() == 0)

+ 3 - 8
tests/microbenchs/bandwidth.c

@@ -186,10 +186,7 @@ static float bench(int *argc, char ***argv, unsigned nbusy, unsigned ncpus, int
 
 	starpu_conf_init(&conf);
 	conf.precedence_over_environment_variables = 1;
-	conf.ncuda = 0;
-	conf.nopencl = 0;
-	conf.nmic = 0;
-	conf.nmpi_ms = 0;
+	starpu_conf_noworker(&conf);
 	conf.ncpus = ncpus;
 
 	if (intl && sleep == PAUSE)
@@ -284,10 +281,8 @@ int main(int argc, char **argv)
 
 	starpu_conf_init(&conf);
 	conf.precedence_over_environment_variables = 1;
-	conf.ncuda = 0;
-	conf.nopencl = 0;
-	conf.nmic = 0;
-	conf.nmpi_ms = 0;
+	starpu_conf_noworker(&conf);
+	conf.ncpus = -1;
 
 	ret = starpu_initialize(&conf, &argc, &argv);
 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;

+ 2 - 2
tests/microbenchs/tasks_size_overhead.c

@@ -165,8 +165,8 @@ int main(int argc, char **argv)
 
 	/* Get number of CPUs */
 	starpu_conf_init(&conf);
-	conf.ncuda = 0;
-	conf.nopencl = 0;
+	starpu_conf_noworker(&conf);
+	conf.ncpus = -1;
 #ifdef STARPU_SIMGRID
 	/* This will get serialized, avoid spending too much time on it. */
 	maxcpus = 2;

+ 142 - 0
tests/perfmodels/regression_based_memset.c

@@ -19,6 +19,8 @@
 #include <starpu_scheduler.h>
 #include "../helper.h"
 
+/* Print the error code with the current file and line on stderr, then return
+ * that code from the enclosing function. Wrapped in do { } while (0) so that
+ * "ERROR_RETURN(x);" behaves as a single statement, including before an else. */
+#define ERROR_RETURN(retval) do { fprintf(stderr, "Error %d %s:line %d: \n", (retval), __FILE__, __LINE__); return (retval); } while (0)
+
 /*
  * Benchmark memset with a linear and non-linear regression
  */
@@ -27,8 +29,10 @@
 #define START 1024
 #ifdef STARPU_QUICK_CHECK
 #define END 1048576
+#define NENERGY 3
 #else
 #define END 16777216
+#define NENERGY 100
 #endif
 
 #ifdef STARPU_USE_CUDA
@@ -85,6 +89,18 @@ static struct starpu_perfmodel nl_model =
 	.symbol = "non_linear_memset_regression_based"
 };
 
+/* Energy performance model of type STARPU_REGRESSION_BASED; attached to
+ * memset_cl through its .energy_model field. */
+static struct starpu_perfmodel energy_model =
+{
+	.type = STARPU_REGRESSION_BASED,
+	.symbol = "memset_regression_based_energy"
+};
+
+/* Energy performance model of type STARPU_NL_REGRESSION_BASED; attached to
+ * nl_memset_cl through its .energy_model field. */
+static struct starpu_perfmodel nl_energy_model =
+{
+	.type = STARPU_NL_REGRESSION_BASED,
+	.symbol = "non_linear_memset_regression_based_energy"
+};
+
 static struct starpu_codelet memset_cl =
 {
 #ifdef STARPU_USE_CUDA
@@ -98,6 +114,7 @@ static struct starpu_codelet memset_cl =
 	.cpu_funcs = {memset0_cpu, memset_cpu},
 	.cpu_funcs_name = {"memset0_cpu", "memset_cpu"},
 	.model = &model,
+	.energy_model = &energy_model,
 	.nbuffers = 1,
 	.modes = {STARPU_W}
 };
@@ -115,6 +132,7 @@ static struct starpu_codelet nl_memset_cl =
 	.cpu_funcs = {memset0_cpu, memset_cpu},
 	.cpu_funcs_name = {"memset0_cpu", "memset_cpu"},
 	.model = &nl_model,
+	.energy_model = &nl_energy_model,
 	.nbuffers = 1,
 	.modes = {STARPU_W}
 };
@@ -142,6 +160,84 @@ static void test_memset(int nelems, struct starpu_codelet *codelet)
         starpu_data_unregister(handle);
 }
 
+/*
+ * Submit a batch of memset tasks on freshly registered vectors of nelems ints
+ * and unregister the vectors afterwards.
+ *
+ * nelems:   number of int elements of each vector (also stored in task->flops)
+ * workerid: worker the tasks are pinned to, or -1 to leave them unpinned; in
+ *           that case the batch is NENERGY tasks per worker of type archtype
+ * where:    value stored in task->where
+ * archtype: worker type used to scale the batch when workerid is -1
+ * impl:     not used here
+ * codelet:  codelet run by every task
+ *
+ * Returns the number of tasks submitted (0 when there is nothing to submit).
+ * Exits the process with STARPU_TEST_SKIPPED if submission returns -ENODEV.
+ */
+static int test_memset_energy(int nelems, int workerid, int where, enum starpu_worker_archtype archtype, int impl, struct starpu_codelet *codelet)
+{
+	int nloops;
+	int loop;
+
+	(void)impl;
+
+	nloops = NENERGY;
+	if (workerid == -1)
+		nloops *= starpu_worker_get_count_by_type(archtype);
+
+	/* A variable-length array must have a strictly positive size, so bail
+	 * out before declaring it when no task would be submitted. */
+	if (nloops <= 0)
+		return 0;
+
+	starpu_data_handle_t handle[nloops];
+	for (loop = 0; loop < nloops; loop++)
+	{
+		struct starpu_task *task = starpu_task_create();
+		starpu_vector_data_register(&handle[loop], -1, (uintptr_t)NULL, nelems, sizeof(int));
+
+		task->cl = codelet;
+		task->where = where;
+		task->handles[0] = handle[loop];
+		task->flops = nelems;
+		if (workerid != -1)
+		{
+			task->execute_on_a_specific_worker = 1;
+			task->workerid = workerid;
+		}
+
+		int ret = starpu_task_submit(task);
+		if (ret == -ENODEV)
+			exit(STARPU_TEST_SKIPPED);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	/* NOTE(review): presumably unregistering waits for the task using each
+	 * handle to complete - confirm against the StarPU data API. */
+	for (loop = 0; loop < nloops; loop++)
+	{
+		starpu_data_unregister(handle[loop]);
+	}
+
+	return nloops;
+}
+
+/*
+ * Run the energy benchmark of a codelet for vector sizes doubling from
+ * STARTlin up to (but excluding) END.
+ *
+ * For each size, an energy measurement is started, a batch of tasks is
+ * submitted through test_memset_energy(), and the measurement is stopped with
+ * the codelet's energy model, the implementation number and the number of
+ * submitted tasks.
+ *
+ * Returns 0 on success, -1 if starpu_energy_start() fails, or the non-zero
+ * value returned by starpu_energy_stop() (reported through ERROR_RETURN).
+ */
+static int bench_energy(int workerid, int where, enum starpu_worker_archtype archtype, int impl, struct starpu_codelet *codelet)
+{
+	int size;
+	int retval;
+	int ntasks;
+
+	for (size = STARTlin; size < END; size *= 2)
+	{
+		/* Handle used only by the template task given to starpu_energy_stop() below */
+		starpu_data_handle_t handle;
+		starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, size, sizeof(int));
+
+		if ( (retval = starpu_energy_start(workerid, archtype)) != 0)
+		{
+			starpu_data_unregister(handle);
+			_STARPU_DISP("Energy measurement not supported for archtype %d\n", archtype);
+			return -1;
+		}
+
+		/* Submit the batch of measured tasks; the return value is how many were submitted */
+		ntasks = test_memset_energy(size, workerid, where, archtype, impl, codelet);
+
+		/* This task is never submitted in this function: it is only passed to
+		 * starpu_energy_stop(), presumably to describe the measured tasks
+		 * (codelet, data size, flops) - TODO confirm. */
+		struct starpu_task *task = starpu_task_create();
+		task->cl = codelet;
+		task->handles[0] = handle;
+		task->synchronous = 1;
+		task->destroy = 0;
+		task->flops = size;
+
+		retval = starpu_energy_stop(codelet->energy_model, task, impl, ntasks, workerid, archtype);
+
+		/* Release the template task and its handle before checking the result */
+		starpu_task_destroy (task);
+		starpu_data_unregister(handle);
+
+		if (retval != 0)
+			ERROR_RETURN(retval);
+	}
+	return 0;
+}
+
 static void show_task_perfs(int size, struct starpu_task *task)
 {
 	unsigned workerid;
@@ -168,6 +264,7 @@ int main(int argc, char **argv)
 	struct starpu_conf conf;
 	starpu_data_handle_t handle;
 	int ret;
+	unsigned i;
 
 	starpu_conf_init(&conf);
 
@@ -227,5 +324,50 @@ int main(int argc, char **argv)
 #endif
 	starpu_shutdown();
 
+
+	starpu_conf_init(&conf);
+
+	/* Use a scheduler which doesn't choose the implementation */
+	conf.sched_policy_name = "eager";
+	conf.calibrate = 1;
+
+	ret = starpu_initialize(&conf, &argc, &argv);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+#ifdef STARPU_USE_OPENCL
+	ret = starpu_opencl_load_opencl_from_file("tests/perfmodels/opencl_memset_kernel.cl",
+						  &opencl_program, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
+#endif
+
+	if (starpu_cpu_worker_get_count() > 0)
+	{
+		memset_cl.cpu_funcs[1] = NULL;
+		bench_energy(-1, STARPU_CPU, STARPU_CPU_WORKER, 0, &memset_cl);
+		memset_cl.cpu_funcs[1] = memset_cpu;
+		memset_cl.cpu_funcs[0] = NULL;
+		bench_energy(-1, STARPU_CPU, STARPU_CPU_WORKER, 1, &memset_cl);
+
+		nl_memset_cl.cpu_funcs[1] = NULL;
+		bench_energy(-1, STARPU_CPU, STARPU_CPU_WORKER, 0, &nl_memset_cl);
+		nl_memset_cl.cpu_funcs[1] = memset_cpu;
+		nl_memset_cl.cpu_funcs[0] = NULL;
+		bench_energy(-1, STARPU_CPU, STARPU_CPU_WORKER, 1, &nl_memset_cl);
+	}
+
+	for (i = 0; i < starpu_cuda_worker_get_count(); i++)
+	{
+		int workerid = starpu_worker_get_by_type(STARPU_CUDA_WORKER, i);
+		bench_energy(workerid, STARPU_CUDA, STARPU_CUDA_WORKER, 0, &memset_cl);
+		bench_energy(workerid, STARPU_CUDA, STARPU_CUDA_WORKER, 0, &nl_memset_cl);
+	}
+
+#ifdef STARPU_USE_OPENCL
+        ret = starpu_opencl_unload_opencl(&opencl_program);
+        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
+#endif
+	starpu_shutdown();
+
 	return EXIT_SUCCESS;
 }

+ 61 - 0
tools/dev/valgrind/fxt.suppr

@@ -56,3 +56,64 @@
    fun:fxt_setinfos
    fun:fut_setup
 }
+{
+   <insert_a_suppression_name_here>
+   Memcheck:Leak
+   match-leak-kinds: reachable
+   fun:malloc
+   fun:realloc
+   fun:fxt_next_ev
+}
+{
+   <insert_a_suppression_name_here>
+   Memcheck:Leak
+   match-leak-kinds: possible
+   fun:malloc
+   fun:fxt_blockev_enter
+}
+{
+   <insert_a_suppression_name_here>
+   Memcheck:Leak
+   match-leak-kinds: reachable
+   fun:malloc
+   fun:strdup
+   fun:fxt_fdopen
+}
+{
+   <insert_a_suppression_name_here>
+   Memcheck:Leak
+   match-leak-kinds: reachable
+   fun:realloc
+   fun:fxt_next_ev
+}
+{
+   <insert_a_suppression_name_here>
+   Memcheck:Leak
+   match-leak-kinds: reachable
+   fun:calloc
+   fun:fxt_load_time
+   fun:fxt_fdopen
+}
+{
+   <insert_a_suppression_name_here>
+   Memcheck:Leak
+   match-leak-kinds: reachable
+   fun:malloc
+   fun:fkt_load_pids
+   fun:fxt_fdopen
+}
+{
+   <insert_a_suppression_name_here>
+   Memcheck:Leak
+   match-leak-kinds: reachable
+   fun:calloc
+   fun:fxt_fdopen
+}
+{
+   <insert_a_suppression_name_here>
+   Memcheck:Leak
+   match-leak-kinds: reachable
+   fun:malloc
+   fun:fdopen@@GLIBC_2.2.5
+   fun:fxt_fdopen
+}