4 years ago · 3abc2c75c6
--- a/.gitignore
+++ b/.gitignore
@@ -20,6 +20,8 @@
 
				 ,*
			
 
				 .libs
			
 
				 .deps
			
 
				+*.orig
			
 
				+*.rej
			
 
				 *.o
			
 
				 *.lo
			
 
				 *.la
			
--- a/ChangeLog
+++ b/ChangeLog
@@ -46,6 +46,7 @@ New features:
 
				   * Add profiling based on papi performance counters.
			
 
				   * Add an experimental python interface (not actually parallel yet)
			
 
				   * Add task submission file+line in traces.
			
 
				+  * Add papi- and nvml-based energy measurement.
			
 
				 
			
 
				 Small changes:
			
 
				   * Add a synthetic energy efficiency testcase.
			
--- a/configure.ac
+++ b/configure.ac
--- a/contrib/ci.inria.fr/job-1-check.sh
+++ b/contrib/ci.inria.fr/job-1-check.sh
@@ -42,6 +42,7 @@ env > $PWD/env
 
				 test -d $basename && chmod -R u+rwX $basename && rm -rf $basename
			
 
				 tar xfz ../$tarball
			
 
				 cd $basename
			
 
				+touch configure
			
 
				 mkdir build
			
 
				 cd build
			
 
				 
			
--- a/doc/doxygen/chapters/320_scheduling.doxy
+++ b/doc/doxygen/chapters/320_scheduling.doxy
--- a/doc/doxygen/chapters/410_mpi_support.doxy
+++ b/doc/doxygen/chapters/410_mpi_support.doxy
@@ -1010,17 +1010,46 @@ data transfers and supports data matrices which do not fit in memory (out-of-cor
 
				 </li>
			
 
				 </ul>
			
 
				 
			
 
				-\section MPIImplementation Notes about the Implementation
			
 
				 
			
 
				-StarPU-MPI is implemented directly on top of MPI.
			
 
				+\section Nmad Using the NewMadeleine communication library
			
 
				+
			
 
				+NewMadeleine (see http://pm2.gforge.inria.fr/newmadeleine/, part of the PM2
			
 
				+project) is an optimizing communication library for high-performance networks.
			
 
				+NewMadeleine provides its own interface, but also an MPI interface (called
			
 
				+MadMPI). Thus there are two possibilities to use NewMadeleine with StarPU:
			
 
				+
			
 
				+<ul>
			
 
				+<li>
			
 
				+using the NewMadeleine's native interface. StarPU supports this interface from
			
 
				+its release 1.3.0, by enabling the \c configure option \ref enable-nmad
			
 
				+"--enable-nmad". In this case, StarPU relies directly on NewMadeleine to make
			
 
				+communications progress and NewMadeleine has to be built with the profile
			
 
				+<c>pukabi+madmpi.conf</c>.
			
 
				+</li>
			
 
				+<li>
			
 
				+using the NewMadeleine's MPI interface (MadMPI). StarPU will use the standard
			
 
				+MPI API and NewMadeleine will handle the calls to the MPI API. In this case,
			
 
				+StarPU makes communications progress and thus communication progress has to be
			
 
				+disabled in NewMadeleine by compiling it with the profile
			
 
				+<c>pukabi+madmpi-mini.conf</c>.
			
 
				+</li>
			
 
				+</ul>
			
 
				+
			
 
				+To build NewMadeleine, download the latest version from the website (or,
			
 
				+better, use the Git version to use the most recent version), then:
			
 
				+\code{.sh}
			
 
				+cd pm2/scripts
			
 
				+./pm2-build-packages ./<the profile you chose> --prefix=<installation prefix>
			
 
				+\endcode
			
 
				+
			
 
				+With Guix, the NewMadeleine's native interface can be used by setting the
			
 
				+parameter \c \-\-with-input=openmpi=nmad and MadMPI can be used with \c
			
 
				+\-\-with-input=openmpi=nmad-mini.
			
 
				+
			
 
				+Whatever implementation (NewMadeleine or MadMPI) is used by StarPU, the public
			
 
				+MPI interface of StarPU (described in \ref API_MPI_Support) is the same.
			
 
				 
			
 
				-Since the release 1.3.0, an implementation on top of NewMadeleine, an
			
 
				-optimizing communication library for high-performance networks, is
			
 
				-also provided. To use it, one needs to install NewMadeleine (see
			
 
				-http://pm2.gforge.inria.fr/newmadeleine/) and enable the \c configure
			
 
				-option \ref enable-nmad "--enable-nmad".
			
 
				 
			
 
				-Both implementations provide the same public API.
			
 
				 
			
 
				 \section MPIMasterSlave MPI Master Slave Support
			
 
				 
			
--- a/doc/doxygen/chapters/470_simgrid.doxy
+++ b/doc/doxygen/chapters/470_simgrid.doxy
@@ -23,7 +23,7 @@
 
				 
			
 
				 StarPU can use Simgrid in order to simulate execution on an arbitrary
			
 
				 platform. This was tested with SimGrid from 3.11 to 3.16, and 3.18 to
			
 
				-3.25. SimGrid versions 3.25 and above need to be configured with -Denable_msg=ON .
			
 
				+3.26. SimGrid version 3.25 needs to be configured with -Denable_msg=ON .
			
 
				 Other versions may have compatibility issues. 3.17 notably does not build at
			
 
				 all. MPI simulation does not work with version 3.22.
			
 
				 
			
--- a/doc/doxygen/chapters/510_configure_options.doxy
+++ b/doc/doxygen/chapters/510_configure_options.doxy
@@ -442,7 +442,7 @@ $ STARPU_SILENT=1 mpirun -np 2 ./insert_task
 
				 <dd>
			
 
				 \anchor enable-nmad
			
 
				 \addindex __configure__--enable-nmad
			
 
				-Enable the NewMadeleine implementation for StarPU-MPI.
			
 
				+Enable the NewMadeleine implementation for StarPU-MPI. See \ref Nmad for more details.
			
 
				 </dd>
			
 
				 
			
 
				 <dt>--disable-fortran</dt>
			
--- a/doc/doxygen_dev/Makefile.am
+++ b/doc/doxygen_dev/Makefile.am
@@ -172,7 +172,6 @@ dox_inputs = $(DOX_CONFIG) 				\
 
				 	$(top_srcdir)/src/common/list.h	\
			
 
				 	$(top_srcdir)/src/debug/starpu_debug_helpers.h	\
			
 
				 	$(top_srcdir)/src/debug/traces/starpu_fxt.h	\
			
 
				-	$(top_srcdir)/src/starpu_parameters.h	\
			
 
				 	$(top_srcdir)/src/sched_policies/fifo_queues.h	\
			
 
				 	$(top_srcdir)/src/sched_policies/helper_mct.h	\
			
 
				 	$(top_srcdir)/src/sched_policies/sched_component.h	\
			
--- a/doc/doxygen_dev/doxygen-config.cfg.in
+++ b/doc/doxygen_dev/doxygen-config.cfg.in
@@ -73,7 +73,6 @@ INPUT                  = @top_srcdir@/doc/doxygen_dev/chapters         \
 
				 			 @top_srcdir@/src/common/list.h \
			
 
				 			 @top_srcdir@/src/debug/starpu_debug_helpers.h \
			
 
				 			 @top_srcdir@/src/debug/traces/starpu_fxt.h \
			
 
				-			 @top_srcdir@/src/starpu_parameters.h \
			
 
				 			 @top_srcdir@/src/sched_policies/fifo_queues.h \
			
 
				 			 @top_srcdir@/src/sched_policies/helper_mct.h \
			
 
				 			 @top_srcdir@/src/sched_policies/sched_component.h \
			
--- a/doc/doxygen_dev/refman.tex
+++ b/doc/doxygen_dev/refman.tex
@@ -148,7 +148,6 @@ Documentation License”.
 
				 \input{starpu__data__cpy_8h}
			
 
				 \input{starpu__debug__helpers_8h}
			
 
				 \input{starpu__fxt_8h}
			
 
				-\input{starpu__parameters_8h}
			
 
				 \input{starpu__spinlock_8h}
			
 
				 \input{starpu__task__insert__utils_8h}
			
 
				 \input{tags_8h}
			
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -18,12 +18,15 @@
 
				 #
			
 
				 include $(top_srcdir)/starpu.mk
			
 
				 
			
 
				-AM_CFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(GLOBAL_AM_CFLAGS) -Wno-unused
			
 
				-AM_CXXFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(GLOBAL_AM_CXXFLAGS) -Wno-unused
			
 
				-AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include
			
 
				+AM_CFLAGS += $(MAGMA_CFLAGS) -Wno-unused
			
 
				+AM_CXXFLAGS += $(MAGMA_CFLAGS) -Wno-unused
			
 
				+AM_FFLAGS += $(MAGMA_CFLAGS) -Wno-unused
			
 
				+AM_FCFLAGS += $(MAGMA_CFLAGS) -Wno-unused
			
 
				+AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include $(STARPU_H_CPPFLAGS)
			
 
				 AM_LDFLAGS = @STARPU_EXPORT_DYNAMIC@
			
 
				-LIBS += $(top_builddir)/src/@LIBSTARPU_LINK@ $(MAGMA_LIBS) $(HWLOC_LIBS) $(FXT_LIBS)
			
 
				-LIBS += $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)
			
 
				+LIBS += $(top_builddir)/src/@LIBSTARPU_LINK@ $(STARPU_EXPORTED_LIBS)
			
 
				+LIBS += $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS)
			
 
				+LIBS += $(MAGMA_LIBS)
			
 
				 
			
 
				 SUBDIRS = stencil
			
 
				 
			
--- a/examples/cpp/add_vectors_interface.cpp
+++ b/examples/cpp/add_vectors_interface.cpp
@@ -61,9 +61,9 @@ class my_allocator
 
				 		node = a.get_node();
			
 
				 	}
			
 
				 
			
 
				-	explicit my_allocator(const unsigned node)
			
 
				+	explicit my_allocator(const unsigned thenode)
			
 
				 	{
			
 
				-		this->node = node;
			
 
				+		this->node = thenode;
			
 
				 	}
			
 
				 
			
 
				 	pointer allocate(size_type n, const void * = 0)
			
@@ -175,10 +175,8 @@ static const struct starpu_data_copy_methods vector_cpp_copy_data_methods_s =
 
				 
			
 
				 	.cuda_to_ram = NULL,
			
 
				 	.cuda_to_cuda = NULL,
			
 
				-	.cuda_to_opencl = NULL,
			
 
				 
			
 
				 	.opencl_to_ram = NULL,
			
 
				-	.opencl_to_cuda = NULL,
			
 
				 	.opencl_to_opencl = NULL,
			
 
				 
			
 
				 	.mic_to_ram = NULL,
			
@@ -216,11 +214,9 @@ static const struct starpu_data_copy_methods vector_cpp_copy_data_methods_s =
 
				 
			
 
				 	NULL,
			
 
				 	NULL,
			
 
				-	NULL,
			
 
				 
			
 
				 	NULL,
			
 
				 	NULL,
			
 
				-	NULL,
			
 
				 
			
 
				 	NULL,
			
 
				 
			
--- a/examples/fortran90/f90_example.f90
+++ b/examples/fortran90/f90_example.f90
@@ -33,8 +33,7 @@ PROGRAM f90_example
 
				   TYPE(type_mesh_elt),POINTER    :: elt   => NULL()
			
 
				   INTEGER(KIND=C_INT)            :: i,Nelt,res,cpus
			
 
				   INTEGER(KIND=C_INT)            :: starpu_maj,starpu_min,starpu_rev
			
 
				-  INTEGER(KIND=C_INT)            :: neq,ng,nb,it,it_tot
			
 
				-  REAL(KIND=C_DOUBLE)            :: r, coeff2
			
 
				+  INTEGER(KIND=C_INT)            :: it,it_tot
			
 
				 
			
 
				   !Initialization with arbitrary data
			
 
				   Nelt           = 2
			
--- a/examples/gl_interop/gl_interop.c
+++ b/examples/gl_interop/gl_interop.c
@@ -106,9 +106,8 @@ int main(int argc, char **argv)
 
				 
			
 
				 	/* Enable OpenGL interoperability */
			
 
				 	starpu_conf_init(&conf);
			
 
				+	starpu_conf_noworker(&conf);
			
 
				 	conf.ncuda = 1;
			
 
				-	conf.ncpus = 0;
			
 
				-	conf.nopencl = 0;
			
 
				 	conf.cuda_opengl_interoperability = cuda_devices;
			
 
				 	conf.n_cuda_opengl_interoperability = sizeof(cuda_devices) / sizeof(*cuda_devices);
			
 
				 	conf.not_launched_drivers = drivers;
			
--- a/examples/gl_interop/gl_interop_idle.c
+++ b/examples/gl_interop/gl_interop_idle.c
@@ -122,9 +122,8 @@ int main(int argc, char **argv)
 
				 
			
 
				 	/* Enable OpenGL interoperability */
			
 
				 	starpu_conf_init(&conf);
			
 
				+	starpu_conf_noworker(&conf);
			
 
				 	conf.ncuda = 1;
			
 
				-	conf.ncpus = 0;
			
 
				-	conf.nopencl = 0;
			
 
				 	conf.cuda_opengl_interoperability = cuda_devices;
			
 
				 	conf.n_cuda_opengl_interoperability = sizeof(cuda_devices) / sizeof(*cuda_devices);
			
 
				 	conf.not_launched_drivers = drivers;
			
--- a/examples/matvecmult/matvecmult.c
+++ b/examples/matvecmult/matvecmult.c
@@ -142,9 +142,7 @@ int main(void)
 
				 	struct starpu_conf conf;
			
 
				 
			
 
				 	starpu_conf_init(&conf);
			
 
				-	conf.ncpus = 0;
			
 
				-	conf.ncuda = 0;
			
 
				-	conf.nmic = 0;
			
 
				+	starpu_conf_noworker(&conf);
			
 
				 	conf.nopencl = 1;
			
 
				 
			
 
				         /* int width=1100; */
			
--- a/examples/native_fortran/nf_example.f90
+++ b/examples/native_fortran/nf_example.f90
@@ -32,8 +32,7 @@ PROGRAM f90_example
 
				   TYPE(type_mesh_elt),POINTER    :: elt   => NULL()
			
 
				   INTEGER(KIND=C_INT)            :: i,Nelt,res,cpus
			
 
				   INTEGER(KIND=C_INT)            :: starpu_maj,starpu_min,starpu_rev
			
 
				-  INTEGER(KIND=C_INT)            :: neq,ng,nb,it,it_tot
			
 
				-  REAL(KIND=C_DOUBLE)            :: r, coeff2
			
 
				+  INTEGER(KIND=C_INT)            :: it,it_tot
			
 
				   REAL(KIND=C_DOUBLE),TARGET     :: flops
			
 
				 
			
 
				   TYPE(C_PTR) :: cl_loop_element = C_NULL_PTR ! loop codelet
			
--- a/examples/perf_steering/perf_knobs_03.c
+++ b/examples/perf_steering/perf_knobs_03.c
@@ -37,11 +37,8 @@ int main(int argc, char **argv)
 
				 
			
 
				 	struct starpu_conf conf;
			
 
				 	starpu_conf_init(&conf);
			
 
				+	starpu_conf_noworker(&conf);
			
 
				 	conf.ncpus = 2;
			
 
				-	conf.ncuda = 0;
			
 
				-	conf.nopencl = 0;
			
 
				-	conf.nmic = 0;
			
 
				-	conf.nmpi_ms = 0;
			
 
				 	{
			
 
				 		const char *sched_pol_name = starpu_getenv("STARPU_SCHED");
			
 
				 		if (sched_pol_name != NULL && strcmp(sched_pol_name, "prio") != 0)
			
--- a/examples/scheduler/heteroprio_test.c
+++ b/examples/scheduler/heteroprio_test.c
@@ -30,33 +30,33 @@ void initSchedulerCallback(unsigned sched_ctx)
 
				 #ifdef STARPU_USE_CPU
			
 
				 	if (starpu_cpu_worker_get_count())
			
 
				 	{
			
 
				-		starpu_heteroprio_set_nb_prios(0, STARPU_CPU_IDX, 3);
			
 
				+		starpu_heteroprio_set_nb_prios(0, STARPU_CPU_WORKER, 3);
			
 
				 		// It uses direct mapping idx => idx
			
 
				 		unsigned idx;
			
 
				 		for(idx = 0; idx < 3; ++idx)
			
 
				 		{
			
 
				-			starpu_heteroprio_set_mapping(sched_ctx, STARPU_CPU_IDX, idx, idx);
			
 
				-			starpu_heteroprio_set_faster_arch(sched_ctx, STARPU_CPU_IDX, idx);
			
 
				+			starpu_heteroprio_set_mapping(sched_ctx, STARPU_CPU_WORKER, idx, idx);
			
 
				+			starpu_heteroprio_set_faster_arch(sched_ctx, STARPU_CPU_WORKER, idx);
			
 
				 		}
			
 
				 	}
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				 	// OpenCL is enabled and uses 2 buckets
			
 
				-	starpu_heteroprio_set_nb_prios(sched_ctx, STARPU_OPENCL_IDX, 2);
			
 
				+	starpu_heteroprio_set_nb_prios(sched_ctx, STARPU_OPENCL_WORKER, 2);
			
 
				 	// OpenCL will first look to priority 2
			
 
				 	int prio2 = starpu_cpu_worker_get_count() ? 2 : 1;
			
 
				-	starpu_heteroprio_set_mapping(sched_ctx, STARPU_OPENCL_IDX, 0, prio2);
			
 
				+	starpu_heteroprio_set_mapping(sched_ctx, STARPU_OPENCL_WORKER, 0, prio2);
			
 
				 	// For this bucket OpenCL is the fastest
			
 
				-	starpu_heteroprio_set_faster_arch(sched_ctx, STARPU_OPENCL_IDX, prio2);
			
 
				+	starpu_heteroprio_set_faster_arch(sched_ctx, STARPU_OPENCL_WORKER, prio2);
			
 
				 	// And CPU is 4 times slower
			
 
				 #ifdef STARPU_USE_CPU
			
 
				-	starpu_heteroprio_set_arch_slow_factor(sched_ctx, STARPU_CPU_IDX, 2, 4.0f);
			
 
				+	starpu_heteroprio_set_arch_slow_factor(sched_ctx, STARPU_CPU_WORKER, 2, 4.0f);
			
 
				 #endif
			
 
				 
			
 
				 	int prio1 = starpu_cpu_worker_get_count() ? 1 : 0;
			
 
				-	starpu_heteroprio_set_mapping(sched_ctx, STARPU_OPENCL_IDX, 1, prio1);
			
 
				+	starpu_heteroprio_set_mapping(sched_ctx, STARPU_OPENCL_WORKER, 1, prio1);
			
 
				 	// We let the CPU as the fastest and tell that OpenCL is 1.7 times slower
			
 
				-	starpu_heteroprio_set_arch_slow_factor(sched_ctx, STARPU_OPENCL_IDX, prio1, 1.7f);
			
 
				+	starpu_heteroprio_set_arch_slow_factor(sched_ctx, STARPU_OPENCL_WORKER, prio1, 1.7f);
			
 
				 #endif
			
 
				 }
			
 
				 
			
--- a/examples/stencil/Makefile.am
+++ b/examples/stencil/Makefile.am
@@ -15,11 +15,13 @@
 
				 #
			
 
				 include $(top_srcdir)/starpu.mk
			
 
				 
			
 
				-AM_CFLAGS = $(HWLOC_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(GLOBAL_AM_CFLAGS) -Wno-unused
			
 
				-AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include
			
 
				+AM_CFLAGS += -Wno-unused
			
 
				+
			
 
				+AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include $(STARPU_H_CPPFLAGS)
			
 
				 AM_LDFLAGS = @STARPU_EXPORT_DYNAMIC@
			
 
				-LIBS += $(top_builddir)/src/@LIBSTARPU_LINK@ $(HWLOC_LIBS) $(FXT_LIBS)
			
 
				-LIBS += $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)
			
 
				+LIBS += $(top_builddir)/src/@LIBSTARPU_LINK@
			
 
				+LIBS += $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS)
			
 
				+LIBS += $(STARPU_EXPORTED_LIBS)
			
 
				 
			
 
				 if STARPU_USE_MPI
			
 
				 LIBS += $(top_builddir)/mpi/src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
			
--- a/examples/tag_example/tag_example.c
+++ b/examples/tag_example/tag_example.c
@@ -223,7 +223,8 @@ int main(int argc, char **argv)
 
				 	int ret;
			
 
				 
			
 
				 #ifdef STARPU_HAVE_HELGRIND_H
			
 
				-	if (RUNNING_ON_VALGRIND) {
			
 
				+	if (RUNNING_ON_VALGRIND)
			
 
				+	{
			
 
				 		ni /= 2;
			
 
				 		nj /= 2;
			
 
				 		nk /= 2;
			
--- a/include/schedulers/starpu_heteroprio.h
+++ b/include/schedulers/starpu_heteroprio.h
@@ -25,49 +25,23 @@ extern "C"
 
				 #endif
			
 
				 
			
 
				 #define STARPU_HETEROPRIO_MAX_PRIO 100
			
 
				-/* #define STARPU_NB_TYPES 3 */
			
 
				-/* #define STARPU_CPU_IDX 0 */
			
 
				-/* #define STARPU_CUDA_IDX 1 */
			
 
				-/* #define STARPU_OPENCL_IDX 2 */
			
 
				 
			
 
				 #define STARPU_HETEROPRIO_MAX_PREFETCH 2
			
 
				 #if STARPU_HETEROPRIO_MAX_PREFETCH <= 0
			
 
				 #error STARPU_HETEROPRIO_MAX_PREFETCH == 1 means no prefetch so STARPU_HETEROPRIO_MAX_PREFETCH must >= 1
			
 
				 #endif
			
 
				 
			
 
				-enum starpu_heteroprio_types
			
 
				-{
			
 
				-// First will be zero
			
 
				-	STARPU_CPU_IDX, // = 0
			
 
				-	STARPU_CUDA_IDX,
			
 
				-	STARPU_OPENCL_IDX,
			
 
				-	STARPU_MIC_IDX,
			
 
				-	STARPU_MPI_MS_IDX,
			
 
				-// This will be the number of archs
			
 
				-	STARPU_NB_TYPES
			
 
				-};
			
 
				-
			
 
				-static const unsigned starpu_heteroprio_types_to_arch[STARPU_NB_TYPES+1] =
			
 
				-{
			
 
				-	STARPU_CPU,
			
 
				-	STARPU_CUDA,
			
 
				-	STARPU_OPENCL,
			
 
				-	STARPU_MIC,
			
 
				-        STARPU_MPI_MS,
			
 
				-	0
			
 
				-};
			
 
				-
			
 
				 /** Tell how many prio there are for a given arch */
			
 
				-void starpu_heteroprio_set_nb_prios(unsigned sched_ctx_id, enum starpu_heteroprio_types arch, unsigned max_prio);
			
 
				+void starpu_heteroprio_set_nb_prios(unsigned sched_ctx_id, enum starpu_worker_archtype arch, unsigned max_prio);
			
 
				 
			
 
				 /** Set the mapping for a given arch prio=>bucket */
			
 
				-void starpu_heteroprio_set_mapping(unsigned sched_ctx_id, enum starpu_heteroprio_types arch, unsigned source_prio, unsigned dest_bucket_id);
			
 
				+void starpu_heteroprio_set_mapping(unsigned sched_ctx_id, enum starpu_worker_archtype arch, unsigned source_prio, unsigned dest_bucket_id);
			
 
				 
			
 
				 /** Tell which arch is the faster for the tasks of a bucket (optional) */
			
 
				-void starpu_heteroprio_set_faster_arch(unsigned sched_ctx_id, enum starpu_heteroprio_types arch, unsigned bucket_id);
			
 
				+void starpu_heteroprio_set_faster_arch(unsigned sched_ctx_id, enum starpu_worker_archtype arch, unsigned bucket_id);
			
 
				 
			
 
				 /** Tell how slow is a arch for the tasks of a bucket (optional) */ 
			
 
				-void starpu_heteroprio_set_arch_slow_factor(unsigned sched_ctx_id, enum starpu_heteroprio_types arch, unsigned bucket_id, float slow_factor);
			
 
				+void starpu_heteroprio_set_arch_slow_factor(unsigned sched_ctx_id, enum starpu_worker_archtype arch, unsigned bucket_id, float slow_factor);
			
 
				 
			
 
				 #ifdef __cplusplus
			
 
				 }
			
--- a/include/starpu.h
+++ b/include/starpu.h
@@ -486,6 +486,17 @@ struct starpu_conf
 
				 int starpu_conf_init(struct starpu_conf *conf);
			
 
				 
			
 
				 /**
			
 
				+   Set fields of \p conf so that no worker is enabled, i.e. set
			
 
				+   starpu_conf::ncpus = 0, starpu_conf::ncuda = 0, etc.
			
 
				+
			
 
				+   This allows to portably enable only a given type of worker:
			
 
				+
			
 
				+   starpu_conf_noworker(&conf);
			
 
				+   conf.ncpus = -1;
			
 
				+*/
			
 
				+int starpu_conf_noworker(struct starpu_conf *conf);
			
 
				+
			
 
				+/**
			
 
				    StarPU initialization method, must be called prior to any other
			
 
				    StarPU call. It is possible to specify StarPU’s configuration (e.g.
			
 
				    scheduling policy, number of cores, ...) by passing a
			
--- a/include/starpu_data_interfaces.h
+++ b/include/starpu_data_interfaces.h
@@ -144,13 +144,6 @@ struct starpu_data_copy_methods
 
				 
			
 
				 	/**
			
 
				 	   Define how to copy data from the \p src_interface interface on the
			
 
				-	   \p src_node CUDA node to the \p dst_interface interface on the \p
			
 
				-	   dst_node OpenCL node. Return 0 on success.
			
 
				-	*/
			
 
				-	int (*cuda_to_opencl)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
			
 
				-
			
 
				-	/**
			
 
				-	   Define how to copy data from the \p src_interface interface on the
			
 
				 	   \p src_node OpenCL node to the \p dst_interface interface on the
			
 
				 	   \p dst_node CPU node. Return 0 on success.
			
 
				 	*/
			
@@ -159,13 +152,6 @@ struct starpu_data_copy_methods
 
				 	/**
			
 
				 	   Define how to copy data from the \p src_interface interface on the
			
 
				 	   \p src_node OpenCL node to the \p dst_interface interface on the
			
 
				-	   \p dst_node CUDA node. Return 0 on success.
			
 
				-	*/
			
 
				-	int (*opencl_to_cuda)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
			
 
				-
			
 
				-	/**
			
 
				-	   Define how to copy data from the \p src_interface interface on the
			
 
				-	   \p src_node OpenCL node to the \p dst_interface interface on the
			
 
				 	   \p dst_node OpenCL node. Return 0 on success.
			
 
				 	*/
			
 
				 	int (*opencl_to_opencl)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
			
--- a/include/starpu_driver.h
+++ b/include/starpu_driver.h
@@ -33,7 +33,13 @@ extern "C"
 
				 */
			
 
				 
			
 
				 /**
			
 
				-   structure for a driver
			
 
				+   Pre-initialize drivers
			
 
				+   So as to register information on device types, memory types, etc.
			
 
				+*/
			
 
				+void starpu_drivers_preinit(void);
			
 
				+
			
 
				+/**
			
 
				+   structure for designating a given driver
			
 
				 */
			
 
				 struct starpu_driver
			
 
				 {
			
@@ -51,8 +57,6 @@ struct starpu_driver
 
				 		unsigned cuda_id;
			
 
				 #if defined(STARPU_USE_OPENCL) && !defined(__CUDACC__)
			
 
				 		cl_device_id opencl_id;
			
 
				-#else
			
 
				-		unsigned opencl_id;
			
 
				 #endif
			
 
				 	} id;
			
 
				 };
			
--- a/include/starpu_perfmodel.h
+++ b/include/starpu_perfmodel.h
@@ -35,8 +35,6 @@ extern "C"
 
				 struct starpu_task;
			
 
				 struct starpu_data_descr;
			
 
				 
			
 
				-#define STARPU_NARCH STARPU_ANY_WORKER
			
 
				-
			
 
				 /**
			
 
				    todo
			
 
				 */
			
@@ -319,6 +317,29 @@ void starpu_perfmodel_init(struct starpu_perfmodel *model);
 
				 int starpu_perfmodel_deinit(struct starpu_perfmodel *model);
			
 
				 
			
 
				 /**
			
 
				+   starpu_energy_start - start counting hardware events in an event set
			
 
				+
			
 
				+   - \p workerid is the worker on which calibration is to be performed (in the case of GPUs, use -1 for CPUs)
			
 
				+   - \p archi is the type of architecture on which calibration will be run
			
 
				+*/
			
 
				+
			
 
				+int starpu_energy_start(int workerid, enum starpu_worker_archtype archi);
			
 
				+
			
 
				+/**
			
 
				+   starpu_energy_stop - stop counting hardware events in an event set
			
 
				+
			
 
				+   - \p model is the energy performance model to be filled with the result
			
 
				+   - \p task is a task specimen, so the performance model folds the result according to the parameter sizes of the task.
			
 
				+   - \p nimpl is the implementation number run during calibration
			
 
				+   - \p ntasks is the number of tasks run during calibration
			
 
				+   - \p workerid is the worker on which calibration was performed (in the case of GPUs, use -1 for CPUs)
			
 
				+   - \p archi is the type of architecture on which calibration was run
			
 
				+*/
			
 
				+
			
 
				+int starpu_energy_stop(struct starpu_perfmodel *model, struct starpu_task *task, unsigned nimpl, unsigned ntasks, int workerid, enum starpu_worker_archtype archi);
			
 
				+
			
 
				+
			
 
				+/**
			
 
				    Load the performance model found in the file named \p filename. \p model has to be
			
 
				    completely zero, and will be filled with the information stored in the given file.
			
 
				 */
			
@@ -385,7 +406,7 @@ int starpu_perfmodel_set_per_devices_size_base(struct starpu_perfmodel *model, i
 
				 */
			
 
				 void starpu_perfmodel_debugfilepath(struct starpu_perfmodel *model, struct starpu_perfmodel_arch *arch, char *path, size_t maxlen, unsigned nimpl);
			
 
				 
			
 
				-char* starpu_perfmodel_get_archtype_name(enum starpu_worker_archtype archtype);
			
 
				+const char* starpu_perfmodel_get_archtype_name(enum starpu_worker_archtype archtype);
			
 
				 
			
 
				 /**
			
 
				    Return the architecture name for \p arch
			
@@ -414,16 +435,32 @@ int starpu_perfmodel_print_estimations(struct starpu_perfmodel *model, uint32_t
 
				 int starpu_perfmodel_list_combs(FILE *output, struct starpu_perfmodel *model);
			
 
				 
			
 
				 /**
			
 
				-   Feed the performance model model with an explicit
			
 
				-   measurement measured (in µs), in addition to measurements done by StarPU
			
 
				+   Feed the performance model \p model with one explicit
			
 
				+   measurement (in µs or J), in addition to measurements done by StarPU
			
 
				    itself. This can be useful when the application already has an
			
 
				    existing set of measurements done in good conditions, that StarPU
			
 
				    could benefit from instead of doing on-line measurements. An example
			
 
				    of use can be seen in \ref PerformanceModelExample.
			
 
				+
			
 
				+   Note that this records only one measurement, and StarPU would ignore
			
 
				+   the first measurement (since it is usually disturbed by library loading
			
 
				+   etc.). Make sure to call this function several times to record all your
			
 
				+   measurements.
			
 
				+
			
 
				+   You can also call starpu_perfmodel_update_history_n() to directly provide an
			
 
				+   average performed on several tasks.
			
 
				 */
			
 
				 void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned cpuid, unsigned nimpl, double measured);
			
 
				 
			
 
				 /**
			
 
				+   Feed the performance model \p model with an explicit average measurement (in µs or J).
			
 
				+
			
 
				+   This is similar to starpu_perfmodel_update_history(), but records a batch of
			
 
				+   \p number measurements provided as the average of the measurements \p average_measured.
			
 
				+*/
			
 
				+void starpu_perfmodel_update_history_n(struct starpu_perfmodel *model, struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned cpuid, unsigned nimpl, double average_measured, unsigned number);
			
 
				+
			
 
				+/**
			
 
				    Print the directory name storing performance models on \p output
			
 
				 */
			
 
				 void starpu_perfmodel_directory(FILE *output);
			
--- a/include/starpu_task.h
+++ b/include/starpu_task.h
@@ -48,39 +48,43 @@ extern "C"
 
				 #define STARPU_NOWHERE	((1ULL)<<0)
			
 
				 
			
 
				 /**
			
 
				+  Convert from enum starpu_worker_archtype to worker type mask for use in "where" fields
			
 
				+  */
			
 
				+#define STARPU_WORKER_TO_MASK(worker_archtype) (1ULL << (worker_archtype + 1))
			
 
				+/**
			
 
				    To be used when setting the field starpu_codelet::where (or
			
 
				    starpu_task::where) to specify the codelet (or the task) may be
			
 
				    executed on a CPU processing unit.
			
 
				 */
			
 
				-#define STARPU_CPU	((1ULL)<<1)
			
 
				+#define STARPU_CPU	STARPU_WORKER_TO_MASK(STARPU_CPU_WORKER)
			
 
				 
			
 
				 /**
			
 
				    To be used when setting the field starpu_codelet::where (or
			
 
				    starpu_task::where) to specify the codelet (or the task) may be
			
 
				    executed on a CUDA processing unit.
			
 
				 */
			
 
				-#define STARPU_CUDA	((1ULL)<<3)
			
 
				+#define STARPU_CUDA	STARPU_WORKER_TO_MASK(STARPU_CUDA_WORKER)
			
 
				 
			
 
				 /**
			
 
				    To be used when setting the field starpu_codelet::where (or
			
 
				    starpu_task::where) to specify the codelet (or the task) may be
			
 
				    executed on a OpenCL processing unit.
			
 
				 */
			
 
				-#define STARPU_OPENCL	((1ULL)<<6)
			
 
				+#define STARPU_OPENCL	STARPU_WORKER_TO_MASK(STARPU_OPENCL_WORKER)
			
 
				 
			
 
				 /**
			
 
				    To be used when setting the field starpu_codelet::where (or
			
 
				    starpu_task::where) to specify the codelet (or the task) may be
			
 
				    executed on a MIC processing unit.
			
 
				 */
			
 
				-#define STARPU_MIC	((1ULL)<<7)
			
 
				+#define STARPU_MIC	STARPU_WORKER_TO_MASK(STARPU_MIC_WORKER)
			
 
				 
			
 
				 /**
			
 
				    To be used when setting the field starpu_codelet::where (or
			
 
				    starpu_task::where) to specify the codelet (or the task) may be
			
 
				    executed on a MPI Slave processing unit.
			
 
				 */
			
 
				-#define STARPU_MPI_MS	((1ULL)<<9)
			
 
				+#define STARPU_MPI_MS	STARPU_WORKER_TO_MASK(STARPU_MPI_MS_WORKER)
			
 
				 
			
 
				 /**
			
 
				    Value to be set in starpu_codelet::flags to execute the codelet
			
--- a/include/starpu_worker.h
+++ b/include/starpu_worker.h
@@ -49,7 +49,8 @@ enum starpu_node_kind
 
				 	STARPU_OPENCL_RAM=3,
			
 
				 	STARPU_DISK_RAM=4,
			
 
				 	STARPU_MIC_RAM=5,
			
 
				-	STARPU_MPI_MS_RAM=6
			
 
				+	STARPU_MPI_MS_RAM=6,
			
 
				+	STARPU_MAX_RAM=6
			
 
				 };
			
 
				 
			
 
				 /**
			
@@ -66,7 +67,8 @@ enum starpu_worker_archtype
 
				 	STARPU_OPENCL_WORKER=2,     /**< OpenCL device */
			
 
				 	STARPU_MIC_WORKER=3,        /**< Intel MIC device */
			
 
				 	STARPU_MPI_MS_WORKER=5,     /**< MPI Slave device */
			
 
				-	STARPU_ANY_WORKER=6         /**< any worker, used in the hypervisor */
			
 
				+	STARPU_NARCH = 6,           /**< Number of arch types */
			
 
				+	STARPU_ANY_WORKER=255       /**< any worker, used in the hypervisor */
			
 
				 };
			
 
				 
			
 
				 /**
			
@@ -312,7 +314,12 @@ unsigned starpu_worker_is_slave_somewhere(int workerid);
 
				 /**
			
 
				    Return worker \p type as a string.
			
 
				 */
			
 
				-char *starpu_worker_get_type_as_string(enum starpu_worker_archtype type);
			
 
				+const char *starpu_worker_get_type_as_string(enum starpu_worker_archtype type);
			
 
				+
			
 
				+/**
			
 
				+   Return worker \p type as a string suitable for environment variable names (CPU, CUDA, etc.)
			
 
				+*/
			
 
				+const char *starpu_worker_get_type_as_env_var(enum starpu_worker_archtype type);
			
 
				 
			
 
				 int starpu_bindid_get_workerids(int bindid, int **workerids);
			
 
				 
			
@@ -375,6 +382,17 @@ int starpu_memory_nodes_numa_devid_to_id(unsigned id);
 
				 enum starpu_node_kind starpu_node_get_kind(unsigned node);
			
 
				 
			
 
				 /**
			
 
				+   Return the type of worker which operates on memory node kind \p node_kind
			
 
				+  */
			
 
				+enum starpu_worker_archtype starpu_memory_node_get_worker_archtype(enum starpu_node_kind node_kind);
			
 
				+
			
 
				+/**
			
 
				+   Return the type of memory node that arch type \p type operates on
			
 
				+  */
			
 
				+enum starpu_node_kind starpu_worker_get_memory_node_kind(enum starpu_worker_archtype type);
			
 
				+
			
 
				+
			
 
				+/**
			
 
				    @name Scheduling operations
			
 
				    @{
			
 
				 */
			
--- a/julia/examples/Makefile.am
+++ b/julia/examples/Makefile.am
@@ -85,7 +85,7 @@ if STARPU_USE_CUDA
 
				 if STARPU_COVERITY
			
 
				 include $(top_srcdir)/starpu-mynvcc.mk
			
 
				 else
			
 
				-NVCCFLAGS += --compiler-options -fno-strict-aliasing  -I$(top_srcdir)/include/ -I$(top_builddir)/include/ $(HWLOC_CFLAGS)
			
 
				+NVCCFLAGS += --compiler-options -fno-strict-aliasing  -I$(top_srcdir)/include/ -I$(top_builddir)/include/ $(STARPU_H_CPPFLAGS)
			
 
				 
			
 
				 .cu.cubin:
			
 
				 	$(V_nvcc) $(NVCC) -cubin $< -o $@ $(NVCCFLAGS)
			
@@ -95,10 +95,12 @@ NVCCFLAGS += --compiler-options -fno-strict-aliasing  -I$(top_srcdir)/include/ -
 
				 endif
			
 
				 endif
			
 
				 
			
 
				-AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS) -Wno-unused
			
 
				-AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include
			
 
				-LIBS += $(top_builddir)/src/@LIBSTARPU_LINK@ ../src/libstarpujulia-@STARPU_EFFECTIVE_VERSION@.la -lm $(FXT_LIBS) $(MAGMA_LIBS)
			
 
				-LIBS += $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)
			
 
				+AM_CFLAGS += $(MAGMA_CFLAGS) -Wno-unused
			
 
				+AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include $(STARPU_H_CPPFLAGS)
			
 
				+AM_LDFLAGS = @STARPU_EXPORT_DYNAMIC@
			
 
				+LIBS += $(top_builddir)/src/@LIBSTARPU_LINK@ ../src/libstarpujulia-@STARPU_EFFECTIVE_VERSION@.la $(STARPU_EXPORTED_LIBS)
			
 
				+LIBS += $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS)
			
 
				+LIBS += -lm
			
 
				 
			
 
				 check_PROGRAMS = $(LOADER) $(starpu_julia_EXAMPLES)
			
 
				 SHELL_TESTS	=
			
--- a/julia/src/Makefile.am
+++ b/julia/src/Makefile.am
@@ -18,10 +18,9 @@ include $(top_srcdir)/starpu-notests.mk
 
				 
			
 
				 CLEANFILES = *.gcno *.gcda
			
 
				 
			
 
				-AM_CFLAGS = $(GLOBAL_AM_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(HWLOC_CFLAGS) $(FXT_CFLAGS) -fPIC
			
 
				-AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_builddir)/include -I$(top_builddir)/src -I$(top_srcdir)/src -I$(top_srcdir)/julia/src
			
 
				-LIBS += $(top_builddir)/src/@LIBSTARPU_LINK@ `@JULIA@ $(top_srcdir)/julia/src/openblas_ldflags.jl`
			
 
				-LIBS += $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS)
			
 
				+AM_CFLAGS += -fPIC
			
 
				+AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_builddir)/include -I$(top_builddir)/src -I$(top_srcdir)/src -I$(top_srcdir)/julia/src $(STARPU_H_CPPFLAGS)
			
 
				+LIBS += $(top_builddir)/src/@LIBSTARPU_LINK@ `@JULIA@ $(top_srcdir)/julia/src/openblas_ldflags.jl` $(STARPU_EXPORTED_LIBS)
			
 
				 
			
 
				 SUBDIRS = dynamic_compiler
			
 
				 
			
--- a/julia/src/dynamic_compiler/Makefile.am
+++ b/julia/src/dynamic_compiler/Makefile.am
@@ -14,14 +14,15 @@
 
				 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				 #
			
 
				 
			
 
				+include $(top_srcdir)/starpu-notests.mk
			
 
				+
			
 
				 LD=$(CC_OR_NVCC)
			
 
				 AM_CPPFLAGS = -I$(abs_top_srcdir)/include/ -I$(abs_top_builddir)/src -I$(abs_top_builddir)/include \
			
 
				-	 -I$(abs_top_srcdir)/julia/src/
			
 
				+	 -I$(abs_top_srcdir)/julia/src/ $(STARPU_H_CPPFLAGS)
			
 
				 
			
 
				-AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS)
			
 
				-AM_CFLAGS += -fPIC -O3 -g -DSTRIDE=${STRIDE} -Wall -mavx -fomit-frame-pointer -march=native -ffast-math
			
 
				+AM_CFLAGS += -fPIC -DSTRIDE=${STRIDE} -mavx -fomit-frame-pointer -march=native -ffast-math
			
 
				 LIBS += $(top_builddir)/src/@LIBSTARPU_LINK@
			
 
				-LIBS += -L @STARPU_BUILD_DIR@/julia/src/.libs/ -lstarpujulia-1.3
			
 
				+LIBS += -L $(top_builddir)/julia/src/libstarpujulia-$(STARPU_EFFECTIVE_VERSION).1.3
			
 
				 CUDA_CFLAGS = $(STARPU_CUDA_CPPFLAGS) -Wno-deprecated-gpu-targets
			
 
				 EXTERNLIB=extern_tasks.so
			
 
				 GENERATEDLIB=generated_tasks.so
			
--- a/libstarpu-mic.pc.in
+++ b/libstarpu-mic.pc.in
@@ -22,6 +22,6 @@ Name: starpu
 
				 Description: offers support for heterogeneous multicore architecture
			
 
				 Version: @PACKAGE_VERSION@
			
 
				 Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_CPPFLAGS@ @SIMGRID_CFLAGS@ -DSTARPU_USE_DEPRECATED_API
			
 
				-Libs: @STARPU_EXPORT_DYNAMIC@ -L${libdir} -lstarpu-@STARPU_EFFECTIVE_VERSION@ @STARPU_SC_HYPERVISOR@ @STARPU_EXPORTED_LIBS@
			
 
				+Libs: @STARPU_EXPORT_DYNAMIC@ -L${libdir} -lstarpu-@STARPU_EFFECTIVE_VERSION@ @STARPU_EXPORTED_LIBS@ @STARPU_SC_HYPERVISOR@
			
 
				 Libs.private: @LDFLAGS@ @LIBS@ @LIBSTARPU_LDFLAGS@
			
 
				 Requires: @HWLOC_REQUIRES@
			
--- a/libstarpu.pc.in
+++ b/libstarpu.pc.in
@@ -16,12 +16,13 @@
 
				 prefix=@prefix@
			
 
				 exec_prefix=@exec_prefix@
			
 
				 libdir=@libdir@
			
 
				+pkglibdir=@pkglibdir@
			
 
				 includedir=@includedir@
			
 
				 
			
 
				 Name: starpu
			
 
				 Description: offers support for heterogeneous multicore architecture
			
 
				 Version: @PACKAGE_VERSION@
			
 
				-Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_CPPFLAGS@ @SIMGRID_CFLAGS@ -DSTARPU_USE_DEPRECATED_API -DSTARPU_USE_DEPRECATED_ONE_ZERO_API
			
 
				-Libs: @STARPU_EXPORT_DYNAMIC@ -L${libdir} -lstarpu-@STARPU_EFFECTIVE_VERSION@ @STARPU_SC_HYPERVISOR@ @STARPU_EXPORTED_LIBS@
			
 
				+Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@ @STARPU_H_CPPFLAGS@ -DSTARPU_USE_DEPRECATED_API -DSTARPU_USE_DEPRECATED_ONE_ZERO_API
			
 
				+Libs: @STARPU_EXPORT_DYNAMIC@ -L${libdir} -lstarpu-@STARPU_EFFECTIVE_VERSION@ @STARPU_EXPORTED_LIBS@ @STARPU_SC_HYPERVISOR@
			
 
				 Libs.private: @LDFLAGS@ @LIBS@ @LIBSTARPU_LDFLAGS@
			
 
				 Requires: @HWLOC_REQUIRES@
			
--- a/m4/libs.m4
+++ b/m4/libs.m4
@@ -103,7 +103,7 @@ AC_DEFUN([IS_SUPPORTED_CFLAG],
 
				 
			
 
				 # IS_SUPPORTED_CXXFLAG(flag)
			
 
				 # ------------------------
			
 
				-# Check if the CFLAGS `flag' is supported by the compiler
			
 
				+# Check if the CXXFLAGS `flag' is supported by the compiler
			
 
				 AC_DEFUN([IS_SUPPORTED_CXXFLAG],
			
 
				 [
			
 
				 	AC_REQUIRE([AC_PROG_CXX])
			
@@ -119,7 +119,7 @@ AC_DEFUN([IS_SUPPORTED_CXXFLAG],
 
				 			[[AC_LANG_SOURCE([const char *hello = "Hello World";])]]
			
 
				 		),
			
 
				 		[
			
 
				-			m4_default_nblank([$2], [GLOBAL_AM_CFLAGS="$GLOBAL_AM_CXXFLAGS $1"])
			
 
				+			m4_default_nblank([$2], [GLOBAL_AM_CXXFLAGS="$GLOBAL_AM_CXXFLAGS $1"])
			
 
				 			AC_MSG_RESULT(yes)
			
 
				 			option_available=1
			
 
				 		],
			
@@ -192,10 +192,21 @@ AC_DEFUN([IS_SUPPORTED_FCFLAG],
 
				 	AC_LANG_POP([Fortran])
			
 
				 ])
			
 
				 
			
 
				+# IS_SUPPORTED_FLAG(flag)
			
 
				+# ------------------------
			
 
				+# Check with C, C++, F77 and F90 that the `flag' is supported by the compiler
			
 
				+AC_DEFUN([IS_SUPPORTED_FLAG],
			
 
				+[
			
 
				+	IS_SUPPORTED_CFLAG($1)
			
 
				+	IS_SUPPORTED_CXXFLAG($1)
			
 
				+	IS_SUPPORTED_FFLAG($1)
			
 
				+	IS_SUPPORTED_FCFLAG($1)
			
 
				+])
			
 
				+
			
 
				 # AC_PYTHON_MODULE(modulename, [action-if-found], [action-if-not-found])
			
 
				 # Check if the given python module is available
			
 
				 AC_DEFUN([AC_PYTHON_MODULE],
			
 
				-	[
			
 
				+[
			
 
				 	echo "import $1" | $PYTHON - 2>/dev/null
			
 
				 	if test $? -ne 0 ; then
			
 
				 	   	$3
			
--- a/mpi/examples/Makefile.am
+++ b/mpi/examples/Makefile.am
@@ -97,7 +97,7 @@ if STARPU_USE_CUDA
 
				 if STARPU_COVERITY
			
 
				 include $(top_srcdir)/starpu-mynvcc.mk
			
 
				 else
			
 
				-NVCCFLAGS += --compiler-options -fno-strict-aliasing  -I$(top_srcdir)/include/ -I$(top_builddir)/include/ $(HWLOC_CFLAGS)
			
 
				+NVCCFLAGS += --compiler-options -fno-strict-aliasing  -I$(top_srcdir)/include/ -I$(top_builddir)/include/ $(STARPU_H_CPPFLAGS)
			
 
				 
			
 
				 .cu.cubin:
			
 
				 	$(V_nvcc) $(NVCC) -cubin $< -o $@ $(NVCCFLAGS)
			
@@ -107,10 +107,15 @@ NVCCFLAGS += --compiler-options -fno-strict-aliasing  -I$(top_srcdir)/include/ -
 
				 endif
			
 
				 endif
			
 
				 
			
 
				-AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS) -Wno-unused
			
 
				-AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include -I$(top_srcdir)/mpi/include
			
 
				-LIBS += $(top_builddir)/src/@LIBSTARPU_LINK@ ../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la -lm $(FXT_LIBS) $(MAGMA_LIBS)
			
 
				-LIBS += $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)
			
 
				+AM_CFLAGS += $(MAGMA_CFLAGS) -Wno-unused
			
 
				+AM_CXXFLAGS += $(MAGMA_CFLAGS) -Wno-unused
			
 
				+AM_FFLAGS += $(MAGMA_CFLAGS) -Wno-unused
			
 
				+AM_FCFLAGS += $(MAGMA_CFLAGS) -Wno-unused
			
 
				+AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include -I$(top_srcdir)/mpi/include $(STARPU_H_CPPFLAGS)
			
 
				+AM_LDFLAGS = @STARPU_EXPORT_DYNAMIC@
			
 
				+LIBS += $(top_builddir)/src/@LIBSTARPU_LINK@ ../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la $(STARPU_EXPORTED_LIBS)
			
 
				+LIBS += $(STARPU_CUDA_LDFLAGS)
			
 
				+LIBS += -lm $(MAGMA_LIBS)
			
 
				 
			
 
				 ###################
			
 
				 # Stencil example #
			
--- a/mpi/src/Makefile.am
+++ b/mpi/src/Makefile.am
@@ -23,11 +23,12 @@ BUILT_SOURCES =
 
				 
			
 
				 CLEANFILES = *.gcno *.gcda *.linkinfo
			
 
				 
			
 
				-AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS) $(NMAD_CFLAGS)
			
 
				-AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/src/ -I$(top_builddir)/src -I$(top_builddir)/include -I$(top_srcdir)/mpi/include -I$(top_srcdir)/mpi/src
			
 
				-LIBS += $(top_builddir)/src/@LIBSTARPU_LINK@ $(FXT_LIBS) $(MAGMA_LIBS) $(NMAD_LIBS)
			
 
				-LIBS += $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS) $(NMAD_LDFLAGS)
			
 
				-LIBS += $(MPICC_LDFLAGS) $(FXT_LDFLAGS)
			
 
				+AM_CFLAGS += $(FXT_CFLAGS) $(NMAD_CFLAGS)
			
 
				+AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/src/ -I$(top_builddir)/src -I$(top_builddir)/include -I$(top_srcdir)/mpi/include -I$(top_srcdir)/mpi/src $(STARPU_H_CPPFLAGS)
			
 
				+LIBS += $(top_builddir)/src/@LIBSTARPU_LINK@ $(STARPU_EXPORTED_LIBS)
			
 
				+LIBS += $(FXT_LDFLAGS) $(FXT_LIBS)
			
 
				+LIBS += $(NMAD_LDFLAGS) $(NMAD_LIBS)
			
 
				+LIBS += $(MPICC_LDFLAGS)
			
 
				 
			
 
				 ldflags =
			
 
				 
			
--- a/mpi/tests/Makefile.am
+++ b/mpi/tests/Makefile.am
@@ -73,7 +73,7 @@ if STARPU_USE_CUDA
 
				 if STARPU_COVERITY
			
 
				 include $(top_srcdir)/starpu-mynvcc.mk
			
 
				 else
			
 
				-NVCCFLAGS += --compiler-options -fno-strict-aliasing  -I$(top_srcdir)/include/ -I$(top_builddir)/include/ $(HWLOC_CFLAGS)
			
 
				+NVCCFLAGS += --compiler-options -fno-strict-aliasing  -I$(top_srcdir)/include/ -I$(top_builddir)/include/ $(STARPU_H_CPPFLAGS)
			
 
				 
			
 
				 .cu.cubin:
			
 
				 	$(V_nvcc) $(NVCC) -cubin $< -o $@ $(NVCCFLAGS)
			
@@ -83,10 +83,14 @@ NVCCFLAGS += --compiler-options -fno-strict-aliasing  -I$(top_srcdir)/include/ -
 
				 endif
			
 
				 endif
			
 
				 
			
 
				-AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS) -Wno-unused
			
 
				-AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_builddir)/include -I$(top_srcdir)/mpi/include -I$(top_srcdir)/mpi/src -I$(top_srcdir)/src -I$(top_builddir)/src -I$(top_srcdir)/examples/
			
 
				-LIBS += $(top_builddir)/src/@LIBSTARPU_LINK@ ../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la $(FXT_LIBS) $(MAGMA_LIBS)
			
 
				-LIBS += $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)
			
 
				+AM_CFLAGS += -Wno-unused
			
 
				+AM_CXXFLAGS += -Wno-unused
			
 
				+AM_FFLAGS += -Wno-unused
			
 
				+AM_FCFLAGS += -Wno-unused
			
 
				+AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_builddir)/include -I$(top_srcdir)/mpi/include -I$(top_srcdir)/mpi/src -I$(top_srcdir)/src -I$(top_builddir)/src -I$(top_srcdir)/examples/ $(STARPU_H_CPPFLAGS)
			
 
				+AM_LDFLAGS = @STARPU_EXPORT_DYNAMIC@
			
 
				+LIBS += $(STARPU_CUDA_LDFLAGS)
			
 
				+LIBS += $(top_builddir)/src/@LIBSTARPU_LINK@ ../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la $(STARPU_EXPORTED_LIBS)
			
 
				 
			
 
				 ########################
			
 
				 # Unit testcases       #
			
--- a/mpi/tools/Makefile.am
+++ b/mpi/tools/Makefile.am
@@ -20,11 +20,9 @@ include $(top_srcdir)/starpu-notests.mk
 
				 
			
 
				 SUBDIRS =
			
 
				 
			
 
				-AM_CFLAGS = $(HWLOC_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(STARPU_COI_CPPFLAGS) $(GLOBAL_AM_CFLAGS)
			
 
				-AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/tools/ -I$(top_srcdir)/mpi/ -I$(top_srcdir)/mpi/include -I$(top_builddir)/src -I$(top_srcdir)/src -DSTARPU_REPLAY_MPI
			
 
				+AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/tools/ -I$(top_srcdir)/mpi/ -I$(top_srcdir)/mpi/include -I$(top_builddir)/src -I$(top_srcdir)/src -DSTARPU_REPLAY_MPI $(STARPU_H_CPPFLAGS)
			
 
				 AM_LDFLAGS = @STARPU_EXPORT_DYNAMIC@
			
 
				-LIBS += $(top_builddir)/src/@LIBSTARPU_LINK@ $(top_builddir)/mpi/src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la $(FXT_LIBS)
			
 
				-LIBS += $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)
			
 
				+LIBS += $(top_builddir)/src/@LIBSTARPU_LINK@ $(top_builddir)/mpi/src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la $(STARPU_EXPORTED_LIBS)
			
 
				 
			
 
				 CC=$(CC_OR_MPICC)
			
 
				 CCLD=$(CC_OR_MPICC)
			
--- a/sc_hypervisor/examples/Makefile.am
+++ b/sc_hypervisor/examples/Makefile.am
@@ -15,10 +15,11 @@
 
				 #
			
 
				 include $(top_srcdir)/starpu.mk
			
 
				 
			
 
				-AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS)
			
 
				-AM_CPPFLAGS = -I$(top_srcdir)/include -I$(top_srcdir)/examples -I$(top_builddir)/include -I$(top_srcdir)/sc_hypervisor/include -I$(top_srcdir)/sc_hypervisor/examples
			
 
				-LIBS += $(top_builddir)/src/@LIBSTARPU_LINK@ $(top_builddir)/sc_hypervisor/src/libsc_hypervisor.la
			
 
				-LIBS += $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(STARPU_GLPK_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)
			
 
				+AM_CFLAGS += $(MAGMA_CFLAGS)
			
 
				+AM_CPPFLAGS = -I$(top_srcdir)/include -I$(top_srcdir)/examples -I$(top_builddir)/include -I$(top_srcdir)/sc_hypervisor/include -I$(top_srcdir)/sc_hypervisor/examples $(STARPU_H_CPPFLAGS)
			
 
				+AM_LDFLAGS = @STARPU_EXPORT_DYNAMIC@
			
 
				+LIBS += $(top_builddir)/src/@LIBSTARPU_LINK@ $(top_builddir)/sc_hypervisor/src/libsc_hypervisor.la $(STARPU_EXPORTED_LIBS)
			
 
				+LIBS += $(STARPU_CUDA_LDFLAGS)
			
 
				 
			
 
				 noinst_PROGRAMS =				\
			
 
				 	app_driven_test/app_driven_test		\
			
--- a/sc_hypervisor/src/Makefile.am
+++ b/sc_hypervisor/src/Makefile.am
@@ -16,10 +16,8 @@
 
				 
			
 
				 include $(top_srcdir)/starpu-notests.mk
			
 
				 
			
 
				-AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS)
			
 
				-AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include -I$(top_srcdir)/sc_hypervisor/include/ -I$(top_srcdir)/sc_hypervisor/src
			
 
				-LIBS += $(top_builddir)/src/@LIBSTARPU_LINK@
			
 
				-LIBS += $(STARPU_CUDA_LDFLAGS) $(STARPU_OPENCL_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)
			
 
				+AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include -I$(top_srcdir)/sc_hypervisor/include/ -I$(top_srcdir)/sc_hypervisor/src $(STARPU_H_CPPFLAGS)
			
 
				+LIBS += $(top_builddir)/src/@LIBSTARPU_LINK@ $(STARPU_EXPORTED_LIBS)
			
 
				 
			
 
				 lib_LTLIBRARIES = libsc_hypervisor.la
			
 
				 
			
--- a/socl/examples/Makefile.am
+++ b/socl/examples/Makefile.am
@@ -15,9 +15,11 @@
 
				 #
			
 
				 include $(top_srcdir)/starpu.mk
			
 
				 
			
 
				-AM_CFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(GLOBAL_AM_CFLAGS)
			
 
				-LIBS += $(top_builddir)/socl/src/libsocl-@STARPU_EFFECTIVE_VERSION@.la $(top_builddir)/src/@LIBSTARPU_LINK@
			
 
				-LIBS += $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS)
			
 
				+AM_CFLAGS += $(MAGMA_CFLAGS)
			
 
				+AM_CPPFLAGS = $(STARPU_H_CPPFLAGS)
			
 
				+AM_LDFLAGS = @STARPU_EXPORT_DYNAMIC@
			
 
				+LIBS += $(top_builddir)/socl/src/libsocl-@STARPU_EFFECTIVE_VERSION@.la $(top_builddir)/src/@LIBSTARPU_LINK@ $(STARPU_EXPORTED_LIBS)
			
 
				+LIBS += $(STARPU_OPENCL_LDFLAGS)
			
 
				 
			
 
				 
			
 
				 SOCL_EXAMPLES	=
			
--- a/socl/src/Makefile.am
+++ b/socl/src/Makefile.am
@@ -18,10 +18,9 @@ include $(top_srcdir)/starpu-notests.mk
 
				 
			
 
				 CLEANFILES = *.gcno *.gcda
			
 
				 
			
 
				-AM_CFLAGS = $(GLOBAL_AM_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(HWLOC_CFLAGS) $(FXT_CFLAGS)
			
 
				-AM_CPPFLAGS = -DBUILDING_SOCL -I$(top_srcdir)/include/ -I$(top_builddir)/include -I$(top_builddir)/src -I$(top_srcdir)/src -I$(top_srcdir)/socl/src
			
 
				-LIBS += $(top_builddir)/src/@LIBSTARPU_LINK@ \
			
 
				-	    $(STARPU_OPENCL_LDFLAGS) $(FXT_LDFLAGS)
			
 
				+AM_CPPFLAGS = -DBUILDING_SOCL -I$(top_srcdir)/include/ -I$(top_builddir)/include -I$(top_builddir)/src -I$(top_srcdir)/src -I$(top_srcdir)/socl/src $(STARPU_H_CPPFLAGS)
			
 
				+LIBS += $(top_builddir)/src/@LIBSTARPU_LINK@ $(STARPU_EXPORTED_LIBS)
			
 
				+LIBS += $(STARPU_OPENCL_LDFLAGS)
			
 
				 
			
 
				 SUBDIRS =
			
 
				 
			
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -17,9 +17,10 @@
 
				 
			
 
				 include $(top_srcdir)/starpu-notests.mk
			
 
				 
			
 
				-AM_CFLAGS = $(GLOBAL_AM_CFLAGS)
			
 
				-AM_CPPFLAGS = -I$(top_srcdir)/include/ -DBUILDING_STARPU  $(HWLOC_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(STARPU_COI_CPPFLAGS) $(STARPU_SCIF_CPPFLAGS) $(STARPU_RCCE_CFLAGS) $(STARPU_RCCE_CPPFLAGS) -DSTARPU_DATADIR='"$(datadir)"'
			
 
				-LIBS += -lm $(FXT_LDFLAGS) $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(LIBSTARPU_LDFLAGS) $(PAPI_LIBS)
			
 
				+AM_CPPFLAGS = -I$(top_srcdir)/include/ -DBUILDING_STARPU -DSTARPU_DATADIR='"$(datadir)"'
			
 
				+AM_CPPFLAGS += $(STARPU_H_CPPFLAGS)
			
 
				+AM_CPPFLAGS += $(FXT_CFLAGS) $(STARPU_COI_CPPFLAGS) $(STARPU_SCIF_CPPFLAGS) $(STARPU_RCCE_CFLAGS) $(STARPU_RCCE_CPPFLAGS)
			
 
				+LIBS += -lm $(LIBSTARPU_LDFLAGS)
			
 
				 
			
 
				 SUBDIRS =
			
 
				 
			
@@ -59,8 +60,7 @@ endif STARPU_HAVE_WINDOWS
 
				 
			
 
				 lib_LTLIBRARIES = libstarpu-@STARPU_EFFECTIVE_VERSION@.la
			
 
				 
			
 
				-libstarpu_@STARPU_EFFECTIVE_VERSION@_la_LDFLAGS = $(ldflags) -no-undefined									\
			
 
				-  -version-info $(libstarpu_so_version)
			
 
				+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_LDFLAGS = $(ldflags) -no-undefined -version-info $(libstarpu_so_version)
			
 
				 
			
 
				 if STARPU_HAVE_DARWIN
			
 
				 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_LDFLAGS += \
			
@@ -154,7 +154,6 @@ noinst_HEADERS = 						\
 
				 	util/starpu_clusters_create.h				\
			
 
				 	util/starpu_task_insert_utils.h				\
			
 
				 	util/starpu_data_cpy.h					\
			
 
				-	starpu_parameters.h					\
			
 
				 	sched_policies/prio_deque.h				\
			
 
				 	sched_policies/sched_component.h
			
 
				 
			
@@ -196,6 +195,7 @@ libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES = 		\
 
				 	core/disk_ops/disk_unistd.c                             \
			
 
				 	core/disk_ops/unistd/disk_unistd_global.c		\
			
 
				 	core/perfmodel/perfmodel_history.c			\
			
 
				+        core/perfmodel/energy_model.c                           \
			
 
				 	core/perfmodel/perfmodel_bus.c				\
			
 
				 	core/perfmodel/perfmodel.c				\
			
 
				 	core/perfmodel/perfmodel_print.c			\
			
@@ -325,6 +325,7 @@ endif
 
				 
			
 
				 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/cpu/driver_cpu.c
			
 
				 
			
 
				+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/cuda/driver_cuda_init.c
			
 
				 if STARPU_USE_CUDA
			
 
				 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/cuda/driver_cuda.c
			
 
				 else
			
@@ -336,6 +337,7 @@ endif
 
				 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/cuda/starpu_cublas.c
			
 
				 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/cuda/starpu_cusparse.c
			
 
				 
			
 
				+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/opencl/driver_opencl_init.c
			
 
				 if STARPU_USE_OPENCL
			
 
				 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/opencl/driver_opencl.c
			
 
				 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/opencl/driver_opencl_utils.c
			
@@ -380,6 +382,7 @@ endif
 
				 #										#
			
 
				 #########################################
			
 
				 
			
 
				+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/mic/driver_mic_init.c
			
 
				 if STARPU_USE_MIC
			
 
				 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/mic/driver_mic_common.c
			
 
				 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/mic/driver_mic_source.c
			
@@ -393,6 +396,7 @@ endif
 
				 #                                       #
			
 
				 #########################################
			
 
				 
			
 
				+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/mpi/driver_mpi_init.c
			
 
				 if STARPU_USE_MPI_MASTER_SLAVE
			
 
				 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/mpi/driver_mpi_common.c
			
 
				 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/mpi/driver_mpi_source.c
			
--- a/src/common/fxt.h
+++ b/src/common/fxt.h
@@ -37,12 +37,8 @@
 
				 #include <starpu.h>
			
 
				 
			
 
				 /* some key to identify the worker kind */
			
 
				-#define _STARPU_FUT_APPS_KEY	0x100
			
 
				-#define _STARPU_FUT_CPU_KEY	0x101
			
 
				-#define _STARPU_FUT_CUDA_KEY	0x102
			
 
				-#define _STARPU_FUT_OPENCL_KEY	0x103
			
 
				-#define _STARPU_FUT_MIC_KEY	0x104
			
 
				-#define _STARPU_FUT_MPI_KEY	0x106
			
 
				+#define _STARPU_FUT_WORKER_KEY(kind) (kind + 0x100)
			
 
				+#define _STARPU_FUT_KEY_WORKER(key) (key - 0x100)
			
 
				 
			
 
				 #define _STARPU_FUT_WORKER_INIT_START	0x5100
			
 
				 #define _STARPU_FUT_WORKER_INIT_END	0x5101
			
@@ -737,10 +733,9 @@ do {									\
 
				 		FUT_DO_ALWAYS_PROBE2(FUT_NEW_LWP_CODE, cpuid, _starpu_gettid()); \
			
 
				 } while (0)
			
 
				 
			
 
				-/* workerkind = _STARPU_FUT_CPU_KEY for instance */
			
 
				 #define _STARPU_TRACE_WORKER_INIT_START(workerkind, workerid, devid, memnode, bindid, sync)	do {\
			
 
				 	if (_starpu_fxt_started) \
			
 
				-		FUT_DO_ALWAYS_PROBE7(_STARPU_FUT_WORKER_INIT_START, workerkind, workerid, devid, memnode, bindid, sync, _starpu_gettid()); \
			
 
				+		FUT_DO_ALWAYS_PROBE7(_STARPU_FUT_WORKER_INIT_START, _STARPU_FUT_WORKER_KEY(workerkind), workerid, devid, memnode, bindid, sync, _starpu_gettid()); \
			
 
				 } while (0)
			
 
				 
			
 
				 #define _STARPU_TRACE_WORKER_INIT_END(__workerid)		do {\
			
@@ -930,7 +925,7 @@ do {										\
 
				 
			
 
				 #define _STARPU_TRACE_WORKER_DEINIT_END(workerkind)		do {\
			
 
				 	if (_starpu_fxt_started) \
			
 
				-		FUT_DO_ALWAYS_PROBE2(_STARPU_FUT_WORKER_DEINIT_END, workerkind, _starpu_gettid()); \
			
 
				+		FUT_DO_ALWAYS_PROBE2(_STARPU_FUT_WORKER_DEINIT_END, _STARPU_FUT_WORKER_KEY(workerkind), _starpu_gettid()); \
			
 
				 } while(0)
			
 
				 
			
 
				 #define _STARPU_TRACE_WORKER_SCHEDULING_START	\
			
--- a/src/common/rbtree_i.h
+++ b/src/common/rbtree_i.h
@@ -44,7 +44,8 @@
 
				  * architectures, as long as the nodes aren't embedded in structures with
			
 
				  * special alignment constraints such as member packing.
			
 
				  */
			
 
				-struct starpu_rbtree_node {
			
 
				+struct starpu_rbtree_node
			
 
				+{
			
 
				     uintptr_t parent;
			
 
				     struct starpu_rbtree_node *children[2];
			
 
				 };
			
@@ -52,7 +53,8 @@ struct starpu_rbtree_node {
 
				 /**
			
 
				  * Red-black tree structure.
			
 
				  */
			
 
				-struct starpu_rbtree {
			
 
				+struct starpu_rbtree
			
 
				+{
			
 
				     struct starpu_rbtree_node *root;
			
 
				 };
			
 
				 
			
--- a/src/common/thread.c
+++ b/src/common/thread.c
@@ -96,14 +96,22 @@ int starpu_pthread_create_on(const char *name, starpu_pthread_t *thread, const s
 
				 	if (attr && attr->stacksize)
			
 
				 		sg_actor_set_stacksize(*thread, attr->stacksize);
			
 
				 #endif
			
 
				+#ifdef HAVE_SG_ACTOR_SET_DATA
			
 
				+	sg_actor_set_data(*thread, tsd);
			
 
				+#else
			
 
				 	sg_actor_data_set(*thread, tsd);
			
 
				+#endif
			
 
				 	sg_actor_start(*thread, _starpu_simgrid_thread_start, 2, _args);
			
 
				 #else
			
 
				 	*thread = MSG_process_create_with_arguments(name, _starpu_simgrid_thread_start, tsd, host, 2, _args);
			
 
				 #ifdef HAVE_SG_ACTOR_DATA
			
 
				+#ifdef HAVE_SG_ACTOR_SET_DATA
			
 
				+	sg_actor_set_data(*thread, tsd);
			
 
				+#else
			
 
				 	sg_actor_data_set(*thread, tsd);
			
 
				 #endif
			
 
				 #endif
			
 
				+#endif
			
 
				 #ifndef HAVE_SG_ACTOR_SET_STACKSIZE
			
 
				 	if (attr && attr->stacksize)
			
 
				 		_starpu_simgrid_set_stack_size(_starpu_default_stack_size);
			
@@ -328,7 +336,9 @@ extern void *smpi_process_get_user_data();
 
				 int starpu_pthread_setspecific(starpu_pthread_key_t key, const void *pointer)
			
 
				 {
			
 
				 	void **array;
			
 
				-#ifdef HAVE_SG_ACTOR_DATA
			
 
				+#ifdef HAVE_SG_ACTOR_GET_DATA
			
 
				+	array = sg_actor_get_data(sg_actor_self());
			
 
				+#elif defined(HAVE_SG_ACTOR_DATA)
			
 
				 	array = sg_actor_data(sg_actor_self());
			
 
				 #else
			
 
				 #if defined(HAVE_SMPI_PROCESS_SET_USER_DATA) || defined(smpi_process_get_user_data)
			
@@ -355,7 +365,9 @@ int starpu_pthread_setspecific(starpu_pthread_key_t key, const void *pointer)
 
				 void* starpu_pthread_getspecific(starpu_pthread_key_t key)
			
 
				 {
			
 
				 	void **array;
			
 
				-#ifdef HAVE_SG_ACTOR_DATA
			
 
				+#ifdef HAVE_SG_ACTOR_GET_DATA
			
 
				+	array = sg_actor_get_data(sg_actor_self());
			
 
				+#elif defined(HAVE_SG_ACTOR_DATA)
			
 
				 	array = sg_actor_data(sg_actor_self());
			
 
				 #else
			
 
				 #if defined(HAVE_SMPI_PROCESS_SET_USER_DATA) || defined(smpi_process_get_user_data)
			
--- a/src/core/dependencies/cg.c
+++ b/src/core/dependencies/cg.c
@@ -221,7 +221,8 @@ void _starpu_notify_cg(void *pred STARPU_ATTRIBUTE_UNUSED, struct _starpu_cg *cg
 
				 					tag_successors->ndeps_completed = 0;
			
 
				 					/* This releases the lock */
			
 
				 					_starpu_tag_set_ready(tag);
			
 
				-				} else
			
 
				+				}
			
 
				+				else
			
 
				 					_starpu_spin_unlock(&tag->lock);
			
 
				 				break;
			
 
				 			}
			
--- a/src/core/detect_combined_workers.c
+++ b/src/core/detect_combined_workers.c
@@ -252,7 +252,7 @@ static void find_and_assign_combinations_without_hwloc(int *workerids, int nwork
 
				 	int cpu_workers[STARPU_NMAXWORKERS];
			
 
				 	unsigned ncpus = 0;
			
 
				 #ifdef STARPU_USE_MIC
			
 
				-	unsigned nb_mics = _starpu_get_machine_config()->topology.nmicdevices;
			
 
				+	unsigned nb_mics = _starpu_get_machine_config()->topology.ndevices[STARPU_MIC_WORKER];
			
 
				 	unsigned * nmics_table;
			
 
				 	int * mic_id;
			
 
				 	int ** mic_workers;
			
--- a/src/core/disk_ops/disk_leveldb.cpp
+++ b/src/core/disk_ops/disk_leveldb.cpp
@@ -27,7 +27,6 @@
 
				 #include <core/perfmodel/perfmodel.h>
			
 
				 #include <datawizard/copy_driver.h>
			
 
				 #include <datawizard/memory_manager.h>
			
 
				-#include <starpu_parameters.h>
			
 
				 
			
 
				 #define NITER	_starpu_calibration_minimum
			
 
				 
			
--- a/src/core/disk_ops/disk_stdio.c
+++ b/src/core/disk_ops/disk_stdio.c
@@ -28,7 +28,6 @@
 
				 #include <datawizard/copy_driver.h>
			
 
				 #include <datawizard/memory_manager.h>
			
 
				 #include <datawizard/memory_nodes.h>
			
 
				-#include <starpu_parameters.h>
			
 
				 
			
 
				 #ifdef STARPU_HAVE_WINDOWS
			
 
				 #  include <io.h>
			
--- a/src/core/disk_ops/unistd/disk_unistd_global.c
+++ b/src/core/disk_ops/unistd/disk_unistd_global.c
@@ -37,7 +37,6 @@
 
				 #include <datawizard/copy_driver.h>
			
 
				 #include <datawizard/data_request.h>
			
 
				 #include <datawizard/memory_manager.h>
			
 
				-#include <starpu_parameters.h>
			
 
				 #include <common/uthash.h>
			
 
				 
			
 
				 #ifdef STARPU_HAVE_WINDOWS
			
--- a/src/core/jobs.c
+++ b/src/core/jobs.c
@@ -347,19 +347,10 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 
				 				_starpu_spin_unlock(&handle->header_lock);
			
 
				 		}
			
 
				 	}
			
 
				+
			
 
				 	/* Check nowhere before releasing the sequential consistency (which may
			
 
				 	 * unregister the handle and free its switch_cl, and thus task->cl here.  */
			
 
				 	unsigned nowhere = !task->cl || task->cl->where == STARPU_NOWHERE || task->where == STARPU_NOWHERE;
			
 
				-	/* If this is a continuation, we do not release task dependencies now.
			
 
				-	 * Task dependencies will be released only when the continued task
			
 
				-	 * fully completes */
			
 
				-	if (!continuation)
			
 
				-	{
			
 
				-		/* Tell other tasks that we don't exist any more, thus no need for
			
 
				-		 * implicit dependencies any more.  */
			
 
				-		_starpu_release_task_enforce_sequential_consistency(j);
			
 
				-	}
			
 
				-
			
 
				 	/* If the job was executed on a combined worker there is no need for the
			
 
				 	 * scheduler to process it : the task structure doesn't contain any valuable
			
 
				 	 * data as it's not linked to an actual worker */
			
@@ -395,6 +386,16 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 
				 	if (!callback && task->cl)
			
 
				 		callback = task->cl->callback_func;
			
 
				 
			
 
				+	/* If this is a continuation, we do not release task dependencies now.
			
 
				+	 * Task dependencies will be released only when the continued task
			
 
				+	 * fully completes */
			
 
				+	if (!continuation)
			
 
				+	{
			
 
				+		/* Tell other tasks that we don't exist any more, thus no need for
			
 
				+		 * implicit dependencies any more.  */
			
 
				+		_starpu_release_task_enforce_sequential_consistency(j);
			
 
				+	}
			
 
				+
			
 
				 	/* Task does not have a cl, but has explicit data dependencies, we need
			
 
				 	 * to tell them that we will not exist any more before notifying the
			
 
				 	 * tasks waiting for us
			
--- a/src/core/jobs.h
+++ b/src/core/jobs.h
@@ -52,10 +52,7 @@ struct _starpu_worker;
 
				 /** codelet function */
			
 
				 typedef void (*_starpu_cl_func_t)(void **, void *);
			
 
				 
			
 
				-#define _STARPU_CPU_MAY_PERFORM(j)	((j)->task->where & STARPU_CPU)
			
 
				-#define _STARPU_CUDA_MAY_PERFORM(j)      ((j)->task->where & STARPU_CUDA)
			
 
				-#define _STARPU_OPENCL_MAY_PERFORM(j)	((j)->task->where & STARPU_OPENCL)
			
 
				-#define _STARPU_MIC_MAY_PERFORM(j)	((j)->task->where & STARPU_MIC)
			
 
				+#define _STARPU_MAY_PERFORM(j, arch)	((j)->task->where & STARPU_##arch)
			
 
				 
			
 
				 struct _starpu_data_descr
			
 
				 {
			
--- a/src/core/perfmodel/energy_model.c
+++ b/src/core/perfmodel/energy_model.c
@@ -0,0 +1,280 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2008-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+#ifdef STARPU_PAPI
			
 
				+#include <papi.h>
			
 
				+#endif
			
 
				+#ifdef STARPU_HAVE_HWLOC
			
 
				+#include <hwloc.h>
			
 
				+#endif
			
 
				+#include <starpu_perfmodel.h>
			
 
				+#include <starpu_profiling.h>
			
 
				+#include <common/config.h>
			
 
				+#include <common/utils.h>
			
 
				+#ifdef HAVE_UNISTD_H
			
 
				+#include <unistd.h>
			
 
				+#endif
			
 
				+#include <sys/stat.h>
			
 
				+#include <core/perfmodel/perfmodel.h>
			
 
				+#include <core/jobs.h>
			
 
				+#include <core/workers.h>
			
 
				+#include <datawizard/datawizard.h>
			
 
				+#include <core/task.h>
			
 
				+
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+#ifdef HAVE_NVMLDEVICEGETTOTALENERGYCONSUMPTION
			
 
				+#include <nvml.h>
			
 
				+#include <cuda.h>
			
 
				+#include <cuda_runtime.h>
			
 
				+#endif
			
 
				+#endif
			
 
				+
			
 
				+#define ERROR_RETURN(retval) do { fprintf(stderr, "Error %d %s:line %d: \n", retval,__FILE__,__LINE__);  return(retval); } while (0)
			
 
				+
			
 
				+#if 0
			
 
				+#define debug(fmt, ...) printf(fmt, ## __VA_ARGS__)
			
 
				+#else
			
 
				+#define debug(fmt, ...)
			
 
				+#endif
			
 
				+
			
 
				+#ifdef STARPU_PAPI
			
 
				+static const int N_EVTS = 2;
			
 
				+
			
 
				+static int nsockets;
			
 
				+
			
 
				+static const char* event_names[] =
			
 
				+{
			
 
				+	"rapl::RAPL_ENERGY_PKG:cpu=%d",
			
 
				+	"rapl::RAPL_ENERGY_DRAM:cpu=%d"
			
 
				+};
			
 
				+
			
 
				+static int add_event(int EventSet, int socket);
			
 
				+
			
 
				+/* PAPI variables*/
			
 
				+
			
 
				+/*must be initialized to PAPI_NULL before calling PAPI_create_event*/
			
 
				+static int EventSet = PAPI_NULL;
			
 
				+
			
 
				+#endif
			
 
				+
			
 
				+static double t1;
			
 
				+
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+#ifdef HAVE_NVMLDEVICEGETTOTALENERGYCONSUMPTION
			
 
				+static unsigned long long energy_begin, energy_end;
			
 
				+static nvmlDevice_t device;
			
 
				+#endif
			
 
				+#endif
			
 
				+
			
 
				+int starpu_energy_start(int workerid, enum starpu_worker_archtype archi)
			
 
				+{
			
 
				+	t1 = starpu_timing_now();
			
 
				+
			
 
				+	switch (archi)
			
 
				+	{
			
 
				+#ifdef STARPU_PAPI
			
 
				+#ifdef STARPU_HAVE_HWLOC
			
 
				+	case STARPU_CPU_WORKER:
			
 
				+	{
			
 
				+		STARPU_ASSERT_MSG(workerid == -1, "For CPUs we cannot measure each worker separately, use where = STARPU_CPU and leave workerid as -1\n");
			
 
				+
			
 
				+		int retval, number;
			
 
				+
			
 
				+		struct _starpu_machine_config *config = _starpu_get_machine_config();
			
 
				+		hwloc_topology_t topology = config->topology.hwtopology;
			
 
				+
			
 
				+		nsockets = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PACKAGE);
			
 
				+
			
 
				+		if ((retval = PAPI_library_init(PAPI_VER_CURRENT)) != PAPI_VER_CURRENT)
			
 
				+			ERROR_RETURN(retval);
			
 
				+
			
 
				+		/* Creating the eventset */
			
 
				+		if ((retval = PAPI_create_eventset(&EventSet)) != PAPI_OK)
			
 
				+			ERROR_RETURN(retval);
			
 
				+
			
 
				+		int i;
			
 
				+		for (i = 0 ; i < nsockets ; i ++ )
			
 
				+		{
			
 
				+			/* return the index of socket */
			
 
				+			hwloc_obj_t obj = hwloc_get_obj_by_type(topology, HWLOC_OBJ_PACKAGE, i);
			
 
				+			if ( (retval = add_event(EventSet, obj->os_index)) != PAPI_OK)
			
 
				+				ERROR_RETURN(retval);
			
 
				+		}
			
 
				+
			
 
				+		/* get the number of events in the event set */
			
 
				+		number = 0;
			
 
				+		if ( (retval = PAPI_list_events(EventSet, NULL, &number)) != PAPI_OK)
			
 
				+			ERROR_RETURN(retval);
			
 
				+
			
 
				+		debug("There are %d events in the event set\n", number);
			
 
				+
			
 
				+		/* Start counting */
			
 
				+		if ( (retval = PAPI_start(EventSet)) != PAPI_OK)
			
 
				+			ERROR_RETURN(retval);
			
 
				+
			
 
				+		return retval;
			
 
				+	}
			
 
				+#endif
			
 
				+#endif
			
 
				+
			
 
				+#ifdef HAVE_NVMLDEVICEGETTOTALENERGYCONSUMPTION
			
 
				+	case STARPU_CUDA_WORKER:
			
 
				+	{
			
 
				+		STARPU_ASSERT_MSG(workerid != -1, "For CUDA GPUs we measure each GPU separately, please specify a worker\n");
			
 
				+		int devid = starpu_worker_get_devid(workerid);
			
 
				+		int ret = nvmlDeviceGetHandleByIndex_v2 (devid,  &device);
			
 
				+		if (ret != NVML_SUCCESS)
			
 
				+		{
			
 
				+			_STARPU_DISP("Could not get CUDA device %d from nvml\n", devid);
			
 
				+			return -1;
			
 
				+		}
			
 
				+		ret = nvmlDeviceGetTotalEnergyConsumption ( device, &energy_begin );
			
 
				+		if (ret != NVML_SUCCESS)
			
 
				+		{
			
 
				+			_STARPU_DISP("Could not measure energy used by CUDA device %d\n", devid);
			
 
				+			return -1;
			
 
				+		}
			
 
				+		return 0;
			
 
				+	}
			
 
				+	break;
			
 
				+#endif
			
 
				+
			
 
				+	default:
			
 
				+		printf("Error: worker is not supported ! \n");
			
 
				+		return -1;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+int starpu_energy_stop(struct starpu_perfmodel *model, struct starpu_task *task, unsigned nimpl, unsigned ntasks, int workerid, enum starpu_worker_archtype archi)
			
 
				+{
			
 
				+	double energy = 0.;
			
 
				+	int retval;
			
 
				+	unsigned cpuid = 0;
			
 
				+	double t2 = starpu_timing_now();
			
 
				+	double t STARPU_ATTRIBUTE_UNUSED = t2 - t1;
			
 
				+
			
 
				+	switch (archi)
			
 
				+	{
			
 
				+#ifdef STARPU_PAPI
			
 
				+#ifdef STARPU_HAVE_HWLOC
			
 
				+	case STARPU_CPU_WORKER:
			
 
				+	{
			
 
				+		STARPU_ASSERT_MSG(workerid == -1, "For CPUs we cannot measure each worker separately, use where = STARPU_CPU and leave workerid as -1\n");
			
 
				+
			
 
				+		/*This is where we store the values we read from the eventset */
			
 
				+		long long values[nsockets*N_EVTS];
			
 
				+
			
 
				+		/* Stop counting and store the values into the array */
			
 
				+		if ( (retval = PAPI_stop(EventSet, values)) != PAPI_OK)
			
 
				+			ERROR_RETURN(retval);
			
 
				+
			
 
				+		int k,s;
			
 
				+
			
 
				+		for( s = 0 ; s < nsockets ; s ++)
			
 
				+		{
			
 
				+			for(k = 0 ; k < N_EVTS; k++)
			
 
				+			{
			
 
				+				double delta = values[s * N_EVTS + k]*0.23/1.0e9;
			
 
				+				energy += delta;
			
 
				+
			
 
				+				debug("%-40s%12.6f J\t(for %f us, Average Power %.1fW)\n",
			
 
				+				      event_names[k],
			
 
				+				      delta, t, delta/(t*1.0E-6));
			
 
				+			}
			
 
				+		}
			
 
				+
			
 
				+		/*removes all events from a PAPI event set */
			
 
				+		if ( (retval = PAPI_cleanup_eventset(EventSet)) != PAPI_OK)
			
 
				+			ERROR_RETURN(retval);
			
 
				+
			
 
				+		/*deallocates the memory associated with an empty PAPI EventSet*/
			
 
				+		if ( (retval = PAPI_destroy_eventset(&EventSet)) != PAPI_OK)
			
 
				+			ERROR_RETURN(retval);
			
 
				+
			
 
				+		break;
			
 
				+	}
			
 
				+#endif
			
 
				+#endif
			
 
				+
			
 
				+#ifdef HAVE_NVMLDEVICEGETTOTALENERGYCONSUMPTION
			
 
				+	case STARPU_CUDA_WORKER:
			
 
				+	{
			
 
				+		STARPU_ASSERT_MSG(workerid != -1, "For CUDA GPUs we measure each GPU separately, please specify a worker\n");
			
 
				+		int ret = nvmlDeviceGetTotalEnergyConsumption(device, &energy_end );
			
 
				+		if (ret != NVML_SUCCESS)
			
 
				+			return -1;
			
 
				+		energy = (energy_end - energy_begin) / 1000.;
			
 
				+		debug("energy consumption on device %d is %f mJ (for %f us, Average power %0.1fW)\n", 0, energy * 1000., t, energy / (t*1.0E-6));
			
 
				+		break;
			
 
				+	}
			
 
				+#endif
			
 
				+
			
 
				+	default:
			
 
				+	{
			
 
				+		printf("Error: worker type %d is not supported! \n", archi);
			
 
				+		return -1;
			
 
				+		break;
			
 
				+	}
			
 
				+	}
			
 
				+
			
 
				+
			
 
				+	struct starpu_perfmodel_arch *arch;
			
 
				+	if (workerid == -1)
			
 
				+		/* Just take one of them */
			
 
				+		workerid = starpu_worker_get_by_type(archi, 0);
			
 
				+
			
 
				+	arch = starpu_worker_get_perf_archtype(workerid, STARPU_NMAX_SCHED_CTXS);
			
 
				+
			
 
				+	starpu_perfmodel_update_history_n(model, task, arch, cpuid, nimpl, energy / ntasks, ntasks);
			
 
				+
			
 
				+	return retval;
			
 
				+}
			
 
				+
			
 
				+#ifdef STARPU_PAPI
			
 
				+#ifdef STARPU_HAVE_HWLOC
			
 
				+static int add_event(int eventSet, int socket)
			
 
				+{
			
 
				+	int retval, i;
			
 
				+	for (i = 0; i < N_EVTS; i++)
			
 
				+	{
			
 
				+		char buf[255];
			
 
				+		snprintf(buf, sizeof(buf), event_names[i], socket);
			
 
				+
			
 
				+		/* printf("Activating multiplex\n"); */
			
 
				+		/* retval = PAPI_set_multiplex(eventSet); */
			
 
				+		/* if(retval != PAPI_OK) { */
			
 
				+		/*      _STARPU_DISP("cannot set multiplex\n"); */
			
 
				+		/*      return retval; */
			
 
				+		/* } */
			
 
				+		retval = PAPI_add_named_event(eventSet, buf);
			
 
				+		if (retval != PAPI_OK)
			
 
				+		{
			
 
				+			if (!strcmp(event_names[i], "rapl::RAPL_ENERGY_DRAM:cpu=%d"))
			
 
				+			{
			
 
				+				/* Ok, too bad */
			
 
				+				_STARPU_DISP("Note: DRAM energy measurement not available\n");
			
 
				+				return PAPI_OK;
			
 
				+			}
			
 
				+			_STARPU_DISP("cannot add event '%s': %d\n", buf, retval);
			
 
				+			return retval;
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	return ( PAPI_OK );
			
 
				+}
			
 
				+#endif
			
 
				+#endif
			
--- a/src/core/perfmodel/perfmodel.c
+++ b/src/core/perfmodel/perfmodel.c
@@ -126,18 +126,8 @@ double starpu_worker_get_relative_speedup(struct starpu_perfmodel_arch* perf_arc
 
				 	int dev;
			
 
				 	for(dev = 0; dev < perf_arch->ndevices; dev++)
			
 
				 	{
			
 
				-		double coef = 0.0;
			
 
				-		if (perf_arch->devices[dev].type == STARPU_CPU_WORKER)
			
 
				-			coef = _STARPU_CPU_ALPHA;
			
 
				-		else if (perf_arch->devices[dev].type == STARPU_CUDA_WORKER)
			
 
				-			coef = _STARPU_CUDA_ALPHA;
			
 
				-		else if (perf_arch->devices[dev].type == STARPU_OPENCL_WORKER)
			
 
				-			coef = _STARPU_OPENCL_ALPHA;
			
 
				-		else if (perf_arch->devices[dev].type == STARPU_MIC_WORKER)
			
 
				-			coef = _STARPU_MIC_ALPHA;
			
 
				-		else if (perf_arch->devices[dev].type == STARPU_MPI_MS_WORKER)
			
 
				-			coef = _STARPU_MPI_MS_ALPHA;
			
 
				-
			
 
				+		enum starpu_worker_archtype archtype = perf_arch->devices[dev].type;
			
 
				+		double coef = starpu_driver_info[archtype].alpha;
			
 
				 		speedup += coef * (perf_arch->devices[dev].ncores);
			
 
				 	}
			
 
				 	return speedup;
			
@@ -305,7 +295,7 @@ double starpu_task_expected_conversion_time(struct starpu_task *task,
 
				 		if (!_starpu_data_is_multiformat_handle(handle))
			
 
				 			continue;
			
 
				 
			
 
				-		node_kind = _starpu_worker_get_node_kind(arch->devices[0].type);
			
 
				+		node_kind = starpu_worker_get_memory_node_kind(arch->devices[0].type);
			
 
				 		if (!_starpu_handle_needs_conversion_task_for_arch(handle, node_kind))
			
 
				 			continue;
			
 
				 
			
--- a/src/core/perfmodel/perfmodel.h
+++ b/src/core/perfmodel/perfmodel.h
@@ -74,14 +74,10 @@ void _starpu_initialize_registered_performance_models(void);
 
				 void _starpu_deinitialize_registered_performance_models(void);
			
 
				 void _starpu_deinitialize_performance_model(struct starpu_perfmodel *model);
			
 
				 
			
 
				-double _starpu_regression_based_job_expected_perf(struct starpu_perfmodel *model,
			
 
				-					struct starpu_perfmodel_arch* arch, struct _starpu_job *j, unsigned nimpl);
			
 
				-double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfmodel *model,
			
 
				-					struct starpu_perfmodel_arch* arch, struct _starpu_job *j, unsigned nimpl);
			
 
				-double _starpu_multiple_regression_based_job_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch,
			
 
				-					struct _starpu_job *j, unsigned nimpl);
			
 
				-void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfmodel *model, struct starpu_perfmodel_arch * arch,
			
 
				-				unsigned cpuid, double measured, unsigned nimpl);
			
 
				+double _starpu_regression_based_job_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, struct _starpu_job *j, unsigned nimpl);
			
 
				+double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, struct _starpu_job *j, unsigned nimpl);
			
 
				+double _starpu_multiple_regression_based_job_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, struct _starpu_job *j, unsigned nimpl);
			
 
				+void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfmodel *model, struct starpu_perfmodel_arch * arch, unsigned cpuid, double measured, unsigned nimpl, unsigned number);
			
 
				 int _starpu_perfmodel_create_comb_if_needed(struct starpu_perfmodel_arch* arch);
			
 
				 
			
 
				 void _starpu_create_sampling_directory_if_needed(void);
			
@@ -98,8 +94,7 @@ unsigned *_starpu_get_cuda_affinity_vector(unsigned gpuid);
 
				 unsigned *_starpu_get_opencl_affinity_vector(unsigned gpuid);
			
 
				 #endif
			
 
				 
			
 
				-void _starpu_save_bandwidth_and_latency_disk(double bandwidth_write, double bandwidth_read,
			
 
				-					     double latency_write, double latency_read, unsigned node, const char *name);
			
 
				+void _starpu_save_bandwidth_and_latency_disk(double bandwidth_write, double bandwidth_read, double latency_write, double latency_read, unsigned node, const char *name);
			
 
				 
			
 
				 void _starpu_write_double(FILE *f, const char *format, double val);
			
 
				 int _starpu_read_double(FILE *f, char *format, double *val);
			
--- a/src/core/perfmodel/perfmodel_bus.c
+++ b/src/core/perfmodel/perfmodel_bus.c
@@ -290,7 +290,11 @@ static void measure_bandwidth_between_host_and_dev_on_numa_with_cuda(int dev, in
 
				 
			
 
				 	cudaFree(d_buffer);
			
 
				 
			
 
				+#if CUDART_VERSION >= 4000
			
 
				+	cudaDeviceReset();
			
 
				+#else
			
 
				 	cudaThreadExit();
			
 
				+#endif
			
 
				 }
			
 
				 
			
 
				 #ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
			
@@ -400,7 +404,11 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 
				 	cudaSetDevice(src);
			
 
				 	cudaFree(s_buffer);
			
 
				 
			
 
				+#if CUDART_VERSION >= 4000
			
 
				+	cudaDeviceReset();
			
 
				+#else
			
 
				 	cudaThreadExit();
			
 
				+#endif
			
 
				 }
			
 
				 #endif
			
 
				 #endif
			
@@ -3037,7 +3045,7 @@ double starpu_transfer_predict(unsigned src_node, unsigned dst_node, size_t size
 
				 	int busid = starpu_bus_get_id(src_node, dst_node);
			
 
				 	int direct = starpu_bus_get_direct(busid);
			
 
				 #endif
			
 
				-	float ngpus = topology->ncudagpus+topology->nopenclgpus;
			
 
				+	float ngpus = topology->ndevices[STARPU_CUDA_WORKER]+topology->ndevices[STARPU_OPENCL_WORKER];
			
 
				 #ifdef STARPU_DEVEL
			
 
				 #warning FIXME: ngpus should not be used e.g. for slow disk transfers...
			
 
				 #endif
			
--- a/src/core/perfmodel/perfmodel_history.c
+++ b/src/core/perfmodel/perfmodel_history.c
@@ -33,7 +33,6 @@
 
				 #include <core/perfmodel/regression.h>
			
 
				 #include <core/perfmodel/multiple_regression.h>
			
 
				 #include <common/config.h>
			
 
				-#include <starpu_parameters.h>
			
 
				 #include <common/uthash.h>
			
 
				 #include <limits.h>
			
 
				 #include <core/task.h>
			
@@ -54,7 +53,7 @@ static int current_arch_comb;
 
				 static int nb_arch_combs;
			
 
				 static starpu_pthread_rwlock_t arch_combs_mutex;
			
 
				 static int historymaxerror;
			
 
				-static char ignore_devid[STARPU_ANY_WORKER];
			
 
				+static char ignore_devid[STARPU_NARCH];
			
 
				 
			
 
				 /* How many executions a codelet will have to be measured before we
			
 
				  * consider that calibration will provide a value good enough for scheduling */
			
@@ -804,7 +803,7 @@ static void dump_per_arch_model_file(FILE *f, struct starpu_perfmodel *model, in
 
				 	/* Dump the history into the model file in case it is necessary */
			
 
				        if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED || model->type == STARPU_REGRESSION_BASED)
			
 
				 	{
			
 
				-		fprintf(f, "# hash\t\tsize\t\tflops\t\tmean (us)\tdev (us)\tsum\t\tsum2\t\tn\n");
			
 
				+		fprintf(f, "# hash\t\tsize\t\tflops\t\tmean (us or J)\tdev (us or J)\tsum\t\tsum2\t\tn\n");
			
 
				 		ptr = per_arch_model->list;
			
 
				 		while (ptr)
			
 
				 		{
			
@@ -1030,16 +1029,9 @@ void starpu_perfmodel_dump_xml(FILE *f, struct starpu_perfmodel *model)
 
				 		fprintf(f, "  <combination>\n");
			
 
				 		for(dev = 0; dev < ndevices; dev++)
			
 
				 		{
			
 
				-			const char *type;
			
 
				-			switch (arch_combs[comb]->devices[dev].type)
			
 
				-			{
			
 
				-				case STARPU_CPU_WORKER: type = "CPU"; break;
			
 
				-				case STARPU_CUDA_WORKER: type = "CUDA"; break;
			
 
				-				case STARPU_OPENCL_WORKER: type = "OpenCL"; break;
			
 
				-				case STARPU_MIC_WORKER: type = "MIC"; break;
			
 
				-				case STARPU_MPI_MS_WORKER: type = "MPI_MS"; break;
			
 
				-				default: STARPU_ASSERT(0);
			
 
				-			}
			
 
				+			enum starpu_worker_archtype archtype = arch_combs[comb]->devices[dev].type;
			
 
				+			const char *type = starpu_driver_info[archtype].name_upper;
			
 
				+			STARPU_ASSERT(type);
			
 
				 			fprintf(f, "    <device type=\"%s\" id=\"%d\"",
			
 
				 					type,
			
 
				 					arch_combs[comb]->devices[dev].devid);
			
@@ -1225,21 +1217,22 @@ void _starpu_initialize_registered_performance_models(void)
 
				 	starpu_perfmodel_initialize();
			
 
				 
			
 
				 	struct _starpu_machine_config *conf = _starpu_get_machine_config();
			
 
				-	unsigned ncores = conf->topology.nhwcpus;
			
 
				-	unsigned ncuda =  conf->topology.nhwcudagpus;
			
 
				-	unsigned nopencl = conf->topology.nhwopenclgpus;
			
 
				+	unsigned ncores = conf->topology.nhwworker[STARPU_CPU_WORKER][0];
			
 
				+	unsigned ncuda =  conf->topology.nhwdevices[STARPU_CUDA_WORKER];
			
 
				+	unsigned nopencl = conf->topology.nhwdevices[STARPU_OPENCL_WORKER];
			
 
				 	unsigned nmic = 0;
			
 
				+	enum starpu_worker_archtype archtype;
			
 
				 #if STARPU_MAXMICDEVS > 0 || STARPU_MAXMPIDEVS > 0
			
 
				 	unsigned i;
			
 
				 #endif
			
 
				 #if STARPU_MAXMICDEVS > 0
			
 
				-	for(i = 0; i < conf->topology.nhwmicdevices; i++)
			
 
				-		nmic += conf->topology.nhwmiccores[i];
			
 
				+	for(i = 0; i < conf->topology.nhwdevices[STARPU_MIC_WORKER]; i++)
			
 
				+		nmic += conf->topology.nhwworker[STARPU_MIC_WORKER][i];
			
 
				 #endif
			
 
				 	unsigned nmpi = 0;
			
 
				 #if STARPU_MAXMPIDEVS > 0
			
 
				-	for(i = 0; i < conf->topology.nhwmpidevices; i++)
			
 
				-		nmpi += conf->topology.nhwmpicores[i];
			
 
				+	for(i = 0; i < conf->topology.nhwdevices[STARPU_MPI_MS_WORKER]; i++)
			
 
				+		nmpi += conf->topology.nhwworker[STARPU_MPI_MS_WORKER][i];
			
 
				 #endif
			
 
				 
			
 
				 	// We used to allocate 2**(ncores + ncuda + nopencl + nmic + nmpi), this is too big
			
@@ -1249,11 +1242,15 @@ void _starpu_initialize_registered_performance_models(void)
 
				 	current_arch_comb = 0;
			
 
				 	historymaxerror = starpu_get_env_number_default("STARPU_HISTORY_MAX_ERROR", STARPU_HISTORYMAXERROR);
			
 
				 	_starpu_calibration_minimum = starpu_get_env_number_default("STARPU_CALIBRATE_MINIMUM", 10);
			
 
				-	ignore_devid[STARPU_CPU_WORKER] = starpu_get_env_number_default("STARPU_PERF_MODEL_HOMOGENEOUS_CPU", 1);
			
 
				-	ignore_devid[STARPU_CUDA_WORKER] = starpu_get_env_number_default("STARPU_PERF_MODEL_HOMOGENEOUS_CUDA", 0);
			
 
				-	ignore_devid[STARPU_OPENCL_WORKER] = starpu_get_env_number_default("STARPU_PERF_MODEL_HOMOGENEOUS_OPENCL", 0);
			
 
				-	ignore_devid[STARPU_MIC_WORKER] = starpu_get_env_number_default("STARPU_PERF_MODEL_HOMOGENEOUS_MIC", 0);
			
 
				-	ignore_devid[STARPU_MPI_MS_WORKER] = starpu_get_env_number_default("STARPU_PERF_MODEL_HOMOGENEOUS_MPI_MS", 0);
			
 
				+
			
 
				+	for (archtype = 0; archtype < STARPU_NARCH; archtype++)
			
 
				+	{
			
 
				+		char name[128];
			
 
				+		const char *arch = starpu_worker_get_type_as_env_var(archtype);
			
 
				+		int def = archtype == STARPU_CPU_WORKER ? 1 : 0;
			
 
				+		snprintf(name, sizeof(name), "STARPU_PERF_MODEL_HOMOGENEOUS_%s", arch);
			
 
				+		ignore_devid[archtype] = starpu_get_env_number_default("STARPU_PERF_MODEL_HOMOGENEOUS_CPU", def);
			
 
				+	}
			
 
				 }
			
 
				 
			
 
				 void _starpu_deinitialize_performance_model(struct starpu_perfmodel *model)
			
@@ -1522,8 +1519,8 @@ int starpu_perfmodel_unload_model(struct starpu_perfmodel *model)
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-int starpu_perfmodel_deinit(struct starpu_perfmodel *model){
			
 
				-
			
 
				+int starpu_perfmodel_deinit(struct starpu_perfmodel *model)
			
 
				+{
			
 
				 	_starpu_deinitialize_performance_model(model);
			
 
				 	free(model->state);
			
 
				 	model->state = NULL;
			
@@ -1546,29 +1543,11 @@ int starpu_perfmodel_deinit(struct starpu_perfmodel *model){
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-char* starpu_perfmodel_get_archtype_name(enum starpu_worker_archtype archtype)
			
 
				+const char* starpu_perfmodel_get_archtype_name(enum starpu_worker_archtype archtype)
			
 
				 {
			
 
				-	switch(archtype)
			
 
				-	{
			
 
				-		case(STARPU_CPU_WORKER):
			
 
				-			return "cpu";
			
 
				-			break;
			
 
				-		case(STARPU_CUDA_WORKER):
			
 
				-			return "cuda";
			
 
				-			break;
			
 
				-		case(STARPU_OPENCL_WORKER):
			
 
				-			return "opencl";
			
 
				-			break;
			
 
				-		case(STARPU_MIC_WORKER):
			
 
				-			return "mic";
			
 
				-			break;
			
 
				-		case(STARPU_MPI_MS_WORKER):
			
 
				-			return "mpi_ms";
			
 
				-			break;
			
 
				-		default:
			
 
				-			STARPU_ABORT();
			
 
				-			break;
			
 
				-	}
			
 
				+	const char *name = starpu_driver_info[archtype].name_lower;
			
 
				+	STARPU_ASSERT(name);
			
 
				+	return name;
			
 
				 }
			
 
				 
			
 
				 void starpu_perfmodel_get_arch_name(struct starpu_perfmodel_arch* arch, char *archname, size_t maxlen,unsigned impl)
			
@@ -1839,7 +1818,7 @@ int _starpu_perfmodel_create_comb_if_needed(struct starpu_perfmodel_arch* arch)
 
				 	return comb;
			
 
				 }
			
 
				 
			
 
				-void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, unsigned cpuid STARPU_ATTRIBUTE_UNUSED, double measured, unsigned impl)
			
 
				+void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, unsigned cpuid STARPU_ATTRIBUTE_UNUSED, double measured, unsigned impl, unsigned number)
			
 
				 {
			
 
				 	STARPU_ASSERT_MSG(measured >= 0, "measured=%lf\n", measured);
			
 
				 	if (model)
			
@@ -1909,11 +1888,11 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 
				 
			
 
				 				/* For history-based, do not take the first measurement into account, it is very often quite bogus */
			
 
				 				/* TODO: it'd be good to use a better estimation heuristic, like the median, or latest n values, etc. */
			
 
				-				if (model->type != STARPU_HISTORY_BASED)
			
 
				+				if (number != 1 || model->type != STARPU_HISTORY_BASED)
			
 
				 				{
			
 
				-					entry->sum = measured;
			
 
				-					entry->sum2 = measured*measured;
			
 
				-					entry->nsample = 1;
			
 
				+					entry->sum = measured * number;
			
 
				+					entry->sum2 = measured*measured * number;
			
 
				+					entry->nsample = number;
			
 
				 					entry->mean = measured;
			
 
				 				}
			
 
				 
			
@@ -1934,7 +1913,7 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 
				 					(100 * local_deviation > (100 + historymaxerror)
			
 
				 					 || (100 / local_deviation > (100 + historymaxerror))))
			
 
				 				{
			
 
				-					entry->nerror++;
			
 
				+					entry->nerror+=number;
			
 
				 
			
 
				 					/* More errors than measurements, we're most probably completely wrong, we flush out all the entries */
			
 
				 					if (entry->nerror >= entry->nsample)
			
@@ -1952,9 +1931,9 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 
				 				}
			
 
				 				else
			
 
				 				{
			
 
				-					entry->sum += measured;
			
 
				-					entry->sum2 += measured*measured;
			
 
				-					entry->nsample++;
			
 
				+					entry->sum += measured * number;
			
 
				+					entry->sum2 += measured*measured * number;
			
 
				+					entry->nsample += number;
			
 
				 
			
 
				 					unsigned n = entry->nsample;
			
 
				 					entry->mean = entry->sum / n;
			
@@ -2070,7 +2049,7 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 
				 	}
			
 
				 }
			
 
				 
			
 
				-void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct starpu_task *task, struct starpu_perfmodel_arch * arch, unsigned cpuid, unsigned nimpl, double measured)
			
 
				+void starpu_perfmodel_update_history_n(struct starpu_perfmodel *model, struct starpu_task *task, struct starpu_perfmodel_arch * arch, unsigned cpuid, unsigned nimpl, double measured, unsigned number)
			
 
				 {
			
 
				 	struct _starpu_job *job = _starpu_get_job_associated_to_task(task);
			
 
				 
			
@@ -2080,11 +2059,16 @@ void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct star
 
				 
			
 
				 	_starpu_init_and_load_perfmodel(model);
			
 
				 	/* Record measurement */
			
 
				-	_starpu_update_perfmodel_history(job, model, arch, cpuid, measured, nimpl);
			
 
				+	_starpu_update_perfmodel_history(job, model, arch, cpuid, measured, nimpl, number);
			
 
				 	/* and save perfmodel on termination */
			
 
				 	_starpu_set_calibrate_flag(1);
			
 
				 }
			
 
				 
			
 
				+void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct starpu_task *task, struct starpu_perfmodel_arch * arch, unsigned cpuid, unsigned nimpl, double measured)
			
 
				+{
			
 
				+	starpu_perfmodel_update_history_n(model, task, arch, cpuid, nimpl, measured, 1);
			
 
				+}
			
 
				+
			
 
				 int starpu_perfmodel_list_combs(FILE *output, struct starpu_perfmodel *model)
			
 
				 {
			
 
				 	int comb;
			
@@ -2099,7 +2083,7 @@ int starpu_perfmodel_list_combs(FILE *output, struct starpu_perfmodel *model)
 
				 		fprintf(output, "\tComb %d: %d device%s\n", model->state->combs[comb], arch->ndevices, arch->ndevices>1?"s":"");
			
 
				 		for(device=0 ; device<arch->ndevices ; device++)
			
 
				 		{
			
 
				-			char *name = starpu_perfmodel_get_archtype_name(arch->devices[device].type);
			
 
				+			const char *name = starpu_perfmodel_get_archtype_name(arch->devices[device].type);
			
 
				 			fprintf(output, "\t\tDevice %d: type: %s - devid: %d - ncores: %d\n", device, name, arch->devices[device].devid, arch->devices[device].ncores);
			
 
				 		}
			
 
				 	}
			
--- a/src/core/perfmodel/perfmodel_print.c
+++ b/src/core/perfmodel/perfmodel_print.c
@@ -30,7 +30,7 @@ void _starpu_perfmodel_print_history_based(struct starpu_perfmodel_per_arch *per
 
				 	ptr = per_arch_model->list;
			
 
				 
			
 
				 	if (!parameter && ptr)
			
 
				-		fprintf(output, "# hash\t\tsize\t\tflops\t\tmean (us)\tstddev (us)\t\tn\n");
			
 
				+		fprintf(output, "# hash\t\tsize\t\tflops\t\tmean (us or J)\tstddev (us or J)\t\tn\n");
			
 
				 
			
 
				 	while (ptr)
			
 
				 	{
			
--- a/src/core/sched_policy.c
+++ b/src/core/sched_policy.c
@@ -417,6 +417,10 @@ static int _starpu_push_task_on_specific_worker(struct starpu_task *task, int wo
 
				 
			
 
				 int _starpu_push_task(struct _starpu_job *j)
			
 
				 {
			
 
				+#ifdef STARPU_SIMGRID
			
 
				+	//if (_starpu_simgrid_task_push_cost())
			
 
				+		starpu_sleep(0.000001);
			
 
				+#endif
			
 
				 	if(j->task->prologue_callback_func)
			
 
				 	{
			
 
				 		_starpu_set_current_task(j->task);
			
@@ -625,14 +629,16 @@ int _starpu_push_task_to_workers(struct starpu_task *task)
 
				 				&& starpu_get_prefetch_flag()
			
 
				 				&& starpu_memory_nodes_get_count() > 1)
			
 
				 			{
			
 
				-				if (task->where == STARPU_CPU && config->cpus_nodeid >= 0)
			
 
				-					starpu_prefetch_task_input_on_node(task, config->cpus_nodeid);
			
 
				-				else if (task->where == STARPU_CUDA && config->cuda_nodeid >= 0)
			
 
				-					starpu_prefetch_task_input_on_node(task, config->cuda_nodeid);
			
 
				-				else if (task->where == STARPU_OPENCL && config->opencl_nodeid >= 0)
			
 
				-					starpu_prefetch_task_input_on_node(task, config->opencl_nodeid);
			
 
				-				else if (task->where == STARPU_MIC && config->mic_nodeid >= 0)
			
 
				-					starpu_prefetch_task_input_on_node(task, config->mic_nodeid);
			
 
				+				enum starpu_worker_archtype type;
			
 
				+				for (type = 0; type < STARPU_NARCH; type++)
			
 
				+				{
			
 
				+					if (task->where == (int32_t) STARPU_WORKER_TO_MASK(type))
			
 
				+					{
			
 
				+						if (config->arch_nodeid[type] >= 0)
			
 
				+							starpu_prefetch_task_input_on_node(task, config->arch_nodeid[type]);
			
 
				+						break;
			
 
				+					}
			
 
				+				}
			
 
				 			}
			
 
				 
			
 
				 			STARPU_ASSERT(sched_ctx->sched_policy->push_task);
			
--- a/src/core/simgrid.c
+++ b/src/core/simgrid.c
@@ -357,11 +357,16 @@ void _starpu_start_simgrid(int *argc, char **argv)
 
				 	int limit_bandwidth = starpu_get_env_number("STARPU_LIMIT_BANDWIDTH");
			
 
				 	if (limit_bandwidth >= 0)
			
 
				 	{
			
 
				-#ifdef HAVE_SG_LINK_BANDWIDTH_SET
			
 
				+#if defined(HAVE_SG_LINK_BANDWIDTH_SET) || defined(HAVE_SG_LINK_SET_BANDWIDTH)
			
 
				 		sg_link_t *links = sg_link_list();
			
 
				 		int count = sg_link_count(), i;
			
 
				-		for (i = 0; i < count; i++) {
			
 
				+		for (i = 0; i < count; i++)
			
 
				+		{
			
 
				+#ifdef HAVE_SG_LINK_SET_BANDWIDTH
			
 
				+			sg_link_set_bandwidth(links[i], limit_bandwidth * 1000000.);
			
 
				+#else
			
 
				 			sg_link_bandwidth_set(links[i], limit_bandwidth * 1000000.);
			
 
				+#endif
			
 
				 		}
			
 
				 #else
			
 
				 		_STARPU_DISP("Warning: STARPU_LIMIT_BANDWIDTH set to %d but this requires simgrid 3.26, thus ignored\n", limit_bandwidth);
			
@@ -492,7 +497,11 @@ void _starpu_simgrid_init_early(int *argc STARPU_ATTRIBUTE_UNUSED, char ***argv
 
				 
			
 
				 #if defined(HAVE_SG_ACTOR_ATTACH) && defined (HAVE_SG_ACTOR_DATA)
			
 
				 		sg_actor_t actor = sg_actor_attach("main", NULL, _starpu_simgrid_get_host_by_name("MAIN"), NULL);
			
 
				+#ifdef HAVE_SG_ACTOR_SET_DATA
			
 
				+		sg_actor_set_data(actor, tsd);
			
 
				+#else
			
 
				 		sg_actor_data_set(actor, tsd);
			
 
				+#endif
			
 
				 #else
			
 
				 		MSG_process_attach("main", tsd, _starpu_simgrid_get_host_by_name("MAIN"), NULL);
			
 
				 #endif
			
@@ -519,7 +528,11 @@ void _starpu_simgrid_init_early(int *argc STARPU_ATTRIBUTE_UNUSED, char ***argv
 
				 		void **tsd;
			
 
				 		_STARPU_CALLOC(tsd, MAX_TSD+1, sizeof(void*));
			
 
				 #ifdef HAVE_SG_ACTOR_DATA
			
 
				+#ifdef HAVE_SG_ACTOR_SET_DATA
			
 
				+		sg_actor_set_data(sg_actor_self(), tsd);
			
 
				+#else
			
 
				 		sg_actor_data_set(sg_actor_self(), tsd);
			
 
				+#endif
			
 
				 #else
			
 
				 		smpi_process_set_user_data(tsd);
			
 
				 #endif
			
@@ -735,6 +748,9 @@ void _starpu_simgrid_submit_job(int workerid, int sched_ctx_id, struct _starpu_j
 
				 		 * to be able to easily check scheduling robustness */
			
 
				 	}
			
 
				 
			
 
				+#ifdef HAVE_SG_HOST_GET_SPEED
			
 
				+	flops = length/1000000.0*sg_host_get_speed(sg_host_self());
			
 
				+#else
			
 
				 #if defined(HAVE_SG_HOST_SPEED) || defined(sg_host_speed)
			
 
				 #  if defined(HAVE_SG_HOST_SELF) || defined(sg_host_self)
			
 
				 	flops = length/1000000.0*sg_host_speed(sg_host_self());
			
@@ -746,6 +762,7 @@ void _starpu_simgrid_submit_job(int workerid, int sched_ctx_id, struct _starpu_j
 
				 #else
			
 
				 	flops = length/1000000.0*MSG_get_host_speed(MSG_host_self());
			
 
				 #endif
			
 
				+#endif
			
 
				 
			
 
				 #ifndef HAVE_SG_ACTOR_SELF_EXECUTE
			
 
				 	simgrid_task = MSG_task_create(_starpu_job_get_task_name(j), flops, 0, NULL);
			
@@ -1210,14 +1227,22 @@ starpu_pthread_t _starpu_simgrid_actor_create(const char *name, xbt_main_func_t
 
				 	_STARPU_CALLOC(tsd, MAX_TSD+1, sizeof(void*));
			
 
				 #ifdef HAVE_SG_ACTOR_INIT
			
 
				 	actor = sg_actor_init(name, host);
			
 
				+#ifdef HAVE_SG_ACTOR_SET_DATA
			
 
				+	sg_actor_set_data(actor, tsd);
			
 
				+#else
			
 
				 	sg_actor_data_set(actor, tsd);
			
 
				+#endif
			
 
				 	sg_actor_start(actor, code, argc, argv);
			
 
				 #else
			
 
				 	actor = MSG_process_create_with_arguments(name, code, tsd, host, argc, argv);
			
 
				 #ifdef HAVE_SG_ACTOR_DATA
			
 
				+#ifdef HAVE_SG_ACTOR_SET_DATA
			
 
				+	sg_actor_set_data(actor, tsd);
			
 
				+#else
			
 
				 	sg_actor_data_set(actor, tsd);
			
 
				 #endif
			
 
				 #endif
			
 
				+#endif
			
 
				 	return actor;
			
 
				 }
			
 
				 
			
@@ -1251,7 +1276,7 @@ starpu_sg_host_t _starpu_simgrid_get_memnode_host(unsigned node)
 
				 
			
 
				 void _starpu_simgrid_count_ngpus(void)
			
 
				 {
			
 
				-#if (defined(HAVE_SG_LINK_NAME) || defined sg_link_name) && (SIMGRID_VERSION >= 31300)
			
 
				+#if (defined(HAVE_SG_LINK_GET_NAME) || defined(HAVE_SG_LINK_NAME) || defined sg_link_name) && (SIMGRID_VERSION >= 31300)
			
 
				 	unsigned src, dst;
			
 
				 	starpu_sg_host_t ramhost = _starpu_simgrid_get_host_by_name("RAM");
			
 
				 
			
@@ -1261,7 +1286,7 @@ void _starpu_simgrid_count_ngpus(void)
 
				 		{
			
 
				 			int busid;
			
 
				 			starpu_sg_host_t srchost, dsthost;
			
 
				-#if defined(HAVE_SG_HOST_ROUTE) || defined(sg_host_route)
			
 
				+#if defined(HAVE_SG_HOST_GET_ROUTE) || defined(HAVE_SG_HOST_ROUTE) || defined(sg_host_route)
			
 
				 			xbt_dynar_t route_dynar = xbt_dynar_new(sizeof(SD_link_t), NULL);
			
 
				 			SD_link_t *route;
			
 
				 #else
			
@@ -1281,8 +1306,12 @@ void _starpu_simgrid_count_ngpus(void)
 
				 
			
 
				 			srchost = _starpu_simgrid_get_memnode_host(src);
			
 
				 			dsthost = _starpu_simgrid_get_memnode_host(dst);
			
 
				-#if defined(HAVE_SG_HOST_ROUTE)  || defined(sg_host_route)
			
 
				+#if defined(HAVE_SG_HOST_GET_ROUTE) || defined(HAVE_SG_HOST_ROUTE)  || defined(sg_host_route)
			
 
				+#ifdef HAVE_SG_HOST_GET_ROUTE
			
 
				+			sg_host_get_route(srchost, dsthost, route_dynar);
			
 
				+#else
			
 
				 			sg_host_route(srchost, dsthost, route_dynar);
			
 
				+#endif
			
 
				 			routesize = xbt_dynar_length(route_dynar);
			
 
				 			route = xbt_dynar_to_array(route_dynar);
			
 
				 #else
			
@@ -1293,7 +1322,13 @@ void _starpu_simgrid_count_ngpus(void)
 
				 			/* If it goes through "Host", do not care, there is no
			
 
				 			 * direct transfer support */
			
 
				 			for (i = 0; i < routesize; i++)
			
 
				-				if (!strcmp(sg_link_name(route[i]), "Host"))
			
 
				+				if (
			
 
				+#ifdef HAVE_SG_LINK_GET_NAME
			
 
				+					!strcmp(sg_link_get_name(route[i]), "Host")
			
 
				+#else
			
 
				+					!strcmp(sg_link_name(route[i]), "Host")
			
 
				+#endif
			
 
				+					)
			
 
				 					break;
			
 
				 			if (i < routesize)
			
 
				 				continue;
			
@@ -1302,7 +1337,11 @@ void _starpu_simgrid_count_ngpus(void)
 
				 			through = -1;
			
 
				 			for (i = 0; i < routesize; i++)
			
 
				 			{
			
 
				+#ifdef HAVE_SG_LINK_GET_NAME
			
 
				+				name = sg_link_get_name(route[i]);
			
 
				+#else
			
 
				 				name = sg_link_name(route[i]);
			
 
				+#endif
			
 
				 				size_t len = strlen(name);
			
 
				 				if (!strcmp(" through", name+len-8))
			
 
				 					through = i;
			
@@ -1315,7 +1354,11 @@ void _starpu_simgrid_count_ngpus(void)
 
				 				_STARPU_DEBUG("Didn't find through-link for %d->%d\n", src, dst);
			
 
				 				continue;
			
 
				 			}
			
 
				+#ifdef HAVE_SG_LINK_GET_NAME
			
 
				+			name = sg_link_get_name(route[through]);
			
 
				+#else
			
 
				 			name = sg_link_name(route[through]);
			
 
				+#endif
			
 
				 
			
 
				 			/*
			
 
				 			 * count how many direct routes go through it between
			
@@ -1339,10 +1382,14 @@ void _starpu_simgrid_count_ngpus(void)
 
				 
			
 
				 				starpu_sg_host_t srchost2 = _starpu_simgrid_get_memnode_host(src2);
			
 
				 				int routesize2;
			
 
				-#if defined(HAVE_SG_HOST_ROUTE) || defined(sg_host_route)
			
 
				+#if defined(HAVE_SG_HOST_GET_ROUTE) || defined(HAVE_SG_HOST_ROUTE) || defined(sg_host_route)
			
 
				 				xbt_dynar_t route_dynar2 = xbt_dynar_new(sizeof(SD_link_t), NULL);
			
 
				 				SD_link_t *route2;
			
 
				+#ifdef HAVE_SG_HOST_GET_ROUTE
			
 
				+				sg_host_get_route(srchost2, ramhost, route_dynar2);
			
 
				+#else
			
 
				 				sg_host_route(srchost2, ramhost, route_dynar2);
			
 
				+#endif
			
 
				 				routesize2 = xbt_dynar_length(route_dynar2);
			
 
				 				route2 = xbt_dynar_to_array(route_dynar2);
			
 
				 #else
			
@@ -1351,19 +1398,25 @@ void _starpu_simgrid_count_ngpus(void)
 
				 #endif
			
 
				 
			
 
				 				for (i = 0; i < routesize2; i++)
			
 
				-					if (!strcmp(name, sg_link_name(route2[i])))
			
 
				+					if (
			
 
				+#ifdef HAVE_SG_LINK_GET_NAME
			
 
				+						!strcmp(name, sg_link_get_name(route2[i]))
			
 
				+#else
			
 
				+						!strcmp(name, sg_link_name(route2[i]))
			
 
				+#endif
			
 
				+						)
			
 
				 					{
			
 
				 						/* This GPU goes through this PCI bridge to access RAM */
			
 
				 						ngpus++;
			
 
				 						break;
			
 
				 					}
			
 
				-#if defined(HAVE_SG_HOST_ROUTE) || defined(sg_host_route)
			
 
				+#if defined(HAVE_SG_HOST_GET_ROUTE) || defined(HAVE_SG_HOST_ROUTE) || defined(sg_host_route)
			
 
				 				free(route2);
			
 
				 #endif
			
 
				 			}
			
 
				 			_STARPU_DEBUG("%d->%d through %s, %u GPUs\n", src, dst, name, ngpus);
			
 
				 			starpu_bus_set_ngpus(busid, ngpus);
			
 
				-#if defined(HAVE_SG_HOST_ROUTE) || defined(sg_host_route)
			
 
				+#if defined(HAVE_SG_HOST_GET_ROUTE) || defined(HAVE_SG_HOST_ROUTE) || defined(sg_host_route)
			
 
				 			free(route);
			
 
				 #endif
			
 
				 		}
			
--- a/src/core/simgrid.h
+++ b/src/core/simgrid.h
@@ -24,6 +24,9 @@
 
				 extern "C"
			
 
				 {
			
 
				 #endif
			
 
				+
			
 
				+/* Note: when changing something here, update the include list in configure.ac
			
 
				+ * in the part that tries to enable stdc++11 */
			
 
				 #ifdef STARPU_SIMGRID
			
 
				 #ifdef STARPU_HAVE_SIMGRID_MSG_H
			
 
				 #include <simgrid/msg.h>
			
--- a/src/core/task.c
+++ b/src/core/task.c
@@ -1455,38 +1455,29 @@ _starpu_handle_needs_conversion_task_for_arch(starpu_data_handle_t handle,
 
				 	switch (node_kind)
			
 
				 	{
			
 
				 		case STARPU_CPU_RAM:
			
 
				+		case STARPU_MIC_RAM:
			
 
				+		case STARPU_MPI_MS_RAM:
			
 
				 			switch(starpu_node_get_kind(handle->mf_node))
			
 
				 			{
			
 
				 				case STARPU_CPU_RAM:
			
 
				-					return 0;
			
 
				-				case STARPU_CUDA_RAM:      /* Fall through */
			
 
				-				case STARPU_OPENCL_RAM:
			
 
				 				case STARPU_MIC_RAM:
			
 
				                                 case STARPU_MPI_MS_RAM:
			
 
				-					return 1;
			
 
				+					return 0;
			
 
				 				default:
			
 
				-					STARPU_ABORT();
			
 
				+					return 1;
			
 
				 			}
			
 
				 			break;
			
 
				-		case STARPU_CUDA_RAM:    /* Fall through */
			
 
				-		case STARPU_OPENCL_RAM:
			
 
				-		case STARPU_MIC_RAM:
			
 
				-		case STARPU_MPI_MS_RAM:
			
 
				+		default:
			
 
				 			switch(starpu_node_get_kind(handle->mf_node))
			
 
				 			{
			
 
				 				case STARPU_CPU_RAM:
			
 
				-					return 1;
			
 
				-				case STARPU_CUDA_RAM:
			
 
				-				case STARPU_OPENCL_RAM:
			
 
				 				case STARPU_MIC_RAM:
			
 
				                                 case STARPU_MPI_MS_RAM:
			
 
				-					return 0;
			
 
				+					return 1;
			
 
				 				default:
			
 
				-					STARPU_ABORT();
			
 
				+					return 0;
			
 
				 			}
			
 
				 			break;
			
 
				-		default:
			
 
				-			STARPU_ABORT();
			
 
				 	}
			
 
				 	/* that instruction should never be reached */
			
 
				 	return -EINVAL;
			
--- a/src/core/topology.c
+++ b/src/core/topology.c
@@ -559,13 +559,13 @@ static void _starpu_initialize_workers_cuda_gpuid(struct _starpu_machine_config
 
				 					    &(config->current_cuda_gpuid),
			
 
				 					    (int *)topology->workers_cuda_gpuid,
			
 
				 					    "STARPU_WORKERS_CUDAID",
			
 
				-					    topology->nhwcudagpus,
			
 
				+					    topology->nhwdevices[STARPU_CUDA_WORKER],
			
 
				 					    STARPU_CUDA_WORKER);
			
 
				 }
			
 
				 
			
 
				 static inline int _starpu_get_next_cuda_gpuid(struct _starpu_machine_config *config)
			
 
				 {
			
 
				-	unsigned i = ((config->current_cuda_gpuid++) % config->topology.ncudagpus);
			
 
				+	unsigned i = ((config->current_cuda_gpuid++) % config->topology.ndevices[STARPU_CUDA_WORKER]);
			
 
				 
			
 
				 	return (int)config->topology.workers_cuda_gpuid[i];
			
 
				 }
			
@@ -583,7 +583,7 @@ static void _starpu_initialize_workers_opencl_gpuid(struct _starpu_machine_confi
 
				 					    &(config->current_opencl_gpuid),
			
 
				 					    (int *)topology->workers_opencl_gpuid,
			
 
				 					    "STARPU_WORKERS_OPENCLID",
			
 
				-					    topology->nhwopenclgpus,
			
 
				+					    topology->nhwdevices[STARPU_OPENCL_WORKER],
			
 
				 					    STARPU_OPENCL_WORKER);
			
 
				 
			
 
				 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
			
@@ -646,7 +646,7 @@ static void _starpu_initialize_workers_opencl_gpuid(struct _starpu_machine_confi
 
				 
			
 
				 static inline int _starpu_get_next_opencl_gpuid(struct _starpu_machine_config *config)
			
 
				 {
			
 
				-	unsigned i = ((config->current_opencl_gpuid++) % config->topology.nopenclgpus);
			
 
				+	unsigned i = ((config->current_opencl_gpuid++) % config->topology.ndevices[STARPU_OPENCL_WORKER]);
			
 
				 
			
 
				 	return (int)config->topology.workers_opencl_gpuid[i];
			
 
				 }
			
@@ -665,7 +665,7 @@ static void _starpu_initialize_workers_mic_deviceid(struct _starpu_machine_confi
 
				 					    &(config->current_mic_deviceid),
			
 
				 					    (int *)topology->workers_mic_deviceid,
			
 
				 					    "STARPU_WORKERS_MICID",
			
 
				-					    topology->nhwmiccores,
			
 
				+					    topology->nhwdevices[STARPU_MIC_WORKER],
			
 
				 					    STARPU_MIC_WORKER);
			
 
				 }
			
 
				 #endif
			
@@ -675,7 +675,7 @@ static void _starpu_initialize_workers_mic_deviceid(struct _starpu_machine_confi
 
				 #ifdef STARPU_USE_MIC
			
 
				 static inline int _starpu_get_next_mic_deviceid(struct _starpu_machine_config *config)
			
 
				 {
			
 
				-	unsigned i = ((config->current_mic_deviceid++) % config->topology.nmicdevices);
			
 
				+	unsigned i = ((config->current_mic_deviceid++) % config->topology.ndevices[STARPU_MIC_WORKER]);
			
 
				 
			
 
				 	return (int)config->topology.workers_mic_deviceid[i];
			
 
				 }
			
@@ -685,7 +685,7 @@ static inline int _starpu_get_next_mic_deviceid(struct _starpu_machine_config *c
 
				 #ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				 static inline int _starpu_get_next_mpi_deviceid(struct _starpu_machine_config *config)
			
 
				 {
			
 
				-	unsigned i = ((config->current_mpi_deviceid++) % config->topology.nmpidevices);
			
 
				+	unsigned i = ((config->current_mpi_deviceid++) % config->topology.ndevices[STARPU_MPI_MS_WORKER]);
			
 
				 
			
 
				 	return (int)config->topology.workers_mpi_ms_deviceid[i];
			
 
				 }
			
@@ -694,14 +694,14 @@ static void _starpu_init_mpi_topology(struct _starpu_machine_config *config, lon
 
				 {
			
 
				 	/* Discover the topology of the mpi node identifier by MPI_IDX. That
			
 
				 	 * means, make this StarPU instance aware of the number of cores available
			
 
				-	 * on this MPI device. Update the `nhwmpicores' topology field
			
 
				+	 * on this MPI device. Update the `nhwworker[STARPU_MPI_MS_WORKER]' topology field
			
 
				 	 * accordingly. */
			
 
				 
			
 
				 	struct _starpu_machine_topology *topology = &config->topology;
			
 
				 
			
 
				 	int nbcores;
			
 
				 	_starpu_src_common_sink_nbcores(_starpu_mpi_ms_nodes[mpi_idx], &nbcores);
			
 
				-	topology->nhwmpicores[mpi_idx] = nbcores;
			
 
				+	topology->nhwworker[STARPU_MPI_MS_WORKER][mpi_idx] = nbcores;
			
 
				 }
			
 
				 
			
 
				 #endif /* STARPU_USE_MPI_MASTER_SLAVE */
			
@@ -711,14 +711,14 @@ static void _starpu_init_mic_topology(struct _starpu_machine_config *config, lon
 
				 {
			
 
				 	/* Discover the topology of the mic node identifier by MIC_IDX. That
			
 
				 	 * means, make this StarPU instance aware of the number of cores available
			
 
				-	 * on this MIC device. Update the `nhwmiccores' topology field
			
 
				+	 * on this MIC device. Update the `nhwworker[STARPU_MIC_WORKER]' topology field
			
 
				 	 * accordingly. */
			
 
				 
			
 
				 	struct _starpu_machine_topology *topology = &config->topology;
			
 
				 
			
 
				 	int nbcores;
			
 
				 	_starpu_src_common_sink_nbcores(_starpu_mic_nodes[mic_idx], &nbcores);
			
 
				-	topology->nhwmiccores[mic_idx] = nbcores;
			
 
				+	topology->nhwworker[STARPU_MIC_WORKER][mic_idx] = nbcores;
			
 
				 }
			
 
				 
			
 
				 static int _starpu_init_mic_node(struct _starpu_machine_config *config, int mic_idx,
			
@@ -837,7 +837,8 @@ static void _starpu_init_topology(struct _starpu_machine_config *config)
 
				 
			
 
				 	nobind = starpu_get_env_number("STARPU_WORKERS_NOBIND");
			
 
				 
			
 
				-	topology->nhwcpus = 0;
			
 
				+	topology->nhwdevices[STARPU_CPU_WORKER] = 1;
			
 
				+	topology->nhwworker[STARPU_CPU_WORKER][0] = 0;
			
 
				 	topology->nhwpus = 0;
			
 
				 
			
 
				 #ifndef STARPU_SIMGRID
			
@@ -856,7 +857,7 @@ static void _starpu_init_topology(struct _starpu_machine_config *config)
 
				 #ifdef HAVE_HWLOC_CPUKINDS_GET_NR
			
 
				 	int nr_kinds = hwloc_cpukinds_get_nr(topology->hwtopology, 0);
			
 
				 	if (nr_kinds > 1)
			
 
				-		_STARPU_DISP("Warning: there are several kinds of CPU on this system. For now StarPU assumes all CPU are equal\n", strerror(errno));
			
 
				+		_STARPU_DISP("Warning: there are several kinds of CPU on this system. For now StarPU assumes all CPU are equal\n");
			
 
				 #endif
			
 
				 
			
 
				 	if (starpu_get_env_number_default("STARPU_WORKERS_GETBIND", 0))
			
@@ -881,7 +882,7 @@ static void _starpu_init_topology(struct _starpu_machine_config *config)
 
				 #endif
			
 
				 
			
 
				 #ifdef STARPU_SIMGRID
			
 
				-	config->topology.nhwcpus = config->topology.nhwpus = _starpu_simgrid_get_nbhosts("CPU");
			
 
				+	config->topology.nhwworker[STARPU_CPU_WORKER][0] = config->topology.nhwpus = _starpu_simgrid_get_nbhosts("CPU");
			
 
				 #elif defined(STARPU_HAVE_HWLOC)
			
 
				 	/* Discover the CPUs relying on the hwloc interface and fills CONFIG
			
 
				 	 * accordingly. */
			
@@ -901,24 +902,24 @@ static void _starpu_init_topology(struct _starpu_machine_config *config)
 
				 							 HWLOC_OBJ_PU);
			
 
				 	}
			
 
				 
			
 
				-	topology->nhwcpus = hwloc_get_nbobjs_by_depth(topology->hwtopology, config->cpu_depth);
			
 
				+	topology->nhwworker[STARPU_CPU_WORKER][0] = hwloc_get_nbobjs_by_depth(topology->hwtopology, config->cpu_depth);
			
 
				 	topology->nhwpus = hwloc_get_nbobjs_by_depth(topology->hwtopology, config->pu_depth);
			
 
				 
			
 
				 #elif defined(HAVE_SYSCONF)
			
 
				 	/* Discover the CPUs relying on the sysconf(3) function and fills
			
 
				 	 * CONFIG accordingly. */
			
 
				 
			
 
				-	config->topology.nhwcpus = config->topology.nhwpus = sysconf(_SC_NPROCESSORS_ONLN);
			
 
				+	config->topology.nhwworker[STARPU_CPU_WORKER][0] = config->topology.nhwpus = sysconf(_SC_NPROCESSORS_ONLN);
			
 
				 
			
 
				 #elif defined(_WIN32)
			
 
				 	/* Discover the CPUs on Cygwin and MinGW systems. */
			
 
				 
			
 
				 	SYSTEM_INFO sysinfo;
			
 
				 	GetSystemInfo(&sysinfo);
			
 
				-	config->topology.nhwcpus = config->topology.nhwpus = sysinfo.dwNumberOfProcessors;
			
 
				+	config->topology.nhwworker[STARPU_CPU_WORKER][0] = config->topology.nhwpus = sysinfo.dwNumberOfProcessors;
			
 
				 #else
			
 
				 #warning no way to know number of cores, assuming 1
			
 
				-	config->topology.nhwcpus = config->topology.nhwpus = 1;
			
 
				+	config->topology.nhwworker[STARPU_CPU_WORKER][0] = config->topology.nhwpus = 1;
			
 
				 #endif
			
 
				 
			
 
				 	if (config->conf.ncuda != 0)
			
@@ -926,7 +927,7 @@ static void _starpu_init_topology(struct _starpu_machine_config *config)
 
				 	if (config->conf.nopencl != 0)
			
 
				 		_starpu_opencl_discover_devices(config);
			
 
				 #ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				-        config->topology.nhwmpi = _starpu_mpi_src_get_device_count();
			
 
				+        config->topology.nhwdevices[STARPU_MPI_MS_WORKER] = _starpu_mpi_src_get_device_count();
			
 
				 #endif
			
 
				 
			
 
				 	topology_is_initialized = 1;
			
@@ -941,7 +942,7 @@ static void _starpu_initialize_workers_bindid(struct _starpu_machine_config *con
 
				 	unsigned i;
			
 
				 
			
 
				 	struct _starpu_machine_topology *topology = &config->topology;
			
 
				-	int nhyperthreads = topology->nhwpus / topology->nhwcpus;
			
 
				+	int nhyperthreads = topology->nhwpus / topology->nhwworker[STARPU_CPU_WORKER][0];
			
 
				 	unsigned bind_on_core = 0;
			
 
				 	int scale = 1;
			
 
				 
			
@@ -1005,7 +1006,7 @@ static void _starpu_initialize_workers_bindid(struct _starpu_machine_config *con
 
				 						}
			
 
				 						else
			
 
				 						{
			
 
				-							endval = (bind_on_core ? topology->nhwcpus : topology->nhwpus) - 1;
			
 
				+							endval = (bind_on_core ? topology->nhwworker[STARPU_CPU_WORKER][0] : topology->nhwpus) - 1;
			
 
				 							if (*strval)
			
 
				 								strval++;
			
 
				 						}
			
@@ -1109,7 +1110,7 @@ static inline unsigned _starpu_get_next_bindid(struct _starpu_machine_config *co
 
				 	STARPU_ASSERT_MSG(topology_is_initialized, "The StarPU core is not initialized yet, have you called starpu_init?");
			
 
				 
			
 
				 	unsigned current_preferred;
			
 
				-	unsigned nhyperthreads = topology->nhwpus / topology->nhwcpus;
			
 
				+	unsigned nhyperthreads = topology->nhwpus / topology->nhwworker[STARPU_CPU_WORKER][0];
			
 
				 	unsigned ncores = topology->nhwpus / nhyperthreads;
			
 
				 	unsigned i;
			
 
				 
			
@@ -1192,7 +1193,7 @@ unsigned _starpu_topology_get_nhwcpu(struct _starpu_machine_config *config)
 
				 #endif
			
 
				 	_starpu_init_topology(config);
			
 
				 
			
 
				-	return config->topology.nhwcpus;
			
 
				+	return config->topology.nhwworker[STARPU_CPU_WORKER][0];
			
 
				 }
			
 
				 
			
 
				 unsigned _starpu_topology_get_nhwpu(struct _starpu_machine_config *config)
			
@@ -1272,7 +1273,7 @@ static void _starpu_init_mic_config(struct _starpu_machine_config *config,
 
				 
			
 
				 	struct _starpu_machine_topology *topology = &config->topology;
			
 
				 
			
 
				-	topology->nhwmiccores[mic_idx] = 0;
			
 
				+	topology->nhwworker[STARPU_MIC_WORKER][mic_idx] = 0;
			
 
				 
			
 
				 	_starpu_init_mic_topology(config, mic_idx);
			
 
				 
			
@@ -1284,29 +1285,29 @@ static void _starpu_init_mic_config(struct _starpu_machine_config *config,
 
				 	{
			
 
				 		/* Nothing was specified, so let's use the number of
			
 
				 		 * detected mic cores. ! */
			
 
				-		nmiccores = topology->nhwmiccores[mic_idx];
			
 
				+		nmiccores = topology->nhwworker[STARPU_MIC_WORKER][mic_idx];
			
 
				 	}
			
 
				 	else
			
 
				 	{
			
 
				-		if ((unsigned) nmiccores > topology->nhwmiccores[mic_idx])
			
 
				+		if ((unsigned) nmiccores > topology->nhwworker[STARPU_MIC_WORKER][mic_idx])
			
 
				 		{
			
 
				 			/* The user requires more MIC cores than there is available */
			
 
				-			_STARPU_MSG("# Warning: %d MIC cores requested. Only %u available.\n", nmiccores, topology->nhwmiccores[mic_idx]);
			
 
				-			nmiccores = topology->nhwmiccores[mic_idx];
			
 
				+			_STARPU_MSG("# Warning: %d MIC cores requested. Only %u available.\n", nmiccores, topology->nhwworker[STARPU_MIC_WORKER][mic_idx]);
			
 
				+			nmiccores = topology->nhwworker[STARPU_MIC_WORKER][mic_idx];
			
 
				 		}
			
 
				 	}
			
 
				 
			
 
				-	topology->nmiccores[mic_idx] = nmiccores;
			
 
				-	STARPU_ASSERT_MSG(topology->nmiccores[mic_idx] + topology->nworkers <= STARPU_NMAXWORKERS,
			
 
				-			  "topology->nmiccores[mic_idx(%u)] (%u) + topology->nworkers (%u) <= STARPU_NMAXWORKERS (%d)",
			
 
				-			  mic_idx, topology->nmiccores[mic_idx], topology->nworkers, STARPU_NMAXWORKERS);
			
 
				+	topology->nworker[STARPU_MIC_WORKER][mic_idx] = nmiccores;
			
 
				+	STARPU_ASSERT_MSG(topology->nworker[STARPU_MIC_WORKER][mic_idx] + topology->nworkers <= STARPU_NMAXWORKERS,
			
 
				+			  "topology->nworker[STARPU_MIC_WORKER][mic_idx(%u)] (%u) + topology->nworkers (%u) <= STARPU_NMAXWORKERS (%d)",
			
 
				+			  mic_idx, topology->nworker[STARPU_MIC_WORKER][mic_idx], topology->nworkers, STARPU_NMAXWORKERS);
			
 
				 
			
 
				 	/* _starpu_initialize_workers_mic_deviceid (config); */
			
 
				 
			
 
				 	mic_worker_set[mic_idx].workers = &config->workers[topology->nworkers];
			
 
				-	mic_worker_set[mic_idx].nworkers = topology->nmiccores[mic_idx];
			
 
				+	mic_worker_set[mic_idx].nworkers = topology->nworker[STARPU_MIC_WORKER][mic_idx];
			
 
				 	unsigned miccore_id;
			
 
				-	for (miccore_id = 0; miccore_id < topology->nmiccores[mic_idx]; miccore_id++)
			
 
				+	for (miccore_id = 0; miccore_id < topology->nworker[STARPU_MIC_WORKER][mic_idx]; miccore_id++)
			
 
				 	{
			
 
				 		int worker_idx = topology->nworkers + miccore_id;
			
 
				 		config->workers[worker_idx].set = &mic_worker_set[mic_idx];
			
@@ -1323,7 +1324,7 @@ static void _starpu_init_mic_config(struct _starpu_machine_config *config,
 
				 	}
			
 
				 	_starpu_mic_nodes[mic_idx]->baseworkerid = topology->nworkers;
			
 
				 
			
 
				-	topology->nworkers += topology->nmiccores[mic_idx];
			
 
				+	topology->nworkers += topology->nworker[STARPU_MIC_WORKER][mic_idx];
			
 
				 }
			
 
				 
			
 
				 static COIENGINE mic_handles[STARPU_MAXMICDEVS];
			
@@ -1337,7 +1338,7 @@ static void _starpu_init_mpi_config(struct _starpu_machine_config *config,
 
				 {
			
 
				         struct _starpu_machine_topology *topology = &config->topology;
			
 
				 
			
 
				-        topology->nhwmpicores[mpi_idx] = 0;
			
 
				+        topology->nhwworker[STARPU_MPI_MS_WORKER][mpi_idx] = 0;
			
 
				 
			
 
				         _starpu_init_mpi_topology(config, mpi_idx);
			
 
				 
			
@@ -1348,28 +1349,28 @@ static void _starpu_init_mpi_config(struct _starpu_machine_config *config,
 
				         {
			
 
				                 /* Nothing was specified, so let's use the number of
			
 
				                  * detected mpi cores. ! */
			
 
				-                nmpicores = topology->nhwmpicores[mpi_idx];
			
 
				+                nmpicores = topology->nhwworker[STARPU_MPI_MS_WORKER][mpi_idx];
			
 
				         }
			
 
				         else
			
 
				         {
			
 
				-                if ((unsigned) nmpicores > topology->nhwmpicores[mpi_idx])
			
 
				+                if ((unsigned) nmpicores > topology->nhwworker[STARPU_MPI_MS_WORKER][mpi_idx])
			
 
				                 {
			
 
				                         /* The user requires more MPI cores than there is available */
			
 
				                         _STARPU_MSG("# Warning: %d MPI cores requested. Only %u available.\n",
			
 
				-				    nmpicores, topology->nhwmpicores[mpi_idx]);
			
 
				-                        nmpicores = topology->nhwmpicores[mpi_idx];
			
 
				+				    nmpicores, topology->nhwworker[STARPU_MPI_MS_WORKER][mpi_idx]);
			
 
				+                        nmpicores = topology->nhwworker[STARPU_MPI_MS_WORKER][mpi_idx];
			
 
				                 }
			
 
				         }
			
 
				 
			
 
				-        topology->nmpicores[mpi_idx] = nmpicores;
			
 
				-        STARPU_ASSERT_MSG(topology->nmpicores[mpi_idx] + topology->nworkers <= STARPU_NMAXWORKERS,
			
 
				-                        "topology->nmpicores[mpi_idx(%u)] (%u) + topology->nworkers (%u) <= STARPU_NMAXWORKERS (%d)",
			
 
				-                        mpi_idx, topology->nmpicores[mpi_idx], topology->nworkers, STARPU_NMAXWORKERS);
			
 
				+        topology->nworker[STARPU_MPI_MS_WORKER][mpi_idx] = nmpicores;
			
 
				+        STARPU_ASSERT_MSG(topology->nworker[STARPU_MPI_MS_WORKER][mpi_idx] + topology->nworkers <= STARPU_NMAXWORKERS,
			
 
				+                        "topology->nworker[STARPU_MPI_MS_WORKER][mpi_idx(%u)] (%u) + topology->nworkers (%u) <= STARPU_NMAXWORKERS (%d)",
			
 
				+                        mpi_idx, topology->nworker[STARPU_MPI_MS_WORKER][mpi_idx], topology->nworkers, STARPU_NMAXWORKERS);
			
 
				 
			
 
				         mpi_worker_set[mpi_idx].workers = &config->workers[topology->nworkers];
			
 
				-        mpi_worker_set[mpi_idx].nworkers = topology->nmpicores[mpi_idx];
			
 
				+        mpi_worker_set[mpi_idx].nworkers = topology->nworker[STARPU_MPI_MS_WORKER][mpi_idx];
			
 
				         unsigned mpicore_id;
			
 
				-        for (mpicore_id = 0; mpicore_id < topology->nmpicores[mpi_idx]; mpicore_id++)
			
 
				+        for (mpicore_id = 0; mpicore_id < topology->nworker[STARPU_MPI_MS_WORKER][mpi_idx]; mpicore_id++)
			
 
				         {
			
 
				                 int worker_idx = topology->nworkers + mpicore_id;
			
 
				                 config->workers[worker_idx].set = &mpi_worker_set[mpi_idx];
			
@@ -1386,7 +1387,7 @@ static void _starpu_init_mpi_config(struct _starpu_machine_config *config,
 
				         }
			
 
				 	_starpu_mpi_ms_nodes[mpi_idx]->baseworkerid = topology->nworkers;
			
 
				 
			
 
				-        topology->nworkers += topology->nmpicores[mpi_idx];
			
 
				+        topology->nworkers += topology->nworker[STARPU_MPI_MS_WORKER][mpi_idx];
			
 
				 }
			
 
				 #endif
			
 
				 
			
@@ -1429,13 +1430,13 @@ static void _starpu_init_mp_config(struct _starpu_machine_config *config,
 
				 			}
			
 
				 		}
			
 
				 
			
 
				-		topology->nmicdevices = 0;
			
 
				+		topology->ndevices[STARPU_MIC_WORKER] = 0;
			
 
				 		unsigned i;
			
 
				 		for (i = 0; i < (unsigned) reqmicdevices; i++)
			
 
				 			if (0 == _starpu_init_mic_node(config, i, &mic_handles[i], &_starpu_mic_process[i]))
			
 
				-				topology->nmicdevices++;
			
 
				+				topology->ndevices[STARPU_MIC_WORKER]++;
			
 
				 
			
 
				-		for (i = 0; i < topology->nmicdevices; i++)
			
 
				+		for (i = 0; i < topology->ndevices[STARPU_MIC_WORKER]; i++)
			
 
				 			_starpu_init_mic_config(config, user_conf, i);
			
 
				 	}
			
 
				 #endif
			
@@ -1466,10 +1467,10 @@ static void _starpu_init_mp_config(struct _starpu_machine_config *config,
 
				 			}
			
 
				 		}
			
 
				 
			
 
				-		topology->nmpidevices = reqmpidevices;
			
 
				+		topology->ndevices[STARPU_MPI_MS_WORKER] = reqmpidevices;
			
 
				 
			
 
				 		/* if user don't want to use MPI slaves, we close the slave processes */
			
 
				-		if (no_mp_config && topology->nmpidevices == 0)
			
 
				+		if (no_mp_config && topology->ndevices[STARPU_MPI_MS_WORKER] == 0)
			
 
				 		{
			
 
				 			_starpu_mpi_common_mp_deinit();
			
 
				 			exit(0);
			
@@ -1478,10 +1479,10 @@ static void _starpu_init_mp_config(struct _starpu_machine_config *config,
 
				 		if (!no_mp_config)
			
 
				 		{
			
 
				 			unsigned i;
			
 
				-			for (i = 0; i < topology->nmpidevices; i++)
			
 
				+			for (i = 0; i < topology->ndevices[STARPU_MPI_MS_WORKER]; i++)
			
 
				 				_starpu_mpi_ms_nodes[i] = _starpu_mp_common_node_create(STARPU_NODE_MPI_SOURCE, i);
			
 
				 
			
 
				-			for (i = 0; i < topology->nmpidevices; i++)
			
 
				+			for (i = 0; i < topology->ndevices[STARPU_MPI_MS_WORKER]; i++)
			
 
				 				_starpu_init_mpi_config(config, user_conf, i);
			
 
				 		}
			
 
				 	}
			
@@ -1517,12 +1518,12 @@ static void _starpu_deinit_mp_config(struct _starpu_machine_config *config)
 
				 	unsigned i;
			
 
				 
			
 
				 #ifdef STARPU_USE_MIC
			
 
				-	for (i = 0; i < topology->nmicdevices; i++)
			
 
				+	for (i = 0; i < topology->ndevices[STARPU_MIC_WORKER]; i++)
			
 
				 		_starpu_deinit_mic_node(i);
			
 
				 	_starpu_mic_clear_kernels();
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				-	for (i = 0; i < topology->nmpidevices; i++)
			
 
				+	for (i = 0; i < topology->ndevices[STARPU_MPI_MS_WORKER]; i++)
			
 
				 		_starpu_deinit_mpi_node(i);
			
 
				 #endif
			
 
				 }
			
@@ -1632,9 +1633,10 @@ static int _starpu_init_machine_config(struct _starpu_machine_config *config, in
 
				 	}
			
 
				 
			
 
				 	/* Now we know how many CUDA devices will be used */
			
 
				-	topology->ncudagpus = ncuda;
			
 
				-	topology->nworkerpercuda = nworker_per_cuda;
			
 
				-	STARPU_ASSERT(topology->ncudagpus <= STARPU_MAXCUDADEVS);
			
 
				+	topology->ndevices[STARPU_CUDA_WORKER] = ncuda;
			
 
				+	for (i = 0; i < ncuda; i++)
			
 
				+		topology->nworker[STARPU_CUDA_WORKER][i] = nworker_per_cuda;
			
 
				+	STARPU_ASSERT(topology->ndevices[STARPU_CUDA_WORKER] <= STARPU_MAXCUDADEVS);
			
 
				 
			
 
				 	_starpu_initialize_workers_cuda_gpuid(config);
			
 
				 
			
@@ -1661,11 +1663,11 @@ static int _starpu_init_machine_config(struct _starpu_machine_config *config, in
 
				 	if (!topology->cuda_th_per_dev)
			
 
				 	{
			
 
				 		cuda_worker_set[0].workers = &config->workers[topology->nworkers];
			
 
				-		cuda_worker_set[0].nworkers = topology->ncudagpus * nworker_per_cuda;
			
 
				+		cuda_worker_set[0].nworkers = topology->ndevices[STARPU_CUDA_WORKER] * nworker_per_cuda;
			
 
				 	}
			
 
				 
			
 
				 	unsigned cudagpu;
			
 
				-	for (cudagpu = 0; cudagpu < topology->ncudagpus; cudagpu++)
			
 
				+	for (cudagpu = 0; cudagpu < topology->ndevices[STARPU_CUDA_WORKER]; cudagpu++)
			
 
				 	{
			
 
				 		int devid = _starpu_get_next_cuda_gpuid(config);
			
 
				 		int worker_idx0 = topology->nworkers + cudagpu * nworker_per_cuda;
			
@@ -1737,7 +1739,7 @@ static int _starpu_init_machine_config(struct _starpu_machine_config *config, in
 
				 #endif
			
 
				         }
			
 
				 
			
 
				-	topology->nworkers += topology->ncudagpus * nworker_per_cuda;
			
 
				+	topology->nworkers += topology->ndevices[STARPU_CUDA_WORKER] * nworker_per_cuda;
			
 
				 #endif
			
 
				 
			
 
				 #if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
			
@@ -1781,20 +1783,22 @@ static int _starpu_init_machine_config(struct _starpu_machine_config *config, in
 
				 		}
			
 
				 	}
			
 
				 
			
 
				-	topology->nopenclgpus = nopencl;
			
 
				-	STARPU_ASSERT(topology->nopenclgpus + topology->nworkers <= STARPU_NMAXWORKERS);
			
 
				+	topology->ndevices[STARPU_OPENCL_WORKER] = nopencl;
			
 
				+	for (i = 0; i < nopencl; i++)
			
 
				+		topology->nworker[STARPU_OPENCL_WORKER][i] = 1;
			
 
				+	STARPU_ASSERT(topology->ndevices[STARPU_OPENCL_WORKER] + topology->nworkers <= STARPU_NMAXWORKERS);
			
 
				 
			
 
				 	_starpu_initialize_workers_opencl_gpuid(config);
			
 
				 
			
 
				 	unsigned openclgpu;
			
 
				-	for (openclgpu = 0; openclgpu < topology->nopenclgpus; openclgpu++)
			
 
				+	for (openclgpu = 0; openclgpu < topology->ndevices[STARPU_OPENCL_WORKER]; openclgpu++)
			
 
				 	{
			
 
				 		int worker_idx = topology->nworkers + openclgpu;
			
 
				 		int devid = _starpu_get_next_opencl_gpuid(config);
			
 
				 		if (devid == -1)
			
 
				 		{
			
 
				 			// There is no more devices left
			
 
				-			topology->nopenclgpus = openclgpu;
			
 
				+			topology->ndevices[STARPU_OPENCL_WORKER] = openclgpu;
			
 
				 			break;
			
 
				 		}
			
 
				 		config->workers[worker_idx].arch = STARPU_OPENCL_WORKER;
			
@@ -1809,7 +1813,7 @@ static int _starpu_init_machine_config(struct _starpu_machine_config *config, in
 
				 		config->worker_mask |= STARPU_OPENCL;
			
 
				 	}
			
 
				 
			
 
				-	topology->nworkers += topology->nopenclgpus;
			
 
				+	topology->nworkers += topology->ndevices[STARPU_OPENCL_WORKER];
			
 
				 #endif
			
 
				 
			
 
				 #if defined(STARPU_USE_MIC) || defined(STARPU_USE_MPI_MASTER_SLAVE)
			
@@ -1829,13 +1833,13 @@ static int _starpu_init_machine_config(struct _starpu_machine_config *config, in
 
				 			unsigned mic_busy_cpus = 0;
			
 
				 			int j = 0;
			
 
				 			for (j = 0; j < STARPU_MAXMICDEVS; j++)
			
 
				-				mic_busy_cpus += (topology->nmiccores[j] ? 1 : 0);
			
 
				+				mic_busy_cpus += (topology->nworker[STARPU_MIC_WORKER][j] ? 1 : 0);
			
 
				 
			
 
				 			unsigned mpi_ms_busy_cpus = 0;
			
 
				 #ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				 #ifdef STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD
			
 
				 			for (j = 0; j < STARPU_MAXMPIDEVS; j++)
			
 
				-				mpi_ms_busy_cpus += (topology->nmpicores[j] ? 1 : 0);
			
 
				+				mpi_ms_busy_cpus += (topology->nworker[STARPU_MPI_MS_WORKER][j] ? 1 : 0);
			
 
				 #else
			
 
				 			mpi_ms_busy_cpus = 1; /* we launch one thread to control all slaves */
			
 
				 #endif
			
@@ -1843,14 +1847,14 @@ static int _starpu_init_machine_config(struct _starpu_machine_config *config, in
 
				 			unsigned cuda_busy_cpus = 0;
			
 
				 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
			
 
				 			cuda_busy_cpus =
			
 
				-				topology->cuda_th_per_dev == 0 && topology->cuda_th_per_stream == 0 ? (topology->ncudagpus ? 1 : 0) :
			
 
				-				topology->cuda_th_per_stream ? (nworker_per_cuda * topology->ncudagpus) : topology->ncudagpus;
			
 
				+				topology->cuda_th_per_dev == 0 && topology->cuda_th_per_stream == 0 ? (topology->ndevices[STARPU_CUDA_WORKER] ? 1 : 0) :
			
 
				+				topology->cuda_th_per_stream ? (nworker_per_cuda * topology->ndevices[STARPU_CUDA_WORKER]) : topology->ndevices[STARPU_CUDA_WORKER];
			
 
				 #endif
			
 
				 			unsigned already_busy_cpus = mpi_ms_busy_cpus + mic_busy_cpus
			
 
				 				+ cuda_busy_cpus
			
 
				-				+ topology->nopenclgpus;
			
 
				+				+ topology->ndevices[STARPU_OPENCL_WORKER];
			
 
				 
			
 
				-			long avail_cpus = (long) topology->nhwcpus - (long) already_busy_cpus;
			
 
				+			long avail_cpus = (long) topology->nhwworker[STARPU_CPU_WORKER][0] - (long) already_busy_cpus;
			
 
				 			if (avail_cpus < 0)
			
 
				 				avail_cpus = 0;
			
 
				 			int nth_per_core = starpu_get_env_number_default("STARPU_NTHREADS_PER_CORE", 1);
			
@@ -1880,12 +1884,13 @@ static int _starpu_init_machine_config(struct _starpu_machine_config *config, in
 
				 
			
 
				 	}
			
 
				 
			
 
				-	topology->ncpus = ncpu;
			
 
				-	STARPU_ASSERT(topology->ncpus + topology->nworkers <= STARPU_NMAXWORKERS);
			
 
				+	topology->ndevices[STARPU_CPU_WORKER] = 1;
			
 
				+	topology->nworker[STARPU_CPU_WORKER][0] = ncpu;
			
 
				+	STARPU_ASSERT(topology->nworker[STARPU_CPU_WORKER][0] + topology->nworkers <= STARPU_NMAXWORKERS);
			
 
				 
			
 
				 	unsigned cpu;
			
 
				 	unsigned homogeneous = starpu_get_env_number_default("STARPU_PERF_MODEL_HOMOGENEOUS_CPU", 1);
			
 
				-	for (cpu = 0; cpu < topology->ncpus; cpu++)
			
 
				+	for (cpu = 0; cpu < topology->nworker[STARPU_CPU_WORKER][0]; cpu++)
			
 
				 	{
			
 
				 		int worker_idx = topology->nworkers + cpu;
			
 
				 		config->workers[worker_idx].arch = STARPU_CPU_WORKER;
			
@@ -1900,7 +1905,7 @@ static int _starpu_init_machine_config(struct _starpu_machine_config *config, in
 
				 		config->worker_mask |= STARPU_CPU;
			
 
				 	}
			
 
				 
			
 
				-	topology->nworkers += topology->ncpus;
			
 
				+	topology->nworkers += topology->nworker[STARPU_CPU_WORKER][0];
			
 
				 #endif
			
 
				 
			
 
				 	if (topology->nworkers == 0)
			
@@ -2033,7 +2038,7 @@ int _starpu_bind_thread_on_cpu(int cpuid STARPU_ATTRIBUTE_UNUSED, int workerid S
 
				 
			
 
				 			if (workerid >= 0)
			
 
				 				/* This shouldn't happen for workers */
			
 
				-				_STARPU_DISP("[%s] Maybe check starpu_machine_display's output to determine what wrong binding happened. Hwloc reported %d cores and %d threads, perhaps there is misdetection between hwloc, the kernel and the BIOS, or an administrative allocation issue from e.g. the job scheduler?\n", hostname, config->topology.nhwcpus, config->topology.nhwpus);
			
 
				+				_STARPU_DISP("[%s] Maybe check starpu_machine_display's output to determine what wrong binding happened. Hwloc reported %d cores and %d threads, perhaps there is misdetection between hwloc, the kernel and the BIOS, or an administrative allocation issue from e.g. the job scheduler?\n", hostname, config->topology.nhwworker[STARPU_CPU_WORKER][0], config->topology.nhwpus);
			
 
				 			ret = -1;
			
 
				 		}
			
 
				 		else
			
@@ -2323,7 +2328,7 @@ static void _starpu_init_numa_node(struct _starpu_machine_config *config)
 
				 #endif
			
 
				 
			
 
				 #if defined(STARPU_USE_CUDA) && defined(STARPU_HAVE_HWLOC)
			
 
				-		for (i = 0; i < config->topology.ncudagpus; i++)
			
 
				+		for (i = 0; i < config->topology.ndevices[STARPU_CUDA_WORKER]; i++)
			
 
				 		{
			
 
				 			hwloc_obj_t obj = hwloc_cuda_get_device_osdev_by_index(config->topology.hwtopology, i);
			
 
				 			if (obj)
			
@@ -2358,7 +2363,7 @@ static void _starpu_init_numa_node(struct _starpu_machine_config *config)
 
				 		}
			
 
				 #endif
			
 
				 #if defined(STARPU_USE_OPENCL) && defined(STARPU_HAVE_HWLOC)
			
 
				-		if (config->topology.nopenclgpus > 0)
			
 
				+		if (config->topology.ndevices[STARPU_OPENCL_WORKER] > 0)
			
 
				 		{
			
 
				 			cl_int err;
			
 
				 			cl_platform_id platform_id[_STARPU_OPENCL_PLATFORM_MAX];
			
@@ -2970,6 +2975,7 @@ int _starpu_build_topology(struct _starpu_machine_config *config, int no_mp_conf
 
				 {
			
 
				 	int ret;
			
 
				 	unsigned i;
			
 
				+	enum starpu_worker_archtype type;
			
 
				 
			
 
				 	ret = _starpu_init_machine_config(config, no_mp_config);
			
 
				 	if (ret)
			
@@ -2983,48 +2989,16 @@ int _starpu_build_topology(struct _starpu_machine_config *config, int no_mp_conf
 
				 
			
 
				 	_starpu_mem_chunk_init_last();
			
 
				 
			
 
				-	config->cpus_nodeid = -1;
			
 
				-	config->cuda_nodeid = -1;
			
 
				-	config->opencl_nodeid = -1;
			
 
				-	config->mic_nodeid = -1;
			
 
				-        config->mpi_nodeid = -1;
			
 
				+	for (type = 0; type < STARPU_NARCH; type++)
			
 
				+		config->arch_nodeid[type] = -1;
			
 
				+
			
 
				 	for (i = 0; i < starpu_worker_get_count(); i++)
			
 
				 	{
			
 
				-		switch (starpu_worker_get_type(i))
			
 
				-		{
			
 
				-			case STARPU_CPU_WORKER:
			
 
				-				if (config->cpus_nodeid == -1)
			
 
				-					config->cpus_nodeid = starpu_worker_get_memory_node(i);
			
 
				-				else if (config->cpus_nodeid != (int) starpu_worker_get_memory_node(i))
			
 
				-					config->cpus_nodeid = -2;
			
 
				-				break;
			
 
				-			case STARPU_CUDA_WORKER:
			
 
				-				if (config->cuda_nodeid == -1)
			
 
				-					config->cuda_nodeid = starpu_worker_get_memory_node(i);
			
 
				-				else if (config->cuda_nodeid != (int) starpu_worker_get_memory_node(i))
			
 
				-					config->cuda_nodeid = -2;
			
 
				-				break;
			
 
				-			case STARPU_OPENCL_WORKER:
			
 
				-				if (config->opencl_nodeid == -1)
			
 
				-					config->opencl_nodeid = starpu_worker_get_memory_node(i);
			
 
				-				else if (config->opencl_nodeid != (int) starpu_worker_get_memory_node(i))
			
 
				-					config->opencl_nodeid = -2;
			
 
				-				break;
			
 
				-			case STARPU_MIC_WORKER:
			
 
				-				if (config->mic_nodeid == -1)
			
 
				-					config->mic_nodeid = starpu_worker_get_memory_node(i);
			
 
				-				else if (config->mic_nodeid != (int) starpu_worker_get_memory_node(i))
			
 
				-					config->mic_nodeid = -2;
			
 
				-				break;
			
 
				-			case STARPU_MPI_MS_WORKER:
			
 
				-				if (config->mpi_nodeid == -1)
			
 
				-					config->mpi_nodeid = starpu_worker_get_memory_node(i);
			
 
				-				else if (config->mpi_nodeid != (int) starpu_worker_get_memory_node(i))
			
 
				-					config->mpi_nodeid = -2;
			
 
				-				break;
			
 
				-			case STARPU_ANY_WORKER:
			
 
				-				STARPU_ASSERT(0);
			
 
				-		}
			
 
				+		type = starpu_worker_get_type(i);
			
 
				+		if (config->arch_nodeid[type] == -1)
			
 
				+			config->arch_nodeid[type] = starpu_worker_get_memory_node(i);
			
 
				+		else if (config->arch_nodeid[type] != (int) starpu_worker_get_memory_node(i))
			
 
				+			config->arch_nodeid[type] = -2;
			
 
				 	}
			
 
				 
			
 
				 	return 0;
			
@@ -3052,7 +3026,7 @@ void starpu_topology_print(FILE *output)
 
				 	unsigned worker;
			
 
				 	unsigned nworkers = starpu_worker_get_count();
			
 
				 	unsigned ncombinedworkers = topology->ncombinedworkers;
			
 
				-	unsigned nthreads_per_core = topology->nhwpus / topology->nhwcpus;
			
 
				+	unsigned nthreads_per_core = topology->nhwpus / topology->nhwworker[STARPU_CPU_WORKER][0];
			
 
				 
			
 
				 #ifdef STARPU_HAVE_HWLOC
			
 
				 	hwloc_topology_t topo = topology->hwtopology;
			
@@ -3141,5 +3115,5 @@ unsigned _starpu_get_nhyperthreads()
 
				 {
			
 
				 	struct _starpu_machine_config *config = _starpu_get_machine_config();
			
 
				 
			
 
				-	return config->topology.nhwpus / config->topology.nhwcpus;
			
 
				+	return config->topology.nhwpus / config->topology.nhwworker[STARPU_CPU_WORKER][0];
			
 
				 }
			
--- a/src/core/workers.c
+++ b/src/core/workers.c
@@ -44,6 +44,9 @@
 
				 #include <drivers/cpu/driver_cpu.h>
			
 
				 #include <drivers/cuda/driver_cuda.h>
			
 
				 #include <drivers/opencl/driver_opencl.h>
			
 
				+#include <drivers/mic/driver_mic_source.h>
			
 
				+#include <drivers/mpi/driver_mpi_source.h>
			
 
				+#include <drivers/disk/driver_disk.h>
			
 
				 
			
 
				 #ifdef STARPU_SIMGRID
			
 
				 #include <core/simgrid.h>
			
@@ -202,6 +205,20 @@ void _starpu__workers_c__register_kobs(void)
 
				 	/* TODO */
			
 
				 }
			
 
				 
			
 
				+struct starpu_driver_info starpu_driver_info[STARPU_NARCH];
			
 
				+
			
 
				+void starpu_driver_info_register(enum starpu_worker_archtype archtype, const struct starpu_driver_info *info)
			
 
				+{
			
 
				+	starpu_driver_info[archtype] = *info;
			
 
				+}
			
 
				+
			
 
				+struct starpu_memory_driver_info starpu_memory_driver_info[STARPU_MAX_RAM+1];
			
 
				+
			
 
				+void starpu_memory_driver_info_register(enum starpu_node_kind kind, const struct starpu_memory_driver_info *info)
			
 
				+{
			
 
				+	starpu_memory_driver_info[kind] = *info;
			
 
				+}
			
 
				+
			
 
				 /* Initialize value of static argc and argv, called when the process begins
			
 
				  */
			
 
				 void _starpu_set_argc_argv(int *argc_param, char ***argv_param)
			
@@ -626,10 +643,12 @@ static unsigned _starpu_may_launch_driver(struct starpu_conf *conf,
 
				 			if (d->id.cuda_id == conf->not_launched_drivers[i].id.cuda_id)
			
 
				 				return 0;
			
 
				 			break;
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				 		case STARPU_OPENCL_WORKER:
			
 
				 			if (d->id.opencl_id == conf->not_launched_drivers[i].id.opencl_id)
			
 
				 				return 0;
			
 
				 			break;
			
 
				+#endif
			
 
				 		default:
			
 
				 			STARPU_ABORT();
			
 
				 		}
			
@@ -753,23 +772,23 @@ static void _starpu_worker_deinit(struct _starpu_worker *workerarg)
 
				 }
			
 
				 
			
 
				 #ifdef STARPU_USE_FXT
			
 
				-void _starpu_worker_start(struct _starpu_worker *worker, unsigned fut_key, unsigned sync)
			
 
				+void _starpu_worker_start(struct _starpu_worker *worker, enum starpu_worker_archtype archtype, unsigned sync)
			
 
				 {
			
 
				 	unsigned devid = worker->devid;
			
 
				 	unsigned memnode = worker->memory_node;
			
 
				-	_STARPU_TRACE_WORKER_INIT_START(fut_key, worker->workerid, devid, memnode, worker->bindid, sync);
			
 
				+	_STARPU_TRACE_WORKER_INIT_START(archtype, worker->workerid, devid, memnode, worker->bindid, sync);
			
 
				 }
			
 
				 #endif
			
 
				 
			
 
				-void _starpu_driver_start(struct _starpu_worker *worker, unsigned fut_key, unsigned sync STARPU_ATTRIBUTE_UNUSED)
			
 
				+void _starpu_driver_start(struct _starpu_worker *worker, enum starpu_worker_archtype archtype, unsigned sync STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				-	(void) fut_key;
			
 
				+	(void) archtype;
			
 
				 	int devid = worker->devid;
			
 
				 	(void) devid;
			
 
				 
			
 
				 #ifdef STARPU_USE_FXT
			
 
				 	_STARPU_TRACE_REGISTER_THREAD(worker->bindid);
			
 
				-	_starpu_worker_start(worker, fut_key, sync);
			
 
				+	_starpu_worker_start(worker, archtype, sync);
			
 
				 #endif
			
 
				 	_starpu_set_local_worker_key(worker);
			
 
				 
			
@@ -987,7 +1006,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 
				 	}
			
 
				 
			
 
				 #if defined(STARPU_USE_MPI_MASTER_SLAVE) && !defined(STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD)
			
 
				-        if (pconfig->topology.nmpidevices > 0)
			
 
				+        if (pconfig->topology.ndevices[STARPU_MPI_MS_WORKER] > 0)
			
 
				         {
			
 
				                 struct _starpu_worker_set * worker_set_zero = &mpi_worker_set[0];
			
 
				                 struct _starpu_worker * worker_zero = &worker_set_zero->workers[0];
			
@@ -1152,6 +1171,16 @@ int starpu_conf_init(struct starpu_conf *conf)
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				+int starpu_conf_noworker(struct starpu_conf *conf)
			
 
				+{
			
 
				+	conf->ncpus = 0;
			
 
				+	conf->ncuda = 0;
			
 
				+	conf->nopencl = 0;
			
 
				+	conf->nmic = 0;
			
 
				+	conf->nmpi_ms = 0;
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				 static void _starpu_conf_set_value_against_environment(char *name, int *value, int precedence_over_env)
			
 
				 {
			
 
				 	if (precedence_over_env == 0)
			
@@ -1373,6 +1402,16 @@ int _starpu_get_catch_signals(void)
 
				 	return _starpu_config.conf.catch_signals;
			
 
				 }
			
 
				 
			
 
				+void starpu_drivers_preinit(void)
			
 
				+{
			
 
				+	_starpu_cpu_preinit();
			
 
				+	_starpu_cuda_preinit();
			
 
				+	_starpu_opencl_preinit();
			
 
				+	_starpu_mic_preinit();
			
 
				+	_starpu_mpi_ms_preinit();
			
 
				+	_starpu_disk_preinit();
			
 
				+}
			
 
				+
			
 
				 int starpu_init(struct starpu_conf *user_conf)
			
 
				 {
			
 
				 	return starpu_initialize(user_conf, NULL, NULL);
			
@@ -1557,6 +1596,9 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 
				 
			
 
				 	_starpu_load_bus_performance_files();
			
 
				 
			
 
				+	/* Let drivers register themselves */
			
 
				+	starpu_drivers_preinit();
			
 
				+
			
 
				 	/* Note: nothing before here should be allocating anything, in case we
			
 
				 	 * actually return ENODEV here */
			
 
				 
			
@@ -2065,32 +2107,22 @@ unsigned starpu_worker_is_slave_somewhere(int workerid)
 
				 
			
 
				 int starpu_worker_get_count_by_type(enum starpu_worker_archtype type)
			
 
				 {
			
 
				-	switch (type)
			
 
				-	{
			
 
				-		case STARPU_CPU_WORKER:
			
 
				-			return _starpu_config.topology.ncpus;
			
 
				+	unsigned n = 0;
			
 
				 
			
 
				-		case STARPU_CUDA_WORKER:
			
 
				-			return _starpu_config.topology.ncudagpus * _starpu_config.topology.nworkerpercuda;
			
 
				-
			
 
				-		case STARPU_OPENCL_WORKER:
			
 
				-			return _starpu_config.topology.nopenclgpus;
			
 
				-
			
 
				-		case STARPU_MIC_WORKER:
			
 
				-			return _starpu_config.topology.nmicdevices;
			
 
				-
			
 
				-                case STARPU_MPI_MS_WORKER:
			
 
				-                        return _starpu_config.topology.nmpidevices;
			
 
				-
			
 
				-                case STARPU_ANY_WORKER:
			
 
				-                        return _starpu_config.topology.ncpus+
			
 
				-				_starpu_config.topology.ncudagpus * _starpu_config.topology.nworkerpercuda+
			
 
				-                                _starpu_config.topology.nopenclgpus+
			
 
				-                                _starpu_config.topology.nmicdevices+
			
 
				-                                _starpu_config.topology.nmpidevices;
			
 
				-		default:
			
 
				+	if (type != STARPU_ANY_WORKER)
			
 
				+	{
			
 
				+		if (type >= STARPU_NARCH)
			
 
				 			return -EINVAL;
			
 
				+
			
 
				+		unsigned i;
			
 
				+		for (i = 0; i < _starpu_config.topology.ndevices[type]; i++)
			
 
				+			n += _starpu_config.topology.nworker[type][i];
			
 
				+		return n;
			
 
				 	}
			
 
				+
			
 
				+	for (type = 0; type < STARPU_NARCH; type++)
			
 
				+		n += starpu_worker_get_count_by_type(type);
			
 
				+	return n;
			
 
				 }
			
 
				 
			
 
				 unsigned starpu_combined_worker_get_count(void)
			
@@ -2100,17 +2132,17 @@ unsigned starpu_combined_worker_get_count(void)
 
				 
			
 
				 unsigned starpu_cpu_worker_get_count(void)
			
 
				 {
			
 
				-	return _starpu_config.topology.ncpus;
			
 
				+	return starpu_worker_get_count_by_type(STARPU_CPU_WORKER);
			
 
				 }
			
 
				 
			
 
				 unsigned starpu_cuda_worker_get_count(void)
			
 
				 {
			
 
				-	return _starpu_config.topology.ncudagpus * _starpu_config.topology.nworkerpercuda;
			
 
				+	return starpu_worker_get_count_by_type(STARPU_CUDA_WORKER);
			
 
				 }
			
 
				 
			
 
				 unsigned starpu_opencl_worker_get_count(void)
			
 
				 {
			
 
				-	return _starpu_config.topology.nopenclgpus;
			
 
				+	return starpu_worker_get_count_by_type(STARPU_OPENCL_WORKER);
			
 
				 }
			
 
				 
			
 
				 int starpu_asynchronous_copy_disabled(void)
			
@@ -2140,17 +2172,12 @@ int starpu_asynchronous_mpi_ms_copy_disabled(void)
 
				 
			
 
				 unsigned starpu_mic_worker_get_count(void)
			
 
				 {
			
 
				-	int i = 0, count = 0;
			
 
				-
			
 
				-	for (i = 0; i < STARPU_MAXMICDEVS; i++)
			
 
				-		count += _starpu_config.topology.nmiccores[i];
			
 
				-
			
 
				-	return count;
			
 
				+	return starpu_worker_get_count_by_type(STARPU_MIC_WORKER);
			
 
				 }
			
 
				 
			
 
				 unsigned starpu_mpi_ms_worker_get_count(void)
			
 
				 {
			
 
				-        return _starpu_config.topology.nmpidevices;
			
 
				+	return starpu_worker_get_count_by_type(STARPU_MPI_MS_WORKER);
			
 
				 }
			
 
				 
			
 
				 /* When analyzing performance, it is useful to see what is the processing unit
			
@@ -2555,15 +2582,20 @@ unsigned starpu_worker_get_sched_ctx_list(int workerid, unsigned **sched_ctxs)
 
				 	return nsched_ctxs;
			
 
				 }
			
 
				 
			
 
				-char *starpu_worker_get_type_as_string(enum starpu_worker_archtype type)
			
 
				+const char *starpu_worker_get_type_as_string(enum starpu_worker_archtype type)
			
 
				+{
			
 
				+	const char *ret = starpu_driver_info[type].name_upper;
			
 
				+	if (!ret)
			
 
				+		ret = "unknown";
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+const char *starpu_worker_get_type_as_env_var(enum starpu_worker_archtype type)
			
 
				 {
			
 
				-	if (type == STARPU_CPU_WORKER) return "STARPU_CPU_WORKER";
			
 
				-	if (type == STARPU_CUDA_WORKER) return "STARPU_CUDA_WORKER";
			
 
				-	if (type == STARPU_OPENCL_WORKER) return "STARPU_OPENCL_WORKER";
			
 
				-	if (type == STARPU_MIC_WORKER) return "STARPU_MIC_WORKER";
			
 
				-        if (type == STARPU_MPI_MS_WORKER) return "STARPU_MPI_MS_WORKER";
			
 
				-	if (type == STARPU_ANY_WORKER) return "STARPU_ANY_WORKER";
			
 
				-	return "STARPU_unknown_WORKER";
			
 
				+	const char *ret = starpu_driver_info[type].name_var;
			
 
				+	if (!ret)
			
 
				+		ret = "UNKNOWN";
			
 
				+	return ret;
			
 
				 }
			
 
				 
			
 
				 void _starpu_worker_set_stream_ctx(unsigned workerid, struct _starpu_sched_ctx *sched_ctx)
			
@@ -2771,22 +2803,9 @@ void starpu_worker_set_waking_up_callback(void (*callback)(unsigned workerid))
 
				 }
			
 
				 #endif
			
 
				 
			
 
				-enum starpu_node_kind _starpu_worker_get_node_kind(enum starpu_worker_archtype type)
			
 
				+enum starpu_node_kind starpu_worker_get_memory_node_kind(enum starpu_worker_archtype type)
			
 
				 {
			
 
				-	switch(type)
			
 
				-	{
			
 
				-		case STARPU_CPU_WORKER:
			
 
				-			return STARPU_CPU_RAM;
			
 
				-		case STARPU_CUDA_WORKER:
			
 
				-			return STARPU_CUDA_RAM;
			
 
				-		case STARPU_OPENCL_WORKER:
			
 
				-			return STARPU_OPENCL_RAM;
			
 
				-			break;
			
 
				-		case STARPU_MIC_WORKER:
			
 
				-			return STARPU_MIC_RAM;
			
 
				-		case STARPU_MPI_MS_WORKER:
			
 
				-			return STARPU_MPI_MS_RAM;
			
 
				-		default:
			
 
				-			STARPU_ABORT();
			
 
				-	}
			
 
				+	enum starpu_node_kind kind = starpu_driver_info[type].memory_kind;
			
 
				+	STARPU_ASSERT_MSG(kind != (enum starpu_node_kind) -1, "no memory for archtype %d", type);
			
 
				+	return kind;
			
 
				 }
			
--- a/src/core/workers.h
+++ b/src/core/workers.h
@@ -59,8 +59,6 @@
 
				 
			
 
				 #include <datawizard/datawizard.h>
			
 
				 
			
 
				-#include <starpu_parameters.h>
			
 
				-
			
 
				 #define STARPU_MAX_PIPELINE 4
			
 
				 
			
 
				 enum initialization { UNINITIALIZED = 0, CHANGING, INITIALIZED };
			
@@ -271,58 +269,33 @@ struct _starpu_machine_topology
 
				 	/** custom hwloc tree*/
			
 
				 	struct starpu_tree *tree;
			
 
				 
			
 
				-	/** Total number of CPU cores, as detected by the topology code. May
			
 
				-	 * be different from the actual number of CPU workers.
			
 
				-	 */
			
 
				-	unsigned nhwcpus;
			
 
				-
			
 
				 	/** Total number of PUs (i.e. threads), as detected by the topology code. May
			
 
				-	 * be different from the actual number of PU workers.
			
 
				+	 * be different from the actual number of CPU workers.
			
 
				 	 */
			
 
				 	unsigned nhwpus;
			
 
				 
			
 
				-	/** Total number of CUDA devices, as detected. May be different
			
 
				-	 * from the actual number of CUDA workers.
			
 
				+	/** Total number of devices, as detected. May be different from the
			
 
				+	 * actual number of devices run by StarPU.
			
 
				 	 */
			
 
				-	unsigned nhwcudagpus;
			
 
				-
			
 
				-	/** Total number of OpenCL devices, as detected. May be
			
 
				-	 * different from the actual number of OpenCL workers.
			
 
				+	unsigned nhwdevices[STARPU_NARCH];
			
 
				+	/** Total number of worker for each device, as detected. May be different from the
			
 
				+	 * actual number of workers run by StarPU.
			
 
				 	 */
			
 
				-	unsigned nhwopenclgpus;
			
 
				+	unsigned nhwworker[STARPU_NARCH][STARPU_NMAXDEVS];
			
 
				 
			
 
				-	/** Total number of MPI nodes, as detected. May be different
			
 
				-	 * from the actual number of node workers.
			
 
				+	/** Actual number of devices used by StarPU.
			
 
				 	 */
			
 
				-	unsigned nhwmpi;
			
 
				+	unsigned ndevices[STARPU_NARCH];
			
 
				 
			
 
				-	/** Actual number of CPU workers used by StarPU. */
			
 
				-	unsigned ncpus;
			
 
				+	/** Number of worker per device
			
 
				+	 */
			
 
				+	unsigned nworker[STARPU_NARCH][STARPU_NMAXDEVS];
			
 
				 
			
 
				-	/** Actual number of CUDA GPUs used by StarPU. */
			
 
				-	unsigned ncudagpus;
			
 
				-	unsigned nworkerpercuda;
			
 
				+	/** Whether we should have one thread per stream */
			
 
				 	int cuda_th_per_stream;
			
 
				+	/** Whether we should have one thread per device */
			
 
				 	int cuda_th_per_dev;
			
 
				 
			
 
				-	/** Actual number of OpenCL workers used by StarPU. */
			
 
				-	unsigned nopenclgpus;
			
 
				-
			
 
				-	/** Actual number of MPI workers used by StarPU. */
			
 
				-	unsigned nmpidevices;
			
 
				-        unsigned nhwmpidevices;
			
 
				-
			
 
				-	unsigned nhwmpicores[STARPU_MAXMPIDEVS]; /**< Each MPI node has its set of cores. */
			
 
				-	unsigned nmpicores[STARPU_MAXMPIDEVS];
			
 
				-
			
 
				-	/** Topology of MP nodes (MIC) as well as necessary
			
 
				-	 * objects to communicate with them. */
			
 
				-	unsigned nhwmicdevices;
			
 
				-	unsigned nmicdevices;
			
 
				-
			
 
				-	unsigned nhwmiccores[STARPU_MAXMICDEVS]; /**< Each MIC node has its set of cores. */
			
 
				-	unsigned nmiccores[STARPU_MAXMICDEVS];
			
 
				-
			
 
				 	/** Indicates the successive logical PU identifier that should be used
			
 
				 	 * to bind the workers. It is either filled according to the
			
 
				 	 * user's explicit parameters (from starpu_conf) or according
			
@@ -386,16 +359,8 @@ struct _starpu_machine_config
 
				 	/** Which MPI do we use? */
			
 
				 	int current_mpi_deviceid;
			
 
				 
			
 
				-	/** Memory node for cpus, if only one */
			
 
				-	int cpus_nodeid;
			
 
				-	/** Memory node for CUDA, if only one */
			
 
				-	int cuda_nodeid;
			
 
				-	/** Memory node for OpenCL, if only one */
			
 
				-	int opencl_nodeid;
			
 
				-	/** Memory node for MIC, if only one */
			
 
				-	int mic_nodeid;
			
 
				-	/** Memory node for MPI, if only one */
			
 
				-	int mpi_nodeid;
			
 
				+	/** Memory node for different worker types, if only one */
			
 
				+	int arch_nodeid [STARPU_NARCH];
			
 
				 
			
 
				 	/** Separate out previous variables from per-worker data. */
			
 
				 	char padding1[STARPU_CACHELINE_SIZE];
			
@@ -450,6 +415,33 @@ struct _starpu_machine_config
 
				 	int perf_counter_pause_depth;
			
 
				 };
			
 
				 
			
 
				+/** Provides information for a device driver */
			
 
				+struct starpu_driver_info
			
 
				+{
			
 
				+	const char *name_upper;	/**< Name of worker type in upper case */
			
 
				+	const char *name_var;	/**< Name of worker type for environment variables */
			
 
				+	const char *name_lower;	/**< Name of worker type in lower case */
			
 
				+	enum starpu_node_kind memory_kind;	/**< Kind of memory in device */
			
 
				+	double alpha;	/**< Typical relative speed compared to a CPU core */
			
 
				+};
			
 
				+
			
 
				+/** Device driver information, indexed by enum starpu_worker_archtype */
			
 
				+extern struct starpu_driver_info starpu_driver_info[STARPU_NARCH];
			
 
				+
			
 
				+void starpu_driver_info_register(enum starpu_worker_archtype archtype, const struct starpu_driver_info *info);
			
 
				+
			
 
				+/** Provides information for a memory node driver */
			
 
				+struct starpu_memory_driver_info
			
 
				+{
			
 
				+	const char *name_upper;	/**< Name of memory in upper case */
			
 
				+	enum starpu_worker_archtype worker_archtype;	/**< Kind of device */
			
 
				+};
			
 
				+
			
 
				+/** Memory driver information, indexed by enum starpu_node_kind */
			
 
				+extern struct starpu_memory_driver_info starpu_memory_driver_info[STARPU_MAX_RAM+1];
			
 
				+
			
 
				+void starpu_memory_driver_info_register(enum starpu_node_kind kind, const struct starpu_memory_driver_info *info);
			
 
				+
			
 
				 extern int _starpu_worker_parallel_blocks;
			
 
				 
			
 
				 extern struct _starpu_machine_config _starpu_config STARPU_ATTRIBUTE_INTERNAL;
			
@@ -507,9 +499,9 @@ unsigned _starpu_worker_can_block(unsigned memnode, struct _starpu_worker *worke
 
				 void _starpu_block_worker(int workerid, starpu_pthread_cond_t *cond, starpu_pthread_mutex_t *mutex);
			
 
				 
			
 
				 /** This function initializes the current driver for the given worker */
			
 
				-void _starpu_driver_start(struct _starpu_worker *worker, unsigned fut_key, unsigned sync);
			
 
				+void _starpu_driver_start(struct _starpu_worker *worker, enum starpu_worker_archtype archtype, unsigned sync);
			
 
				 /** This function initializes the current thread for the given worker */
			
 
				-void _starpu_worker_start(struct _starpu_worker *worker, unsigned fut_key, unsigned sync);
			
 
				+void _starpu_worker_start(struct _starpu_worker *worker, enum starpu_worker_archtype archtype, unsigned sync);
			
 
				 
			
 
				 static inline unsigned _starpu_worker_get_count(void)
			
 
				 {
			
@@ -658,8 +650,6 @@ static inline unsigned __starpu_worker_get_id_check(const char *f, int l)
 
				 }
			
 
				 #define _starpu_worker_get_id_check(f,l) __starpu_worker_get_id_check(f,l)
			
 
				 
			
 
				-enum starpu_node_kind _starpu_worker_get_node_kind(enum starpu_worker_archtype type);
			
 
				-
			
 
				 void _starpu_worker_set_stream_ctx(unsigned workerid, struct _starpu_sched_ctx *sched_ctx);
			
 
				 
			
 
				 struct _starpu_sched_ctx* _starpu_worker_get_ctx_stream(unsigned stream_workerid);
			
--- a/src/datawizard/coherency.c
+++ b/src/datawizard/coherency.c
@@ -139,26 +139,24 @@ int _starpu_select_src_node(starpu_data_handle_t handle, unsigned destination)
 
				 			 * 	Unless peer transfer is supported (and it would then have been selected above).
			
 
				 			 * 	Other should be ok */
			
 
				 
			
 
				-			if (starpu_node_get_kind(i) == STARPU_CUDA_RAM ||
			
 
				-			    starpu_node_get_kind(i) == STARPU_OPENCL_RAM ||
			
 
				-			    starpu_node_get_kind(i) == STARPU_MIC_RAM)
			
 
				-				i_gpu = i;
			
 
				-
			
 
				 			if (starpu_node_get_kind(i) == STARPU_CPU_RAM ||
			
 
				                             starpu_node_get_kind(i) == STARPU_MPI_MS_RAM)
			
 
				 				i_ram = i;
			
 
				-			if (starpu_node_get_kind(i) == STARPU_DISK_RAM)
			
 
				+			else if (starpu_node_get_kind(i) == STARPU_DISK_RAM)
			
 
				 				i_disk = i;
			
 
				+			else
			
 
				+				i_gpu = i;
			
 
				 		}
			
 
				 	}
			
 
				 
			
 
				 	/* we have to use cpu_ram in first */
			
 
				 	if (i_ram != -1)
			
 
				 		src_node = i_ram;
			
 
				-	/* no luck we have to use the disk memory */
			
 
				 	else if (i_gpu != -1)
			
 
				+	/* otherwise a gpu */
			
 
				 		src_node = i_gpu;
			
 
				 	else
			
 
				+	/* no luck we have to use the disk memory */
			
 
				 		src_node = i_disk;
			
 
				 
			
 
				 	STARPU_ASSERT(src_node != -1);
			
--- a/src/datawizard/memalloc.c
+++ b/src/datawizard/memalloc.c
@@ -1513,7 +1513,8 @@ static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, st
 
				 			/* First try to flush data explicitly marked for freeing */
			
 
				 			size_t freed = flush_memchunk_cache(dst_node, reclaim);
			
 
				 
			
 
				-			if (freed >= reclaim) {
			
 
				+			if (freed >= reclaim)
			
 
				+			{
			
 
				 				/* That freed enough data, retry allocating */
			
 
				 				prefetch_out_of_memory[dst_node] = 0;
			
 
				 				continue;
			
@@ -1550,7 +1551,8 @@ static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, st
 
				 			_starpu_memory_reclaim_generic(dst_node, 0, reclaim);
			
 
				 			_STARPU_TRACE_END_MEMRECLAIM(dst_node,is_prefetch);
			
 
				 			prefetch_out_of_memory[dst_node] = 0;
			
 
				-		} else
			
 
				+		}
			
 
				+		else
			
 
				 			prefetch_out_of_memory[dst_node] = 0;
			
 
				 	}
			
 
				 	while((allocated_memory == -ENOMEM) && attempts++ < 2);
			
--- a/src/datawizard/memory_nodes.c
+++ b/src/datawizard/memory_nodes.c
@@ -179,3 +179,10 @@ int starpu_memory_node_get_devid(unsigned node)
 
				 {
			
 
				 	return _starpu_descr.devid[node];
			
 
				 }
			
 
				+
			
 
				+enum starpu_worker_archtype starpu_memory_node_get_worker_archtype(enum starpu_node_kind node_kind)
			
 
				+{
			
 
				+	enum starpu_worker_archtype archtype = starpu_memory_driver_info[node_kind].worker_archtype;
			
 
				+	STARPU_ASSERT_MSG(archtype != (enum starpu_worker_archtype) -1, "ambiguous memory node kind %d", node_kind);
			
 
				+	return archtype;
			
 
				+}
			
--- a/src/datawizard/node_ops.c
+++ b/src/datawizard/node_ops.c
@@ -28,23 +28,7 @@
 
				 
			
 
				 const char* _starpu_node_get_prefix(enum starpu_node_kind kind)
			
 
				 {
			
 
				-	switch (kind)
			
 
				-	{
			
 
				-		case STARPU_CPU_RAM:
			
 
				-			return "NUMA";
			
 
				-		case STARPU_CUDA_RAM:
			
 
				-			return "CUDA";
			
 
				-		case STARPU_OPENCL_RAM:
			
 
				-			return "OpenCL";
			
 
				-		case STARPU_DISK_RAM:
			
 
				-			return "Disk";
			
 
				-		case STARPU_MIC_RAM:
			
 
				-			return "MIC";
			
 
				-		case STARPU_MPI_MS_RAM:
			
 
				-			return "MPI_MS";
			
 
				-		case STARPU_UNUSED:
			
 
				-		default:
			
 
				-			STARPU_ASSERT(0);
			
 
				-			return "unknown";
			
 
				-	}
			
 
				+	const char *ret = starpu_memory_driver_info[kind].name_upper;
			
 
				+	STARPU_ASSERT(ret);
			
 
				+	return ret;
			
 
				 }
			
--- a/src/datawizard/node_ops.h
+++ b/src/datawizard/node_ops.h
@@ -46,10 +46,10 @@ typedef int (*copy3d_data_t)(uintptr_t src_ptr, size_t src_offset, unsigned src_
 
				 
			
 
				 struct _starpu_node_ops
			
 
				 {
			
 
				-	copy_interface_func_t copy_interface_to[STARPU_MPI_MS_RAM+1];
			
 
				-	copy_data_t copy_data_to[STARPU_MPI_MS_RAM+1];
			
 
				-	copy2d_data_t copy2d_data_to[STARPU_MPI_MS_RAM+1];
			
 
				-	copy3d_data_t copy3d_data_to[STARPU_MPI_MS_RAM+1];
			
 
				+	copy_interface_func_t copy_interface_to[STARPU_MAX_RAM+1];
			
 
				+	copy_data_t copy_data_to[STARPU_MAX_RAM+1];
			
 
				+	copy2d_data_t copy2d_data_to[STARPU_MAX_RAM+1];
			
 
				+	copy3d_data_t copy3d_data_to[STARPU_MAX_RAM+1];
			
 
				 	void (*wait_request_completion)(struct _starpu_async_channel *async_channel);
			
 
				 	unsigned (*test_request_completion)(struct _starpu_async_channel *async_channel);
			
 
				 	int (*is_direct_access_supported)(unsigned node, unsigned handling_node);
			
--- a/src/debug/traces/starpu_fxt.c
+++ b/src/debug/traces/starpu_fxt.c
@@ -39,30 +39,18 @@
 
				 #include <starpu_hash.h>
			
 
				 
			
 
				 #define CPUS_WORKER_COLORS_NB	8
			
 
				-#define CUDA_WORKER_COLORS_NB	9
			
 
				-#define OPENCL_WORKER_COLORS_NB 9
			
 
				-#define MIC_WORKER_COLORS_NB	9
			
 
				-#define MPI_MS_WORKER_COLORS_NB	9
			
 
				-#define OTHER_WORKER_COLORS_NB	4
			
 
				+#define ACCEL_WORKER_COLORS_NB	9
			
 
				 
			
 
				 /* How many times longer an idle period has to be before the smoothing
			
 
				  * heuristics avoids averaging codelet gflops */
			
 
				 #define IDLE_FACTOR 2
			
 
				 
			
 
				 static char *cpus_worker_colors[CPUS_WORKER_COLORS_NB] = {"/greens9/7", "/greens9/6", "/greens9/5", "/greens9/4",  "/greens9/9", "/greens9/3",  "/greens9/2",  "/greens9/1"  };
			
 
				-static char *cuda_worker_colors[CUDA_WORKER_COLORS_NB] = {"/ylorrd9/9", "/ylorrd9/6", "/ylorrd9/3", "/ylorrd9/1", "/ylorrd9/8", "/ylorrd9/7", "/ylorrd9/4", "/ylorrd9/2",  "/ylorrd9/1"};
			
 
				-static char *opencl_worker_colors[OPENCL_WORKER_COLORS_NB] = {"/blues9/9", "/blues9/6", "/blues9/3", "/blues9/1", "/blues9/8", "/blues9/7", "/blues9/4", "/blues9/2",  "/blues9/1"};
			
 
				-static char *mic_worker_colors[MIC_WORKER_COLORS_NB] = {"/reds9/9", "/reds9/6", "/reds9/3", "/reds9/1", "/reds9/8", "/reds9/7", "/reds9/4", "/reds9/2",  "/reds9/1"};
			
 
				-static char *mpi_ms_worker_colors[MPI_MS_WORKER_COLORS_NB] = {"/reds9/9", "/reds9/6", "/reds9/3", "/reds9/1", "/reds9/8", "/reds9/7", "/reds9/4", "/reds9/2",  "/reds9/1"};
			
 
				-static char *other_worker_colors[OTHER_WORKER_COLORS_NB] = {"/greys9/9", "/greys9/8", "/greys9/7", "/greys9/6"};
			
 
				+static char *accel_worker_colors[ACCEL_WORKER_COLORS_NB] = {"/ylorrd9/9", "/ylorrd9/6", "/ylorrd9/3", "/ylorrd9/1", "/ylorrd9/8", "/ylorrd9/7", "/ylorrd9/4", "/ylorrd9/2",  "/ylorrd9/1"};
			
 
				 static char *worker_colors[STARPU_NMAXWORKERS];
			
 
				 
			
 
				-static unsigned opencl_index = 0;
			
 
				-static unsigned cuda_index = 0;
			
 
				 static unsigned cpus_index = 0;
			
 
				-static unsigned mic_index = 0;
			
 
				-static unsigned mpi_ms_index = 0;
			
 
				-static unsigned other_index = 0;
			
 
				+static unsigned accel_index = 0;
			
 
				 static uint64_t* number_events = NULL;
			
 
				 
			
 
				 static unsigned long fut_keymask;
			
@@ -205,7 +193,8 @@ static void task_dump(struct task_info *task, struct starpu_fxt_options *options
 
				 		fprintf(tasks_file, "Name: %s\n", task->name);
			
 
				 	if (task->model_name)
			
 
				 		fprintf(tasks_file, "Model: %s\n", task->model_name);
			
 
				-	if (task->file) {
			
 
				+	if (task->file)
			
 
				+	{
			
 
				 		fprintf(tasks_file, "File: %s\n", task->file);
			
 
				 		fprintf(tasks_file, "Line: %d\n", task->line);
			
 
				 	}
			
@@ -395,14 +384,6 @@ out:
 
				 	free(data);
			
 
				 }
			
 
				 
			
 
				-static void set_next_other_worker_color(int workerid)
			
 
				-{
			
 
				-	if (workerid >= STARPU_NMAXWORKERS)
			
 
				-		return;
			
 
				-	worker_colors[workerid] = other_worker_colors[other_index++];
			
 
				-	if (other_index == OTHER_WORKER_COLORS_NB) other_index = 0;
			
 
				-}
			
 
				-
			
 
				 static void set_next_cpu_worker_color(int workerid)
			
 
				 {
			
 
				 	if (workerid >= STARPU_NMAXWORKERS)
			
@@ -411,36 +392,12 @@ static void set_next_cpu_worker_color(int workerid)
 
				 	if (cpus_index == CPUS_WORKER_COLORS_NB) cpus_index = 0;
			
 
				 }
			
 
				 
			
 
				-static void set_next_cuda_worker_color(int workerid)
			
 
				-{
			
 
				-	if (workerid >= STARPU_NMAXWORKERS)
			
 
				-		return;
			
 
				-	worker_colors[workerid] = cuda_worker_colors[cuda_index++];
			
 
				-	if (cuda_index == CUDA_WORKER_COLORS_NB) cuda_index = 0;
			
 
				-}
			
 
				-
			
 
				-static void set_next_opencl_worker_color(int workerid)
			
 
				-{
			
 
				-	if (workerid >= STARPU_NMAXWORKERS)
			
 
				-		return;
			
 
				-	worker_colors[workerid] = opencl_worker_colors[opencl_index++];
			
 
				-	if (opencl_index == OPENCL_WORKER_COLORS_NB) opencl_index = 0;
			
 
				-}
			
 
				-
			
 
				-static void set_next_mic_worker_color(int workerid)
			
 
				-{
			
 
				-	if (workerid >= STARPU_NMAXWORKERS)
			
 
				-		return;
			
 
				-	worker_colors[workerid] = mic_worker_colors[mic_index++];
			
 
				-	if (mic_index == MIC_WORKER_COLORS_NB) mic_index = 0;
			
 
				-}
			
 
				-
			
 
				-static void set_next_mpi_ms_worker_color(int workerid)
			
 
				+static void set_next_accel_worker_color(int workerid)
			
 
				 {
			
 
				 	if (workerid >= STARPU_NMAXWORKERS)
			
 
				 		return;
			
 
				-	worker_colors[workerid] = mpi_ms_worker_colors[mpi_ms_index++];
			
 
				-	if (mpi_ms_index == MPI_MS_WORKER_COLORS_NB) mpi_ms_index = 0;
			
 
				+	worker_colors[workerid] = accel_worker_colors[accel_index++];
			
 
				+	if (accel_index == ACCEL_WORKER_COLORS_NB) accel_index = 0;
			
 
				 }
			
 
				 
			
 
				 static const char *get_worker_color(int workerid)
			
@@ -1231,56 +1188,24 @@ static void handle_worker_init_start(struct fxt_ev_64 *ev, struct starpu_fxt_opt
 
				 
			
 
				 	new_thread = register_worker_id(prefixTOnodeid(prefix), threadid, workerid, set);
			
 
				 
			
 
				-	char *kindstr = "";
			
 
				+	const char *kindstr;
			
 
				 	struct starpu_perfmodel_arch arch;
			
 
				 	arch.ndevices = 1;
			
 
				 	_STARPU_MALLOC(arch.devices, sizeof(struct starpu_perfmodel_device));
			
 
				 
			
 
				-	switch (ev->param[0])
			
 
				-	{
			
 
				-		case _STARPU_FUT_APPS_KEY:
			
 
				-			set_next_other_worker_color(workerid);
			
 
				-			kindstr = "APPS";
			
 
				-			break;
			
 
				-		case _STARPU_FUT_CPU_KEY:
			
 
				-			set_next_cpu_worker_color(workerid);
			
 
				-			kindstr = "CPU";
			
 
				-			arch.devices[0].type = STARPU_CPU_WORKER;
			
 
				-			arch.devices[0].devid = 0;
			
 
				-			arch.devices[0].ncores = 1;
			
 
				-			break;
			
 
				-		case _STARPU_FUT_CUDA_KEY:
			
 
				-			set_next_cuda_worker_color(workerid);
			
 
				-			kindstr = "CUDA";
			
 
				-			arch.devices[0].type = STARPU_CUDA_WORKER;
			
 
				-			arch.devices[0].devid = devid;
			
 
				-			arch.devices[0].ncores = 1;
			
 
				-			break;
			
 
				-		case _STARPU_FUT_OPENCL_KEY:
			
 
				-			set_next_opencl_worker_color(workerid);
			
 
				-			kindstr = "OPENCL";
			
 
				-			arch.devices[0].type = STARPU_OPENCL_WORKER;
			
 
				-			arch.devices[0].devid = devid;
			
 
				-			arch.devices[0].ncores = 1;
			
 
				-			break;
			
 
				-		case _STARPU_FUT_MIC_KEY:
			
 
				-			set_next_mic_worker_color(workerid);
			
 
				-			kindstr = "mic";
			
 
				-			arch.devices[0].type = STARPU_MIC_WORKER;
			
 
				-			arch.devices[0].devid = devid;
			
 
				-			arch.devices[0].ncores = 1;
			
 
				-			break;
			
 
				-		case _STARPU_FUT_MPI_KEY:
			
 
				-			set_next_mpi_ms_worker_color(workerid);
			
 
				-			kindstr = "mpi_ms";
			
 
				-			arch.devices[0].type = STARPU_MPI_MS_WORKER;
			
 
				-			arch.devices[0].devid = devid;
			
 
				-			arch.devices[0].ncores = 1;
			
 
				-			break;
			
 
				+	enum starpu_worker_archtype archtype = _STARPU_FUT_KEY_WORKER(ev->param[0]);
			
 
				+	STARPU_ASSERT(archtype >= 0 && archtype < STARPU_NARCH);
			
 
				+
			
 
				+	kindstr = starpu_worker_get_type_as_string(archtype);
			
 
				+	arch.devices[0].type = archtype;
			
 
				+	arch.devices[0].devid = 0;
			
 
				+	arch.devices[0].ncores = 1;
			
 
				+
			
 
				+	if (archtype == STARPU_CPU_WORKER)
			
 
				+		set_next_cpu_worker_color(workerid);
			
 
				+	else
			
 
				+		set_next_accel_worker_color(workerid);
			
 
				 
			
 
				-		default:
			
 
				-			STARPU_ABORT();
			
 
				-	}
			
 
				 	double now = get_event_time_stamp(ev, options);
			
 
				 
			
 
				 	if (out_paje_file)
			
@@ -4205,7 +4130,8 @@ void _starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *op
 
				 
			
 
				 	if (out_paje_file && !options->no_bus)
			
 
				 	{
			
 
				-		while (!_starpu_communication_list_empty(&communication_list)) {
			
 
				+		while (!_starpu_communication_list_empty(&communication_list))
			
 
				+		{
			
 
				 			struct _starpu_communication*itor;
			
 
				 			itor = _starpu_communication_list_pop_front(&communication_list);
			
 
				 
			
@@ -4499,7 +4425,7 @@ void _starpu_fxt_number_events_file_init(struct starpu_fxt_options *options)
 
				 			STARPU_ABORT_MSG("Failed to open '%s' (err %s)", options->number_events_path, strerror(errno));
			
 
				 
			
 
				 		/* FUT_SETUP_CODE is the event with the maximal value */
			
 
				-		number_events = calloc(FUT_SETUP_CODE+1, sizeof(uint64_t));
			
 
				+		_STARPU_CALLOC(number_events, FUT_SETUP_CODE+1, sizeof(uint64_t));
			
 
				 	}
			
 
				 	else
			
 
				 		number_events_file = NULL;
			
--- a/src/drivers/cpu/driver_cpu.c
+++ b/src/drivers/cpu/driver_cpu.c
@@ -59,6 +59,27 @@
 
				 #include <windows.h>
			
 
				 #endif
			
 
				 
			
 
				+static struct starpu_driver_info driver_info =
			
 
				+{
			
 
				+	.name_upper = "CPU",
			
 
				+	.name_var = "CPU",
			
 
				+	.name_lower = "cpu",
			
 
				+	.memory_kind = STARPU_CPU_RAM,
			
 
				+	.alpha = 0.5f,
			
 
				+};
			
 
				+
			
 
				+static struct starpu_memory_driver_info memory_driver_info =
			
 
				+{
			
 
				+	.name_upper = "NUMA",
			
 
				+	.worker_archtype = STARPU_CPU_WORKER,
			
 
				+};
			
 
				+
			
 
				+void _starpu_cpu_preinit(void)
			
 
				+{
			
 
				+	starpu_driver_info_register(STARPU_CPU_WORKER, &driver_info);
			
 
				+	starpu_memory_driver_info_register(STARPU_CPU_RAM, &memory_driver_info);
			
 
				+}
			
 
				+
			
 
				 
			
 
				 #ifdef STARPU_USE_CPU
			
 
				 /* Actually launch the job on a cpu worker.
			
@@ -189,7 +210,7 @@ int _starpu_cpu_driver_init(struct _starpu_worker *cpu_worker)
 
				 {
			
 
				 	int devid = cpu_worker->devid;
			
 
				 
			
 
				-	_starpu_driver_start(cpu_worker, _STARPU_FUT_CPU_KEY, 1);
			
 
				+	_starpu_driver_start(cpu_worker, STARPU_CPU_WORKER, 1);
			
 
				 	snprintf(cpu_worker->name, sizeof(cpu_worker->name), "CPU %d", devid);
			
 
				 	snprintf(cpu_worker->short_name, sizeof(cpu_worker->short_name), "CPU %d", devid);
			
 
				 	starpu_pthread_setname(cpu_worker->short_name);
			
@@ -360,7 +381,7 @@ int _starpu_cpu_driver_run_once(struct _starpu_worker *cpu_worker)
 
				 	 * job. */
			
 
				 
			
 
				 	/* can a cpu perform that task ? */
			
 
				-	if (!_STARPU_CPU_MAY_PERFORM(j))
			
 
				+	if (!_STARPU_MAY_PERFORM(j, CPU))
			
 
				 	{
			
 
				 		/* put it and the end of the queue ... XXX */
			
 
				 		_starpu_push_task_to_workers(task);
			
@@ -416,7 +437,7 @@ int _starpu_cpu_driver_deinit(struct _starpu_worker *cpu_worker)
 
				 	_starpu_free_all_automatically_allocated_buffers(memnode);
			
 
				 
			
 
				 	cpu_worker->worker_is_initialized = 0;
			
 
				-	_STARPU_TRACE_WORKER_DEINIT_END(_STARPU_FUT_CPU_KEY);
			
 
				+	_STARPU_TRACE_WORKER_DEINIT_END(STARPU_CPU_WORKER);
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
--- a/src/drivers/cpu/driver_cpu.h
+++ b/src/drivers/cpu/driver_cpu.h
@@ -22,6 +22,8 @@
 
				 #include <common/config.h>
			
 
				 #include <datawizard/node_ops.h>
			
 
				 
			
 
				+void _starpu_cpu_preinit(void);
			
 
				+
			
 
				 extern struct _starpu_driver_ops _starpu_driver_cpu_ops;
			
 
				 extern struct _starpu_node_ops _starpu_driver_cpu_node_ops;
			
 
				 
			
--- a/src/drivers/cuda/driver_cuda.c
+++ b/src/drivers/cuda/driver_cuda.c
@@ -110,7 +110,7 @@ _starpu_cuda_discover_devices (struct _starpu_machine_config *config)
 
				 	/* Discover the number of CUDA devices. Fill the result in CONFIG. */
			
 
				 
			
 
				 #ifdef STARPU_SIMGRID
			
 
				-	config->topology.nhwcudagpus = _starpu_simgrid_get_nbhosts("CUDA");
			
 
				+	config->topology.nhwdevices[STARPU_CUDA_WORKER] = _starpu_simgrid_get_nbhosts("CUDA");
			
 
				 #else
			
 
				 	int cnt;
			
 
				 	cudaError_t cures;
			
@@ -118,7 +118,7 @@ _starpu_cuda_discover_devices (struct _starpu_machine_config *config)
 
				 	cures = cudaGetDeviceCount (&cnt);
			
 
				 	if (STARPU_UNLIKELY(cures != cudaSuccess))
			
 
				 		cnt = 0;
			
 
				-	config->topology.nhwcudagpus = cnt;
			
 
				+	config->topology.nhwdevices[STARPU_CUDA_WORKER] = cnt;
			
 
				 #ifdef HAVE_LIBNVIDIA_ML
			
 
				 	nvmlInit();
			
 
				 #endif
			
@@ -684,12 +684,12 @@ int _starpu_cuda_driver_init(struct _starpu_worker_set *worker_set)
 
				 	int lastdevid = -1;
			
 
				 	unsigned i;
			
 
				 
			
 
				-	_starpu_driver_start(worker0, _STARPU_FUT_CUDA_KEY, 0);
			
 
				+	_starpu_driver_start(worker0, STARPU_CUDA_WORKER, 0);
			
 
				 	_starpu_set_local_worker_set_key(worker_set);
			
 
				 
			
 
				 #ifdef STARPU_USE_FXT
			
 
				 	for (i = 1; i < worker_set->nworkers; i++)
			
 
				-		_starpu_worker_start(&worker_set->workers[i], _STARPU_FUT_CUDA_KEY, 0);
			
 
				+		_starpu_worker_start(&worker_set->workers[i], STARPU_CUDA_WORKER, 0);
			
 
				 #endif
			
 
				 
			
 
				 	for (i = 0; i < worker_set->nworkers; i++)
			
@@ -710,7 +710,7 @@ int _starpu_cuda_driver_init(struct _starpu_worker_set *worker_set)
 
				 		init_device_context(devid, memnode);
			
 
				 
			
 
				 #ifndef STARPU_SIMGRID
			
 
				-		if (worker->config->topology.nworkerpercuda > 1 && props[devid].concurrentKernels == 0)
			
 
				+		if (worker->config->topology.nworker[STARPU_CUDA_WORKER][devid] > 1 && props[devid].concurrentKernels == 0)
			
 
				 			_STARPU_DISP("Warning: STARPU_NWORKER_PER_CUDA is %u, but CUDA device %u does not support concurrent kernel execution!\n", worker_set->nworkers, devid);
			
 
				 #endif /* !STARPU_SIMGRID */
			
 
				 	}
			
@@ -723,7 +723,7 @@ int _starpu_cuda_driver_init(struct _starpu_worker_set *worker_set)
 
				 		struct _starpu_worker *worker = &worker_set->workers[i];
			
 
				 		unsigned devid = worker->devid;
			
 
				 		unsigned workerid = worker->workerid;
			
 
				-		unsigned subdev = i % _starpu_get_machine_config()->topology.nworkerpercuda;
			
 
				+		unsigned subdev = i % _starpu_get_machine_config()->topology.nworker[STARPU_CUDA_WORKER][devid];
			
 
				 
			
 
				 		float size = (float) global_mem[devid] / (1<<30);
			
 
				 #ifdef STARPU_SIMGRID
			
@@ -968,7 +968,7 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 
				 		j = _starpu_get_job_associated_to_task(task);
			
 
				 
			
 
				 		/* can CUDA do that task ? */
			
 
				-		if (!_STARPU_CUDA_MAY_PERFORM(j))
			
 
				+		if (!_STARPU_MAY_PERFORM(j, CUDA))
			
 
				 		{
			
 
				 			/* this is neither a cuda or a cublas task */
			
 
				 			_starpu_worker_refuse_task(worker, task);
			
@@ -1067,7 +1067,7 @@ int _starpu_cuda_driver_deinit(struct _starpu_worker_set *worker_set)
 
				 	}
			
 
				 
			
 
				 	worker_set->workers[0].worker_is_initialized = 0;
			
 
				-	_STARPU_TRACE_WORKER_DEINIT_END(_STARPU_FUT_CUDA_KEY);
			
 
				+	_STARPU_TRACE_WORKER_DEINIT_END(STARPU_CUDA_WORKER);
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
@@ -1832,37 +1832,17 @@ struct _starpu_driver_ops _starpu_driver_cuda_ops =
 
				 #ifdef STARPU_SIMGRID
			
 
				 struct _starpu_node_ops _starpu_driver_cuda_node_ops =
			
 
				 {
			
 
				-	.copy_interface_to[STARPU_UNUSED] = NULL,
			
 
				 	.copy_interface_to[STARPU_CPU_RAM] = NULL,
			
 
				 	.copy_interface_to[STARPU_CUDA_RAM] = NULL,
			
 
				-	.copy_interface_to[STARPU_OPENCL_RAM] = NULL,
			
 
				-	.copy_interface_to[STARPU_DISK_RAM] = NULL,
			
 
				-	.copy_interface_to[STARPU_MIC_RAM] = NULL,
			
 
				-	.copy_interface_to[STARPU_MPI_MS_RAM] = NULL,
			
 
				 
			
 
				-	.copy_data_to[STARPU_UNUSED] = NULL,
			
 
				 	.copy_data_to[STARPU_CPU_RAM] = NULL,
			
 
				 	.copy_data_to[STARPU_CUDA_RAM] = NULL,
			
 
				-	.copy_data_to[STARPU_OPENCL_RAM] = NULL,
			
 
				-	.copy_data_to[STARPU_DISK_RAM] = NULL,
			
 
				-	.copy_data_to[STARPU_MIC_RAM] = NULL,
			
 
				-	.copy_data_to[STARPU_MPI_MS_RAM] = NULL,
			
 
				 
			
 
				-	.copy2d_data_to[STARPU_UNUSED] = NULL,
			
 
				 	.copy2d_data_to[STARPU_CPU_RAM] = NULL,
			
 
				 	.copy2d_data_to[STARPU_CUDA_RAM] = NULL,
			
 
				-	.copy2d_data_to[STARPU_OPENCL_RAM] = NULL,
			
 
				-	.copy2d_data_to[STARPU_DISK_RAM] = NULL,
			
 
				-	.copy2d_data_to[STARPU_MIC_RAM] = NULL,
			
 
				-	.copy2d_data_to[STARPU_MPI_MS_RAM] = NULL,
			
 
				 
			
 
				-	.copy3d_data_to[STARPU_UNUSED] = NULL,
			
 
				 	.copy3d_data_to[STARPU_CPU_RAM] = NULL,
			
 
				 	.copy3d_data_to[STARPU_CUDA_RAM] = NULL,
			
 
				-	.copy3d_data_to[STARPU_OPENCL_RAM] = NULL,
			
 
				-	.copy3d_data_to[STARPU_DISK_RAM] = NULL,
			
 
				-	.copy3d_data_to[STARPU_MIC_RAM] = NULL,
			
 
				-	.copy3d_data_to[STARPU_MPI_MS_RAM] = NULL,
			
 
				 
			
 
				 	.wait_request_completion = NULL,
			
 
				 	.test_request_completion = NULL,
			
@@ -1874,31 +1854,15 @@ struct _starpu_node_ops _starpu_driver_cuda_node_ops =
 
				 #else
			
 
				 struct _starpu_node_ops _starpu_driver_cuda_node_ops =
			
 
				 {
			
 
				-	.copy_interface_to[STARPU_UNUSED] = NULL,
			
 
				 	.copy_interface_to[STARPU_CPU_RAM] = _starpu_cuda_copy_interface_from_cuda_to_cpu,
			
 
				 	.copy_interface_to[STARPU_CUDA_RAM] = _starpu_cuda_copy_interface_from_cuda_to_cuda,
			
 
				-	.copy_interface_to[STARPU_OPENCL_RAM] = NULL,
			
 
				-	.copy_interface_to[STARPU_DISK_RAM] = NULL,
			
 
				-	.copy_interface_to[STARPU_MIC_RAM] = NULL,
			
 
				-	.copy_interface_to[STARPU_MPI_MS_RAM] = NULL,
			
 
				 
			
 
				-	.copy_data_to[STARPU_UNUSED] = NULL,
			
 
				 	.copy_data_to[STARPU_CPU_RAM] = _starpu_cuda_copy_data_from_cuda_to_cpu,
			
 
				 	.copy_data_to[STARPU_CUDA_RAM] = _starpu_cuda_copy_data_from_cuda_to_cuda,
			
 
				-	.copy_data_to[STARPU_OPENCL_RAM] = NULL,
			
 
				-	.copy_data_to[STARPU_DISK_RAM] = NULL,
			
 
				-	.copy_data_to[STARPU_MIC_RAM] = NULL,
			
 
				-	.copy_data_to[STARPU_MPI_MS_RAM] = NULL,
			
 
				 
			
 
				-	.copy2d_data_to[STARPU_UNUSED] = NULL,
			
 
				 	.copy2d_data_to[STARPU_CPU_RAM] = _starpu_cuda_copy2d_data_from_cuda_to_cpu,
			
 
				 	.copy2d_data_to[STARPU_CUDA_RAM] = _starpu_cuda_copy2d_data_from_cuda_to_cuda,
			
 
				-	.copy2d_data_to[STARPU_OPENCL_RAM] = NULL,
			
 
				-	.copy2d_data_to[STARPU_DISK_RAM] = NULL,
			
 
				-	.copy2d_data_to[STARPU_MIC_RAM] = NULL,
			
 
				-	.copy2d_data_to[STARPU_MPI_MS_RAM] = NULL,
			
 
				 
			
 
				-	.copy3d_data_to[STARPU_UNUSED] = NULL,
			
 
				 #if 0
			
 
				 	.copy3d_data_to[STARPU_CPU_RAM] = _starpu_cuda_copy3d_data_from_cuda_to_cpu,
			
 
				 	.copy3d_data_to[STARPU_CUDA_RAM] = _starpu_cuda_copy3d_data_from_cuda_to_cuda,
			
@@ -1906,10 +1870,6 @@ struct _starpu_node_ops _starpu_driver_cuda_node_ops =
 
				 	.copy3d_data_to[STARPU_CPU_RAM] = NULL,
			
 
				 	.copy3d_data_to[STARPU_CUDA_RAM] = NULL,
			
 
				 #endif
			
 
				-	.copy3d_data_to[STARPU_OPENCL_RAM] = NULL,
			
 
				-	.copy3d_data_to[STARPU_DISK_RAM] = NULL,
			
 
				-	.copy3d_data_to[STARPU_MIC_RAM] = NULL,
			
 
				-	.copy3d_data_to[STARPU_MPI_MS_RAM] = NULL,
			
 
				 
			
 
				 	.wait_request_completion = _starpu_cuda_wait_request_completion,
			
 
				 	.test_request_completion = _starpu_cuda_test_request_completion,
			
--- a/src/drivers/cuda/driver_cuda.h
+++ b/src/drivers/cuda/driver_cuda.h
@@ -22,6 +22,8 @@
 
				 
			
 
				 #include <common/config.h>
			
 
				 
			
 
				+void _starpu_cuda_preinit(void);
			
 
				+
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 #include <cuda.h>
			
 
				 #include <cuda_runtime_api.h>
			
--- a/src/drivers/cuda/driver_cuda_init.c
+++ b/src/drivers/cuda/driver_cuda_init.c
@@ -0,0 +1,39 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <core/workers.h>
			
 
				+#include <drivers/cuda/driver_cuda.h>
			
 
				+
			
 
				+static struct starpu_driver_info driver_info =
			
 
				+{
			
 
				+	.name_upper = "CUDA",
			
 
				+	.name_var = "CUDA",
			
 
				+	.name_lower = "cuda",
			
 
				+	.memory_kind = STARPU_CUDA_RAM,
			
 
				+	.alpha = 13.33f,
			
 
				+};
			
 
				+
			
 
				+static struct starpu_memory_driver_info memory_driver_info =
			
 
				+{
			
 
				+	.name_upper = "CUDA",
			
 
				+	.worker_archtype = STARPU_CUDA_WORKER,
			
 
				+};
			
 
				+
			
 
				+void _starpu_cuda_preinit(void)
			
 
				+{
			
 
				+	starpu_driver_info_register(STARPU_CUDA_WORKER, &driver_info);
			
 
				+	starpu_memory_driver_info_register(STARPU_CUDA_RAM, &memory_driver_info);
			
 
				+}
			
--- a/src/drivers/cuda/starpu_cublas.c
+++ b/src/drivers/cuda/starpu_cublas.c
@@ -100,9 +100,11 @@ void starpu_cublas_shutdown(void)
 
				 void starpu_cublas_set_stream(void)
			
 
				 {
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				+	unsigned workerid = starpu_worker_get_id_check();
			
 
				+	int devid = starpu_worker_get_devid(workerid);
			
 
				 	if (!_starpu_get_machine_config()->topology.cuda_th_per_dev ||
			
 
				 		(!_starpu_get_machine_config()->topology.cuda_th_per_stream &&
			
 
				-		 _starpu_get_machine_config()->topology.nworkerpercuda > 1))
			
 
				+		 _starpu_get_machine_config()->topology.nworker[STARPU_CUDA_WORKER][devid] > 1))
			
 
				 		cublasSetKernelStream(starpu_cuda_get_local_stream());
			
 
				 #endif
			
 
				 }
			
--- a/src/drivers/disk/driver_disk.c
+++ b/src/drivers/disk/driver_disk.c
@@ -23,6 +23,17 @@
 
				 #include <datawizard/coherency.h>
			
 
				 #include <datawizard/memory_nodes.h>
			
 
				 
			
 
				+static struct starpu_memory_driver_info memory_driver_info =
			
 
				+{
			
 
				+	.name_upper = "Disk",
			
 
				+	.worker_archtype = (enum starpu_worker_archtype) -1,
			
 
				+};
			
 
				+
			
 
				+void _starpu_disk_preinit(void)
			
 
				+{
			
 
				+	starpu_memory_driver_info_register(STARPU_DISK_RAM, &memory_driver_info);
			
 
				+}
			
 
				+
			
 
				 int _starpu_disk_copy_src_to_disk(void * src, unsigned src_node, void * dst, size_t dst_offset, unsigned dst_node, size_t size, void * async_channel)
			
 
				 {
			
 
				 	STARPU_ASSERT(starpu_node_get_kind(src_node) == STARPU_CPU_RAM);
			
@@ -251,21 +262,11 @@ void _starpu_disk_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, i
 
				 
			
 
				 struct _starpu_node_ops _starpu_driver_disk_node_ops =
			
 
				 {
			
 
				-	.copy_interface_to[STARPU_UNUSED] = NULL,
			
 
				 	.copy_interface_to[STARPU_CPU_RAM] = _starpu_disk_copy_interface_from_disk_to_cpu,
			
 
				-	.copy_interface_to[STARPU_CUDA_RAM] = NULL,
			
 
				-	.copy_interface_to[STARPU_OPENCL_RAM] = NULL,
			
 
				 	.copy_interface_to[STARPU_DISK_RAM] = _starpu_disk_copy_interface_from_disk_to_disk,
			
 
				-	.copy_interface_to[STARPU_MIC_RAM] = NULL,
			
 
				-	.copy_interface_to[STARPU_MPI_MS_RAM] = NULL,
			
 
				 
			
 
				-	.copy_data_to[STARPU_UNUSED] = NULL,
			
 
				 	.copy_data_to[STARPU_CPU_RAM] = _starpu_disk_copy_data_from_disk_to_cpu,
			
 
				-	.copy_data_to[STARPU_CUDA_RAM] = NULL,
			
 
				-	.copy_data_to[STARPU_OPENCL_RAM] = NULL,
			
 
				 	.copy_data_to[STARPU_DISK_RAM] = _starpu_disk_copy_data_from_disk_to_disk,
			
 
				-	.copy_data_to[STARPU_MIC_RAM] = NULL,
			
 
				-	.copy_data_to[STARPU_MPI_MS_RAM] = NULL,
			
 
				 
			
 
				 	/* TODO: copy2D/3D? */
			
 
				 
			
--- a/src/drivers/disk/driver_disk.h
+++ b/src/drivers/disk/driver_disk.h
@@ -22,6 +22,8 @@
 
				 
			
 
				 #include <datawizard/node_ops.h>
			
 
				 
			
 
				+void _starpu_disk_preinit(void);
			
 
				+
			
 
				 int _starpu_disk_copy_src_to_disk(void * src, unsigned src_node, void * dst, size_t dst_offset, unsigned dst_node, size_t size, void * async_channel);
			
 
				 
			
 
				 int _starpu_disk_copy_disk_to_src(void * src, size_t src_offset, unsigned src_node, void * dst, unsigned dst_node, size_t size, void * async_channel);
			
--- a/src/drivers/driver_common/driver_common.c
+++ b/src/drivers/driver_common/driver_common.c
@@ -276,7 +276,7 @@ void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_wo
 
				 				do_update_time_model = 0;
			
 
				 			if (do_update_time_model)
			
 
				 			{
			
 
				-				_starpu_update_perfmodel_history(j, j->task->cl->model, perf_arch, worker->devid, time_consumed, j->nimpl);
			
 
				+				_starpu_update_perfmodel_history(j, j->task->cl->model, perf_arch, worker->devid, time_consumed, j->nimpl, 1);
			
 
				 			}
			
 
				 		}
			
 
				 	}
			
@@ -312,7 +312,7 @@ void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_wo
 
				 			do_update_energy_model = 0;
			
 
				 		if (do_update_energy_model)
			
 
				 		{
			
 
				-			_starpu_update_perfmodel_history(j, j->task->cl->energy_model, perf_arch, worker->devid, energy_consumed, j->nimpl);
			
 
				+			_starpu_update_perfmodel_history(j, j->task->cl->energy_model, perf_arch, worker->devid, energy_consumed, j->nimpl, 1);
			
 
				 		}
			
 
				 	}
			
 
				 }
			
--- a/src/starpu_parameters.h
+++ b/src/starpu_parameters.h
@@ -1,7 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2011-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
			
 
				- * Copyright (C) 2013       Thibaut Lambert
			
 
				+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -15,19 +14,26 @@
 
				  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				  */
			
 
				 
			
 
				-#ifndef _STARPU_PARAMETERS_H
			
 
				-#define _STARPU_PARAMETERS_H
			
 
				+#include <core/workers.h>
			
 
				+#include <drivers/mic/driver_mic_source.h>
			
 
				 
			
 
				-/** @file */
			
 
				+static struct starpu_driver_info driver_info =
			
 
				+{
			
 
				+	.name_upper = "MIC",
			
 
				+	.name_var = "MIC",
			
 
				+	.name_lower = "mic",
			
 
				+	.memory_kind = STARPU_MIC_RAM,
			
 
				+	.alpha = 0.5f,
			
 
				+};
			
 
				 
			
 
				-/* Parameters which are not worth being added to ./configure options, but
			
 
				- * still interesting to easily change */
			
 
				+static struct starpu_memory_driver_info memory_driver_info =
			
 
				+{
			
 
				+	.name_upper = "MIC",
			
 
				+	.worker_archtype = STARPU_MIC_WORKER,
			
 
				+};
			
 
				 
			
 
				-/* Assumed relative performance ratios */
			
 
				-/* TODO: benchmark a bit instead */
			
 
				-#define _STARPU_CPU_ALPHA	1.0f
			
 
				-#define _STARPU_CUDA_ALPHA	13.33f
			
 
				-#define _STARPU_OPENCL_ALPHA	12.22f
			
 
				-#define _STARPU_MIC_ALPHA	0.5f
			
 
				-#define _STARPU_MPI_MS_ALPHA	1.0f
			
 
				-#endif /* _STARPU_PARAMETERS_H */
			
 
				+void _starpu_mic_preinit(void)
			
 
				+{
			
 
				+	starpu_driver_info_register(STARPU_MIC_WORKER, &driver_info);
			
 
				+	starpu_memory_driver_info_register(STARPU_MIC_RAM, &memory_driver_info);
			
 
				+}
			
--- a/src/drivers/mic/driver_mic_source.c
+++ b/src/drivers/mic/driver_mic_source.c
@@ -241,7 +241,7 @@ unsigned starpu_mic_device_get_count(void)
 
				     struct _starpu_machine_config *config = _starpu_get_machine_config ();
			
 
				     struct _starpu_machine_topology *topology = &config->topology;
			
 
				 
			
 
				-    return topology->nmicdevices;
			
 
				+    return topology->ndevices[STARPU_MIC_WORKER];
			
 
				 }
			
 
				 
			
 
				 starpu_mic_kernel_t _starpu_mic_src_get_kernel_from_codelet(struct starpu_codelet *cl, unsigned nimpl)
			
@@ -521,16 +521,16 @@ void *_starpu_mic_src_worker(void *arg)
 
				 
			
 
				 	/* unsigned memnode = baseworker->memory_node; */
			
 
				 
			
 
				-	_starpu_driver_start(baseworker, _STARPU_FUT_MIC_KEY, 0);
			
 
				+	_starpu_driver_start(baseworker, STARPU_MIC_WORKER, 0);
			
 
				 #ifdef STARPU_USE_FXT
			
 
				 	for (i = 1; i < worker_set->nworkers; i++)
			
 
				-		_starpu_worker_start(&worker_set->workers[i], _STARPU_FUT_MIC_KEY, 0);
			
 
				+		_starpu_worker_start(&worker_set->workers[i], STARPU_FUT_WORKER, 0);
			
 
				 #endif
			
 
				 
			
 
				 	// Current task for a thread managing a worker set has no sense.
			
 
				 	_starpu_set_current_task(NULL);
			
 
				 
			
 
				-	for (i = 0; i < config->topology.nmiccores[devid]; i++)
			
 
				+	for (i = 0; i < config->topology.nworker[STARPU_MIC_WORKER][devid]; i++)
			
 
				 	{
			
 
				 		struct _starpu_worker *worker = &config->workers[baseworkerid+i];
			
 
				 		snprintf(worker->name, sizeof(worker->name), "MIC %u core %u", devid, i);
			
@@ -686,21 +686,11 @@ void _starpu_mic_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, in
 
				 /* TODO: MIC -> MIC */
			
 
				 struct _starpu_node_ops _starpu_driver_mic_node_ops =
			
 
				 {
			
 
				-	.copy_interface_to[STARPU_UNUSED] = NULL,
			
 
				 	.copy_interface_to[STARPU_CPU_RAM] = _starpu_mic_copy_interface_from_mic_to_cpu,
			
 
				-	.copy_interface_to[STARPU_CUDA_RAM] = NULL,
			
 
				-	.copy_interface_to[STARPU_OPENCL_RAM] = NULL,
			
 
				-	.copy_interface_to[STARPU_DISK_RAM] = NULL,
			
 
				 	.copy_interface_to[STARPU_MIC_RAM] = NULL,
			
 
				-	.copy_interface_to[STARPU_MPI_MS_RAM] = NULL,
			
 
				 
			
 
				-	.copy_data_to[STARPU_UNUSED] = NULL,
			
 
				 	.copy_data_to[STARPU_CPU_RAM] = _starpu_mic_copy_data_from_mic_to_cpu,
			
 
				-	.copy_data_to[STARPU_CUDA_RAM] = NULL,
			
 
				-	.copy_data_to[STARPU_OPENCL_RAM] = NULL,
			
 
				-	.copy_data_to[STARPU_DISK_RAM] = NULL,
			
 
				 	.copy_data_to[STARPU_MIC_RAM] = NULL,
			
 
				-	.copy_data_to[STARPU_MPI_MS_RAM] = NULL,
			
 
				 
			
 
				 	/* TODO: copy2D/3D? */
			
 
				 
			
--- a/src/drivers/mic/driver_mic_source.h
+++ b/src/drivers/mic/driver_mic_source.h
@@ -23,6 +23,8 @@
 
				 #include <starpu_mic.h>
			
 
				 #include <common/config.h>
			
 
				 
			
 
				+void _starpu_mic_preinit(void);
			
 
				+
			
 
				 #ifdef STARPU_USE_MIC
			
 
				 
			
 
				 #include <source/COIProcess_source.h>
			
--- a/src/drivers/mpi/driver_mpi_common.c
+++ b/src/drivers/mpi/driver_mpi_common.c
@@ -127,7 +127,7 @@ void _starpu_mpi_common_mp_initialize_src_sink(struct _starpu_mp_node *node)
 
				 {
			
 
				         struct _starpu_machine_topology *topology = &_starpu_get_machine_config()->topology;
			
 
				 
			
 
				-        node->nb_cores = topology->nhwcpus;
			
 
				+        node->nb_cores = topology->nhwworker[STARPU_CPU_WORKER][0];
			
 
				 }
			
 
				 
			
 
				 int _starpu_mpi_common_recv_is_ready(const struct _starpu_mp_node *mp_node)
			
--- a/src/drivers/mpi/driver_mpi_init.c
+++ b/src/drivers/mpi/driver_mpi_init.c
@@ -0,0 +1,39 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <core/workers.h>
			
 
				+#include <drivers/mpi/driver_mpi_source.h>
			
 
				+
			
 
				+static struct starpu_driver_info driver_info =
			
 
				+{
			
 
				+	.name_upper = "MPI_MS",
			
 
				+	.name_var = "MPI_MS",
			
 
				+	.name_lower = "mpi_ms",
			
 
				+	.memory_kind = STARPU_MPI_MS_RAM,
			
 
				+	.alpha = 1.0f,
			
 
				+};
			
 
				+
			
 
				+static struct starpu_memory_driver_info memory_driver_info =
			
 
				+{
			
 
				+	.name_upper = "MPI_MS",
			
 
				+	.worker_archtype = STARPU_MPI_MS_WORKER,
			
 
				+};
			
 
				+
			
 
				+void _starpu_mpi_ms_preinit(void)
			
 
				+{
			
 
				+	starpu_driver_info_register(STARPU_MPI_MS_WORKER, &driver_info);
			
 
				+	starpu_memory_driver_info_register(STARPU_MPI_MS_RAM, &memory_driver_info);
			
 
				+}
			
--- a/src/drivers/mpi/driver_mpi_source.c
+++ b/src/drivers/mpi/driver_mpi_source.c
@@ -310,17 +310,17 @@ void *_starpu_mpi_src_worker(void *arg)
 
				 
			
 
				                 /* unsigned memnode = baseworker->memory_node; */
			
 
				 
			
 
				-                _starpu_driver_start(baseworker, _STARPU_FUT_MPI_KEY, 0);
			
 
				+                _starpu_driver_start(baseworker, STARPU_CPU_WORKER, 0);
			
 
				 
			
 
				 #ifdef STARPU_USE_FXT
			
 
				                 for (i = 1; i < worker_set->nworkers; i++)
			
 
				-                        _starpu_worker_start(&worker_set->workers[i], _STARPU_FUT_MPI_KEY, 0);
			
 
				+                        _starpu_worker_start(&worker_set->workers[i], STARPU_MPI_WORKER, 0);
			
 
				 #endif
			
 
				 
			
 
				                 // Current task for a thread managing a worker set has no sense.
			
 
				                 _starpu_set_current_task(NULL);
			
 
				 
			
 
				-                for (i = 0; i < config->topology.nmpicores[devid]; i++)
			
 
				+                for (i = 0; i < config->topology.nworker[STARPU_MPI_MS_WORKER][devid]; i++)
			
 
				                 {
			
 
				                         struct _starpu_worker *worker = &config->workers[baseworkerid+i];
			
 
				                         snprintf(worker->name, sizeof(worker->name), "MPI_MS %u core %u", devid, i);
			
@@ -546,20 +546,10 @@ void _starpu_mpi_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, in
 
				 
			
 
				 struct _starpu_node_ops _starpu_driver_mpi_node_ops =
			
 
				 {
			
 
				-	.copy_interface_to[STARPU_UNUSED] = NULL,
			
 
				 	.copy_interface_to[STARPU_CPU_RAM] = _starpu_mpi_copy_interface_from_mpi_to_cpu,
			
 
				-	.copy_interface_to[STARPU_CUDA_RAM] = NULL,
			
 
				-	.copy_interface_to[STARPU_OPENCL_RAM] = NULL,
			
 
				-	.copy_interface_to[STARPU_DISK_RAM] = NULL,
			
 
				-	.copy_interface_to[STARPU_MIC_RAM] = NULL,
			
 
				 	.copy_interface_to[STARPU_MPI_MS_RAM] = _starpu_mpi_copy_interface_from_mpi_to_mpi,
			
 
				 
			
 
				-	.copy_data_to[STARPU_UNUSED] = NULL,
			
 
				 	.copy_data_to[STARPU_CPU_RAM] = _starpu_mpi_copy_data_from_mpi_to_cpu,
			
 
				-	.copy_data_to[STARPU_CUDA_RAM] = NULL,
			
 
				-	.copy_data_to[STARPU_OPENCL_RAM] = NULL,
			
 
				-	.copy_data_to[STARPU_DISK_RAM] = NULL,
			
 
				-	.copy_data_to[STARPU_MIC_RAM] = NULL,
			
 
				 	.copy_data_to[STARPU_MPI_MS_RAM] = _starpu_mpi_copy_data_from_mpi_to_mpi,
			
 
				 
			
 
				 	/* TODO: copy2D/3D? */
			
--- a/src/drivers/mpi/driver_mpi_source.h
+++ b/src/drivers/mpi/driver_mpi_source.h
@@ -23,8 +23,9 @@
 
				 #include <starpu_mpi_ms.h>
			
 
				 #include <datawizard/node_ops.h>
			
 
				 
			
 
				-#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				+void _starpu_mpi_ms_preinit(void);
			
 
				 
			
 
				+#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				 extern struct _starpu_node_ops _starpu_driver_mpi_node_ops;
			
 
				 
			
 
				 /** Array of structures containing all the informations useful to send
			
--- a/src/drivers/opencl/driver_opencl.c
+++ b/src/drivers/opencl/driver_opencl.c
@@ -71,7 +71,7 @@ _starpu_opencl_discover_devices(struct _starpu_machine_config *config)
 
				 	/* As OpenCL must have been initialized before calling this function,
			
 
				 	 * `nb_device' is ensured to be correctly set. */
			
 
				 	STARPU_ASSERT(init_done == 1);
			
 
				-	config->topology.nhwopenclgpus = nb_devices;
			
 
				+	config->topology.nhwdevices[STARPU_OPENCL_WORKER] = nb_devices;
			
 
				 }
			
 
				 
			
 
				 static void _starpu_opencl_limit_gpu_mem_if_needed(unsigned devid)
			
@@ -623,7 +623,7 @@ int _starpu_opencl_driver_init(struct _starpu_worker *worker)
 
				 {
			
 
				 	int devid = worker->devid;
			
 
				 
			
 
				-	_starpu_driver_start(worker, _STARPU_FUT_OPENCL_KEY, 0);
			
 
				+	_starpu_driver_start(worker, STARPU_OPENCL_WORKER, 0);
			
 
				 
			
 
				 	_starpu_opencl_init_context(devid);
			
 
				 
			
@@ -817,7 +817,7 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *worker)
 
				 		worker->current_task = task;
			
 
				 
			
 
				 	/* can OpenCL do that task ? */
			
 
				-	if (!_STARPU_OPENCL_MAY_PERFORM(j))
			
 
				+	if (!_STARPU_MAY_PERFORM(j, OPENCL))
			
 
				 	{
			
 
				 		/* this is not a OpenCL task */
			
 
				 		_starpu_worker_refuse_task(worker, task);
			
@@ -853,7 +853,7 @@ int _starpu_opencl_driver_deinit(struct _starpu_worker *worker)
 
				         _starpu_opencl_deinit_context(devid);
			
 
				 
			
 
				 	worker->worker_is_initialized = 0;
			
 
				-	_STARPU_TRACE_WORKER_DEINIT_END(_STARPU_FUT_OPENCL_KEY);
			
 
				+	_STARPU_TRACE_WORKER_DEINIT_END(STARPU_OPENCL_WORKER);
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
@@ -1356,21 +1356,11 @@ int _starpu_opencl_is_direct_access_supported(unsigned node, unsigned handling_n
 
				 #ifdef STARPU_SIMGRID
			
 
				 struct _starpu_node_ops _starpu_driver_opencl_node_ops =
			
 
				 {
			
 
				-	.copy_interface_to[STARPU_UNUSED] = NULL,
			
 
				 	.copy_interface_to[STARPU_CPU_RAM] = NULL,
			
 
				-	.copy_interface_to[STARPU_CUDA_RAM] = NULL,
			
 
				 	.copy_interface_to[STARPU_OPENCL_RAM] = NULL,
			
 
				-	.copy_interface_to[STARPU_DISK_RAM] = NULL,
			
 
				-	.copy_interface_to[STARPU_MIC_RAM] = NULL,
			
 
				-	.copy_interface_to[STARPU_MPI_MS_RAM] = NULL,
			
 
				 
			
 
				-	.copy_data_to[STARPU_UNUSED] = NULL,
			
 
				 	.copy_data_to[STARPU_CPU_RAM] = NULL,
			
 
				-	.copy_data_to[STARPU_CUDA_RAM] = NULL,
			
 
				 	.copy_data_to[STARPU_OPENCL_RAM] = NULL,
			
 
				-	.copy_data_to[STARPU_DISK_RAM] = NULL,
			
 
				-	.copy_data_to[STARPU_MIC_RAM] = NULL,
			
 
				-	.copy_data_to[STARPU_MPI_MS_RAM] = NULL,
			
 
				 
			
 
				 	.wait_request_completion = NULL,
			
 
				 	.test_request_completion = NULL,
			
@@ -1382,21 +1372,11 @@ struct _starpu_node_ops _starpu_driver_opencl_node_ops =
 
				 #else
			
 
				 struct _starpu_node_ops _starpu_driver_opencl_node_ops =
			
 
				 {
			
 
				-	.copy_interface_to[STARPU_UNUSED] = NULL,
			
 
				 	.copy_interface_to[STARPU_CPU_RAM] = _starpu_opencl_copy_interface_from_opencl_to_cpu,
			
 
				-	.copy_interface_to[STARPU_CUDA_RAM] = NULL,
			
 
				 	.copy_interface_to[STARPU_OPENCL_RAM] = _starpu_opencl_copy_interface_from_opencl_to_opencl,
			
 
				-	.copy_interface_to[STARPU_DISK_RAM] = NULL,
			
 
				-	.copy_interface_to[STARPU_MIC_RAM] = NULL,
			
 
				-	.copy_interface_to[STARPU_MPI_MS_RAM] = NULL,
			
 
				 
			
 
				-	.copy_data_to[STARPU_UNUSED] = NULL,
			
 
				 	.copy_data_to[STARPU_CPU_RAM] = _starpu_opencl_copy_data_from_opencl_to_cpu,
			
 
				-	.copy_data_to[STARPU_CUDA_RAM] = NULL,
			
 
				 	.copy_data_to[STARPU_OPENCL_RAM] = _starpu_opencl_copy_data_from_opencl_to_opencl,
			
 
				-	.copy_data_to[STARPU_DISK_RAM] = NULL,
			
 
				-	.copy_data_to[STARPU_MIC_RAM] = NULL,
			
 
				-	.copy_data_to[STARPU_MPI_MS_RAM] = NULL,
			
 
				 
			
 
				 	/* TODO: copy2D/3D? */
			
 
				 
			
--- a/src/drivers/opencl/driver_opencl.h
+++ b/src/drivers/opencl/driver_opencl.h
@@ -36,6 +36,8 @@
 
				 #include <core/workers.h>
			
 
				 #include <datawizard/node_ops.h>
			
 
				 
			
 
				+void _starpu_opencl_preinit(void);
			
 
				+
			
 
				 #if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
			
 
				 struct _starpu_machine_config;
			
 
				 void _starpu_opencl_discover_devices(struct _starpu_machine_config *config);
			
--- a/src/drivers/opencl/driver_opencl_init.c
+++ b/src/drivers/opencl/driver_opencl_init.c
@@ -0,0 +1,39 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <core/workers.h>
			
 
				+#include <drivers/opencl/driver_opencl.h>
			
 
				+
			
 
				+static struct starpu_driver_info driver_info =
			
 
				+{
			
 
				+	.name_upper = "OpenCL",
			
 
				+	.name_var = "OPENCL",
			
 
				+	.name_lower = "opencl",
			
 
				+	.memory_kind = STARPU_OPENCL_RAM,
			
 
				+	.alpha = 12.22f,
			
 
				+};
			
 
				+
			
 
				+static struct starpu_memory_driver_info memory_driver_info =
			
 
				+{
			
 
				+	.name_upper = "OpenCL",
			
 
				+	.worker_archtype = STARPU_OPENCL_WORKER,
			
 
				+};
			
 
				+
			
 
				+void _starpu_opencl_preinit(void)
			
 
				+{
			
 
				+	starpu_driver_info_register(STARPU_OPENCL_WORKER, &driver_info);
			
 
				+	starpu_memory_driver_info_register(STARPU_OPENCL_RAM, &memory_driver_info);
			
 
				+}
			
--- a/src/profiling/bound.c
+++ b/src/profiling/bound.c
@@ -231,10 +231,9 @@ static double** initialize_arch_duration(int maxdevid, unsigned* maxncore_table)
 
				 static void initialize_duration(struct bound_task *task)
			
 
				 {
			
 
				 	struct _starpu_machine_config *conf = _starpu_get_machine_config();
			
 
				-	task->duration[STARPU_CPU_WORKER] = initialize_arch_duration(1,&conf->topology.nhwcpus);
			
 
				-	task->duration[STARPU_CUDA_WORKER] = initialize_arch_duration(conf->topology.nhwcudagpus,NULL);
			
 
				-	task->duration[STARPU_OPENCL_WORKER] = initialize_arch_duration(conf->topology.nhwopenclgpus,NULL);
			
 
				-	task->duration[STARPU_MIC_WORKER] = initialize_arch_duration(conf->topology.nhwmicdevices,conf->topology.nmiccores);
			
 
				+	enum starpu_worker_archtype type;
			
 
				+	for (type = 0; type < STARPU_NARCH; type++)
			
 
				+		task->duration[type] = initialize_arch_duration(conf->topology.nhwdevices[type], conf->topology.nworker[type]);
			
 
				 }
			
 
				 
			
 
				 static struct starpu_perfmodel_device device =
			
--- a/src/profiling/profiling.c
+++ b/src/profiling/profiling.c
@@ -146,43 +146,42 @@ void _starpu_profiling_init(void)
 
				 	}
			
 
				 
			
 
				 #ifdef STARPU_PAPI
			
 
				-		STARPU_PTHREAD_MUTEX_LOCK(&papi_mutex);
			
 
				-		int retval = PAPI_library_init(PAPI_VER_CURRENT);
			
 
				-		if (retval != PAPI_VER_CURRENT)
			
 
				-		{
			
 
				-			 _STARPU_MSG("Failed init PAPI, error: %s.\n", PAPI_strerror(retval));
			
 
				-		}
			
 
				-		retval = PAPI_thread_init(pthread_self);
			
 
				-		if (retval != PAPI_OK)
			
 
				-		{
			
 
				-			 _STARPU_MSG("Failed init PAPI thread, error: %s.\n", PAPI_strerror(retval));
			
 
				-		}
			
 
				+	STARPU_PTHREAD_MUTEX_LOCK(&papi_mutex);
			
 
				+	int retval = PAPI_library_init(PAPI_VER_CURRENT);
			
 
				+	if (retval != PAPI_VER_CURRENT)
			
 
				+	{
			
 
				+		_STARPU_MSG("Failed init PAPI, error: %s.\n", PAPI_strerror(retval));
			
 
				+	}
			
 
				+	retval = PAPI_thread_init(pthread_self);
			
 
				+	if (retval != PAPI_OK)
			
 
				+	{
			
 
				+		_STARPU_MSG("Failed init PAPI thread, error: %s.\n", PAPI_strerror(retval));
			
 
				+	}
			
 
				 
			
 
				-		char *conf_papi_events;
			
 
				-		char *papi_event_name;
			
 
				-		conf_papi_events = starpu_getenv("STARPU_PROF_PAPI_EVENTS");
			
 
				-		papi_nevents = 0;
			
 
				-		if (conf_papi_events != NULL)
			
 
				+	char *conf_papi_events;
			
 
				+	char *papi_event_name;
			
 
				+	conf_papi_events = starpu_getenv("STARPU_PROF_PAPI_EVENTS");
			
 
				+	papi_nevents = 0;
			
 
				+	if (conf_papi_events != NULL)
			
 
				+	{
			
 
				+		while ((papi_event_name = strtok_r(conf_papi_events, " ,", &conf_papi_events)))
			
 
				 		{
			
 
				-			while ((papi_event_name = strtok_r(conf_papi_events, " ,", &conf_papi_events)))
			
 
				+			if (papi_nevents == PAPI_MAX_HWCTRS)
			
 
				 			{
			
 
				-				if (papi_nevents == PAPI_MAX_HWCTRS)
			
 
				-				{
			
 
				-				      _STARPU_MSG("Too many requested papi counters, ignoring %s\n", papi_event_name);
			
 
				-				      continue;
			
 
				-				}
			
 
				-
			
 
				-				_STARPU_DEBUG("Loading PAPI Event: %s\n", papi_event_name);
			
 
				-				retval = PAPI_event_name_to_code ((char*)papi_event_name, &papi_events[papi_nevents]);
			
 
				-				if (retval != PAPI_OK)
			
 
				-				      _STARPU_MSG("Failed to codify papi event [%s], error: %s.\n", papi_event_name, PAPI_strerror(retval));
			
 
				-				else
			
 
				-					papi_nevents++;
			
 
				+				_STARPU_MSG("Too many requested papi counters, ignoring %s\n", papi_event_name);
			
 
				+				continue;
			
 
				 			}
			
 
				+
			
 
				+			_STARPU_DEBUG("Loading PAPI Event: %s\n", papi_event_name);
			
 
				+			retval = PAPI_event_name_to_code ((char*)papi_event_name, &papi_events[papi_nevents]);
			
 
				+			if (retval != PAPI_OK)
			
 
				+				_STARPU_MSG("Failed to codify papi event [%s], error: %s.\n", papi_event_name, PAPI_strerror(retval));
			
 
				+			else
			
 
				+				papi_nevents++;
			
 
				 		}
			
 
				-		STARPU_PTHREAD_MUTEX_UNLOCK(&papi_mutex);
			
 
				+	}
			
 
				+	STARPU_PTHREAD_MUTEX_UNLOCK(&papi_mutex);
			
 
				 #endif
			
 
				-
			
 
				 }
			
 
				 
			
 
				 #ifdef STARPU_PAPI
			
@@ -195,17 +194,20 @@ void _starpu_profiling_papi_task_start_counters(struct starpu_task *task)
 
				 	profiling_info = task->profiling_info;
			
 
				 	if (profiling_info && papi_nevents)
			
 
				 	{
			
 
				+		int i;
			
 
				 		profiling_info->papi_event_set = PAPI_NULL;
			
 
				 		STARPU_PTHREAD_MUTEX_LOCK(&papi_mutex);
			
 
				 		PAPI_create_eventset(&profiling_info->papi_event_set);
			
 
				-		for(int i=0; i<papi_nevents; i++)
			
 
				+		for(i=0; i<papi_nevents; i++)
			
 
				 		{
			
 
				 			int ret = PAPI_add_event(profiling_info->papi_event_set, papi_events[i]);
			
 
				+#ifdef PAPI_ECMP_DISABLED
			
 
				 			if (ret == PAPI_ECMP_DISABLED && !warned_component_unavailable)
			
 
				 			{
			
 
				 				_STARPU_MSG("Error while registering Papi event: Component containing event is disabled. Try running `papi_component_avail` to get more information.\n");
			
 
				 				warned_component_unavailable = 1;
			
 
				 			}
			
 
				+#endif
			
 
				 			profiling_info->papi_values[i]=0;
			
 
				 		}
			
 
				 		PAPI_reset(profiling_info->papi_event_set);
			
@@ -224,9 +226,10 @@ void _starpu_profiling_papi_task_stop_counters(struct starpu_task *task)
 
				 
			
 
				 	if (profiling_info && papi_nevents)
			
 
				 	{
			
 
				+		int i;
			
 
				 		STARPU_PTHREAD_MUTEX_LOCK(&papi_mutex);
			
 
				 		PAPI_stop(profiling_info->papi_event_set, profiling_info->papi_values);
			
 
				-		for(int i=0; i<papi_nevents; i++)
			
 
				+		for(i=0; i<papi_nevents; i++)
			
 
				 		{
			
 
				 			_STARPU_TRACE_PAPI_TASK_EVENT(papi_events[i], task, profiling_info->papi_values[i]);
			
 
				 		}
			
@@ -266,7 +269,6 @@ void _starpu_profiling_terminate(void)
 
				 /*
			
 
				  *	Task profiling
			
 
				  */
			
 
				-
			
 
				 struct starpu_profiling_task_info *_starpu_allocate_profiling_info_if_needed(struct starpu_task *task)
			
 
				 {
			
 
				 	struct starpu_profiling_task_info *info = NULL;
			
@@ -283,7 +285,6 @@ struct starpu_profiling_task_info *_starpu_allocate_profiling_info_if_needed(str
 
				 /*
			
 
				  *	Worker profiling
			
 
				  */
			
 
				-
			
 
				 static void _starpu_worker_reset_profiling_info_with_lock(int workerid)
			
 
				 {
			
 
				 	_starpu_clock_gettime(&worker_info[workerid].start_time);
			
@@ -578,7 +579,7 @@ int starpu_bus_get_ngpus(int busid)
 
				 	int ngpus = bus_ngpus[busid];
			
 
				 	if (!ngpus)
			
 
				 		/* Unknown number of GPUs, assume it's shared by all GPUs */
			
 
				-		ngpus = topology->ncudagpus+topology->nopenclgpus;
			
 
				+		ngpus = topology->ndevices[STARPU_CUDA_WORKER]+topology->ndevices[STARPU_OPENCL_WORKER];
			
 
				 	return ngpus;
			
 
				 }
			
 
				 
			
--- a/src/sched_policies/component_heteroprio.c
+++ b/src/sched_policies/component_heteroprio.c
@@ -434,9 +434,12 @@ static int heteroprio_push_task(struct starpu_sched_component * component, struc
 
				 			/* Didn't find it, add one */
			
 
				 			data->naccel++;
			
 
				 
			
 
				-			float *newaccel = malloc(data->naccel * sizeof(*newaccel));
			
 
				-			struct _starpu_prio_deque **newbuckets = malloc(data->naccel * sizeof(*newbuckets));
			
 
				-			struct _starpu_prio_deque *newbucket = malloc(sizeof(*newbucket));
			
 
				+			float *newaccel;
			
 
				+			_STARPU_MALLOC(newaccel, data->naccel * sizeof(*newaccel));
			
 
				+			struct _starpu_prio_deque **newbuckets;
			
 
				+			_STARPU_MALLOC(newbuckets, data->naccel * sizeof(*newbuckets));
			
 
				+			struct _starpu_prio_deque *newbucket;
			
 
				+			_STARPU_MALLOC(newbucket, sizeof(*newbucket));
			
 
				 			_starpu_prio_deque_init(newbucket);
			
 
				 			int inserted = 0;
			
 
				 
			
--- a/src/sched_policies/helper_mct.c
+++ b/src/sched_policies/helper_mct.c
@@ -88,6 +88,11 @@ static double compute_expected_time(double now, double predicted_end, double pre
 
				 
			
 
				 double starpu_mct_compute_fitness(struct _starpu_mct_data * d, double exp_end, double min_exp_end_of_task, double max_exp_end_of_workers, double transfer_len, double local_energy)
			
 
				 {
			
 
				+	if(isnan(local_energy))
			
 
				+		/* Energy not calibrated yet, but we cannot do this
			
 
				+		 * automatically anyway, so ignoring this for now */
			
 
				+		local_energy = 0.;
			
 
				+
			
 
				 	/* Note: the expected end includes the data transfer duration, which we want to be able to tune separately */
			
 
				 	
			
 
				 	/* min_exp_end_of_task is the minimum end time of the task over all workers */
			
--- a/src/sched_policies/heteroprio.c
+++ b/src/sched_policies/heteroprio.c
@@ -37,6 +37,8 @@
 
				 #define DBL_MAX __DBL_MAX__
			
 
				 #endif
			
 
				 
			
 
				+#define STARPU_NB_TYPES STARPU_NARCH
			
 
				+
			
 
				 /* A bucket corresponds to a Pair of priorities
			
 
				  * When a task is pushed with a priority X, it will be stored
			
 
				  * into the bucket X.
			
@@ -107,8 +109,16 @@ struct _starpu_heteroprio_data
 
				 	unsigned nb_workers_per_arch_index[STARPU_NB_TYPES];
			
 
				 };
			
 
				 
			
 
				+
			
 
				+static int starpu_heteroprio_types_to_arch(enum starpu_worker_archtype arch)
			
 
				+{
			
 
				+	if (arch >= STARPU_NARCH)
			
 
				+		return 0;
			
 
				+	return STARPU_WORKER_TO_MASK(arch);
			
 
				+}
			
 
				+
			
 
				 /** Tell how many prio there are for a given arch */
			
 
				-void starpu_heteroprio_set_nb_prios(unsigned sched_ctx_id, enum starpu_heteroprio_types arch, unsigned max_prio)
			
 
				+void starpu_heteroprio_set_nb_prios(unsigned sched_ctx_id, enum starpu_worker_archtype arch, unsigned max_prio)
			
 
				 {
			
 
				 	struct _starpu_heteroprio_data *hp = (struct _starpu_heteroprio_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
			
 
				 
			
@@ -118,7 +128,7 @@ void starpu_heteroprio_set_nb_prios(unsigned sched_ctx_id, enum starpu_heteropri
 
				 }
			
 
				 
			
 
				 /** Set the mapping for a given arch prio=>bucket */
			
 
				-inline void starpu_heteroprio_set_mapping(unsigned sched_ctx_id, enum starpu_heteroprio_types arch, unsigned source_prio, unsigned dest_bucket_id)
			
 
				+inline void starpu_heteroprio_set_mapping(unsigned sched_ctx_id, enum starpu_worker_archtype arch, unsigned source_prio, unsigned dest_bucket_id)
			
 
				 {
			
 
				 	STARPU_ASSERT(dest_bucket_id < STARPU_HETEROPRIO_MAX_PRIO);
			
 
				 
			
@@ -126,12 +136,12 @@ inline void starpu_heteroprio_set_mapping(unsigned sched_ctx_id, enum starpu_het
 
				 
			
 
				 	hp->prio_mapping_per_arch_index[arch][source_prio] = dest_bucket_id;
			
 
				 
			
 
				-	hp->buckets[dest_bucket_id].valid_archs |= starpu_heteroprio_types_to_arch[arch];
			
 
				+	hp->buckets[dest_bucket_id].valid_archs |= starpu_heteroprio_types_to_arch(arch);
			
 
				 	_STARPU_DEBUG("Adding arch %d to bucket %u\n", arch, dest_bucket_id);
			
 
				 }
			
 
				 
			
 
				 /** Tell which arch is the faster for the tasks of a bucket (optional) */
			
 
				-inline void starpu_heteroprio_set_faster_arch(unsigned sched_ctx_id, enum starpu_heteroprio_types arch, unsigned bucket_id)
			
 
				+inline void starpu_heteroprio_set_faster_arch(unsigned sched_ctx_id, enum starpu_worker_archtype arch, unsigned bucket_id)
			
 
				 {
			
 
				 	STARPU_ASSERT(bucket_id < STARPU_HETEROPRIO_MAX_PRIO);
			
 
				 
			
@@ -143,7 +153,7 @@ inline void starpu_heteroprio_set_faster_arch(unsigned sched_ctx_id, enum starpu
 
				 }
			
 
				 
			
 
				 /** Tell how slow is a arch for the tasks of a bucket (optional) */
			
 
				-inline void starpu_heteroprio_set_arch_slow_factor(unsigned sched_ctx_id, enum starpu_heteroprio_types arch, unsigned bucket_id, float slow_factor)
			
 
				+inline void starpu_heteroprio_set_arch_slow_factor(unsigned sched_ctx_id, enum starpu_worker_archtype arch, unsigned bucket_id, float slow_factor)
			
 
				 {
			
 
				 	STARPU_ASSERT(bucket_id < STARPU_HETEROPRIO_MAX_PRIO);
			
 
				 
			
@@ -160,52 +170,22 @@ static inline void default_init_sched(unsigned sched_ctx_id)
 
				 	int max_prio = starpu_sched_ctx_get_max_priority(sched_ctx_id);
			
 
				 	STARPU_ASSERT(min_prio >= 0);
			
 
				 	STARPU_ASSERT(max_prio >= 0);
			
 
				+
			
 
				+	enum starpu_worker_archtype type;
			
 
				+
			
 
				 	// By default each type of devices uses 1 bucket and no slow factor
			
 
				-#ifdef STARPU_USE_CPU
			
 
				-	if (starpu_cpu_worker_get_count() > 0)
			
 
				-		starpu_heteroprio_set_nb_prios(sched_ctx_id, STARPU_CPU_IDX, max_prio-min_prio+1);
			
 
				-#endif
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-	if (starpu_cuda_worker_get_count() > 0)
			
 
				-		starpu_heteroprio_set_nb_prios(sched_ctx_id, STARPU_CUDA_IDX, max_prio-min_prio+1);
			
 
				-#endif
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-	if (starpu_opencl_worker_get_count() > 0)
			
 
				-		starpu_heteroprio_set_nb_prios(sched_ctx_id, STARPU_OPENCL_IDX, max_prio-min_prio+1);
			
 
				-#endif
			
 
				-#ifdef STARPU_USE_MIC
			
 
				-	if (starpu_mic_worker_get_count() > 0)
			
 
				-		starpu_heteroprio_set_nb_prios(sched_ctx_id, STARPU_MIC_IDX, max_prio-min_prio+1);
			
 
				-#endif
			
 
				-#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				-	if (starpu_mpi_ms_worker_get_count() > 0)
			
 
				-		starpu_heteroprio_set_nb_prios(sched_ctx_id, STARPU_MPI_MS_IDX, max_prio-min_prio+1);
			
 
				-#endif
			
 
				+	for (type = 0; type < STARPU_NARCH; type++)
			
 
				+		if (starpu_worker_get_count_by_type(type) > 0)
			
 
				+			starpu_heteroprio_set_nb_prios(sched_ctx_id, type, max_prio-min_prio+1);
			
 
				 
			
 
				 	// Direct mapping
			
 
				 	int prio;
			
 
				 	for(prio=min_prio ; prio<=max_prio ; prio++)
			
 
				 	{
			
 
				-#ifdef STARPU_USE_CPU
			
 
				-		if (starpu_cpu_worker_get_count() > 0)
			
 
				-			starpu_heteroprio_set_mapping(sched_ctx_id, STARPU_CPU_IDX, prio, prio);
			
 
				-#endif
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-		if (starpu_cuda_worker_get_count() > 0)
			
 
				-			starpu_heteroprio_set_mapping(sched_ctx_id, STARPU_CUDA_IDX, prio, prio);
			
 
				-#endif
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-		if (starpu_opencl_worker_get_count() > 0)
			
 
				-			starpu_heteroprio_set_mapping(sched_ctx_id, STARPU_OPENCL_IDX, prio, prio);
			
 
				-#endif
			
 
				-#ifdef STARPU_USE_MIC
			
 
				-		if (starpu_mic_worker_get_count() > 0)
			
 
				-			starpu_heteroprio_set_mapping(sched_ctx_id, STARPU_MIC_IDX, prio, prio);
			
 
				-#endif
			
 
				-#ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				-		if (starpu_mpi_ms_worker_get_count() > 0)
			
 
				-			starpu_heteroprio_set_mapping(sched_ctx_id, STARPU_MPI_MS_IDX, prio, prio);
			
 
				-#endif
			
 
				+		// By default each type of devices uses 1 bucket and no slow factor
			
 
				+		for (type = 0; type < STARPU_NARCH; type++)
			
 
				+			if (starpu_worker_get_count_by_type(type) > 0)
			
 
				+				starpu_heteroprio_set_mapping(sched_ctx_id, type, prio, prio);
			
 
				 	}
			
 
				 }
			
 
				 
			
@@ -249,7 +229,7 @@ static void initialize_heteroprio_policy(unsigned sched_ctx_id)
 
				 			const unsigned mapped_prio = hp->prio_mapping_per_arch_index[arch_index][idx_prio];
			
 
				 			STARPU_ASSERT(mapped_prio <= STARPU_HETEROPRIO_MAX_PRIO);
			
 
				 			STARPU_ASSERT(hp->buckets[mapped_prio].slow_factors_per_index[arch_index] >= 0.0);
			
 
				-			STARPU_ASSERT(hp->buckets[mapped_prio].valid_archs & starpu_heteroprio_types_to_arch[arch_index]);
			
 
				+			STARPU_ASSERT(hp->buckets[mapped_prio].valid_archs & starpu_heteroprio_types_to_arch(arch_index));
			
 
				 			check_archs[mapped_prio]      = 1;
			
 
				 			check_all_archs[mapped_prio] += 1;
			
 
				 		}
			
@@ -257,7 +237,7 @@ static void initialize_heteroprio_policy(unsigned sched_ctx_id)
 
				 		{
			
 
				 			/* Ensure the current arch use a bucket or someone else can use it */
			
 
				 			STARPU_ASSERT(check_archs[idx_prio] == 1 || hp->buckets[idx_prio].valid_archs == 0
			
 
				-				      || (hp->buckets[idx_prio].valid_archs & ~starpu_heteroprio_types_to_arch[arch_index]) != 0);
			
 
				+				      || (hp->buckets[idx_prio].valid_archs & ~starpu_heteroprio_types_to_arch(arch_index)) != 0);
			
 
				 		}
			
 
				 	}
			
 
				 	/* Ensure that if a valid_archs = (STARPU_CPU|STARPU_CUDA) then check_all_archs[] = 2 for example */
			
@@ -267,7 +247,7 @@ static void initialize_heteroprio_policy(unsigned sched_ctx_id)
 
				 		unsigned nb_arch_on_bucket = 0;
			
 
				 		for(arch_index = 0; arch_index < STARPU_NB_TYPES; ++arch_index)
			
 
				 		{
			
 
				-			if(hp->buckets[idx_prio].valid_archs & starpu_heteroprio_types_to_arch[arch_index])
			
 
				+			if(hp->buckets[idx_prio].valid_archs & starpu_heteroprio_types_to_arch(arch_index))
			
 
				 			{
			
 
				 				nb_arch_on_bucket += 1;
			
 
				 			}
			
@@ -310,32 +290,10 @@ static void add_workers_heteroprio_policy(unsigned sched_ctx_id, int *workerids,
 
				 		memset(&hp->workers_heteroprio[workerid], 0, sizeof(hp->workers_heteroprio[workerid]));
			
 
				 		/* if the worker has already belonged to this context
			
 
				 		   the queue and the synchronization variables have been already initialized */
			
 
				-			_starpu_prio_deque_init(&hp->workers_heteroprio[workerid].tasks_queue);
			
 
				-			switch(starpu_worker_get_type(workerid))
			
 
				-			{
			
 
				-			case STARPU_CPU_WORKER:
			
 
				-				hp->workers_heteroprio[workerid].arch_type = STARPU_CPU;
			
 
				-				hp->workers_heteroprio[workerid].arch_index = STARPU_CPU_IDX;
			
 
				-				break;
			
 
				-			case STARPU_CUDA_WORKER:
			
 
				-				hp->workers_heteroprio[workerid].arch_type = STARPU_CUDA;
			
 
				-				hp->workers_heteroprio[workerid].arch_index = STARPU_CUDA_IDX;
			
 
				-				break;
			
 
				-			case STARPU_OPENCL_WORKER:
			
 
				-				hp->workers_heteroprio[workerid].arch_type = STARPU_OPENCL;
			
 
				-				hp->workers_heteroprio[workerid].arch_index = STARPU_OPENCL_IDX;
			
 
				-				break;
			
 
				-			case STARPU_MIC_WORKER:
			
 
				-				hp->workers_heteroprio[workerid].arch_type = STARPU_MIC;
			
 
				-				hp->workers_heteroprio[workerid].arch_index = STARPU_MIC_IDX;
			
 
				-				break;
			
 
				-			case STARPU_MPI_MS_WORKER:
			
 
				-				hp->workers_heteroprio[workerid].arch_type = STARPU_MPI_MS;
			
 
				-				hp->workers_heteroprio[workerid].arch_index = STARPU_MPI_MS_IDX;
			
 
				-				break;
			
 
				-			default:
			
 
				-				STARPU_ASSERT(0);
			
 
				-			}
			
 
				+		enum starpu_worker_archtype arch_index = starpu_worker_get_type(workerid);
			
 
				+		_starpu_prio_deque_init(&hp->workers_heteroprio[workerid].tasks_queue);
			
 
				+		hp->workers_heteroprio[workerid].arch_index = arch_index;
			
 
				+		hp->workers_heteroprio[workerid].arch_type = starpu_heteroprio_types_to_arch(arch_index);
			
 
				 		hp->nb_workers_per_arch_index[hp->workers_heteroprio[workerid].arch_index]++;
			
 
				 
			
 
				 	}
			
@@ -379,7 +337,7 @@ static int push_task_heteroprio_policy(struct starpu_task *task)
 
				 	for(arch_index = 0; arch_index < STARPU_NB_TYPES; ++arch_index)
			
 
				 	{
			
 
				 		/* We test the archs on the bucket and not on task->where since it is restrictive */
			
 
				-		if(bucket->valid_archs & starpu_heteroprio_types_to_arch[arch_index])
			
 
				+		if(bucket->valid_archs & starpu_heteroprio_types_to_arch(arch_index))
			
 
				 			hp->nb_remaining_tasks_per_arch_index[arch_index] += 1;
			
 
				 	}
			
 
				 
			
@@ -512,7 +470,7 @@ static struct starpu_task *pop_task_heteroprio_policy(unsigned sched_ctx_id)
 
				 				for(arch_index = 0; arch_index < STARPU_NB_TYPES; ++arch_index)
			
 
				 				{
			
 
				 					/* We test the archs on the bucket and not on task->where since it is restrictive */
			
 
				-					if(bucket->valid_archs & starpu_heteroprio_types_to_arch[arch_index])
			
 
				+					if(bucket->valid_archs & starpu_heteroprio_types_to_arch(arch_index))
			
 
				 					{
			
 
				 						hp->nb_remaining_tasks_per_arch_index[arch_index] -= 1;
			
 
				 					}
			
--- a/src/sched_policies/modular_ez.c
+++ b/src/sched_policies/modular_ez.c
@@ -112,8 +112,8 @@ void starpu_sched_component_initialize_simple_schedulers(unsigned sched_ctx_id,
 
				 		nummaxids = starpu_worker_get_count() + starpu_combined_worker_get_count();
			
 
				 		if (starpu_memory_nodes_get_count() > nummaxids)
			
 
				 			nummaxids = starpu_memory_nodes_get_count();
			
 
				-		if (STARPU_ANY_WORKER > nummaxids)
			
 
				-			nummaxids = STARPU_ANY_WORKER;
			
 
				+		if (STARPU_NARCH > nummaxids)
			
 
				+			nummaxids = STARPU_NARCH;
			
 
				 
			
 
				 		if (sched == 0)
			
 
				 			decide_flags = flags & STARPU_SCHED_SIMPLE_DECIDE_MASK;
			
@@ -154,7 +154,7 @@ void starpu_sched_component_initialize_simple_schedulers(unsigned sched_ctx_id,
 
				 			/* Count available architecture types */
			
 
				 			enum starpu_worker_archtype type;
			
 
				 			nbelow = 0;
			
 
				-			for (type = STARPU_CPU_WORKER; type < STARPU_ANY_WORKER; type++)
			
 
				+			for (type = 0; type < STARPU_NARCH; type++)
			
 
				 			{
			
 
				 				if (starpu_worker_get_count_by_type(type))
			
 
				 				{
			
--- a/src/sched_policies/parallel_heft.c
+++ b/src/sched_policies/parallel_heft.c
@@ -22,7 +22,6 @@
 
				 #include <limits.h>
			
 
				 #include <core/workers.h>
			
 
				 #include <core/perfmodel/perfmodel.h>
			
 
				-#include <starpu_parameters.h>
			
 
				 #include <core/detect_combined_workers.h>
			
 
				 #include <core/sched_policy.h>
			
 
				 #include <core/task.h>
			
--- a/src/sched_policies/work_stealing_policy.c
+++ b/src/sched_policies/work_stealing_policy.c