瀏覽代碼

Merge branch 'starpurm' of git+ssh://scm.gforge.inria.fr/gitroot/starpu/starpu into starpurm

Olivier Aumage 7 年之前
父節點
當前提交
eb5c169aeb

+ 9 - 0
configure.ac

@@ -1327,10 +1327,19 @@ if test x$enable_cuda = xyes; then
 		NVCCFLAGS="${NVCCFLAGS} -m64"
 		NVCCFLAGS="${NVCCFLAGS} -m64"
 	fi
 	fi
 
 
+	SAVED_CPPFLAGS="${CPPFLAGS}"
+	CPPFLAGS="${CPPFLAGS} ${STARPU_CUDA_CPPFLAGS}"
+	SAVED_LDFLAGS="${LDFLAGS}"
+	LDFLAGS="${LDFLAGS} ${STARPU_CUDA_LDFLAGS}"
 	AC_CHECK_HEADERS([cuda_gl_interop.h])
 	AC_CHECK_HEADERS([cuda_gl_interop.h])
 
 
 	AC_CHECK_LIB([cusparse], [cusparseCreate])
 	AC_CHECK_LIB([cusparse], [cusparseCreate])
 	AC_CHECK_DECLS([cusparseSetStream], [], [], [[#include <cusparse.h>]])
 	AC_CHECK_DECLS([cusparseSetStream], [], [], [[#include <cusparse.h>]])
+
+	AC_CHECK_HEADER([nvml.h],
+	  [AC_CHECK_LIB([nvidia-ml], [nvmlDeviceGetTotalEnergyConsumption])])
+        CPPFLAGS="${SAVED_CPPFLAGS}"
+	LDFLAGS="${SAVED_LDFLAGS}"
 fi
 fi
 
 
 dnl Hey dude, are you around?
 dnl Hey dude, are you around?

+ 132 - 0
contrib/ci.inria.fr/Jenkinsfile-basic

@@ -0,0 +1,132 @@
+#!groovy
+// StarPU --- Runtime system for heterogeneous multicore architectures.
+//
+// Copyright (C) 2018                                CNRS
+//
+// StarPU is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as published by
+// the Free Software Foundation; either version 2.1 of the License, or (at
+// your option) any later version.
+//
+// StarPU is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+//
+// See the GNU Lesser General Public License in COPYING.LGPL for more details.
+//
+
+def statusHasChanged = false
+
+pipeline
+{
+	agent none
+
+	// Trigger the build
+	triggers
+	{
+		// Poll gitlab explicitly every 15mn
+		pollSCM('00-59/15 * * * *')
+	}
+
+	stages
+	{
+		stage('Tarball')
+		{
+			steps
+			{
+				node('autotools')
+				{
+					checkout scm
+					sh 'contrib/ci.inria.fr/job-0-tarball.sh'
+					script
+					{
+					       env.tarballgz = sh (script: 'ls *.tar.gz', returnStdout: true).trim()
+					}
+					stash includes: "${env.tarballgz}", name: 'tarballgz'
+					stash includes: "starpu.pdf", name: 'doc'
+					// Stash those scripts because they are not in make dist
+					dir('contrib/ci.inria.fr')
+					{
+						stash includes: "job-1-check.sh", name: 'script-unix-check'
+					}
+					archiveArtifacts artifacts: "${env.tarballgz},starpu.pdf", fingerprint: true, onlyIfSuccessful: true
+					deleteDir()
+
+				}
+			}
+		}
+		stage('Check')
+		{
+			steps
+			{
+				script
+				{
+					labelToSelect = 'unix'
+					listOfNodeNames = jenkins.model.Jenkins.instance.nodes.collect
+					{
+						node -> node.getLabelString().contains(labelToSelect) ? node.name : null
+					}
+					listOfNodeNames.removeAll(Collections.singleton(null))
+
+					def p = listOfNodeNames.collectEntries
+					{
+						[ (it):
+						{
+							node(it)
+							{
+								dir('check-unix')
+								{
+									unstash 'tarballgz'
+									unstash 'script-unix-check'
+									sh 'chmod 755 job-1-check.sh && ./job-1-check.sh'
+									deleteDir()
+								}
+							}
+						}
+					]}
+					parallel p;
+				}
+			}
+		}
+	}
+
+	post
+	{
+		// hooks are called in order: always, changed, aborted, failure, success, unstable
+		changed
+		{
+			echo "Build status has changed."
+			script
+			{
+
+				statusHasChanged = true
+			}
+		}
+		success
+		{
+			echo "Build success."
+			// email when changed to success
+			script
+			{
+				if (statusHasChanged)
+				{
+					emailext(body: '${DEFAULT_CONTENT}',
+						 subject: '${DEFAULT_SUBJECT}',
+						 replyTo: '$DEFAULT_REPLYTO',
+						 to: '$DEFAULT_RECIPIENTS',
+						 recipientProviders: [[$class: 'CulpritsRecipientProvider'],[$class: 'RequesterRecipientProvider']])
+				}
+			}
+		}
+		failure
+		{
+			echo "Build failure."
+			// always email on failure
+			emailext(body: '${DEFAULT_CONTENT}',
+				 subject: '${DEFAULT_SUBJECT}',
+				 replyTo: '$DEFAULT_REPLYTO',
+				 to: '$DEFAULT_RECIPIENTS',
+				 recipientProviders: [[$class: 'CulpritsRecipientProvider'],[$class: 'RequesterRecipientProvider']])
+		}
+	}
+}

+ 30 - 0
contrib/ci.inria.fr/job-0-tarball.sh

@@ -0,0 +1,30 @@
+#!/bin/sh
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2018                                CNRS
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+export PKG_CONFIG_PATH=/home/ci/usr/local/lib/pkgconfig:$PKG_CONFIG_PATH
+export LD_LIBRARY_PATH=/home/ci/usr/local/lib:$LD_LIBRARY_PATH
+
+./autogen.sh
+if test -d build ; then chmod -R 777 build && rm -rf build ; fi
+mkdir build && cd build
+../configure
+make V=1
+make dist
+cp *gz ..
+cp doc/doxygen/starpu.pdf ..
+make clean
+

+ 85 - 0
contrib/ci.inria.fr/job-1-check.sh

@@ -0,0 +1,85 @@
+#!/bin/sh
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2013-2018                                CNRS
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+set -e
+set -x
+
+export PKG_CONFIG_PATH=/home/ci/usr/local/lib/pkgconfig:$PKG_CONFIG_PATH
+export LD_LIBRARY_PATH=/home/ci/usr/local/lib:$LD_LIBRARY_PATH
+
+tarball=$(ls -tr starpu-*.tar.gz | tail -1)
+
+if test -z "$tarball"
+then
+    echo Error. No tar.gz file
+    ls
+    pwd
+    exit 1
+fi
+
+basename=$(basename $tarball .tar.gz)
+export STARPU_HOME=$PWD/$basename/home
+mkdir -p $basename
+cd $basename
+env > $PWD/env
+
+test -d $basename && chmod -R u+rwX $basename && rm -rf $basename
+tar xfz ../$tarball
+cd $basename
+mkdir build
+cd build
+
+STARPU_CONFIGURE_OPTIONS=""
+suname=$(uname)
+if test "$suname" == "Darwin"
+then
+    STARPU_CONFIGURE_OPTIONS="--without-hwloc"
+fi
+if test "$suname" == "OpenBSD"
+then
+    STARPU_CONFIGURE_OPTIONS="--without-hwloc --disable-mlr"
+fi
+if test "$suname" == "FreeBSD"
+then
+    STARPU_CONFIGURE_OPTIONS="--disable-fortran"
+fi
+
+export CC=gcc
+
+day=$(date +%u)
+if test $day -le 5
+then
+    ../configure --enable-quick-check --enable-verbose --enable-mpi-check --disable-build-doc $STARPU_CONFIGURE_OPTIONS
+else
+    ../configure --enable-long-check --enable-verbose --enable-mpi-check --disable-build-doc $STARPU_CONFIGURE_OPTIONS
+fi
+
+make
+#make check
+(make -k check || true) > ../check_$$ 2>&1
+cat ../check_$$
+make showcheck
+
+grep "^FAIL:" ../check_$$ || true
+
+make clean
+
+grep "^FAIL:" ../check_$$ || true
+
+echo "Running on $(uname -a)"
+exit $(grep "^FAIL:" ../check_$$ | wc -l)
+

+ 31 - 1
doc/doxygen/chapters/210_check_list_performance.doxy

@@ -2,7 +2,7 @@
  *
  *
  * Copyright (C) 2011-2013,2015,2017                      Inria
  * Copyright (C) 2011-2013,2015,2017                      Inria
  * Copyright (C) 2010-2018                                CNRS
  * Copyright (C) 2010-2018                                CNRS
- * Copyright (C) 2009-2011,2013-2017                      Université de Bordeaux
+ * Copyright (C) 2009-2011,2013-2018                      Université de Bordeaux
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -26,6 +26,26 @@ performance, we give below a list of features which should be checked.
 For a start, you can use \ref OfflinePerformanceTools to get a Gantt chart which
 For a start, you can use \ref OfflinePerformanceTools to get a Gantt chart which
 will show roughly where time is spent, and focus correspondingly.
 will show roughly where time is spent, and focus correspondingly.
 
 
+\section CheckTaskSize Check Task Size
+
+Make sure that your tasks are not too small, because the StarPU runtime overhead
+is not completely zero. You can run the tasks_size_overhead.sh script to get an
+idea of the scalability of tasks depending on their duration (in µs), on your
+own system.
+
+Typically, 10µs-ish tasks are definitely too small: the CUDA overhead itself is
+much bigger than this.
+
+1ms-ish tasks may be a good start, but will not necessarily scale to many dozens
+of cores, so it's better to try to get 10ms-ish tasks.
+
+Task durations can easily be observed when performance models are defined (see
+\ref PerformanceModelExample) by using the <c>starpu_perfmodel_plot</c> or
+<c>starpu_perfmodel_display</c> tools (see \ref PerformanceOfCodelets).
+
+When using parallel tasks, the problem is even worse since StarPU has to
+synchronize the execution of tasks.
+
 \section ConfigurationImprovePerformance Configuration Which May Improve Performance
 \section ConfigurationImprovePerformance Configuration Which May Improve Performance
 
 
 The \ref enable-fast "--enable-fast" configuration option disables all
 The \ref enable-fast "--enable-fast" configuration option disables all
@@ -116,6 +136,16 @@ enabled by setting the environment variable \ref STARPU_NWORKER_PER_CUDA to the
 number of kernels to execute concurrently.  This is useful when kernels are
 number of kernels to execute concurrently.  This is useful when kernels are
 small and do not feed the whole GPU with threads to run.
 small and do not feed the whole GPU with threads to run.
 
 
+Concerning memory allocation, you should really not use cudaMalloc/cudaFree
+within the kernel, since cudaFree introduces an awful lot of synchronizations
+within CUDA itself. You should instead add a parameter to the codelet with the
+STARPU_SCRATCH mode access. You can then pass to the task a handle registered
+with the desired size but with the NULL pointer; that handle can even be
+shared between tasks: StarPU will allocate per-task data on the fly before task
+execution, and reuse the allocated data between tasks.
+
+See <c>examples/pi/pi_redux.c</c> for an example of use.
+
 \section OpenCL-specificOptimizations OpenCL-specific Optimizations
 \section OpenCL-specificOptimizations OpenCL-specific Optimizations
 
 
 If the kernel can be made to only use the StarPU-provided command queue or other self-allocated
 If the kernel can be made to only use the StarPU-provided command queue or other self-allocated

+ 9 - 1
doc/doxygen/chapters/501_environment_variables.doxy

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2011-2013,2015-2017                      Inria
  * Copyright (C) 2011-2013,2015-2017                      Inria
- * Copyright (C) 2010-2017                                CNRS
+ * Copyright (C) 2010-2018                                CNRS
  * Copyright (C) 2009-2011,2013-2017                      Université de Bordeaux
  * Copyright (C) 2009-2011,2013-2017                      Université de Bordeaux
  * Copyright (C) 2016                                     Uppsala University
  * Copyright (C) 2016                                     Uppsala University
  *
  *
@@ -1173,6 +1173,14 @@ If StarPU doesn't find any NUMA node after these step, STARPU_MAIN_MEMORY is the
 discovered by StarPU.
 discovered by StarPU.
 </dd>
 </dd>
 
 
+<dt>STARPU_IDLE_FILE</dt>
+<dd>
+\anchor STARPU_IDLE_FILE
+\addindex __env__STARPU_IDLE_FILE
+If the environment variable STARPU_IDLE_FILE is defined, a file whose name is given by the value of the variable will be created at the end of the execution.
+The file will contain the sum of the idle times of all the workers.
+</dd>
+
 </dl>
 </dl>
 
 
 \section ConfiguringTheHypervisor Configuring The Hypervisor
 \section ConfiguringTheHypervisor Configuring The Hypervisor

+ 3 - 3
doc/doxygen/chapters/api/profiling.doxy

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010-2015,2017                           CNRS
  * Copyright (C) 2010-2015,2017                           CNRS
- * Copyright (C) 2009-2011,2014,2016                      Université de Bordeaux
+ * Copyright (C) 2009-2011,2014,2016,2018                 Université de Bordeaux
  * Copyright (C) 2011-2012                                Inria
  * Copyright (C) 2011-2012                                Inria
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -72,7 +72,7 @@ profiling was enabled.
     Number of cycles stalled within the task, only available in the MoviSim
     Number of cycles stalled within the task, only available in the MoviSim
 
 
 \var double starpu_profiling_task_info::energy_consumed
 \var double starpu_profiling_task_info::energy_consumed
-Energy consumed by the task, only available in the MoviSim
+Energy consumed by the task, in Joules
 
 
 \struct starpu_profiling_worker_info
 \struct starpu_profiling_worker_info
 This structure contains the profiling information associated to
 This structure contains the profiling information associated to
@@ -94,7 +94,7 @@ starpu_profiling_worker_get_info()
 \var uint64_t starpu_profiling_worker_info::stall_cycles
 \var uint64_t starpu_profiling_worker_info::stall_cycles
         Number of cycles stalled within the worker, only available in the MoviSim
         Number of cycles stalled within the worker, only available in the MoviSim
 \var double starpu_profiling_worker_info::energy_consumed
 \var double starpu_profiling_worker_info::energy_consumed
-        Energy consumed by the worker, only available in the MoviSim
+        Energy consumed by the worker, in Joules
 
 
 \struct starpu_profiling_bus_info
 \struct starpu_profiling_bus_info
 todo
 todo

+ 2 - 1
examples/Makefile.am

@@ -3,7 +3,7 @@
 # Copyright (C) 2011-2017                                Inria
 # Copyright (C) 2011-2017                                Inria
 # Copyright (C) 2017                                     Erwan Leria
 # Copyright (C) 2017                                     Erwan Leria
 # Copyright (C) 2009-2018                                Université de Bordeaux
 # Copyright (C) 2009-2018                                Université de Bordeaux
-# Copyright (C) 2010-2015,2017                           CNRS
+# Copyright (C) 2010-2015,2017,2018                           CNRS
 # Copyright (C) 2011                                     Télécom-SudParis
 # Copyright (C) 2011                                     Télécom-SudParis
 # Copyright (C) 2016                                     Uppsala University
 # Copyright (C) 2016                                     Uppsala University
 #
 #
@@ -227,6 +227,7 @@ STARPU_EXAMPLES +=				\
 	filters/fmultiple_submit		\
 	filters/fmultiple_submit		\
 	filters/fmultiple_submit_readonly	\
 	filters/fmultiple_submit_readonly	\
 	filters/fmultiple_submit_implicit	\
 	filters/fmultiple_submit_implicit	\
+	filters/frecursive			\
 	tag_example/tag_example			\
 	tag_example/tag_example			\
 	tag_example/tag_example2		\
 	tag_example/tag_example2		\
 	tag_example/tag_example3		\
 	tag_example/tag_example3		\

+ 170 - 0
examples/filters/frecursive.c

@@ -0,0 +1,170 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2018                                     CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
+void cpu_codelet(void *buffers[], void *cl_arg)
+{
+        unsigned i, j;
+        int factor;
+
+	starpu_codelet_unpack_args(cl_arg, &factor, 0);
+        /* length of the matrix */
+        unsigned nx = STARPU_MATRIX_GET_NX(buffers[0]);
+        unsigned ny = STARPU_MATRIX_GET_NY(buffers[0]);
+        unsigned ld = STARPU_MATRIX_GET_LD(buffers[0]);
+        /* local copy of the matrix pointer */
+        int *val = (int *)STARPU_MATRIX_GET_PTR(buffers[0]);
+
+	FPRINTF(stderr, "computing on matrix with nx=%d, ny=%d, ld=%d\n", nx, ny, ld);
+        for(j=0; j<ny ; j++)
+	{
+                for(i=0; i<nx ; i++)
+                        val[(j*ld)+i] *= factor;
+        }
+}
+
+static struct starpu_codelet cl =
+{
+        .cpu_funcs[0] = cpu_codelet,
+        .nbuffers = 1,
+	.modes[0] = STARPU_RW,
+};
+
+#define NX 400
+#define NY 80
+#define LD NX
+#define PARTS 4
+
+int main(void)
+{
+        int *matrix;
+	starpu_data_handle_t matrix_handle;
+	starpu_data_handle_t subhandle_l1[PARTS];
+	starpu_data_handle_t subhandle_l2[PARTS][PARTS];
+	starpu_data_handle_t subhandle_l3[PARTS][PARTS][PARTS];
+	int ret, submit;
+
+	int factor = 12;
+	int n=1;
+	int i,j,k;
+
+        ret = starpu_init(NULL);
+	if (STARPU_UNLIKELY(ret == -ENODEV))
+	{
+		return 77;
+	}
+
+	if (starpu_cpu_worker_get_count() < 1)
+	{
+		FPRINTF(stderr, "This application requires at least 1 cpu worker\n");
+		starpu_shutdown();
+		return 77;
+	}
+
+	matrix = (int*)malloc(NX * NY * sizeof(int));
+        assert(matrix);
+	starpu_matrix_data_register(&matrix_handle, STARPU_MAIN_RAM, (uintptr_t)matrix, LD, NX, NY, sizeof(int));
+
+        for(j=0 ; j<NY ; j++)
+	{
+                for(i=0 ; i<NX ; i++)
+		{
+                        matrix[(j*LD)+i] = n++;
+                }
+        }
+
+	/* Split the matrix in PARTS sub-matrices, each sub-matrix in PARTS sub-sub-matrices, and each sub-sub matrix in PARTS sub-sub-sub-matrices */
+	struct starpu_data_filter f =
+	{
+		.filter_func = starpu_matrix_filter_block,
+		.nchildren = PARTS
+	};
+	struct starpu_data_filter f2 =
+	{
+		.filter_func = starpu_matrix_filter_vertical_block,
+		.nchildren = PARTS
+	};
+	starpu_data_partition_plan(matrix_handle, &f, subhandle_l1);
+	for(i=0 ; i<PARTS ; i++)
+	{
+		starpu_data_partition_plan(subhandle_l1[i], &f2, subhandle_l2[i]);
+		for(j=0 ; j<PARTS ; j++)
+		{
+			starpu_data_partition_plan(subhandle_l2[i][j], &f, subhandle_l3[i][j]);
+		}
+	}
+
+        /* Submit a task on the first sub-matrix and sub-sub matrix, and on all others sub-sub-matrices */
+	ret = starpu_task_insert(&cl,
+				 STARPU_RW, subhandle_l1[0],
+				 STARPU_VALUE, &factor, sizeof(factor),
+				 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+	for (i=1; i<PARTS; i++)
+	{
+		ret = starpu_task_insert(&cl,
+					 STARPU_RW, subhandle_l2[i][0],
+					 STARPU_VALUE, &factor, sizeof(factor),
+					 0);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+		for (j=1; j<PARTS; j++)
+		{
+			for (k=0; k<PARTS; k++)
+			{
+				ret = starpu_task_insert(&cl,
+							 STARPU_RW, subhandle_l3[i][j][k],
+							 STARPU_VALUE, &factor, sizeof(factor),
+							 0);
+				STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+			}
+		}
+	}
+
+	for(i=0 ; i<PARTS ; i++)
+	{
+		for(j=0 ; j<PARTS ; j++)
+		{
+			starpu_data_partition_clean(subhandle_l2[i][j], PARTS, subhandle_l3[i][j]);
+
+		}
+		starpu_data_partition_clean(subhandle_l1[i], PARTS, subhandle_l2[i]);
+	}
+	starpu_data_partition_clean(matrix_handle, PARTS, subhandle_l1);
+	starpu_data_unregister(matrix_handle);
+
+	/* Print result matrix */
+	n=1;
+	for(j=0 ; j<NY ; j++)
+	{
+		for(i=0 ; i<NX ; i++)
+		{
+			if (matrix[(j*LD)+i] != (int) n*12)
+			{
+				FPRINTF(stderr, "Incorrect result %4d != %4d", matrix[(j*LD)+i], n*12);
+				ret=1;
+			}
+			n++;
+		}
+	}
+
+	free(matrix);
+        starpu_shutdown();
+
+	return ret;
+}

+ 1 - 1
src/common/fxt.h

@@ -620,7 +620,7 @@ do {									\
 		}							\
 		}							\
 		const size_t __job_size = _starpu_job_get_data_size((job)->task->cl?(job)->task->cl->model:NULL, perf_arch, nimpl, (job));	\
 		const size_t __job_size = _starpu_job_get_data_size((job)->task->cl?(job)->task->cl->model:NULL, perf_arch, nimpl, (job));	\
 		const uint32_t __job_hash = _starpu_compute_buffers_footprint((job)->task->cl?(job)->task->cl->model:NULL, perf_arch, nimpl, (job));\
 		const uint32_t __job_hash = _starpu_compute_buffers_footprint((job)->task->cl?(job)->task->cl->model:NULL, perf_arch, nimpl, (job));\
-		FUT_DO_PROBE7(_STARPU_FUT_CODELET_DETAILS, ((job)->task)->sched_ctx, __job_size, __job_hash, (job)->task->flops / 1000, (job)->task->tag_id, workerid, ((job)->job_id)); \
+		FUT_DO_PROBE7(_STARPU_FUT_CODELET_DETAILS, ((job)->task)->sched_ctx, __job_size, __job_hash, (job)->task->flops / 1000 / ((job)->task->cl && job->task->cl->type != STARPU_SEQ ? j->task_size : 1), (job)->task->tag_id, workerid, ((job)->job_id)); \
 	}								\
 	}								\
 } while(0);
 } while(0);
 
 

+ 6 - 1
src/common/prio_list.h

@@ -167,7 +167,12 @@
 	{ \
 	{ \
 		/* Sort by decreasing order */ \
 		/* Sort by decreasing order */ \
 		const struct ENAME##_prio_list_stage *e2 = ENAME##_node_to_list_stage_const(node); \
 		const struct ENAME##_prio_list_stage *e2 = ENAME##_node_to_list_stage_const(node); \
-		return (e2->prio - prio); \
+		if (e2->prio < prio) \
+			return -1; \
+		if (e2->prio == prio) \
+			return 0; \
+		/* e2->prio > prio */ \
+		return 1; \
 	} \
 	} \
 	PRIO_LIST_INLINE struct ENAME##_prio_list_stage *ENAME##_prio_list_add(struct ENAME##_prio_list *priolist, int prio) \
 	PRIO_LIST_INLINE struct ENAME##_prio_list_stage *ENAME##_prio_list_add(struct ENAME##_prio_list *priolist, int prio) \
 	{ \
 	{ \

+ 4 - 2
src/core/perfmodel/perfmodel_history.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2011-2013,2016-2017                      Inria
  * Copyright (C) 2011-2013,2016-2017                      Inria
- * Copyright (C) 2008-2017                                Université de Bordeaux
+ * Copyright (C) 2008-2018                                Université de Bordeaux
  * Copyright (C) 2010-2017                                CNRS
  * Copyright (C) 2010-2017                                CNRS
  * Copyright (C) 2013                                     Thibaut Lambert
  * Copyright (C) 2013                                     Thibaut Lambert
  * Copyright (C) 2011                                     Télécom-SudParis
  * Copyright (C) 2011                                     Télécom-SudParis
@@ -477,6 +477,8 @@ static void scan_reg_model(FILE *f, const char *path, struct starpu_perfmodel_re
 			multi_invalid = (multi_invalid||isnan(reg_model->coeff[i]));
 			multi_invalid = (multi_invalid||isnan(reg_model->coeff[i]));
 		}
 		}
 		reg_model->multi_valid = !multi_invalid;
 		reg_model->multi_valid = !multi_invalid;
+		res = fscanf(f, "\n");
+		STARPU_ASSERT_MSG(res == 0, "Incorrect performance model file %s", path);
 	}
 	}
 }
 }
 
 
@@ -1763,7 +1765,7 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 
 
 					unsigned n = entry->nsample;
 					unsigned n = entry->nsample;
 					entry->mean = entry->sum / n;
 					entry->mean = entry->sum / n;
-					entry->deviation = sqrt((fabs(entry->sum2 - (entry->sum*entry->sum))/n)/n);
+					entry->deviation = sqrt((fabs(entry->sum2 - (entry->sum*entry->sum)/n))/n);
 				}
 				}
 
 
 				if (j->task->flops != 0.)
 				if (j->task->flops != 0.)

+ 2 - 0
src/core/sched_policy.c

@@ -397,6 +397,7 @@ static int _starpu_push_task_on_specific_worker(struct starpu_task *task, int wo
 			struct starpu_task *alias = starpu_task_dup(task);
 			struct starpu_task *alias = starpu_task_dup(task);
 			alias->destroy = 1;
 			alias->destroy = 1;
 
 
+			_STARPU_TRACE_JOB_PUSH(alias, alias->priority > 0);
 			worker = _starpu_get_worker_struct(combined_workerid[j]);
 			worker = _starpu_get_worker_struct(combined_workerid[j]);
 			ret |= _starpu_push_local_task(worker, alias, 0);
 			ret |= _starpu_push_local_task(worker, alias, 0);
 		}
 		}
@@ -581,6 +582,7 @@ int _starpu_push_task_to_workers(struct starpu_task *task)
 					if (job->task_size > 1)
 					if (job->task_size > 1)
 					{
 					{
 						alias = starpu_task_dup(task);
 						alias = starpu_task_dup(task);
+						_STARPU_TRACE_JOB_PUSH(alias, alias->priority > 0);
 						alias->destroy = 1;
 						alias->destroy = 1;
 					}
 					}
 					else
 					else

+ 1 - 1
src/datawizard/malloc.c

@@ -360,7 +360,7 @@ int _starpu_malloc_flags_on_node(unsigned dst_node, void **A, size_t dim, int fl
 				ret = -ENOMEM;
 				ret = -ENOMEM;
 		}
 		}
 
 
-#if defined(STARPU_SIMGRID) || defined(STARPU_USE_CUDA)
+#if (defined(STARPU_SIMGRID) && (SIMGRID_VERSION < 31500 || SIMGRID_VERSION == 31559)) || defined(STARPU_USE_CUDA)
 end:
 end:
 #endif
 #endif
 	if (ret == 0)
 	if (ret == 0)

+ 11 - 1
src/drivers/cpu/driver_cpu.c

@@ -107,12 +107,22 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_
 			/* rebind to single CPU */
 			/* rebind to single CPU */
 			_starpu_bind_thread_on_cpu(cpu_args->bindid, cpu_args->workerid);
 			_starpu_bind_thread_on_cpu(cpu_args->bindid, cpu_args->workerid);
 	}
 	}
+	else
+	{
+		_STARPU_TRACE_START_EXECUTING();
+	}
+
+	if (is_parallel_task)
+	{
+		STARPU_PTHREAD_BARRIER_WAIT(&j->after_work_barrier);
+		if (rank != 0)
+			_STARPU_TRACE_END_EXECUTING();
+	}
 
 
 	_starpu_driver_end_job(cpu_args, j, perf_arch, rank, profiling);
 	_starpu_driver_end_job(cpu_args, j, perf_arch, rank, profiling);
 
 
 	if (is_parallel_task)
 	if (is_parallel_task)
 	{
 	{
-		STARPU_PTHREAD_BARRIER_WAIT(&j->after_work_barrier);
 #ifdef STARPU_SIMGRID
 #ifdef STARPU_SIMGRID
 		if (rank == 0)
 		if (rank == 0)
 		{
 		{

+ 48 - 1
src/drivers/cuda/driver_cuda.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2011-2012,2014,2016-2017                 Inria
  * Copyright (C) 2011-2012,2014,2016-2017                 Inria
- * Copyright (C) 2008-2017                                Université de Bordeaux
+ * Copyright (C) 2008-2018                                Université de Bordeaux
  * Copyright (C) 2010                                     Mehdi Juhoor
  * Copyright (C) 2010                                     Mehdi Juhoor
  * Copyright (C) 2010-2017                                CNRS
  * Copyright (C) 2010-2017                                CNRS
  * Copyright (C) 2011                                     Télécom-SudParis
  * Copyright (C) 2011                                     Télécom-SudParis
@@ -31,6 +31,9 @@
 #ifdef HAVE_CUDA_GL_INTEROP_H
 #ifdef HAVE_CUDA_GL_INTEROP_H
 #include <cuda_gl_interop.h>
 #include <cuda_gl_interop.h>
 #endif
 #endif
+#ifdef HAVE_LIBNVIDIA_ML
+#include <nvml.h>
+#endif
 #include <datawizard/memory_manager.h>
 #include <datawizard/memory_manager.h>
 #include <datawizard/memory_nodes.h>
 #include <datawizard/memory_nodes.h>
 #include <datawizard/malloc.h>
 #include <datawizard/malloc.h>
@@ -53,9 +56,13 @@
 static int ncudagpus = -1;
 static int ncudagpus = -1;
 
 
 static size_t global_mem[STARPU_MAXCUDADEVS];
 static size_t global_mem[STARPU_MAXCUDADEVS];
+#ifdef HAVE_LIBNVIDIA_ML
+static nvmlDevice_t nvmlDev[STARPU_MAXCUDADEVS];
+#endif
 int _starpu_cuda_bus_ids[STARPU_MAXCUDADEVS+STARPU_MAXNUMANODES][STARPU_MAXCUDADEVS+STARPU_MAXNUMANODES];
 int _starpu_cuda_bus_ids[STARPU_MAXCUDADEVS+STARPU_MAXNUMANODES][STARPU_MAXCUDADEVS+STARPU_MAXNUMANODES];
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 static cudaStream_t streams[STARPU_NMAXWORKERS];
 static cudaStream_t streams[STARPU_NMAXWORKERS];
+static char used_stream[STARPU_NMAXWORKERS];
 static cudaStream_t out_transfer_streams[STARPU_MAXCUDADEVS];
 static cudaStream_t out_transfer_streams[STARPU_MAXCUDADEVS];
 static cudaStream_t in_transfer_streams[STARPU_MAXCUDADEVS];
 static cudaStream_t in_transfer_streams[STARPU_MAXCUDADEVS];
 /* Note: streams are not thread-safe, so we define them for each CUDA worker
 /* Note: streams are not thread-safe, so we define them for each CUDA worker
@@ -106,6 +113,9 @@ _starpu_cuda_discover_devices (struct _starpu_machine_config *config)
 	if (STARPU_UNLIKELY(cures != cudaSuccess))
 	if (STARPU_UNLIKELY(cures != cudaSuccess))
 		cnt = 0;
 		cnt = 0;
 	config->topology.nhwcudagpus = cnt;
 	config->topology.nhwcudagpus = cnt;
+#ifdef HAVE_LIBNVIDIA_ML
+	nvmlInit();
+#endif
 #endif
 #endif
 }
 }
 
 
@@ -215,6 +225,7 @@ cudaStream_t starpu_cuda_get_local_stream(void)
 {
 {
 	int worker = starpu_worker_get_id_check();
 	int worker = starpu_worker_get_id_check();
 
 
+	used_stream[worker] = 1;
 	return streams[worker];
 	return streams[worker];
 }
 }
 
 
@@ -520,7 +531,30 @@ static int start_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *worke
 			_starpu_simgrid_submit_job(workerid, j, &worker->perf_arch, NAN,
 			_starpu_simgrid_submit_job(workerid, j, &worker->perf_arch, NAN,
 				async ? &task_finished[workerid][pipeline_idx] : NULL);
 				async ? &task_finished[workerid][pipeline_idx] : NULL);
 #else
 #else
+#ifdef HAVE_LIBNVIDIA_ML
+		unsigned long long energy_start = 0;
+		nvmlReturn_t nvmlRet = -1;
+		if (profiling || (cl->energy_model && cl->energy_model->benchmarking))
+		{
+			nvmlRet = nvmlDeviceGetTotalEnergyConsumption(nvmlDev[worker->devid], &energy_start);
+		}
+#endif
+
 		func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
 		func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
+
+#ifdef HAVE_LIBNVIDIA_ML
+		if (nvmlRet == NVML_SUCCESS &&
+			(profiling || (cl->energy_model && cl->energy_model->benchmarking)))
+		{
+			unsigned long long energy_end;
+			nvmlRet = nvmlDeviceGetTotalEnergyConsumption(nvmlDev[worker->devid], &energy_end);
+#ifdef STARPU_DEVEL
+#warning TODO: measure idle consumption to subtract it
+#endif
+			if (nvmlRet == NVML_SUCCESS)
+				task->profiling_info->energy_consumed += (energy_end - energy_start) / 1000.;
+		}
+#endif
 #endif
 #endif
 		_STARPU_TRACE_END_EXECUTING();
 		_STARPU_TRACE_END_EXECUTING();
 	}
 	}
@@ -581,6 +615,14 @@ static void execute_job_on_cuda(struct starpu_task *task, struct _starpu_worker
 		}
 		}
 	}
 	}
 
 
+#ifndef STARPU_SIMGRID
+	if (!used_stream[workerid])
+	{
+		used_stream[workerid] = 1;
+		_STARPU_DISP("Warning: starpu_cuda_get_local_stream() was not used to submit kernel to CUDA on worker %d. CUDA will thus introduce a lot of useless synchronizations, which will prevent proper overlapping of data transfers and kernel execution. See the CUDA-specific part of the 'Check List When Performance Are Not There' of the StarPU handbook\n", workerid);
+	}
+#endif
+
 	if (task->cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC)
 	if (task->cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC)
 	{
 	{
 		if (worker->pipeline_length == 0)
 		if (worker->pipeline_length == 0)
@@ -682,6 +724,11 @@ int _starpu_cuda_driver_init(struct _starpu_worker_set *worker_set)
 
 
 #if defined(STARPU_HAVE_BUSID) && !defined(STARPU_SIMGRID)
 #if defined(STARPU_HAVE_BUSID) && !defined(STARPU_SIMGRID)
 #if defined(STARPU_HAVE_DOMAINID) && !defined(STARPU_SIMGRID)
 #if defined(STARPU_HAVE_DOMAINID) && !defined(STARPU_SIMGRID)
+#ifdef HAVE_LIBNVIDIA_ML
+		char busid[13];
+		snprintf(busid, sizeof(busid), "%04x:%02x:%02x.0", props[devid].pciDomainID, props[devid].pciBusID, props[devid].pciDeviceID);
+		nvmlDeviceGetHandleByPciBusId(busid, &nvmlDev[devid]);
+#endif
 		if (props[devid].pciDomainID)
 		if (props[devid].pciDomainID)
 			snprintf(worker->name, sizeof(worker->name), "CUDA %u.%u (%s %.1f GiB %04x:%02x:%02x.0)", devid, subdev, devname, size, props[devid].pciDomainID, props[devid].pciBusID, props[devid].pciDeviceID);
 			snprintf(worker->name, sizeof(worker->name), "CUDA %u.%u (%s %.1f GiB %04x:%02x:%02x.0)", devid, subdev, devname, size, props[devid].pciDomainID, props[devid].pciBusID, props[devid].pciDeviceID);
 		else
 		else

+ 3 - 1
src/sched_policies/component_worker.c

@@ -2,7 +2,7 @@
  *
  *
  * Copyright (C) 2011-2014,2017                           Inria
  * Copyright (C) 2011-2014,2017                           Inria
  * Copyright (C) 2010-2012,2014-2017                      CNRS
  * Copyright (C) 2010-2012,2014-2017                      CNRS
- * Copyright (C) 2010-2017                                Université de Bordeaux
+ * Copyright (C) 2010-2018                                Université de Bordeaux
  * Copyright (C) 2011                                     Télécom-SudParis
  * Copyright (C) 2011                                     Télécom-SudParis
  * Copyright (C) 2013                                     Simon Archipoff
  * Copyright (C) 2013                                     Simon Archipoff
  *
  *
@@ -631,6 +631,7 @@ static int combined_worker_push_task(struct starpu_sched_component * component,
 	task_alias[0]->task->destroy = 1;
 	task_alias[0]->task->destroy = 1;
 	task_alias[0]->left = NULL;
 	task_alias[0]->left = NULL;
 	task_alias[0]->ntasks = combined_worker->worker_size;
 	task_alias[0]->ntasks = combined_worker->worker_size;
+	_STARPU_TRACE_JOB_PUSH(task_alias[0]->task, task_alias[0]->task->priority > 0);
 	int i;
 	int i;
 	for(i = 1; i < combined_worker->worker_size; i++)
 	for(i = 1; i < combined_worker->worker_size; i++)
 	{
 	{
@@ -641,6 +642,7 @@ static int combined_worker_push_task(struct starpu_sched_component * component,
 		task_alias[i]->left = task_alias[i-1];
 		task_alias[i]->left = task_alias[i-1];
 		task_alias[i - 1]->right = task_alias[i];
 		task_alias[i - 1]->right = task_alias[i];
 		task_alias[i]->pntasks = &(task_alias[0]->ntasks);
 		task_alias[i]->pntasks = &(task_alias[0]->ntasks);
+		_STARPU_TRACE_JOB_PUSH(task_alias[i]->task, task_alias[i]->task->priority > 0);
 	}
 	}
 
 
 	starpu_pthread_mutex_t * mutex_to_unlock = NULL;
 	starpu_pthread_mutex_t * mutex_to_unlock = NULL;

+ 3 - 0
src/sched_policies/parallel_eager.c

@@ -342,6 +342,7 @@ static struct starpu_task *pop_task_peager_policy(unsigned sched_ctx_id)
 		struct starpu_task *alias = starpu_task_dup(task);
 		struct starpu_task *alias = starpu_task_dup(task);
 		int local_worker = combined_workerid[i];
 		int local_worker = combined_workerid[i];
 		alias->destroy = 1;
 		alias->destroy = 1;
+		_STARPU_TRACE_JOB_PUSH(alias, alias->priority > 0);
 		_starpu_fifo_push_task(data->local_fifo[local_worker], alias);
 		_starpu_fifo_push_task(data->local_fifo[local_worker], alias);
 	}
 	}
 
 
@@ -352,6 +353,8 @@ static struct starpu_task *pop_task_peager_policy(unsigned sched_ctx_id)
 
 
 	STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex);
 
 
+	_STARPU_TRACE_JOB_PUSH(master_alias, master_alias->priority > 0);
+
 	for (i = 1; i < worker_size; i++)
 	for (i = 1; i < worker_size; i++)
 	{
 	{
 		int local_worker = combined_workerid[i];
 		int local_worker = combined_workerid[i];

+ 2 - 1
src/sched_policies/parallel_heft.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2011-2013,2015,2017                      Inria
  * Copyright (C) 2011-2013,2015,2017                      Inria
- * Copyright (C) 2010-2017                                Université de Bordeaux
+ * Copyright (C) 2010-2018                                Université de Bordeaux
  * Copyright (C) 2011-2017                                CNRS
  * Copyright (C) 2011-2017                                CNRS
  * Copyright (C) 2011                                     Télécom-SudParis
  * Copyright (C) 2011                                     Télécom-SudParis
  *
  *
@@ -175,6 +175,7 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 			ntasks[local_combined_workerid]++;
 			ntasks[local_combined_workerid]++;
 			_starpu_worker_unlock(local_combined_workerid);
 			_starpu_worker_unlock(local_combined_workerid);
 
 
+			_STARPU_TRACE_JOB_PUSH(alias, alias->priority > 0);
 			ret |= starpu_push_local_task(local_combined_workerid, alias, prio);
 			ret |= starpu_push_local_task(local_combined_workerid, alias, prio);
 		}
 		}
 
 

+ 238 - 195
starpurm/dev/chameleon_test/dgemm.c

@@ -1,3 +1,21 @@
+/* StarPURM --- StarPU Resource Management Layer.
+ *
+ * Copyright (C) 2017, 2018  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/* This example shows a basic StarPU dgemm (double-precision matrix multiply) app on top of StarPURM with a nVidia CUDA kernel */
+
 #define _GNU_SOURCE
 #define _GNU_SOURCE
 #include <sched.h>
 #include <sched.h>
 #include <stdio.h>
 #include <stdio.h>
@@ -6,18 +24,56 @@
 #include <morse.h>
 #include <morse.h>
 #include <starpurm.h>
 #include <starpurm.h>
 #include <hwloc.h>
 #include <hwloc.h>
+#include <pthread.h>
+
+#define CHECK
 
 
 static int rm_cpu_type_id = -1;
 static int rm_cpu_type_id = -1;
+static int rm_cuda_type_id = -1;
 static int rm_nb_cpu_units = 0;
 static int rm_nb_cpu_units = 0;
+static int rm_nb_cuda_units = 0;
+static const int nb_random_tests = 10;
 
 
-static void test1();
-static void init_rm_infos(void);
+static unsigned spawn_pending = 0;
+static pthread_mutex_t spawn_pending_mutex = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t spawn_pending_cond;
 
 
-static const int nb_random_tests = 10;
+static void _inc_spawn_pending(void)
+{
+	pthread_mutex_lock(&spawn_pending_mutex);
+	assert(spawn_pending < UINT_MAX);
+	spawn_pending++;
+	pthread_mutex_unlock(&spawn_pending_mutex);
+}
 
 
-static void test1()
+static void _dec_spawn_pending(void)
 {
 {
-	int i;
+	pthread_mutex_lock(&spawn_pending_mutex);
+	assert(spawn_pending > 0);
+	spawn_pending--;
+	if (spawn_pending == 0)
+		pthread_cond_broadcast(&spawn_pending_cond);
+	pthread_mutex_unlock(&spawn_pending_mutex);
+}
+
+static void _wait_pending_spawns(void)
+{
+	pthread_mutex_lock(&spawn_pending_mutex);
+	while (spawn_pending > 0)
+		pthread_cond_wait(&spawn_pending_cond, &spawn_pending_mutex);
+	pthread_mutex_unlock(&spawn_pending_mutex);
+}
+
+static void spawn_callback(void *_arg)
+{
+	assert(42 == (uintptr_t)_arg);
+	_dec_spawn_pending();
+}
+
+static void usage(void)
+{
+	fprintf(stderr, "dgemm: M N K <trans_A=T|N> <trans_B=[T|N]>\n");
+	exit(EXIT_FAILURE);
 }
 }
 
 
 static void init_rm_infos(void)
 static void init_rm_infos(void)
@@ -30,236 +86,223 @@ static void init_rm_infos(void)
 		exit(77);
 		exit(77);
 	}
 	}
 
 
+	int cuda_type = starpurm_get_device_type_id("cuda");
+	int nb_cuda_units = starpurm_get_nb_devices_by_type(cuda_type);
+
 	rm_cpu_type_id = cpu_type;
 	rm_cpu_type_id = cpu_type;
+	rm_cuda_type_id = cuda_type;
 	rm_nb_cpu_units = nb_cpu_units;
 	rm_nb_cpu_units = nb_cpu_units;
+	rm_nb_cuda_units = nb_cuda_units;
 }
 }
 
 
-static void disp_selected_cpuset(void)
+
+static void disp_cpuset(hwloc_cpuset_t selected_cpuset)
 {
 {
-	hwloc_cpuset_t selected_cpuset = starpurm_get_selected_cpuset();
+	//hwloc_cpuset_t selected_cpuset = starpurm_get_selected_cpuset();
 	int strl = hwloc_bitmap_snprintf(NULL, 0, selected_cpuset);
 	int strl = hwloc_bitmap_snprintf(NULL, 0, selected_cpuset);
 	char str[strl+1];
 	char str[strl+1];
 	hwloc_bitmap_snprintf(str, strl+1, selected_cpuset);
 	hwloc_bitmap_snprintf(str, strl+1, selected_cpuset);
-	printf("selected cpuset = %s\n", str);
+	printf("%llx: selected cpuset = %s\n", (unsigned long long)pthread_self(), str);
 }
 }
 
 
-int main( int argc, char const *argv[])
+struct s_test_args
 {
 {
-	starpurm_initialize();
-	init_rm_infos();
-	printf("using default units\n");
-	disp_selected_cpuset();
-	test1();
-	starpurm_shutdown();
-#if 0
+	const int m;
+	const int n;
+	const int k;
+	int transA;
+	int transB;
+};
 
 
-	if(argc < 6 || argc > 6)
-	{ 		
-		fprintf(stderr, "Usage: ./test_dgemm M N K TRANS_A TRANS_B\n" );
-		return 1;
-	}
-	
-	// Local variables
-	int i, j;
-	int m, n, k;
-	const char *transA_input = NULL;
-	const char *transB_input = NULL;
-	enum DDSS_TRANS transA = Trans;
-	enum DDSS_TRANS transB = Trans;
-	double alpha; 
-	double beta;
-	double error;
-	double max_error;
-	double count_error;	
-	double *A;
-	double *B;
-	double *C;
-	double *C_test;
-	struct timeval start, end;
-	double flops;
-	double flops_ddss; 
-	double flops_ref; 
-	int ret;
-	m = atoi( argv[1] );
-	n = atoi( argv[2] );
-	k = atoi( argv[3] );
-	
-	if ( strlen( argv[4] ) != 1 ) 
-	{
-		fprintf(stderr,"Illegal value of TRANS_A, TRANS_A can be T or N\n");
-		return 1;
-	}
-	transA_input = argv[4];	
-	
-	if ( strlen( argv[5] ) != 1 ) 
+static void test(void *_args)
+{
+	struct s_test_args *args = _args;
+	const int m = args->m;
+	const int n = args->n;
+	const int k = args->k;
+	int transA = args->transA;
+	int transB = args->transB;
+	unsigned rand_seed = (unsigned)time(NULL);
+	double *A = malloc(m * k * sizeof(double));
+	double *B = malloc(k * n * sizeof(double));
+	double *C = calloc(m * n, sizeof(double));
+	double *C_test = calloc(m * n, sizeof(double));
+
+	const double alpha = (double)rand_r(&rand_seed) / ((double)rand_r(&rand_seed) + DBL_MIN);
+	const double beta  = (double)rand_r(&rand_seed) / ((double)rand_r(&rand_seed) + DBL_MIN);
+ 
+	int i;
+	for (i = 0; i < m; i++)
 	{
 	{
-		fprintf(stderr,"Illegal value of TRANS_B, TRANS_B can be T or N\n");
-		return 1;
+		int j;
+		for (j = 0; j < n; j++)
+		{
+			A[i*n+j] = (double)rand_r(&rand_seed) / ((double)rand_r(&rand_seed) + DBL_MIN);
+			B[i*n+j] = (double)rand_r(&rand_seed) / ((double)rand_r(&rand_seed) + DBL_MIN);
+		}
 	}
 	}
-	transB_input = argv[5];	
 
 
-	// Set seed 
-	srand(time(NULL));
+	int res = MORSE_dgemm(transA, transB, m, n, k, alpha, A, k, B, n, beta, C, n);
+#ifdef CHECK
+	/* Check */
+	cblas_dgemm( CblasColMajor, 
+			( CBLAS_TRANSPOSE ) transA,
+			( CBLAS_TRANSPOSE ) transB,
+			m, n, k,
+			alpha, A, k,
+			B, n,
+			beta, C_test, n );
 
 
-	max_error = 1.0;
-	count_error = 0.0;
+	double C_test_inorm = LAPACKE_dlange(CblasColMajor, 'I', m, n, C_test, n);
+	cblas_daxpy(m*n, -1, C, 1, C_test, 1);
+	double inorm = LAPACKE_dlange(CblasColMajor, 'I', m, n, C_test, n);
+	printf("%llx: ||C_test-C||_I / ||C_test||_I = %e\n", (unsigned long long)pthread_self(), inorm/C_test_inorm);
+#endif
+	free(A);
+	free(B);
+	free(C);
+	free(C_test);
+}
 
 
-	// Checking inputs
-	if ( m < 0 )
-	{
-		fprintf(stderr, "Illegal value of M, M must be >= 0\n");
-		return 1;
-	}
-	if ( n < 0 )
-	{
-		fprintf(stderr, "Illegal value of N, N must be >= 0\n");
-		return 1;
-	}
-	if ( k < 0 )
+static void select_units(hwloc_cpuset_t selected_cpuset, hwloc_cpuset_t available_cpuset, int offset, int nb)
+{
+	int first_idx = hwloc_bitmap_first(available_cpuset);
+	int last_idx = hwloc_bitmap_last(available_cpuset);
+	int count = 0;
+	int idx = first_idx;
+	while (idx != -1 && idx <= last_idx && count < offset+nb)
 	{
 	{
-		fprintf(stderr, "Illegal value of K, K must be >= 0\n");
-		return 1;
+		if (hwloc_bitmap_isset(available_cpuset, idx))
+		{
+			if (count >= offset)
+			{
+				hwloc_bitmap_set(selected_cpuset, idx);
+			}
+			count ++;
+		}
+		idx = hwloc_bitmap_next(available_cpuset, idx);
 	}
 	}
+	assert(count == offset+nb);
+}
+
+void spawn_tests(int cpu_offset, int cpu_nb, int cuda_offset, int cuda_nb, void *args)
+{
+	if (cpu_offset + cpu_nb > rm_nb_cpu_units)
+		exit(77);
+	if (cuda_offset + cuda_nb > rm_nb_cuda_units)
+		exit(77);
+	hwloc_cpuset_t cpu_cpuset = starpurm_get_all_cpu_workers_cpuset();
+	hwloc_cpuset_t cuda_cpuset = starpurm_get_all_device_workers_cpuset_by_type(rm_cuda_type_id);
+	hwloc_cpuset_t sel_cpuset = hwloc_bitmap_alloc();
+	assert(sel_cpuset != NULL);
+
+	select_units(sel_cpuset, cpu_cpuset, cpu_offset, cpu_nb);
+	select_units(sel_cpuset, cuda_cpuset, cuda_offset, cuda_nb);
 
 
-	if ( transA_input[0] == 'T' )
-	{
-		transA = Trans;
-	}
-	else if ( transA_input[0] == 'N' )
 	{
 	{
-		transA = NoTrans;
+		int strl1 = hwloc_bitmap_snprintf(NULL, 0, cpu_cpuset);
+		char str1[strl1+1];
+		hwloc_bitmap_snprintf(str1, strl1+1, cpu_cpuset);
+
+		int strl2 = hwloc_bitmap_snprintf(NULL, 0, cuda_cpuset);
+		char str2[strl2+1];
+		hwloc_bitmap_snprintf(str2, strl2+1, cuda_cpuset);
+		printf("all cpus cpuset = %s\n", str1);
+		
+		int strl3 = hwloc_bitmap_snprintf(NULL, 0, sel_cpuset);
+		char str3[strl3+1];
+		hwloc_bitmap_snprintf(str3, strl1+3, sel_cpuset);
+		printf("spawn on selected cpuset = %s (avail cpu %s, avail cuda %s)\n", str3, str1, str2);
 	}
 	}
+
+	_inc_spawn_pending();
+	starpurm_spawn_kernel_on_cpus_callback(NULL, test, args, sel_cpuset, spawn_callback, (void*)(uintptr_t)42);
+
+	hwloc_bitmap_free(sel_cpuset);
+	hwloc_bitmap_free(cpu_cpuset);
+	hwloc_bitmap_free(cuda_cpuset);
+}
+
+int main( int argc, char const *argv[])
+{
+	pthread_cond_init(&spawn_pending_cond, NULL);
+
+	int transA = MorseTrans;
+	int transB = MorseTrans;
+
+	if (argc < 6 || argc > 6)
+		usage();
+
+	int m = atoi(argv[1]);
+	if (m < 1)
+		usage();
+	int n = atoi(argv[2]);
+	if (n < 1)
+		usage();
+	int k = atoi(argv[3]);
+	if (k < 1)
+		usage();
+	
+	if (strcmp(argv[4], "T") == 0) 
+		transA = MorseTrans;
+	else if (strcmp(argv[4], "N") == 0) 
+		transA = MorseNoTrans;
 	else
 	else
-	{
-		fprintf(stderr, "Illegal value of TRANS_A, TRANS_A can be T or N\n");
-		return 1;
-	}
+		usage();
 	
 	
-	if ( transB_input[0] == 'T' )
-	{
-		transB = Trans;
-	}
-	else if ( transB_input[0] == 'N' )
-	{
-		transB = NoTrans;
-	}
+	if (strcmp(argv[5], "T") == 0) 
+		transB = MorseTrans;
+	else if (strcmp(argv[5], "N") == 0) 
+		transB = MorseNoTrans;
 	else
 	else
-	{
-		fprintf(stderr, "Illegal value of TRANS_B, TRANS_B can be T or N\n");
-		return 1;
-	}
+		usage();
 
 
-	// Matrices allocation
-	A = ( double * ) malloc( sizeof( double ) * m * k );
-	B = ( double * ) malloc( sizeof( double ) * k * n );
-	C = ( double * ) malloc( sizeof( double ) * m * n );
-	C_test = ( double * ) malloc( sizeof( double ) * m * n );
+	srand(time(NULL));
 
 
-	// Alpha and beta initialization
-	alpha = ( double ) rand() / (double) rand() + DBL_MIN;
-	beta  = ( double ) rand() / (double) rand() + DBL_MIN;
- 
-	// Matrix A, B, C and C_test initialization
-	for ( i = 0; i < m; i++ )
-	{
-		for ( j = 0; j < n; j++ )
-		{
-			A[ i * n + j ] = ( double ) rand() / (double) rand() 
-							  + DBL_MIN;
-			B[ i * n + j ] = ( double ) rand() / (double) rand() 
-							  + DBL_MIN;
-			C[ i * n + j ] = 0.0;
-			C_test[ i * n + j ] = 0.0;
-		}
-	}
+	struct s_test_args test_args = { .m = m, .n = n, .k = k, .transA = transA, .transB = transB };
 
 
 	/* Test case */
 	/* Test case */
-	{
-		/* pocl_starpu_init */
-		{
-			hwloc_topology_init(&topology);
-			hwloc_topology_load(topology);
-			starpurm_initialize();
-			starpurm_set_drs_enable(NULL);
-		}
+	starpurm_initialize();
+	starpurm_set_drs_enable(NULL);
+	init_rm_infos();
+	printf("cpu units: %d\n", rm_nb_cpu_units);
+	printf("cuda units: %d\n", rm_nb_cuda_units);
+	printf("using default units\n");
+	disp_cpuset(starpurm_get_selected_cpuset());
 
 
-		/* pocl_starpu_submit_task */
+	MORSE_Init(rm_nb_cpu_units, rm_nb_cuda_units);
+	test(&test_args);
+	{
+		int cpu_offset = 0;
+		int cpu_nb = rm_nb_cpu_units/2;
+		if (cpu_nb == 0 && rm_nb_cpu_units > 0)
 		{
 		{
-			/* GLIBC cpu_mask as supplied by POCL */
-			cpu_set_t cpu_mask;
-			CPU_ZERO(&cpu_mask);
-			CPU_SET (0, &cpu_mask);
-			CPU_SET (1, &cpu_mask);
-			CPU_SET (2, &cpu_mask);
-			CPU_SET (3, &cpu_mask);
-
-			/* Convert GLIBC cpu_mask into HWLOC cpuset */
-			hwloc_cpuset_t hwloc_cpuset = hwloc_bitmap_alloc();
-			int status = hwloc_cpuset_from_glibc_sched_affinity(topology, hwloc_cpuset, &cpu_mask, sizeof(cpu_set_t));
-			assert(status == 0);
-
-			/* Reset any unit previously allocated to StarPU */
-			starpurm_withdraw_all_cpus_from_starpu(NULL);
-			/* Enforce new cpu mask */
-			starpurm_assign_cpu_mask_to_starpu(NULL, hwloc_cpuset);
-
-			/* task function */
-			{
-				int TRANS_A = transA==NoTrans?MorseNoTrans:MorseTrans;
-				int TRANS_B = transB==NoTrans?MorseNoTrans:MorseTrans;
-				int M = m;
-				int N = n;
-				int K = k;
-				double ALPHA = alpha;
-				int LDA = k;
-				int LDB = n;
-				double BETA = beta;
-				int LDC = n;
-
-				MORSE_Init(4, 0);
-				int res = MORSE_dgemm(TRANS_A, TRANS_B, M, N, K,
-						ALPHA, A, LDA, B, LDB,
-						BETA, C, LDC);
-				MORSE_Finalize();
-			}
-
-			/* Withdraw all CPU units from StarPU */
-			starpurm_withdraw_all_cpus_from_starpu(NULL);
-
-			hwloc_bitmap_free(hwloc_cpuset);
+			cpu_nb = 1;
 		}
 		}
-
-		/* pocl_starpu_shutdown() */
+		int cuda_offset = 0;
+		int cuda_nb = rm_nb_cuda_units/2;
+		if (cuda_nb == 0 && rm_nb_cuda_units > 0)
 		{
 		{
-			starpurm_shutdown();
+			cuda_nb = 1;
 		}
 		}
+		spawn_tests(cpu_offset, cpu_nb, cuda_offset, cuda_nb, &test_args);
 	}
 	}
-
-#if 0
-	/* Check */
-	cblas_dgemm( CblasColMajor, 
-				 ( CBLAS_TRANSPOSE ) transA,
-				 ( CBLAS_TRANSPOSE ) transB,
-									 m, n, k,
-							 		 alpha, A, k,
-							 			    B, n,
-							 		  beta, C_test, n );
-	// Error computation
-	for ( i = 0; i < m; i++ )
 	{
 	{
-		for ( j = 0; j < n; j++ )
+		int cpu_offset = rm_nb_cpu_units/2;
+		int cpu_nb = rm_nb_cpu_units/2;
+		if (cpu_nb == 0 && rm_nb_cpu_units > 0)
 		{
 		{
-			error = abs( C[ i * n + j ] - C_test[ i * n + j ] );
-			if ( max_error > error )
-				max_error = error;
-			count_error += error;
+			cpu_nb = 1;
 		}
 		}
+		int cuda_offset = rm_nb_cuda_units/2;
+		int cuda_nb = rm_nb_cuda_units/2;
+		spawn_tests(cpu_offset, cpu_nb, cuda_offset, cuda_nb, &test_args);
 	}
 	}
+	_wait_pending_spawns();
+	MORSE_Finalize();
 
 
-	fprintf(stdout, "Max. error = %1.2f\n", max_error );
-	fprintf(stdout, "Av. error = %1.2f\n", count_error / ( m * n ) );
-#endif
-#endif
+	starpurm_shutdown();
+	pthread_cond_destroy(&spawn_pending_cond);
 
 
 	return 0;
 	return 0;
 
 

+ 4 - 1
starpurm/dev/cuda_vector_scale/vector_scale.c

@@ -238,12 +238,15 @@ int main(int argc, char *argv[])
 
 
 	if (rm_nb_cpu_units > 1 && rm_nb_cuda_units > 1)
 	if (rm_nb_cpu_units > 1 && rm_nb_cuda_units > 1)
 	{
 	{
-		const int nb_cpus = rm_nb_cpu_units;
+		int nb_cpus = rm_nb_cpu_units;
 		const int nb_cudas = rm_nb_cuda_units;
 		const int nb_cudas = rm_nb_cuda_units;
 		const int cuda_type = rm_cuda_type_id;
 		const int cuda_type = rm_cuda_type_id;
 		printf("nb_cpu_units = %d\n", nb_cpus);
 		printf("nb_cpu_units = %d\n", nb_cpus);
 		printf("nb_cuda_units = %d\n", nb_cudas);
 		printf("nb_cuda_units = %d\n", nb_cudas);
 
 
+		/* Keep at least one CPU core */
+		nb_cpus--;
+
 		starpurm_set_drs_enable(NULL);
 		starpurm_set_drs_enable(NULL);
 		drs_enabled = starpurm_drs_enabled_p();
 		drs_enabled = starpurm_drs_enabled_p();
 		assert(drs_enabled != 0);
 		assert(drs_enabled != 0);

+ 1 - 0
starpurm/include/starpurm.h

@@ -138,6 +138,7 @@ hwloc_cpuset_t starpurm_get_global_cpuset(void);
 hwloc_cpuset_t starpurm_get_selected_cpuset(void);
 hwloc_cpuset_t starpurm_get_selected_cpuset(void);
 hwloc_cpuset_t starpurm_get_all_cpu_workers_cpuset(void);
 hwloc_cpuset_t starpurm_get_all_cpu_workers_cpuset(void);
 hwloc_cpuset_t starpurm_get_all_device_workers_cpuset(void);
 hwloc_cpuset_t starpurm_get_all_device_workers_cpuset(void);
+hwloc_cpuset_t starpurm_get_all_device_workers_cpuset_by_type(int typeid);
 
 
 #ifdef __cplusplus
 #ifdef __cplusplus
 }
 }

+ 59 - 0
starpurm/src/starpurm.c

@@ -603,8 +603,19 @@ void starpurm_initialize(void)
 	hwloc_topology_load(rm->topology);
 	hwloc_topology_load(rm->topology);
 	rm->global_cpuset = hwloc_bitmap_alloc();
 	rm->global_cpuset = hwloc_bitmap_alloc();
 	hwloc_bitmap_zero(rm->global_cpuset);
 	hwloc_bitmap_zero(rm->global_cpuset);
+	
 	rm->all_cpu_workers_cpuset = hwloc_bitmap_alloc();
 	rm->all_cpu_workers_cpuset = hwloc_bitmap_alloc();
 	hwloc_bitmap_zero(rm->all_cpu_workers_cpuset);
 	hwloc_bitmap_zero(rm->all_cpu_workers_cpuset);
+	
+	rm->all_opencl_device_workers_cpuset = hwloc_bitmap_alloc();
+	hwloc_bitmap_zero(rm->all_opencl_device_workers_cpuset);
+	
+	rm->all_cuda_device_workers_cpuset = hwloc_bitmap_alloc();
+	hwloc_bitmap_zero(rm->all_cuda_device_workers_cpuset);
+	
+	rm->all_mic_device_workers_cpuset = hwloc_bitmap_alloc();
+	hwloc_bitmap_zero(rm->all_mic_device_workers_cpuset);
+
 	rm->all_device_workers_cpuset = hwloc_bitmap_alloc();
 	rm->all_device_workers_cpuset = hwloc_bitmap_alloc();
 	hwloc_bitmap_zero(rm->all_device_workers_cpuset);
 	hwloc_bitmap_zero(rm->all_device_workers_cpuset);
 
 
@@ -705,6 +716,7 @@ void starpurm_initialize(void)
 		rm->units[unitid].worker_cpuset = starpu_worker_get_hwloc_cpuset(rm->units[unitid].workerid);
 		rm->units[unitid].worker_cpuset = starpu_worker_get_hwloc_cpuset(rm->units[unitid].workerid);
 		pthread_cond_init(&rm->units[unitid].unit_available_cond, NULL);
 		pthread_cond_init(&rm->units[unitid].unit_available_cond, NULL);
 		hwloc_bitmap_or(rm->global_cpuset, rm->global_cpuset, rm->units[unitid].worker_cpuset);
 		hwloc_bitmap_or(rm->global_cpuset, rm->global_cpuset, rm->units[unitid].worker_cpuset);
+		hwloc_bitmap_or(rm->all_opencl_device_workers_cpuset, rm->all_opencl_device_workers_cpuset, rm->units[unitid].worker_cpuset);
 		hwloc_bitmap_or(rm->all_device_workers_cpuset, rm->all_device_workers_cpuset, rm->units[unitid].worker_cpuset);
 		hwloc_bitmap_or(rm->all_device_workers_cpuset, rm->all_device_workers_cpuset, rm->units[unitid].worker_cpuset);
 		unitid++;
 		unitid++;
 	}
 	}
@@ -725,6 +737,7 @@ void starpurm_initialize(void)
 		rm->units[unitid].worker_cpuset = starpu_worker_get_hwloc_cpuset(rm->units[unitid].workerid);
 		rm->units[unitid].worker_cpuset = starpu_worker_get_hwloc_cpuset(rm->units[unitid].workerid);
 		pthread_cond_init(&rm->units[unitid].unit_available_cond, NULL);
 		pthread_cond_init(&rm->units[unitid].unit_available_cond, NULL);
 		hwloc_bitmap_or(rm->global_cpuset, rm->global_cpuset, rm->units[unitid].worker_cpuset);
 		hwloc_bitmap_or(rm->global_cpuset, rm->global_cpuset, rm->units[unitid].worker_cpuset);
+		hwloc_bitmap_or(rm->all_cuda_device_workers_cpuset, rm->all_cuda_device_workers_cpuset, rm->units[unitid].worker_cpuset);
 		hwloc_bitmap_or(rm->all_device_workers_cpuset, rm->all_device_workers_cpuset, rm->units[unitid].worker_cpuset);
 		hwloc_bitmap_or(rm->all_device_workers_cpuset, rm->all_device_workers_cpuset, rm->units[unitid].worker_cpuset);
 		unitid++;
 		unitid++;
 	}
 	}
@@ -745,6 +758,7 @@ void starpurm_initialize(void)
 		rm->units[unitid].worker_cpuset = starpu_worker_get_hwloc_cpuset(rm->units[unitid].workerid);
 		rm->units[unitid].worker_cpuset = starpu_worker_get_hwloc_cpuset(rm->units[unitid].workerid);
 		pthread_cond_init(&rm->units[unitid].unit_available_cond, NULL);
 		pthread_cond_init(&rm->units[unitid].unit_available_cond, NULL);
 		hwloc_bitmap_or(rm->global_cpuset, rm->global_cpuset, rm->units[unitid].worker_cpuset);
 		hwloc_bitmap_or(rm->global_cpuset, rm->global_cpuset, rm->units[unitid].worker_cpuset);
+		hwloc_bitmap_or(rm->all_mic_device_workers_cpuset, rm->all_mic_device_workers_cpuset, rm->units[unitid].worker_cpuset);
 		hwloc_bitmap_or(rm->all_device_workers_cpuset, rm->all_device_workers_cpuset, rm->units[unitid].worker_cpuset);
 		hwloc_bitmap_or(rm->all_device_workers_cpuset, rm->all_device_workers_cpuset, rm->units[unitid].worker_cpuset);
 		unitid++;
 		unitid++;
 	}
 	}
@@ -851,6 +865,9 @@ void starpurm_shutdown(void)
 
 
 	hwloc_bitmap_free(rm->global_cpuset);
 	hwloc_bitmap_free(rm->global_cpuset);
 	hwloc_bitmap_free(rm->all_cpu_workers_cpuset);
 	hwloc_bitmap_free(rm->all_cpu_workers_cpuset);
+	hwloc_bitmap_free(rm->all_opencl_device_workers_cpuset);
+	hwloc_bitmap_free(rm->all_cuda_device_workers_cpuset);
+	hwloc_bitmap_free(rm->all_mic_device_workers_cpuset);
 	hwloc_bitmap_free(rm->all_device_workers_cpuset);
 	hwloc_bitmap_free(rm->all_device_workers_cpuset);
 	hwloc_bitmap_free(rm->selected_cpuset);
 	hwloc_bitmap_free(rm->selected_cpuset);
 
 
@@ -1576,6 +1593,33 @@ hwloc_cpuset_t starpurm_get_all_cpu_workers_cpuset(void)
 	return hwloc_bitmap_dup(rm->all_cpu_workers_cpuset);
 	return hwloc_bitmap_dup(rm->all_cpu_workers_cpuset);
 }
 }
 
 
+static hwloc_cpuset_t starpurm_get_all_opencl_device_workers_cpuset(void)
+{
+	assert(_starpurm != NULL);
+	assert(_starpurm->state != state_uninitialized);
+	struct s_starpurm *rm = _starpurm;
+
+	return hwloc_bitmap_dup(rm->all_opencl_device_workers_cpuset);
+}
+
+static hwloc_cpuset_t starpurm_get_all_cuda_device_workers_cpuset(void)
+{
+	assert(_starpurm != NULL);
+	assert(_starpurm->state != state_uninitialized);
+	struct s_starpurm *rm = _starpurm;
+
+	return hwloc_bitmap_dup(rm->all_cuda_device_workers_cpuset);
+}
+
+static hwloc_cpuset_t starpurm_get_all_mic_device_workers_cpuset(void)
+{
+	assert(_starpurm != NULL);
+	assert(_starpurm->state != state_uninitialized);
+	struct s_starpurm *rm = _starpurm;
+
+	return hwloc_bitmap_dup(rm->all_mic_device_workers_cpuset);
+}
+
 hwloc_cpuset_t starpurm_get_all_device_workers_cpuset(void)
 hwloc_cpuset_t starpurm_get_all_device_workers_cpuset(void)
 {
 {
 	assert(_starpurm != NULL);
 	assert(_starpurm != NULL);
@@ -1585,3 +1629,18 @@ hwloc_cpuset_t starpurm_get_all_device_workers_cpuset(void)
 	return hwloc_bitmap_dup(rm->all_device_workers_cpuset);
 	return hwloc_bitmap_dup(rm->all_device_workers_cpuset);
 }
 }
 
 
+hwloc_cpuset_t starpurm_get_all_device_workers_cpuset_by_type(int typeid)
+{
+	assert(_starpurm != NULL);
+	assert(_starpurm->state != state_uninitialized);
+	assert(typeid != starpurm_unit_cpu);
+	if (typeid == starpurm_unit_opencl)
+		return starpurm_get_all_opencl_device_workers_cpuset();
+	if (typeid == starpurm_unit_cuda)
+		return starpurm_get_all_cuda_device_workers_cpuset();
+	if (typeid == starpurm_unit_mic)
+		return starpurm_get_all_mic_device_workers_cpuset();
+	hwloc_cpuset_t empty_bitmap = hwloc_bitmap_alloc();
+	hwloc_bitmap_zero(empty_bitmap);
+	return empty_bitmap;
+}

+ 4 - 1
starpurm/src/starpurm_dlb.c

@@ -22,12 +22,15 @@
 #include <stdio.h>
 #include <stdio.h>
 #include <string.h>
 #include <string.h>
 #include <assert.h>
 #include <assert.h>
+#include <config.h>
+
 #include <hwloc.h>
 #include <hwloc.h>
+#ifdef HAVE_HWLOC_GLIBC_SCHED_H
 #include <hwloc/glibc-sched.h>
 #include <hwloc/glibc-sched.h>
+#endif
 #include <pthread.h>
 #include <pthread.h>
 #include <starpu.h>
 #include <starpu.h>
 #include <starpurm.h>
 #include <starpurm.h>
-#include <config.h>
 #include <starpurm_private.h>
 #include <starpurm_private.h>
 
 
 #ifndef STARPURM_HAVE_DLB
 #ifndef STARPURM_HAVE_DLB

+ 9 - 0
starpurm/src/starpurm_private.h

@@ -79,6 +79,15 @@ struct s_starpurm
 	/* Cpuset of all StarPU CPU workers. */
 	/* Cpuset of all StarPU CPU workers. */
 	hwloc_cpuset_t all_cpu_workers_cpuset;
 	hwloc_cpuset_t all_cpu_workers_cpuset;
 
 
+	/* Cpuset of all StarPU OpenCL workers. */
+	hwloc_cpuset_t all_opencl_device_workers_cpuset;
+
+	/* Cpuset of all StarPU CUDA workers. */
+	hwloc_cpuset_t all_cuda_device_workers_cpuset;
+
+	/* Cpuset of all StarPU MIC workers. */
+	hwloc_cpuset_t all_mic_device_workers_cpuset;
+
 	/* Cpuset of all StarPU device workers. */
 	/* Cpuset of all StarPU device workers. */
 	hwloc_cpuset_t all_device_workers_cpuset;
 	hwloc_cpuset_t all_device_workers_cpuset;