
Merge branch 'starpurm' of git+ssh://scm.gforge.inria.fr/gitroot/starpu/starpu into starpurm

Olivier Aumage 6 years ago
parent
commit
eb5c169aeb

+ 9 - 0
configure.ac

@@ -1327,10 +1327,19 @@ if test x$enable_cuda = xyes; then
 		NVCCFLAGS="${NVCCFLAGS} -m64"
 	fi
 
+	SAVED_CPPFLAGS="${CPPFLAGS}"
+	CPPFLAGS="${CPPFLAGS} ${STARPU_CUDA_CPPFLAGS}"
+	SAVED_LDFLAGS="${LDFLAGS}"
+	LDFLAGS="${LDFLAGS} ${STARPU_CUDA_LDFLAGS}"
 	AC_CHECK_HEADERS([cuda_gl_interop.h])
 
 	AC_CHECK_LIB([cusparse], [cusparseCreate])
 	AC_CHECK_DECLS([cusparseSetStream], [], [], [[#include <cusparse.h>]])
+
+	AC_CHECK_HEADER([nvml.h],
+	  [AC_CHECK_LIB([nvidia-ml], [nvmlDeviceGetTotalEnergyConsumption])])
+	CPPFLAGS="${SAVED_CPPFLAGS}"
+	LDFLAGS="${SAVED_LDFLAGS}"
 fi
 
 dnl Hey dude, are you around?

+ 132 - 0
contrib/ci.inria.fr/Jenkinsfile-basic

@@ -0,0 +1,132 @@
+#!groovy
+// StarPU --- Runtime system for heterogeneous multicore architectures.
+//
+// Copyright (C) 2018                                CNRS
+//
+// StarPU is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as published by
+// the Free Software Foundation; either version 2.1 of the License, or (at
+// your option) any later version.
+//
+// StarPU is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+//
+// See the GNU Lesser General Public License in COPYING.LGPL for more details.
+//
+
+def statusHasChanged = false
+
+pipeline
+{
+	agent none
+
+	// Trigger the build
+	triggers
+	{
+		// Poll the SCM explicitly every 15 minutes
+		pollSCM('00-59/15 * * * *')
+	}
+
+	stages
+	{
+		stage('Tarball')
+		{
+			steps
+			{
+				node('autotools')
+				{
+					checkout scm
+					sh 'contrib/ci.inria.fr/job-0-tarball.sh'
+					script
+					{
+					       env.tarballgz = sh (script: 'ls *.tar.gz', returnStdout: true).trim()
+					}
+					stash includes: "${env.tarballgz}", name: 'tarballgz'
+					stash includes: "starpu.pdf", name: 'doc'
+					// Stash those scripts because they are not in make dist
+					dir('contrib/ci.inria.fr')
+					{
+						stash includes: "job-1-check.sh", name: 'script-unix-check'
+					}
+					archiveArtifacts artifacts: "${env.tarballgz},starpu.pdf", fingerprint: true, onlyIfSuccessful: true
+					deleteDir()
+
+				}
+			}
+		}
+		stage('Check')
+		{
+			steps
+			{
+				script
+				{
+					labelToSelect = 'unix'
+					listOfNodeNames = jenkins.model.Jenkins.instance.nodes.collect
+					{
+						node -> node.getLabelString().contains(labelToSelect) ? node.name : null
+					}
+					listOfNodeNames.removeAll(Collections.singleton(null))
+
+					def p = listOfNodeNames.collectEntries
+					{
+						[ (it):
+						{
+							node(it)
+							{
+								dir('check-unix')
+								{
+									unstash 'tarballgz'
+									unstash 'script-unix-check'
+									sh 'chmod 755 job-1-check.sh && ./job-1-check.sh'
+									deleteDir()
+								}
+							}
+						}
+					]}
+					parallel p;
+				}
+			}
+		}
+	}
+
+	post
+	{
+		// hooks are called in order: always, changed, aborted, failure, success, unstable
+		changed
+		{
+			echo "Build status has changed."
+			script
+			{
+
+				statusHasChanged = true
+			}
+		}
+		success
+		{
+			echo "Build success."
+			// email when changed to success
+			script
+			{
+				if (statusHasChanged)
+				{
+					emailext(body: '${DEFAULT_CONTENT}',
+						 subject: '${DEFAULT_SUBJECT}',
+						 replyTo: '$DEFAULT_REPLYTO',
+						 to: '$DEFAULT_RECIPIENTS',
+						 recipientProviders: [[$class: 'CulpritsRecipientProvider'],[$class: 'RequesterRecipientProvider']])
+				}
+			}
+		}
+		failure
+		{
+			echo "Build failure."
+			// always email on failure
+			emailext(body: '${DEFAULT_CONTENT}',
+				 subject: '${DEFAULT_SUBJECT}',
+				 replyTo: '$DEFAULT_REPLYTO',
+				 to: '$DEFAULT_RECIPIENTS',
+				 recipientProviders: [[$class: 'CulpritsRecipientProvider'],[$class: 'RequesterRecipientProvider']])
+		}
+	}
+}

+ 30 - 0
contrib/ci.inria.fr/job-0-tarball.sh

@@ -0,0 +1,30 @@
+#!/bin/sh
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2018                                CNRS
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+export PKG_CONFIG_PATH=/home/ci/usr/local/lib/pkgconfig:$PKG_CONFIG_PATH
+export LD_LIBRARY_PATH=/home/ci/usr/local/lib:$LD_LIBRARY_PATH
+
+./autogen.sh
+if test -d build ; then chmod -R 777 build && rm -rf build ; fi
+mkdir build && cd build
+../configure
+make V=1
+make dist
+cp *gz ..
+cp doc/doxygen/starpu.pdf ..
+make clean
+

+ 85 - 0
contrib/ci.inria.fr/job-1-check.sh

@@ -0,0 +1,85 @@
+#!/bin/sh
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2013-2018                                CNRS
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+set -e
+set -x
+
+export PKG_CONFIG_PATH=/home/ci/usr/local/lib/pkgconfig:$PKG_CONFIG_PATH
+export LD_LIBRARY_PATH=/home/ci/usr/local/lib:$LD_LIBRARY_PATH
+
+tarball=$(ls -tr starpu-*.tar.gz | tail -1)
+
+if test -z "$tarball"
+then
+    echo Error. No tar.gz file
+    ls
+    pwd
+    exit 1
+fi
+
+basename=$(basename $tarball .tar.gz)
+export STARPU_HOME=$PWD/$basename/home
+mkdir -p $basename
+cd $basename
+env > $PWD/env
+
+test -d $basename && chmod -R u+rwX $basename && rm -rf $basename
+tar xfz ../$tarball
+cd $basename
+mkdir build
+cd build
+
+STARPU_CONFIGURE_OPTIONS=""
+suname=$(uname)
+if test "$suname" = "Darwin"
+then
+    STARPU_CONFIGURE_OPTIONS="--without-hwloc"
+fi
+if test "$suname" = "OpenBSD"
+then
+    STARPU_CONFIGURE_OPTIONS="--without-hwloc --disable-mlr"
+fi
+if test "$suname" = "FreeBSD"
+then
+    STARPU_CONFIGURE_OPTIONS="--disable-fortran"
+fi
+
+export CC=gcc
+
+day=$(date +%u)
+if test $day -le 5
+then
+    ../configure --enable-quick-check --enable-verbose --enable-mpi-check --disable-build-doc $STARPU_CONFIGURE_OPTIONS
+else
+    ../configure --enable-long-check --enable-verbose --enable-mpi-check --disable-build-doc $STARPU_CONFIGURE_OPTIONS
+fi
+
+make
+#make check
+(make -k check || true) > ../check_$$ 2>&1
+cat ../check_$$
+make showcheck
+
+grep "^FAIL:" ../check_$$ || true
+
+make clean
+
+grep "^FAIL:" ../check_$$ || true
+
+echo "Running on $(uname -a)"
+exit $(grep "^FAIL:" ../check_$$ | wc -l)
+

+ 31 - 1
doc/doxygen/chapters/210_check_list_performance.doxy

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2011-2013,2015,2017                      Inria
  * Copyright (C) 2010-2018                                CNRS
- * Copyright (C) 2009-2011,2013-2017                      Université de Bordeaux
+ * Copyright (C) 2009-2011,2013-2018                      Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -26,6 +26,26 @@ performance, we give below a list of features which should be checked.
 For a start, you can use \ref OfflinePerformanceTools to get a Gantt chart which
 will show roughly where time is spent, and focus correspondingly.
 
+\section CheckTaskSize Check Task Size
+
+Make sure that your tasks are not too small, because the StarPU runtime overhead
+is not completely zero. You can run the <c>tasks_size_overhead.sh</c> script to
+get an idea, on your own system, of the scalability of tasks depending on their
+duration (in µs).
+
+Typically, 10µs-ish tasks are definitely too small: the CUDA overhead alone is
+much bigger than that.
+
+1ms-ish tasks may be a good start, but will not necessarily scale to many dozens
+of cores, so it's better to try to get 10ms-ish tasks.
+
+Task durations can easily be observed when performance models are defined (see
+\ref PerformanceModelExample), by using the <c>starpu_perfmodel_plot</c> or
+<c>starpu_perfmodel_display</c> tools (see \ref PerformanceOfCodelets).
+
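+As a minimal sketch (the codelet, kernel and symbol names below are only
+illustrative), attaching a history-based performance model to a codelet is
+enough for StarPU to record task durations:
+
+\code{.c}
+/* my_cpu_func is a placeholder for the application kernel */
+void my_cpu_func(void *buffers[], void *cl_arg);
+
+static struct starpu_perfmodel my_perfmodel =
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "my_codelet",
+};
+
+static struct starpu_codelet my_codelet =
+{
+	.cpu_funcs = { my_cpu_func },
+	.nbuffers = 1,
+	.modes = { STARPU_RW },
+	.model = &my_perfmodel,
+};
+\endcode
+
+After a few executions, the recorded durations can then be inspected with e.g.
+<c>starpu_perfmodel_display -s my_codelet</c>.
+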
+When using parallel tasks, the problem is even worse since StarPU has to
+synchronize the execution of tasks.
+
 \section ConfigurationImprovePerformance Configuration Which May Improve Performance
 
 The \ref enable-fast "--enable-fast" configuration option disables all
@@ -116,6 +136,16 @@ enabled by setting the environment variable \ref STARPU_NWORKER_PER_CUDA to the
 number of kernels to execute concurrently.  This is useful when kernels are
 small and do not feed the whole GPU with threads to run.
 
+Concerning memory allocation, you should really not use cudaMalloc/cudaFree
+within the kernel, since cudaFree introduces a lot of synchronizations
+within CUDA itself. You should instead add a parameter to the codelet with the
+STARPU_SCRATCH access mode. You can then pass to the task a handle registered
+with the desired size but with a NULL pointer; that handle can even be shared
+between tasks: StarPU will allocate the per-task data on the fly before task
+execution, and reuse the allocated data between tasks.
+
+See <c>examples/pi/pi_redux.c</c> for an example of use.
+
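+As a minimal sketch (the kernel, the data handle and <c>workspace_size</c>
+below are only illustrative placeholders), this amounts to:
+
+\code{.c}
+/* my_cuda_kernel, data_handle and workspace_size stand for the
+ * application's own kernel, data and buffer size */
+static struct starpu_codelet cl =
+{
+	.cuda_funcs = { my_cuda_kernel },
+	.nbuffers = 2,
+	.modes = { STARPU_RW, STARPU_SCRATCH },
+};
+
+/* home node -1 and a NULL pointer: StarPU allocates the buffer on the fly,
+ * on the memory node where the task actually runs */
+starpu_data_handle_t scratch_handle;
+starpu_vector_data_register(&scratch_handle, -1, (uintptr_t)NULL,
+			    workspace_size, sizeof(float));
+
+starpu_task_insert(&cl, STARPU_RW, data_handle,
+		   STARPU_SCRATCH, scratch_handle, 0);
+\endcode
+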
 \section OpenCL-specificOptimizations OpenCL-specific Optimizations
 
 If the kernel can be made to only use the StarPU-provided command queue or other self-allocated

+ 9 - 1
doc/doxygen/chapters/501_environment_variables.doxy

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011-2013,2015-2017                      Inria
- * Copyright (C) 2010-2017                                CNRS
+ * Copyright (C) 2010-2018                                CNRS
  * Copyright (C) 2009-2011,2013-2017                      Université de Bordeaux
  * Copyright (C) 2016                                     Uppsala University
  *
@@ -1173,6 +1173,14 @@ If StarPU doesn't find any NUMA node after these step, STARPU_MAIN_MEMORY is the
 discovered by StarPU.
 </dd>
 
+<dt>STARPU_IDLE_FILE</dt>
+<dd>
+\anchor STARPU_IDLE_FILE
+\addindex __env__STARPU_IDLE_FILE
+If the environment variable STARPU_IDLE_FILE is defined, a file with the name given by its value will be created at the end of the execution.
+The file will contain the sum of the idle times of all the workers.
+</dd>
+
 </dl>
 
 \section ConfiguringTheHypervisor Configuring The Hypervisor

+ 3 - 3
doc/doxygen/chapters/api/profiling.doxy

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2015,2017                           CNRS
- * Copyright (C) 2009-2011,2014,2016                      Université de Bordeaux
+ * Copyright (C) 2009-2011,2014,2016,2018                 Université de Bordeaux
  * Copyright (C) 2011-2012                                Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -72,7 +72,7 @@ profiling was enabled.
     Number of cycles stalled within the task, only available in the MoviSim
 
 \var double starpu_profiling_task_info::energy_consumed
-Energy consumed by the task, only available in the MoviSim
+Energy consumed by the task, in Joules
 
 \struct starpu_profiling_worker_info
 This structure contains the profiling information associated to
@@ -94,7 +94,7 @@ starpu_profiling_worker_get_info()
 \var uint64_t starpu_profiling_worker_info::stall_cycles
         Number of cycles stalled within the worker, only available in the MoviSim
 \var double starpu_profiling_worker_info::energy_consumed
-        Energy consumed by the worker, only available in the MoviSim
+        Energy consumed by the worker, in Joules
 
 \struct starpu_profiling_bus_info
 todo

+ 2 - 1
examples/Makefile.am

@@ -3,7 +3,7 @@
 # Copyright (C) 2011-2017                                Inria
 # Copyright (C) 2017                                     Erwan Leria
 # Copyright (C) 2009-2018                                Université de Bordeaux
-# Copyright (C) 2010-2015,2017                           CNRS
+# Copyright (C) 2010-2015,2017,2018                           CNRS
 # Copyright (C) 2011                                     Télécom-SudParis
 # Copyright (C) 2016                                     Uppsala University
 #
@@ -227,6 +227,7 @@ STARPU_EXAMPLES +=				\
 	filters/fmultiple_submit		\
 	filters/fmultiple_submit_readonly	\
 	filters/fmultiple_submit_implicit	\
+	filters/frecursive			\
 	tag_example/tag_example			\
 	tag_example/tag_example2		\
 	tag_example/tag_example3		\

+ 170 - 0
examples/filters/frecursive.c

@@ -0,0 +1,170 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2018                                     CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
+void cpu_codelet(void *buffers[], void *cl_arg)
+{
+        unsigned i, j;
+        int factor;
+
+	starpu_codelet_unpack_args(cl_arg, &factor, 0);
+        /* length of the matrix */
+        unsigned nx = STARPU_MATRIX_GET_NX(buffers[0]);
+        unsigned ny = STARPU_MATRIX_GET_NY(buffers[0]);
+        unsigned ld = STARPU_MATRIX_GET_LD(buffers[0]);
+        /* local copy of the matrix pointer */
+        int *val = (int *)STARPU_MATRIX_GET_PTR(buffers[0]);
+
+	FPRINTF(stderr, "computing on matrix with nx=%u, ny=%u, ld=%u\n", nx, ny, ld);
+        for(j=0; j<ny ; j++)
+	{
+                for(i=0; i<nx ; i++)
+                        val[(j*ld)+i] *= factor;
+        }
+}
+
+static struct starpu_codelet cl =
+{
+        .cpu_funcs[0] = cpu_codelet,
+        .nbuffers = 1,
+	.modes[0] = STARPU_RW,
+};
+
+#define NX 400
+#define NY 80
+#define LD NX
+#define PARTS 4
+
+int main(void)
+{
+        int *matrix;
+	starpu_data_handle_t matrix_handle;
+	starpu_data_handle_t subhandle_l1[PARTS];
+	starpu_data_handle_t subhandle_l2[PARTS][PARTS];
+	starpu_data_handle_t subhandle_l3[PARTS][PARTS][PARTS];
+	int ret;
+
+	int factor = 12;
+	int n=1;
+	int i,j,k;
+
+        ret = starpu_init(NULL);
+	if (STARPU_UNLIKELY(ret == -ENODEV))
+	{
+		return 77;
+	}
+
+	if (starpu_cpu_worker_get_count() < 1)
+	{
+		FPRINTF(stderr, "This application requires at least 1 cpu worker\n");
+		starpu_shutdown();
+		return 77;
+	}
+
+	matrix = (int*)malloc(NX * NY * sizeof(int));
+        assert(matrix);
+	starpu_matrix_data_register(&matrix_handle, STARPU_MAIN_RAM, (uintptr_t)matrix, LD, NX, NY, sizeof(int));
+
+        for(j=0 ; j<NY ; j++)
+	{
+                for(i=0 ; i<NX ; i++)
+		{
+                        matrix[(j*LD)+i] = n++;
+                }
+        }
+
+	/* Split the matrix in PARTS sub-matrices, each sub-matrix in PARTS sub-sub-matrices, and each sub-sub matrix in PARTS sub-sub-sub-matrices */
+	struct starpu_data_filter f =
+	{
+		.filter_func = starpu_matrix_filter_block,
+		.nchildren = PARTS
+	};
+	struct starpu_data_filter f2 =
+	{
+		.filter_func = starpu_matrix_filter_vertical_block,
+		.nchildren = PARTS
+	};
+	starpu_data_partition_plan(matrix_handle, &f, subhandle_l1);
+	for(i=0 ; i<PARTS ; i++)
+	{
+		starpu_data_partition_plan(subhandle_l1[i], &f2, subhandle_l2[i]);
+		for(j=0 ; j<PARTS ; j++)
+		{
+			starpu_data_partition_plan(subhandle_l2[i][j], &f, subhandle_l3[i][j]);
+		}
+	}
+
+        /* Submit a task on the first sub-matrix, on the first sub-sub-matrix of each of the other sub-matrices, and on all the sub-sub-sub-matrices of the remaining sub-sub-matrices */
+	ret = starpu_task_insert(&cl,
+				 STARPU_RW, subhandle_l1[0],
+				 STARPU_VALUE, &factor, sizeof(factor),
+				 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+	for (i=1; i<PARTS; i++)
+	{
+		ret = starpu_task_insert(&cl,
+					 STARPU_RW, subhandle_l2[i][0],
+					 STARPU_VALUE, &factor, sizeof(factor),
+					 0);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+		for (j=1; j<PARTS; j++)
+		{
+			for (k=0; k<PARTS; k++)
+			{
+				ret = starpu_task_insert(&cl,
+							 STARPU_RW, subhandle_l3[i][j][k],
+							 STARPU_VALUE, &factor, sizeof(factor),
+							 0);
+				STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+			}
+		}
+	}
+
+	for(i=0 ; i<PARTS ; i++)
+	{
+		for(j=0 ; j<PARTS ; j++)
+		{
+			starpu_data_partition_clean(subhandle_l2[i][j], PARTS, subhandle_l3[i][j]);
+
+		}
+		starpu_data_partition_clean(subhandle_l1[i], PARTS, subhandle_l2[i]);
+	}
+	starpu_data_partition_clean(matrix_handle, PARTS, subhandle_l1);
+	starpu_data_unregister(matrix_handle);
+
+	/* Check the result matrix */
+	n=1;
+	for(j=0 ; j<NY ; j++)
+	{
+		for(i=0 ; i<NX ; i++)
+		{
+			if (matrix[(j*LD)+i] != (int) n*12)
+			{
+				FPRINTF(stderr, "Incorrect result %4d != %4d\n", matrix[(j*LD)+i], n*12);
+				ret=1;
+			}
+			n++;
+		}
+	}
+
+	free(matrix);
+        starpu_shutdown();
+
+	return ret;
+}

+ 1 - 1
src/common/fxt.h

@@ -620,7 +620,7 @@ do {									\
 		}							\
 		const size_t __job_size = _starpu_job_get_data_size((job)->task->cl?(job)->task->cl->model:NULL, perf_arch, nimpl, (job));	\
 		const uint32_t __job_hash = _starpu_compute_buffers_footprint((job)->task->cl?(job)->task->cl->model:NULL, perf_arch, nimpl, (job));\
-		FUT_DO_PROBE7(_STARPU_FUT_CODELET_DETAILS, ((job)->task)->sched_ctx, __job_size, __job_hash, (job)->task->flops / 1000, (job)->task->tag_id, workerid, ((job)->job_id)); \
+		FUT_DO_PROBE7(_STARPU_FUT_CODELET_DETAILS, ((job)->task)->sched_ctx, __job_size, __job_hash, (job)->task->flops / 1000 / ((job)->task->cl && job->task->cl->type != STARPU_SEQ ? j->task_size : 1), (job)->task->tag_id, workerid, ((job)->job_id)); \
 	}								\
 } while(0);
 

+ 6 - 1
src/common/prio_list.h

@@ -167,7 +167,12 @@
 	{ \
 		/* Sort by decreasing order */ \
 		const struct ENAME##_prio_list_stage *e2 = ENAME##_node_to_list_stage_const(node); \
-		return (e2->prio - prio); \
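+		/* Compare explicitly rather than returning (e2->prio - prio), which could overflow for extreme priority values */ \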
+		if (e2->prio < prio) \
+			return -1; \
+		if (e2->prio == prio) \
+			return 0; \
+		/* e2->prio > prio */ \
+		return 1; \
 	} \
 	PRIO_LIST_INLINE struct ENAME##_prio_list_stage *ENAME##_prio_list_add(struct ENAME##_prio_list *priolist, int prio) \
 	{ \

+ 4 - 2
src/core/perfmodel/perfmodel_history.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011-2013,2016-2017                      Inria
- * Copyright (C) 2008-2017                                Université de Bordeaux
+ * Copyright (C) 2008-2018                                Université de Bordeaux
  * Copyright (C) 2010-2017                                CNRS
  * Copyright (C) 2013                                     Thibaut Lambert
  * Copyright (C) 2011                                     Télécom-SudParis
@@ -477,6 +477,8 @@ static void scan_reg_model(FILE *f, const char *path, struct starpu_perfmodel_re
 			multi_invalid = (multi_invalid||isnan(reg_model->coeff[i]));
 		}
 		reg_model->multi_valid = !multi_invalid;
+		res = fscanf(f, "\n");
+		STARPU_ASSERT_MSG(res == 0, "Incorrect performance model file %s", path);
 	}
 }
 
@@ -1763,7 +1765,7 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 
 					unsigned n = entry->nsample;
 					entry->mean = entry->sum / n;
-					entry->deviation = sqrt((fabs(entry->sum2 - (entry->sum*entry->sum))/n)/n);
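+					/* deviation = sqrt(variance), with variance = (sum(x^2) - (sum x)^2/n) / n */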
+					entry->deviation = sqrt((fabs(entry->sum2 - (entry->sum*entry->sum)/n))/n);
 				}
 
 				if (j->task->flops != 0.)

+ 2 - 0
src/core/sched_policy.c

@@ -397,6 +397,7 @@ static int _starpu_push_task_on_specific_worker(struct starpu_task *task, int wo
 			struct starpu_task *alias = starpu_task_dup(task);
 			alias->destroy = 1;
 
+			_STARPU_TRACE_JOB_PUSH(alias, alias->priority > 0);
 			worker = _starpu_get_worker_struct(combined_workerid[j]);
 			ret |= _starpu_push_local_task(worker, alias, 0);
 		}
@@ -581,6 +582,7 @@ int _starpu_push_task_to_workers(struct starpu_task *task)
 					if (job->task_size > 1)
 					{
 						alias = starpu_task_dup(task);
+						_STARPU_TRACE_JOB_PUSH(alias, alias->priority > 0);
 						alias->destroy = 1;
 					}
 					else

+ 1 - 1
src/datawizard/malloc.c

@@ -360,7 +360,7 @@ int _starpu_malloc_flags_on_node(unsigned dst_node, void **A, size_t dim, int fl
 				ret = -ENOMEM;
 		}
 
-#if defined(STARPU_SIMGRID) || defined(STARPU_USE_CUDA)
+#if (defined(STARPU_SIMGRID) && (SIMGRID_VERSION < 31500 || SIMGRID_VERSION == 31559)) || defined(STARPU_USE_CUDA)
 end:
 #endif
 	if (ret == 0)

+ 11 - 1
src/drivers/cpu/driver_cpu.c

@@ -107,12 +107,22 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_
 			/* rebind to single CPU */
 			_starpu_bind_thread_on_cpu(cpu_args->bindid, cpu_args->workerid);
 	}
+	else
+	{
+		_STARPU_TRACE_START_EXECUTING();
+	}
+
+	if (is_parallel_task)
+	{
+		STARPU_PTHREAD_BARRIER_WAIT(&j->after_work_barrier);
+		if (rank != 0)
+			_STARPU_TRACE_END_EXECUTING();
+	}
 
 	_starpu_driver_end_job(cpu_args, j, perf_arch, rank, profiling);
 
 	if (is_parallel_task)
 	{
-		STARPU_PTHREAD_BARRIER_WAIT(&j->after_work_barrier);
 #ifdef STARPU_SIMGRID
 		if (rank == 0)
 		{

+ 48 - 1
src/drivers/cuda/driver_cuda.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011-2012,2014,2016-2017                 Inria
- * Copyright (C) 2008-2017                                Université de Bordeaux
+ * Copyright (C) 2008-2018                                Université de Bordeaux
  * Copyright (C) 2010                                     Mehdi Juhoor
  * Copyright (C) 2010-2017                                CNRS
  * Copyright (C) 2011                                     Télécom-SudParis
@@ -31,6 +31,9 @@
 #ifdef HAVE_CUDA_GL_INTEROP_H
 #include <cuda_gl_interop.h>
 #endif
+#ifdef HAVE_LIBNVIDIA_ML
+#include <nvml.h>
+#endif
 #include <datawizard/memory_manager.h>
 #include <datawizard/memory_nodes.h>
 #include <datawizard/malloc.h>
@@ -53,9 +56,13 @@
 static int ncudagpus = -1;
 
 static size_t global_mem[STARPU_MAXCUDADEVS];
+#ifdef HAVE_LIBNVIDIA_ML
+static nvmlDevice_t nvmlDev[STARPU_MAXCUDADEVS];
+#endif
 int _starpu_cuda_bus_ids[STARPU_MAXCUDADEVS+STARPU_MAXNUMANODES][STARPU_MAXCUDADEVS+STARPU_MAXNUMANODES];
 #ifdef STARPU_USE_CUDA
 static cudaStream_t streams[STARPU_NMAXWORKERS];
+static char used_stream[STARPU_NMAXWORKERS];
 static cudaStream_t out_transfer_streams[STARPU_MAXCUDADEVS];
 static cudaStream_t in_transfer_streams[STARPU_MAXCUDADEVS];
 /* Note: streams are not thread-safe, so we define them for each CUDA worker
@@ -106,6 +113,9 @@ _starpu_cuda_discover_devices (struct _starpu_machine_config *config)
 	if (STARPU_UNLIKELY(cures != cudaSuccess))
 		cnt = 0;
 	config->topology.nhwcudagpus = cnt;
+#ifdef HAVE_LIBNVIDIA_ML
+	nvmlInit();
+#endif
 #endif
 }
 
@@ -215,6 +225,7 @@ cudaStream_t starpu_cuda_get_local_stream(void)
 {
 	int worker = starpu_worker_get_id_check();
 
+	used_stream[worker] = 1;
 	return streams[worker];
 }
 
@@ -520,7 +531,30 @@ static int start_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *worke
 			_starpu_simgrid_submit_job(workerid, j, &worker->perf_arch, NAN,
 				async ? &task_finished[workerid][pipeline_idx] : NULL);
 #else
+#ifdef HAVE_LIBNVIDIA_ML
+		unsigned long long energy_start = 0;
+		nvmlReturn_t nvmlRet = -1;
+		if (profiling || (cl->energy_model && cl->energy_model->benchmarking))
+		{
+			nvmlRet = nvmlDeviceGetTotalEnergyConsumption(nvmlDev[worker->devid], &energy_start);
+		}
+#endif
+
 		func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
+
+#ifdef HAVE_LIBNVIDIA_ML
+		if (nvmlRet == NVML_SUCCESS &&
+			(profiling || (cl->energy_model && cl->energy_model->benchmarking)))
+		{
+			unsigned long long energy_end;
+			nvmlRet = nvmlDeviceGetTotalEnergyConsumption(nvmlDev[worker->devid], &energy_end);
+#ifdef STARPU_DEVEL
+#warning TODO: measure idle consumption to subtract it
+#endif
+			if (nvmlRet == NVML_SUCCESS)
+				task->profiling_info->energy_consumed += (energy_end - energy_start) / 1000.;
+		}
+#endif
 #endif
 		_STARPU_TRACE_END_EXECUTING();
 	}
@@ -581,6 +615,14 @@ static void execute_job_on_cuda(struct starpu_task *task, struct _starpu_worker
 		}
 	}
 
+#ifndef STARPU_SIMGRID
+	if (!used_stream[workerid])
+	{
+		used_stream[workerid] = 1;
+		_STARPU_DISP("Warning: starpu_cuda_get_local_stream() was not used to submit kernel to CUDA on worker %d. CUDA will thus introduce a lot of useless synchronizations, which will prevent proper overlapping of data transfers and kernel execution. See the CUDA-specific part of the 'Check List When Performance Are Not There' of the StarPU handbook\n", workerid);
+	}
+#endif
+
 	if (task->cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC)
 	{
 		if (worker->pipeline_length == 0)
@@ -682,6 +724,11 @@ int _starpu_cuda_driver_init(struct _starpu_worker_set *worker_set)
 
 #if defined(STARPU_HAVE_BUSID) && !defined(STARPU_SIMGRID)
 #if defined(STARPU_HAVE_DOMAINID) && !defined(STARPU_SIMGRID)
+#ifdef HAVE_LIBNVIDIA_ML
+		char busid[13];
+		snprintf(busid, sizeof(busid), "%04x:%02x:%02x.0", props[devid].pciDomainID, props[devid].pciBusID, props[devid].pciDeviceID);
+		nvmlDeviceGetHandleByPciBusId(busid, &nvmlDev[devid]);
+#endif
 		if (props[devid].pciDomainID)
 			snprintf(worker->name, sizeof(worker->name), "CUDA %u.%u (%s %.1f GiB %04x:%02x:%02x.0)", devid, subdev, devname, size, props[devid].pciDomainID, props[devid].pciBusID, props[devid].pciDeviceID);
 		else

+ 3 - 1
src/sched_policies/component_worker.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2011-2014,2017                           Inria
  * Copyright (C) 2010-2012,2014-2017                      CNRS
- * Copyright (C) 2010-2017                                Université de Bordeaux
+ * Copyright (C) 2010-2018                                Université de Bordeaux
  * Copyright (C) 2011                                     Télécom-SudParis
  * Copyright (C) 2013                                     Simon Archipoff
  *
@@ -631,6 +631,7 @@ static int combined_worker_push_task(struct starpu_sched_component * component,
 	task_alias[0]->task->destroy = 1;
 	task_alias[0]->left = NULL;
 	task_alias[0]->ntasks = combined_worker->worker_size;
+	_STARPU_TRACE_JOB_PUSH(task_alias[0]->task, task_alias[0]->task->priority > 0);
 	int i;
 	for(i = 1; i < combined_worker->worker_size; i++)
 	{
@@ -641,6 +642,7 @@ static int combined_worker_push_task(struct starpu_sched_component * component,
 		task_alias[i]->left = task_alias[i-1];
 		task_alias[i - 1]->right = task_alias[i];
 		task_alias[i]->pntasks = &(task_alias[0]->ntasks);
+		_STARPU_TRACE_JOB_PUSH(task_alias[i]->task, task_alias[i]->task->priority > 0);
 	}
 
 	starpu_pthread_mutex_t * mutex_to_unlock = NULL;

+ 3 - 0
src/sched_policies/parallel_eager.c

@@ -342,6 +342,7 @@ static struct starpu_task *pop_task_peager_policy(unsigned sched_ctx_id)
 		struct starpu_task *alias = starpu_task_dup(task);
 		int local_worker = combined_workerid[i];
 		alias->destroy = 1;
+		_STARPU_TRACE_JOB_PUSH(alias, alias->priority > 0);
 		_starpu_fifo_push_task(data->local_fifo[local_worker], alias);
 	}
 
@@ -352,6 +353,8 @@ static struct starpu_task *pop_task_peager_policy(unsigned sched_ctx_id)
 
 	STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex);
 
+	_STARPU_TRACE_JOB_PUSH(master_alias, master_alias->priority > 0);
+
 	for (i = 1; i < worker_size; i++)
 	{
 		int local_worker = combined_workerid[i];

+ 2 - 1
src/sched_policies/parallel_heft.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011-2013,2015,2017                      Inria
- * Copyright (C) 2010-2017                                Université de Bordeaux
+ * Copyright (C) 2010-2018                                Université de Bordeaux
  * Copyright (C) 2011-2017                                CNRS
  * Copyright (C) 2011                                     Télécom-SudParis
  *
@@ -175,6 +175,7 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 			ntasks[local_combined_workerid]++;
 			_starpu_worker_unlock(local_combined_workerid);
 
+			_STARPU_TRACE_JOB_PUSH(alias, alias->priority > 0);
 			ret |= starpu_push_local_task(local_combined_workerid, alias, prio);
 		}
 

+ 238 - 195
starpurm/dev/chameleon_test/dgemm.c

@@ -1,3 +1,21 @@
+/* StarPURM --- StarPU Resource Management Layer.
+ *
+ * Copyright (C) 2017, 2018  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/* This example shows a basic DGEMM test running on top of StarPURM, using the Chameleon/MORSE library */
+
 #define _GNU_SOURCE
 #include <sched.h>
 #include <stdio.h>
@@ -6,18 +24,56 @@
 #include <morse.h>
 #include <starpurm.h>
 #include <hwloc.h>
+#include <pthread.h>
+
+#define CHECK
 
 static int rm_cpu_type_id = -1;
+static int rm_cuda_type_id = -1;
 static int rm_nb_cpu_units = 0;
+static int rm_nb_cuda_units = 0;
+static const int nb_random_tests = 10;
 
-static void test1();
-static void init_rm_infos(void);
+static unsigned spawn_pending = 0;
+static pthread_mutex_t spawn_pending_mutex = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t spawn_pending_cond;
 
-static const int nb_random_tests = 10;
+static void _inc_spawn_pending(void)
+{
+	pthread_mutex_lock(&spawn_pending_mutex);
+	assert(spawn_pending < UINT_MAX);
+	spawn_pending++;
+	pthread_mutex_unlock(&spawn_pending_mutex);
+}
 
-static void test1()
+static void _dec_spawn_pending(void)
 {
-	int i;
+	pthread_mutex_lock(&spawn_pending_mutex);
+	assert(spawn_pending > 0);
+	spawn_pending--;
+	if (spawn_pending == 0)
+		pthread_cond_broadcast(&spawn_pending_cond);
+	pthread_mutex_unlock(&spawn_pending_mutex);
+}
+
+static void _wait_pending_spawns(void)
+{
+	pthread_mutex_lock(&spawn_pending_mutex);
+	while (spawn_pending > 0)
+		pthread_cond_wait(&spawn_pending_cond, &spawn_pending_mutex);
+	pthread_mutex_unlock(&spawn_pending_mutex);
+}
+
+static void spawn_callback(void *_arg)
+{
+	assert(42 == (uintptr_t)_arg);
+	_dec_spawn_pending();
+}
+
+static void usage(void)
+{
+	fprintf(stderr, "dgemm: M N K <trans_A=T|N> <trans_B=T|N>\n");
+	exit(EXIT_FAILURE);
 }
 
 static void init_rm_infos(void)
@@ -30,236 +86,223 @@ static void init_rm_infos(void)
 		exit(77);
 	}
 
+	int cuda_type = starpurm_get_device_type_id("cuda");
+	int nb_cuda_units = starpurm_get_nb_devices_by_type(cuda_type);
+
 	rm_cpu_type_id = cpu_type;
+	rm_cuda_type_id = cuda_type;
 	rm_nb_cpu_units = nb_cpu_units;
+	rm_nb_cuda_units = nb_cuda_units;
 }
 
-static void disp_selected_cpuset(void)
+
+static void disp_cpuset(hwloc_cpuset_t selected_cpuset)
 {
-	hwloc_cpuset_t selected_cpuset = starpurm_get_selected_cpuset();
+	//hwloc_cpuset_t selected_cpuset = starpurm_get_selected_cpuset();
 	int strl = hwloc_bitmap_snprintf(NULL, 0, selected_cpuset);
 	char str[strl+1];
 	hwloc_bitmap_snprintf(str, strl+1, selected_cpuset);
-	printf("selected cpuset = %s\n", str);
+	printf("%llx: selected cpuset = %s\n", (unsigned long long)pthread_self(), str);
 }
 
-int main( int argc, char const *argv[])
+struct s_test_args
 {
-	starpurm_initialize();
-	init_rm_infos();
-	printf("using default units\n");
-	disp_selected_cpuset();
-	test1();
-	starpurm_shutdown();
-#if 0
+	const int m;
+	const int n;
+	const int k;
+	int transA;
+	int transB;
+};
 
-	if(argc < 6 || argc > 6)
-	{ 		
-		fprintf(stderr, "Usage: ./test_dgemm M N K TRANS_A TRANS_B\n" );
-		return 1;
-	}
-	
-	// Local variables
-	int i, j;
-	int m, n, k;
-	const char *transA_input = NULL;
-	const char *transB_input = NULL;
-	enum DDSS_TRANS transA = Trans;
-	enum DDSS_TRANS transB = Trans;
-	double alpha; 
-	double beta;
-	double error;
-	double max_error;
-	double count_error;	
-	double *A;
-	double *B;
-	double *C;
-	double *C_test;
-	struct timeval start, end;
-	double flops;
-	double flops_ddss; 
-	double flops_ref; 
-	int ret;
-	m = atoi( argv[1] );
-	n = atoi( argv[2] );
-	k = atoi( argv[3] );
-	
-	if ( strlen( argv[4] ) != 1 ) 
-	{
-		fprintf(stderr,"Illegal value of TRANS_A, TRANS_A can be T or N\n");
-		return 1;
-	}
-	transA_input = argv[4];	
-	
-	if ( strlen( argv[5] ) != 1 ) 
+static void test(void *_args)
+{
+	struct s_test_args *args = _args;
+	const int m = args->m;
+	const int n = args->n;
+	const int k = args->k;
+	int transA = args->transA;
+	int transB = args->transB;
+	unsigned rand_seed = (unsigned)time(NULL);
+	double *A = malloc(m * k * sizeof(double));
+	double *B = malloc(k * n * sizeof(double));
+	double *C = calloc(m * n, sizeof(double));
+	double *C_test = calloc(m * n, sizeof(double));
+
+	const double alpha = (double)rand_r(&rand_seed) / ((double)rand_r(&rand_seed) + DBL_MIN);
+	const double beta  = (double)rand_r(&rand_seed) / ((double)rand_r(&rand_seed) + DBL_MIN);
+ 
+	int i;
+	for (i = 0; i < m; i++)
 	{
-		fprintf(stderr,"Illegal value of TRANS_B, TRANS_B can be T or N\n");
-		return 1;
+		int j;
+		for (j = 0; j < n; j++)
+		{
+			A[i*n+j] = (double)rand_r(&rand_seed) / ((double)rand_r(&rand_seed) + DBL_MIN);
+			B[i*n+j] = (double)rand_r(&rand_seed) / ((double)rand_r(&rand_seed) + DBL_MIN);
+		}
 	}
-	transB_input = argv[5];	
 
-	// Set seed 
-	srand(time(NULL));
+	int res = MORSE_dgemm(transA, transB, m, n, k, alpha, A, k, B, n, beta, C, n);
+#ifdef CHECK
+	/* Check */
+	cblas_dgemm( CblasColMajor, 
+			( CBLAS_TRANSPOSE ) transA,
+			( CBLAS_TRANSPOSE ) transB,
+			m, n, k,
+			alpha, A, k,
+			B, n,
+			beta, C_test, n );
 
-	max_error = 1.0;
-	count_error = 0.0;
+	double C_test_inorm = LAPACKE_dlange(CblasColMajor, 'I', m, n, C_test, n);
+	cblas_daxpy(m*n, -1, C, 1, C_test, 1);
+	double inorm = LAPACKE_dlange(CblasColMajor, 'I', m, n, C_test, n);
+	printf("%llx: ||C_test-C||_I / ||C_test||_I = %e\n", (unsigned long long)pthread_self(), inorm/C_test_inorm);
+#endif
+	free(A);
+	free(B);
+	free(C);
+	free(C_test);
+}
 
-	// Checking inputs
-	if ( m < 0 )
-	{
-		fprintf(stderr, "Illegal value of M, M must be >= 0\n");
-		return 1;
-	}
-	if ( n < 0 )
-	{
-		fprintf(stderr, "Illegal value of N, N must be >= 0\n");
-		return 1;
-	}
-	if ( k < 0 )
+static void select_units(hwloc_cpuset_t selected_cpuset, hwloc_cpuset_t available_cpuset, int offset, int nb)
+{
+	int first_idx = hwloc_bitmap_first(available_cpuset);
+	int last_idx = hwloc_bitmap_last(available_cpuset);
+	int count = 0;
+	int idx = first_idx;
+	while (idx != -1 && idx <= last_idx && count < offset+nb)
 	{
-		fprintf(stderr, "Illegal value of K, K must be >= 0\n");
-		return 1;
+		if (hwloc_bitmap_isset(available_cpuset, idx))
+		{
+			if (count >= offset)
+			{
+				hwloc_bitmap_set(selected_cpuset, idx);
+			}
+			count ++;
+		}
+		idx = hwloc_bitmap_next(available_cpuset, idx);
 	}
+	assert(count == offset+nb);
+}
+
+void spawn_tests(int cpu_offset, int cpu_nb, int cuda_offset, int cuda_nb, void *args)
+{
+	if (cpu_offset + cpu_nb > rm_nb_cpu_units)
+		exit(77);
+	if (cuda_offset + cuda_nb > rm_nb_cuda_units)
+		exit(77);
+	hwloc_cpuset_t cpu_cpuset = starpurm_get_all_cpu_workers_cpuset();
+	hwloc_cpuset_t cuda_cpuset = starpurm_get_all_device_workers_cpuset_by_type(rm_cuda_type_id);
+	hwloc_cpuset_t sel_cpuset = hwloc_bitmap_alloc();
+	assert(sel_cpuset != NULL);
+
+	select_units(sel_cpuset, cpu_cpuset, cpu_offset, cpu_nb);
+	select_units(sel_cpuset, cuda_cpuset, cuda_offset, cuda_nb);
 
-	if ( transA_input[0] == 'T' )
-	{
-		transA = Trans;
-	}
-	else if ( transA_input[0] == 'N' )
 	{
-		transA = NoTrans;
+		int strl1 = hwloc_bitmap_snprintf(NULL, 0, cpu_cpuset);
+		char str1[strl1+1];
+		hwloc_bitmap_snprintf(str1, strl1+1, cpu_cpuset);
+
+		int strl2 = hwloc_bitmap_snprintf(NULL, 0, cuda_cpuset);
+		char str2[strl2+1];
+		hwloc_bitmap_snprintf(str2, strl2+1, cuda_cpuset);
+		printf("all cpus cpuset = %s\n", str1);
+		
+		int strl3 = hwloc_bitmap_snprintf(NULL, 0, sel_cpuset);
+		char str3[strl3+1];
+		hwloc_bitmap_snprintf(str3, strl3+1, sel_cpuset);
+		printf("spawn on selected cpuset = %s (avail cpu %s, avail cuda %s)\n", str3, str1, str2);
 	}
+
+	_inc_spawn_pending();
+	starpurm_spawn_kernel_on_cpus_callback(NULL, test, args, sel_cpuset, spawn_callback, (void*)(uintptr_t)42);
+
+	hwloc_bitmap_free(sel_cpuset);
+	hwloc_bitmap_free(cpu_cpuset);
+	hwloc_bitmap_free(cuda_cpuset);
+}
+
+int main( int argc, char const *argv[])
+{
+	pthread_cond_init(&spawn_pending_cond, NULL);
+
+	int transA = MorseTrans;
+	int transB = MorseTrans;
+
+	if (argc < 6 || argc > 6)
+		usage();
+
+	int m = atoi(argv[1]);
+	if (m < 1)
+		usage();
+	int n = atoi(argv[2]);
+	if (n < 1)
+		usage();
+	int k = atoi(argv[3]);
+	if (k < 1)
+		usage();
+	
+	if (strcmp(argv[4], "T") == 0) 
+		transA = MorseTrans;
+	else if (strcmp(argv[4], "N") == 0) 
+		transA = MorseNoTrans;
 	else
-	{
-		fprintf(stderr, "Illegal value of TRANS_A, TRANS_A can be T or N\n");
-		return 1;
-	}
+		usage();
 	
-	if ( transB_input[0] == 'T' )
-	{
-		transB = Trans;
-	}
-	else if ( transB_input[0] == 'N' )
-	{
-		transB = NoTrans;
-	}
+	if (strcmp(argv[5], "T") == 0) 
+		transB = MorseTrans;
+	else if (strcmp(argv[5], "N") == 0) 
+		transB = MorseNoTrans;
 	else
-	{
-		fprintf(stderr, "Illegal value of TRANS_B, TRANS_B can be T or N\n");
-		return 1;
-	}
+		usage();
 
-	// Matrices allocation
-	A = ( double * ) malloc( sizeof( double ) * m * k );
-	B = ( double * ) malloc( sizeof( double ) * k * n );
-	C = ( double * ) malloc( sizeof( double ) * m * n );
-	C_test = ( double * ) malloc( sizeof( double ) * m * n );
+	srand(time(NULL));
 
-	// Alpha and beta initialization
-	alpha = ( double ) rand() / (double) rand() + DBL_MIN;
-	beta  = ( double ) rand() / (double) rand() + DBL_MIN;
- 
-	// Matrix A, B, C and C_test initialization
-	for ( i = 0; i < m; i++ )
-	{
-		for ( j = 0; j < n; j++ )
-		{
-			A[ i * n + j ] = ( double ) rand() / (double) rand() 
-							  + DBL_MIN;
-			B[ i * n + j ] = ( double ) rand() / (double) rand() 
-							  + DBL_MIN;
-			C[ i * n + j ] = 0.0;
-			C_test[ i * n + j ] = 0.0;
-		}
-	}
+	struct s_test_args test_args = { .m = m, .n = n, .k = k, .transA = transA, .transB = transB };
 
 	/* Test case */
-	{
-		/* pocl_starpu_init */
-		{
-			hwloc_topology_init(&topology);
-			hwloc_topology_load(topology);
-			starpurm_initialize();
-			starpurm_set_drs_enable(NULL);
-		}
+	starpurm_initialize();
+	starpurm_set_drs_enable(NULL);
+	init_rm_infos();
+	printf("cpu units: %d\n", rm_nb_cpu_units);
+	printf("cuda units: %d\n", rm_nb_cuda_units);
+	printf("using default units\n");
+	disp_cpuset(starpurm_get_selected_cpuset());
 
-		/* pocl_starpu_submit_task */
+	MORSE_Init(rm_nb_cpu_units, rm_nb_cuda_units);
+	test(&test_args);
+	{
+		int cpu_offset = 0;
+		int cpu_nb = rm_nb_cpu_units/2;
+		if (cpu_nb == 0 && rm_nb_cpu_units > 0)
 		{
-			/* GLIBC cpu_mask as supplied by POCL */
-			cpu_set_t cpu_mask;
-			CPU_ZERO(&cpu_mask);
-			CPU_SET (0, &cpu_mask);
-			CPU_SET (1, &cpu_mask);
-			CPU_SET (2, &cpu_mask);
-			CPU_SET (3, &cpu_mask);
-
-			/* Convert GLIBC cpu_mask into HWLOC cpuset */
-			hwloc_cpuset_t hwloc_cpuset = hwloc_bitmap_alloc();
-			int status = hwloc_cpuset_from_glibc_sched_affinity(topology, hwloc_cpuset, &cpu_mask, sizeof(cpu_set_t));
-			assert(status == 0);
-
-			/* Reset any unit previously allocated to StarPU */
-			starpurm_withdraw_all_cpus_from_starpu(NULL);
-			/* Enforce new cpu mask */
-			starpurm_assign_cpu_mask_to_starpu(NULL, hwloc_cpuset);
-
-			/* task function */
-			{
-				int TRANS_A = transA==NoTrans?MorseNoTrans:MorseTrans;
-				int TRANS_B = transB==NoTrans?MorseNoTrans:MorseTrans;
-				int M = m;
-				int N = n;
-				int K = k;
-				double ALPHA = alpha;
-				int LDA = k;
-				int LDB = n;
-				double BETA = beta;
-				int LDC = n;
-
-				MORSE_Init(4, 0);
-				int res = MORSE_dgemm(TRANS_A, TRANS_B, M, N, K,
-						ALPHA, A, LDA, B, LDB,
-						BETA, C, LDC);
-				MORSE_Finalize();
-			}
-
-			/* Withdraw all CPU units from StarPU */
-			starpurm_withdraw_all_cpus_from_starpu(NULL);
-
-			hwloc_bitmap_free(hwloc_cpuset);
+			cpu_nb = 1;
 		}
-
-		/* pocl_starpu_shutdown() */
+		int cuda_offset = 0;
+		int cuda_nb = rm_nb_cuda_units/2;
+		if (cuda_nb == 0 && rm_nb_cuda_units > 0)
 		{
-			starpurm_shutdown();
+			cuda_nb = 1;
 		}
+		spawn_tests(cpu_offset, cpu_nb, cuda_offset, cuda_nb, &test_args);
 	}
-
-#if 0
-	/* Check */
-	cblas_dgemm( CblasColMajor, 
-				 ( CBLAS_TRANSPOSE ) transA,
-				 ( CBLAS_TRANSPOSE ) transB,
-									 m, n, k,
-							 		 alpha, A, k,
-							 			    B, n,
-							 		  beta, C_test, n );
-	// Error computation
-	for ( i = 0; i < m; i++ )
 	{
-		for ( j = 0; j < n; j++ )
+		int cpu_offset = rm_nb_cpu_units/2;
+		int cpu_nb = rm_nb_cpu_units/2;
+		if (cpu_nb == 0 && rm_nb_cpu_units > 0)
 		{
-			error = abs( C[ i * n + j ] - C_test[ i * n + j ] );
-			if ( max_error > error )
-				max_error = error;
-			count_error += error;
+			cpu_nb = 1;
 		}
+		int cuda_offset = rm_nb_cuda_units/2;
+		int cuda_nb = rm_nb_cuda_units/2;
+		spawn_tests(cpu_offset, cpu_nb, cuda_offset, cuda_nb, &test_args);
 	}
+	_wait_pending_spawns();
+	MORSE_Finalize();
 
-	fprintf(stdout, "Max. error = %1.2f\n", max_error );
-	fprintf(stdout, "Av. error = %1.2f\n", count_error / ( m * n ) );
-#endif
-#endif
+	starpurm_shutdown();
+	pthread_cond_destroy(&spawn_pending_cond);
 
 	return 0;
 

+ 4 - 1
starpurm/dev/cuda_vector_scale/vector_scale.c

@@ -238,12 +238,15 @@ int main(int argc, char *argv[])
 
 	if (rm_nb_cpu_units > 1 && rm_nb_cuda_units > 1)
 	{
-		const int nb_cpus = rm_nb_cpu_units;
+		int nb_cpus = rm_nb_cpu_units;
 		const int nb_cudas = rm_nb_cuda_units;
 		const int cuda_type = rm_cuda_type_id;
 		printf("nb_cpu_units = %d\n", nb_cpus);
 		printf("nb_cuda_units = %d\n", nb_cudas);
 
+		/* Keep at least one CPU core */
+		nb_cpus--;
+
 		starpurm_set_drs_enable(NULL);
 		drs_enabled = starpurm_drs_enabled_p();
 		assert(drs_enabled != 0);

+ 1 - 0
starpurm/include/starpurm.h

@@ -138,6 +138,7 @@ hwloc_cpuset_t starpurm_get_global_cpuset(void);
 hwloc_cpuset_t starpurm_get_selected_cpuset(void);
 hwloc_cpuset_t starpurm_get_all_cpu_workers_cpuset(void);
 hwloc_cpuset_t starpurm_get_all_device_workers_cpuset(void);
+hwloc_cpuset_t starpurm_get_all_device_workers_cpuset_by_type(int type_id);
 
 #ifdef __cplusplus
 }

+ 59 - 0
starpurm/src/starpurm.c

@@ -603,8 +603,19 @@ void starpurm_initialize(void)
 	hwloc_topology_load(rm->topology);
 	rm->global_cpuset = hwloc_bitmap_alloc();
 	hwloc_bitmap_zero(rm->global_cpuset);
+	
 	rm->all_cpu_workers_cpuset = hwloc_bitmap_alloc();
 	hwloc_bitmap_zero(rm->all_cpu_workers_cpuset);
+	
+	rm->all_opencl_device_workers_cpuset = hwloc_bitmap_alloc();
+	hwloc_bitmap_zero(rm->all_opencl_device_workers_cpuset);
+	
+	rm->all_cuda_device_workers_cpuset = hwloc_bitmap_alloc();
+	hwloc_bitmap_zero(rm->all_cuda_device_workers_cpuset);
+	
+	rm->all_mic_device_workers_cpuset = hwloc_bitmap_alloc();
+	hwloc_bitmap_zero(rm->all_mic_device_workers_cpuset);
+
 	rm->all_device_workers_cpuset = hwloc_bitmap_alloc();
 	hwloc_bitmap_zero(rm->all_device_workers_cpuset);
 
@@ -705,6 +716,7 @@ void starpurm_initialize(void)
 		rm->units[unitid].worker_cpuset = starpu_worker_get_hwloc_cpuset(rm->units[unitid].workerid);
 		pthread_cond_init(&rm->units[unitid].unit_available_cond, NULL);
 		hwloc_bitmap_or(rm->global_cpuset, rm->global_cpuset, rm->units[unitid].worker_cpuset);
+		hwloc_bitmap_or(rm->all_opencl_device_workers_cpuset, rm->all_opencl_device_workers_cpuset, rm->units[unitid].worker_cpuset);
 		hwloc_bitmap_or(rm->all_device_workers_cpuset, rm->all_device_workers_cpuset, rm->units[unitid].worker_cpuset);
 		unitid++;
 	}
@@ -725,6 +737,7 @@ void starpurm_initialize(void)
 		rm->units[unitid].worker_cpuset = starpu_worker_get_hwloc_cpuset(rm->units[unitid].workerid);
 		pthread_cond_init(&rm->units[unitid].unit_available_cond, NULL);
 		hwloc_bitmap_or(rm->global_cpuset, rm->global_cpuset, rm->units[unitid].worker_cpuset);
+		hwloc_bitmap_or(rm->all_cuda_device_workers_cpuset, rm->all_cuda_device_workers_cpuset, rm->units[unitid].worker_cpuset);
 		hwloc_bitmap_or(rm->all_device_workers_cpuset, rm->all_device_workers_cpuset, rm->units[unitid].worker_cpuset);
 		unitid++;
 	}
@@ -745,6 +758,7 @@ void starpurm_initialize(void)
 		rm->units[unitid].worker_cpuset = starpu_worker_get_hwloc_cpuset(rm->units[unitid].workerid);
 		pthread_cond_init(&rm->units[unitid].unit_available_cond, NULL);
 		hwloc_bitmap_or(rm->global_cpuset, rm->global_cpuset, rm->units[unitid].worker_cpuset);
+		hwloc_bitmap_or(rm->all_mic_device_workers_cpuset, rm->all_mic_device_workers_cpuset, rm->units[unitid].worker_cpuset);
 		hwloc_bitmap_or(rm->all_device_workers_cpuset, rm->all_device_workers_cpuset, rm->units[unitid].worker_cpuset);
 		unitid++;
 	}
@@ -851,6 +865,9 @@ void starpurm_shutdown(void)
 
 	hwloc_bitmap_free(rm->global_cpuset);
 	hwloc_bitmap_free(rm->all_cpu_workers_cpuset);
+	hwloc_bitmap_free(rm->all_opencl_device_workers_cpuset);
+	hwloc_bitmap_free(rm->all_cuda_device_workers_cpuset);
+	hwloc_bitmap_free(rm->all_mic_device_workers_cpuset);
 	hwloc_bitmap_free(rm->all_device_workers_cpuset);
 	hwloc_bitmap_free(rm->selected_cpuset);
 
@@ -1576,6 +1593,33 @@ hwloc_cpuset_t starpurm_get_all_cpu_workers_cpuset(void)
 	return hwloc_bitmap_dup(rm->all_cpu_workers_cpuset);
 }
 
+static hwloc_cpuset_t starpurm_get_all_opencl_device_workers_cpuset(void)
+{
+	assert(_starpurm != NULL);
+	assert(_starpurm->state != state_uninitialized);
+	struct s_starpurm *rm = _starpurm;
+
+	return hwloc_bitmap_dup(rm->all_opencl_device_workers_cpuset);
+}
+
+static hwloc_cpuset_t starpurm_get_all_cuda_device_workers_cpuset(void)
+{
+	assert(_starpurm != NULL);
+	assert(_starpurm->state != state_uninitialized);
+	struct s_starpurm *rm = _starpurm;
+
+	return hwloc_bitmap_dup(rm->all_cuda_device_workers_cpuset);
+}
+
+static hwloc_cpuset_t starpurm_get_all_mic_device_workers_cpuset(void)
+{
+	assert(_starpurm != NULL);
+	assert(_starpurm->state != state_uninitialized);
+	struct s_starpurm *rm = _starpurm;
+
+	return hwloc_bitmap_dup(rm->all_mic_device_workers_cpuset);
+}
+
 hwloc_cpuset_t starpurm_get_all_device_workers_cpuset(void)
 {
 	assert(_starpurm != NULL);
@@ -1585,3 +1629,18 @@ hwloc_cpuset_t starpurm_get_all_device_workers_cpuset(void)
 	return hwloc_bitmap_dup(rm->all_device_workers_cpuset);
 }
 
+hwloc_cpuset_t starpurm_get_all_device_workers_cpuset_by_type(int type_id)
+{
+	assert(_starpurm != NULL);
+	assert(_starpurm->state != state_uninitialized);
+	assert(type_id != starpurm_unit_cpu);
+	if (type_id == starpurm_unit_opencl)
+		return starpurm_get_all_opencl_device_workers_cpuset();
+	if (type_id == starpurm_unit_cuda)
+		return starpurm_get_all_cuda_device_workers_cpuset();
+	if (type_id == starpurm_unit_mic)
+		return starpurm_get_all_mic_device_workers_cpuset();
+	hwloc_cpuset_t empty_bitmap = hwloc_bitmap_alloc();
+	hwloc_bitmap_zero(empty_bitmap);
+	return empty_bitmap;
+}

+ 4 - 1
starpurm/src/starpurm_dlb.c

@@ -22,12 +22,15 @@
 #include <stdio.h>
 #include <string.h>
 #include <assert.h>
+#include <config.h>
+
 #include <hwloc.h>
+#ifdef HAVE_HWLOC_GLIBC_SCHED_H
 #include <hwloc/glibc-sched.h>
+#endif
 #include <pthread.h>
 #include <starpu.h>
 #include <starpurm.h>
-#include <config.h>
 #include <starpurm_private.h>
 
 #ifndef STARPURM_HAVE_DLB

+ 9 - 0
starpurm/src/starpurm_private.h

@@ -79,6 +79,15 @@ struct s_starpurm
 	/* Cpuset of all StarPU CPU workers. */
 	hwloc_cpuset_t all_cpu_workers_cpuset;
 
+	/* Cpuset of all StarPU OpenCL workers. */
+	hwloc_cpuset_t all_opencl_device_workers_cpuset;
+
+	/* Cpuset of all StarPU CUDA workers. */
+	hwloc_cpuset_t all_cuda_device_workers_cpuset;
+
+	/* Cpuset of all StarPU MIC workers. */
+	hwloc_cpuset_t all_mic_device_workers_cpuset;
+
 	/* Cpuset of all StarPU device workers. */
 	hwloc_cpuset_t all_device_workers_cpuset;