瀏覽代碼

Merge branch 'starpurm' of git+ssh://scm.gforge.inria.fr/gitroot/starpu/starpu into starpurm

Olivier Aumage 7 年之前
父節點
當前提交
eb5c169aeb

+ 9 - 0
configure.ac

@@ -1327,10 +1327,19 @@ if test x$enable_cuda = xyes; then
 		NVCCFLAGS="${NVCCFLAGS} -m64"
 		NVCCFLAGS="${NVCCFLAGS} -m64"
 	fi
 	fi
 
 
+	SAVED_CPPFLAGS="${CPPFLAGS}"
+	CPPFLAGS="${CPPFLAGS} ${STARPU_CUDA_CPPFLAGS}"
+	SAVED_LDFLAGS="${LDFLAGS}"
+	LDFLAGS="${LDFLAGS} ${STARPU_CUDA_LDFLAGS}"
 	AC_CHECK_HEADERS([cuda_gl_interop.h])
 	AC_CHECK_HEADERS([cuda_gl_interop.h])
 
 
 	AC_CHECK_LIB([cusparse], [cusparseCreate])
 	AC_CHECK_LIB([cusparse], [cusparseCreate])
 	AC_CHECK_DECLS([cusparseSetStream], [], [], [[#include <cusparse.h>]])
 	AC_CHECK_DECLS([cusparseSetStream], [], [], [[#include <cusparse.h>]])
+
+	AC_CHECK_HEADER([nvml.h],
+	  [AC_CHECK_LIB([nvidia-ml], [nvmlDeviceGetTotalEnergyConsumption])])
+        CPPFLAGS="${SAVED_CPPFLAGS}"
+	LDFLAGS="${SAVED_LDFLAGS}"
 fi
 fi
 
 
 dnl Hey dude, are you around?
 dnl Hey dude, are you around?

+ 132 - 0
contrib/ci.inria.fr/Jenkinsfile-basic

@@ -0,0 +1,132 @@
+#!groovy
+// StarPU --- Runtime system for heterogeneous multicore architectures.
+//
+// Copyright (C) 2018                                CNRS
+//
+// StarPU is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as published by
+// the Free Software Foundation; either version 2.1 of the License, or (at
+// your option) any later version.
+//
+// StarPU is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+//
+// See the GNU Lesser General Public License in COPYING.LGPL for more details.
+//
+
+def statusHasChanged = false
+
+pipeline
+{
+	agent none
+
+	// Trigger the build
+	triggers
+	{
+		// Poll gitlab explicitly every 15mn
+		pollSCM('00-59/15 * * * *')
+	}
+
+	stages
+	{
+		stage('Tarball')
+		{
+			steps
+			{
+				node('autotools')
+				{
+					checkout scm
+					sh 'contrib/ci.inria.fr/job-0-tarball.sh'
+					script
+					{
+					       env.tarballgz = sh (script: 'ls *.tar.gz', returnStdout: true).trim()
+					}
+					stash includes: "${env.tarballgz}", name: 'tarballgz'
+					stash includes: "starpu.pdf", name: 'doc'
+					// Stash those scripts because they are not in make dist
+					dir('contrib/ci.inria.fr')
+					{
+						stash includes: "job-1-check.sh", name: 'script-unix-check'
+					}
+					archiveArtifacts artifacts: "${env.tarballgz},starpu.pdf", fingerprint: true, onlyIfSuccessful: true
+					deleteDir()
+
+				}
+			}
+		}
+		stage('Check')
+		{
+			steps
+			{
+				script
+				{
+					labelToSelect = 'unix'
+					listOfNodeNames = jenkins.model.Jenkins.instance.nodes.collect
+					{
+						node -> node.getLabelString().contains(labelToSelect) ? node.name : null
+					}
+					listOfNodeNames.removeAll(Collections.singleton(null))
+
+					def p = listOfNodeNames.collectEntries
+					{
+						[ (it):
+						{
+							node(it)
+							{
+								dir('check-unix')
+								{
+									unstash 'tarballgz'
+									unstash 'script-unix-check'
+									sh 'chmod 755 job-1-check.sh && ./job-1-check.sh'
+									deleteDir()
+								}
+							}
+						}
+					]}
+					parallel p;
+				}
+			}
+		}
+	}
+
+	post
+	{
+		// hooks are called in order: always, changed, aborted, failure, success, unstable
+		changed
+		{
+			echo "Build status has changed."
+			script
+			{
+
+				statusHasChanged = true
+			}
+		}
+		success
+		{
+			echo "Build success."
+			// email when changed to success
+			script
+			{
+				if (statusHasChanged)
+				{
+					emailext(body: '${DEFAULT_CONTENT}',
+						 subject: '${DEFAULT_SUBJECT}',
+						 replyTo: '$DEFAULT_REPLYTO',
+						 to: '$DEFAULT_RECIPIENTS',
+						 recipientProviders: [[$class: 'CulpritsRecipientProvider'],[$class: 'RequesterRecipientProvider']])
+				}
+			}
+		}
+		failure
+		{
+			echo "Build failure."
+			// always email on failure
+			emailext(body: '${DEFAULT_CONTENT}',
+				 subject: '${DEFAULT_SUBJECT}',
+				 replyTo: '$DEFAULT_REPLYTO',
+				 to: '$DEFAULT_RECIPIENTS',
+				 recipientProviders: [[$class: 'CulpritsRecipientProvider'],[$class: 'RequesterRecipientProvider']])
+		}
+	}
+}

+ 30 - 0
contrib/ci.inria.fr/job-0-tarball.sh

@@ -0,0 +1,30 @@
+#!/bin/sh
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2018                                CNRS
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+export PKG_CONFIG_PATH=/home/ci/usr/local/lib/pkgconfig:$PKG_CONFIG_PATH
+export LD_LIBRARY_PATH=/home/ci/usr/local/lib:$LD_LIBRARY_PATH
+
+./autogen.sh
+if test -d build ; then chmod -R 777 build && rm -rf build ; fi
+mkdir build && cd build
+../configure
+make V=1
+make dist
+cp *gz ..
+cp doc/doxygen/starpu.pdf ..
+make clean
+

+ 85 - 0
contrib/ci.inria.fr/job-1-check.sh

@@ -0,0 +1,85 @@
+#!/bin/sh
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2013-2018                                CNRS
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+set -e
+set -x
+
+export PKG_CONFIG_PATH=/home/ci/usr/local/lib/pkgconfig:$PKG_CONFIG_PATH
+export LD_LIBRARY_PATH=/home/ci/usr/local/lib:$LD_LIBRARY_PATH
+
+tarball=$(ls -tr starpu-*.tar.gz | tail -1)
+
+if test -z "$tarball"
+then
+    echo Error. No tar.gz file
+    ls
+    pwd
+    exit 1
+fi
+
+basename=$(basename $tarball .tar.gz)
+export STARPU_HOME=$PWD/$basename/home
+mkdir -p $basename
+cd $basename
+env > $PWD/env
+
+test -d $basename && chmod -R u+rwX $basename && rm -rf $basename
+tar xfz ../$tarball
+cd $basename
+mkdir build
+cd build
+
+STARPU_CONFIGURE_OPTIONS=""
+suname=$(uname)
+if test "$suname" == "Darwin"
+then
+    STARPU_CONFIGURE_OPTIONS="--without-hwloc"
+fi
+if test "$suname" == "OpenBSD"
+then
+    STARPU_CONFIGURE_OPTIONS="--without-hwloc --disable-mlr"
+fi
+if test "$suname" == "FreeBSD"
+then
+    STARPU_CONFIGURE_OPTIONS="--disable-fortran"
+fi
+
+export CC=gcc
+
+day=$(date +%u)
+if test $day -le 5
+then
+    ../configure --enable-quick-check --enable-verbose --enable-mpi-check --disable-build-doc $STARPU_CONFIGURE_OPTIONS
+else
+    ../configure --enable-long-check --enable-verbose --enable-mpi-check --disable-build-doc $STARPU_CONFIGURE_OPTIONS
+fi
+
+make
+#make check
+(make -k check || true) > ../check_$$ 2>&1
+cat ../check_$$
+make showcheck
+
+grep "^FAIL:" ../check_$$ || true
+
+make clean
+
+grep "^FAIL:" ../check_$$ || true
+
+echo "Running on $(uname -a)"
+exit $(grep "^FAIL:" ../check_$$ | wc -l)
+

+ 31 - 1
doc/doxygen/chapters/210_check_list_performance.doxy

@@ -2,7 +2,7 @@
  *
  *
  * Copyright (C) 2011-2013,2015,2017                      Inria
  * Copyright (C) 2011-2013,2015,2017                      Inria
  * Copyright (C) 2010-2018                                CNRS
  * Copyright (C) 2010-2018                                CNRS
- * Copyright (C) 2009-2011,2013-2017                      Université de Bordeaux
+ * Copyright (C) 2009-2011,2013-2018                      Université de Bordeaux
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -26,6 +26,26 @@ performance, we give below a list of features which should be checked.
 For a start, you can use \ref OfflinePerformanceTools to get a Gantt chart which
 For a start, you can use \ref OfflinePerformanceTools to get a Gantt chart which
 will show roughly where time is spent, and focus correspondingly.
 will show roughly where time is spent, and focus correspondingly.
 
 
+\section CheckTaskSize Check Task Size
+
+Make sure that your tasks are not too small, because the StarPU runtime overhead
+is not completely zero. You can run the tasks_size_overhead.sh script to get an
+idea of the scalability of tasks depending on their duration (in µs), on your
+own system.
+
+Typically, 10µs-ish tasks are definitely too small: the CUDA overhead itself is
+much bigger than this.
+
+1ms-ish tasks may be a good start, but will not necessarily scale to many dozens
+of cores, so it's better to try to get 10ms-ish tasks.
+
+Task durations can easily be observed when performance models are defined (see
+\ref PerformanceModelExample) by using the <c>starpu_perfmodel_plot</c> or
+<c>starpu_perfmodel_display</c> tools (see \ref PerformanceOfCodelets).
+
+When using parallel tasks, the problem is even worse since StarPU has to
+synchronize the execution of tasks.
+
 \section ConfigurationImprovePerformance Configuration Which May Improve Performance
 \section ConfigurationImprovePerformance Configuration Which May Improve Performance
 
 
 The \ref enable-fast "--enable-fast" configuration option disables all
 The \ref enable-fast "--enable-fast" configuration option disables all
@@ -116,6 +136,16 @@ enabled by setting the environment variable \ref STARPU_NWORKER_PER_CUDA to the
 number of kernels to execute concurrently.  This is useful when kernels are
 number of kernels to execute concurrently.  This is useful when kernels are
 small and do not feed the whole GPU with threads to run.
 small and do not feed the whole GPU with threads to run.
 
 
+Concerning memory allocation, you should really not use cudaMalloc/cudaFree
+within the kernel, since cudaFree introduces an awful lot of synchronizations
+within CUDA itself. You should instead add a parameter to the codelet with the
+STARPU_SCRATCH mode access. You can then pass to the task a handle registered
+with the desired size but with the NULL pointer; that handle can even be
+shared between tasks: StarPU will allocate per-task data on the fly before task
+execution, and reuse the allocated data between tasks.
+
+See <c>examples/pi/pi_redux.c</c> for an example of use.
+
 \section OpenCL-specificOptimizations OpenCL-specific Optimizations
 \section OpenCL-specificOptimizations OpenCL-specific Optimizations
 
 
 If the kernel can be made to only use the StarPU-provided command queue or other self-allocated
 If the kernel can be made to only use the StarPU-provided command queue or other self-allocated

+ 9 - 1
doc/doxygen/chapters/501_environment_variables.doxy

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2011-2013,2015-2017                      Inria
  * Copyright (C) 2011-2013,2015-2017                      Inria
- * Copyright (C) 2010-2017                                CNRS
+ * Copyright (C) 2010-2018                                CNRS
  * Copyright (C) 2009-2011,2013-2017                      Université de Bordeaux
  * Copyright (C) 2009-2011,2013-2017                      Université de Bordeaux
  * Copyright (C) 2016                                     Uppsala University
  * Copyright (C) 2016                                     Uppsala University
  *
  *
@@ -1173,6 +1173,14 @@ If StarPU doesn't find any NUMA node after these step, STARPU_MAIN_MEMORY is the
 discovered by StarPU.
 discovered by StarPU.
 </dd>
 </dd>
 
 
+<dt>STARPU_IDLE_FILE</dt>
+<dd>
+\anchor STARPU_IDLE_FILE
+\addindex __env__STARPU_IDLE_FILE
+If the environment variable STARPU_IDLE_FILE is defined, a file whose name is given by the value of the variable will be created at the end of the execution.
+The file will contain the sum of the idle times of all the workers.
+</dd>
+
 </dl>
 </dl>
 
 
 \section ConfiguringTheHypervisor Configuring The Hypervisor
 \section ConfiguringTheHypervisor Configuring The Hypervisor

+ 3 - 3
doc/doxygen/chapters/api/profiling.doxy

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010-2015,2017                           CNRS
  * Copyright (C) 2010-2015,2017                           CNRS
- * Copyright (C) 2009-2011,2014,2016                      Université de Bordeaux
+ * Copyright (C) 2009-2011,2014,2016,2018                 Université de Bordeaux
  * Copyright (C) 2011-2012                                Inria
  * Copyright (C) 2011-2012                                Inria
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -72,7 +72,7 @@ profiling was enabled.
     Number of cycles stalled within the task, only available in the MoviSim
     Number of cycles stalled within the task, only available in the MoviSim
 
 
 \var double starpu_profiling_task_info::energy_consumed
 \var double starpu_profiling_task_info::energy_consumed
-Energy consumed by the task, only available in the MoviSim
+Energy consumed by the task, in Joules
 
 
 \struct starpu_profiling_worker_info
 \struct starpu_profiling_worker_info
 This structure contains the profiling information associated to
 This structure contains the profiling information associated to
@@ -94,7 +94,7 @@ starpu_profiling_worker_get_info()
 \var uint64_t starpu_profiling_worker_info::stall_cycles
 \var uint64_t starpu_profiling_worker_info::stall_cycles
         Number of cycles stalled within the worker, only available in the MoviSim
         Number of cycles stalled within the worker, only available in the MoviSim
 \var double starpu_profiling_worker_info::energy_consumed
 \var double starpu_profiling_worker_info::energy_consumed
-        Energy consumed by the worker, only available in the MoviSim
+        Energy consumed by the worker, in Joules
 
 
 \struct starpu_profiling_bus_info
 \struct starpu_profiling_bus_info
 todo
 todo

+ 2 - 1
examples/Makefile.am

@@ -3,7 +3,7 @@
 # Copyright (C) 2011-2017                                Inria
 # Copyright (C) 2011-2017                                Inria
 # Copyright (C) 2017                                     Erwan Leria
 # Copyright (C) 2017                                     Erwan Leria
 # Copyright (C) 2009-2018                                Université de Bordeaux
 # Copyright (C) 2009-2018                                Université de Bordeaux
-# Copyright (C) 2010-2015,2017                           CNRS
+# Copyright (C) 2010-2015,2017,2018                           CNRS
 # Copyright (C) 2011                                     Télécom-SudParis
 # Copyright (C) 2011                                     Télécom-SudParis
 # Copyright (C) 2016                                     Uppsala University
 # Copyright (C) 2016                                     Uppsala University
 #
 #
@@ -227,6 +227,7 @@ STARPU_EXAMPLES +=				\
 	filters/fmultiple_submit		\
 	filters/fmultiple_submit		\
 	filters/fmultiple_submit_readonly	\
 	filters/fmultiple_submit_readonly	\
 	filters/fmultiple_submit_implicit	\
 	filters/fmultiple_submit_implicit	\
+	filters/frecursive			\
 	tag_example/tag_example			\
 	tag_example/tag_example			\
 	tag_example/tag_example2		\
 	tag_example/tag_example2		\
 	tag_example/tag_example3		\
 	tag_example/tag_example3		\

+ 170 - 0
examples/filters/frecursive.c

@@ -0,0 +1,170 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2018                                     CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
+void cpu_codelet(void *buffers[], void *cl_arg)
+{
+        unsigned i, j;
+        int factor;
+
+	starpu_codelet_unpack_args(cl_arg, &factor, 0);
+        /* length of the matrix */
+        unsigned nx = STARPU_MATRIX_GET_NX(buffers[0]);
+        unsigned ny = STARPU_MATRIX_GET_NY(buffers[0]);
+        unsigned ld = STARPU_MATRIX_GET_LD(buffers[0]);
+        /* local copy of the matrix pointer */
+        int *val = (int *)STARPU_MATRIX_GET_PTR(buffers[0]);
+
+	FPRINTF(stderr, "computing on matrix with nx=%d, ny=%d, ld=%d\n", nx, ny, ld);
+        for(j=0; j<ny ; j++)
+	{
+                for(i=0; i<nx ; i++)
+                        val[(j*ld)+i] *= factor;
+        }
+}
+
+static struct starpu_codelet cl =
+{
+        .cpu_funcs[0] = cpu_codelet,
+        .nbuffers = 1,
+	.modes[0] = STARPU_RW,
+};
+
+#define NX 400
+#define NY 80
+#define LD NX
+#define PARTS 4
+
+int main(void)
+{
+        int *matrix;
+	starpu_data_handle_t matrix_handle;
+	starpu_data_handle_t subhandle_l1[PARTS];
+	starpu_data_handle_t subhandle_l2[PARTS][PARTS];
+	starpu_data_handle_t subhandle_l3[PARTS][PARTS][PARTS];
+	int ret, submit;
+
+	int factor = 12;
+	int n=1;
+	int i,j,k;
+
+        ret = starpu_init(NULL);
+	if (STARPU_UNLIKELY(ret == -ENODEV))
+	{
+		return 77;
+	}
+
+	if (starpu_cpu_worker_get_count() < 1)
+	{
+		FPRINTF(stderr, "This application requires at least 1 cpu worker\n");
+		starpu_shutdown();
+		return 77;
+	}
+
+	matrix = (int*)malloc(NX * NY * sizeof(int));
+        assert(matrix);
+	starpu_matrix_data_register(&matrix_handle, STARPU_MAIN_RAM, (uintptr_t)matrix, LD, NX, NY, sizeof(int));
+
+        for(j=0 ; j<NY ; j++)
+	{
+                for(i=0 ; i<NX ; i++)
+		{
+                        matrix[(j*LD)+i] = n++;
+                }
+        }
+
+	/* Split the matrix in PARTS sub-matrices, each sub-matrix in PARTS sub-sub-matrices, and each sub-sub matrix in PARTS sub-sub-sub-matrices */
+	struct starpu_data_filter f =
+	{
+		.filter_func = starpu_matrix_filter_block,
+		.nchildren = PARTS
+	};
+	struct starpu_data_filter f2 =
+	{
+		.filter_func = starpu_matrix_filter_vertical_block,
+		.nchildren = PARTS
+	};
+	starpu_data_partition_plan(matrix_handle, &f, subhandle_l1);
+	for(i=0 ; i<PARTS ; i++)
+	{
+		starpu_data_partition_plan(subhandle_l1[i], &f2, subhandle_l2[i]);
+		for(j=0 ; j<PARTS ; j++)
+		{
+			starpu_data_partition_plan(subhandle_l2[i][j], &f, subhandle_l3[i][j]);
+		}
+	}
+
+        /* Submit a task on the first sub-matrix and sub-sub matrix, and on all others sub-sub-matrices */
+	ret = starpu_task_insert(&cl,
+				 STARPU_RW, subhandle_l1[0],
+				 STARPU_VALUE, &factor, sizeof(factor),
+				 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+	for (i=1; i<PARTS; i++)
+	{
+		ret = starpu_task_insert(&cl,
+					 STARPU_RW, subhandle_l2[i][0],
+					 STARPU_VALUE, &factor, sizeof(factor),
+					 0);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+		for (j=1; j<PARTS; j++)
+		{
+			for (k=0; k<PARTS; k++)
+			{
+				ret = starpu_task_insert(&cl,
+							 STARPU_RW, subhandle_l3[i][j][k],
+							 STARPU_VALUE, &factor, sizeof(factor),
+							 0);
+				STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+			}
+		}
+	}
+
+	for(i=0 ; i<PARTS ; i++)
+	{
+		for(j=0 ; j<PARTS ; j++)
+		{
+			starpu_data_partition_clean(subhandle_l2[i][j], PARTS, subhandle_l3[i][j]);
+
+		}
+		starpu_data_partition_clean(subhandle_l1[i], PARTS, subhandle_l2[i]);
+	}
+	starpu_data_partition_clean(matrix_handle, PARTS, subhandle_l1);
+	starpu_data_unregister(matrix_handle);
+
+	/* Print result matrix */
+	n=1;
+	for(j=0 ; j<NY ; j++)
+	{
+		for(i=0 ; i<NX ; i++)
+		{
+			if (matrix[(j*LD)+i] != (int) n*12)
+			{
+				FPRINTF(stderr, "Incorrect result %4d != %4d", matrix[(j*LD)+i], n*12);
+				ret=1;
+			}
+			n++;
+		}
+	}
+
+	free(matrix);
+        starpu_shutdown();
+
+	return ret;
+}

+ 1 - 1
src/common/fxt.h

@@ -620,7 +620,7 @@ do {									\
 		}							\
 		}							\
 		const size_t __job_size = _starpu_job_get_data_size((job)->task->cl?(job)->task->cl->model:NULL, perf_arch, nimpl, (job));	\
 		const size_t __job_size = _starpu_job_get_data_size((job)->task->cl?(job)->task->cl->model:NULL, perf_arch, nimpl, (job));	\
 		const uint32_t __job_hash = _starpu_compute_buffers_footprint((job)->task->cl?(job)->task->cl->model:NULL, perf_arch, nimpl, (job));\
 		const uint32_t __job_hash = _starpu_compute_buffers_footprint((job)->task->cl?(job)->task->cl->model:NULL, perf_arch, nimpl, (job));\
-		FUT_DO_PROBE7(_STARPU_FUT_CODELET_DETAILS, ((job)->task)->sched_ctx, __job_size, __job_hash, (job)->task->flops / 1000, (job)->task->tag_id, workerid, ((job)->job_id)); \
+		FUT_DO_PROBE7(_STARPU_FUT_CODELET_DETAILS, ((job)->task)->sched_ctx, __job_size, __job_hash, (job)->task->flops / 1000 / ((job)->task->cl && job->task->cl->type != STARPU_SEQ ? j->task_size : 1), (job)->task->tag_id, workerid, ((job)->job_id)); \
 	}								\
 	}								\
 } while(0);
 } while(0);
 
 

+ 6 - 1
src/common/prio_list.h

@@ -167,7 +167,12 @@
 	{ \
 	{ \
 		/* Sort by decreasing order */ \
 		/* Sort by decreasing order */ \
 		const struct ENAME##_prio_list_stage *e2 = ENAME##_node_to_list_stage_const(node); \
 		const struct ENAME##_prio_list_stage *e2 = ENAME##_node_to_list_stage_const(node); \
-		return (e2->prio - prio); \
+		if (e2->prio < prio) \
+			return -1; \
+		if (e2->prio == prio) \
+			return 0; \
+		/* e2->prio > prio */ \
+		return 1; \
 	} \
 	} \
 	PRIO_LIST_INLINE struct ENAME##_prio_list_stage *ENAME##_prio_list_add(struct ENAME##_prio_list *priolist, int prio) \
 	PRIO_LIST_INLINE struct ENAME##_prio_list_stage *ENAME##_prio_list_add(struct ENAME##_prio_list *priolist, int prio) \
 	{ \
 	{ \

+ 4 - 2
src/core/perfmodel/perfmodel_history.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2011-2013,2016-2017                      Inria
  * Copyright (C) 2011-2013,2016-2017                      Inria
- * Copyright (C) 2008-2017                                Université de Bordeaux
+ * Copyright (C) 2008-2018                                Université de Bordeaux
  * Copyright (C) 2010-2017                                CNRS
  * Copyright (C) 2010-2017                                CNRS
  * Copyright (C) 2013                                     Thibaut Lambert
  * Copyright (C) 2013                                     Thibaut Lambert
  * Copyright (C) 2011                                     Télécom-SudParis
  * Copyright (C) 2011                                     Télécom-SudParis
@@ -477,6 +477,8 @@ static void scan_reg_model(FILE *f, const char *path, struct starpu_perfmodel_re
 			multi_invalid = (multi_invalid||isnan(reg_model->coeff[i]));
 			multi_invalid = (multi_invalid||isnan(reg_model->coeff[i]));
 		}
 		}
 		reg_model->multi_valid = !multi_invalid;
 		reg_model->multi_valid = !multi_invalid;
+		res = fscanf(f, "\n");
+		STARPU_ASSERT_MSG(res == 0, "Incorrect performance model file %s", path);
 	}
 	}
 }
 }
 
 
@@ -1763,7 +1765,7 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 
 
 					unsigned n = entry->nsample;
 					unsigned n = entry->nsample;
 					entry->mean = entry->sum / n;
 					entry->mean = entry->sum / n;
-					entry->deviation = sqrt((fabs(entry->sum2 - (entry->sum*entry->sum))/n)/n);
+					entry->deviation = sqrt((fabs(entry->sum2 - (entry->sum*entry->sum)/n))/n);
 				}
 				}
 
 
 				if (j->task->flops != 0.)
 				if (j->task->flops != 0.)

+ 2 - 0
src/core/sched_policy.c

@@ -397,6 +397,7 @@ static int _starpu_push_task_on_specific_worker(struct starpu_task *task, int wo
 			struct starpu_task *alias = starpu_task_dup(task);
 			struct starpu_task *alias = starpu_task_dup(task);
 			alias->destroy = 1;
 			alias->destroy = 1;
 
 
+			_STARPU_TRACE_JOB_PUSH(alias, alias->priority > 0);
 			worker = _starpu_get_worker_struct(combined_workerid[j]);
 			worker = _starpu_get_worker_struct(combined_workerid[j]);
 			ret |= _starpu_push_local_task(worker, alias, 0);
 			ret |= _starpu_push_local_task(worker, alias, 0);
 		}
 		}
@@ -581,6 +582,7 @@ int _starpu_push_task_to_workers(struct starpu_task *task)
 					if (job->task_size > 1)
 					if (job->task_size > 1)
 					{
 					{
 						alias = starpu_task_dup(task);
 						alias = starpu_task_dup(task);
+						_STARPU_TRACE_JOB_PUSH(alias, alias->priority > 0);
 						alias->destroy = 1;
 						alias->destroy = 1;
 					}
 					}
 					else
 					else

+ 1 - 1
src/datawizard/malloc.c

@@ -360,7 +360,7 @@ int _starpu_malloc_flags_on_node(unsigned dst_node, void **A, size_t dim, int fl
 				ret = -ENOMEM;
 				ret = -ENOMEM;
 		}
 		}
 
 
-#if defined(STARPU_SIMGRID) || defined(STARPU_USE_CUDA)
+#if (defined(STARPU_SIMGRID) && (SIMGRID_VERSION < 31500 || SIMGRID_VERSION == 31559)) || defined(STARPU_USE_CUDA)
 end:
 end:
 #endif
 #endif
 	if (ret == 0)
 	if (ret == 0)

+ 11 - 1
src/drivers/cpu/driver_cpu.c

@@ -107,12 +107,22 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_
 			/* rebind to single CPU */
 			/* rebind to single CPU */
 			_starpu_bind_thread_on_cpu(cpu_args->bindid, cpu_args->workerid);
 			_starpu_bind_thread_on_cpu(cpu_args->bindid, cpu_args->workerid);
 	}
 	}
+	else
+	{
+		_STARPU_TRACE_START_EXECUTING();
+	}
+
+	if (is_parallel_task)
+	{
+		STARPU_PTHREAD_BARRIER_WAIT(&j->after_work_barrier);
+		if (rank != 0)
+			_STARPU_TRACE_END_EXECUTING();
+	}
 
 
 	_starpu_driver_end_job(cpu_args, j, perf_arch, rank, profiling);
 	_starpu_driver_end_job(cpu_args, j, perf_arch, rank, profiling);
 
 
 	if (is_parallel_task)
 	if (is_parallel_task)
 	{
 	{
-		STARPU_PTHREAD_BARRIER_WAIT(&j->after_work_barrier);
 #ifdef STARPU_SIMGRID
 #ifdef STARPU_SIMGRID
 		if (rank == 0)
 		if (rank == 0)
 		{
 		{

+ 48 - 1
src/drivers/cuda/driver_cuda.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2011-2012,2014,2016-2017                 Inria
  * Copyright (C) 2011-2012,2014,2016-2017                 Inria
- * Copyright (C) 2008-2017                                Université de Bordeaux
+ * Copyright (C) 2008-2018                                Université de Bordeaux
  * Copyright (C) 2010                                     Mehdi Juhoor
  * Copyright (C) 2010                                     Mehdi Juhoor
  * Copyright (C) 2010-2017                                CNRS
  * Copyright (C) 2010-2017                                CNRS
  * Copyright (C) 2011                                     Télécom-SudParis
  * Copyright (C) 2011                                     Télécom-SudParis
@@ -31,6 +31,9 @@
 #ifdef HAVE_CUDA_GL_INTEROP_H
 #ifdef HAVE_CUDA_GL_INTEROP_H
 #include <cuda_gl_interop.h>
 #include <cuda_gl_interop.h>
 #endif
 #endif
+#ifdef HAVE_LIBNVIDIA_ML
+#include <nvml.h>
+#endif
 #include <datawizard/memory_manager.h>
 #include <datawizard/memory_manager.h>
 #include <datawizard/memory_nodes.h>
 #include <datawizard/memory_nodes.h>
 #include <datawizard/malloc.h>
 #include <datawizard/malloc.h>
@@ -53,9 +56,13 @@
 static int ncudagpus = -1;
 static int ncudagpus = -1;
 
 
 static size_t global_mem[STARPU_MAXCUDADEVS];
 static size_t global_mem[STARPU_MAXCUDADEVS];
+#ifdef HAVE_LIBNVIDIA_ML
+static nvmlDevice_t nvmlDev[STARPU_MAXCUDADEVS];
+#endif
 int _starpu_cuda_bus_ids[STARPU_MAXCUDADEVS+STARPU_MAXNUMANODES][STARPU_MAXCUDADEVS+STARPU_MAXNUMANODES];
 int _starpu_cuda_bus_ids[STARPU_MAXCUDADEVS+STARPU_MAXNUMANODES][STARPU_MAXCUDADEVS+STARPU_MAXNUMANODES];
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 static cudaStream_t streams[STARPU_NMAXWORKERS];
 static cudaStream_t streams[STARPU_NMAXWORKERS];
+static char used_stream[STARPU_NMAXWORKERS];
 static cudaStream_t out_transfer_streams[STARPU_MAXCUDADEVS];
 static cudaStream_t out_transfer_streams[STARPU_MAXCUDADEVS];
 static cudaStream_t in_transfer_streams[STARPU_MAXCUDADEVS];
 static cudaStream_t in_transfer_streams[STARPU_MAXCUDADEVS];
 /* Note: streams are not thread-safe, so we define them for each CUDA worker
 /* Note: streams are not thread-safe, so we define them for each CUDA worker
@@ -106,6 +113,9 @@ _starpu_cuda_discover_devices (struct _starpu_machine_config *config)
 	if (STARPU_UNLIKELY(cures != cudaSuccess))
 	if (STARPU_UNLIKELY(cures != cudaSuccess))
 		cnt = 0;
 		cnt = 0;
 	config->topology.nhwcudagpus = cnt;
 	config->topology.nhwcudagpus = cnt;
+#ifdef HAVE_LIBNVIDIA_ML
+	nvmlInit();
+#endif
 #endif
 #endif
 }
 }
 
 
@@ -215,6 +225,7 @@ cudaStream_t starpu_cuda_get_local_stream(void)
 {
 {
 	int worker = starpu_worker_get_id_check();
 	int worker = starpu_worker_get_id_check();
 
 
+	used_stream[worker] = 1;
 	return streams[worker];
 	return streams[worker];
 }
 }
 
 
@@ -520,7 +531,30 @@ static int start_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *worke
 			_starpu_simgrid_submit_job(workerid, j, &worker->perf_arch, NAN,
 			_starpu_simgrid_submit_job(workerid, j, &worker->perf_arch, NAN,
 				async ? &task_finished[workerid][pipeline_idx] : NULL);
 				async ? &task_finished[workerid][pipeline_idx] : NULL);
 #else
 #else
+#ifdef HAVE_LIBNVIDIA_ML
+		unsigned long long energy_start = 0;
+		nvmlReturn_t nvmlRet = -1;
+		if (profiling || (cl->energy_model && cl->energy_model->benchmarking))
+		{
+			nvmlRet = nvmlDeviceGetTotalEnergyConsumption(nvmlDev[worker->devid], &energy_start);
+		}
+#endif
+
 		func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
 		func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
+
+#ifdef HAVE_LIBNVIDIA_ML
+		if (nvmlRet == NVML_SUCCESS &&
+			(profiling || (cl->energy_model && cl->energy_model->benchmarking)))
+		{
+			unsigned long long energy_end;
+			nvmlRet = nvmlDeviceGetTotalEnergyConsumption(nvmlDev[worker->devid], &energy_end);
+#ifdef STARPU_DEVEL
+#warning TODO: measure idle consumption to subtract it
+#endif
+			if (nvmlRet == NVML_SUCCESS)
+				task->profiling_info->energy_consumed += (energy_end - energy_start) / 1000.;
+		}
+#endif
 #endif
 #endif
 		_STARPU_TRACE_END_EXECUTING();
 		_STARPU_TRACE_END_EXECUTING();
 	}
 	}
@@ -581,6 +615,14 @@ static void execute_job_on_cuda(struct starpu_task *task, struct _starpu_worker
 		}
 		}
 	}
 	}
 
 
+#ifndef STARPU_SIMGRID
+	if (!used_stream[workerid])
+	{
+		used_stream[workerid] = 1;
+		_STARPU_DISP("Warning: starpu_cuda_get_local_stream() was not used to submit kernel to CUDA on worker %d. CUDA will thus introduce a lot of useless synchronizations, which will prevent proper overlapping of data transfers and kernel execution. See the CUDA-specific part of the 'Check List When Performance Are Not There' of the StarPU handbook\n", workerid);
+	}
+#endif
+
 	if (task->cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC)
 	if (task->cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC)
 	{
 	{
 		if (worker->pipeline_length == 0)
 		if (worker->pipeline_length == 0)
@@ -682,6 +724,11 @@ int _starpu_cuda_driver_init(struct _starpu_worker_set *worker_set)
 
 
 #if defined(STARPU_HAVE_BUSID) && !defined(STARPU_SIMGRID)
 #if defined(STARPU_HAVE_BUSID) && !defined(STARPU_SIMGRID)
 #if defined(STARPU_HAVE_DOMAINID) && !defined(STARPU_SIMGRID)
 #if defined(STARPU_HAVE_DOMAINID) && !defined(STARPU_SIMGRID)
+#ifdef HAVE_LIBNVIDIA_ML
+		char busid[13];
+		snprintf(busid, sizeof(busid), "%04x:%02x:%02x.0", props[devid].pciDomainID, props[devid].pciBusID, props[devid].pciDeviceID);
+		nvmlDeviceGetHandleByPciBusId(busid, &nvmlDev[devid]);
+#endif
 		if (props[devid].pciDomainID)
 		if (props[devid].pciDomainID)
 			snprintf(worker->name, sizeof(worker->name), "CUDA %u.%u (%s %.1f GiB %04x:%02x:%02x.0)", devid, subdev, devname, size, props[devid].pciDomainID, props[devid].pciBusID, props[devid].pciDeviceID);
 			snprintf(worker->name, sizeof(worker->name), "CUDA %u.%u (%s %.1f GiB %04x:%02x:%02x.0)", devid, subdev, devname, size, props[devid].pciDomainID, props[devid].pciBusID, props[devid].pciDeviceID);
 		else
 		else

+ 3 - 1
src/sched_policies/component_worker.c

@@ -2,7 +2,7 @@
  *
  *
  * Copyright (C) 2011-2014,2017                           Inria
  * Copyright (C) 2011-2014,2017                           Inria
  * Copyright (C) 2010-2012,2014-2017                      CNRS
  * Copyright (C) 2010-2012,2014-2017                      CNRS
- * Copyright (C) 2010-2017                                Université de Bordeaux
+ * Copyright (C) 2010-2018                                Université de Bordeaux
  * Copyright (C) 2011                                     Télécom-SudParis
  * Copyright (C) 2011                                     Télécom-SudParis
  * Copyright (C) 2013                                     Simon Archipoff
  * Copyright (C) 2013                                     Simon Archipoff
  *
  *
@@ -631,6 +631,7 @@ static int combined_worker_push_task(struct starpu_sched_component * component,
 	task_alias[0]->task->destroy = 1;
 	task_alias[0]->task->destroy = 1;
 	task_alias[0]->left = NULL;
 	task_alias[0]->left = NULL;
 	task_alias[0]->ntasks = combined_worker->worker_size;
 	task_alias[0]->ntasks = combined_worker->worker_size;
+	_STARPU_TRACE_JOB_PUSH(task_alias[0]->task, task_alias[0]->task->priority > 0);
 	int i;
 	int i;
 	for(i = 1; i < combined_worker->worker_size; i++)
 	for(i = 1; i < combined_worker->worker_size; i++)
 	{
 	{
@@ -641,6 +642,7 @@ static int combined_worker_push_task(struct starpu_sched_component * component,
 		task_alias[i]->left = task_alias[i-1];
 		task_alias[i]->left = task_alias[i-1];
 		task_alias[i - 1]->right = task_alias[i];
 		task_alias[i - 1]->right = task_alias[i];
 		task_alias[i]->pntasks = &(task_alias[0]->ntasks);
 		task_alias[i]->pntasks = &(task_alias[0]->ntasks);
+		_STARPU_TRACE_JOB_PUSH(task_alias[i]->task, task_alias[i]->task->priority > 0);
 	}
 	}
 
 
 	starpu_pthread_mutex_t * mutex_to_unlock = NULL;
 	starpu_pthread_mutex_t * mutex_to_unlock = NULL;

+ 3 - 0
src/sched_policies/parallel_eager.c

@@ -342,6 +342,7 @@ static struct starpu_task *pop_task_peager_policy(unsigned sched_ctx_id)
 		struct starpu_task *alias = starpu_task_dup(task);
 		struct starpu_task *alias = starpu_task_dup(task);
 		int local_worker = combined_workerid[i];
 		int local_worker = combined_workerid[i];
 		alias->destroy = 1;
 		alias->destroy = 1;
+		_STARPU_TRACE_JOB_PUSH(alias, alias->priority > 0);
 		_starpu_fifo_push_task(data->local_fifo[local_worker], alias);
 		_starpu_fifo_push_task(data->local_fifo[local_worker], alias);
 	}
 	}
 
 
@@ -352,6 +353,8 @@ static struct starpu_task *pop_task_peager_policy(unsigned sched_ctx_id)
 
 
 	STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex);
 
 
+	_STARPU_TRACE_JOB_PUSH(master_alias, master_alias->priority > 0);
+
 	for (i = 1; i < worker_size; i++)
 	for (i = 1; i < worker_size; i++)
 	{
 	{
 		int local_worker = combined_workerid[i];
 		int local_worker = combined_workerid[i];

+ 2 - 1
src/sched_policies/parallel_heft.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2011-2013,2015,2017                      Inria
  * Copyright (C) 2011-2013,2015,2017                      Inria
- * Copyright (C) 2010-2017                                Université de Bordeaux
+ * Copyright (C) 2010-2018                                Université de Bordeaux
  * Copyright (C) 2011-2017                                CNRS
  * Copyright (C) 2011-2017                                CNRS
  * Copyright (C) 2011                                     Télécom-SudParis
  * Copyright (C) 2011                                     Télécom-SudParis
  *
  *
@@ -175,6 +175,7 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 			ntasks[local_combined_workerid]++;
 			ntasks[local_combined_workerid]++;
 			_starpu_worker_unlock(local_combined_workerid);
 			_starpu_worker_unlock(local_combined_workerid);
 
 
+			_STARPU_TRACE_JOB_PUSH(alias, alias->priority > 0);
 			ret |= starpu_push_local_task(local_combined_workerid, alias, prio);
 			ret |= starpu_push_local_task(local_combined_workerid, alias, prio);
 		}
 		}
 
 

+ 238 - 195
starpurm/dev/chameleon_test/dgemm.c

@@ -1,3 +1,21 @@
+/* StarPURM --- StarPU Resource Management Layer.
+ *
+ * Copyright (C) 2017, 2018  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/* This example shows a basic StarPU dgemm (double-precision matrix multiply) app on top of StarPURM with a nVidia CUDA kernel */
+
 #define _GNU_SOURCE
 #define _GNU_SOURCE
 #include <sched.h>
 #include <sched.h>
 #include <stdio.h>
 #include <stdio.h>
@@ -6,18 +24,56 @@
 #include <morse.h>
 #include <morse.h>
 #include <starpurm.h>
 #include <starpurm.h>
 #include <hwloc.h>
 #include <hwloc.h>
+#include <pthread.h>
+
+#define CHECK
 
 
 static int rm_cpu_type_id = -1;
 static int rm_cpu_type_id = -1;
+static int rm_cuda_type_id = -1;
 static int rm_nb_cpu_units = 0;
 static int rm_nb_cpu_units = 0;
+static int rm_nb_cuda_units = 0;
+static const int nb_random_tests = 10;
 
 
-static void test1();
-static void init_rm_infos(void);
+static unsigned spawn_pending = 0;
+static pthread_mutex_t spawn_pending_mutex = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t spawn_pending_cond;
 
 
-static const int nb_random_tests = 10;
+static void _inc_spawn_pending(void)
+{
+	pthread_mutex_lock(&spawn_pending_mutex);
+	assert(spawn_pending < UINT_MAX);
+	spawn_pending++;
+	pthread_mutex_unlock(&spawn_pending_mutex);
+}
 
 
-static void test1()
+static void _dec_spawn_pending(void)
 {
 {
-	int i;
+	pthread_mutex_lock(&spawn_pending_mutex);
+	assert(spawn_pending > 0);
+	spawn_pending--;
+	if (spawn_pending == 0)
+		pthread_cond_broadcast(&spawn_pending_cond);
+	pthread_mutex_unlock(&spawn_pending_mutex);
+}
+
+static void _wait_pending_spawns(void)
+{
+	pthread_mutex_lock(&spawn_pending_mutex);
+	while (spawn_pending > 0)
+		pthread_cond_wait(&spawn_pending_cond, &spawn_pending_mutex);
+	pthread_mutex_unlock(&spawn_pending_mutex);
+}
+
+static void spawn_callback(void *_arg)
+{
+	assert(42 == (uintptr_t)_arg);
+	_dec_spawn_pending();
+}
+
+static void usage(void)
+{
+	fprintf(stderr, "dgemm: M N K <trans_A=T|N> <trans_B=[T|N]>\n");
+	exit(EXIT_FAILURE);
 }
 }
 
 
 static void init_rm_infos(void)
 static void init_rm_infos(void)
@@ -30,236 +86,223 @@ static void init_rm_infos(void)
 		exit(77);
 		exit(77);
 	}
 	}
 
 
+	int cuda_type = starpurm_get_device_type_id("cuda");
+	int nb_cuda_units = starpurm_get_nb_devices_by_type(cuda_type);
+
 	rm_cpu_type_id = cpu_type;
 	rm_cpu_type_id = cpu_type;
+	rm_cuda_type_id = cuda_type;
 	rm_nb_cpu_units = nb_cpu_units;
 	rm_nb_cpu_units = nb_cpu_units;
+	rm_nb_cuda_units = nb_cuda_units;
 }
 }
 
 
-static void disp_selected_cpuset(void)
+
+static void disp_cpuset(hwloc_cpuset_t selected_cpuset)
 {
 {
-	hwloc_cpuset_t selected_cpuset = starpurm_get_selected_cpuset();
+	//hwloc_cpuset_t selected_cpuset = starpurm_get_selected_cpuset();
 	int strl = hwloc_bitmap_snprintf(NULL, 0, selected_cpuset);
 	int strl = hwloc_bitmap_snprintf(NULL, 0, selected_cpuset);
 	char str[strl+1];
 	char str[strl+1];
 	hwloc_bitmap_snprintf(str, strl+1, selected_cpuset);
 	hwloc_bitmap_snprintf(str, strl+1, selected_cpuset);
-	printf("selected cpuset = %s\n", str);
+	printf("%llx: selected cpuset = %s\n", (unsigned long long)pthread_self(), str);
 }
 }
 
 
-int main( int argc, char const *argv[])
+struct s_test_args
 {
 {
-	starpurm_initialize();
-	init_rm_infos();
-	printf("using default units\n");
-	disp_selected_cpuset();
-	test1();
-	starpurm_shutdown();
-#if 0
+	const int m;
+	const int n;
+	const int k;
+	int transA;
+	int transB;
+};
 
 
-	if(argc < 6 || argc > 6)
-	{ 		
-		fprintf(stderr, "Usage: ./test_dgemm M N K TRANS_A TRANS_B\n" );
-		return 1;
-	}
-	
-	// Local variables
-	int i, j;
-	int m, n, k;
-	const char *transA_input = NULL;
-	const char *transB_input = NULL;
-	enum DDSS_TRANS transA = Trans;
-	enum DDSS_TRANS transB = Trans;
-	double alpha; 
-	double beta;
-	double error;
-	double max_error;
-	double count_error;	
-	double *A;
-	double *B;
-	double *C;
-	double *C_test;
-	struct timeval start, end;
-	double flops;
-	double flops_ddss; 
-	double flops_ref; 
-	int ret;
-	m = atoi( argv[1] );
-	n = atoi( argv[2] );
-	k = atoi( argv[3] );
-	
-	if ( strlen( argv[4] ) != 1 ) 
-	{
-		fprintf(stderr,"Illegal value of TRANS_A, TRANS_A can be T or N\n");
-		return 1;
-	}
-	transA_input = argv[4];	
-	
-	if ( strlen( argv[5] ) != 1 ) 
+static void test(void *_args)
+{
+	struct s_test_args *args = _args;
+	const int m = args->m;
+	const int n = args->n;
+	const int k = args->k;
+	int transA = args->transA;
+	int transB = args->transB;
+	unsigned rand_seed = (unsigned)time(NULL);
+	double *A = malloc(m * k * sizeof(double));
+	double *B = malloc(k * n * sizeof(double));
+	double *C = calloc(m * n, sizeof(double));
+	double *C_test = calloc(m * n, sizeof(double));
+
+	const double alpha = (double)rand_r(&rand_seed) / ((double)rand_r(&rand_seed) + DBL_MIN);
+	const double beta  = (double)rand_r(&rand_seed) / ((double)rand_r(&rand_seed) + DBL_MIN);
+ 
+	int i;
+	for (i = 0; i < m; i++)
 	{
 	{
-		fprintf(stderr,"Illegal value of TRANS_B, TRANS_B can be T or N\n");
-		return 1;
+		int j;
+		for (j = 0; j < n; j++)
+		{
+			A[i*n+j] = (double)rand_r(&rand_seed) / ((double)rand_r(&rand_seed) + DBL_MIN);
+			B[i*n+j] = (double)rand_r(&rand_seed) / ((double)rand_r(&rand_seed) + DBL_MIN);
+		}
 	}
 	}
-	transB_input = argv[5];	
 
 
-	// Set seed 
-	srand(time(NULL));
+	int res = MORSE_dgemm(transA, transB, m, n, k, alpha, A, k, B, n, beta, C, n);
+#ifdef CHECK
+	/* Check */
+	cblas_dgemm( CblasColMajor, 
+			( CBLAS_TRANSPOSE ) transA,
+			( CBLAS_TRANSPOSE ) transB,
+			m, n, k,
+			alpha, A, k,
+			B, n,
+			beta, C_test, n );
 
 
-	max_error = 1.0;
-	count_error = 0.0;
+	double C_test_inorm = LAPACKE_dlange(CblasColMajor, 'I', m, n, C_test, n);
+	cblas_daxpy(m*n, -1, C, 1, C_test, 1);
+	double inorm = LAPACKE_dlange(CblasColMajor, 'I', m, n, C_test, n);
+	printf("%llx: ||C_test-C||_I / ||C_test||_I = %e\n", (unsigned long long)pthread_self(), inorm/C_test_inorm);
+#endif
+	free(A);
+	free(B);
+	free(C);
+	free(C_test);
+}
 
 
-	// Checking inputs
-	if ( m < 0 )
-	{
-		fprintf(stderr, "Illegal value of M, M must be >= 0\n");
-		return 1;
-	}
-	if ( n < 0 )
-	{
-		fprintf(stderr, "Illegal value of N, N must be >= 0\n");
-		return 1;
-	}
-	if ( k < 0 )
+static void select_units(hwloc_cpuset_t selected_cpuset, hwloc_cpuset_t available_cpuset, int offset, int nb)
+{
+	int first_idx = hwloc_bitmap_first(available_cpuset);
+	int last_idx = hwloc_bitmap_last(available_cpuset);
+	int count = 0;
+	int idx = first_idx;
+	while (idx != -1 && idx <= last_idx && count < offset+nb)
 	{
 	{
-		fprintf(stderr, "Illegal value of K, K must be >= 0\n");
-		return 1;
+		if (hwloc_bitmap_isset(available_cpuset, idx))
+		{
+			if (count >= offset)
+			{
+				hwloc_bitmap_set(selected_cpuset, idx);
+			}
+			count ++;
+		}
+		idx = hwloc_bitmap_next(available_cpuset, idx);
 	}
 	}
+	assert(count == offset+nb);
+}
+
+void spawn_tests(int cpu_offset, int cpu_nb, int cuda_offset, int cuda_nb, void *args)
+{
+	if (cpu_offset + cpu_nb > rm_nb_cpu_units)
+		exit(77);
+	if (cuda_offset + cuda_nb > rm_nb_cuda_units)
+		exit(77);
+	hwloc_cpuset_t cpu_cpuset = starpurm_get_all_cpu_workers_cpuset();
+	hwloc_cpuset_t cuda_cpuset = starpurm_get_all_device_workers_cpuset_by_type(rm_cuda_type_id);
+	hwloc_cpuset_t sel_cpuset = hwloc_bitmap_alloc();
+	assert(sel_cpuset != NULL);
+
+	select_units(sel_cpuset, cpu_cpuset, cpu_offset, cpu_nb);
+	select_units(sel_cpuset, cuda_cpuset, cuda_offset, cuda_nb);
 
 
-	if ( transA_input[0] == 'T' )
-	{
-		transA = Trans;
-	}
-	else if ( transA_input[0] == 'N' )
 	{
 	{
-		transA = NoTrans;
+		int strl1 = hwloc_bitmap_snprintf(NULL, 0, cpu_cpuset);
+		char str1[strl1+1];
+		hwloc_bitmap_snprintf(str1, strl1+1, cpu_cpuset);
+
+		int strl2 = hwloc_bitmap_snprintf(NULL, 0, cuda_cpuset);
+		char str2[strl2+1];
+		hwloc_bitmap_snprintf(str2, strl2+1, cuda_cpuset);
+		printf("all cpus cpuset = %s\n", str1);
+		
+		int strl3 = hwloc_bitmap_snprintf(NULL, 0, sel_cpuset);
+		char str3[strl3+1];
+		hwloc_bitmap_snprintf(str3, strl1+3, sel_cpuset);
+		printf("spawn on selected cpuset = %s (avail cpu %s, avail cuda %s)\n", str3, str1, str2);
 	}
 	}
+
+	_inc_spawn_pending();
+	starpurm_spawn_kernel_on_cpus_callback(NULL, test, args, sel_cpuset, spawn_callback, (void*)(uintptr_t)42);
+
+	hwloc_bitmap_free(sel_cpuset);
+	hwloc_bitmap_free(cpu_cpuset);
+	hwloc_bitmap_free(cuda_cpuset);
+}
+
+int main( int argc, char const *argv[])
+{
+	pthread_cond_init(&spawn_pending_cond, NULL);
+
+	int transA = MorseTrans;
+	int transB = MorseTrans;
+
+	if (argc < 6 || argc > 6)
+		usage();
+
+	int m = atoi(argv[1]);
+	if (m < 1)
+		usage();
+	int n = atoi(argv[2]);
+	if (n < 1)
+		usage();
+	int k = atoi(argv[3]);
+	if (k < 1)
+		usage();
+	
+	if (strcmp(argv[4], "T") == 0) 
+		transA = MorseTrans;
+	else if (strcmp(argv[4], "N") == 0) 
+		transA = MorseNoTrans;
 	else
 	else
-	{
-		fprintf(stderr, "Illegal value of TRANS_A, TRANS_A can be T or N\n");
-		return 1;
-	}
+		usage();
 	
 	
-	if ( transB_input[0] == 'T' )
-	{
-		transB = Trans;
-	}
-	else if ( transB_input[0] == 'N' )
-	{
-		transB = NoTrans;
-	}
+	if (strcmp(argv[5], "T") == 0) 
+		transB = MorseTrans;
+	else if (strcmp(argv[5], "N") == 0) 
+		transB = MorseNoTrans;
 	else
 	else
-	{
-		fprintf(stderr, "Illegal value of TRANS_B, TRANS_B can be T or N\n");
-		return 1;
-	}
+		usage();
 
 
-	// Matrices allocation
-	A = ( double * ) malloc( sizeof( double ) * m * k );
-	B = ( double * ) malloc( sizeof( double ) * k * n );
-	C = ( double * ) malloc( sizeof( double ) * m * n );
-	C_test = ( double * ) malloc( sizeof( double ) * m * n );
+	srand(time(NULL));
 
 
-	// Alpha and beta initialization
-	alpha = ( double ) rand() / (double) rand() + DBL_MIN;
-	beta  = ( double ) rand() / (double) rand() + DBL_MIN;
- 
-	// Matrix A, B, C and C_test initialization
-	for ( i = 0; i < m; i++ )
-	{
-		for ( j = 0; j < n; j++ )
-		{
-			A[ i * n + j ] = ( double ) rand() / (double) rand() 
-							  + DBL_MIN;
-			B[ i * n + j ] = ( double ) rand() / (double) rand() 
-							  + DBL_MIN;
-			C[ i * n + j ] = 0.0;
-			C_test[ i * n + j ] = 0.0;
-		}
-	}
+	struct s_test_args test_args = { .m = m, .n = n, .k = k, .transA = transA, .transB = transB };
 
 
 	/* Test case */
 	/* Test case */
-	{
-		/* pocl_starpu_init */
-		{
-			hwloc_topology_init(&topology);
-			hwloc_topology_load(topology);
-			starpurm_initialize();
-			starpurm_set_drs_enable(NULL);
-		}
+	starpurm_initialize();
+	starpurm_set_drs_enable(NULL);
+	init_rm_infos();
+	printf("cpu units: %d\n", rm_nb_cpu_units);
+	printf("cuda units: %d\n", rm_nb_cuda_units);
+	printf("using default units\n");
+	disp_cpuset(starpurm_get_selected_cpuset());
 
 
-		/* pocl_starpu_submit_task */
+	MORSE_Init(rm_nb_cpu_units, rm_nb_cuda_units);
+	test(&test_args);
+	{
+		int cpu_offset = 0;
+		int cpu_nb = rm_nb_cpu_units/2;
+		if (cpu_nb == 0 && rm_nb_cpu_units > 0)
 		{
 		{
-			/* GLIBC cpu_mask as supplied by POCL */
-			cpu_set_t cpu_mask;
-			CPU_ZERO(&cpu_mask);
-			CPU_SET (0, &cpu_mask);
-			CPU_SET (1, &cpu_mask);
-			CPU_SET (2, &cpu_mask);
-			CPU_SET (3, &cpu_mask);
-
-			/* Convert GLIBC cpu_mask into HWLOC cpuset */
-			hwloc_cpuset_t hwloc_cpuset = hwloc_bitmap_alloc();
-			int status = hwloc_cpuset_from_glibc_sched_affinity(topology, hwloc_cpuset, &cpu_mask, sizeof(cpu_set_t));
-			assert(status == 0);
-
-			/* Reset any unit previously allocated to StarPU */
-			starpurm_withdraw_all_cpus_from_starpu(NULL);
-			/* Enforce new cpu mask */
-			starpurm_assign_cpu_mask_to_starpu(NULL, hwloc_cpuset);
-
-			/* task function */
-			{
-				int TRANS_A = transA==NoTrans?MorseNoTrans:MorseTrans;
-				int TRANS_B = transB==NoTrans?MorseNoTrans:MorseTrans;
-				int M = m;
-				int N = n;
-				int K = k;
-				double ALPHA = alpha;
-				int LDA = k;
-				int LDB = n;
-				double BETA = beta;
-				int LDC = n;
-
-				MORSE_Init(4, 0);
-				int res = MORSE_dgemm(TRANS_A, TRANS_B, M, N, K,
-						ALPHA, A, LDA, B, LDB,
-						BETA, C, LDC);
-				MORSE_Finalize();
-			}
-
-			/* Withdraw all CPU units from StarPU */
-			starpurm_withdraw_all_cpus_from_starpu(NULL);
-
-			hwloc_bitmap_free(hwloc_cpuset);
+			cpu_nb = 1;
 		}
 		}
-
-		/* pocl_starpu_shutdown() */
+		int cuda_offset = 0;
+		int cuda_nb = rm_nb_cuda_units/2;
+		if (cuda_nb == 0 && rm_nb_cuda_units > 0)
 		{
 		{
-			starpurm_shutdown();
+			cuda_nb = 1;
 		}
 		}
+		spawn_tests(cpu_offset, cpu_nb, cuda_offset, cuda_nb, &test_args);
 	}
 	}
-
-#if 0
-	/* Check */
-	cblas_dgemm( CblasColMajor, 
-				 ( CBLAS_TRANSPOSE ) transA,
-				 ( CBLAS_TRANSPOSE ) transB,
-									 m, n, k,
-							 		 alpha, A, k,
-							 			    B, n,
-							 		  beta, C_test, n );
-	// Error computation
-	for ( i = 0; i < m; i++ )
 	{
 	{
-		for ( j = 0; j < n; j++ )
+		int cpu_offset = rm_nb_cpu_units/2;
+		int cpu_nb = rm_nb_cpu_units/2;
+		if (cpu_nb == 0 && rm_nb_cpu_units > 0)
 		{
 		{
-			error = abs( C[ i * n + j ] - C_test[ i * n + j ] );
-			if ( max_error > error )
-				max_error = error;
-			count_error += error;
+			cpu_nb = 1;
 		}
 		}
+		int cuda_offset = rm_nb_cuda_units/2;
+		int cuda_nb = rm_nb_cuda_units/2;
+		spawn_tests(cpu_offset, cpu_nb, cuda_offset, cuda_nb, &test_args);
 	}
 	}
+	_wait_pending_spawns();
+	MORSE_Finalize();
 
 
-	fprintf(stdout, "Max. error = %1.2f\n", max_error );
-	fprintf(stdout, "Av. error = %1.2f\n", count_error / ( m * n ) );
-#endif
-#endif
+	starpurm_shutdown();
+	pthread_cond_destroy(&spawn_pending_cond);
 
 
 	return 0;
 	return 0;
 
 

+ 4 - 1
starpurm/dev/cuda_vector_scale/vector_scale.c

@@ -238,12 +238,15 @@ int main(int argc, char *argv[])
 
 
 	if (rm_nb_cpu_units > 1 && rm_nb_cuda_units > 1)
 	if (rm_nb_cpu_units > 1 && rm_nb_cuda_units > 1)
 	{
 	{
-		const int nb_cpus = rm_nb_cpu_units;
+		int nb_cpus = rm_nb_cpu_units;
 		const int nb_cudas = rm_nb_cuda_units;
 		const int nb_cudas = rm_nb_cuda_units;
 		const int cuda_type = rm_cuda_type_id;
 		const int cuda_type = rm_cuda_type_id;
 		printf("nb_cpu_units = %d\n", nb_cpus);
 		printf("nb_cpu_units = %d\n", nb_cpus);
 		printf("nb_cuda_units = %d\n", nb_cudas);
 		printf("nb_cuda_units = %d\n", nb_cudas);
 
 
+		/* Keep at least one CPU core */
+		nb_cpus--;
+
 		starpurm_set_drs_enable(NULL);
 		starpurm_set_drs_enable(NULL);
 		drs_enabled = starpurm_drs_enabled_p();
 		drs_enabled = starpurm_drs_enabled_p();
 		assert(drs_enabled != 0);
 		assert(drs_enabled != 0);

+ 1 - 0
starpurm/include/starpurm.h

@@ -138,6 +138,7 @@ hwloc_cpuset_t starpurm_get_global_cpuset(void);
 hwloc_cpuset_t starpurm_get_selected_cpuset(void);
 hwloc_cpuset_t starpurm_get_selected_cpuset(void);
 hwloc_cpuset_t starpurm_get_all_cpu_workers_cpuset(void);
 hwloc_cpuset_t starpurm_get_all_cpu_workers_cpuset(void);
 hwloc_cpuset_t starpurm_get_all_device_workers_cpuset(void);
 hwloc_cpuset_t starpurm_get_all_device_workers_cpuset(void);
+hwloc_cpuset_t starpurm_get_all_device_workers_cpuset_by_type(int typeid);
 
 
 #ifdef __cplusplus
 #ifdef __cplusplus
 }
 }

+ 59 - 0
starpurm/src/starpurm.c

@@ -603,8 +603,19 @@ void starpurm_initialize(void)
 	hwloc_topology_load(rm->topology);
 	hwloc_topology_load(rm->topology);
 	rm->global_cpuset = hwloc_bitmap_alloc();
 	rm->global_cpuset = hwloc_bitmap_alloc();
 	hwloc_bitmap_zero(rm->global_cpuset);
 	hwloc_bitmap_zero(rm->global_cpuset);
+	
 	rm->all_cpu_workers_cpuset = hwloc_bitmap_alloc();
 	rm->all_cpu_workers_cpuset = hwloc_bitmap_alloc();
 	hwloc_bitmap_zero(rm->all_cpu_workers_cpuset);
 	hwloc_bitmap_zero(rm->all_cpu_workers_cpuset);
+	
+	rm->all_opencl_device_workers_cpuset = hwloc_bitmap_alloc();
+	hwloc_bitmap_zero(rm->all_opencl_device_workers_cpuset);
+	
+	rm->all_cuda_device_workers_cpuset = hwloc_bitmap_alloc();
+	hwloc_bitmap_zero(rm->all_cuda_device_workers_cpuset);
+	
+	rm->all_mic_device_workers_cpuset = hwloc_bitmap_alloc();
+	hwloc_bitmap_zero(rm->all_mic_device_workers_cpuset);
+
 	rm->all_device_workers_cpuset = hwloc_bitmap_alloc();
 	rm->all_device_workers_cpuset = hwloc_bitmap_alloc();
 	hwloc_bitmap_zero(rm->all_device_workers_cpuset);
 	hwloc_bitmap_zero(rm->all_device_workers_cpuset);
 
 
@@ -705,6 +716,7 @@ void starpurm_initialize(void)
 		rm->units[unitid].worker_cpuset = starpu_worker_get_hwloc_cpuset(rm->units[unitid].workerid);
 		rm->units[unitid].worker_cpuset = starpu_worker_get_hwloc_cpuset(rm->units[unitid].workerid);
 		pthread_cond_init(&rm->units[unitid].unit_available_cond, NULL);
 		pthread_cond_init(&rm->units[unitid].unit_available_cond, NULL);
 		hwloc_bitmap_or(rm->global_cpuset, rm->global_cpuset, rm->units[unitid].worker_cpuset);
 		hwloc_bitmap_or(rm->global_cpuset, rm->global_cpuset, rm->units[unitid].worker_cpuset);
+		hwloc_bitmap_or(rm->all_opencl_device_workers_cpuset, rm->all_opencl_device_workers_cpuset, rm->units[unitid].worker_cpuset);
 		hwloc_bitmap_or(rm->all_device_workers_cpuset, rm->all_device_workers_cpuset, rm->units[unitid].worker_cpuset);
 		hwloc_bitmap_or(rm->all_device_workers_cpuset, rm->all_device_workers_cpuset, rm->units[unitid].worker_cpuset);
 		unitid++;
 		unitid++;
 	}
 	}
@@ -725,6 +737,7 @@ void starpurm_initialize(void)
 		rm->units[unitid].worker_cpuset = starpu_worker_get_hwloc_cpuset(rm->units[unitid].workerid);
 		rm->units[unitid].worker_cpuset = starpu_worker_get_hwloc_cpuset(rm->units[unitid].workerid);
 		pthread_cond_init(&rm->units[unitid].unit_available_cond, NULL);
 		pthread_cond_init(&rm->units[unitid].unit_available_cond, NULL);
 		hwloc_bitmap_or(rm->global_cpuset, rm->global_cpuset, rm->units[unitid].worker_cpuset);
 		hwloc_bitmap_or(rm->global_cpuset, rm->global_cpuset, rm->units[unitid].worker_cpuset);
+		hwloc_bitmap_or(rm->all_cuda_device_workers_cpuset, rm->all_cuda_device_workers_cpuset, rm->units[unitid].worker_cpuset);
 		hwloc_bitmap_or(rm->all_device_workers_cpuset, rm->all_device_workers_cpuset, rm->units[unitid].worker_cpuset);
 		hwloc_bitmap_or(rm->all_device_workers_cpuset, rm->all_device_workers_cpuset, rm->units[unitid].worker_cpuset);
 		unitid++;
 		unitid++;
 	}
 	}
@@ -745,6 +758,7 @@ void starpurm_initialize(void)
 		rm->units[unitid].worker_cpuset = starpu_worker_get_hwloc_cpuset(rm->units[unitid].workerid);
 		rm->units[unitid].worker_cpuset = starpu_worker_get_hwloc_cpuset(rm->units[unitid].workerid);
 		pthread_cond_init(&rm->units[unitid].unit_available_cond, NULL);
 		pthread_cond_init(&rm->units[unitid].unit_available_cond, NULL);
 		hwloc_bitmap_or(rm->global_cpuset, rm->global_cpuset, rm->units[unitid].worker_cpuset);
 		hwloc_bitmap_or(rm->global_cpuset, rm->global_cpuset, rm->units[unitid].worker_cpuset);
+		hwloc_bitmap_or(rm->all_mic_device_workers_cpuset, rm->all_mic_device_workers_cpuset, rm->units[unitid].worker_cpuset);
 		hwloc_bitmap_or(rm->all_device_workers_cpuset, rm->all_device_workers_cpuset, rm->units[unitid].worker_cpuset);
 		hwloc_bitmap_or(rm->all_device_workers_cpuset, rm->all_device_workers_cpuset, rm->units[unitid].worker_cpuset);
 		unitid++;
 		unitid++;
 	}
 	}
@@ -851,6 +865,9 @@ void starpurm_shutdown(void)
 
 
 	hwloc_bitmap_free(rm->global_cpuset);
 	hwloc_bitmap_free(rm->global_cpuset);
 	hwloc_bitmap_free(rm->all_cpu_workers_cpuset);
 	hwloc_bitmap_free(rm->all_cpu_workers_cpuset);
+	hwloc_bitmap_free(rm->all_opencl_device_workers_cpuset);
+	hwloc_bitmap_free(rm->all_cuda_device_workers_cpuset);
+	hwloc_bitmap_free(rm->all_mic_device_workers_cpuset);
 	hwloc_bitmap_free(rm->all_device_workers_cpuset);
 	hwloc_bitmap_free(rm->all_device_workers_cpuset);
 	hwloc_bitmap_free(rm->selected_cpuset);
 	hwloc_bitmap_free(rm->selected_cpuset);
 
 
@@ -1576,6 +1593,33 @@ hwloc_cpuset_t starpurm_get_all_cpu_workers_cpuset(void)
 	return hwloc_bitmap_dup(rm->all_cpu_workers_cpuset);
 	return hwloc_bitmap_dup(rm->all_cpu_workers_cpuset);
 }
 }
 
 
+static hwloc_cpuset_t starpurm_get_all_opencl_device_workers_cpuset(void)
+{
+	assert(_starpurm != NULL);
+	assert(_starpurm->state != state_uninitialized);
+	struct s_starpurm *rm = _starpurm;
+
+	return hwloc_bitmap_dup(rm->all_opencl_device_workers_cpuset);
+}
+
+static hwloc_cpuset_t starpurm_get_all_cuda_device_workers_cpuset(void)
+{
+	assert(_starpurm != NULL);
+	assert(_starpurm->state != state_uninitialized);
+	struct s_starpurm *rm = _starpurm;
+
+	return hwloc_bitmap_dup(rm->all_cuda_device_workers_cpuset);
+}
+
+static hwloc_cpuset_t starpurm_get_all_mic_device_workers_cpuset(void)
+{
+	assert(_starpurm != NULL);
+	assert(_starpurm->state != state_uninitialized);
+	struct s_starpurm *rm = _starpurm;
+
+	return hwloc_bitmap_dup(rm->all_mic_device_workers_cpuset);
+}
+
 hwloc_cpuset_t starpurm_get_all_device_workers_cpuset(void)
 hwloc_cpuset_t starpurm_get_all_device_workers_cpuset(void)
 {
 {
 	assert(_starpurm != NULL);
 	assert(_starpurm != NULL);
@@ -1585,3 +1629,18 @@ hwloc_cpuset_t starpurm_get_all_device_workers_cpuset(void)
 	return hwloc_bitmap_dup(rm->all_device_workers_cpuset);
 	return hwloc_bitmap_dup(rm->all_device_workers_cpuset);
 }
 }
 
 
+hwloc_cpuset_t starpurm_get_all_device_workers_cpuset_by_type(int typeid)
+{
+	assert(_starpurm != NULL);
+	assert(_starpurm->state != state_uninitialized);
+	assert(typeid != starpurm_unit_cpu);
+	if (typeid == starpurm_unit_opencl)
+		return starpurm_get_all_opencl_device_workers_cpuset();
+	if (typeid == starpurm_unit_cuda)
+		return starpurm_get_all_cuda_device_workers_cpuset();
+	if (typeid == starpurm_unit_mic)
+		return starpurm_get_all_mic_device_workers_cpuset();
+	hwloc_cpuset_t empty_bitmap = hwloc_bitmap_alloc();
+	hwloc_bitmap_zero(empty_bitmap);
+	return empty_bitmap;
+}

+ 4 - 1
starpurm/src/starpurm_dlb.c

@@ -22,12 +22,15 @@
 #include <stdio.h>
 #include <stdio.h>
 #include <string.h>
 #include <string.h>
 #include <assert.h>
 #include <assert.h>
+#include <config.h>
+
 #include <hwloc.h>
 #include <hwloc.h>
+#ifdef HAVE_HWLOC_GLIBC_SCHED_H
 #include <hwloc/glibc-sched.h>
 #include <hwloc/glibc-sched.h>
+#endif
 #include <pthread.h>
 #include <pthread.h>
 #include <starpu.h>
 #include <starpu.h>
 #include <starpurm.h>
 #include <starpurm.h>
-#include <config.h>
 #include <starpurm_private.h>
 #include <starpurm_private.h>
 
 
 #ifndef STARPURM_HAVE_DLB
 #ifndef STARPURM_HAVE_DLB

+ 9 - 0
starpurm/src/starpurm_private.h

@@ -79,6 +79,15 @@ struct s_starpurm
 	/* Cpuset of all StarPU CPU workers. */
 	/* Cpuset of all StarPU CPU workers. */
 	hwloc_cpuset_t all_cpu_workers_cpuset;
 	hwloc_cpuset_t all_cpu_workers_cpuset;
 
 
+	/* Cpuset of all StarPU OpenCL workers. */
+	hwloc_cpuset_t all_opencl_device_workers_cpuset;
+
+	/* Cpuset of all StarPU CUDA workers. */
+	hwloc_cpuset_t all_cuda_device_workers_cpuset;
+
+	/* Cpuset of all StarPU MIC workers. */
+	hwloc_cpuset_t all_mic_device_workers_cpuset;
+
 	/* Cpuset of all StarPU device workers. */
 	/* Cpuset of all StarPU device workers. */
 	hwloc_cpuset_t all_device_workers_cpuset;
 	hwloc_cpuset_t all_device_workers_cpuset;