
Merge branch 'starpurm' of git+ssh://scm.gforge.inria.fr/gitroot/starpu/starpu into starpurm

Olivier Aumage 6 years ago
parent
commit
eb5c169aeb

+ 9 - 0
configure.ac

@@ -1327,10 +1327,19 @@ if test x$enable_cuda = xyes; then
 		NVCCFLAGS="${NVCCFLAGS} -m64"
 	fi
 
+	SAVED_CPPFLAGS="${CPPFLAGS}"
+	CPPFLAGS="${CPPFLAGS} ${STARPU_CUDA_CPPFLAGS}"
+	SAVED_LDFLAGS="${LDFLAGS}"
+	LDFLAGS="${LDFLAGS} ${STARPU_CUDA_LDFLAGS}"
 	AC_CHECK_HEADERS([cuda_gl_interop.h])
 
 	AC_CHECK_LIB([cusparse], [cusparseCreate])
 	AC_CHECK_DECLS([cusparseSetStream], [], [], [[#include <cusparse.h>]])
+
+	AC_CHECK_HEADER([nvml.h],
+	  [AC_CHECK_LIB([nvidia-ml], [nvmlDeviceGetTotalEnergyConsumption])])
+	CPPFLAGS="${SAVED_CPPFLAGS}"
+	LDFLAGS="${SAVED_LDFLAGS}"
 fi
 
 dnl Hey dude, are you around?

+ 132 - 0
contrib/ci.inria.fr/Jenkinsfile-basic

@@ -0,0 +1,132 @@
+#!groovy
+// StarPU --- Runtime system for heterogeneous multicore architectures.
+//
+// Copyright (C) 2018                                CNRS
+//
+// StarPU is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as published by
+// the Free Software Foundation; either version 2.1 of the License, or (at
+// your option) any later version.
+//
+// StarPU is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+//
+// See the GNU Lesser General Public License in COPYING.LGPL for more details.
+//
+
+def statusHasChanged = false
+
+pipeline
+{
+	agent none
+
+	// Trigger the build
+	triggers
+	{
+		// Poll the SCM explicitly every 15 minutes
+		pollSCM('00-59/15 * * * *')
+	}
+
+	stages
+	{
+		stage('Tarball')
+		{
+			steps
+			{
+				node('autotools')
+				{
+					checkout scm
+					sh 'contrib/ci.inria.fr/job-0-tarball.sh'
+					script
+					{
+					       env.tarballgz = sh (script: 'ls *.tar.gz', returnStdout: true).trim()
+					}
+					stash includes: "${env.tarballgz}", name: 'tarballgz'
+					stash includes: "starpu.pdf", name: 'doc'
+					// Stash those scripts because they are not in make dist
+					dir('contrib/ci.inria.fr')
+					{
+						stash includes: "job-1-check.sh", name: 'script-unix-check'
+					}
+					archiveArtifacts artifacts: "${env.tarballgz},starpu.pdf", fingerprint: true, onlyIfSuccessful: true
+					deleteDir()
+
+				}
+			}
+		}
+		stage('Check')
+		{
+			steps
+			{
+				script
+				{
+					labelToSelect = 'unix'
+					listOfNodeNames = jenkins.model.Jenkins.instance.nodes.collect
+					{
+						node -> node.getLabelString().contains(labelToSelect) ? node.name : null
+					}
+					listOfNodeNames.removeAll(Collections.singleton(null))
+
+					def p = listOfNodeNames.collectEntries
+					{
+						[ (it):
+						{
+							node(it)
+							{
+								dir('check-unix')
+								{
+									unstash 'tarballgz'
+									unstash 'script-unix-check'
+									sh 'chmod 755 job-1-check.sh && ./job-1-check.sh'
+									deleteDir()
+								}
+							}
+						}
+					]}
+					parallel p;
+				}
+			}
+		}
+	}
+
+	post
+	{
+		// hooks are called in order: always, changed, aborted, failure, success, unstable
+		changed
+		{
+			echo "Build status has changed."
+			script
+			{
+
+				statusHasChanged = true
+			}
+		}
+		success
+		{
+			echo "Build success."
+			// email when changed to success
+			script
+			{
+				if (statusHasChanged)
+				{
+					emailext(body: '${DEFAULT_CONTENT}',
+						 subject: '${DEFAULT_SUBJECT}',
+						 replyTo: '$DEFAULT_REPLYTO',
+						 to: '$DEFAULT_RECIPIENTS',
+						 recipientProviders: [[$class: 'CulpritsRecipientProvider'],[$class: 'RequesterRecipientProvider']])
+				}
+			}
+		}
+		failure
+		{
+			echo "Build failure."
+			// always email on failure
+			emailext(body: '${DEFAULT_CONTENT}',
+				 subject: '${DEFAULT_SUBJECT}',
+				 replyTo: '$DEFAULT_REPLYTO',
+				 to: '$DEFAULT_RECIPIENTS',
+				 recipientProviders: [[$class: 'CulpritsRecipientProvider'],[$class: 'RequesterRecipientProvider']])
+		}
+	}
+}

+ 30 - 0
contrib/ci.inria.fr/job-0-tarball.sh

@@ -0,0 +1,30 @@
+#!/bin/sh
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2018                                CNRS
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+export PKG_CONFIG_PATH=/home/ci/usr/local/lib/pkgconfig:$PKG_CONFIG_PATH
+export LD_LIBRARY_PATH=/home/ci/usr/local/lib:$LD_LIBRARY_PATH
+
+./autogen.sh
+if test -d build ; then chmod -R 777 build && rm -rf build ; fi
+mkdir build && cd build
+../configure
+make V=1
+make dist
+cp *gz ..
+cp doc/doxygen/starpu.pdf ..
+make clean
+

+ 85 - 0
contrib/ci.inria.fr/job-1-check.sh

@@ -0,0 +1,85 @@
+#!/bin/sh
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2013-2018                                CNRS
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+set -e
+set -x
+
+export PKG_CONFIG_PATH=/home/ci/usr/local/lib/pkgconfig:$PKG_CONFIG_PATH
+export LD_LIBRARY_PATH=/home/ci/usr/local/lib:$LD_LIBRARY_PATH
+
+tarball=$(ls -tr starpu-*.tar.gz | tail -1)
+
+if test -z "$tarball"
+then
+    echo Error. No tar.gz file
+    ls
+    pwd
+    exit 1
+fi
+
+basename=$(basename $tarball .tar.gz)
+export STARPU_HOME=$PWD/$basename/home
+mkdir -p $basename
+cd $basename
+env > $PWD/env
+
+test -d $basename && chmod -R u+rwX $basename && rm -rf $basename
+tar xfz ../$tarball
+cd $basename
+mkdir build
+cd build
+
+STARPU_CONFIGURE_OPTIONS=""
+suname=$(uname)
+if test "$suname" = "Darwin"
+then
+    STARPU_CONFIGURE_OPTIONS="--without-hwloc"
+fi
+if test "$suname" = "OpenBSD"
+then
+    STARPU_CONFIGURE_OPTIONS="--without-hwloc --disable-mlr"
+fi
+if test "$suname" = "FreeBSD"
+then
+    STARPU_CONFIGURE_OPTIONS="--disable-fortran"
+fi
+
+export CC=gcc
+
+day=$(date +%u)
+if test $day -le 5
+then
+    ../configure --enable-quick-check --enable-verbose --enable-mpi-check --disable-build-doc $STARPU_CONFIGURE_OPTIONS
+else
+    ../configure --enable-long-check --enable-verbose --enable-mpi-check --disable-build-doc $STARPU_CONFIGURE_OPTIONS
+fi
+
+make
+#make check
+(make -k check || true) > ../check_$$ 2>&1
+cat ../check_$$
+make showcheck
+
+grep "^FAIL:" ../check_$$ || true
+
+make clean
+
+grep "^FAIL:" ../check_$$ || true
+
+echo "Running on $(uname -a)"
+exit $(grep "^FAIL:" ../check_$$ | wc -l)
+

+ 31 - 1
doc/doxygen/chapters/210_check_list_performance.doxy

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2011-2013,2015,2017                      Inria
  * Copyright (C) 2010-2018                                CNRS
- * Copyright (C) 2009-2011,2013-2017                      Université de Bordeaux
+ * Copyright (C) 2009-2011,2013-2018                      Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -26,6 +26,26 @@ performance, we give below a list of features which should be checked.
 For a start, you can use \ref OfflinePerformanceTools to get a Gantt chart which
 will show roughly where time is spent, and focus correspondingly.
 
+\section CheckTaskSize Check Task Size
+
+Make sure that your tasks are not too small, because the StarPU runtime overhead
+is not completely zero. You can run the <c>tasks_size_overhead.sh</c> script to
+get an idea, on your own system, of the scalability of tasks depending on their
+duration (in µs).
+
+Typically, 10µs-ish tasks are definitely too small: the CUDA overhead alone is
+much bigger than that.
+
+1ms-ish tasks may be a good start, but will not necessarily scale to many dozens
+of cores, so it's better to try to get 10ms-ish tasks.
+
+Task durations can easily be observed when performance models are defined (see
+\ref PerformanceModelExample), by using the <c>starpu_perfmodel_plot</c> or
+<c>starpu_perfmodel_display</c> tools (see \ref PerformanceOfCodelets).
+
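+As a minimal sketch (the codelet, kernel and symbol names below are only
+illustrative), attaching a history-based performance model to a codelet is
+enough for StarPU to record task durations:
+
+\code{.c}
+/* my_cpu_func is a placeholder for the application kernel */
+void my_cpu_func(void *buffers[], void *cl_arg);
+
+static struct starpu_perfmodel my_perfmodel =
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "my_codelet",
+};
+
+static struct starpu_codelet my_codelet =
+{
+	.cpu_funcs = { my_cpu_func },
+	.nbuffers = 1,
+	.modes = { STARPU_RW },
+	.model = &my_perfmodel,
+};
+\endcode
+
+After a few executions, the recorded durations can then be inspected with e.g.
+<c>starpu_perfmodel_display -s my_codelet</c>.
+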
+When using parallel tasks, the problem is even worse since StarPU has to
+synchronize the execution of tasks.
+
 \section ConfigurationImprovePerformance Configuration Which May Improve Performance
 
 The \ref enable-fast "--enable-fast" configuration option disables all
@@ -116,6 +136,16 @@ enabled by setting the environment variable \ref STARPU_NWORKER_PER_CUDA to the
 number of kernels to execute concurrently.  This is useful when kernels are
 small and do not feed the whole GPU with threads to run.
 
+Concerning memory allocation, you should really not use cudaMalloc/cudaFree
+within the kernel, since cudaFree introduces a lot of synchronizations
+within CUDA itself. You should instead add a parameter to the codelet with the
+STARPU_SCRATCH access mode. You can then pass to the task a handle registered
+with the desired size but with a NULL pointer; that handle can even be shared
+between tasks: StarPU will allocate the per-task data on the fly before task
+execution, and reuse the allocated data between tasks.
+
+See <c>examples/pi/pi_redux.c</c> for an example of use.
+
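+As a minimal sketch (the kernel, the data handle and <c>workspace_size</c>
+below are only illustrative placeholders), this amounts to:
+
+\code{.c}
+/* my_cuda_kernel, data_handle and workspace_size stand for the
+ * application's own kernel, data and buffer size */
+static struct starpu_codelet cl =
+{
+	.cuda_funcs = { my_cuda_kernel },
+	.nbuffers = 2,
+	.modes = { STARPU_RW, STARPU_SCRATCH },
+};
+
+/* home node -1 and a NULL pointer: StarPU allocates the buffer on the fly,
+ * on the memory node where the task actually runs */
+starpu_data_handle_t scratch_handle;
+starpu_vector_data_register(&scratch_handle, -1, (uintptr_t)NULL,
+			    workspace_size, sizeof(float));
+
+starpu_task_insert(&cl, STARPU_RW, data_handle,
+		   STARPU_SCRATCH, scratch_handle, 0);
+\endcode
+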
 \section OpenCL-specificOptimizations OpenCL-specific Optimizations
 
 If the kernel can be made to only use the StarPU-provided command queue or other self-allocated

+ 9 - 1
doc/doxygen/chapters/501_environment_variables.doxy

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011-2013,2015-2017                      Inria
- * Copyright (C) 2010-2017                                CNRS
+ * Copyright (C) 2010-2018                                CNRS
  * Copyright (C) 2009-2011,2013-2017                      Université de Bordeaux
  * Copyright (C) 2016                                     Uppsala University
  *
@@ -1173,6 +1173,14 @@ If StarPU doesn't find any NUMA node after these step, STARPU_MAIN_MEMORY is the
 discovered by StarPU.
 </dd>
 
+<dt>STARPU_IDLE_FILE</dt>
+<dd>
+\anchor STARPU_IDLE_FILE
+\addindex __env__STARPU_IDLE_FILE
+If the environment variable STARPU_IDLE_FILE is defined, a file with the name given by its value will be created at the end of the execution.
+The file will contain the sum of the idle times of all the workers.
+</dd>
+
 </dl>
 
 \section ConfiguringTheHypervisor Configuring The Hypervisor

+ 3 - 3
doc/doxygen/chapters/api/profiling.doxy

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2015,2017                           CNRS
- * Copyright (C) 2009-2011,2014,2016                      Université de Bordeaux
+ * Copyright (C) 2009-2011,2014,2016,2018                 Université de Bordeaux
  * Copyright (C) 2011-2012                                Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -72,7 +72,7 @@ profiling was enabled.
     Number of cycles stalled within the task, only available in the MoviSim
 
 \var double starpu_profiling_task_info::energy_consumed
-Energy consumed by the task, only available in the MoviSim
+Energy consumed by the task, in Joules
 
 \struct starpu_profiling_worker_info
 This structure contains the profiling information associated to
@@ -94,7 +94,7 @@ starpu_profiling_worker_get_info()
 \var uint64_t starpu_profiling_worker_info::stall_cycles
         Number of cycles stalled within the worker, only available in the MoviSim
 \var double starpu_profiling_worker_info::energy_consumed
-        Energy consumed by the worker, only available in the MoviSim
+        Energy consumed by the worker, in Joules
 
 \struct starpu_profiling_bus_info
 todo

+ 2 - 1
examples/Makefile.am

@@ -3,7 +3,7 @@
 # Copyright (C) 2011-2017                                Inria
 # Copyright (C) 2017                                     Erwan Leria
 # Copyright (C) 2009-2018                                Université de Bordeaux
-# Copyright (C) 2010-2015,2017                           CNRS
+# Copyright (C) 2010-2015,2017,2018                           CNRS
 # Copyright (C) 2011                                     Télécom-SudParis
 # Copyright (C) 2016                                     Uppsala University
 #
@@ -227,6 +227,7 @@ STARPU_EXAMPLES +=				\
 	filters/fmultiple_submit		\
 	filters/fmultiple_submit_readonly	\
 	filters/fmultiple_submit_implicit	\
+	filters/frecursive			\
 	tag_example/tag_example			\
 	tag_example/tag_example2		\
 	tag_example/tag_example3		\

+ 170 - 0
examples/filters/frecursive.c

@@ -0,0 +1,170 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2018                                     CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
+void cpu_codelet(void *buffers[], void *cl_arg)
+{
+        unsigned i, j;
+        int factor;
+
+	starpu_codelet_unpack_args(cl_arg, &factor, 0);
+        /* length of the matrix */
+        unsigned nx = STARPU_MATRIX_GET_NX(buffers[0]);
+        unsigned ny = STARPU_MATRIX_GET_NY(buffers[0]);
+        unsigned ld = STARPU_MATRIX_GET_LD(buffers[0]);
+        /* local copy of the matrix pointer */
+        int *val = (int *)STARPU_MATRIX_GET_PTR(buffers[0]);
+
+	FPRINTF(stderr, "computing on matrix with nx=%u, ny=%u, ld=%u\n", nx, ny, ld);
+        for(j=0; j<ny ; j++)
+	{
+                for(i=0; i<nx ; i++)
+                        val[(j*ld)+i] *= factor;
+        }
+}
+
+static struct starpu_codelet cl =
+{
+        .cpu_funcs[0] = cpu_codelet,
+        .nbuffers = 1,
+	.modes[0] = STARPU_RW,
+};
+
+#define NX 400
+#define NY 80
+#define LD NX
+#define PARTS 4
+
+int main(void)
+{
+        int *matrix;
+	starpu_data_handle_t matrix_handle;
+	starpu_data_handle_t subhandle_l1[PARTS];
+	starpu_data_handle_t subhandle_l2[PARTS][PARTS];
+	starpu_data_handle_t subhandle_l3[PARTS][PARTS][PARTS];
+	int ret;
+
+	int factor = 12;
+	int n=1;
+	int i,j,k;
+
+        ret = starpu_init(NULL);
+	if (STARPU_UNLIKELY(ret == -ENODEV))
+	{
+		return 77;
+	}
+
+	if (starpu_cpu_worker_get_count() < 1)
+	{
+		FPRINTF(stderr, "This application requires at least 1 cpu worker\n");
+		starpu_shutdown();
+		return 77;
+	}
+
+	matrix = (int*)malloc(NX * NY * sizeof(int));
+        assert(matrix);
+	starpu_matrix_data_register(&matrix_handle, STARPU_MAIN_RAM, (uintptr_t)matrix, LD, NX, NY, sizeof(int));
+
+        for(j=0 ; j<NY ; j++)
+	{
+                for(i=0 ; i<NX ; i++)
+		{
+                        matrix[(j*LD)+i] = n++;
+                }
+        }
+
+	/* Split the matrix in PARTS sub-matrices, each sub-matrix in PARTS sub-sub-matrices, and each sub-sub matrix in PARTS sub-sub-sub-matrices */
+	struct starpu_data_filter f =
+	{
+		.filter_func = starpu_matrix_filter_block,
+		.nchildren = PARTS
+	};
+	struct starpu_data_filter f2 =
+	{
+		.filter_func = starpu_matrix_filter_vertical_block,
+		.nchildren = PARTS
+	};
+	starpu_data_partition_plan(matrix_handle, &f, subhandle_l1);
+	for(i=0 ; i<PARTS ; i++)
+	{
+		starpu_data_partition_plan(subhandle_l1[i], &f2, subhandle_l2[i]);
+		for(j=0 ; j<PARTS ; j++)
+		{
+			starpu_data_partition_plan(subhandle_l2[i][j], &f, subhandle_l3[i][j]);
+		}
+	}
+
+        /* Submit a task on the first sub-matrix, on the first sub-sub-matrix of each of the other sub-matrices, and on all the sub-sub-sub-matrices of the remaining sub-sub-matrices */
+	ret = starpu_task_insert(&cl,
+				 STARPU_RW, subhandle_l1[0],
+				 STARPU_VALUE, &factor, sizeof(factor),
+				 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+	for (i=1; i<PARTS; i++)
+	{
+		ret = starpu_task_insert(&cl,
+					 STARPU_RW, subhandle_l2[i][0],
+					 STARPU_VALUE, &factor, sizeof(factor),
+					 0);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+		for (j=1; j<PARTS; j++)
+		{
+			for (k=0; k<PARTS; k++)
+			{
+				ret = starpu_task_insert(&cl,
+							 STARPU_RW, subhandle_l3[i][j][k],
+							 STARPU_VALUE, &factor, sizeof(factor),
+							 0);
+				STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+			}
+		}
+	}
+
+	for(i=0 ; i<PARTS ; i++)
+	{
+		for(j=0 ; j<PARTS ; j++)
+		{
+			starpu_data_partition_clean(subhandle_l2[i][j], PARTS, subhandle_l3[i][j]);
+
+		}
+		starpu_data_partition_clean(subhandle_l1[i], PARTS, subhandle_l2[i]);
+	}
+	starpu_data_partition_clean(matrix_handle, PARTS, subhandle_l1);
+	starpu_data_unregister(matrix_handle);
+
+	/* Check the result matrix */
+	n=1;
+	for(j=0 ; j<NY ; j++)
+	{
+		for(i=0 ; i<NX ; i++)
+		{
+			if (matrix[(j*LD)+i] != (int) n*12)
+			{
+				FPRINTF(stderr, "Incorrect result %4d != %4d\n", matrix[(j*LD)+i], n*12);
+				ret=1;
+			}
+			n++;
+		}
+	}
+
+	free(matrix);
+        starpu_shutdown();
+
+	return ret;
+}

+ 1 - 1
src/common/fxt.h

@@ -620,7 +620,7 @@ do {									\
 		}							\
 		const size_t __job_size = _starpu_job_get_data_size((job)->task->cl?(job)->task->cl->model:NULL, perf_arch, nimpl, (job));	\
 		const uint32_t __job_hash = _starpu_compute_buffers_footprint((job)->task->cl?(job)->task->cl->model:NULL, perf_arch, nimpl, (job));\
-		FUT_DO_PROBE7(_STARPU_FUT_CODELET_DETAILS, ((job)->task)->sched_ctx, __job_size, __job_hash, (job)->task->flops / 1000, (job)->task->tag_id, workerid, ((job)->job_id)); \
+		FUT_DO_PROBE7(_STARPU_FUT_CODELET_DETAILS, ((job)->task)->sched_ctx, __job_size, __job_hash, (job)->task->flops / 1000 / ((job)->task->cl && job->task->cl->type != STARPU_SEQ ? j->task_size : 1), (job)->task->tag_id, workerid, ((job)->job_id)); \
 	}								\
 } while(0);
 

+ 6 - 1
src/common/prio_list.h

@@ -167,7 +167,12 @@
 	{ \
 		/* Sort by decreasing order */ \
 		const struct ENAME##_prio_list_stage *e2 = ENAME##_node_to_list_stage_const(node); \
-		return (e2->prio - prio); \
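+		/* Compare explicitly rather than returning (e2->prio - prio), which could overflow for extreme priority values */ \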
+		if (e2->prio < prio) \
+			return -1; \
+		if (e2->prio == prio) \
+			return 0; \
+		/* e2->prio > prio */ \
+		return 1; \
 	} \
 	PRIO_LIST_INLINE struct ENAME##_prio_list_stage *ENAME##_prio_list_add(struct ENAME##_prio_list *priolist, int prio) \
 	{ \

+ 4 - 2
src/core/perfmodel/perfmodel_history.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011-2013,2016-2017                      Inria
- * Copyright (C) 2008-2017                                Université de Bordeaux
+ * Copyright (C) 2008-2018                                Université de Bordeaux
  * Copyright (C) 2010-2017                                CNRS
  * Copyright (C) 2013                                     Thibaut Lambert
  * Copyright (C) 2011                                     Télécom-SudParis
@@ -477,6 +477,8 @@ static void scan_reg_model(FILE *f, const char *path, struct starpu_perfmodel_re
 			multi_invalid = (multi_invalid||isnan(reg_model->coeff[i]));
 		}
 		reg_model->multi_valid = !multi_invalid;
+		res = fscanf(f, "\n");
+		STARPU_ASSERT_MSG(res == 0, "Incorrect performance model file %s", path);
 	}
 }
 
@@ -1763,7 +1765,7 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 
 					unsigned n = entry->nsample;
 					entry->mean = entry->sum / n;
-					entry->deviation = sqrt((fabs(entry->sum2 - (entry->sum*entry->sum))/n)/n);
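+					/* deviation = sqrt(variance), with variance = (sum(x^2) - (sum x)^2/n) / n */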
+					entry->deviation = sqrt((fabs(entry->sum2 - (entry->sum*entry->sum)/n))/n);
 				}
 
 				if (j->task->flops != 0.)

+ 2 - 0
src/core/sched_policy.c

@@ -397,6 +397,7 @@ static int _starpu_push_task_on_specific_worker(struct starpu_task *task, int wo
 			struct starpu_task *alias = starpu_task_dup(task);
 			alias->destroy = 1;
 
+			_STARPU_TRACE_JOB_PUSH(alias, alias->priority > 0);
 			worker = _starpu_get_worker_struct(combined_workerid[j]);
 			ret |= _starpu_push_local_task(worker, alias, 0);
 		}
@@ -581,6 +582,7 @@ int _starpu_push_task_to_workers(struct starpu_task *task)
 					if (job->task_size > 1)
 					{
 						alias = starpu_task_dup(task);
+						_STARPU_TRACE_JOB_PUSH(alias, alias->priority > 0);
 						alias->destroy = 1;
 					}
 					else

+ 1 - 1
src/datawizard/malloc.c

@@ -360,7 +360,7 @@ int _starpu_malloc_flags_on_node(unsigned dst_node, void **A, size_t dim, int fl
 				ret = -ENOMEM;
 		}
 
-#if defined(STARPU_SIMGRID) || defined(STARPU_USE_CUDA)
+#if (defined(STARPU_SIMGRID) && (SIMGRID_VERSION < 31500 || SIMGRID_VERSION == 31559)) || defined(STARPU_USE_CUDA)
 end:
 #endif
 	if (ret == 0)

+ 11 - 1
src/drivers/cpu/driver_cpu.c

@@ -107,12 +107,22 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_
 			/* rebind to single CPU */
 			_starpu_bind_thread_on_cpu(cpu_args->bindid, cpu_args->workerid);
 	}
+	else
+	{
+		_STARPU_TRACE_START_EXECUTING();
+	}
+
+	if (is_parallel_task)
+	{
+		STARPU_PTHREAD_BARRIER_WAIT(&j->after_work_barrier);
+		if (rank != 0)
+			_STARPU_TRACE_END_EXECUTING();
+	}
 
 	_starpu_driver_end_job(cpu_args, j, perf_arch, rank, profiling);
 
 	if (is_parallel_task)
 	{
-		STARPU_PTHREAD_BARRIER_WAIT(&j->after_work_barrier);
 #ifdef STARPU_SIMGRID
 		if (rank == 0)
 		{

+ 48 - 1
src/drivers/cuda/driver_cuda.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011-2012,2014,2016-2017                 Inria
- * Copyright (C) 2008-2017                                Université de Bordeaux
+ * Copyright (C) 2008-2018                                Université de Bordeaux
  * Copyright (C) 2010                                     Mehdi Juhoor
  * Copyright (C) 2010-2017                                CNRS
  * Copyright (C) 2011                                     Télécom-SudParis
@@ -31,6 +31,9 @@
 #ifdef HAVE_CUDA_GL_INTEROP_H
 #include <cuda_gl_interop.h>
 #endif
+#ifdef HAVE_LIBNVIDIA_ML
+#include <nvml.h>
+#endif
 #include <datawizard/memory_manager.h>
 #include <datawizard/memory_nodes.h>
 #include <datawizard/malloc.h>
@@ -53,9 +56,13 @@
 static int ncudagpus = -1;
 
 static size_t global_mem[STARPU_MAXCUDADEVS];
+#ifdef HAVE_LIBNVIDIA_ML
+static nvmlDevice_t nvmlDev[STARPU_MAXCUDADEVS];
+#endif
 int _starpu_cuda_bus_ids[STARPU_MAXCUDADEVS+STARPU_MAXNUMANODES][STARPU_MAXCUDADEVS+STARPU_MAXNUMANODES];
 #ifdef STARPU_USE_CUDA
 static cudaStream_t streams[STARPU_NMAXWORKERS];
+static char used_stream[STARPU_NMAXWORKERS];
 static cudaStream_t out_transfer_streams[STARPU_MAXCUDADEVS];
 static cudaStream_t in_transfer_streams[STARPU_MAXCUDADEVS];
 /* Note: streams are not thread-safe, so we define them for each CUDA worker
@@ -106,6 +113,9 @@ _starpu_cuda_discover_devices (struct _starpu_machine_config *config)
 	if (STARPU_UNLIKELY(cures != cudaSuccess))
 		cnt = 0;
 	config->topology.nhwcudagpus = cnt;
+#ifdef HAVE_LIBNVIDIA_ML
+	nvmlInit();
+#endif
 #endif
 }
 
@@ -215,6 +225,7 @@ cudaStream_t starpu_cuda_get_local_stream(void)
 {
 	int worker = starpu_worker_get_id_check();
 
+	used_stream[worker] = 1;
 	return streams[worker];
 }
 
@@ -520,7 +531,30 @@ static int start_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *worke
 			_starpu_simgrid_submit_job(workerid, j, &worker->perf_arch, NAN,
 				async ? &task_finished[workerid][pipeline_idx] : NULL);
 #else
+#ifdef HAVE_LIBNVIDIA_ML
+		unsigned long long energy_start = 0;
+		nvmlReturn_t nvmlRet = -1;
+		if (profiling || (cl->energy_model && cl->energy_model->benchmarking))
+		{
+			nvmlRet = nvmlDeviceGetTotalEnergyConsumption(nvmlDev[worker->devid], &energy_start);
+		}
+#endif
+
 		func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
+
+#ifdef HAVE_LIBNVIDIA_ML
+		if (nvmlRet == NVML_SUCCESS &&
+			(profiling || (cl->energy_model && cl->energy_model->benchmarking)))
+		{
+			unsigned long long energy_end;
+			nvmlRet = nvmlDeviceGetTotalEnergyConsumption(nvmlDev[worker->devid], &energy_end);
+#ifdef STARPU_DEVEL
+#warning TODO: measure idle consumption to subtract it
+#endif
+			if (nvmlRet == NVML_SUCCESS)
+				task->profiling_info->energy_consumed += (energy_end - energy_start) / 1000.;
+		}
+#endif
 #endif
 		_STARPU_TRACE_END_EXECUTING();
 	}
@@ -581,6 +615,14 @@ static void execute_job_on_cuda(struct starpu_task *task, struct _starpu_worker
 		}
 	}
 
+#ifndef STARPU_SIMGRID
+	if (!used_stream[workerid])
+	{
+		used_stream[workerid] = 1;
+		_STARPU_DISP("Warning: starpu_cuda_get_local_stream() was not used to submit kernel to CUDA on worker %d. CUDA will thus introduce a lot of useless synchronizations, which will prevent proper overlapping of data transfers and kernel execution. See the CUDA-specific part of the 'Check List When Performance Are Not There' of the StarPU handbook\n", workerid);
+	}
+#endif
+
 	if (task->cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC)
 	{
 		if (worker->pipeline_length == 0)
@@ -682,6 +724,11 @@ int _starpu_cuda_driver_init(struct _starpu_worker_set *worker_set)
 
 #if defined(STARPU_HAVE_BUSID) && !defined(STARPU_SIMGRID)
 #if defined(STARPU_HAVE_DOMAINID) && !defined(STARPU_SIMGRID)
+#ifdef HAVE_LIBNVIDIA_ML
+		char busid[13];
+		snprintf(busid, sizeof(busid), "%04x:%02x:%02x.0", props[devid].pciDomainID, props[devid].pciBusID, props[devid].pciDeviceID);
+		nvmlDeviceGetHandleByPciBusId(busid, &nvmlDev[devid]);
+#endif
 		if (props[devid].pciDomainID)
 			snprintf(worker->name, sizeof(worker->name), "CUDA %u.%u (%s %.1f GiB %04x:%02x:%02x.0)", devid, subdev, devname, size, props[devid].pciDomainID, props[devid].pciBusID, props[devid].pciDeviceID);
 		else

+ 3 - 1
src/sched_policies/component_worker.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2011-2014,2017                           Inria
  * Copyright (C) 2010-2012,2014-2017                      CNRS
- * Copyright (C) 2010-2017                                Université de Bordeaux
+ * Copyright (C) 2010-2018                                Université de Bordeaux
  * Copyright (C) 2011                                     Télécom-SudParis
  * Copyright (C) 2013                                     Simon Archipoff
  *
@@ -631,6 +631,7 @@ static int combined_worker_push_task(struct starpu_sched_component * component,
 	task_alias[0]->task->destroy = 1;
 	task_alias[0]->left = NULL;
 	task_alias[0]->ntasks = combined_worker->worker_size;
+	_STARPU_TRACE_JOB_PUSH(task_alias[0]->task, task_alias[0]->task->priority > 0);
 	int i;
 	for(i = 1; i < combined_worker->worker_size; i++)
 	{
@@ -641,6 +642,7 @@ static int combined_worker_push_task(struct starpu_sched_component * component,
 		task_alias[i]->left = task_alias[i-1];
 		task_alias[i - 1]->right = task_alias[i];
 		task_alias[i]->pntasks = &(task_alias[0]->ntasks);
+		_STARPU_TRACE_JOB_PUSH(task_alias[i]->task, task_alias[i]->task->priority > 0);
 	}
 
 	starpu_pthread_mutex_t * mutex_to_unlock = NULL;

+ 3 - 0
src/sched_policies/parallel_eager.c

@@ -342,6 +342,7 @@ static struct starpu_task *pop_task_peager_policy(unsigned sched_ctx_id)
 		struct starpu_task *alias = starpu_task_dup(task);
 		int local_worker = combined_workerid[i];
 		alias->destroy = 1;
+		_STARPU_TRACE_JOB_PUSH(alias, alias->priority > 0);
 		_starpu_fifo_push_task(data->local_fifo[local_worker], alias);
 	}
 
@@ -352,6 +353,8 @@ static struct starpu_task *pop_task_peager_policy(unsigned sched_ctx_id)
 
 	STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex);
 
+	_STARPU_TRACE_JOB_PUSH(master_alias, master_alias->priority > 0);
+
 	for (i = 1; i < worker_size; i++)
 	{
 		int local_worker = combined_workerid[i];

+ 2 - 1
src/sched_policies/parallel_heft.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011-2013,2015,2017                      Inria
- * Copyright (C) 2010-2017                                Université de Bordeaux
+ * Copyright (C) 2010-2018                                Université de Bordeaux
  * Copyright (C) 2011-2017                                CNRS
  * Copyright (C) 2011                                     Télécom-SudParis
  *
@@ -175,6 +175,7 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 			ntasks[local_combined_workerid]++;
 			_starpu_worker_unlock(local_combined_workerid);
 
+			_STARPU_TRACE_JOB_PUSH(alias, alias->priority > 0);
 			ret |= starpu_push_local_task(local_combined_workerid, alias, prio);
 		}
 

+ 238 - 195
starpurm/dev/chameleon_test/dgemm.c

@@ -1,3 +1,21 @@
+/* StarPURM --- StarPU Resource Management Layer.
+ *
+ * Copyright (C) 2017, 2018  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/* This example shows a basic DGEMM test running on top of StarPURM, using the Chameleon/MORSE library */
+
 #define _GNU_SOURCE
 #include <sched.h>
 #include <stdio.h>
@@ -6,18 +24,56 @@
 #include <morse.h>
 #include <starpurm.h>
 #include <hwloc.h>
+#include <pthread.h>
+
+#define CHECK
 
 static int rm_cpu_type_id = -1;
+static int rm_cuda_type_id = -1;
 static int rm_nb_cpu_units = 0;
+static int rm_nb_cuda_units = 0;
+static const int nb_random_tests = 10;
 
-static void test1();
-static void init_rm_infos(void);
+static unsigned spawn_pending = 0;
+static pthread_mutex_t spawn_pending_mutex = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t spawn_pending_cond;
 
-static const int nb_random_tests = 10;
+static void _inc_spawn_pending(void)
+{
+	pthread_mutex_lock(&spawn_pending_mutex);
+	assert(spawn_pending < UINT_MAX);
+	spawn_pending++;
+	pthread_mutex_unlock(&spawn_pending_mutex);
+}
 
-static void test1()
+static void _dec_spawn_pending(void)
 {
-	int i;
+	pthread_mutex_lock(&spawn_pending_mutex);
+	assert(spawn_pending > 0);
+	spawn_pending--;
+	if (spawn_pending == 0)
+		pthread_cond_broadcast(&spawn_pending_cond);
+	pthread_mutex_unlock(&spawn_pending_mutex);
+}
+
+static void _wait_pending_spawns(void)
+{
+	pthread_mutex_lock(&spawn_pending_mutex);
+	while (spawn_pending > 0)
+		pthread_cond_wait(&spawn_pending_cond, &spawn_pending_mutex);
+	pthread_mutex_unlock(&spawn_pending_mutex);
+}
+
+static void spawn_callback(void *_arg)
+{
+	assert(42 == (uintptr_t)_arg);
+	_dec_spawn_pending();
+}
+
+static void usage(void)
+{
+	fprintf(stderr, "dgemm: M N K <trans_A=T|N> <trans_B=T|N>\n");
+	exit(EXIT_FAILURE);
 }
 
 static void init_rm_infos(void)
@@ -30,236 +86,223 @@ static void init_rm_infos(void)
 		exit(77);
 	}
 
+	int cuda_type = starpurm_get_device_type_id("cuda");
+	int nb_cuda_units = starpurm_get_nb_devices_by_type(cuda_type);
+
 	rm_cpu_type_id = cpu_type;
+	rm_cuda_type_id = cuda_type;
 	rm_nb_cpu_units = nb_cpu_units;
+	rm_nb_cuda_units = nb_cuda_units;
 }
 
-static void disp_selected_cpuset(void)
+
+static void disp_cpuset(hwloc_cpuset_t selected_cpuset)
 {
-	hwloc_cpuset_t selected_cpuset = starpurm_get_selected_cpuset();
+	//hwloc_cpuset_t selected_cpuset = starpurm_get_selected_cpuset();
 	int strl = hwloc_bitmap_snprintf(NULL, 0, selected_cpuset);
 	char str[strl+1];
 	hwloc_bitmap_snprintf(str, strl+1, selected_cpuset);
-	printf("selected cpuset = %s\n", str);
+	printf("%llx: selected cpuset = %s\n", (unsigned long long)pthread_self(), str);
 }
 
-int main( int argc, char const *argv[])
+struct s_test_args
 {
-	starpurm_initialize();
-	init_rm_infos();
-	printf("using default units\n");
-	disp_selected_cpuset();
-	test1();
-	starpurm_shutdown();
-#if 0
+	const int m;
+	const int n;
+	const int k;
+	int transA;
+	int transB;
+};
 
-	if(argc < 6 || argc > 6)
-	{ 		
-		fprintf(stderr, "Usage: ./test_dgemm M N K TRANS_A TRANS_B\n" );
-		return 1;
-	}
-	
-	// Local variables
-	int i, j;
-	int m, n, k;
-	const char *transA_input = NULL;
-	const char *transB_input = NULL;
-	enum DDSS_TRANS transA = Trans;
-	enum DDSS_TRANS transB = Trans;
-	double alpha; 
-	double beta;
-	double error;
-	double max_error;
-	double count_error;	
-	double *A;
-	double *B;
-	double *C;
-	double *C_test;
-	struct timeval start, end;
-	double flops;
-	double flops_ddss; 
-	double flops_ref; 
-	int ret;
-	m = atoi( argv[1] );
-	n = atoi( argv[2] );
-	k = atoi( argv[3] );
-	
-	if ( strlen( argv[4] ) != 1 ) 
-	{
-		fprintf(stderr,"Illegal value of TRANS_A, TRANS_A can be T or N\n");
-		return 1;
-	}
-	transA_input = argv[4];	
-	
-	if ( strlen( argv[5] ) != 1 ) 
+static void test(void *_args)
+{
+	struct s_test_args *args = _args;
+	const int m = args->m;
+	const int n = args->n;
+	const int k = args->k;
+	int transA = args->transA;
+	int transB = args->transB;
+	unsigned rand_seed = (unsigned)time(NULL);
+	double *A = malloc(m * k * sizeof(double));
+	double *B = malloc(k * n * sizeof(double));
+	double *C = calloc(m * n, sizeof(double));
+	double *C_test = calloc(m * n, sizeof(double));
+
+	const double alpha = (double)rand_r(&rand_seed) / ((double)rand_r(&rand_seed) + DBL_MIN);
+	const double beta  = (double)rand_r(&rand_seed) / ((double)rand_r(&rand_seed) + DBL_MIN);
+ 
+	int i;
+	for (i = 0; i < m; i++)
 	{
-		fprintf(stderr,"Illegal value of TRANS_B, TRANS_B can be T or N\n");
-		return 1;
+		int j;
+		for (j = 0; j < n; j++)
+		{
+			A[i*n+j] = (double)rand_r(&rand_seed) / ((double)rand_r(&rand_seed) + DBL_MIN);
+			B[i*n+j] = (double)rand_r(&rand_seed) / ((double)rand_r(&rand_seed) + DBL_MIN);
+		}
 	}
-	transB_input = argv[5];	
 
-	// Set seed 
-	srand(time(NULL));
+	int res = MORSE_dgemm(transA, transB, m, n, k, alpha, A, k, B, n, beta, C, n);
+#ifdef CHECK
+	/* Check */
+	cblas_dgemm( CblasColMajor, 
+			( CBLAS_TRANSPOSE ) transA,
+			( CBLAS_TRANSPOSE ) transB,
+			m, n, k,
+			alpha, A, k,
+			B, n,
+			beta, C_test, n );
 
-	max_error = 1.0;
-	count_error = 0.0;
+	double C_test_inorm = LAPACKE_dlange(CblasColMajor, 'I', m, n, C_test, n);
+	cblas_daxpy(m*n, -1, C, 1, C_test, 1);
+	double inorm = LAPACKE_dlange(CblasColMajor, 'I', m, n, C_test, n);
+	printf("%llx: ||C_test-C||_I / ||C_test||_I = %e\n", (unsigned long long)pthread_self(), inorm/C_test_inorm);
+#endif
+	free(A);
+	free(B);
+	free(C);
+	free(C_test);
+}
 
-	// Checking inputs
-	if ( m < 0 )
-	{
-		fprintf(stderr, "Illegal value of M, M must be >= 0\n");
-		return 1;
-	}
-	if ( n < 0 )
-	{
-		fprintf(stderr, "Illegal value of N, N must be >= 0\n");
-		return 1;
-	}
-	if ( k < 0 )
+static void select_units(hwloc_cpuset_t selected_cpuset, hwloc_cpuset_t available_cpuset, int offset, int nb)
+{
+	int first_idx = hwloc_bitmap_first(available_cpuset);
+	int last_idx = hwloc_bitmap_last(available_cpuset);
+	int count = 0;
+	int idx = first_idx;
+	while (idx != -1 && idx <= last_idx && count < offset+nb)
 	{
-		fprintf(stderr, "Illegal value of K, K must be >= 0\n");
-		return 1;
+		if (hwloc_bitmap_isset(available_cpuset, idx))
+		{
+			if (count >= offset)
+			{
+				hwloc_bitmap_set(selected_cpuset, idx);
+			}
+			count ++;
+		}
+		idx = hwloc_bitmap_next(available_cpuset, idx);
 	}
+	assert(count == offset+nb);
+}
+
+void spawn_tests(int cpu_offset, int cpu_nb, int cuda_offset, int cuda_nb, void *args)
+{
+	if (cpu_offset + cpu_nb > rm_nb_cpu_units)
+		exit(77);
+	if (cuda_offset + cuda_nb > rm_nb_cuda_units)
+		exit(77);
+	hwloc_cpuset_t cpu_cpuset = starpurm_get_all_cpu_workers_cpuset();
+	hwloc_cpuset_t cuda_cpuset = starpurm_get_all_device_workers_cpuset_by_type(rm_cuda_type_id);
+	hwloc_cpuset_t sel_cpuset = hwloc_bitmap_alloc();
+	assert(sel_cpuset != NULL);
+
+	select_units(sel_cpuset, cpu_cpuset, cpu_offset, cpu_nb);
+	select_units(sel_cpuset, cuda_cpuset, cuda_offset, cuda_nb);
 
-	if ( transA_input[0] == 'T' )
-	{
-		transA = Trans;
-	}
-	else if ( transA_input[0] == 'N' )
 	{
-		transA = NoTrans;
+		int strl1 = hwloc_bitmap_snprintf(NULL, 0, cpu_cpuset);
+		char str1[strl1+1];
+		hwloc_bitmap_snprintf(str1, strl1+1, cpu_cpuset);
+
+		int strl2 = hwloc_bitmap_snprintf(NULL, 0, cuda_cpuset);
+		char str2[strl2+1];
+		hwloc_bitmap_snprintf(str2, strl2+1, cuda_cpuset);
+		printf("all cpus cpuset = %s\n", str1);
+		
+		int strl3 = hwloc_bitmap_snprintf(NULL, 0, sel_cpuset);
+		char str3[strl3+1];
+		hwloc_bitmap_snprintf(str3, strl3+1, sel_cpuset);
+		printf("spawn on selected cpuset = %s (avail cpu %s, avail cuda %s)\n", str3, str1, str2);
 	}
+
+	_inc_spawn_pending();
+	starpurm_spawn_kernel_on_cpus_callback(NULL, test, args, sel_cpuset, spawn_callback, (void*)(uintptr_t)42);
+
+	hwloc_bitmap_free(sel_cpuset);
+	hwloc_bitmap_free(cpu_cpuset);
+	hwloc_bitmap_free(cuda_cpuset);
+}
+
+int main( int argc, char const *argv[])
+{
+	pthread_cond_init(&spawn_pending_cond, NULL);
+
+	int transA = MorseTrans;
+	int transB = MorseTrans;
+
+	if (argc < 6 || argc > 6)
+		usage();
+
+	int m = atoi(argv[1]);
+	if (m < 1)
+		usage();
+	int n = atoi(argv[2]);
+	if (n < 1)
+		usage();
+	int k = atoi(argv[3]);
+	if (k < 1)
+		usage();
+	
+	if (strcmp(argv[4], "T") == 0) 
+		transA = MorseTrans;
+	else if (strcmp(argv[4], "N") == 0) 
+		transA = MorseNoTrans;
 	else
-	{
-		fprintf(stderr, "Illegal value of TRANS_A, TRANS_A can be T or N\n");
-		return 1;
-	}
+		usage();
 	
-	if ( transB_input[0] == 'T' )
-	{
-		transB = Trans;
-	}
-	else if ( transB_input[0] == 'N' )
-	{
-		transB = NoTrans;
-	}
+	if (strcmp(argv[5], "T") == 0) 
+		transB = MorseTrans;
+	else if (strcmp(argv[5], "N") == 0) 
+		transB = MorseNoTrans;
 	else
-	{
-		fprintf(stderr, "Illegal value of TRANS_B, TRANS_B can be T or N\n");
-		return 1;
-	}
+		usage();
 
-	// Matrices allocation
-	A = ( double * ) malloc( sizeof( double ) * m * k );
-	B = ( double * ) malloc( sizeof( double ) * k * n );
-	C = ( double * ) malloc( sizeof( double ) * m * n );
-	C_test = ( double * ) malloc( sizeof( double ) * m * n );
+	srand(time(NULL));
 
-	// Alpha and beta initialization
-	alpha = ( double ) rand() / (double) rand() + DBL_MIN;
-	beta  = ( double ) rand() / (double) rand() + DBL_MIN;
- 
-	// Matrix A, B, C and C_test initialization
-	for ( i = 0; i < m; i++ )
-	{
-		for ( j = 0; j < n; j++ )
-		{
-			A[ i * n + j ] = ( double ) rand() / (double) rand() 
-							  + DBL_MIN;
-			B[ i * n + j ] = ( double ) rand() / (double) rand() 
-							  + DBL_MIN;
-			C[ i * n + j ] = 0.0;
-			C_test[ i * n + j ] = 0.0;
-		}
-	}
+	struct s_test_args test_args = { .m = m, .n = n, .k = k, .transA = transA, .transB = transB };
 
 	/* Test case */
-	{
-		/* pocl_starpu_init */
-		{
-			hwloc_topology_init(&topology);
-			hwloc_topology_load(topology);
-			starpurm_initialize();
-			starpurm_set_drs_enable(NULL);
-		}
+	starpurm_initialize();
+	starpurm_set_drs_enable(NULL);
+	init_rm_infos();
+	printf("cpu units: %d\n", rm_nb_cpu_units);
+	printf("cuda units: %d\n", rm_nb_cuda_units);
+	printf("using default units\n");
+	disp_cpuset(starpurm_get_selected_cpuset());
 
-		/* pocl_starpu_submit_task */
+	MORSE_Init(rm_nb_cpu_units, rm_nb_cuda_units);
+	test(&test_args);
+	{
+		int cpu_offset = 0;
+		int cpu_nb = rm_nb_cpu_units/2;
+		if (cpu_nb == 0 && rm_nb_cpu_units > 0)
 		{
-			/* GLIBC cpu_mask as supplied by POCL */
-			cpu_set_t cpu_mask;
-			CPU_ZERO(&cpu_mask);
-			CPU_SET (0, &cpu_mask);
-			CPU_SET (1, &cpu_mask);
-			CPU_SET (2, &cpu_mask);
-			CPU_SET (3, &cpu_mask);
-
-			/* Convert GLIBC cpu_mask into HWLOC cpuset */
-			hwloc_cpuset_t hwloc_cpuset = hwloc_bitmap_alloc();
-			int status = hwloc_cpuset_from_glibc_sched_affinity(topology, hwloc_cpuset, &cpu_mask, sizeof(cpu_set_t));
-			assert(status == 0);
-
-			/* Reset any unit previously allocated to StarPU */
-			starpurm_withdraw_all_cpus_from_starpu(NULL);
-			/* Enforce new cpu mask */
-			starpurm_assign_cpu_mask_to_starpu(NULL, hwloc_cpuset);
-
-			/* task function */
-			{
-				int TRANS_A = transA==NoTrans?MorseNoTrans:MorseTrans;
-				int TRANS_B = transB==NoTrans?MorseNoTrans:MorseTrans;
-				int M = m;
-				int N = n;
-				int K = k;
-				double ALPHA = alpha;
-				int LDA = k;
-				int LDB = n;
-				double BETA = beta;
-				int LDC = n;
-
-				MORSE_Init(4, 0);
-				int res = MORSE_dgemm(TRANS_A, TRANS_B, M, N, K,
-						ALPHA, A, LDA, B, LDB,
-						BETA, C, LDC);
-				MORSE_Finalize();
-			}
-
-			/* Withdraw all CPU units from StarPU */
-			starpurm_withdraw_all_cpus_from_starpu(NULL);
-
-			hwloc_bitmap_free(hwloc_cpuset);
+			cpu_nb = 1;
 		}
-
-		/* pocl_starpu_shutdown() */
+		int cuda_offset = 0;
+		int cuda_nb = rm_nb_cuda_units/2;
+		if (cuda_nb == 0 && rm_nb_cuda_units > 0)
 		{
-			starpurm_shutdown();
+			cuda_nb = 1;
 		}
+		spawn_tests(cpu_offset, cpu_nb, cuda_offset, cuda_nb, &test_args);
 	}
-
-#if 0
-	/* Check */
-	cblas_dgemm( CblasColMajor, 
-				 ( CBLAS_TRANSPOSE ) transA,
-				 ( CBLAS_TRANSPOSE ) transB,
-									 m, n, k,
-							 		 alpha, A, k,
-							 			    B, n,
-							 		  beta, C_test, n );
-	// Error computation
-	for ( i = 0; i < m; i++ )
 	{
-		for ( j = 0; j < n; j++ )
+		int cpu_offset = rm_nb_cpu_units/2;
+		int cpu_nb = rm_nb_cpu_units/2;
+		if (cpu_nb == 0 && rm_nb_cpu_units > 0)
 		{
-			error = abs( C[ i * n + j ] - C_test[ i * n + j ] );
-			if ( max_error > error )
-				max_error = error;
-			count_error += error;
+			cpu_nb = 1;
 		}
+		int cuda_offset = rm_nb_cuda_units/2;
+		int cuda_nb = rm_nb_cuda_units/2;
+		spawn_tests(cpu_offset, cpu_nb, cuda_offset, cuda_nb, &test_args);
 	}
+	_wait_pending_spawns();
+	MORSE_Finalize();
 
-	fprintf(stdout, "Max. error = %1.2f\n", max_error );
-	fprintf(stdout, "Av. error = %1.2f\n", count_error / ( m * n ) );
-#endif
-#endif
+	starpurm_shutdown();
+	pthread_cond_destroy(&spawn_pending_cond);
 
 	return 0;
 

+ 4 - 1
starpurm/dev/cuda_vector_scale/vector_scale.c

@@ -238,12 +238,15 @@ int main(int argc, char *argv[])
 
 	if (rm_nb_cpu_units > 1 && rm_nb_cuda_units > 1)
 	{
-		const int nb_cpus = rm_nb_cpu_units;
+		int nb_cpus = rm_nb_cpu_units;
 		const int nb_cudas = rm_nb_cuda_units;
 		const int cuda_type = rm_cuda_type_id;
 		printf("nb_cpu_units = %d\n", nb_cpus);
 		printf("nb_cuda_units = %d\n", nb_cudas);
 
+		/* Keep at least one CPU core */
+		nb_cpus--;
+
 		starpurm_set_drs_enable(NULL);
 		drs_enabled = starpurm_drs_enabled_p();
 		assert(drs_enabled != 0);

+ 1 - 0
starpurm/include/starpurm.h

@@ -138,6 +138,7 @@ hwloc_cpuset_t starpurm_get_global_cpuset(void);
 hwloc_cpuset_t starpurm_get_selected_cpuset(void);
 hwloc_cpuset_t starpurm_get_all_cpu_workers_cpuset(void);
 hwloc_cpuset_t starpurm_get_all_device_workers_cpuset(void);
+hwloc_cpuset_t starpurm_get_all_device_workers_cpuset_by_type(int type_id);
 
 #ifdef __cplusplus
 }

+ 59 - 0
starpurm/src/starpurm.c

@@ -603,8 +603,19 @@ void starpurm_initialize(void)
 	hwloc_topology_load(rm->topology);
 	rm->global_cpuset = hwloc_bitmap_alloc();
 	hwloc_bitmap_zero(rm->global_cpuset);
+	
 	rm->all_cpu_workers_cpuset = hwloc_bitmap_alloc();
 	hwloc_bitmap_zero(rm->all_cpu_workers_cpuset);
+	
+	rm->all_opencl_device_workers_cpuset = hwloc_bitmap_alloc();
+	hwloc_bitmap_zero(rm->all_opencl_device_workers_cpuset);
+	
+	rm->all_cuda_device_workers_cpuset = hwloc_bitmap_alloc();
+	hwloc_bitmap_zero(rm->all_cuda_device_workers_cpuset);
+	
+	rm->all_mic_device_workers_cpuset = hwloc_bitmap_alloc();
+	hwloc_bitmap_zero(rm->all_mic_device_workers_cpuset);
+
 	rm->all_device_workers_cpuset = hwloc_bitmap_alloc();
 	hwloc_bitmap_zero(rm->all_device_workers_cpuset);
 
@@ -705,6 +716,7 @@ void starpurm_initialize(void)
 		rm->units[unitid].worker_cpuset = starpu_worker_get_hwloc_cpuset(rm->units[unitid].workerid);
 		pthread_cond_init(&rm->units[unitid].unit_available_cond, NULL);
 		hwloc_bitmap_or(rm->global_cpuset, rm->global_cpuset, rm->units[unitid].worker_cpuset);
+		hwloc_bitmap_or(rm->all_opencl_device_workers_cpuset, rm->all_opencl_device_workers_cpuset, rm->units[unitid].worker_cpuset);
 		hwloc_bitmap_or(rm->all_device_workers_cpuset, rm->all_device_workers_cpuset, rm->units[unitid].worker_cpuset);
 		unitid++;
 	}
@@ -725,6 +737,7 @@ void starpurm_initialize(void)
 		rm->units[unitid].worker_cpuset = starpu_worker_get_hwloc_cpuset(rm->units[unitid].workerid);
 		pthread_cond_init(&rm->units[unitid].unit_available_cond, NULL);
 		hwloc_bitmap_or(rm->global_cpuset, rm->global_cpuset, rm->units[unitid].worker_cpuset);
+		hwloc_bitmap_or(rm->all_cuda_device_workers_cpuset, rm->all_cuda_device_workers_cpuset, rm->units[unitid].worker_cpuset);
 		hwloc_bitmap_or(rm->all_device_workers_cpuset, rm->all_device_workers_cpuset, rm->units[unitid].worker_cpuset);
 		unitid++;
 	}
@@ -745,6 +758,7 @@ void starpurm_initialize(void)
 		rm->units[unitid].worker_cpuset = starpu_worker_get_hwloc_cpuset(rm->units[unitid].workerid);
 		pthread_cond_init(&rm->units[unitid].unit_available_cond, NULL);
 		hwloc_bitmap_or(rm->global_cpuset, rm->global_cpuset, rm->units[unitid].worker_cpuset);
+		hwloc_bitmap_or(rm->all_mic_device_workers_cpuset, rm->all_mic_device_workers_cpuset, rm->units[unitid].worker_cpuset);
 		hwloc_bitmap_or(rm->all_device_workers_cpuset, rm->all_device_workers_cpuset, rm->units[unitid].worker_cpuset);
 		unitid++;
 	}
@@ -851,6 +865,9 @@ void starpurm_shutdown(void)
 
 	hwloc_bitmap_free(rm->global_cpuset);
 	hwloc_bitmap_free(rm->all_cpu_workers_cpuset);
+	hwloc_bitmap_free(rm->all_opencl_device_workers_cpuset);
+	hwloc_bitmap_free(rm->all_cuda_device_workers_cpuset);
+	hwloc_bitmap_free(rm->all_mic_device_workers_cpuset);
 	hwloc_bitmap_free(rm->all_device_workers_cpuset);
 	hwloc_bitmap_free(rm->selected_cpuset);
 
@@ -1576,6 +1593,33 @@ hwloc_cpuset_t starpurm_get_all_cpu_workers_cpuset(void)
 	return hwloc_bitmap_dup(rm->all_cpu_workers_cpuset);
 }
 
+static hwloc_cpuset_t starpurm_get_all_opencl_device_workers_cpuset(void)
+{
+	assert(_starpurm != NULL);
+	assert(_starpurm->state != state_uninitialized);
+	struct s_starpurm *rm = _starpurm;
+
+	return hwloc_bitmap_dup(rm->all_opencl_device_workers_cpuset);
+}
+
+static hwloc_cpuset_t starpurm_get_all_cuda_device_workers_cpuset(void)
+{
+	assert(_starpurm != NULL);
+	assert(_starpurm->state != state_uninitialized);
+	struct s_starpurm *rm = _starpurm;
+
+	return hwloc_bitmap_dup(rm->all_cuda_device_workers_cpuset);
+}
+
+static hwloc_cpuset_t starpurm_get_all_mic_device_workers_cpuset(void)
+{
+	assert(_starpurm != NULL);
+	assert(_starpurm->state != state_uninitialized);
+	struct s_starpurm *rm = _starpurm;
+
+	return hwloc_bitmap_dup(rm->all_mic_device_workers_cpuset);
+}
+
 hwloc_cpuset_t starpurm_get_all_device_workers_cpuset(void)
 {
 	assert(_starpurm != NULL);
@@ -1585,3 +1629,18 @@ hwloc_cpuset_t starpurm_get_all_device_workers_cpuset(void)
 	return hwloc_bitmap_dup(rm->all_device_workers_cpuset);
 }
 
+hwloc_cpuset_t starpurm_get_all_device_workers_cpuset_by_type(int type_id)
+{
+	assert(_starpurm != NULL);
+	assert(_starpurm->state != state_uninitialized);
+	assert(type_id != starpurm_unit_cpu);
+	if (type_id == starpurm_unit_opencl)
+		return starpurm_get_all_opencl_device_workers_cpuset();
+	if (type_id == starpurm_unit_cuda)
+		return starpurm_get_all_cuda_device_workers_cpuset();
+	if (type_id == starpurm_unit_mic)
+		return starpurm_get_all_mic_device_workers_cpuset();
+	hwloc_cpuset_t empty_bitmap = hwloc_bitmap_alloc();
+	hwloc_bitmap_zero(empty_bitmap);
+	return empty_bitmap;
+}

+ 4 - 1
starpurm/src/starpurm_dlb.c

@@ -22,12 +22,15 @@
 #include <stdio.h>
 #include <string.h>
 #include <assert.h>
+#include <config.h>
+
 #include <hwloc.h>
+#ifdef HAVE_HWLOC_GLIBC_SCHED_H
 #include <hwloc/glibc-sched.h>
+#endif
 #include <pthread.h>
 #include <starpu.h>
 #include <starpurm.h>
-#include <config.h>
 #include <starpurm_private.h>
 
 #ifndef STARPURM_HAVE_DLB

+ 9 - 0
starpurm/src/starpurm_private.h

@@ -79,6 +79,15 @@ struct s_starpurm
 	/* Cpuset of all StarPU CPU workers. */
 	hwloc_cpuset_t all_cpu_workers_cpuset;
 
+	/* Cpuset of all StarPU OpenCL workers. */
+	hwloc_cpuset_t all_opencl_device_workers_cpuset;
+
+	/* Cpuset of all StarPU CUDA workers. */
+	hwloc_cpuset_t all_cuda_device_workers_cpuset;
+
+	/* Cpuset of all StarPU MIC workers. */
+	hwloc_cpuset_t all_mic_device_workers_cpuset;
+
 	/* Cpuset of all StarPU device workers. */
 	hwloc_cpuset_t all_device_workers_cpuset;