@@ -0,0 +1,720 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2015 Université de Bordeaux
+ * Copyright (C) 2015 INRIA
+ * Copyright (C) 2015 CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/* This file provides an interface to manage clustering resources and to make
+ * use of parallel tasks. It relies entirely on the hwloc library for topology
+ * discovery. */
+
+#include "starpu_clusters_create.h"
+
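+/* Typical usage (illustrative sketch, not part of this file's build; the
+ * exact varargs shown are only one example of what starpu_cluster_machine()
+ * accepts):
+ *
+ *   struct starpu_cluster_machine *clusters =
+ *       starpu_cluster_machine(HWLOC_OBJ_SOCKET,
+ *                              STARPU_CLUSTER_NB, 2,
+ *                              STARPU_CLUSTER_TYPE, OPENMP,
+ *                              0);
+ *   // ... submit tasks, e.g. with STARPU_POSSIBLY_PARALLEL set ...
+ *   starpu_cluster_print(clusters);
+ *   starpu_uncluster_machine(clusters);
+ */
+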
+starpu_binding_function _starpu_cluster_type_get_func(starpu_cluster_types type)
+{
+    starpu_binding_function prologue_func;
+
+    switch (type)
+    {
+    case OPENMP:
+        prologue_func = &starpu_openmp_prologue;
+        break;
+    case INTEL_OPENMP_MKL:
+        prologue_func = &starpu_intel_openmp_mkl_prologue;
+        break;
+#ifdef STARPU_MKL
+    case GNU_OPENMP_MKL:
+        prologue_func = &starpu_gnu_openmp_mkl_prologue;
+        break;
+#endif
+    default:
+        prologue_func = NULL;
+    }
+
+    return prologue_func;
+}
+
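+/* These predefined prologues can be bypassed by passing a user function
+ * through STARPU_CLUSTER_CREATE_FUNC (plus STARPU_CLUSTER_CREATE_FUNC_ARG).
+ * Hypothetical sketch of such a function, assuming the application passes
+ * the cluster's sched_ctx id as the argument, as the default prologues get:
+ *
+ *   void my_binding_prologue(void *arg)
+ *   {
+ *       int ctx = *(int*) arg;
+ *       int *cpuids = NULL, ncpuids = 0;
+ *       starpu_sched_ctx_get_available_cpuids(ctx, &cpuids, &ncpuids);
+ *       // spawn/bind the threads of another runtime onto cpuids here
+ *       free(cpuids);
+ *   }
+ */
+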
+/* Prologue executed when the parallel task is popped: create an OpenMP team
+ * sized to the scheduling context and bind each thread to one of the
+ * context's CPUs. */
+void starpu_openmp_prologue(void *sched_ctx_id)
+{
+    int sched_ctx = *(int*)sched_ctx_id;
+    int *cpuids = NULL;
+    int ncpuids = 0;
+    int workerid = starpu_worker_get_id();
+
+    if (starpu_worker_get_type(workerid) == STARPU_CPU_WORKER)
+    {
+        starpu_sched_ctx_get_available_cpuids(sched_ctx, &cpuids, &ncpuids);
+        omp_set_num_threads(ncpuids);
+#pragma omp parallel
+        {
+            starpu_sched_ctx_bind_current_thread_to_cpuid(cpuids[omp_get_thread_num()]);
+        }
+        free(cpuids);
+    }
+    return;
+}
+
+#ifdef STARPU_MKL
+/* Same as starpu_openmp_prologue, but also pin the MKL thread count to the
+ * team size and disable MKL's dynamic thread adjustment. */
+void starpu_gnu_openmp_mkl_prologue(void *sched_ctx_id)
+{
+    int sched_ctx = *(int*)sched_ctx_id;
+    int *cpuids = NULL;
+    int ncpuids = 0;
+    int workerid = starpu_worker_get_id();
+
+    if (starpu_worker_get_type(workerid) == STARPU_CPU_WORKER)
+    {
+        starpu_sched_ctx_get_available_cpuids(sched_ctx, &cpuids, &ncpuids);
+        omp_set_num_threads(ncpuids);
+        mkl_set_num_threads(ncpuids);
+        mkl_set_dynamic(0);
+#pragma omp parallel
+        {
+            starpu_sched_ctx_bind_current_thread_to_cpuid(cpuids[omp_get_thread_num()]);
+        }
+        free(cpuids);
+    }
+    return;
+}
+#endif
+
+/* Main interface function to create a cluster view of the machine.
+ * Its job is to capture what the user wants and store it in a standard view. */
+struct starpu_cluster_machine* starpu_cluster_machine(hwloc_obj_type_t cluster_level, ...)
+{
+    va_list varg_list;
+    int arg_type;
+    struct _starpu_cluster_parameters *params;
+    struct starpu_cluster_machine* machine = malloc(sizeof(struct starpu_cluster_machine));
+    machine->params = malloc(sizeof(struct _starpu_cluster_parameters));
+    machine->groups = _starpu_cluster_group_list_new();
+    /* These counters are incremented during grouping, so start them at zero. */
+    machine->nclusters = 0;
+    machine->ngroups = 0;
+
+    _starpu_cluster_init_parameters(machine->params);
+    params = machine->params;
+
+    va_start(varg_list, cluster_level);
+    while ((arg_type = va_arg(varg_list, int)) != 0)
+    {
+        if (arg_type == STARPU_CLUSTER_MIN_NB)
+        {
+            params->min_nb = va_arg(varg_list, int);
+            if (params->min_nb <= 0)
+                fprintf(stderr, "Warning: the minimum number of contexts "
+                        "must be strictly positive\n");
+        }
+        else if (arg_type == STARPU_CLUSTER_MAX_NB)
+        {
+            params->max_nb = va_arg(varg_list, int);
+            if (params->max_nb <= 0)
+                fprintf(stderr, "Warning: the maximum number of contexts "
+                        "must be strictly positive\n");
+        }
+        else if (arg_type == STARPU_CLUSTER_NB)
+        {
+            params->nb = va_arg(varg_list, int);
+            if (params->nb <= 0)
+                fprintf(stderr, "Warning: the number of contexts "
+                        "must be strictly positive\n");
+        }
+        else if (arg_type == STARPU_CLUSTER_POLICY_NAME)
+        {
+            params->sched_policy_name = va_arg(varg_list, char*);
+        }
+        else if (arg_type == STARPU_CLUSTER_POLICY_STRUCT)
+        {
+            params->sched_policy_struct = va_arg(varg_list,
+                                                 struct starpu_sched_policy*);
+        }
+        else if (arg_type == STARPU_CLUSTER_KEEP_HOMOGENEOUS)
+        {
+            params->keep_homogeneous = va_arg(varg_list, int); /* 0=off, other=on */
+        }
+        else if (arg_type == STARPU_CLUSTER_PREFERE_MIN)
+        {
+            params->prefere_min = va_arg(varg_list, int); /* 0=off, other=on */
+        }
+        else if (arg_type == STARPU_CLUSTER_CREATE_FUNC)
+        {
+            params->create_func = va_arg(varg_list, void (*)(void*));
+        }
+        else if (arg_type == STARPU_CLUSTER_CREATE_FUNC_ARG)
+        {
+            params->create_func_arg = va_arg(varg_list, void*);
+        }
+        else if (arg_type == STARPU_CLUSTER_TYPE)
+        {
+            params->type = va_arg(varg_list, starpu_cluster_types);
+        }
+        else if (arg_type == STARPU_CLUSTER_AWAKE_WORKERS)
+        {
+            params->awake_workers = va_arg(varg_list, unsigned);
+        }
+        else if (arg_type == STARPU_CLUSTER_PARTITION_ONE)
+        {
+            struct _starpu_cluster_group* group = _starpu_cluster_group_new();
+            _starpu_cluster_group_init(group, machine);
+            _starpu_cluster_group_list_push_back(machine->groups, group);
+            params = group->params;
+        }
+        else if (arg_type == STARPU_CLUSTER_NEW)
+        {
+            struct _starpu_cluster* cluster = _starpu_cluster_new();
+            struct _starpu_cluster_group* group = _starpu_cluster_group_list_back(machine->groups);
+            if (group == NULL)
+            {
+                group = _starpu_cluster_group_new();
+                _starpu_cluster_group_init(group, machine);
+                _starpu_cluster_group_list_push_back(machine->groups, group);
+            }
+            _starpu_cluster_init(cluster, group);
+            _starpu_cluster_list_push_back(group->clusters, cluster);
+            params = cluster->params;
+        }
+        else if (arg_type == STARPU_CLUSTER_NCORES)
+        {
+            struct _starpu_cluster_group* group =
+                _starpu_cluster_group_list_back(machine->groups);
+            struct _starpu_cluster* cluster =
+                _starpu_cluster_list_back(group->clusters);
+            cluster->ncores = va_arg(varg_list, unsigned);
+        }
+        else
+        {
+            STARPU_ABORT_MSG("Unrecognized argument %d\n", arg_type);
+        }
+    }
+    va_end(varg_list);
+
+    switch (cluster_level)
+    {
+    case HWLOC_OBJ_MISC:
+    case HWLOC_OBJ_BRIDGE:
+    case HWLOC_OBJ_PCI_DEVICE:
+    case HWLOC_OBJ_OS_DEVICE:
+        STARPU_ABORT_MSG("Cluster aggregation isn't supported for level %s\n",
+                         hwloc_obj_type_string(cluster_level));
+        break;
+    default: /* other levels can pass */
+        break;
+    }
+
+    _starpu_cluster_machine(cluster_level, machine);
+    return machine;
+}
+
+int starpu_uncluster_machine(struct starpu_cluster_machine* machine)
+{
+    struct _starpu_cluster_group *g, *tmp;
+    struct _starpu_cluster_group_list* group_list = machine->groups;
+    starpu_sched_ctx_delete(machine->id);
+    g = _starpu_cluster_group_list_begin(group_list);
+    while (g != _starpu_cluster_group_list_end(group_list))
+    {
+        tmp = g;
+        g = _starpu_cluster_group_list_next(g);
+        _starpu_cluster_group_remove(group_list, tmp);
+    }
+    hwloc_topology_destroy(machine->topology);
+    free(machine->params);
+    free(machine);
+    /* Return to the default scheduling context. */
+    starpu_sched_ctx_set_context(0);
+
+    return 0;
+}
+
+void starpu_cluster_print(struct starpu_cluster_machine* clusters)
+{
+    int cnt, w;
+    struct _starpu_cluster_group *group;
+    struct _starpu_cluster *cluster;
+
+    printf("Number of clusters created: %d\n", clusters->nclusters);
+    cnt = 0;
+    for (group = _starpu_cluster_group_list_begin(clusters->groups);
+         group != _starpu_cluster_group_list_end(clusters->groups);
+         group = _starpu_cluster_group_list_next(group))
+    {
+        for (cluster = _starpu_cluster_list_begin(group->clusters);
+             cluster != _starpu_cluster_list_end(group->clusters);
+             cluster = _starpu_cluster_list_next(cluster))
+        {
+            printf("Cluster %d contains the following logical indexes:\n\t", cnt);
+            for (w = 0; w < cluster->ncores; w++)
+                printf("%d ", cluster->cores[w]);
+            printf("\n");
+            cnt++;
+        }
+    }
+
+    return;
+}
+
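+/* Example output on a hypothetical machine clustered into two groups of
+ * four cores (actual values depend on the topology):
+ *
+ *   Number of clusters created: 2
+ *   Cluster 0 contains the following logical indexes:
+ *           0 1 2 3
+ *   Cluster 1 contains the following logical indexes:
+ *           4 5 6 7
+ */
+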
+void _starpu_cluster_create(struct _starpu_cluster* cluster)
+{
+    unsigned main_ctx_id = cluster->father->father->id;
+    if (cluster->params->awake_workers)
+        cluster->id = starpu_sched_ctx_create(cluster->workerids, cluster->ncores,
+                                              "clusters", STARPU_SCHED_CTX_NESTED,
+                                              main_ctx_id,
+                                              STARPU_SCHED_CTX_AWAKE_WORKERS,
+                                              0);
+    else
+        cluster->id = starpu_sched_ctx_create(cluster->workerids, cluster->ncores,
+                                              "clusters", STARPU_SCHED_CTX_NESTED,
+                                              main_ctx_id,
+                                              0);
+
+    /* The cluster priority can be the lowest, so let's enforce it. */
+    starpu_sched_ctx_set_priority(cluster->workerids, cluster->ncores, cluster->id, 0);
+    return;
+}
+
+void _starpu_cluster_group_create(struct _starpu_cluster_group *group)
+{
+    struct _starpu_cluster* c;
+    for (c = _starpu_cluster_list_begin(group->clusters);
+         c != _starpu_cluster_list_end(group->clusters);
+         c = _starpu_cluster_list_next(c))
+    {
+        _starpu_cluster_create(c);
+        if (!c->params->awake_workers)
+            _starpu_cluster_bind(c);
+    }
+
+    return;
+}
+
+/* Submit the binding task to the cluster's context: its prologue (either a
+ * predefined one or the user-provided create_func) is run by the worker that
+ * pops it, which is where the thread team is created and pinned. */
+int _starpu_cluster_bind(struct _starpu_cluster* cluster)
+{
+    starpu_binding_function func;
+    void* func_arg;
+    if (cluster->params->create_func)
+    {
+        func = cluster->params->create_func;
+        func_arg = (void*) cluster->params->create_func_arg;
+    }
+    else
+    {
+        func = _starpu_cluster_type_get_func(cluster->params->type);
+        func_arg = (void*) &cluster->id;
+    }
+
+    return starpu_task_insert(&_starpu_cluster_bind_cl,
+                              STARPU_SCHED_CTX, cluster->id,
+                              STARPU_POSSIBLY_PARALLEL, 1,
+                              STARPU_PROLOGUE_CALLBACK_POP, func,
+                              STARPU_PROLOGUE_CALLBACK_POP_ARG, func_arg,
+                              0);
+}
+
+void _starpu_cluster_group_init(struct _starpu_cluster_group* group,
+                                struct starpu_cluster_machine* father)
+{
+    group->id = 0;
+    group->nclusters = 0;
+    group->clusters = _starpu_cluster_list_new();
+    group->father = father;
+    group->params = malloc(sizeof(struct _starpu_cluster_parameters));
+    _starpu_cluster_copy_parameters(group->params, father->params);
+
+    return;
+}
+
+void _starpu_cluster_init(struct _starpu_cluster* cluster,
+                          struct _starpu_cluster_group* father)
+{
+    cluster->id = 0;
+    cluster->cpuset = hwloc_bitmap_alloc();
+    cluster->ncores = 0;
+    cluster->cores = NULL;
+    cluster->workerids = NULL;
+    cluster->father = father;
+    cluster->params = malloc(sizeof(struct _starpu_cluster_parameters));
+    _starpu_cluster_copy_parameters(cluster->params, father->params);
+}
+
+int _starpu_cluster_remove(struct _starpu_cluster_list* cluster_list,
+                           struct _starpu_cluster* cluster)
+{
+    if (cluster && cluster->id != STARPU_NMAX_SCHED_CTXS)
+        starpu_sched_ctx_delete(cluster->id);
+    else
+        return -1;
+
+    if (cluster->cores != NULL)
+        free(cluster->cores);
+    if (cluster->workerids != NULL)
+        free(cluster->workerids);
+    hwloc_bitmap_free(cluster->cpuset);
+    free(cluster->params);
+    _starpu_cluster_list_erase(cluster_list, cluster);
+    _starpu_cluster_delete(cluster);
+
+    return 0;
+}
+
+int _starpu_cluster_group_remove(struct _starpu_cluster_group_list* group_list,
+                                 struct _starpu_cluster_group* group)
+{
+    struct _starpu_cluster* tmp;
+    struct _starpu_cluster_list* cluster_list = group->clusters;
+    struct _starpu_cluster* c = _starpu_cluster_list_begin(cluster_list);
+    while (c != _starpu_cluster_list_end(cluster_list))
+    {
+        tmp = c;
+        c = _starpu_cluster_list_next(c);
+        _starpu_cluster_remove(cluster_list, tmp);
+    }
+    free(group->params);
+    _starpu_cluster_group_list_erase(group_list, group);
+    _starpu_cluster_group_delete(group);
+
+    return 0;
+}
+
+void _starpu_cluster_init_parameters(struct _starpu_cluster_parameters* params)
+{
+    params->min_nb = 0;
+    params->max_nb = 0;
+    params->nb = 0;
+    params->sched_policy_name = NULL;
+    params->sched_policy_struct = NULL;
+    params->keep_homogeneous = 0;
+    params->prefere_min = 0;
+    params->create_func = NULL;
+    params->create_func_arg = NULL;
+    params->type = OPENMP;
+    params->awake_workers = 0;
+
+    return;
+}
+
+void _starpu_cluster_copy_parameters(struct _starpu_cluster_parameters* dst,
+                                     struct _starpu_cluster_parameters* src)
+{
+    dst->min_nb = src->min_nb;
+    dst->max_nb = src->max_nb;
+    dst->nb = src->nb;
+    dst->sched_policy_name = src->sched_policy_name;
+    dst->sched_policy_struct = src->sched_policy_struct;
+    dst->keep_homogeneous = src->keep_homogeneous;
+    dst->prefere_min = src->prefere_min;
+    dst->create_func = src->create_func;
+    dst->create_func_arg = src->create_func_arg;
+    dst->type = src->type;
+    dst->awake_workers = src->awake_workers;
+
+    return;
+}
+
+/* Given the available resources and the parameters, how many clusters should we create? */
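+/* Illustrative example: with npus = 24, keep_homogeneous and prefere_min set,
+ * min_nb = 3 and max_nb = 5, the loop below tries 3 first; 24 % 3 == 0, so 3
+ * clusters of 8 PUs are chosen. With npus = 26, no candidate in [3,5] divides
+ * evenly, so the candidate wasting the fewest PUs wins: second_best = 5,
+ * wasting a single PU. */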
+int _starpu_cluster_analyze_parameters(struct _starpu_cluster_parameters* params, int npus)
+{
+    int nb_clusters = 1, j;
+    if (params->nb)
+    {
+        nb_clusters = params->nb;
+    }
+    else if (params->min_nb && params->max_nb)
+    {
+        if (!params->keep_homogeneous)
+        {
+            if (params->prefere_min)
+                nb_clusters = params->min_nb;
+            else
+                nb_clusters = params->max_nb;
+        }
+        else
+        {
+            int begin = params->prefere_min ? params->min_nb : params->max_nb;
+            int end = params->prefere_min ? params->max_nb+1 : params->min_nb-1;
+            j = begin;
+            int best = 0, second_best = 0, cpu_loss = INT_MAX;
+            while (j != end)
+            {
+                if (npus % j == 0)
+                {
+                    best = j;
+                    break;
+                }
+                if (npus % j < cpu_loss)
+                {
+                    cpu_loss = npus % j;
+                    second_best = j;
+                }
+                /* Step towards end: upwards when prefere_min, downwards otherwise. */
+                j = params->prefere_min ? j+1 : j-1;
+            }
+
+            if (best)
+                nb_clusters = best;
+            else if (second_best)
+                nb_clusters = second_best;
+        }
+    }
+
+    return nb_clusters;
+}
+
+void _starpu_cluster_machine(hwloc_obj_type_t cluster_level,
+                             struct starpu_cluster_machine* machine)
+{
+    struct _starpu_cluster_group* g;
+
+    _starpu_cluster_topology(cluster_level, machine);
+
+    if (machine->params->sched_policy_struct != NULL)
+    {
+        machine->id = starpu_sched_ctx_create(NULL, -1, "main sched ctx",
+                                              STARPU_SCHED_CTX_POLICY_STRUCT,
+                                              machine->params->sched_policy_struct,
+                                              0);
+    }
+    else if (machine->params->sched_policy_name != NULL)
+    {
+        machine->id = starpu_sched_ctx_create(NULL, -1, "main sched ctx",
+                                              STARPU_SCHED_CTX_POLICY_NAME,
+                                              machine->params->sched_policy_name,
+                                              0);
+    }
+    else
+    {
+        /* Fall back to the policy of the global scheduling context. */
+        struct starpu_sched_policy* sched_policy;
+        struct _starpu_sched_ctx* global_ctx =
+            _starpu_get_sched_ctx_struct(STARPU_GLOBAL_SCHED_CTX);
+        sched_policy = _starpu_get_sched_policy(global_ctx);
+        machine->id = starpu_sched_ctx_create(NULL, -1, "main sched ctx",
+                                              STARPU_SCHED_CTX_POLICY_STRUCT,
+                                              sched_policy, 0);
+    }
+
+    for (g = _starpu_cluster_group_list_begin(machine->groups);
+         g != _starpu_cluster_group_list_end(machine->groups);
+         g = _starpu_cluster_group_list_next(g))
+        _starpu_cluster_group_create(g);
+
+    starpu_task_wait_for_all();
+    starpu_sched_ctx_set_context(&machine->id);
+
+    return;
+}
+
+void _starpu_cluster_topology(hwloc_obj_type_t cluster_level,
+                              struct starpu_cluster_machine* machine)
+{
+    int w;
+    hwloc_topology_t topology;
+    hwloc_cpuset_t avail_cpus;
+
+    struct _starpu_machine_config* config = _starpu_get_machine_config();
+    STARPU_ASSERT_MSG(config->topology.hwtopology != NULL, "STARPU_CLUSTER: You "
+                      "need to call starpu_init() first and make sure hwloc is enabled.");
+    hwloc_topology_dup(&topology, config->topology.hwtopology);
+
+    avail_cpus = hwloc_bitmap_alloc();
+    hwloc_bitmap_zero(avail_cpus);
+
+    int nworkers = starpu_worker_get_count_by_type(STARPU_CPU_WORKER);
+    int *workers = (int*) malloc(sizeof(int) * nworkers);
+    starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, workers, nworkers);
+
+    /* Gather the cpusets of all CPU workers. */
+    for (w = 0; w < nworkers; w++)
+    {
+        struct _starpu_worker *worker_str = _starpu_get_worker_struct(workers[w]);
+        hwloc_bitmap_or(avail_cpus, avail_cpus, worker_str->hwloc_cpu_set);
+    }
+
+    hwloc_topology_restrict(topology, avail_cpus, 0);
+    free(workers);
+
+    /* Use the restricted topology to fill in the cluster list. */
+    hwloc_topology_dup(&machine->topology, topology);
+    _starpu_cluster_group(cluster_level, machine);
+
+    hwloc_bitmap_free(avail_cpus);
+    hwloc_topology_destroy(topology);
+
+    return;
+}
+
+void _starpu_cluster_group(hwloc_obj_type_t cluster_level,
+                           struct starpu_cluster_machine* machine)
+{
+    unsigned i, nb_objects;
+    struct _starpu_cluster_group* group = NULL;
+
+    if (machine->groups == NULL)
+        machine->groups = _starpu_cluster_group_list_new();
+
+    nb_objects = hwloc_get_nbobjs_by_type(machine->topology, cluster_level);
+    if (nb_objects == 0)
+        return;
+
+    /* Groups may already exist if STARPU_CLUSTER_PARTITION_ONE was used;
+     * reuse them in order, creating new ones as needed. */
+    group = _starpu_cluster_group_list_begin(machine->groups);
+    for (i = 0; i < nb_objects; i++)
+    {
+        hwloc_obj_t cluster_obj = hwloc_get_obj_by_type(machine->topology,
+                                                        cluster_level, i);
+
+        if (group == NULL)
+        {
+            group = _starpu_cluster_group_new();
+            _starpu_cluster_group_init(group, machine);
+            _starpu_cluster_group_list_push_back(machine->groups, group);
+        }
+
+        group->group_obj = cluster_obj;
+
+        _starpu_cluster(group);
+        machine->ngroups++;
+        machine->nclusters += group->nclusters;
+        group = _starpu_cluster_group_list_next(group);
+    }
+
+    return;
+}
+
+void _starpu_cluster(struct _starpu_cluster_group* group)
+{
+    int i, avail_pus, npus, npreset = 0;
+    struct _starpu_cluster* cluster;
+    npus = hwloc_get_nbobjs_inside_cpuset_by_type(group->father->topology,
+                                                  group->group_obj->cpuset,
+                                                  HWLOC_OBJ_PU);
+
+    /* Preset clusters (explicitly requested with STARPU_CLUSTER_NEW) */
+    avail_pus = npus;
+    for (cluster = _starpu_cluster_list_begin(group->clusters);
+         cluster != _starpu_cluster_list_end(group->clusters);
+         cluster = _starpu_cluster_list_next(cluster))
+    {
+        if (cluster->ncores > avail_pus)
+            cluster->ncores = avail_pus;
+        else if (avail_pus == 0)
+            cluster->ncores = 0;
+
+        if (cluster->ncores > 0)
+        {
+            cluster->cores = malloc(sizeof(int)*cluster->ncores);
+            cluster->workerids = malloc(sizeof(int)*cluster->ncores);
+            avail_pus -= cluster->ncores;
+            npreset++;
+        }
+    }
+
+    /* Automatic clusters: cluster now points past the presets (NULL at list end) */
+    group->nclusters = _starpu_cluster_analyze_parameters(group->params, avail_pus);
+    for (i = 0; i < group->nclusters && avail_pus > 0; i++)
+    {
+        if (cluster == NULL)
+        {
+            cluster = _starpu_cluster_new();
+            _starpu_cluster_init(cluster, group);
+            _starpu_cluster_list_push_back(group->clusters, cluster);
+        }
+
+        if (cluster->ncores != 0 && cluster->ncores > avail_pus)
+        {
+            cluster->ncores = avail_pus;
+        }
+        else
+        {
+            if (cluster->params->keep_homogeneous)
+                cluster->ncores = avail_pus / (group->nclusters - i);
+            else
+                cluster->ncores = i == group->nclusters-1 ?
+                    avail_pus :
+                    avail_pus / (group->nclusters - i);
+        }
+        avail_pus -= cluster->ncores;
+        cluster->cores = malloc(sizeof(int)*cluster->ncores);
+        cluster->workerids = malloc(sizeof(int)*cluster->ncores);
+
+        cluster = _starpu_cluster_list_next(cluster);
+    }
+    group->nclusters += npreset;
+
+    /* Distribute the PUs, and the workers bound to them, over the clusters. */
+    cluster = _starpu_cluster_list_begin(group->clusters);
+    int count = 0;
+    static int starpu_cluster_warned = 0;
+
+    for (i = 0; i < npus; i++)
+    {
+        hwloc_obj_t pu = hwloc_get_obj_inside_cpuset_by_type(group->father->topology,
+                                                             group->group_obj->cpuset,
+                                                             HWLOC_OBJ_PU, i);
+
+        /* If several workers are bound to this resource, add them all to the
+         * cluster, even though it is suboptimal (they will all be bound to
+         * the same PU). */
+        int size = 0, j;
+        struct _starpu_worker* worker_str = _starpu_worker_list_front(pu->userdata);
+        for (j = 0; j < _starpu_worker_list_size(pu->userdata); j++)
+        {
+            if (worker_str->arch == STARPU_CPU_WORKER)
+                size++;
+            worker_str = _starpu_worker_list_next(worker_str);
+        }
+
+        if (size > 1)
+        {
+            if (!starpu_cluster_warned)
+            {
+                fprintf(stderr, "STARPU CLUSTERS: Caution! It seems that you have"
+                        " multiple workers bound to the same PU. If you have"
+                        " multithreading on your cores it is strongly advised"
+                        " to export STARPU_NTHREADS_PER_CORE=nb.\n");
+                starpu_cluster_warned = 1;
+            }
+            cluster->ncores += size-1;
+            cluster->cores = realloc(cluster->cores,
+                                     sizeof(int)*cluster->ncores);
+            cluster->workerids = realloc(cluster->workerids,
+                                         sizeof(int)*cluster->ncores);
+        }
+
+        /* Grab the ids of this PU's CPU workers and extend the cluster's
+         * cpuset accordingly. */
+        worker_str = _starpu_worker_list_front(pu->userdata);
+        if (worker_str)
+            hwloc_bitmap_or(cluster->cpuset, cluster->cpuset,
+                            worker_str->hwloc_cpu_set);
+        j = 0;
+        while (worker_str)
+        {
+            if (worker_str->arch == STARPU_CPU_WORKER)
+            {
+                cluster->cores[count+j] = worker_str->bindid;
+                cluster->workerids[count+j] = worker_str->workerid;
+                j++;
+            }
+            worker_str = _starpu_worker_list_next(worker_str);
+        }
+
+        count += size;
+        if (cluster->ncores == count)
+        {
+            count = 0;
+            cluster = _starpu_cluster_list_next(cluster);
+        }
+    }
+
+    return;
+}