8 years ago · ea45438a79
--- a/configure.ac
+++ b/configure.ac
@@ -1,7 +1,7 @@
 
																 # StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																 #
															
 
																-# Copyright (C) 2009-2016  Université de Bordeaux
															
 
																-# Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
															
 
																+# Copyright (C) 2009-2017  Université de Bordeaux
															
 
																+# Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
															
 
																 # Copyright (C) 2011  Télécom-SudParis
															
 
																 # Copyright (C) 2011, 2012, 2014-2016  INRIA
															
 
																 #
															
@@ -170,6 +170,7 @@ if test x$enable_simgrid = xyes ; then
 
																 	)
															
 
																 	AC_CHECK_HEADERS([simgrid/msg.h], [AC_DEFINE([STARPU_HAVE_SIMGRID_MSG_H], [1], [Define to 1 if you have msg.h in simgrid/.])])
															
 
																 	AC_CHECK_HEADERS([xbt/synchro.h], [AC_DEFINE([STARPU_HAVE_XBT_SYNCHRO_H], [1], [Define to 1 if you have synchro.h in xbt/.])])
															
 
																+	AC_CHECK_TYPES([smx_actor_t], [AC_DEFINE([STARPU_HAVE_SMX_ACTOR_T], [1], [Define to 1 if you have the smx_actor_t type.])], [], [[#include <simgrid/simix.h>]])
															
 
																    	AC_CHECK_FUNCS([MSG_process_join MSG_process_attach MSG_get_as_by_name MSG_environment_get_routing_root MSG_host_get_speed xbt_mutex_try_acquire smpi_process_set_user_data sg_link_name])
															
 
																 	AC_CHECK_FUNCS([xbt_barrier_init], [AC_DEFINE([STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT], [1], [Define to 1 if you have the `xbt_barrier_init' function.])])
															
 
																 	AC_CHECK_DECLS([smpi_process_set_user_data], [], [], [[#include <smpi/smpi.h>]])
															
@@ -421,6 +422,8 @@ if test "$svncommand" != "" ; then
 
																       svndir=1
															
 
																    fi
															
 
																 fi
															
 
																+AC_MSG_CHECKING(if $srcdir is a subversion directory)
															
 
																+AC_MSG_RESULT($svndir)
															
 
																 # use svnversion to record the current repository revision only if
															
 
																 # subversion is installed and we are in a working copy
															
--- a/examples/sched_ctx/parallel_tasks_with_cluster_api.c
+++ b/examples/sched_ctx/parallel_tasks_with_cluster_api.c
@@ -101,7 +101,14 @@ int main(int argc, char **argv)
 
																 				    STARPU_VALUE,&size,sizeof(int),
															
 
																 				    0);
															
 
																 		t->destroy = 1;
															
 
																-		t->possibly_parallel = 1;
															
 
																+		/* For two tasks, try out the case when the task isn't parallel and expect
															
 
																+			 the configuration to be sequential due to this, then automatically changed
															
 
																+			 back to the parallel one */
															
 
																+		if (i<=4 || i > 6)
															
 
																+			t->possibly_parallel = 1;
															
 
																+		/* Note that this mode requires that you put a prologue callback managing
															
 
																+			 this on all tasks to be taken into account. */
															
 
																+		t->prologue_callback_pop_func = &starpu_openmp_prologue;
															
 
																 		ret=starpu_task_submit(t);
															
 
																 		if (ret == -ENODEV)
															
--- a/include/starpu_clusters_util.h
+++ b/include/starpu_clusters_util.h
@@ -72,10 +72,10 @@ int starpu_uncluster_machine(struct starpu_cluster_machine* clusters);
 
																 int starpu_cluster_print(struct starpu_cluster_machine* clusters);
															
 
																 /* Prologue functions */
															
 
																-void starpu_openmp_prologue(void * sched_ctx_id);
															
 
																+void starpu_openmp_prologue(void*);
															
 
																 #define starpu_intel_openmp_mkl_prologue starpu_openmp_prologue
															
 
																 #ifdef STARPU_MKL
															
 
																-void starpu_gnu_openmp_mkl_prologue(void * sched_ctx_id);
															
 
																+void starpu_gnu_openmp_mkl_prologue(void*);
															
 
																 #endif /* STARPU_MKL */
															
 
																 #ifdef __cplusplus
															
--- a/include/starpu_sched_ctx.h
+++ b/include/starpu_sched_ctx.h
@@ -119,6 +119,8 @@ void starpu_sched_ctx_set_policy_data(unsigned sched_ctx_id, void *policy_data);
 
																 void *starpu_sched_ctx_get_policy_data(unsigned sched_ctx_id);
															
 
																+struct starpu_sched_policy *starpu_sched_ctx_get_sched_policy(unsigned sched_ctx_id);
															
 
																+
															
 
																 void *starpu_sched_ctx_exec_parallel_code(void* (*func)(void*), void *param, unsigned sched_ctx_id);
															
 
																 int starpu_sched_ctx_get_nready_tasks(unsigned sched_ctx_id);
															
--- a/mpi/Makefile.am
+++ b/mpi/Makefile.am
@@ -1,7 +1,7 @@
 
																 # StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																 #
															
 
																 # Copyright (C) 2009-2013, 2015  Université de Bordeaux
															
 
																-# Copyright (C) 2010, 2011, 2012, 2013  CNRS
															
 
																+# Copyright (C) 2010, 2011, 2012, 2013, 2017  CNRS
															
 
																 # Copyright (C) 2016  Inria
															
 
																 #
															
 
																 # StarPU is free software; you can redistribute it and/or modify
															
@@ -22,8 +22,9 @@ pkgconfig_DATA = libstarpumpi.pc starpumpi-1.0.pc starpumpi-1.1.pc starpumpi-1.2
 
																 versincludedir = $(includedir)/starpu/$(STARPU_EFFECTIVE_VERSION)
															
 
																 versinclude_HEADERS = 					\
															
 
																-	include/fstarpu_mpi_mod.f90			\
															
 
																-	include/starpu_mpi.h
															
 
																+	include/starpu_mpi.h				\
															
 
																+	include/starpu_mpi_lb.h				\
															
 
																+	include/fstarpu_mpi_mod.f90
															
 
																 showcheck:
															
 
																 	RET=0 ; \
															
--- a/mpi/examples/Makefile.am
+++ b/mpi/examples/Makefile.am
@@ -1,7 +1,7 @@
 
																 # StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																 #
															
 
																 # Copyright (C) 2009-2013, 2015-2016  Université de Bordeaux
															
 
																-# Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
															
 
																+# Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
															
 
																 # Copyright (C) 2016  Inria
															
 
																 #
															
 
																 # StarPU is free software; you can redistribute it and/or modify
															
@@ -108,13 +108,19 @@ AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS) $(ST
 
																 ###################
															
 
																 if BUILD_EXAMPLES
															
 
																 examplebin_PROGRAMS +=				\
															
 
																-	stencil/stencil5
															
 
																+	stencil/stencil5			\
															
 
																+	stencil/stencil5_lb
															
 
																 stencil_stencil5_LDADD =		\
															
 
																 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la -lm
															
 
																+stencil_stencil5_lb_LDADD =		\
															
 
																+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la -lm
															
 
																+
															
 
																 starpu_mpi_EXAMPLES	+=	\
															
 
																-	stencil/stencil5
															
 
																+	stencil/stencil5	\
															
 
																+	stencil/stencil5_lb
															
 
																+
															
 
																 endif
															
 
																 ##################
															
--- a/mpi/examples/stencil/stencil5_lb.c
+++ b/mpi/examples/stencil/stencil5_lb.c
@@ -0,0 +1,286 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2011, 2013, 2015-2016              Université Bordeaux
															
 
																+ * Copyright (C) 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include <starpu_mpi.h>
															
 
																+#include <starpu_mpi_lb.h>
															
 
																+#include <math.h>
															
 
																+
															
 
																+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
															
 
																+#define FPRINTF_MPI(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) { \
															
 
																+    						int _disp_rank; starpu_mpi_comm_rank(MPI_COMM_WORLD, &_disp_rank);       \
															
 
																+                                                fprintf(ofile, "[%d][starpu_mpi][%s] " fmt , _disp_rank, __starpu_func__ ,## __VA_ARGS__); \
															
 
																+                                                fflush(ofile); }} while(0);
															
 
																+
															
 
																+void stencil5_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
															
 
																+{
															
 
																+	float *xy = (float *)STARPU_VARIABLE_GET_PTR(descr[0]);
															
 
																+	float *xm1y = (float *)STARPU_VARIABLE_GET_PTR(descr[1]);
															
 
																+	float *xp1y = (float *)STARPU_VARIABLE_GET_PTR(descr[2]);
															
 
																+	float *xym1 = (float *)STARPU_VARIABLE_GET_PTR(descr[3]);
															
 
																+	float *xyp1 = (float *)STARPU_VARIABLE_GET_PTR(descr[4]);
															
 
																+
															
 
																+//	fprintf(stdout, "VALUES: %2.2f %2.2f %2.2f %2.2f %2.2f\n", *xy, *xm1y, *xp1y, *xym1, *xyp1);
															
 
																+	*xy = (*xy + *xm1y + *xp1y + *xym1 + *xyp1) / 5;
															
 
																+//	fprintf(stdout, "VALUES: %2.2f %2.2f %2.2f %2.2f %2.2f\n", *xy, *xm1y, *xp1y, *xym1, *xyp1);
															
 
																+}
															
 
																+
															
 
																+struct starpu_codelet stencil5_cl =
															
 
																+{
															
 
																+	.cpu_funcs = {stencil5_cpu},
															
 
																+	.nbuffers = 5,
															
 
																+	.modes = {STARPU_RW, STARPU_R, STARPU_R, STARPU_R, STARPU_R}
															
 
																+};
															
 
																+
															
 
																+#ifdef STARPU_QUICK_CHECK
															
 
																+#  define NITER_DEF	10
															
 
																+#  define X         	2
															
 
																+#  define Y         	2
															
 
																+#elif !defined(STARPU_LONG_CHECK)
															
 
																+#  define NITER_DEF	10
															
 
																+#  define X         	5
															
 
																+#  define Y         	5
															
 
																+#else
															
 
																+#  define NITER_DEF	100
															
 
																+#  define X         	20
															
 
																+#  define Y         	20
															
 
																+#endif
															
 
																+
															
 
																+int display = 0;
															
 
																+int niter = NITER_DEF;
															
 
																+
															
 
																+/* Returns the MPI node number where data indexes index is */
															
 
																+int my_distrib(int x, int y, int nb_nodes)
															
 
																+{
															
 
																+	/* Block distrib */
															
 
																+	return ((int)(x / sqrt(nb_nodes) + (y / sqrt(nb_nodes)) * sqrt(nb_nodes))) % nb_nodes;
															
 
																+}
															
 
																+
															
 
																+static void parse_args(int argc, char **argv)
															
 
																+{
															
 
																+	int i;
															
 
																+	for (i = 1; i < argc; i++)
															
 
																+	{
															
 
																+		if (strcmp(argv[i], "-iter") == 0)
															
 
																+		{
															
 
																+			char *argptr;
															
 
																+			niter = strtol(argv[++i], &argptr, 10);
															
 
																+		}
															
 
																+		if (strcmp(argv[i], "-display") == 0)
															
 
																+		{
															
 
																+			display = 1;
															
 
																+		}
															
 
																+	}
															
 
																+}
															
 
																+
															
 
																+void get_neighbors(int **neighbor_ids, int *nneighbors)
															
 
																+{
															
 
																+	int rank, size;
															
 
																+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
															
 
																+	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
															
 
																+
															
 
																+	if (size <= 2)
															
 
																+	{
															
 
																+		*nneighbors = 1;
															
 
																+		*neighbor_ids = malloc(sizeof(int));
															
 
																+		*neighbor_ids[0] = rank==size-1?0:rank+1;
															
 
																+		fprintf(stderr, "rank %d has neighbor %d\n", rank, *neighbor_ids[0]);
															
 
																+	}
															
 
																+	else
															
 
																+	{
															
 
																+		*nneighbors = 2;
															
 
																+		*neighbor_ids = malloc(2*sizeof(int));
															
 
																+		(*neighbor_ids)[0] = rank==size-1?0:rank+1;
															
 
																+		(*neighbor_ids)[1] = rank==0?size-1:rank-1;
															
 
																+		fprintf(stderr, "rank %d has neighbor %d and %d\n", rank, (*neighbor_ids)[0], (*neighbor_ids)[1]);
															
 
																+	}
															
 
																+}
															
 
																+
															
 
																+struct data_node
															
 
																+{
															
 
																+	starpu_data_handle_t data_handle;
															
 
																+	int node;
															
 
																+};
															
 
																+
															
 
																+struct data_node data_nodes[X][Y];
															
 
																+
															
 
																+void get_data_unit_to_migrate(starpu_data_handle_t **handle_unit, int *nhandles, int dst_node)
															
 
																+{
															
 
																+	int rank, x, y;
															
 
																+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
															
 
																+	fprintf(stderr, "Looking to move data from %d to %d\n", rank, dst_node);
															
 
																+	for(x = 0; x < X; x++)
															
 
																+	{
															
 
																+		for (y = 0; y < Y; y++)
															
 
																+		{
															
 
																+			if (data_nodes[x][y].node == rank)
															
 
																+			{
															
 
																+				*handle_unit = malloc(sizeof(starpu_data_handle_t));
															
 
																+				*handle_unit[0] = data_nodes[x][y].data_handle;
															
 
																+				*nhandles = 1;
															
 
																+				data_nodes[x][y].node = dst_node;
															
 
																+				return;
															
 
																+			}
															
 
																+		}
															
 
																+	}
															
 
																+	*nhandles = 0;
															
 
																+}
															
 
																+
															
 
																+int main(int argc, char **argv)
															
 
																+{
															
 
																+	int my_rank, size, x, y, loop;
															
 
																+	float mean=0;
															
 
																+	float matrix[X][Y];
															
 
																+	struct starpu_mpi_lb_conf itf;
															
 
																+
															
 
																+	itf.get_neighbors = get_neighbors;
															
 
																+	itf.get_data_unit_to_migrate = get_data_unit_to_migrate;
															
 
																+
															
 
																+	int ret = starpu_init(NULL);
															
 
																+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
															
 
																+	ret = starpu_mpi_init(&argc, &argv, 1);
															
 
																+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
															
 
																+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &my_rank);
															
 
																+	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
															
 
																+
															
 
																+	if (size > 2)
															
 
																+	{
															
 
																+		FPRINTF(stderr, "Only works with 2 nodes\n");
															
 
																+		starpu_mpi_shutdown();
															
 
																+		starpu_shutdown();
															
 
																+		return 77;
															
 
																+	}
															
 
																+	if (starpu_cpu_worker_get_count() == 0)
															
 
																+	{
															
 
																+		FPRINTF(stderr, "We need at least 1 CPU worker.\n");
															
 
																+		starpu_mpi_shutdown();
															
 
																+		starpu_shutdown();
															
 
																+		return 77;
															
 
																+	}
															
 
																+
															
 
																+	setenv("LB_HEAT_SLEEP_THRESHOLD", "5", 1);
															
 
																+	starpu_mpi_lb_init("heat", &itf);
															
 
																+
															
 
																+	parse_args(argc, argv);
															
 
																+
															
 
																+	/* Initial data values */
															
 
																+	starpu_srand48((long int)time(NULL));
															
 
																+	for(x = 0; x < X; x++)
															
 
																+	{
															
 
																+		for (y = 0; y < Y; y++)
															
 
																+		{
															
 
																+			matrix[x][y] = (float)starpu_drand48();
															
 
																+			mean += matrix[x][y];
															
 
																+		}
															
 
																+	}
															
 
																+	mean /= (X*Y);
															
 
																+
															
 
																+	if (display)
															
 
																+	{
															
 
																+		FPRINTF_MPI(stdout, "mean=%2.2f\n", mean);
															
 
																+		for(x = 0; x < X; x++)
															
 
																+		{
															
 
																+			fprintf(stdout, "[%d] ", my_rank);
															
 
																+			for (y = 0; y < Y; y++)
															
 
																+			{
															
 
																+				fprintf(stdout, "%2.2f ", matrix[x][y]);
															
 
																+			}
															
 
																+			fprintf(stdout, "\n");
															
 
																+		}
															
 
																+	}
															
 
																+
															
 
																+	/* Initial distribution */
															
 
																+	for(x = 0; x < X; x++)
															
 
																+	{
															
 
																+		for (y = 0; y < Y; y++)
															
 
																+		{
															
 
																+			data_nodes[x][y].node = my_distrib(x, y, size);
															
 
																+			if (data_nodes[x][y].node == my_rank)
															
 
																+			{
															
 
																+				//FPRINTF(stderr, "[%d] Owning data[%d][%d]\n", my_rank, x, y);
															
 
																+				starpu_variable_data_register(&data_nodes[x][y].data_handle, 0, (uintptr_t)&(matrix[x][y]), sizeof(float));
															
 
																+			}
															
 
																+			else if (my_rank == my_distrib(x+1, y, size) || my_rank == my_distrib(x-1, y, size)
															
 
																+				 || my_rank == my_distrib(x, y+1, size) || my_rank == my_distrib(x, y-1, size))
															
 
																+			{
															
 
																+				/* I don't own that index, but will need it for my computations */
															
 
																+				//FPRINTF(stderr, "[%d] Neighbour of data[%d][%d]\n", my_rank, x, y);
															
 
																+				starpu_variable_data_register(&data_nodes[x][y].data_handle, -1, (uintptr_t)NULL, sizeof(float));
															
 
																+			}
															
 
																+			else
															
 
																+			{
															
 
																+				/* I know it's useless to allocate anything for this */
															
 
																+				data_nodes[x][y].data_handle = NULL;
															
 
																+			}
															
 
																+			if (data_nodes[x][y].data_handle)
															
 
																+			{
															
 
																+				starpu_mpi_data_register(data_nodes[x][y].data_handle, (y*X)+x, data_nodes[x][y].node);
															
 
																+			}
															
 
																+		}
															
 
																+	}
															
 
																+
															
 
																+	/* First computation with initial distribution */
															
 
																+	for(loop=0 ; loop<niter; loop++)
															
 
																+	{
															
 
																+		for (x = 1; x < X-1; x++)
															
 
																+		{
															
 
																+			for (y = 1; y < Y-1; y++)
															
 
																+			{
															
 
																+				starpu_mpi_task_insert(MPI_COMM_WORLD, &stencil5_cl, STARPU_RW, data_nodes[x][y].data_handle,
															
 
																+						       STARPU_R, data_nodes[x-1][y].data_handle, STARPU_R, data_nodes[x+1][y].data_handle,
															
 
																+						       STARPU_R, data_nodes[x][y-1].data_handle, STARPU_R, data_nodes[x][y+1].data_handle,
															
 
																+						       STARPU_TAG_ONLY, ((starpu_tag_t)X)*x + y,
															
 
																+						       0);
															
 
																+			}
															
 
																+		}
															
 
																+	}
															
 
																+	FPRINTF(stderr, "Waiting ...\n");
															
 
																+	starpu_task_wait_for_all();
															
 
																+
															
 
																+	// The load balancer needs to be shutdown before unregistering data as it needs access to them
															
 
																+	starpu_mpi_lb_shutdown();
															
 
																+
															
 
																+	/* Unregister data */
															
 
																+	for(x = 0; x < X; x++)
															
 
																+	{
															
 
																+		for (y = 0; y < Y; y++)
															
 
																+		{
															
 
																+			if (data_nodes[x][y].data_handle)
															
 
																+			{
															
 
																+				starpu_data_unregister(data_nodes[x][y].data_handle);
															
 
																+			}
															
 
																+		}
															
 
																+	}
															
 
																+
															
 
																+	starpu_mpi_shutdown();
															
 
																+	starpu_shutdown();
															
 
																+
															
 
																+	if (display)
															
 
																+	{
															
 
																+		FPRINTF(stdout, "[%d] mean=%2.2f\n", my_rank, mean);
															
 
																+		for(x = 0; x < X; x++)
															
 
																+		{
															
 
																+			FPRINTF(stdout, "[%d] ", my_rank);
															
 
																+			for (y = 0; y < Y; y++)
															
 
																+			{
															
 
																+				FPRINTF(stdout, "%2.2f ", matrix[x][y]);
															
 
																+			}
															
 
																+			FPRINTF(stdout, "\n");
															
 
																+		}
															
 
																+	}
															
 
																+
															
 
																+	return 0;
															
 
																+}
															
--- a/mpi/include/starpu_mpi.h
+++ b/mpi/include/starpu_mpi.h
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2009-2012, 2014-2016  Université de Bordeaux
															
 
																- * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
															
 
																+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
															
 
																  * Copyright (C) 2016  Inria
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
@@ -119,6 +119,9 @@ typedef void (*starpu_mpi_datatype_free_func_t)(MPI_Datatype *);
 
																 int starpu_mpi_datatype_register(starpu_data_handle_t handle, starpu_mpi_datatype_allocate_func_t allocate_datatype_func, starpu_mpi_datatype_free_func_t free_datatype_func);
															
 
																 int starpu_mpi_datatype_unregister(starpu_data_handle_t handle);
															
 
																+int starpu_mpi_pre_submit_hook_register(void (*f)(struct starpu_task *));
															
 
																+int starpu_mpi_pre_submit_hook_unregister();
															
 
																+
															
 
																 #ifdef __cplusplus
															
 
																 }
															
 
																 #endif
															
--- a/mpi/include/starpu_mpi_lb.h
+++ b/mpi/include/starpu_mpi_lb.h
@@ -0,0 +1,41 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2016  Inria
															
 
																+ * Copyright (C) 2017  CNRS
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#ifndef __STARPU_MPI_LOAD_BALANCER_H__
															
 
																+#define __STARPU_MPI_LOAD_BALANCER_H__
															
 
																+
															
 
																+#ifdef __cplusplus
															
 
																+extern "C" {
															
 
																+#endif
															
 
																+
															
 
																+struct starpu_mpi_lb_conf
															
 
																+{
															
 
																+	void (*get_neighbors)(int **neighbor_ids, int *nneighbors);
															
 
																+	void (*get_data_unit_to_migrate)(starpu_data_handle_t **handle_unit, int *nhandles, int dst_node);
															
 
																+};
															
 
																+
															
 
																+/* Inits the load balancer's environment with the load policy provided by the
															
 
																+ * user
															
 
																+ */
															
 
																+void starpu_mpi_lb_init(const char *lb_policy_name, struct starpu_mpi_lb_conf *);
															
 
																+void starpu_mpi_lb_shutdown();
															
 
																+
															
 
																+#ifdef __cplusplus
															
 
																+}
															
 
																+#endif
															
 
																+
															
 
																+#endif // __STARPU_MPI_LOAD_BALANCER_H__
															
--- a/mpi/src/Makefile.am
+++ b/mpi/src/Makefile.am
@@ -1,7 +1,7 @@
 
																 # StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																 #
															
 
																 # Copyright (C) 2009-2012  Université de Bordeaux
															
 
																-# Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
															
 
																+# Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
															
 
																 #
															
 
																 # StarPU is free software; you can redistribute it and/or modify
															
 
																 # it under the terms of the GNU Lesser General Public License as published by
															
@@ -69,7 +69,10 @@ noinst_HEADERS =					\
 
																 	starpu_mpi_sync_data.h				\
															
 
																 	starpu_mpi_comm.h				\
															
 
																 	starpu_mpi_tag.h				\
															
 
																-	starpu_mpi_task_insert.h
															
 
																+	starpu_mpi_task_insert.h			\
															
 
																+	load_balancer/policy/data_movements_interface.h	\
															
 
																+	load_balancer/policy/load_data_interface.h	\
															
 
																+	load_balancer/policy/load_balancer_policy.h
															
 
																 libstarpumpi_@STARPU_EFFECTIVE_VERSION@_la_SOURCES =	\
															
 
																 	starpu_mpi.c					\
															
@@ -88,7 +91,12 @@ libstarpumpi_@STARPU_EFFECTIVE_VERSION@_la_SOURCES =	\
 
																 	starpu_mpi_comm.c				\
															
 
																 	starpu_mpi_tag.c				\
															
 
																 	starpu_mpi_fortran.c				\
															
 
																-	starpu_mpi_task_insert_fortran.c
															
 
																+	starpu_mpi_task_insert_fortran.c		\
															
 
																+	load_balancer/policy/data_movements_interface.c	\
															
 
																+	load_balancer/policy/load_data_interface.c	\
															
 
																+	load_balancer/policy/load_heat_propagation.c	\
															
 
																+	load_balancer/load_balancer.c
															
 
																+
															
 
																 showcheck:
															
 
																 	-cat /dev/null
															
--- a/mpi/src/load_balancer/load_balancer.c
+++ b/mpi/src/load_balancer/load_balancer.c
@@ -0,0 +1,156 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2016  Inria
															
 
																+ * Copyright (C) 2017  CNRS
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include <stdio.h>
															
 
																+#include <stdlib.h>
															
 
																+#include <starpu.h>
															
 
																+#include <starpu_mpi.h>
															
 
																+#include <starpu_scheduler.h>
															
 
																+#include <common/utils.h>
															
 
																+
															
 
																+#include <starpu_mpi_lb.h>
															
 
																+#include "policy/load_balancer_policy.h"
															
 
																+
															
 
																+static struct load_balancer_policy *defined_policy = NULL;
															
 
																+typedef void (*_post_exec_hook_func_t)(struct starpu_task *task, unsigned sched_ctx_id);
															
 
																+static _post_exec_hook_func_t saved_post_exec_hook[STARPU_NMAX_SCHED_CTXS];
															
 
																+
															
 
																+static void post_exec_hook_wrapper(struct starpu_task *task, unsigned sched_ctx_id)
															
 
																+{
															
 
																+	//fprintf(stderr,"I am called ! \n");
															
 
																+	if (defined_policy && defined_policy->finished_task_entry_point)
															
 
																+		defined_policy->finished_task_entry_point();
															
 
																+	if (saved_post_exec_hook[sched_ctx_id])
															
 
																+		saved_post_exec_hook[sched_ctx_id](task, sched_ctx_id);
															
 
																+}
															
 
																+
															
 
																+static struct load_balancer_policy *predefined_policies[] =
															
 
																+{
															
 
																+	&load_heat_propagation_policy,
															
 
																+	NULL
															
 
																+};
															
 
																+
															
 
																+void starpu_mpi_lb_init(const char *lb_policy_name, struct starpu_mpi_lb_conf *itf)
															
 
																+{
															
 
																+	int ret;
															
 
																+
															
 
																+	const char *policy_name = starpu_getenv("STARPU_MPI_LB");
															
 
																+	if (!policy_name)
															
 
																+		policy_name = lb_policy_name;
															
 
																+
															
 
																+	if (!policy_name || (strcmp(policy_name, "help") == 0))
															
 
																+	{
															
 
																+		_STARPU_MSG("Warning : load balancing is disabled for this run.\n");
															
 
																+		_STARPU_MSG("Use the STARPU_MPI_LB = <name> environment variable to use a load balancer.\n");
															
 
																+		_STARPU_MSG("Available load balancers :\n");
															
 
																+		struct load_balancer_policy **policy;
															
 
																+		for(policy=predefined_policies ; *policy!=NULL ; policy++)
															
 
																+		{
															
 
																+			struct load_balancer_policy *p = *policy;
															
 
																+			fprintf(stderr," - %s\n", p->policy_name);
															
 
																+		}
															
 
																+		return;
															
 
																+	}
															
 
																+
															
 
																+	if (policy_name)
															
 
																+	{
															
 
																+		struct load_balancer_policy **policy;
															
 
																+		for(policy=predefined_policies ; *policy!=NULL ; policy++)
															
 
																+		{
															
 
																+			struct load_balancer_policy *p = *policy;
															
 
																+			if (p->policy_name)
															
 
																+			{
															
 
																+				if (strcmp(policy_name, p->policy_name) == 0)
															
 
																+				{
															
 
																+					/* we found a policy with the requested name */
															
 
																+					defined_policy = p;
															
 
																+					break;
															
 
																+				}
															
 
																+			}
															
 
																+		}
															
 
																+	}
															
 
																+
															
 
																+	if (!defined_policy)
															
 
																+	{
															
 
																+		_STARPU_MSG("Error : no load balancer with the name %s. Load balancing will be disabled for this run.\n", policy_name);
															
 
																+		return;
															
 
																+	}
															
 
																+
															
 
																+	ret = defined_policy->init(itf);
															
 
																+	if (ret != 0)
															
 
																+	{
															
 
																+		_STARPU_MSG("Error (%d) in %s->init: invalid starpu_mpi_lb_conf. Load balancing will be disabled for this run.\n", ret, defined_policy->policy_name);
															
 
																+		return;
															
 
																+	}
															
 
																+
															
 
																+	/* starpu_register_hook(submitted_task, defined_policy->submitted_task_entry_point); */
															
 
																+	if (defined_policy->submitted_task_entry_point)
															
 
																+		starpu_mpi_pre_submit_hook_register(defined_policy->submitted_task_entry_point);
															
 
																+
															
 
																+	/* starpu_register_hook(finished_task, defined_policy->finished_task_entry_point); */
															
 
																+	if (defined_policy->finished_task_entry_point)
															
 
																+	{
															
 
																+		int i;
															
 
																+		for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
															
 
																+		{
															
 
																+			struct starpu_sched_policy *sched_policy = starpu_sched_ctx_get_sched_policy(i);
															
 
																+			if (sched_policy)
															
 
																+			{
															
 
																+				_STARPU_DEBUG("Setting post_exec_hook for scheduling context %d %s (%d)\n", i, sched_policy->policy_name, STARPU_NMAX_SCHED_CTXS);
															
 
																+				saved_post_exec_hook[i] = sched_policy->post_exec_hook;
															
 
																+				sched_policy->post_exec_hook = post_exec_hook_wrapper;
															
 
																+			}
															
 
																+			else
															
 
																+				saved_post_exec_hook[i] = NULL;
															
 
																+		}
															
 
																+	}
															
 
																+
															
 
																+	return;
															
 
																+}
															
 
																+
															
 
																+void starpu_mpi_lb_shutdown()
															
 
																+{
															
 
																+	if (!defined_policy)
															
 
																+		return;
															
 
																+
															
 
																+	int ret = defined_policy->deinit();
															
 
																+	if (ret != 0)
															
 
																+	{
															
 
																+		_STARPU_MSG("Error (%d) in %s->deinit\n", ret, defined_policy->policy_name);
															
 
																+		return;
															
 
																+	}
															
 
																+
															
 
																+	/* starpu_unregister_hook(submitted_task, defined_policy->submitted_task_entry_point); */
															
 
																+	if (defined_policy->submitted_task_entry_point)
															
 
																+		starpu_mpi_pre_submit_hook_unregister();
															
 
																+
															
 
																+	/* starpu_unregister_hook(finished_task, defined_policy->finished_task_entry_point); */
															
 
																+	if (defined_policy->finished_task_entry_point)
															
 
																+	{
															
 
																+		int i;
															
 
																+		for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
															
 
																+		{
															
 
																+			if (saved_post_exec_hook[i])
															
 
																+			{
															
 
																+				struct starpu_sched_policy *sched_policy = starpu_sched_ctx_get_sched_policy(i);
															
 
																+				sched_policy->post_exec_hook = saved_post_exec_hook[i];
															
 
																+				saved_post_exec_hook[i] = NULL;
															
 
																+			}
															
 
																+		}
															
 
																+	}
															
 
																+	defined_policy = NULL;
															
 
																+}
															
--- a/mpi/src/load_balancer/policy/data_movements_interface.c
+++ b/mpi/src/load_balancer/policy/data_movements_interface.c
@@ -0,0 +1,280 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2016  Inria
															
 
																+ * Copyright (C) 2017  CNRS
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include <starpu.h>
															
 
																+#include <stdlib.h>
															
 
																+
															
 
																+#include "data_movements_interface.h"
															
 
																+
															
 
																+int **data_movements_get_ref_tags_table(starpu_data_handle_t handle)
															
 
																+{
															
 
																+	struct data_movements_interface *dm_interface =
															
 
																+		(struct data_movements_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
															
 
																+
															
 
																+	if (dm_interface->tags)
															
 
																+		return &dm_interface->tags;
															
 
																+	else
															
 
																+		return NULL;
															
 
																+}
															
 
																+
															
 
																+int **data_movements_get_ref_ranks_table(starpu_data_handle_t handle)
															
 
																+{
															
 
																+	struct data_movements_interface *dm_interface =
															
 
																+		(struct data_movements_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
															
 
																+
															
 
																+	if (dm_interface->ranks)
															
 
																+		return &dm_interface->ranks;
															
 
																+	else
															
 
																+		return NULL;
															
 
																+}
															
 
																+
															
 
																+int *data_movements_get_tags_table(starpu_data_handle_t handle)
															
 
																+{
															
 
																+	struct data_movements_interface *dm_interface =
															
 
																+		(struct data_movements_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
															
 
																+
															
 
																+	return dm_interface->tags;
															
 
																+}
															
 
																+
															
 
																+int *data_movements_get_ranks_table(starpu_data_handle_t handle)
															
 
																+{
															
 
																+	struct data_movements_interface *dm_interface =
															
 
																+		(struct data_movements_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
															
 
																+
															
 
																+	return dm_interface->ranks;
															
 
																+}
															
 
																+
															
 
																+int data_movements_get_size_tables(starpu_data_handle_t handle)
															
 
																+{
															
 
																+	struct data_movements_interface *dm_interface =
															
 
																+		(struct data_movements_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
															
 
																+
															
 
																+	return dm_interface->size;
															
 
																+}
															
 
																+
															
 
																+int data_movements_reallocate_tables(starpu_data_handle_t handle, int size)
															
 
																+{
															
 
																+	struct data_movements_interface *dm_interface =
															
 
																+		(struct data_movements_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
															
 
																+
															
 
																+	if (dm_interface->size)
															
 
																+	{
															
 
																+		STARPU_ASSERT(dm_interface->tags);
															
 
																+		free(dm_interface->tags);
															
 
																+		dm_interface->tags = NULL;
															
 
																+
															
 
																+		STARPU_ASSERT(dm_interface->ranks);
															
 
																+		free(dm_interface->ranks);
															
 
																+		dm_interface->ranks = NULL;
															
 
																+	}
															
 
																+	else
															
 
																+	{
															
 
																+		STARPU_ASSERT(!dm_interface->tags);
															
 
																+		STARPU_ASSERT(!dm_interface->ranks);
															
 
																+	}
															
 
																+
															
 
																+	dm_interface->size = size;
															
 
																+
															
 
																+	if (dm_interface->size)
															
 
																+	{
															
 
																+		dm_interface->tags = malloc(size*sizeof(int));
															
 
																+		dm_interface->ranks = malloc(size*sizeof(int));
															
 
																+	}
															
 
																+
															
 
																+	return 0 ;
															
 
																+}
															
 
																+
															
 
																+static void data_movements_register_data_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface)
															
 
																+{
															
 
																+	struct data_movements_interface *dm_interface = (struct data_movements_interface *) data_interface;
															
 
																+
															
 
																+	unsigned node;
															
 
																+	for (node = 0; node < STARPU_MAXNODES; node++)
															
 
																+	{
															
 
																+		struct data_movements_interface *local_interface = (struct data_movements_interface *)
															
 
																+			starpu_data_get_interface_on_node(handle, node);
															
 
																+
															
 
																+		local_interface->size = dm_interface->size;
															
 
																+		if (node == home_node)
															
 
																+		{
															
 
																+			local_interface->tags = dm_interface->tags;
															
 
																+			local_interface->ranks = dm_interface->ranks;
															
 
																+		}
															
 
																+		else
															
 
																+		{
															
 
																+			local_interface->tags = NULL;
															
 
																+			local_interface->ranks = NULL;
															
 
																+		}
															
 
																+	}
															
 
																+}
															
 
																+
															
 
																+static starpu_ssize_t data_movements_allocate_data_on_node(void *data_interface, unsigned node)
															
 
																+{
															
 
																+	struct data_movements_interface *dm_interface = (struct data_movements_interface *) data_interface;
															
 
																+
															
 
																+	int *addr_tags = NULL;
															
 
																+	int *addr_ranks = NULL;
															
 
																+	starpu_ssize_t requested_memory = dm_interface->size * sizeof(int);
															
 
																+
															
 
																+	addr_tags = (int*) starpu_malloc_on_node(node, requested_memory);
															
 
																+	if (!addr_tags)
															
 
																+		goto fail_tags;
															
 
																+	addr_ranks = (int*) starpu_malloc_on_node(node, requested_memory);
															
 
																+	if (!addr_ranks)
															
 
																+		goto fail_ranks;
															
 
																+
															
 
																+	/* update the data properly in consequence */
															
 
																+	dm_interface->tags = addr_tags;
															
 
																+	dm_interface->ranks = addr_ranks;
															
 
																+
															
 
																+	return 2*requested_memory;
															
 
																+
															
 
																+fail_ranks:
															
 
																+	starpu_free_on_node(node, (uintptr_t) addr_tags, requested_memory);
															
 
																+fail_tags:
															
 
																+	return -ENOMEM;
															
 
																+}
															
 
																+
															
 
																+static void data_movements_free_data_on_node(void *data_interface, unsigned node)
															
 
																+{
															
 
																+	struct data_movements_interface *dm_interface = (struct data_movements_interface *) data_interface;
															
 
																+	starpu_ssize_t requested_memory = dm_interface->size * sizeof(int);
															
 
																+
															
 
																+	starpu_free_on_node(node, (uintptr_t) dm_interface->tags, requested_memory);
															
 
																+	starpu_free_on_node(node, (uintptr_t) dm_interface->ranks, requested_memory);
															
 
																+}
															
 
																+
															
 
																+static size_t data_movements_get_size(starpu_data_handle_t handle)
															
 
																+{
															
 
																+	size_t size;
															
 
																+	struct data_movements_interface *dm_interface = (struct data_movements_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
															
 
																+
															
 
																+	size = (dm_interface->size * 2 * sizeof(int)) + sizeof(int);
															
 
																+	return size;
															
 
																+}
															
 
																+
															
 
																+static uint32_t data_movements_footprint(starpu_data_handle_t handle)
															
 
																+{
															
 
																+	return starpu_hash_crc32c_be(data_movements_get_size(handle), 0);
															
 
																+}
															
 
																+
															
 
																+static int data_movements_pack_data(starpu_data_handle_t handle, unsigned node, void **ptr, starpu_ssize_t *count)
															
 
																+{
															
 
																+	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
															
 
																+
															
 
																+	struct data_movements_interface *dm_interface = (struct data_movements_interface *)
															
 
																+		starpu_data_get_interface_on_node(handle, node);
															
 
																+
															
 
																+	*count = data_movements_get_size(handle);
															
 
																+	if (ptr != NULL)
															
 
																+	{
															
 
																+		char *data;
															
 
																+		starpu_malloc_flags((void**) &data, *count, 0);
															
 
																+		assert(data);
															
 
																+		*ptr = data;
															
 
																+		memcpy(data, &dm_interface->size, sizeof(int));
															
 
																+		if (dm_interface->size)
															
 
																+		{
															
 
																+			memcpy(data+sizeof(int), dm_interface->tags, (dm_interface->size*sizeof(int)));
															
 
																+			memcpy(data+sizeof(int)+(dm_interface->size*sizeof(int)), dm_interface->ranks, dm_interface->size*sizeof(int));
															
 
																+		}
															
 
																+	}
															
 
																+
															
 
																+	return 0;
															
 
																+}
															
 
																+
															
 
																+static int data_movements_unpack_data(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count)
															
 
																+{
															
 
																+	char *data = ptr;
															
 
																+	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
															
 
																+
															
 
																+	struct data_movements_interface *dm_interface = (struct data_movements_interface *)
															
 
																+		starpu_data_get_interface_on_node(handle, node);
															
 
																+
															
 
																+	int size = 0;
															
 
																+	memcpy(&size, data, sizeof(int));
															
 
																+	STARPU_ASSERT(count == (2 * size * sizeof(int)) + sizeof(int));
															
 
																+
															
 
																+	data_movements_reallocate_tables(handle, size);
															
 
																+
															
 
																+	if (dm_interface->size)
															
 
																+	{
															
 
																+		memcpy(dm_interface->tags, data+sizeof(int), dm_interface->size*sizeof(int));
															
 
																+		memcpy(dm_interface->ranks, data+sizeof(int)+(dm_interface->size*sizeof(int)), dm_interface->size*sizeof(int));
															
 
																+	}
															
 
																+
															
 
																+    return 0;
															
 
																+}
															
 
																+
															
 
																+static int copy_any_to_any(void *src_interface, unsigned src_node,
															
 
																+			   void *dst_interface, unsigned dst_node,
															
 
																+			   void *async_data)
															
 
																+{
															
 
																+	struct data_movements_interface *src_data_movements = src_interface;
															
 
																+	struct data_movements_interface *dst_data_movements = dst_interface;
															
 
																+	int ret = 0;
															
 
																+
															
 
																+	if (starpu_interface_copy((uintptr_t) src_data_movements->tags, 0, src_node,
															
 
																+				    (uintptr_t) dst_data_movements->tags, 0, dst_node,
															
 
																+				     src_data_movements->size*sizeof(int),
															
 
																+				     async_data))
															
 
																+		ret = -EAGAIN;
															
 
																+	if (starpu_interface_copy((uintptr_t) src_data_movements->ranks, 0, src_node,
															
 
																+				    (uintptr_t) dst_data_movements->ranks, 0, dst_node,
															
 
																+				     src_data_movements->size*sizeof(int),
															
 
																+				     async_data))
															
 
																+		ret = -EAGAIN;
															
 
																+	return ret;
															
 
																+}
															
 
																+
															
 
																+static const struct starpu_data_copy_methods data_movements_copy_methods =
															
 
																+{
															
 
																+	.any_to_any = copy_any_to_any
															
 
																+};
															
 
																+
															
 
																+static struct starpu_data_interface_ops interface_data_movements_ops =
															
 
																+{
															
 
																+	.register_data_handle = data_movements_register_data_handle,
															
 
																+	.allocate_data_on_node = data_movements_allocate_data_on_node,
															
 
																+	.free_data_on_node = data_movements_free_data_on_node,
															
 
																+	.copy_methods = &data_movements_copy_methods,
															
 
																+	.get_size = data_movements_get_size,
															
 
																+	.footprint = data_movements_footprint,
															
 
																+	.interfaceid = STARPU_UNKNOWN_INTERFACE_ID,
															
 
																+	.interface_size = sizeof(struct data_movements_interface),
															
 
																+	.handle_to_pointer = NULL,
															
 
																+	.pack_data = data_movements_pack_data,
															
 
																+	.unpack_data = data_movements_unpack_data,
															
 
																+	.describe = NULL
															
 
																+};
															
 
																+
															
 
																+void data_movements_data_register(starpu_data_handle_t *handleptr, unsigned home_node, int *tags, int *ranks, int size)
															
 
																+{
															
 
																+	struct data_movements_interface data_movements =
															
 
																+	{
															
 
																+		.tags = tags,
															
 
																+		.ranks = ranks,
															
 
																+		.size = size
															
 
																+	};
															
 
																+
															
 
																+	if (interface_data_movements_ops.interfaceid == STARPU_UNKNOWN_INTERFACE_ID)
															
 
																+	{
															
 
																+		interface_data_movements_ops.interfaceid = starpu_data_interface_get_next_id();
															
 
																+	}
															
 
																+
															
 
																+	starpu_data_register(handleptr, home_node, &data_movements, &interface_data_movements_ops);
															
 
																+}
															
--- a/mpi/src/load_balancer/policy/data_movements_interface.h
+++ b/mpi/src/load_balancer/policy/data_movements_interface.h
@@ -0,0 +1,48 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2016  Inria
															
 
																+ * Copyright (C) 2017  CNRS
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include <starpu.h>
															
 
																+
															
 
																+#ifndef __DATA_MOVEMENTS_INTERFACE_H
															
 
																+#define __DATA_MOVEMENTS_INTERFACE_H
															
 
																+
															
 
																+/* interface for data_movements */
															
 
																+struct data_movements_interface
															
 
																+{
															
 
																+	/* Data tags table */
															
 
																+	int *tags;
															
 
																+	/* Ranks table (where to move the corresponding data) */
															
 
																+	int *ranks;
															
 
																+	/* Size of the tables */
															
 
																+	int size;
															
 
																+};
															
 
																+
															
 
																+void data_movements_data_register(starpu_data_handle_t *handle, unsigned home_node, int *ranks, int *tags, int size);
															
 
																+
															
 
																+int **data_movements_get_ref_tags_table(starpu_data_handle_t handle);
															
 
																+int **data_movements_get_ref_ranks_table(starpu_data_handle_t handle);
															
 
																+int data_movements_reallocate_tables(starpu_data_handle_t handle, int size);
															
 
																+
															
 
																+int *data_movements_get_tags_table(starpu_data_handle_t handle);
															
 
																+int *data_movements_get_ranks_table(starpu_data_handle_t handle);
															
 
																+int data_movements_get_size_tables(starpu_data_handle_t handle);
															
 
																+
															
 
																+#define DATA_MOVEMENTS_GET_SIZE_TABLES(interface)	(((struct data_movements_interface *)(interface))->size)
															
 
																+#define DATA_MOVEMENTS_GET_TAGS_TABLE(interface)	(((struct data_movements_interface *)(interface))->tags)
															
 
																+#define DATA_MOVEMENTS_GET_RANKS_TABLE(interface)	(((struct data_movements_interface *)(interface))->ranks)
															
 
																+
															
 
																+#endif /* __DATA_MOVEMENTS_INTERFACE_H */
															
--- a/mpi/src/load_balancer/policy/load_balancer_policy.h
+++ b/mpi/src/load_balancer/policy/load_balancer_policy.h
@@ -0,0 +1,52 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2016  Inria
															
 
																+ * Copyright (C) 2017  CNRS
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#ifndef __LOAD_BALANCER_POLICY_H__
															
 
																+#define __LOAD_BALANCER_POLICY_H__
															
 
																+
															
 
																+#include <starpu_mpi_lb.h>
															
 
																+
															
 
																+#ifdef __cplusplus
															
 
																+extern "C" {
															
 
																+#endif
															
 
																+
															
 
																+/* A load balancer consists in a collection of operations on a data
															
 
																+ * representing the load of the application (in terms of computation, memory,
															
 
																+ * whatever). StarPU allows several entry points for the user. The load
															
 
																+ * balancer allows the user to give its load balancing methods to be used on
															
 
																+ * these entry points of the runtime system. */
															
 
																+struct load_balancer_policy
															
 
																+{
															
 
																+	int (*init)(struct starpu_mpi_lb_conf *);
															
 
																+	int (*deinit)();
															
 
																+	void (*submitted_task_entry_point)();
															
 
																+	void (*finished_task_entry_point)();
															
 
																+
															
 
																+	/* Name of the load balancing policy. The selection of the load balancer is
															
 
																+	 * performed through the use of the STARPU_MPI_LB=name environment
															
 
																+	 * variable.
															
 
																+	 */
															
 
																+	const char *policy_name;
															
 
																+};
															
 
																+
															
 
																+extern struct load_balancer_policy load_heat_propagation_policy;
															
 
																+
															
 
																+#ifdef __cplusplus
															
 
																+}
															
 
																+#endif
															
 
																+
															
 
																+#endif // __LOAD_BALANCER_POLICY_H__
															
--- a/mpi/src/load_balancer/policy/load_data_interface.c
+++ b/mpi/src/load_balancer/policy/load_data_interface.c
@@ -0,0 +1,268 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2016  Inria
															
 
																+ * Copyright (C) 2017  CNRS
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include <starpu.h>
															
 
																+#include <stdlib.h>
															
 
																+
															
 
																+#include "load_data_interface.h"
															
 
																+
															
 
																+int load_data_get_sleep_threshold(starpu_data_handle_t handle)
															
 
																+{
															
 
																+	struct load_data_interface *ld_interface =
															
 
																+		(struct load_data_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
															
 
																+
															
 
																+	return ld_interface->sleep_task_threshold;
															
 
																+}
															
 
																+
															
 
																+int load_data_get_wakeup_threshold(starpu_data_handle_t handle)
															
 
																+{
															
 
																+	struct load_data_interface *ld_interface =
															
 
																+		(struct load_data_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
															
 
																+
															
 
																+	return ld_interface->wakeup_task_threshold;
															
 
																+}
															
 
																+
															
 
																+int load_data_get_current_phase(starpu_data_handle_t handle)
															
 
																+{
															
 
																+	struct load_data_interface *ld_interface =
															
 
																+		(struct load_data_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
															
 
																+
															
 
																+	return ld_interface->phase;
															
 
																+}
															
 
																+
															
 
																+int load_data_get_nsubmitted_tasks(starpu_data_handle_t handle)
															
 
																+{
															
 
																+	struct load_data_interface *ld_interface =
															
 
																+		(struct load_data_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
															
 
																+
															
 
																+	return ld_interface->nsubmitted_tasks;
															
 
																+}
															
 
																+
															
 
																+int load_data_get_nfinished_tasks(starpu_data_handle_t handle)
															
 
																+{
															
 
																+	struct load_data_interface *ld_interface =
															
 
																+		(struct load_data_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
															
 
																+
															
 
																+	return ld_interface->nfinished_tasks;
															
 
																+}
															
 
																+
															
 
																+int load_data_inc_nsubmitted_tasks(starpu_data_handle_t handle)
															
 
																+{
															
 
																+	struct load_data_interface *ld_interface =
															
 
																+		(struct load_data_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
															
 
																+
															
 
																+	(ld_interface->nsubmitted_tasks)++;
															
 
																+
															
 
																+	return 0;
															
 
																+}
															
 
																+
															
 
																+int load_data_inc_nfinished_tasks(starpu_data_handle_t handle)
															
 
																+{
															
 
																+	struct load_data_interface *ld_interface =
															
 
																+		(struct load_data_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
															
 
																+
															
 
																+	(ld_interface->nfinished_tasks)++;
															
 
																+
															
 
																+	return 0;
															
 
																+}
															
 
																+
															
 
																+int load_data_next_phase(starpu_data_handle_t handle)
															
 
																+{
															
 
																+	struct load_data_interface *ld_interface =
															
 
																+		(struct load_data_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
															
 
																+
															
 
																+	ld_interface->phase++;
															
 
																+
															
 
																+	return 0;
															
 
																+}
															
 
																+
															
 
																+int load_data_update_elapsed_time(starpu_data_handle_t handle)
															
 
																+{
															
 
																+	struct load_data_interface *ld_interface =
															
 
																+		(struct load_data_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
															
 
																+
															
 
																+	ld_interface->elapsed_time = starpu_timing_now() - ld_interface->start;
															
 
																+
															
 
																+	return 0;
															
 
																+}
															
 
																+
															
 
																+double load_data_get_elapsed_time(starpu_data_handle_t handle)
															
 
																+{
															
 
																+	struct load_data_interface *ld_interface =
															
 
																+		(struct load_data_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
															
 
																+
															
 
																+	return ld_interface->elapsed_time;
															
 
																+}
															
 
																+
															
 
																+int load_data_update_wakeup_cond(starpu_data_handle_t handle)
															
 
																+{
															
 
																+	struct load_data_interface *ld_interface =
															
 
																+		(struct load_data_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
															
 
																+
															
 
																+	int previous_threshold = ld_interface->wakeup_task_threshold;
															
 
																+	ld_interface->wakeup_task_threshold += (ld_interface->nsubmitted_tasks - previous_threshold) * ld_interface->wakeup_ratio;
															
 
																+
															
 
																+	return 0;
															
 
																+}
															
 
																+
															
 
																+int load_data_wakeup_cond(starpu_data_handle_t handle)
															
 
																+{
															
 
																+	struct load_data_interface *ld_interface =
															
 
																+		(struct load_data_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
															
 
																+
															
 
																+	return ((ld_interface->wakeup_task_threshold > 0) && (ld_interface->nfinished_tasks == ld_interface->wakeup_task_threshold));
															
 
																+}
															
 
																+
															
 
																+static void load_data_register_data_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface)
															
 
																+{
															
 
																+	struct load_data_interface *ld_interface = (struct load_data_interface *) data_interface;
															
 
																+
															
 
																+	unsigned node;
															
 
																+	for (node = 0; node < STARPU_MAXNODES; node++)
															
 
																+	{
															
 
																+		struct load_data_interface *local_interface = (struct load_data_interface *)
															
 
																+			starpu_data_get_interface_on_node(handle, node);
															
 
																+
															
 
																+		local_interface->start = ld_interface->start;
															
 
																+		local_interface->elapsed_time = ld_interface->elapsed_time;
															
 
																+		local_interface->phase = ld_interface->phase;
															
 
																+		local_interface->nsubmitted_tasks = ld_interface->nsubmitted_tasks;
															
 
																+		local_interface->nfinished_tasks = ld_interface->nsubmitted_tasks;
															
 
																+		local_interface->wakeup_task_threshold = ld_interface->wakeup_task_threshold;
															
 
																+		local_interface->wakeup_ratio = ld_interface->wakeup_ratio;
															
 
																+		local_interface->sleep_task_threshold = ld_interface->sleep_task_threshold;
															
 
																+	}
															
 
																+}
															
 
																+
															
 
																+static starpu_ssize_t load_data_allocate_data_on_node(void *data_interface, unsigned node)
															
 
																+{
															
 
																+	(void) data_interface;
															
 
																+	(void) node;
															
 
																+
															
 
																+	return 0;
															
 
																+}
															
 
																+
															
 
																+static void load_data_free_data_on_node(void *data_interface, unsigned node)
															
 
																+{
															
 
																+	(void) data_interface;
															
 
																+	(void) node;
															
 
																+}
															
 
																+
															
 
																+static size_t load_data_get_size(starpu_data_handle_t handle)
															
 
																+{
															
 
																+	(void) handle;
															
 
																+	return (sizeof(struct load_data_interface));
															
 
																+}
															
 
																+
															
 
																+static uint32_t load_data_footprint(starpu_data_handle_t handle)
															
 
																+{
															
 
																+	struct load_data_interface *ld_interface =
															
 
																+		(struct load_data_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
															
 
																+	return starpu_hash_crc32c_be(ld_interface->start,
															
 
																+				     starpu_hash_crc32c_be(ld_interface->elapsed_time,
															
 
																+							   starpu_hash_crc32c_be(ld_interface->nsubmitted_tasks,
															
 
																+										 starpu_hash_crc32c_be(ld_interface->sleep_task_threshold, ld_interface->wakeup_task_threshold))));
															
 
																+}
															
 
																+
															
 
																+static int load_data_pack_data(starpu_data_handle_t handle, unsigned node, void **ptr, starpu_ssize_t *count)
															
 
																+{
															
 
																+	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
															
 
																+
															
 
																+	struct load_data_interface *ld_interface = (struct load_data_interface *)
															
 
																+		starpu_data_get_interface_on_node(handle, node);
															
 
																+
															
 
																+	*count = load_data_get_size(handle);
															
 
																+	if (ptr != NULL)
															
 
																+	{
															
 
																+		char *data;
															
 
																+		starpu_malloc_flags((void**) &data, *count, 0);
															
 
																+		*ptr = data;
															
 
																+		memcpy(data, ld_interface, *count);
															
 
																+	}
															
 
																+
															
 
																+	return 0;
															
 
																+}
															
 
																+
															
 
																+static int load_data_unpack_data(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count)
															
 
																+{
															
 
																+	char *data = ptr;
															
 
																+	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
															
 
																+
															
 
																+	struct load_data_interface *ld_interface = (struct load_data_interface *)
															
 
																+		starpu_data_get_interface_on_node(handle, node);
															
 
																+
															
 
																+	STARPU_ASSERT(count == sizeof(struct load_data_interface));
															
 
																+	memcpy(ld_interface, data, count);
															
 
																+
															
 
																+	return 0;
															
 
																+}
															
 
																+
															
 
																+static int copy_any_to_any(void *src_interface, unsigned src_node,
															
 
																+			   void *dst_interface, unsigned dst_node,
															
 
																+			   void *async_data)
															
 
																+{
															
 
																+	(void) src_interface;
															
 
																+	(void) dst_interface;
															
 
																+	(void) src_node;
															
 
																+	(void) dst_node;
															
 
																+	(void) async_data;
															
 
																+
															
 
																+	return 0;
															
 
																+}
															
 
																+
															
 
																+static const struct starpu_data_copy_methods load_data_copy_methods =
															
 
																+{
															
 
																+	.any_to_any = copy_any_to_any
															
 
																+};
															
 
																+
															
 
																+static struct starpu_data_interface_ops interface_load_data_ops =
															
 
																+{
															
 
																+	.register_data_handle = load_data_register_data_handle,
															
 
																+	.allocate_data_on_node = load_data_allocate_data_on_node,
															
 
																+	.free_data_on_node = load_data_free_data_on_node,
															
 
																+	.copy_methods = &load_data_copy_methods,
															
 
																+	.get_size = load_data_get_size,
															
 
																+	.footprint = load_data_footprint,
															
 
																+	.interfaceid = STARPU_UNKNOWN_INTERFACE_ID,
															
 
																+	.interface_size = sizeof(struct load_data_interface),
															
 
																+	.handle_to_pointer = NULL,
															
 
																+	.pack_data = load_data_pack_data,
															
 
																+	.unpack_data = load_data_unpack_data,
															
 
																+	.describe = NULL
															
 
																+};
															
 
																+
															
 
																+void load_data_data_register(starpu_data_handle_t *handleptr, unsigned home_node, int sleep_task_threshold, double wakeup_ratio)
															
 
																+{
															
 
																+	struct load_data_interface load_data =
															
 
																+	{
															
 
																+		.start = starpu_timing_now(),
															
 
																+		.elapsed_time = 0,
															
 
																+		.phase = 0,
															
 
																+		.nsubmitted_tasks = 0,
															
 
																+		.nfinished_tasks = 0,
															
 
																+		.sleep_task_threshold = sleep_task_threshold,
															
 
																+		.wakeup_task_threshold = 0,
															
 
																+		.wakeup_ratio = wakeup_ratio
															
 
																+	};
															
 
																+
															
 
																+	if (interface_load_data_ops.interfaceid == STARPU_UNKNOWN_INTERFACE_ID)
															
 
																+	{
															
 
																+		interface_load_data_ops.interfaceid = starpu_data_interface_get_next_id();
															
 
																+	}
															
 
																+
															
 
																+	starpu_data_register(handleptr, home_node, &load_data, &interface_load_data_ops);
															
 
																+}
															
--- a/mpi/src/load_balancer/policy/load_data_interface.h
+++ b/mpi/src/load_balancer/policy/load_data_interface.h
@@ -0,0 +1,70 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2016  Inria
															
 
																+ * Copyright (C) 2017  CNRS
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include <starpu.h>
															
 
																+
															
 
																+#ifndef __LOAD_DATA_INTERFACE_H
															
 
																+#define __LOAD_DATA_INTERFACE_H
															
 
																+
															
 
																+/* interface for load_data */
															
 
																+struct load_data_interface
															
 
																+{
															
 
																+	/* Starting time of the execution */
															
 
																+	double start;
															
 
																+	/* Elapsed time until the start time and the time when event "launch a load
															
 
																+	 * balancing phase" is triggered */
															
 
																+	double elapsed_time;
															
 
																+	/* Current submission phase, i.e how many balanced steps have already
															
 
																+	 * happened so far. */
															
 
																+	int phase;
															
 
																+	/* Number of currently submitted tasks */
															
 
																+	int nsubmitted_tasks;
															
 
																+	/* Number of currently finished tasks */
															
 
																+	int nfinished_tasks;
															
 
																+	/* Task threshold to sleep the submission thread */
															
 
																+	int sleep_task_threshold;
															
 
																+	/* Task threshold to wake-up the submission thread */
															
 
																+	int wakeup_task_threshold;
															
 
																+	/* Ratio of submitted tasks to wait for completion before waking up the
															
 
																+	 * submission thread */
															
 
																+	double wakeup_ratio;
															
 
																+};
															
 
																+
															
 
																+void load_data_data_register(starpu_data_handle_t *handle, unsigned home_node, int sleep_task_threshold, double wakeup_ratio);
															
 
																+
															
 
																+int load_data_get_sleep_threshold(starpu_data_handle_t handle);
															
 
																+int load_data_get_wakeup_threshold(starpu_data_handle_t handle);
															
 
																+int load_data_get_current_phase(starpu_data_handle_t handle);
															
 
																+int load_data_get_nsubmitted_tasks(starpu_data_handle_t handle);
															
 
																+int load_data_get_nfinished_tasks(starpu_data_handle_t handle);
															
 
																+
															
 
																+int load_data_inc_nsubmitted_tasks(starpu_data_handle_t handle);
															
 
																+int load_data_inc_nfinished_tasks(starpu_data_handle_t handle);
															
 
																+
															
 
																+int load_data_next_phase(starpu_data_handle_t handle);
															
 
																+
															
 
																+int load_data_update_elapsed_time(starpu_data_handle_t handle);
															
 
																+double load_data_get_elapsed_time(starpu_data_handle_t handle);
															
 
																+
															
 
																+int load_data_update_wakeup_cond(starpu_data_handle_t handle);
															
 
																+int load_data_wakeup_cond(starpu_data_handle_t handle);
															
 
																+
															
 
																+#define LOAD_DATA_GET_NSUBMITTED_TASKS(interface)	(((struct load_data_interface *)(interface))->nsubmitted_tasks)
															
 
																+#define LOAD_DATA_GET_SLEEP_THRESHOLD(interface)	(((struct load_data_interface *)(interface))->sleep_task_threshold)
															
 
																+#define LOAD_DATA_GET_WAKEUP_THRESHOLD(interface)	(((struct load_data_interface *)(interface))->wakeup_task_threshold)
															
 
																+
															
 
																+#endif /* __LOAD_DATA_INTERFACE_H */
															
--- a/mpi/src/load_balancer/policy/load_heat_propagation.c
+++ b/mpi/src/load_balancer/policy/load_heat_propagation.c
@@ -0,0 +1,640 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2016  Inria
															
 
																+ * Copyright (C) 2017  CNRS
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include <starpu_mpi.h>
															
 
																+#include <starpu_mpi_tag.h>
															
 
																+#include <common/uthash.h>
															
 
																+#include <common/utils.h>
															
 
																+#include <math.h>
															
 
																+
															
 
																+#include "load_balancer_policy.h"
															
 
																+#include "data_movements_interface.h"
															
 
																+#include "load_data_interface.h"
															
 
																+
															
 
																+static int TAG_LOAD(int n)
															
 
																+{
															
 
																+	return ((n+1) << 24);
															
 
																+}
															
 
																+
															
 
																+static int TAG_MOV(int n)
															
 
																+{
															
 
																+	return ((n+1) << 20);
															
 
																+}
															
 
																+
															
 
																+/* Hash table of local pieces of data that has been moved out of the local MPI
															
 
																+ * node by the load balancer. All of these pieces of data must be migrated back
															
 
																+ * to the local node at the end of the execution. */
															
 
																+struct moved_data_entry
															
 
																+{
															
 
																+	UT_hash_handle hh;
															
 
																+	starpu_data_handle_t handle;
															
 
																+};
															
 
																+
															
 
																+static struct moved_data_entry *mdh = NULL;
															
 
																+
															
 
																+static starpu_pthread_mutex_t load_data_mutex;
															
 
																+static starpu_pthread_cond_t load_data_cond;
															
 
																+
															
 
																+/* MPI infos */
															
 
																+static int my_rank;
															
 
																+static int world_size;
															
 
																+
															
 
																+/* Number of neighbours of the local MPI node and their IDs. These are given by
															
 
																+ * the get_neighbors() method, and thus can be easily changed. */
															
 
																+static int *neighbor_ids = NULL;
															
 
																+static int nneighbors = 0;
															
 
																+
															
 
																+/* Local load data */
															
 
																+static starpu_data_handle_t *load_data_handle = NULL;
															
 
																+static starpu_data_handle_t *load_data_handle_cpy = NULL;
															
 
																+/* Load data of neighbours */
															
 
																+static starpu_data_handle_t *neighbor_load_data_handles = NULL;
															
 
																+
															
 
																+/* Table which contains a data_movements_handle for each MPI node of
															
 
																+ * MPI_COMM_WORLD. Since all the MPI nodes must be advised of any data
															
 
																+ * movement, this table will be used to perform communications of data
															
 
																+ * movements handles following an all-to-all model. */
															
 
																+static starpu_data_handle_t *data_movements_handles = NULL;
															
 
																+
															
 
																+/* Load balancer interface which contains the application-specific methods for
															
 
																+ * the load balancer to use. */
															
 
																+static struct starpu_mpi_lb_conf *user_itf = NULL;
															
 
																+
															
 
																+static double time_threshold = 20000;
															
 
																+
															
 
																+/******************************************************************************
															
 
																+ *                              Balancing                                     *
															
 
																+ *****************************************************************************/
															
 
																+
															
 
																+
															
 
																+/* Decides which data has to move where, and fills the
															
 
																+ * data_movements_handles[my_rank] data handle from that.
															
 
																+ * In data :
															
 
																+ *  - local load_data_handle
															
 
																+ *  - nneighbors
															
 
																+ *  - neighbor_ids[nneighbors]
															
 
																+ *  - neighbor_load_data_handles[nneighbors]
															
 
																+ * Out data :
															
 
																+ *  - data_movements_handles[my_rank]
															
 
																+ */
															
 
																+
															
 
																+static void balance(starpu_data_handle_t load_data_cpy)
															
 
																+{
															
 
																+	int less_loaded = -1;
															
 
																+	int n;
															
 
																+	double elapsed_time, ref_elapsed_time;
															
 
																+	double my_elapsed_time = load_data_get_elapsed_time(load_data_cpy);
															
 
																+
															
 
																+	/* Search for the less loaded neighbor */
															
 
																+	ref_elapsed_time = my_elapsed_time;
															
 
																+	for (n = 0; n < nneighbors; n++)
															
 
																+	{
															
 
																+		elapsed_time = load_data_get_elapsed_time(neighbor_load_data_handles[n]);
															
 
																+		if (ref_elapsed_time > elapsed_time)
															
 
																+		{
															
 
																+			//fprintf(stderr,"Node%d: ref local time %lf vs neighbour%d time %lf\n", my_rank, ref_elapsed_time, neighbor_ids[n], elapsed_time);
															
 
																+			less_loaded = neighbor_ids[n];
															
 
																+			ref_elapsed_time = elapsed_time;
															
 
																+		}
															
 
																+	}
															
 
																+
															
 
																+	/* We found it */
															
 
																+	if (less_loaded >= 0)
															
 
																+	{
															
 
																+		_STARPU_DEBUG("Less loaded found on node %d : %d\n", my_rank, less_loaded);
															
 
																+		double diff_time = my_elapsed_time - ref_elapsed_time;
															
 
																+		/* If the difference is higher than a time threshold, we move
															
 
																+		 * one data to the less loaded neighbour. */
															
 
																+		/* TODO: How to decide the time threshold ? */
															
 
																+		if ((time_threshold > 0) && (diff_time >= time_threshold))
															
 
																+		{
															
 
																+			starpu_data_handle_t *handles = NULL;
															
 
																+			int nhandles = 0;
															
 
																+			user_itf->get_data_unit_to_migrate(&handles, &nhandles, less_loaded);
															
 
																+
															
 
																+			data_movements_reallocate_tables(data_movements_handles[my_rank], nhandles);
															
 
																+
															
 
																+			if (nhandles)
															
 
																+			{
															
 
																+				int *tags = data_movements_get_tags_table(data_movements_handles[my_rank]);
															
 
																+				int *ranks = data_movements_get_ranks_table(data_movements_handles[my_rank]);
															
 
																+
															
 
																+				for (n = 0; n < nhandles; n++)
															
 
																+				{
															
 
																+					tags[n] = starpu_mpi_data_get_tag(handles[n]);
															
 
																+					ranks[n] = less_loaded;
															
 
																+				}
															
 
																+
															
 
																+				free(handles);
															
 
																+			}
															
 
																+		}
															
 
																+		else
															
 
																+			data_movements_reallocate_tables(data_movements_handles[my_rank], 0);
															
 
																+	}
															
 
																+	else
															
 
																+		data_movements_reallocate_tables(data_movements_handles[my_rank], 0);
															
 
																+}
															
 
																+
															
 
																+static void exchange_load_data_infos(starpu_data_handle_t load_data_cpy)
															
 
																+{
															
 
																+	int i;
															
 
																+
															
 
																+	/* Allocate all requests and status for point-to-point communications */
															
 
																+	starpu_mpi_req load_send_req[nneighbors];
															
 
																+	starpu_mpi_req load_recv_req[nneighbors];
															
 
																+
															
 
																+	MPI_Status load_send_status[nneighbors];
															
 
																+	MPI_Status load_recv_status[nneighbors];
															
 
																+
															
 
																+	int flag;
															
 
																+
															
 
																+	/* Send the local load data to neighbour nodes, and receive the remote load
															
 
																+	 * data from neighbour nodes */
															
 
																+	for (i = 0; i < nneighbors; i++)
															
 
																+	{
															
 
																+		//_STARPU_DEBUG("[node %d] sending and receiving with %i-th neighbor %i\n", my_rank, i, neighbor_ids[i]);
															
 
																+		starpu_mpi_isend(load_data_cpy, &load_send_req[i], neighbor_ids[i], TAG_LOAD(my_rank), MPI_COMM_WORLD);
															
 
																+		starpu_mpi_irecv(neighbor_load_data_handles[i], &load_recv_req[i], neighbor_ids[i], TAG_LOAD(neighbor_ids[i]), MPI_COMM_WORLD);
															
 
																+	}
															
 
																+
															
 
																+	/* Wait for completion of all send requests */
															
 
																+	for (i = 0; i < nneighbors; i++)
															
 
																+	{
															
 
																+		flag = 0;
															
 
																+		while (!flag)
															
 
																+			starpu_mpi_test(&load_send_req[i], &flag, &load_send_status[i]);
															
 
																+	}
															
 
																+
															
 
																+	/* Wait for completion of all receive requests */
															
 
																+	for (i = 0; i < nneighbors; i++)
															
 
																+	{
															
 
																+		flag = 0;
															
 
																+		while (!flag)
															
 
																+			starpu_mpi_test(&load_recv_req[i], &flag, &load_recv_status[i]);
															
 
																+	}
															
 
																+}
															
 
																+
															
 
																+static void exchange_data_movements_infos()
															
 
																+{
															
 
																+	int i;
															
 
																+
															
 
																+	/* Allocate all requests and status for point-to-point communications */
															
 
																+	starpu_mpi_req data_movements_send_req[world_size];
															
 
																+	starpu_mpi_req data_movements_recv_req[world_size];
															
 
																+
															
 
																+	MPI_Status data_movements_send_status[world_size];
															
 
																+	MPI_Status data_movements_recv_status[world_size];
															
 
																+
															
 
																+	int flag;
															
 
																+
															
 
																+	/* Send the new ranks of local data to all other nodes, and receive the new
															
 
																+	 * ranks of all remote data from all other nodes */
															
 
																+	for (i = 0; i < world_size; i++)
															
 
																+	{
															
 
																+		if (i != my_rank)
															
 
																+		{
															
 
																+			//_STARPU_DEBUG("[node %d] Send and receive data movement with %d\n", my_rank, i);
															
 
																+			starpu_mpi_isend(data_movements_handles[my_rank], &data_movements_send_req[i], i, TAG_MOV(my_rank), MPI_COMM_WORLD);
															
 
																+			starpu_mpi_irecv(data_movements_handles[i], &data_movements_recv_req[i], i, TAG_MOV(i), MPI_COMM_WORLD);
															
 
																+		}
															
 
																+	}
															
 
																+
															
 
																+	/* Wait for completion of all send requests */
															
 
																+	for (i = 0; i < world_size; i++)
															
 
																+	{
															
 
																+		if (i != my_rank)
															
 
																+		{
															
 
																+			//fprintf(stderr,"Wait for sending data movement of %d to %d\n", my_rank, i);
															
 
																+			flag = 0;
															
 
																+			while (!flag)
															
 
																+				starpu_mpi_test(&data_movements_send_req[i], &flag, &data_movements_send_status[i]);
															
 
																+		}
															
 
																+	}
															
 
																+
															
 
																+	/* Wait for completion of all receive requests */
															
 
																+	for (i = 0; i < world_size; i++)
															
 
																+	{
															
 
																+		if (i != my_rank)
															
 
																+		{
															
 
																+			//fprintf(stderr,"Wait for recieving data movement from %d on %d\n", i, my_rank);
															
 
																+			flag = 0;
															
 
																+			while (!flag)
															
 
																+				starpu_mpi_test(&data_movements_recv_req[i], &flag, &data_movements_recv_status[i]);
															
 
																+		}
															
 
																+	}
															
 
																+}
															
 
																+
															
 
																+static void update_data_ranks()
															
 
																+{
															
 
																+	int i,j;
															
 
																+
															
 
																+	/* Update the new ranks for all concerned data */
															
 
																+	for (i = 0; i < world_size; i++)
															
 
																+	{
															
 
																+		int ndata_to_update = data_movements_get_size_tables(data_movements_handles[i]);
															
 
																+		if (ndata_to_update)
															
 
																+		{
															
 
																+			//fprintf(stderr,"Update %d data from table %d on node %d\n", ndata_to_update, i, my_rank);
															
 
																+
															
 
																+			for (j = 0; j < ndata_to_update; j++)
															
 
																+			{
															
 
																+				starpu_data_handle_t handle = _starpu_mpi_data_get_data_handle_from_tag((data_movements_get_tags_table(data_movements_handles[i]))[j]);
															
 
																+				STARPU_ASSERT(handle);
															
 
																+				int dst_rank = (data_movements_get_ranks_table(data_movements_handles[i]))[j];
															
 
																+
															
 
																+				/* Save the fact that the data has been moved out of this node */
															
 
																+				if (i == my_rank)
															
 
																+				{
															
 
																+					struct moved_data_entry *md = (struct moved_data_entry *)malloc(sizeof(struct moved_data_entry));
															
 
																+					md->handle = handle;
															
 
																+					HASH_ADD_PTR(mdh, handle, md);
															
 
																+				}
															
 
																+				else if (dst_rank == my_rank)
															
 
																+				{
															
 
																+					/* The data has been moved out, and now is moved back, so
															
 
																+					 * update the state of the moved_data hash table to reflect
															
 
																+					 * this change */
															
 
																+					struct moved_data_entry *md = NULL;
															
 
																+					HASH_FIND_PTR(mdh, &handle, md);
															
 
																+					if (md)
															
 
																+					{
															
 
																+						HASH_DEL(mdh, md);
															
 
																+						free(md);
															
 
																+					}
															
 
																+				}
															
 
																+
															
 
																+				//if (i == my_rank)
															
 
																+				//{
															
 
																+				//    if (dst_rank != my_rank)
															
 
																+				//        fprintf(stderr,"Move data %p (tag %d) from node %d to node %d\n", handle, (data_movements_get_tags_table(data_movements_handles[i]))[j], my_rank, dst_rank);
															
 
																+				//    else
															
 
																+				//        fprintf(stderr,"Bring back data %p (tag %d) from node %d on node %d\n", handle, (data_movements_get_tags_table(data_movements_handles[i]))[j], starpu_mpi_data_get_rank(handle), my_rank);
															
 
																+				//}
															
 
																+
															
 
																+				_STARPU_DEBUG("Call of starpu_mpi_get_data_on_node(%d,%d) on node %d\n", starpu_mpi_data_get_tag(handle), dst_rank, my_rank);
															
 
																+
															
 
																+				/* Migrate the data handle */
															
 
																+				starpu_mpi_get_data_on_node_detached(MPI_COMM_WORLD, handle, dst_rank, NULL, NULL);
															
 
																+
															
 
																+				_STARPU_DEBUG("New rank (%d) of data %d upgraded on node %d\n", dst_rank, starpu_mpi_data_get_tag(handle), my_rank);
															
 
																+				starpu_mpi_data_set_rank_comm(handle, dst_rank, MPI_COMM_WORLD);
															
 
																+			}
															
 
																+		}
															
 
																+	}
															
 
																+}
															
 
																+
															
 
																+static void clean_balance()
															
 
																+{
															
 
																+	int i;
															
 
																+	starpu_mpi_cache_flush(MPI_COMM_WORLD, *load_data_handle_cpy);
															
 
																+	for (i = 0; i < nneighbors; i++)
															
 
																+		starpu_mpi_cache_flush(MPI_COMM_WORLD, neighbor_load_data_handles[i]);
															
 
																+	for (i = 0; i < world_size; i++)
															
 
																+		starpu_mpi_cache_flush(MPI_COMM_WORLD, data_movements_handles[i]);
															
 
																+}
															
 
																+
															
 
																+/* Core function of the load balancer. Computes from the load_data_cpy handle a
															
 
																+ * load balancing of the work to come (if needed), perform the necessary data
															
 
																+ * communications and negociate with the other nodes the rebalancing. */
															
 
																+static void heat_balance(starpu_data_handle_t load_data_cpy)
															
 
																+{
															
 
																+	/* Exchange load data handles with neighboring nodes */
															
 
																+	exchange_load_data_infos(load_data_cpy);
															
 
																+
															
 
																+	/* Determine if this node should sent data to other nodes :
															
 
																+	 * which ones, how much data */
															
 
																+	balance(load_data_cpy);
															
 
																+
															
 
																+	/* Exchange data movements with neighboring nodes */
															
 
																+	exchange_data_movements_infos();
															
 
																+
															
 
																+	/* Perform data movements */
															
 
																+	update_data_ranks();
															
 
																+
															
 
																+	/* Clean the data handles to properly launch the next balance phase */
															
 
																+	clean_balance();
															
 
																+}
															
 
																+
															
 
																+/******************************************************************************
															
 
																+ *                      Heat Load Balancer Entry Points                       *
															
 
																+ *****************************************************************************/
															
 
																+
															
 
																+static void submitted_task_heat(struct starpu_task *task)
															
 
																+{
															
 
																+	load_data_inc_nsubmitted_tasks(*load_data_handle);
															
 
																+	//if (load_data_get_nsubmitted_tasks(*load_data_handle) > task->tag_id)
															
 
																+	//{
															
 
																+	//    fprintf(stderr,"Error : nsubmitted_tasks (%d) > tag_id (%lld) ! \n", load_data_get_nsubmitted_tasks(*load_data_handle), (long long int)task->tag_id);
															
 
																+	//    STARPU_ASSERT(0);
															
 
																+	//}
															
 
																+
															
 
																+	int phase = load_data_get_current_phase(*load_data_handle);
															
 
																+	/* Numbering of tasks in StarPU-MPI should be given by the application with
															
 
																+	 * the STARPU_TAG_ONLY insert task option for now. */
															
 
																+	/* TODO: Properly implement a solution for numbering tasks in StarPU-MPI */
															
 
																+	if ((task->tag_id / load_data_get_sleep_threshold(*load_data_handle)) > phase)
															
 
																+	{
															
 
																+		STARPU_PTHREAD_MUTEX_LOCK(&load_data_mutex);
															
 
																+		load_data_update_wakeup_cond(*load_data_handle);
															
 
																+		//fprintf(stderr,"Node %d sleep on tag %lld\n", my_rank, (long long int)task->tag_id);
															
 
																+		//if (load_data_get_nsubmitted_tasks(*load_data_handle) < load_data_get_wakeup_threshold(*load_data_handle))
															
 
																+		//{
															
 
																+		//    fprintf(stderr,"Error : nsubmitted_tasks (%d) lower than wakeup_threshold (%d) !\n", load_data_get_nsubmitted_tasks(*load_data_handle), load_data_get_wakeup_threshold(*load_data_handle));
															
 
																+		//    STARPU_ASSERT(0);
															
 
																+		//}
															
 
																+
															
 
																+		if (load_data_get_wakeup_threshold(*load_data_handle) > load_data_get_nfinished_tasks(*load_data_handle))
															
 
																+			STARPU_PTHREAD_COND_WAIT(&load_data_cond, &load_data_mutex);
															
 
																+
															
 
																+		load_data_next_phase(*load_data_handle);
															
 
																+
															
 
																+		/* Register a copy of the load data at this moment, to allow to compute
															
 
																+		 * the heat balance while not locking the load data during the whole
															
 
																+		 * balance step, which could cause all the workers to wait on the lock
															
 
																+		 * to update the data. */
															
 
																+		struct starpu_data_interface_ops *itf_load_data = starpu_data_get_interface_ops(*load_data_handle);
															
 
																+		void* itf_src = starpu_data_get_interface_on_node(*load_data_handle, STARPU_MAIN_RAM);
															
 
																+		void* itf_dst = starpu_data_get_interface_on_node(*load_data_handle_cpy, STARPU_MAIN_RAM);
															
 
																+		memcpy(itf_dst, itf_src, itf_load_data->interface_size);
															
 
																+
															
 
																+		_STARPU_DEBUG("[node %d] Balance phase %d\n", my_rank, load_data_get_current_phase(*load_data_handle));
															
 
																+		STARPU_PTHREAD_MUTEX_UNLOCK(&load_data_mutex);
															
 
																+
															
 
																+		heat_balance(*load_data_handle_cpy);
															
 
																+	}
															
 
																+}
															
 
																+
															
 
																+static void finished_task_heat()
															
 
																+{
															
 
																+	//fprintf(stderr,"Try to decrement nsubmitted_tasks...");
															
 
																+	STARPU_PTHREAD_MUTEX_LOCK(&load_data_mutex);
															
 
																+
															
 
																+	load_data_inc_nfinished_tasks(*load_data_handle);
															
 
																+	//fprintf(stderr,"Decrement nsubmitted_tasks, now %d\n", load_data_get_nsubmitted_tasks(*load_data_handle));
															
 
																+	if (load_data_wakeup_cond(*load_data_handle))
															
 
																+	{
															
 
																+		//fprintf(stderr,"Wakeup ! nfinished_tasks = %d, wakeup_threshold = %d\n", load_data_get_nfinished_tasks(*load_data_handle), load_data_get_wakeup_threshold(*load_data_handle));
															
 
																+		load_data_update_elapsed_time(*load_data_handle);
															
 
																+		STARPU_PTHREAD_COND_SIGNAL(&load_data_cond);
															
 
																+		STARPU_PTHREAD_MUTEX_UNLOCK(&load_data_mutex);
															
 
																+	}
															
 
																+	else
															
 
																+		STARPU_PTHREAD_MUTEX_UNLOCK(&load_data_mutex);
															
 
																+}
															
 
																+
															
 
																+/******************************************************************************
															
 
																+ *                  Initialization / Deinitialization                         *
															
 
																+ *****************************************************************************/
															
 
																+
															
 
																+static int init_heat(struct starpu_mpi_lb_conf *itf)
															
 
																+{
															
 
																+	int i;
															
 
																+	int sleep_task_threshold;
															
 
																+	double wakeup_ratio;
															
 
																+
															
 
																+	starpu_mpi_comm_size(MPI_COMM_WORLD, &world_size);
															
 
																+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &my_rank);
															
 
																+
															
 
																+	/* Immediately return if the starpu_mpi_lb_conf is invalid. */
															
 
																+	if (!(itf && itf->get_neighbors && itf->get_data_unit_to_migrate))
															
 
																+	{
															
 
																+		_STARPU_MSG("Error: struct starpu_mpi_lb_conf %p invalid\n", itf);
															
 
																+		return 1;
															
 
																+	}
															
 
																+
															
 
																+	user_itf = malloc(sizeof(struct starpu_mpi_lb_conf));
															
 
																+	memcpy(user_itf, itf, sizeof(struct starpu_mpi_lb_conf));;
															
 
																+
															
 
																+	/* Get the neighbors of the local MPI node */
															
 
																+	user_itf->get_neighbors(&neighbor_ids, &nneighbors);
															
 
																+	if (nneighbors == 0)
															
 
																+	{
															
 
																+		_STARPU_MSG("Error: Function get_neighbors returning 0 neighbor\n");
															
 
																+		free(user_itf);
															
 
																+		user_itf = NULL;
															
 
																+		return 2;
															
 
																+	}
															
 
																+
															
 
																+	/* The sleep threshold is deducted from the numbering of tasks by the
															
 
																+	 * application. For example, with this threshold, the submission thread
															
 
																+	 * will stop when a task for which the numbering is 2000 or above will be
															
 
																+	 * submitted to StarPU-MPI. However, much less tasks can be really
															
 
																+	 * submitted to the local MPI node: the sleeping of the submission threads
															
 
																+	 * checks the numbering of the tasks, not how many tasks have been
															
 
																+	 * submitted to the local MPI node, which are two different things. */
															
 
																+	char *sleep_env = starpu_getenv("LB_HEAT_SLEEP_THRESHOLD");
															
 
																+	if (sleep_env)
															
 
																+		sleep_task_threshold = atoi(sleep_env);
															
 
																+	else
															
 
																+		sleep_task_threshold = 2000;
															
 
																+
															
 
																+	char *wakeup_env = starpu_getenv("LB_HEAT_WAKEUP_RATIO");
															
 
																+	if (wakeup_env)
															
 
																+		wakeup_ratio = atof(wakeup_env);
															
 
																+	else
															
 
																+		wakeup_ratio = 0.5;
															
 
																+
															
 
																+	char *time_env = starpu_getenv("LB_HEAT_TIME_THRESHOLD");
															
 
																+	if (time_env)
															
 
																+		time_threshold = atoi(time_env);
															
 
																+	else
															
 
																+		time_threshold = 2000;
															
 
																+
															
 
																+	STARPU_PTHREAD_MUTEX_INIT(&load_data_mutex, NULL);
															
 
																+	STARPU_PTHREAD_COND_INIT(&load_data_cond, NULL);
															
 
																+
															
 
																+	/* Allocate, initialize and register all the data handles that will be
															
 
																+	 * needed for the load balancer, to not reallocate them at each balance
															
 
																+	 * step. */
															
 
																+
															
 
																+	/* Local load data */
															
 
																+	load_data_handle = malloc(sizeof(starpu_data_handle_t));
															
 
																+	memset(load_data_handle, 0, sizeof(starpu_data_handle_t));
															
 
																+	load_data_data_register(load_data_handle, STARPU_MAIN_RAM, sleep_task_threshold, wakeup_ratio);
															
 
																+
															
 
																+	/* Copy of the local load data to enable parallel update of the load data
															
 
																+	 * with communications to neighbor nodes */
															
 
																+	load_data_handle_cpy = malloc(sizeof(starpu_data_handle_t));
															
 
																+	memset(load_data_handle_cpy, 0, sizeof(starpu_data_handle_t));
															
 
																+	void *local_interface = starpu_data_get_interface_on_node(*load_data_handle, STARPU_MAIN_RAM);
															
 
																+	struct starpu_data_interface_ops *itf_load_data = starpu_data_get_interface_ops(*load_data_handle);
															
 
																+	starpu_data_register(load_data_handle_cpy, STARPU_MAIN_RAM, local_interface, itf_load_data);
															
 
																+	starpu_mpi_data_register(*load_data_handle_cpy, TAG_LOAD(my_rank), my_rank);
															
 
																+
															
 
																+	/* Remote load data */
															
 
																+	neighbor_load_data_handles = malloc(nneighbors*sizeof(starpu_data_handle_t));
															
 
																+	memset(neighbor_load_data_handles, 0, nneighbors*sizeof(starpu_data_handle_t));
															
 
																+	for (i = 0; i < nneighbors; i++)
															
 
																+	{
															
 
																+		load_data_data_register(&neighbor_load_data_handles[i], STARPU_MAIN_RAM, sleep_task_threshold, wakeup_ratio);
															
 
																+		starpu_mpi_data_register(neighbor_load_data_handles[i], TAG_LOAD(neighbor_ids[i]), neighbor_ids[i]);
															
 
																+	}
															
 
																+
															
 
																+	/* Data movements handles */
															
 
																+	data_movements_handles = malloc(world_size*sizeof(starpu_data_handle_t));
															
 
																+	for (i = 0; i < world_size; i++)
															
 
																+	{
															
 
																+		data_movements_data_register(&data_movements_handles[i], STARPU_MAIN_RAM, NULL, NULL, 0);
															
 
																+		starpu_mpi_data_register(data_movements_handles[i], TAG_MOV(i), i);
															
 
																+	}
															
 
																+
															
 
																+	/* Hash table of moved data that will be brought back on the node at
															
 
																+	 * termination time */
															
 
																+	mdh = NULL;
															
 
																+
															
 
																+	return 0;
															
 
																+}
															
 
																+
															
 
																+/* Move back all the data that has been migrated out of this node at
															
 
																+ * denitialization time of the load balancer, to ensure the consistency with
															
 
																+ * the ranks of data originally registered by the application. */
															
 
																+static void move_back_data()
															
 
																+{
															
 
																+	int i,j;
															
 
																+
															
 
																+	/* Update the new ranks for all concerned data */
															
 
																+	for (i = 0; i < world_size; i++)
															
 
																+	{
															
 
																+		/* In this case, each data_movements_handles contains the handles to move back on the specific node */
															
 
																+		int ndata_to_update = data_movements_get_size_tables(data_movements_handles[i]);
															
 
																+		if (ndata_to_update)
															
 
																+		{
															
 
																+			_STARPU_DEBUG("Move back %d data from table %d on node %d\n", ndata_to_update, i, my_rank);
															
 
																+
															
 
																+			for (j = 0; j < ndata_to_update; j++)
															
 
																+			{
															
 
																+				starpu_data_handle_t handle = _starpu_mpi_data_get_data_handle_from_tag((data_movements_get_tags_table(data_movements_handles[i]))[j]);
															
 
																+				STARPU_ASSERT(handle);
															
 
																+
															
 
																+				int dst_rank = (data_movements_get_ranks_table(data_movements_handles[i]))[j];
															
 
																+				STARPU_ASSERT(i == dst_rank);
															
 
																+
															
 
																+				if (i == my_rank)
															
 
																+				{
															
 
																+					/* The data is moved back, so update the state of the
															
 
																+					 * moved_data hash table to reflect this change */
															
 
																+					struct moved_data_entry *md = NULL;
															
 
																+					HASH_FIND_PTR(mdh, &handle, md);
															
 
																+					if (md)
															
 
																+					{
															
 
																+						HASH_DEL(mdh, md);
															
 
																+						free(md);
															
 
																+					}
															
 
																+				}
															
 
																+
															
 
																+				//fprintf(stderr,"Call of starpu_mpi_get_data_on_node(%d,%d) on node %d\n", starpu_mpi_data_get_tag(handle), dst_rank, my_rank);
															
 
																+
															
 
																+				/* Migrate the data handle */
															
 
																+				starpu_mpi_get_data_on_node_detached(MPI_COMM_WORLD, handle, dst_rank, NULL, NULL);
															
 
																+
															
 
																+				//fprintf(stderr,"New rank (%d) of data %d upgraded on node %d\n", dst_rank, starpu_mpi_data_get_tag(handle), my_rank);
															
 
																+				starpu_mpi_data_set_rank_comm(handle, dst_rank, MPI_COMM_WORLD);
															
 
																+			}
															
 
																+		}
															
 
																+	}
															
 
																+}
															
 
																+
															
 
																+static int deinit_heat()
															
 
																+{
															
 
																+	int i;
															
 
																+
															
 
																+	if ((!user_itf) || (nneighbors == 0))
															
 
																+		return 1;
															
 
																+
															
 
																+	_STARPU_DEBUG("Shutting down heat lb policy\n");
															
 
																+
															
 
																+	unsigned int ndata_to_move_back = HASH_COUNT(mdh);
															
 
																+
															
 
																+	if (ndata_to_move_back)
															
 
																+	{
															
 
																+		_STARPU_DEBUG("Move back %u data on node %d ..\n", ndata_to_move_back, my_rank);
															
 
																+		data_movements_reallocate_tables(data_movements_handles[my_rank], ndata_to_move_back);
															
 
																+
															
 
																+		int *tags = data_movements_get_tags_table(data_movements_handles[my_rank]);
															
 
																+		int *ranks = data_movements_get_ranks_table(data_movements_handles[my_rank]);
															
 
																+
															
 
																+		int n = 0;
															
 
																+		struct moved_data_entry *md, *tmp;
															
 
																+		HASH_ITER(hh, mdh, md, tmp)
															
 
																+		{
															
 
																+			tags[n] = starpu_mpi_data_get_tag(md->handle);
															
 
																+			ranks[n] = my_rank;
															
 
																+			n++;
															
 
																+		}
															
 
																+	}
															
 
																+	else
															
 
																+		data_movements_reallocate_tables(data_movements_handles[my_rank], 0);
															
 
																+
															
 
																+	exchange_data_movements_infos();
															
 
																+	move_back_data();
															
 
																+
															
 
																+	/* This assert ensures that all nodes have properly gotten back all the
															
 
																+	 * data that has been moven out of the node. */
															
 
																+	STARPU_ASSERT(HASH_COUNT(mdh) == 0);
															
 
																+	free(mdh);
															
 
																+	mdh = NULL;
															
 
																+
															
 
																+	starpu_data_unregister(*load_data_handle);
															
 
																+	free(load_data_handle);
															
 
																+	load_data_handle = NULL;
															
 
																+
															
 
																+	starpu_mpi_cache_flush(MPI_COMM_WORLD, *load_data_handle_cpy);
															
 
																+	starpu_data_unregister(*load_data_handle_cpy);
															
 
																+	free(load_data_handle_cpy);
															
 
																+	load_data_handle_cpy = NULL;
															
 
																+
															
 
																+	for (i = 0; i < nneighbors; i++)
															
 
																+	{
															
 
																+		starpu_mpi_cache_flush(MPI_COMM_WORLD, neighbor_load_data_handles[i]);
															
 
																+		starpu_data_unregister(neighbor_load_data_handles[i]);
															
 
																+	}
															
 
																+	free(neighbor_load_data_handles);
															
 
																+	neighbor_load_data_handles = NULL;
															
 
																+
															
 
																+	nneighbors = 0;
															
 
																+	free(neighbor_ids);
															
 
																+	neighbor_ids = NULL;
															
 
																+
															
 
																+	for (i = 0; i < world_size; i++)
															
 
																+	{
															
 
																+		starpu_mpi_cache_flush(MPI_COMM_WORLD, data_movements_handles[i]);
															
 
																+		data_movements_reallocate_tables(data_movements_handles[i], 0);
															
 
																+		starpu_data_unregister(data_movements_handles[i]);
															
 
																+	}
															
 
																+	free(data_movements_handles);
															
 
																+	data_movements_handles = NULL;
															
 
																+
															
 
																+	STARPU_PTHREAD_MUTEX_DESTROY(&load_data_mutex);
															
 
																+	STARPU_PTHREAD_COND_DESTROY(&load_data_cond);
															
 
																+	free(user_itf);
															
 
																+	user_itf = NULL;
															
 
																+
															
 
																+	return 0;
															
 
																+}
															
 
																+
															
 
																+/******************************************************************************
															
 
																+ *                                  Policy                                    *
															
 
																+ *****************************************************************************/
															
 
																+
															
 
																+struct load_balancer_policy load_heat_propagation_policy =
															
 
																+{
															
 
																+	.init = init_heat,
															
 
																+	.deinit = deinit_heat,
															
 
																+	.submitted_task_entry_point = submitted_task_heat,
															
 
																+	.finished_task_entry_point = finished_task_heat,
															
 
																+	.policy_name = "heat"
															
 
																+};
															
--- a/mpi/src/starpu_mpi.c
+++ b/mpi/src/starpu_mpi.c
@@ -449,7 +449,7 @@ static void _starpu_mpi_isend_data_func(struct _starpu_mpi_req *req)
 
																 	_starpu_mpi_simgrid_wait_req(&req->data_request, &req->status_store, &req->queue, &req->done);
															
 
																 #endif
															
 
																-	_STARPU_MPI_TRACE_ISEND_SUBMIT_END(req->node_tag.rank, req->node_tag.data_tag, 0);
															
 
																+	_STARPU_MPI_TRACE_ISEND_SUBMIT_END(req->node_tag.rank, req->node_tag.data_tag, starpu_data_get_size(req->data_handle));
															
 
																 	/* somebody is perhaps waiting for the MPI request to be posted */
															
 
																 	STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
															
--- a/mpi/src/starpu_mpi_task_insert.c
+++ b/mpi/src/starpu_mpi_task_insert.c
@@ -36,6 +36,22 @@
 
																 	else								\
															
 
																 		starpu_mpi_isend_detached(data, dest, data_tag, comm, callback, arg);
															
 
																+static void (*pre_submit_hook)(struct starpu_task *task) = NULL;
															
 
																+
															
 
																+int starpu_mpi_pre_submit_hook_register(void (*f)(struct starpu_task *))
															
 
																+{
															
 
																+	if (pre_submit_hook)
															
 
																+		_STARPU_MSG("Warning: a pre_submit_hook has already been registered. Please check if you really want to erase the previously registered hook.\n");
															
 
																+	pre_submit_hook = f;
															
 
																+	return 0;
															
 
																+}
															
 
																+
															
 
																+int starpu_mpi_pre_submit_hook_unregister()
															
 
																+{
															
 
																+	pre_submit_hook = NULL;
															
 
																+	return 0;
															
 
																+}
															
 
																+
															
 
																 int _starpu_mpi_find_executee_node(starpu_data_handle_t data, enum starpu_data_access_mode mode, int me, int *do_execute, int *inconsistent_execute, int *xrank)
															
 
																 {
															
 
																 	if (mode & STARPU_W)
															
@@ -472,6 +488,7 @@ int _starpu_mpi_task_build_v(MPI_Comm comm, struct starpu_codelet *codelet, stru
 
																 		va_copy(varg_list_copy, varg_list);
															
 
																 		_starpu_task_insert_create(codelet, task, varg_list_copy);
															
 
																 		va_end(varg_list_copy);
															
 
																+
															
 
																 		return 0;
															
 
																 	}
															
 
																 }
															
@@ -526,7 +543,13 @@ int _starpu_mpi_task_insert_v(MPI_Comm comm, struct starpu_codelet *codelet, va_
 
																 			starpu_task_destroy(task);
															
 
																 		}
															
 
																 	}
															
 
																-	return _starpu_mpi_task_postbuild_v(comm, xrank, do_execute, descrs, nb_data);
															
 
																+
															
 
																+	int val = _starpu_mpi_task_postbuild_v(comm, xrank, do_execute, descrs, nb_data);
															
 
																+
															
 
																+	if (ret == 0 && pre_submit_hook)
															
 
																+		pre_submit_hook(task);
															
 
																+
															
 
																+	return val;
															
 
																 }
															
 
																 int starpu_mpi_task_insert(MPI_Comm comm, struct starpu_codelet *codelet, ...)
															
--- a/mpi/tests/Makefile.am
+++ b/mpi/tests/Makefile.am
@@ -1,7 +1,7 @@
 
																 # StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																 #
															
 
																 # Copyright (C) 2009-2012, 2015-2016  Université de Bordeaux
															
 
																-# Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
															
 
																+# Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
															
 
																 #
															
 
																 # StarPU is free software; you can redistribute it and/or modify
															
 
																 # it under the terms of the GNU Lesser General Public License as published by
															
@@ -138,7 +138,8 @@ starpu_mpi_TESTS =				\
 
																 	policy_selection			\
															
 
																 	policy_selection2			\
															
 
																 	early_request				\
															
 
																-	starpu_redefine
															
 
																+	starpu_redefine				\
															
 
																+	load_balancer
															
 
																 noinst_PROGRAMS =				\
															
 
																 	datatypes				\
															
@@ -191,7 +192,8 @@ noinst_PROGRAMS =				\
 
																 	policy_selection			\
															
 
																 	policy_selection2			\
															
 
																 	early_request				\
															
 
																-	starpu_redefine
															
 
																+	starpu_redefine				\
															
 
																+	load_balancer
															
 
																 XFAIL_TESTS=					\
															
@@ -301,6 +303,8 @@ early_request_LDADD =					\
 
																 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
															
 
																 starpu_redefine_LDADD =					\
															
 
																 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
															
 
																+load_balancer_LDADD =					\
															
 
																+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
															
 
																 ring_SOURCES = ring.c
															
 
																 ring_sync_SOURCES = ring_sync.c
															
--- a/mpi/tests/load_balancer.c
+++ b/mpi/tests/load_balancer.c
@@ -0,0 +1,73 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2017  CNRS
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include <starpu_mpi.h>
															
 
																+#include <starpu_mpi_lb.h>
															
 
																+#include "helper.h"
															
 
																+
															
 
																+#if !defined(STARPU_HAVE_UNSETENV)
															
 
																+
															
 
																+#warning unsetenv is not defined. Skipping test
															
 
																+int main(int argc, char **argv)
															
 
																+{
															
 
																+	return STARPU_TEST_SKIPPED;
															
 
																+}
															
 
																+#else
															
 
																+
															
 
																+void get_neighbors(int **neighbor_ids, int *nneighbors)
															
 
																+{
															
 
																+	int ret, rank, size;
															
 
																+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
															
 
																+	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
															
 
																+	*nneighbors = 1;
															
 
																+	*neighbor_ids = malloc(sizeof(int));
															
 
																+	*neighbor_ids[0] = rank==size-1?0:rank+1;
															
 
																+}
															
 
																+
															
 
																+void get_data_unit_to_migrate(starpu_data_handle_t **handle_unit, int *nhandles, int dst_node)
															
 
																+{
															
 
																+	*nhandles = 0;
															
 
																+}
															
 
																+
															
 
																+int main(int argc, char **argv)
															
 
																+{
															
 
																+	int ret;
															
 
																+	struct starpu_mpi_lb_conf itf;
															
 
																+
															
 
																+	itf.get_neighbors = get_neighbors;
															
 
																+	itf.get_data_unit_to_migrate = get_data_unit_to_migrate;
															
 
																+
															
 
																+	MPI_Init(&argc, &argv);
															
 
																+	ret = starpu_init(NULL);
															
 
																+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
															
 
																+	ret = starpu_mpi_init(NULL, NULL, 0);
															
 
																+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
															
 
																+
															
 
																+	unsetenv("STARPU_MPI_LB");
															
 
																+	starpu_mpi_lb_init(NULL, NULL);
															
 
																+	starpu_mpi_lb_shutdown();
															
 
																+
															
 
																+	starpu_mpi_lb_init("heat", &itf);
															
 
																+	starpu_mpi_lb_shutdown();
															
 
																+
															
 
																+	starpu_mpi_shutdown();
															
 
																+	starpu_shutdown();
															
 
																+	MPI_Finalize();
															
 
																+
															
 
																+	return 0;
															
 
																+}
															
 
																+
															
 
																+#endif
															
--- a/src/common/fxt.c
+++ b/src/common/fxt.c
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2009-2016  Université de Bordeaux
															
 
																+ * Copyright (C) 2009-2017  Université de Bordeaux
															
 
																  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
@@ -138,12 +138,11 @@ void _starpu_fxt_init_profiling(unsigned trace_buffer_size)
 
																 		return;
															
 
																 	STARPU_PTHREAD_MUTEX_LOCK(&_starpu_fxt_started_mutex);
															
 
																-	if (!_starpu_fxt_started)
															
 
																-	{
															
 
																-		_starpu_fxt_started = 1;
															
 
																-		_starpu_written = 0;
															
 
																-		_starpu_profile_set_tracefile();
															
 
																-	}
															
 
																+	STARPU_ASSERT(!_starpu_fxt_started);
															
 
																+
															
 
																+	_starpu_fxt_started = 1;
															
 
																+	_starpu_written = 0;
															
 
																+	_starpu_profile_set_tracefile();
															
 
																 #ifdef HAVE_FUT_SET_FILENAME
															
 
																 	fut_set_filename(_STARPU_PROF_FILE_USER);
															
--- a/src/common/fxt.h
+++ b/src/common/fxt.h
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2009-2016  Université de Bordeaux
															
 
																+ * Copyright (C) 2009-2017  Université de Bordeaux
															
 
																  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
															
 
																  * Copyright (C) 2016  Inria
															
 
																  *
															
@@ -95,6 +95,8 @@
 
																 #define	_STARPU_FUT_START_ALLOC_REUSE	0x5129
															
 
																 #define	_STARPU_FUT_END_ALLOC_REUSE	0x5130
															
 
																+#define	_STARPU_FUT_USED_MEM	0x512a
															
 
																+
															
 
																 #define	_STARPU_FUT_START_MEMRECLAIM	0x5131
															
 
																 #define	_STARPU_FUT_END_MEMRECLAIM	0x5132
															
@@ -493,7 +495,7 @@ do {									\
 
																 		}							\
															
 
																 		const size_t __job_size = _starpu_job_get_data_size((job)->task->cl?(job)->task->cl->model:NULL, perf_arch, nimpl, (job));	\
															
 
																 		const uint32_t __job_hash = _starpu_compute_buffers_footprint((job)->task->cl?(job)->task->cl->model:NULL, perf_arch, nimpl, (job));\
															
 
																-		FUT_DO_PROBE7(_STARPU_FUT_CODELET_DETAILS, (job), ((job)->task)->sched_ctx, __job_size, __job_hash, (job)->task->tag_id, workerid, ((job)->job_id)); \
															
 
																+		FUT_DO_PROBE7(_STARPU_FUT_CODELET_DETAILS, ((job)->task)->sched_ctx, __job_size, __job_hash, (job)->task->flops / 1000, (job)->task->tag_id, workerid, ((job)->job_id)); \
															
 
																 	}								\
															
 
																 } while(0);
															
@@ -700,6 +702,9 @@ do {										\
 
																 #define _STARPU_TRACE_END_WRITEBACK(memnode)		\
															
 
																 	FUT_DO_PROBE2(_STARPU_FUT_END_WRITEBACK, memnode, _starpu_gettid());
															
 
																+#define _STARPU_TRACE_USED_MEM(memnode,used)		\
															
 
																+	FUT_DO_PROBE3(_STARPU_FUT_USED_MEM, memnode, used, _starpu_gettid());
															
 
																+	
															
 
																 #define _STARPU_TRACE_START_MEMRECLAIM(memnode,is_prefetch)		\
															
 
																 	FUT_DO_PROBE3(_STARPU_FUT_START_MEMRECLAIM, memnode, is_prefetch, _starpu_gettid());
															
@@ -1006,6 +1011,7 @@ do {										\
 
																 #define _STARPU_TRACE_END_FREE(memnode)		do {} while(0)
															
 
																 #define _STARPU_TRACE_START_WRITEBACK(memnode)	do {} while(0)
															
 
																 #define _STARPU_TRACE_END_WRITEBACK(memnode)		do {} while(0)
															
 
																+#define _STARPU_TRACE_USED_MEM(memnode,used)		do {} while (0)
															
 
																 #define _STARPU_TRACE_START_MEMRECLAIM(memnode,is_prefetch)	do {} while(0)
															
 
																 #define _STARPU_TRACE_END_MEMRECLAIM(memnode,is_prefetch)	do {} while(0)
															
 
																 #define _STARPU_TRACE_START_WRITEBACK_ASYNC(memnode)	do {} while(0)
															
--- a/src/common/list.h
+++ b/src/common/list.h
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2009-2012, 2015-2016  Université de Bordeaux
															
 
																+ * Copyright (C) 2009-2012, 2015-2017  Université de Bordeaux
															
 
																  * Copyright (C) 2010, 2011, 2016  CNRS
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
@@ -73,6 +73,12 @@
 
																  *   struct FOO*	FOO_list_end(struct FOO_list*);
															
 
																  *     * retourne l'élément suivant de la liste
															
 
																  *   struct FOO*	FOO_list_next(struct FOO*)
															
 
																+ *     * retourne le dernier élément de la liste
															
 
																+ *   struct FOO*	FOO_list_last(struct FOO_list*);
															
 
																+ *     * retourne la valeur à tester en début de liste
															
 
																+ *   struct FOO*	FOO_list_alpha(struct FOO_list*);
															
 
																+ *     * retourne l'élément précédent de la liste
															
 
																+ *   struct FOO*	FOO_list_prev(struct FOO*)
															
 
																  *     * retourne la taille de la liste
															
 
																  *   int		FOO_list_size(struct FOO_list*)
															
 
																  *     * retourne la position de l'élément dans la liste (indexé à partir de 0)
															
@@ -108,6 +114,14 @@
 
																  *  {
															
 
																  *    printf("a=%d; b=%d\n", i->a, i->b);
															
 
																  *  }
															
 
																+ *  - itérateur de liste :
															
 
																+ *  struct ma_structure * i;
															
 
																+ *  for(i  = ma_structure_list_last(l);
															
 
																+ *      i != ma_structure_list_alpha(l);
															
 
																+ *      i  = ma_structure_list_prev(i))
															
 
																+ *  {
															
 
																+ *    printf("a=%d; b=%d\n", i->a, i->b);
															
 
																+ *  }
															
 
																  * *********************************************************
															
 
																  */
															
@@ -185,6 +199,12 @@
 
																     { return NULL; } \
															
 
																   /** @internal */static inline struct ENAME *ENAME##_list_next(const struct ENAME *i) \
															
 
																     { return i->_next; } \
															
 
																+  /** @internal */static inline struct ENAME *ENAME##_list_last(const struct ENAME##_list *l) \
															
 
																+    { return l->_tail; } \
															
 
																+  /** @internal */static inline struct ENAME *ENAME##_list_alpha(const struct ENAME##_list *l STARPU_ATTRIBUTE_UNUSED) \
															
 
																+    { return NULL; } \
															
 
																+  /** @internal */static inline struct ENAME *ENAME##_list_prev(const struct ENAME *i) \
															
 
																+    { return i->_prev; } \
															
 
																   /** @internal */static inline int ENAME##_list_ismember(const struct ENAME##_list *l, const struct ENAME *e) \
															
 
																     { struct ENAME *i=l->_head; while(i!=NULL){ if (i == e) return 1; i=i->_next; } return 0; } \
															
 
																   /** @internal */static inline int ENAME##_list_member(const struct ENAME##_list *l, const struct ENAME *e) \
															
--- a/src/common/utils.c
+++ b/src/common/utils.c
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2010, 2012-2016  Université de Bordeaux
															
 
																+ * Copyright (C) 2010, 2012-2017  Université de Bordeaux
															
 
																  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
@@ -271,6 +271,33 @@ int _starpu_fftruncate(FILE *file, size_t length)
 
																 	return ftruncate(fileno(file), length);
															
 
																 }
															
 
																+static int _starpu_warn_nolock(int err)
															
 
																+{
															
 
																+	if (0
															
 
																+#ifdef ENOLCK
															
 
																+		|| err == ENOLCK
															
 
																+#endif
															
 
																+#ifdef ENOTSUP
															
 
																+		|| err == ENOTSUP
															
 
																+#endif
															
 
																+#ifdef EOPNOTSUPP
															
 
																+		|| err == EOPNOTSUPP
															
 
																+#endif
															
 
																+#ifdef EROFS
															
 
																+		|| err == EROFS
															
 
																+#endif
															
 
																+		)
															
 
																+	{
															
 
																+		static int warn;
															
 
																+		if (!warn) {
															
 
																+			warn = 1;
															
 
																+			_STARPU_DISP("warning: Couldn't lock performance file, StarPU home (%s, coming from $HOME or $STARPU_HOME) is probably on some network filesystem like NFS which does not support locking.\n", _starpu_get_home_path());
															
 
																+		}
															
 
																+		return 1;
															
 
																+	}
															
 
																+	return 0;
															
 
																+}
															
 
																+
															
 
																 int _starpu_frdlock(FILE *file)
															
 
																 {
															
 
																 	int ret;
															
@@ -290,17 +317,8 @@ int _starpu_frdlock(FILE *file)
 
																 	};
															
 
																 	ret = fcntl(fileno(file), F_SETLKW, &lock);
															
 
																 #endif
															
 
																-#ifdef ENOLCK
															
 
																-	if (ret != 0 && errno == ENOLCK)
															
 
																-	{
															
 
																-		static int warn;
															
 
																-		if (!warn) {
															
 
																-			warn = 1;
															
 
																-			_STARPU_DISP("warning: Couldn't lock performance file, StarPU home is probably on NFS which does not support locking.\n");
															
 
																-		}
															
 
																+	if (ret != 0 && _starpu_warn_nolock(errno))
															
 
																 		return -1;
															
 
																-	}
															
 
																-#endif
															
 
																 	STARPU_ASSERT(ret == 0);
															
 
																 	return ret;
															
 
																 }
															
@@ -325,6 +343,8 @@ int _starpu_frdunlock(FILE *file)
 
																 	};
															
 
																 	ret = fcntl(fileno(file), F_SETLKW, &lock);
															
 
																 #endif
															
 
																+	if (ret != 0 && _starpu_warn_nolock(errno))
															
 
																+		return -1;
															
 
																 	STARPU_ASSERT(ret == 0);
															
 
																 	return ret;
															
 
																 }
															
@@ -351,17 +371,8 @@ int _starpu_fwrlock(FILE *file)
 
																 	ret = fcntl(fileno(file), F_SETLKW, &lock);
															
 
																 #endif
															
 
																-#ifdef ENOLCK
															
 
																-	if (ret != 0 && errno == ENOLCK)
															
 
																-	{
															
 
																-		static int warn;
															
 
																-		if (!warn) {
															
 
																-			warn = 1;
															
 
																-			_STARPU_DISP("warning: Couldn't lock performance file, StarPU home is probably on NFS which does not support locking.\n");
															
 
																-		}
															
 
																+	if (ret != 0 && _starpu_warn_nolock(errno))
															
 
																 		return -1;
															
 
																-	}
															
 
																-#endif
															
 
																 	STARPU_ASSERT(ret == 0);
															
 
																 	return ret;
															
 
																 }
															
--- a/src/core/disk_ops/disk_leveldb.cpp
+++ b/src/core/disk_ops/disk_leveldb.cpp
@@ -50,14 +50,14 @@ struct starpu_leveldb_base
 
																 static void *starpu_leveldb_alloc(void *base, size_t size STARPU_ATTRIBUTE_UNUSED)
															
 
																 {
															
 
																 	struct starpu_leveldb_base *base_tmp = (struct starpu_leveldb_base *) base;
															
 
																-	struct starpu_leveldb_obj *obj;
															
 
																-	_STARPU_MALLOC(obj, sizeof(struct starpu_leveldb_obj));
															
 
																+	struct starpu_leveldb_obj *obj = (struct starpu_leveldb_obj *)malloc(sizeof(struct starpu_leveldb_obj));
															
 
																+	STARPU_ASSERT(obj);
															
 
																         STARPU_PTHREAD_MUTEX_INIT(&obj->mutex, NULL);
															
 
																 	size_t len = 6 + 1 + 2+sizeof(void*)*2 + 1;
															
 
																-	char *key;
															
 
																-	_STARPU_MALLOC(key, len*sizeof(char));
															
 
																+	char *key = (char *)malloc(len*sizeof(char));
															
 
																+	STARPU_ASSERT(key);
															
 
																 	snprintf(key, len, "STARPU-%p", obj);
															
 
																 	/* create and add a key with a small memory */
															
@@ -88,8 +88,8 @@ static void starpu_leveldb_free(void *base , void *obj, size_t size STARPU_ATTRI
 
																 /* open an existing memory on disk */
															
 
																 static void *starpu_leveldb_open(void *base STARPU_ATTRIBUTE_UNUSED, void *pos, size_t size)
															
 
																 {
															
 
																-	struct starpu_leveldb_obj *obj;
															
 
																-	_STARPU_MALLOC(obj, sizeof(struct starpu_leveldb_obj));
															
 
																+	struct starpu_leveldb_obj *obj = (struct starpu_leveldb_obj *)malloc(sizeof(struct starpu_leveldb_obj));
															
 
																+	STARPU_ASSERT(obj);
															
 
																         STARPU_PTHREAD_MUTEX_INIT(&obj->mutex, NULL);
															
@@ -149,7 +149,8 @@ static int starpu_leveldb_full_read(void *base, void *obj, void **ptr, size_t *s
 
																 	STARPU_ASSERT(s.ok());
															
 
																 	*size = value.length();
															
 
																-	_STARPU_MALLOC(*ptr, *size);
															
 
																+	*ptr = malloc(*size);
															
 
																+	STARPU_ASSERT(*ptr);
															
 
																 	/* use buffer */
															
 
																 	memcpy(*ptr, value.c_str(), *size);
															
@@ -177,7 +178,8 @@ static int starpu_leveldb_write(void *base, void *obj, const void *buf, off_t of
 
																 	else
															
 
																 	{
															
 
																 		uintptr_t buf_tmp = (uintptr_t) buf;
															
 
																-		_STARPU_MALLOC(buffer, (tmp->size > (offset + size)) ? tmp->size : (offset + size));
															
 
																+		buffer = malloc((tmp->size > (offset + size)) ? tmp->size : (offset + size));
															
 
																+		STARPU_ASSERT(buffer);
															
 
																 		/* we read the data */
															
 
																 		std::string value;
															
@@ -224,8 +226,8 @@ static int starpu_leveldb_full_write(void *base, void *obj, void *ptr, size_t si
 
																 /* create a new copy of parameter == base */
															
 
																 static void *starpu_leveldb_plug(void *parameter, starpu_ssize_t size STARPU_ATTRIBUTE_UNUSED)
															
 
																 {
															
 
																-	struct starpu_leveldb_base *tmp;
															
 
																-	_STARPU_MALLOC(tmp, sizeof(struct starpu_leveldb_base));
															
 
																+	struct starpu_leveldb_base *tmp = (struct starpu_leveldb_base *)malloc(sizeof(struct starpu_leveldb_base));
															
 
																+	STARPU_ASSERT(tmp);
															
 
																 	leveldb::Status status;
															
 
																 	leveldb::DB *db;
															
@@ -268,8 +270,8 @@ static int get_leveldb_bandwidth_between_disk_and_main_ram(unsigned node)
 
																 	double end;
															
 
																 	srand(time (NULL));
															
 
																-	char *buf;
															
 
																-	_STARPU_MALLOC(buf, SIZE_DISK_MIN*sizeof(char));
															
 
																+	char *buf = (char *)malloc(SIZE_DISK_MIN*sizeof(char));
															
 
																+	STARPU_ASSERT(buf);
															
 
																 	/* allocate memory */
															
 
																 	void *mem = _starpu_disk_alloc(node, SIZE_DISK_MIN);
															
@@ -293,7 +295,8 @@ static int get_leveldb_bandwidth_between_disk_and_main_ram(unsigned node)
 
																 	/* free memory */
															
 
																 	free(buf);
															
 
																-	_STARPU_MALLOC(buf, sizeof(char));
															
 
																+	buf = (char *)malloc(sizeof(char));
															
 
																+	STARPU_ASSERT(buf);
															
 
																 	/* Measure latency */
															
 
																 	start = starpu_timing_now();
															
--- a/src/core/perfmodel/perfmodel_bus.c
+++ b/src/core/perfmodel/perfmodel_bus.c
@@ -35,6 +35,7 @@
 
																 #include <core/workers.h>
															
 
																 #include <core/perfmodel/perfmodel.h>
															
 
																 #include <core/simgrid.h>
															
 
																+#include <core/topology.h>
															
 
																 #include <common/utils.h>
															
 
																 #include <drivers/mpi/driver_mpi_common.h>
															
@@ -2489,7 +2490,7 @@ static void write_bus_platform_file_content(int version)
 
																 	{
															
 
																 		hwloc_topology_t topology;
															
 
																 		hwloc_topology_init(&topology);
															
 
																-		hwloc_topology_set_flags(topology, HWLOC_TOPOLOGY_FLAG_IO_DEVICES | HWLOC_TOPOLOGY_FLAG_IO_BRIDGES);
															
 
																+		_starpu_topology_filter(topology);
															
 
																 		hwloc_topology_load(topology);
															
 
																 		/* First find paths and record measured bandwidth along the path */
															
--- a/src/core/sched_ctx.c
+++ b/src/core/sched_ctx.c
@@ -37,10 +37,11 @@ static int nobind;
 
																 static int occupied_sms = 0;
															
 
																 static unsigned _starpu_get_first_free_sched_ctx(struct _starpu_machine_config *config);
															
 
																-static void _starpu_sched_ctx_add_workers_to_master(unsigned sched_ctx_id, int *workerids, int nworkers, int new_master);
															
 
																-static void _starpu_sched_ctx_wake_these_workers_up(unsigned sched_ctx_id, int *workerids, int nworkers);
															
 
																-static int _starpu_sched_ctx_find_master(unsigned sched_ctx_id, int *workerids, int nworkers);
															
 
																-static void _starpu_sched_ctx_set_master(struct _starpu_sched_ctx *sched_ctx, int *workerids, int nworkers, int master);
															
 
																+
															
 
																+static void _starpu_sched_ctx_put_new_master(unsigned sched_ctx_id);
															
 
																+static void _starpu_sched_ctx_wake_up_workers(unsigned sched_ctx_id);
															
 
																+static void _starpu_sched_ctx_update_parallel_workers_with(unsigned sched_ctx_id);
															
 
																+static void _starpu_sched_ctx_update_parallel_workers_without(unsigned sched_ctx_id);
															
 
																 static void _starpu_worker_gets_into_ctx(unsigned sched_ctx_id, struct _starpu_worker *worker)
															
 
																 {
															
@@ -155,8 +156,6 @@ static void _starpu_add_workers_to_sched_ctx(struct _starpu_sched_ctx *sched_ctx
 
																 	if (!nworkers_to_add)
															
 
																 		return;
															
 
																 	int workers_to_add[nworkers_to_add];
															
 
																-	int cpu_workers[nworkers_to_add];
															
 
																-	int ncpu_workers = 0;
															
 
																 	struct starpu_perfmodel_device devices[nworkers_to_add];
															
 
																 	int ndevices = 0;
															
@@ -236,13 +235,6 @@ static void _starpu_add_workers_to_sched_ctx(struct _starpu_sched_ctx *sched_ctx
 
																 			else
															
 
																 				found = 0;
															
 
																 		}
															
 
																-
															
 
																-		if (!sched_ctx->sched_policy)
															
 
																-		{
															
 
																-			struct _starpu_worker *worker_str = _starpu_get_worker_struct(wa[i]);
															
 
																-			if (worker_str->arch == STARPU_CPU_WORKER)
															
 
																-				cpu_workers[ncpu_workers++] = wa[i];
															
 
																-		}
															
 
																 	}
															
 
																 	if(ndevices > 0)
															
@@ -311,24 +303,10 @@ static void _starpu_add_workers_to_sched_ctx(struct _starpu_sched_ctx *sched_ctx
 
																 		}
															
 
																 	}
															
 
																-	if(!sched_ctx->sched_policy)
															
 
																-	{
															
 
																-		if(!sched_ctx->awake_workers)
															
 
																-		{
															
 
																-			if(sched_ctx->main_master == -1)
															
 
																-				sched_ctx->main_master = starpu_sched_ctx_book_workers_for_task(sched_ctx->id, cpu_workers, ncpu_workers);
															
 
																-			else
															
 
																-			{
															
 
																-				_starpu_sched_ctx_add_workers_to_master(sched_ctx->id, cpu_workers, ncpu_workers, sched_ctx->main_master);
															
 
																-			}
															
 
																-		}
															
 
																-		else
															
 
																-		{
															
 
																-			sched_ctx->main_master = _starpu_sched_ctx_find_master(sched_ctx->id, cpu_workers, ncpu_workers);
															
 
																-			_starpu_sched_ctx_set_master(sched_ctx, cpu_workers, ncpu_workers, sched_ctx->main_master);
															
 
																-		}
															
 
																-	}
															
 
																-	else if(sched_ctx->sched_policy->add_workers)
															
 
																+
															
 
																+	_starpu_sched_ctx_update_parallel_workers_with(sched_ctx->id);
															
 
																+
															
 
																+	if(sched_ctx->sched_policy && sched_ctx->sched_policy->add_workers)
															
 
																 	{
															
 
																 		_STARPU_TRACE_WORKER_SCHEDULING_PUSH;
															
 
																 		if(added_workers)
															
@@ -412,13 +390,7 @@ static void _starpu_remove_workers_from_sched_ctx(struct _starpu_sched_ctx *sche
 
																 		sched_ctx->perf_arch.devices[dev].ncores = devices[dev].ncores;
															
 
																 	}
															
 
																-	if(!sched_ctx->sched_policy)
															
 
																-	{
															
 
																-		if(!sched_ctx->awake_workers)
															
 
																-		{
															
 
																-			_starpu_sched_ctx_wake_these_workers_up(sched_ctx->id, removed_workers, *n_removed_workers);
															
 
																-		}
															
 
																-	}
															
 
																+	_starpu_sched_ctx_update_parallel_workers_without(sched_ctx->id);
															
 
																 	return;
															
 
																 }
															
@@ -561,13 +533,13 @@ struct _starpu_sched_ctx* _starpu_create_sched_ctx(struct starpu_sched_policy *p
 
																 		STARPU_PTHREAD_COND_INIT(&sched_ctx->parallel_sect_cond_busy[w], NULL);
															
 
																 		sched_ctx->busy[w] = 0;
															
 
																-		sched_ctx->master[w] = -1;
															
 
																 		sched_ctx->parallel_sect[w] = 0;
															
 
																 		sched_ctx->sleeping[w] = 0;
															
 
																 	}
															
 
																+	sched_ctx->parallel_view = 0;
															
 
																-        /*init the strategy structs and the worker_collection of the ressources of the context */
															
 
																+  /*init the strategy structs and the worker_collection of the ressources of the context */
															
 
																 	if(policy)
															
 
																 	{
															
 
																 		_starpu_init_sched_policy(config, sched_ctx, policy);
															
@@ -1151,7 +1123,7 @@ void starpu_sched_ctx_delete(unsigned sched_ctx_id)
 
																 	if(!_starpu_wait_for_all_tasks_of_sched_ctx(sched_ctx_id))
															
 
																 	{
															
 
																 		if(!sched_ctx->sched_policy)
															
 
																-			starpu_sched_ctx_unbook_workers_for_task(sched_ctx->id, sched_ctx->main_master);
															
 
																+			_starpu_sched_ctx_wake_up_workers(sched_ctx_id);
															
 
																 		/*if btw the mutex release & the mutex lock the context has changed take care to free all
															
 
																 		  scheduling data before deleting the context */
															
 
																 		_starpu_update_workers_without_ctx(workerids, nworkers_ctx, sched_ctx_id, 1);
															
@@ -1719,6 +1691,12 @@ void* starpu_sched_ctx_get_policy_data(unsigned sched_ctx_id)
 
																 	return sched_ctx->policy_data;
															
 
																 }
															
 
																+struct starpu_sched_policy *starpu_sched_ctx_get_sched_policy(unsigned sched_ctx_id)
															
 
																+{
															
 
																+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
															
 
																+	return sched_ctx->sched_policy;
															
 
																+}
															
 
																+
															
 
																 struct starpu_worker_collection* starpu_sched_ctx_create_worker_collection(unsigned sched_ctx_id, enum starpu_worker_collection_type  worker_collection_type)
															
 
																 {
															
 
																 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
															
@@ -2359,45 +2337,13 @@ static unsigned _worker_sleeping_in_other_ctx(unsigned sched_ctx_id, int workeri
 
																 }
															
 
																-static void _starpu_sched_ctx_get_workers_to_sleep(unsigned sched_ctx_id, int *workerids, int nworkers, int master)
															
 
																-{
															
 
																-	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
															
 
																-	int current_worker_id = starpu_worker_get_id();
															
 
																-	unsigned sleeping[nworkers];
															
 
																-	int w;
															
 
																-	for(w = 0; w < nworkers; w++)
															
 
																-	{
															
 
																-		if(current_worker_id == -1 || workerids[w] != current_worker_id)
															
 
																-			STARPU_PTHREAD_MUTEX_LOCK(&sched_ctx->parallel_sect_mutex[workerids[w]]);
															
 
																-		sleeping[w] = _worker_sleeping_in_other_ctx(sched_ctx_id, workerids[w]);
															
 
																-		sched_ctx->master[workerids[w]] = master;
															
 
																-		sched_ctx->parallel_sect[workerids[w]] = 1;
															
 
																-		if(current_worker_id == -1 || workerids[w] != current_worker_id)
															
 
																-			STARPU_PTHREAD_MUTEX_UNLOCK(&sched_ctx->parallel_sect_mutex[workerids[w]]);
															
 
																-#ifndef STARPU_NON_BLOCKING_DRIVERS
															
 
																-		starpu_wake_worker(workerids[w]);
															
 
																-#endif
															
 
																-	}
															
 
																-
															
 
																-	for(w = 0; w < nworkers; w++)
															
 
																-	{
															
 
																-		int workerid = workerids[w];
															
 
																-		if((current_worker_id == -1 || workerid != current_worker_id) && !sleeping[w])
															
 
																-		{
															
 
																-			sem_wait(&sched_ctx->fall_asleep_sem[master]);
															
 
																-		}
															
 
																-	}
															
 
																-	return;
															
 
																-}
															
 
																-
															
 
																 void _starpu_sched_ctx_signal_worker_blocked(unsigned sched_ctx_id, int workerid)
															
 
																 {
															
 
																 	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
															
 
																 	worker->blocked = 1;
															
 
																 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
															
 
																 	sched_ctx->sleeping[workerid] = 1;
															
 
																-	int master = sched_ctx->master[workerid];
															
 
																-	sem_post(&sched_ctx->fall_asleep_sem[master]);
															
 
																+	sem_post(&sched_ctx->fall_asleep_sem[sched_ctx->main_master]);
															
 
																 	return;
															
 
																 }
															
@@ -2405,30 +2351,74 @@ void _starpu_sched_ctx_signal_worker_blocked(unsigned sched_ctx_id, int workerid
 
																 void _starpu_sched_ctx_signal_worker_woke_up(unsigned sched_ctx_id, int workerid)
															
 
																 {
															
 
																 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
															
 
																-	int master = sched_ctx->master[workerid];
															
 
																-	sem_post(&sched_ctx->wake_up_sem[master]);
															
 
																+	sem_post(&sched_ctx->wake_up_sem[sched_ctx->main_master]);
															
 
																 	sched_ctx->sleeping[workerid] = 0;
															
 
																-	sched_ctx->master[workerid] = -1;
															
 
																 	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
															
 
																 	worker->blocked = 0;
															
 
																 	return;
															
 
																 }
															
 
																-static void _starpu_sched_ctx_wake_up_workers(unsigned sched_ctx_id, int master)
															
 
																+static void _starpu_sched_ctx_put_workers_to_sleep(unsigned sched_ctx_id)
															
 
																+{
															
 
																+    struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
															
 
																+    int current_worker_id = starpu_worker_get_id();
															
 
																+		int master = sched_ctx->main_master;
															
 
																+    struct starpu_worker_collection *workers = sched_ctx->workers;
															
 
																+    struct starpu_sched_ctx_iterator it;
															
 
																+    unsigned sleeping[sched_ctx->workers->nworkers];
															
 
																+
															
 
																+	if (master == -1)
															
 
																+		return;
															
 
																+
															
 
																+    workers->init_iterator(workers, &it);
															
 
																+    while(workers->has_next(workers, &it))
															
 
																+    {
															
 
																+        int workerid = workers->get_next(workers, &it);
															
 
																+        sleeping[workerid] = _worker_sleeping_in_other_ctx(sched_ctx_id, workerid);
															
 
																+
															
 
																+        if(!sched_ctx->parallel_sect[workerid] && workerid != master)
															
 
																+        {
															
 
																+            if (current_worker_id == -1 || workerid != current_worker_id)
															
 
																+            {
															
 
																+                STARPU_PTHREAD_MUTEX_LOCK(&sched_ctx->parallel_sect_mutex[workerid]);
															
 
																+                sched_ctx->parallel_sect[workerid] = 1;
															
 
																+                STARPU_PTHREAD_MUTEX_UNLOCK(&sched_ctx->parallel_sect_mutex[workerid]);
															
 
																+            }
															
 
																+        }
															
 
																+    }
															
 
																+
															
 
																+    workers->init_iterator(workers, &it);
															
 
																+    while(workers->has_next(workers, &it))
															
 
																+    {
															
 
																+            int workerid = workers->get_next(workers, &it);
															
 
																+            if(workerid != master
															
 
																+               && (current_worker_id == -1 || workerid != current_worker_id)
															
 
																+               && !sleeping[workerid])
															
 
																+            {
															
 
																+                    sem_wait(&sched_ctx->fall_asleep_sem[master]);
															
 
																+            }
															
 
																+    }
															
 
																+
															
 
																+    return;
															
 
																+}
															
 
																+
															
 
																+static void _starpu_sched_ctx_wake_up_workers(unsigned sched_ctx_id)
															
 
																 {
															
 
																 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
															
 
																 	int current_worker_id = starpu_worker_get_id();
															
 
																+	int master = sched_ctx->main_master;
															
 
																 	struct starpu_worker_collection *workers = sched_ctx->workers;
															
 
																-
															
 
																 	struct starpu_sched_ctx_iterator it;
															
 
																+	if (master == -1)
															
 
																+		return;
															
 
																+
															
 
																 	workers->init_iterator(workers, &it);
															
 
																 	while(workers->has_next(workers, &it))
															
 
																 	{
															
 
																 		int workerid = workers->get_next(workers, &it);
															
 
																-		int curr_master = sched_ctx->master[workerid];
															
 
																-		if(curr_master == master && sched_ctx->parallel_sect[workerid])
															
 
																+		if(sched_ctx->parallel_sect[workerid] && workerid != master)
															
 
																 		{
															
 
																 			if((current_worker_id == -1 || workerid != current_worker_id) && sched_ctx->sleeping[workerid])
															
 
																 			{
															
@@ -2447,197 +2437,82 @@ static void _starpu_sched_ctx_wake_up_workers(unsigned sched_ctx_id, int master)
 
																 void* starpu_sched_ctx_exec_parallel_code(void* (*func)(void*), void* param, unsigned sched_ctx_id)
															
 
																 {
															
 
																-	int *workerids = NULL;
															
 
																-	int nworkers = starpu_sched_ctx_get_workers_list(sched_ctx_id, &workerids);
															
 
																-	_starpu_sched_ctx_get_workers_to_sleep(sched_ctx_id, workerids, nworkers, workerids[nworkers-1]);
															
 
																-
															
 
																-	/* execute parallel code */
															
 
																-	void* ret = func(param);
															
 
																+    _starpu_sched_ctx_put_workers_to_sleep(sched_ctx_id);
															
 
																-	/* wake up starpu workers */
															
 
																-	_starpu_sched_ctx_wake_up_workers(sched_ctx_id, workerids[nworkers-1]);
															
 
																+    /* execute parallel code */
															
 
																+    void* ret = func(param);
															
 
																-	free(workerids);
															
 
																-	return ret;
															
 
																+    /* wake up starpu workers */
															
 
																+    _starpu_sched_ctx_wake_up_workers(sched_ctx_id);
															
 
																+    return ret;
															
 
																 }
															
 
																-void starpu_sched_ctx_get_available_cpuids(unsigned sched_ctx_id, int **cpuids, int *ncpuids)
															
 
																+static void _starpu_sched_ctx_update_parallel_workers_with(unsigned sched_ctx_id)
															
 
																 {
															
 
																-	int current_worker_id = starpu_worker_get_id();
															
 
																-	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
															
 
																-	struct starpu_worker_collection *workers = sched_ctx->workers;
															
 
																-	_STARPU_MALLOC((*cpuids), workers->nworkers*sizeof(int));
															
 
																-	int w = 0;
															
 
																+    struct _starpu_sched_ctx * sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
															
 
																-	struct starpu_sched_ctx_iterator it;
															
 
																+	if(sched_ctx->sched_policy)
															
 
																+		return;
															
 
																-	workers->init_iterator(workers, &it);
															
 
																-	while(workers->has_next(workers, &it))
															
 
																+	_starpu_sched_ctx_put_new_master(sched_ctx_id);
															
 
																+
															
 
																+	if(!sched_ctx->awake_workers)
															
 
																 	{
															
 
																-		int workerid = workers->get_next(workers, &it);
															
 
																-		int master = sched_ctx->master[workerid];
															
 
																-		if(master == current_worker_id || workerid == current_worker_id || current_worker_id == -1)
															
 
																-		{
															
 
																-			(*cpuids)[w++] = starpu_worker_get_bindid(workerid);
															
 
																-		}
															
 
																+		_starpu_sched_ctx_put_workers_to_sleep(sched_ctx_id);
															
 
																 	}
															
 
																-	*ncpuids = w;
															
 
																-	return;
															
 
																 }
															
 
																-static void _starpu_sched_ctx_wake_these_workers_up(unsigned sched_ctx_id, int *workerids, int nworkers)
															
 
																+static void _starpu_sched_ctx_update_parallel_workers_without(unsigned sched_ctx_id)
															
 
																 {
															
 
																-	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
															
 
																-	int current_worker_id = starpu_worker_get_id();
															
 
																+    struct _starpu_sched_ctx * sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
															
 
																-	int masters[nworkers];
															
 
																-	int w;
															
 
																-	for(w = 0; w < nworkers; w++)
															
 
																-	{
															
 
																-		int workerid = workerids[w];
															
 
																-		masters[w] = sched_ctx->master[workerid];
															
 
																-		if(current_worker_id == -1 || workerid != current_worker_id)
															
 
																-		{
															
 
																-			STARPU_PTHREAD_MUTEX_LOCK(&sched_ctx->parallel_sect_mutex[workerid]);
															
 
																-			STARPU_PTHREAD_COND_SIGNAL(&sched_ctx->parallel_sect_cond[workerid]);
															
 
																-			STARPU_PTHREAD_MUTEX_UNLOCK(&sched_ctx->parallel_sect_mutex[workerid]);
															
 
																-		}
															
 
																-		else
															
 
																-			sched_ctx->parallel_sect[workerid] = 0;
															
 
																-		sched_ctx->master[workerid] = -1;
															
 
																-	}
															
 
																+	if(sched_ctx->sched_policy)
															
 
																+		return;
															
 
																-	for(w = 0; w < nworkers; w++)
															
 
																+
															
 
																+	_starpu_sched_ctx_put_new_master(sched_ctx_id);
															
 
																+
															
 
																+	if(!sched_ctx->awake_workers)
															
 
																 	{
															
 
																-		int workerid = workerids[w];
															
 
																-		if(masters[w] != -1)
															
 
																-		{
															
 
																-			int master = sched_ctx->master[workerid];
															
 
																-			if(current_worker_id == -1 || workerid != current_worker_id)
															
 
																-				sem_wait(&sched_ctx->wake_up_sem[master]);
															
 
																-		}
															
 
																+		_starpu_sched_ctx_wake_up_workers(sched_ctx_id);
															
 
																 	}
															
 
																-
															
 
																-	return;
															
 
																 }
															
 
																-static int _starpu_sched_ctx_find_master(unsigned sched_ctx_id, int *workerids, int nworkers)
															
 
																+void starpu_sched_ctx_get_available_cpuids(unsigned sched_ctx_id, int **cpuids, int *ncpuids)
															
 
																 {
															
 
																+	int current_worker_id = starpu_worker_get_id();
															
 
																 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
															
 
																-	int new_master = workerids[nworkers-1];
															
 
																-        int current_worker_id = starpu_worker_get_id();
															
 
																-        int current_is_in_section = 0;
															
 
																-        int npotential_masters = 0;
															
 
																-        int nawake_workers = 0;
															
 
																-        int ntrue_masters = 0;
															
 
																-        int potential_masters[nworkers];
															
 
																-        int awake_workers[nworkers];
															
 
																-        int true_masters[nworkers];
															
 
																-
															
 
																-        int i,w;
															
 
																-        for(w = 0 ; w < nworkers ; w++)
															
 
																-        {
															
 
																-                if (current_worker_id == workerids[w])
															
 
																-                        current_is_in_section = 1;
															
 
																+	struct starpu_worker_collection *workers = sched_ctx->workers;
															
 
																+	_STARPU_MALLOC((*cpuids), workers->nworkers*sizeof(int));
															
 
																+	int w = 0;
															
 
																-		int master = sched_ctx->master[workerids[w]];
															
 
																-                if (master > -1)
															
 
																-		{
															
 
																-                        int already_seen = 0;
															
 
																-                        //Could create a function for this. Basically searching an element in an array.
															
 
																-                        for (i = 0 ; i < npotential_masters; i++)
															
 
																-                        {
															
 
																-                                if (potential_masters[i] == master)
															
 
																-				{
															
 
																-                                        already_seen = 1;
															
 
																-                                        break;
															
 
																-				}
															
 
																-                        }
															
 
																-                        if (!already_seen)
															
 
																-				potential_masters[npotential_masters++] = master;
															
 
																-                }
															
 
																-                else if (master == -1)
															
 
																-                        awake_workers[nawake_workers++] = workerids[w];
															
 
																-        }
															
 
																+	struct starpu_sched_ctx_iterator it;
															
 
																-        for (i = 0 ; i < npotential_masters ; i++)
															
 
																-	{
															
 
																-		int master_is_in_section = 0;
															
 
																-		//Could create a function for this. Basically searching an element in an array.
															
 
																-		for (w = 0 ; w < nworkers ; w++)
															
 
																-		{
															
 
																-			if (workerids[w] == potential_masters[i])
															
 
																-			{
															
 
																-				master_is_in_section = 1;
															
 
																-				break;
															
 
																-			}
															
 
																-		}
															
 
																-                if (master_is_in_section)
															
 
																-			true_masters[ntrue_masters++] = potential_masters[i];
															
 
																-        }
															
 
																+	workers->init_iterator(workers, &it);
															
 
																-        if (current_is_in_section)
															
 
																-                new_master = current_worker_id;
															
 
																-        else
															
 
																-        {
															
 
																-                if (ntrue_masters > 1)
															
 
																+	while(workers->has_next(workers, &it))
															
 
																+	{
															
 
																+		int workerid = workers->get_next(workers, &it);
															
 
																+		int master = sched_ctx->main_master;
															
 
																+		if(master == current_worker_id || workerid == current_worker_id || current_worker_id == -1)
															
 
																 		{
															
 
																-                        if (nawake_workers > 0)
															
 
																-                                new_master = awake_workers[nawake_workers - 1];
															
 
																-                        else
															
 
																-                                new_master = true_masters[ntrue_masters - 1];
															
 
																+			(*cpuids)[w++] = starpu_worker_get_bindid(workerid);
															
 
																 		}
															
 
																 	}
															
 
																-	return new_master;
															
 
																+	*ncpuids = w;
															
 
																+	return;
															
 
																 }
															
 
																-static void _starpu_sched_ctx_add_workers_to_master(unsigned sched_ctx_id, int *workerids, int nworkers, int new_master)
															
 
																+static void _starpu_sched_ctx_put_new_master(unsigned sched_ctx_id)
															
 
																 {
															
 
																+	int *workerids;
															
 
																 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
															
 
																-	int w;
															
 
																-	int nput_to_sleep = 0;
															
 
																-	int nwake_up = 0;
															
 
																-	int put_to_sleep[nworkers];
															
 
																-	int wake_up[nworkers];
															
 
																-
															
 
																-	for(w = 0 ; w < nworkers ; w++)
															
 
																-	{
															
 
																-		int master = sched_ctx->master[workerids[w]];
															
 
																-		if (master == -1 && workerids[w] != new_master)
															
 
																-			put_to_sleep[nput_to_sleep++] = workerids[w];
															
 
																-		else if(master != -1 && workerids[w] == new_master)
															
 
																-			wake_up[nwake_up++] = workerids[w];
															
 
																-	}
															
 
																-
															
 
																-	if(nwake_up > 0)
															
 
																-		_starpu_sched_ctx_wake_these_workers_up(sched_ctx_id, wake_up, nwake_up);
															
 
																-	if(nput_to_sleep > 0)
															
 
																-		_starpu_sched_ctx_get_workers_to_sleep(sched_ctx_id, put_to_sleep, nput_to_sleep, new_master);
															
 
																-
															
 
																-}
															
 
																-
															
 
																-static void _starpu_sched_ctx_set_master(struct _starpu_sched_ctx *sched_ctx, int *workerids, int nworkers, int master)
															
 
																-{
															
 
																-	int i;
															
 
																-	for(i = 0; i < nworkers; i++)
															
 
																-	{
															
 
																-		if(workerids[i] != master)
															
 
																-			sched_ctx->master[workerids[i]] = master;
															
 
																-	}
															
 
																-}
															
 
																+	unsigned nworkers = starpu_sched_ctx_get_workers_list(sched_ctx_id, &workerids);
															
 
																-int starpu_sched_ctx_book_workers_for_task(unsigned sched_ctx_id, int *workerids, int nworkers)
															
 
																-{
															
 
																-	int new_master = _starpu_sched_ctx_find_master(sched_ctx_id, workerids, nworkers);
															
 
																-	_starpu_sched_ctx_add_workers_to_master(sched_ctx_id, workerids, nworkers, new_master);
															
 
																-	return new_master;
															
 
																-}
															
 
																+	sched_ctx->main_master = workerids[nworkers-1];
															
 
																-void starpu_sched_ctx_unbook_workers_for_task(unsigned sched_ctx_id, int master)
															
 
																-{
															
 
																-	/* wake up starpu workers */
															
 
																-	_starpu_sched_ctx_wake_up_workers(sched_ctx_id, master);
															
 
																+	free(workerids);
															
 
																 }
															
 
																 struct starpu_perfmodel_arch * _starpu_sched_ctx_get_perf_archtype(unsigned sched_ctx_id)
															
--- a/src/core/sched_ctx.h
+++ b/src/core/sched_ctx.h
@@ -148,17 +148,14 @@ struct _starpu_sched_ctx
 
																 	   parallel sections to be executed on their allocated resources */
															
 
																 	unsigned parallel_sect[STARPU_NMAXWORKERS];
															
 
																-	/* id of the master worker */
															
 
																-	int master[STARPU_NMAXWORKERS];
															
 
																-
															
 
																-	/* semaphore that block appl thread until starpu threads are 
															
 
																+	/* semaphore that block appl thread until starpu threads are
															
 
																 	   all blocked and ready to exec the parallel code */
															
 
																 	sem_t fall_asleep_sem[STARPU_NMAXWORKERS];
															
 
																 	/* semaphore that block appl thread until starpu threads are 
															
 
																 	   all woke up and ready continue appl */
															
 
																 	sem_t wake_up_sem[STARPU_NMAXWORKERS];
															
 
																-       
															
 
																+
															
 
																 	/* bool indicating if the workers is sleeping in this ctx */
															
 
																 	unsigned sleeping[STARPU_NMAXWORKERS];
															
@@ -168,6 +165,10 @@ struct _starpu_sched_ctx
 
																 	/* perf model for the device comb of the ctx */
															
 
																 	struct starpu_perfmodel_arch perf_arch;
															
 
																+	/* For parallel workers, say whether it is viewed as sequential or not. This
															
 
																+		 is a helper for the prologue code. */
															
 
																+	unsigned parallel_view;
															
 
																+
															
 
																 	/* for ctxs without policy: flag to indicate that we want to get
															
 
																 	   the threads to sleep in order to replace them with other threads or leave
															
 
																 	   them awake & use them in the parallel code*/
															
--- a/src/core/sched_policy.c
+++ b/src/core/sched_policy.c
@@ -989,7 +989,11 @@ profiling:
 
																 	}
															
 
																 	if(task->prologue_callback_pop_func)
															
 
																+	{
															
 
																+		_starpu_set_current_task(task);
															
 
																 		task->prologue_callback_pop_func(task->prologue_callback_pop_arg);
															
 
																+		_starpu_set_current_task(NULL);
															
 
																+	}
															
 
																 	return task;
															
 
																 }
															
--- a/src/core/simgrid.c
+++ b/src/core/simgrid.c
@@ -233,6 +233,9 @@ static int main_ret;
 
																 int do_starpu_main(int argc, char *argv[])
															
 
																 {
															
 
																+	/* FIXME: Ugly work-around for bug in simgrid: the MPI context is not properly set at MSG process startup */
															
 
																+	MSG_process_sleep(0.000001);
															
 
																+
															
 
																 	main_ret = starpu_main(argc, argv);
															
 
																 	return main_ret;
															
 
																 }
															
@@ -342,6 +345,9 @@ static struct task *last_task[STARPU_NMAXWORKERS];
 
																 /* Actually execute the task.  */
															
 
																 static int task_execute(int argc STARPU_ATTRIBUTE_UNUSED, char *argv[] STARPU_ATTRIBUTE_UNUSED)
															
 
																 {
															
 
																+	/* FIXME: Ugly work-around for bug in simgrid: the MPI context is not properly set at MSG process startup */
															
 
																+	MSG_process_sleep(0.000001);
															
 
																+
															
 
																 	struct task *task = starpu_pthread_getspecific(0);
															
 
																 	_STARPU_DEBUG("task %p started\n", task);
															
 
																 	MSG_task_execute(task->task);
															
@@ -530,6 +536,9 @@ static int transfers_are_sequential(struct transfer *new_transfer, struct transf
 
																 /* Actually execute the transfer, and then start transfers waiting for this one.  */
															
 
																 static int transfer_execute(int argc STARPU_ATTRIBUTE_UNUSED, char *argv[] STARPU_ATTRIBUTE_UNUSED)
															
 
																 {
															
 
																+	/* FIXME: Ugly work-around for bug in simgrid: the MPI context is not properly set at MSG process startup */
															
 
																+	MSG_process_sleep(0.000001);
															
 
																+
															
 
																 	struct transfer *transfer = starpu_pthread_getspecific(0);
															
 
																 	unsigned i;
															
 
																 	_STARPU_DEBUG("transfer %p started\n", transfer);
															
@@ -690,6 +699,9 @@ _starpu_simgrid_thread_start(int argc STARPU_ATTRIBUTE_UNUSED, char *argv[])
 
																 	void *(*f)(void*) = (void*) (uintptr_t) strtol(argv[0], NULL, 16);
															
 
																 	void *arg = (void*) (uintptr_t) strtol(argv[1], NULL, 16);
															
 
																+	/* FIXME: Ugly work-around for bug in simgrid: the MPI context is not properly set at MSG process startup */
															
 
																+	MSG_process_sleep(0.000001);
															
 
																+
															
 
																 	/* _args is freed with process context */
															
 
																 	f(arg);
															
 
																 	return 0;
															
@@ -813,7 +825,15 @@ typedef struct{
 
																 static int _starpu_simgrid_xbt_thread_create_wrapper(int argc, char *argv[])
															
 
																 {
															
 
																-  smx_process_t self = SIMIX_process_self();
															
 
																+  /* FIXME: Ugly work-around for bug in simgrid: the MPI context is not properly set at MSG process startup */
															
 
																+  MSG_process_sleep(0.000001);
															
 
																+
															
 
																+#ifdef HAVE_SMX_ACTOR_T
															
 
																+  smx_actor_t
															
 
																+#else
															
 
																+  smx_process_t
															
 
																+#endif
															
 
																+	  self = SIMIX_process_self();
															
 
																   thread_data_t *t = SIMIX_process_self_get_data(self);
															
 
																   simcall_process_set_data(self, t->father_data);
															
 
																   t->code(t->userparam);
															
--- a/src/core/topology.c
+++ b/src/core/topology.c
@@ -557,11 +557,7 @@ _starpu_init_topology (struct _starpu_machine_config *config)
 
																 #ifndef STARPU_SIMGRID
															
 
																 #ifdef STARPU_HAVE_HWLOC
															
 
																 	hwloc_topology_init(&topology->hwtopology);
															
 
																-#if HWLOC_API_VERSION >= 0x20000
															
 
																-	hwloc_topology_set_io_types_filter(topology->hwtopology, HWLOC_TYPE_FILTER_KEEP_IMPORTANT);
															
 
																-#else
															
 
																-	hwloc_topology_set_flags(topology->hwtopology, HWLOC_TOPOLOGY_FLAG_IO_DEVICES | HWLOC_TOPOLOGY_FLAG_IO_BRIDGES);
															
 
																-#endif
															
 
																+	_starpu_topology_filter(topology->hwtopology);
															
 
																 	hwloc_topology_load(topology->hwtopology);
															
 
																 	_starpu_allocate_topology_userdata(hwloc_get_root_obj(topology->hwtopology));
															
 
																 #endif
															
@@ -837,6 +833,15 @@ _starpu_topology_get_nhwpu (struct _starpu_machine_config *config)
 
																 	return config->topology.nhwpus;
															
 
																 }
															
 
																+void _starpu_topology_filter(hwloc_topology_t topology)
															
 
																+{
															
 
																+#if HWLOC_API_VERSION >= 0x20000
															
 
																+	hwloc_topology_set_io_types_filter(topology, HWLOC_TYPE_FILTER_KEEP_IMPORTANT);
															
 
																+#else
															
 
																+	hwloc_topology_set_flags(topology, HWLOC_TOPOLOGY_FLAG_IO_DEVICES | HWLOC_TOPOLOGY_FLAG_IO_BRIDGES);
															
 
																+#endif
															
 
																+}
															
 
																+
															
 
																 #ifdef STARPU_USE_MIC
															
 
																 static void
															
 
																 _starpu_init_mic_config (struct _starpu_machine_config *config,
															
--- a/src/core/topology.h
+++ b/src/core/topology.h
@@ -51,6 +51,9 @@ unsigned _starpu_topology_get_nhwcpu(struct _starpu_machine_config *config);
 
																 /* returns the number of logical cpus */
															
 
																 unsigned _starpu_topology_get_nhwpu(struct _starpu_machine_config *config);
															
 
																+/* Small convenient function to filter hwloc topology depending on HWLOC API version */
															
 
																+void _starpu_topology_filter(hwloc_topology_t topology);
															
 
																+
															
 
																 #define STARPU_NOWORKERID -1
															
 
																 /* Bind the current thread on the CPU logically identified by "cpuid". The
															
 
																  * logical ordering of the processors is either that of hwloc (if available),
															
--- a/src/datawizard/filters.c
+++ b/src/datawizard/filters.c
@@ -2,7 +2,7 @@
 
																  *
															
 
																  * Copyright (C) 2010-2016  Université de Bordeaux
															
 
																  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
															
 
																- * Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016  CNRS
															
 
																+ * Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016, 2017  CNRS
															
 
																  * Copyright (C) 2012, 2016  Inria
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
@@ -417,15 +417,22 @@ void starpu_data_unpartition(starpu_data_handle_t root_handle, unsigned gatherin
 
																 		sizes[child] = _starpu_data_get_size(child_handle);
															
 
																+		if (child_handle->unregister_hook)
															
 
																+		{
															
 
																+			child_handle->unregister_hook(child_handle);
															
 
																+		}
															
 
																+
															
 
																 		_starpu_data_unregister_ram_pointer(child_handle);
															
 
																 		if (child_handle->per_worker)
															
 
																-		for (worker = 0; worker < nworkers; worker++)
															
 
																 		{
															
 
																-			struct _starpu_data_replicate *local = &child_handle->per_worker[worker];
															
 
																-			STARPU_ASSERT(local->state == STARPU_INVALID);
															
 
																-			if (local->allocated && local->automatically_allocated)
															
 
																-				_starpu_request_mem_chunk_removal(child_handle, local, starpu_worker_get_memory_node(worker), sizes[child]);
															
 
																+			for (worker = 0; worker < nworkers; worker++)
															
 
																+			{
															
 
																+				struct _starpu_data_replicate *local = &child_handle->per_worker[worker];
															
 
																+				STARPU_ASSERT(local->state == STARPU_INVALID);
															
 
																+				if (local->allocated && local->automatically_allocated)
															
 
																+					_starpu_request_mem_chunk_removal(child_handle, local, starpu_worker_get_memory_node(worker), sizes[child]);
															
 
																+			}
															
 
																 		}
															
 
																 		_starpu_memory_stats_free(child_handle);
															
--- a/src/datawizard/malloc.c
+++ b/src/datawizard/malloc.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2009-2010, 2012-2016  Université de Bordeaux
															
 
																- * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
															
 
																+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
--- a/src/datawizard/memory_manager.c
+++ b/src/datawizard/memory_manager.c
@@ -17,6 +17,7 @@
 
																 #include <starpu.h>
															
 
																 #include <common/utils.h>
															
 
																 #include <common/thread.h>
															
 
																+#include <common/fxt.h>
															
 
																 #include <datawizard/memory_manager.h>
															
 
																 #include <starpu_stdlib.h>
															
@@ -90,6 +91,7 @@ int starpu_memory_allocate(unsigned node, size_t size, int flags)
 
																 		/* And take it */
															
 
																 		used_size[node] += size;
															
 
																+		_STARPU_TRACE_USED_MEM(node, used_size[node]);
															
 
																 		ret = 0;
															
 
																 	}
															
 
																 	else if (flags & STARPU_MEMORY_OVERFLOW
															
@@ -97,6 +99,7 @@ int starpu_memory_allocate(unsigned node, size_t size, int flags)
 
																 			|| used_size[node] + size <= global_size[node])
															
 
																 	{
															
 
																 		used_size[node] += size;
															
 
																+		_STARPU_TRACE_USED_MEM(node, used_size[node]);
															
 
																 		ret = 0;
															
 
																 	}
															
 
																 	else
															
@@ -112,6 +115,7 @@ void starpu_memory_deallocate(unsigned node, size_t size)
 
																 	STARPU_PTHREAD_MUTEX_LOCK(&lock_nodes[node]);
															
 
																 	used_size[node] -= size;
															
 
																+	_STARPU_TRACE_USED_MEM(node, used_size[node]);
															
 
																 	/* If there's now room for waiters, wake them */
															
 
																 	if (waiting_size[node] &&
															
--- a/src/debug/traces/starpu_fxt.c
+++ b/src/debug/traces/starpu_fxt.c
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2009-2016  Université de Bordeaux
															
 
																+ * Copyright (C) 2009-2017  Université de Bordeaux
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -94,6 +94,7 @@ struct task_info {
 
																 	double start_time;
															
 
																 	double end_time;
															
 
																 	unsigned long footprint;
															
 
																+	unsigned long kflops;
															
 
																 	char *parameters;
															
 
																 	unsigned int ndeps;
															
 
																 	unsigned long *dependencies;
															
@@ -387,6 +388,53 @@ static double get_event_time_stamp(struct fxt_ev_64 *ev, struct starpu_fxt_optio
 
																 	return (((double)(ev->time-options->file_offset))/1000000.0);
															
 
																 }
															
 
																+/*
															
 
																+ *      Auxiliary functions for poti handling names
															
 
																+ */
															
 
																+#ifdef STARPU_HAVE_POTI
															
 
																+static char *memnode_container_alias(char *output, int len, const char *prefix, long unsigned int memnodeid)
															
 
																+{
															
 
																+	snprintf(output, len, "%smn%lu", prefix, memnodeid);
															
 
																+	return output;
															
 
																+}
															
 
																+
															
 
																+static char *memmanager_container_alias(char *output, int len, const char *prefix, long unsigned int memnodeid)
															
 
																+{
															
 
																+	snprintf(output, len, "%smm%lu", prefix, memnodeid);
															
 
																+	return output;
															
 
																+}
															
 
																+
															
 
																+static char *thread_container_alias(char *output, int len, const char *prefix, long unsigned int threadid)
															
 
																+{
															
 
																+	snprintf(output, len, "%st%lu", prefix, threadid);
															
 
																+	return output;
															
 
																+}
															
 
																+
															
 
																+static char *worker_container_alias(char *output, int len, const char *prefix, long unsigned int workerid)
															
 
																+{
															
 
																+	snprintf(output, len, "%sw%lu", prefix, workerid);
															
 
																+	return output;
															
 
																+}
															
 
																+
															
 
																+static char *mpicommthread_container_alias(char *output, int len, const char *prefix)
															
 
																+{
															
 
																+	snprintf(output, len, "%smpict", prefix);
															
 
																+	return output;
															
 
																+}
															
 
																+
															
 
																+static char *program_container_alias(char *output, int len, const char *prefix)
															
 
																+{
															
 
																+	snprintf(output, len, "%sp", prefix);
															
 
																+	return output;
															
 
																+}
															
 
																+
															
 
																+static char *scheduler_container_alias(char *output, int len, const char *prefix)
															
 
																+{
															
 
																+	snprintf(output, len, "%ssched", prefix);
															
 
																+	return output;
															
 
																+}
															
 
																+#endif
															
 
																+
															
 
																 static int nworkers = 0;
															
 
																 struct worker_entry
															
@@ -492,53 +540,6 @@ static void update_accumulated_time(int worker, double sleep_time, double exec_t
 
																 	}
															
 
																 }
															
 
																-/*
															
 
																- *      Auxiliary functions for poti handling names
															
 
																- */
															
 
																-#ifdef STARPU_HAVE_POTI
															
 
																-static char *memnode_container_alias(char *output, int len, const char *prefix, long unsigned int memnodeid)
															
 
																-{
															
 
																-	snprintf(output, len, "%smn%lu", prefix, memnodeid);
															
 
																-	return output;
															
 
																-}
															
 
																-
															
 
																-static char *memmanager_container_alias(char *output, int len, const char *prefix, long unsigned int memnodeid)
															
 
																-{
															
 
																-	snprintf(output, len, "%smm%lu", prefix, memnodeid);
															
 
																-	return output;
															
 
																-}
															
 
																-
															
 
																-static char *thread_container_alias(char *output, int len, const char *prefix, long unsigned int threadid)
															
 
																-{
															
 
																-	snprintf(output, len, "%st%lu", prefix, threadid);
															
 
																-	return output;
															
 
																-}
															
 
																-
															
 
																-static char *worker_container_alias(char *output, int len, const char *prefix, long unsigned int workerid)
															
 
																-{
															
 
																-	snprintf(output, len, "%sw%lu", prefix, workerid);
															
 
																-	return output;
															
 
																-}
															
 
																-
															
 
																-static char *mpicommthread_container_alias(char *output, int len, const char *prefix)
															
 
																-{
															
 
																-	snprintf(output, len, "%smpict", prefix);
															
 
																-	return output;
															
 
																-}
															
 
																-
															
 
																-static char *program_container_alias(char *output, int len, const char *prefix)
															
 
																-{
															
 
																-	snprintf(output, len, "%sp", prefix);
															
 
																-	return output;
															
 
																-}
															
 
																-
															
 
																-static char *scheduler_container_alias(char *output, int len, const char *prefix)
															
 
																-{
															
 
																-	snprintf(output, len, "%ssched", prefix);
															
 
																-	return output;
															
 
																-}
															
 
																-#endif
															
 
																-
															
 
																 static void memnode_set_state(double time, const char *prefix, unsigned int memnodeid, const char *name)
															
 
																 {
															
 
																 #ifdef STARPU_HAVE_POTI
															
@@ -625,7 +626,7 @@ static void user_thread_push_state(double time, const char *prefix, long unsigne
 
																 #ifdef STARPU_HAVE_POTI
															
 
																 	char container[STARPU_POTI_STR_LEN];
															
 
																 	thread_container_alias(container, STARPU_POTI_STR_LEN, prefix, threadid);
															
 
																-	poti_SetState(time, container, "US", name);
															
 
																+	poti_PushState(time, container, "US", name);
															
 
																 #else
															
 
																 	fprintf(out_paje_file, "11	%.9f	%st%lu	US	%s\n", time, prefix, threadid, name);
															
 
																 #endif
															
@@ -637,7 +638,7 @@ static void user_thread_pop_state(double time, const char *prefix, long unsigned
 
																 #ifdef STARPU_HAVE_POTI
															
 
																 	char container[STARPU_POTI_STR_LEN];
															
 
																 	thread_container_alias(container, STARPU_POTI_STR_LEN, prefix, threadid);
															
 
																-	poti_SetState(time, container, "US", name);
															
 
																+	poti_PopState(time, container, "US");
															
 
																 #else
															
 
																 	fprintf(out_paje_file, "12	%.9f	%st%lu	US\n", time, prefix, threadid);
															
 
																 #endif
															
@@ -703,7 +704,7 @@ static void mpicommthread_push_state(double time, const char *prefix, const char
 
																 #ifdef STARPU_HAVE_POTI
															
 
																 	char container[STARPU_POTI_STR_LEN];
															
 
																 	mpicommthread_container_alias(container, STARPU_POTI_STR_LEN, prefix);
															
 
																-	poti_SetState(time, container, "CtS", name);
															
 
																+	poti_PushState(time, container, "CtS", name);
															
 
																 #else
															
 
																 	fprintf(out_paje_file, "11	%.9f	%smpict	CtS 	%s\n", time, prefix, name);
															
 
																 #endif
															
@@ -714,7 +715,7 @@ static void mpicommthread_pop_state(double time, const char *prefix)
 
																 #ifdef STARPU_HAVE_POTI
															
 
																 	char container[STARPU_POTI_STR_LEN];
															
 
																 	mpicommthread_container_alias(container, STARPU_POTI_STR_LEN, prefix);
															
 
																-	poti_SetState(time, container, "CtS", name);
															
 
																+	poti_PopState(time, container, "CtS");
															
 
																 #else
															
 
																 	fprintf(out_paje_file, "12	%.9f	%smpict	CtS\n", time, prefix);
															
 
																 #endif
															
@@ -845,11 +846,17 @@ static void handle_new_mem_node(struct fxt_ev_64 *ev, struct starpu_fxt_options
 
																 #endif
															
 
																 		if (!options->no_bus)
															
 
																+		{
															
 
																 #ifdef STARPU_HAVE_POTI
															
 
																-			poti_SetVariable(get_event_time_stamp(ev, options), new_memmanager_container_alias, "bw", get_event_time_stamp(ev, options));
															
 
																+			poti_SetVariable(get_event_time_stamp(ev, options), new_memmanager_container_alias, "use", 0.0);
															
 
																+			poti_SetVariable(get_event_time_stamp(ev, options), new_memmanager_container_alias, "bwi", 0.0);
															
 
																+			poti_SetVariable(get_event_time_stamp(ev, options), new_memmanager_container_alias, "bwo", 0.0);
															
 
																 #else
															
 
																-			fprintf(out_paje_file, "13	%.9f	%smm%"PRIu64"	bw	0.0\n", get_event_time_stamp(ev, options), prefix, ev->param[0]);
															
 
																+			fprintf(out_paje_file, "13	%.9f	%smm%"PRIu64"	use	0.0\n", get_event_time_stamp(ev, options), prefix, ev->param[0]);
															
 
																+			fprintf(out_paje_file, "13	%.9f	%smm%"PRIu64"	bwi	0.0\n", get_event_time_stamp(ev, options), prefix, ev->param[0]);
															
 
																+			fprintf(out_paje_file, "13	%.9f	%smm%"PRIu64"	bwo	0.0\n", get_event_time_stamp(ev, options), prefix, ev->param[0]);
															
 
																 #endif
															
 
																+		}
															
 
																 	}
															
 
																 }
															
@@ -946,12 +953,15 @@ static void handle_worker_init_start(struct fxt_ev_64 *ev, struct starpu_fxt_opt
 
																 		if (new_thread)
															
 
																 			poti_CreateContainer(get_event_time_stamp(ev, options), new_thread_container_alias, "T", memnode_container, new_thread_container_name);
															
 
																 		poti_CreateContainer(get_event_time_stamp(ev, options), new_worker_container_alias, "W", new_thread_container_alias, new_worker_container_name);
															
 
																+		poti_SetVariable(get_event_time_stamp(ev, options), new_worker_container_alias, "gf", 0.0);
															
 
																 #else
															
 
																 		if (new_thread)
															
 
																 			fprintf(out_paje_file, "7	%.9f	%st%lu	T	%smn%d	%s%d\n",
															
 
																 				get_event_time_stamp(ev, options), prefix, threadid, prefix, nodeid, prefix, bindid);
															
 
																 		fprintf(out_paje_file, "7	%.9f	%sw%d	W	%st%lu	%s%s%d\n",
															
 
																 			get_event_time_stamp(ev, options), prefix, workerid, prefix, threadid, prefix, kindstr, devid);
															
 
																+		fprintf(out_paje_file, "13	%.9f	%sw%d	gf	0.0\n",
															
 
																+			get_event_time_stamp(ev, options), prefix, workerid);
															
 
																 #endif
															
 
																 	}
															
@@ -1286,7 +1296,8 @@ static void handle_codelet_details(struct fxt_ev_64 *ev, struct starpu_fxt_optio
 
																 	struct task_info *task = get_task(job_id, options->file_rank);
															
 
																 	task->parameters = strdup(parameters);
															
 
																-	task->footprint = ev->param[3];
															
 
																+	task->footprint = ev->param[2];
															
 
																+	task->kflops = ev->param[3];
															
 
																 	task->tag = ev->param[4];
															
 
																 	if (out_paje_file)
															
@@ -1294,9 +1305,9 @@ static void handle_codelet_details(struct fxt_ev_64 *ev, struct starpu_fxt_optio
 
																 #ifdef STARPU_ENABLE_PAJE_CODELET_DETAILS
															
 
																 		char *prefix = options->file_prefix;
															
 
																-		unsigned sched_ctx = ev->param[1];
															
 
																+		unsigned sched_ctx = ev->param[0];
															
 
																-		worker_set_detailed_state(last_codelet_start[worker], prefix, worker, _starpu_last_codelet_symbol[worker], ev->param[2], parameters, ev->param[3], ev->param[4], job_id);
															
 
																+		worker_set_detailed_state(last_codelet_start[worker], prefix, worker, _starpu_last_codelet_symbol[worker], ev->param[1], parameters, ev->param[2], ev->param[4], job_id);
															
 
																 		if (sched_ctx != 0)
															
 
																 		{
															
 
																 #ifdef STARPU_HAVE_POTI
															
@@ -1306,7 +1317,7 @@ static void handle_codelet_details(struct fxt_ev_64 *ev, struct starpu_fxt_optio
 
																 			worker_container_alias(container, STARPU_POTI_STR_LEN, prefix, worker);
															
 
																 			poti_SetState(last_codelet_start[worker], container, ctx, _starpu_last_codelet_symbol[worker]);
															
 
																 #else
															
 
																-			fprintf(out_paje_file, "20	%.9f	%sw%d	Ctx%u	%s	%ld	%s	%08lx	%016lx	%lu\n", last_codelet_start[worker], prefix, worker, sched_ctx, _starpu_last_codelet_symbol[worker], ev->param[2], parameters,  ev->param[3], ev->param[4], job_id);
															
 
																+			fprintf(out_paje_file, "20	%.9f	%sw%d	Ctx%u	%s	%ld	%s	%08lx	%016lx	%lu\n", last_codelet_start[worker], prefix, worker, sched_ctx, _starpu_last_codelet_symbol[worker], ev->param[1], parameters,  ev->param[2], ev->param[4], job_id);
															
 
																 #endif
															
 
																 		}
															
 
																 #endif /* STARPU_ENABLE_PAJE_CODELET_DETAILS */
															
@@ -1335,11 +1346,25 @@ static void handle_end_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_opti
 
																 		recfmt_worker_set_state(end_codelet_time, worker, "I", "Other");
															
 
																 	double codelet_length = (end_codelet_time - last_codelet_start[worker]);
															
 
																+	struct task_info *task = get_task(ev->param[0], options->file_rank);
															
 
																+	double gflops = (((double)task->kflops) / 1000000) / (codelet_length / 1000);
															
 
																 	get_task(ev->param[0], options->file_rank)->end_time = end_codelet_time;
															
 
																 	update_accumulated_time(worker, 0.0, codelet_length, end_codelet_time, 0);
															
 
																+#ifdef STARPU_HAVE_POTI
															
 
																+	char container[STARPU_POTI_STR_LEN];
															
 
																+	worker_container_alias(container, STARPU_POTI_STR_LEN, prefix, worker);
															
 
																+	poti_SetVariable(task->start_time, container, "gf", gflops);
															
 
																+	poti_SetVariable(end_codelet_time, container, "gf", 0);
															
 
																+#else
															
 
																+	fprintf(out_paje_file, "13	%.9f	%sw%d	gf	%f\n",
															
 
																+			task->start_time, prefix, worker, gflops);
															
 
																+	fprintf(out_paje_file, "13	%.9f	%sw%d	gf	%f\n",
															
 
																+			end_codelet_time, prefix, worker, 0.);
															
 
																+#endif
															
 
																+
															
 
																 	if (distrib_time)
															
 
																 	     fprintf(distrib_time, "%s\t%s%d\t%ld\t%"PRIx32"\t%.9f\n", _starpu_last_codelet_symbol[worker],
															
 
																 		     prefix, worker, (unsigned long) codelet_size, codelet_hash, codelet_length);
															
@@ -1655,7 +1680,7 @@ static void handle_data_invalidate(struct fxt_ev_64 *ev, struct starpu_fxt_optio
 
																 		char paje_value[STARPU_POTI_STR_LEN], memnode_container[STARPU_POTI_STR_LEN];
															
 
																 		memmanager_container_alias(memnode_container, STARPU_POTI_STR_LEN, prefix, node);
															
 
																 		snprintf(paje_value, STARPU_POTI_STR_LEN, "%lx", handle);
															
 
																-		poti_NewEvent(get_event_time_stamp(ev, options), container, "user_event", paje_value);
															
 
																+		poti_NewEvent(get_event_time_stamp(ev, options), memnode_container, "user_event", paje_value);
															
 
																 #else
															
 
																 		fprintf(out_paje_file, "9	%.9f	invalidate	%smm%u	%lx\n", get_event_time_stamp(ev, options), prefix, node, handle);
															
 
																 #endif
															
@@ -1782,6 +1807,9 @@ static void handle_end_driver_copy(struct fxt_ev_64 *ev, struct starpu_fxt_optio
 
																 		}
															
 
																 		/* look for a data transfer to match */
															
 
																+#ifdef STARPU_DEVEL
															
 
																+#warning FIXME: use hash table instead
															
 
																+#endif
															
 
																 		struct _starpu_communication *itor;
															
 
																 		for (itor = _starpu_communication_list_begin(&communication_list);
															
 
																 			itor != _starpu_communication_list_end(&communication_list);
															
@@ -1841,6 +1869,23 @@ static void handle_memnode_event(struct fxt_ev_64 *ev, struct starpu_fxt_options
 
																 		memnode_set_state(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr);
															
 
																 }
															
 
																+static void handle_used_mem(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
															
 
																+{
															
 
																+	unsigned memnode = ev->param[0];
															
 
																+
															
 
																+	if (out_paje_file)
															
 
																+	{
															
 
																+#ifdef STARPU_HAVE_POTI
															
 
																+		char memnode_container[STARPU_POTI_STR_LEN];
															
 
																+		memmanager_container_alias(memnode_container, STARPU_POTI_STR_LEN, options->file_prefix, memnode);
															
 
																+		poti_SetVariable(get_event_time_stamp(ev, options), memnode_container, "use", (double)ev->param[1] / (1<<20));
															
 
																+#else
															
 
																+		fprintf(out_paje_file, "13	%.9f	%smm%u	use	%f\n",
															
 
																+			get_event_time_stamp(ev, options), options->file_prefix, memnode, (double)ev->param[1] / (1<<20));
															
 
																+#endif
															
 
																+	}
															
 
																+}
															
 
																+
															
 
																 static void handle_task_submit_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, unsigned long tid, const char *eventstr)
															
 
																 {
															
 
																 	int workerid = find_worker_id(tid);
															
@@ -2426,7 +2471,8 @@ static void handle_thread_event(struct fxt_ev_64 *ev, struct starpu_fxt_options
 
																 static
															
 
																 void _starpu_fxt_display_bandwidth(struct starpu_fxt_options *options)
															
 
																 {
															
 
																-	float current_bandwidth_per_node[STARPU_MAXNODES] = {0.0};
															
 
																+	float current_bandwidth_in_per_node[STARPU_MAXNODES] = {0.0};
															
 
																+	float current_bandwidth_out_per_node[STARPU_MAXNODES] = {0.0};
															
 
																 	char *prefix = options->file_prefix;
															
@@ -2435,29 +2481,29 @@ void _starpu_fxt_display_bandwidth(struct starpu_fxt_options *options)
 
																 		itor != _starpu_communication_list_end(&communication_list);
															
 
																 		itor = _starpu_communication_list_next(itor))
															
 
																 	{
															
 
																-		current_bandwidth_per_node[itor->src_node] +=  itor->bandwidth;
															
 
																+		current_bandwidth_out_per_node[itor->src_node] +=  itor->bandwidth;
															
 
																 		if (out_paje_file)
															
 
																 		{
															
 
																 #ifdef STARPU_HAVE_POTI
															
 
																 			char src_memnode_container[STARPU_POTI_STR_LEN];
															
 
																 			memmanager_container_alias(src_memnode_container, STARPU_POTI_STR_LEN, prefix, itor->src_node);
															
 
																-			poti_SetVariable(itor->comm_start, src_memnode_container, "bw", current_bandwidth_per_node[itor->src_node]);
															
 
																+			poti_SetVariable(itor->comm_start, src_memnode_container, "bwo", current_bandwidth_out_per_node[itor->src_node]);
															
 
																 #else
															
 
																-			fprintf(out_paje_file, "13	%.9f	%smm%u	bw	%f\n",
															
 
																-				itor->comm_start, prefix, itor->src_node, current_bandwidth_per_node[itor->src_node]);
															
 
																+			fprintf(out_paje_file, "13	%.9f	%smm%u	bwo	%f\n",
															
 
																+				itor->comm_start, prefix, itor->src_node, current_bandwidth_out_per_node[itor->src_node]);
															
 
																 #endif
															
 
																 		}
															
 
																-		current_bandwidth_per_node[itor->dst_node] +=  itor->bandwidth;
															
 
																+		current_bandwidth_in_per_node[itor->dst_node] +=  itor->bandwidth;
															
 
																 		if (out_paje_file)
															
 
																 		{
															
 
																 #ifdef STARPU_HAVE_POTI
															
 
																 			char dst_memnode_container[STARPU_POTI_STR_LEN];
															
 
																 			memmanager_container_alias(dst_memnode_container, STARPU_POTI_STR_LEN, prefix, itor->dst_node);
															
 
																-			poti_SetVariable(itor->comm_start, dst_memnode_container, "bw", current_bandwidth_per_node[itor->dst_node]);
															
 
																+			poti_SetVariable(itor->comm_start, dst_memnode_container, "bwi", current_bandwidth_in_per_node[itor->dst_node]);
															
 
																 #else
															
 
																-			fprintf(out_paje_file, "13	%.9f	%smm%u	bw	%f\n",
															
 
																-				itor->comm_start, prefix, itor->dst_node, current_bandwidth_per_node[itor->dst_node]);
															
 
																+			fprintf(out_paje_file, "13	%.9f	%smm%u	bwi	%f\n",
															
 
																+				itor->comm_start, prefix, itor->dst_node, current_bandwidth_in_per_node[itor->dst_node]);
															
 
																 #endif
															
 
																 		}
															
 
																 	}
															
@@ -2845,6 +2891,8 @@ void _starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *op
 
																 					handle_memnode_event(&ev, options, "No");
															
 
																 				}
															
 
																 				break;
															
 
																+			case _STARPU_FUT_USED_MEM:
															
 
																+				handle_used_mem(&ev, options);
															
 
																 			case _STARPU_FUT_USER_EVENT:
															
 
																 				handle_user_event(&ev, options);
															
--- a/src/debug/traces/starpu_fxt_mpi.c
+++ b/src/debug/traces/starpu_fxt_mpi.c
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2012-2013, 2016  Université Bordeaux
															
 
																+ * Copyright (C) 2012-2013, 2016-2017  Université Bordeaux
															
 
																  * Copyright (C) 2010, 2011, 2014, 2016, 2017  CNRS
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
@@ -26,14 +26,17 @@
 
																 #define STARPU_POTI_STR_LEN 200
															
 
																 #endif
															
 
																-struct mpi_transfer
															
 
																-{
															
 
																+#define MAX_MPI_NODES 64
															
 
																+
															
 
																+LIST_TYPE(mpi_transfer,
															
 
																 	unsigned matched;
															
 
																-	int other_rank; /* src for a recv, dest for a send */
															
 
																+	int src;
															
 
																+	int dst;
															
 
																 	int mpi_tag;
															
 
																 	size_t size;
															
 
																 	float date;
															
 
																-};
															
 
																+	double bandwidth;
															
 
																+);
															
 
																 /* Returns 0 if a barrier is found, -1 otherwise. In case of success, offset is
															
 
																  * filled with the timestamp of the barrier */
															
@@ -100,25 +103,28 @@ int _starpu_fxt_mpi_find_sync_point(char *filename_in, uint64_t *offset, int *ke
 
																  */
															
 
																 /* the list of MPI transfers found in the different traces */
															
 
																-static struct mpi_transfer *mpi_sends[64] = {NULL};
															
 
																-static struct mpi_transfer *mpi_recvs[64] = {NULL};
															
 
																+static struct mpi_transfer *mpi_sends[MAX_MPI_NODES] = {NULL};
															
 
																+static struct mpi_transfer *mpi_recvs[MAX_MPI_NODES] = {NULL};
															
 
																 /* number of available slots in the lists  */
															
 
																-unsigned mpi_sends_list_size[64] = {0};
															
 
																-unsigned mpi_recvs_list_size[64] = {0};
															
 
																+unsigned mpi_sends_list_size[MAX_MPI_NODES] = {0};
															
 
																+unsigned mpi_recvs_list_size[MAX_MPI_NODES] = {0};
															
 
																 /* number of slots actually used in the list  */
															
 
																-unsigned mpi_sends_used[64] = {0};
															
 
																-unsigned mpi_recvs_used[64] = {0};
															
 
																+unsigned mpi_sends_used[MAX_MPI_NODES] = {0};
															
 
																+unsigned mpi_recvs_used[MAX_MPI_NODES] = {0};
															
 
																 /* number of slots already matched at the beginning of the list. This permits
															
 
																  * going through the lists from the beginning to match each and every
															
 
																  * transfer, thus avoiding a quadratic complexity. */
															
 
																-unsigned mpi_recvs_matched[64] = {0};
															
 
																+unsigned mpi_recvs_matched[MAX_MPI_NODES][MAX_MPI_NODES] = { {0} };
															
 
																+unsigned mpi_sends_matched[MAX_MPI_NODES][MAX_MPI_NODES] = { {0} };
															
 
																 void _starpu_fxt_mpi_add_send_transfer(int src, int dst STARPU_ATTRIBUTE_UNUSED, int mpi_tag, size_t size, float date)
															
 
																 {
															
 
																 	STARPU_ASSERT(src >= 0);
															
 
																+	if (src >= MAX_MPI_NODES)
															
 
																+		return;
															
 
																 	unsigned slot = mpi_sends_used[src]++;
															
 
																 	if (mpi_sends_used[src] > mpi_sends_list_size[src])
															
@@ -136,7 +142,8 @@ void _starpu_fxt_mpi_add_send_transfer(int src, int dst STARPU_ATTRIBUTE_UNUSED,
 
																 	}
															
 
																 	mpi_sends[src][slot].matched = 0;
															
 
																-	mpi_sends[src][slot].other_rank = dst;
															
 
																+	mpi_sends[src][slot].src = src;
															
 
																+	mpi_sends[src][slot].dst = dst;
															
 
																 	mpi_sends[src][slot].mpi_tag = mpi_tag;
															
 
																 	mpi_sends[src][slot].size = size;
															
 
																 	mpi_sends[src][slot].date = date;
															
@@ -144,6 +151,8 @@ void _starpu_fxt_mpi_add_send_transfer(int src, int dst STARPU_ATTRIBUTE_UNUSED,
 
																 void _starpu_fxt_mpi_add_recv_transfer(int src STARPU_ATTRIBUTE_UNUSED, int dst, int mpi_tag, float date)
															
 
																 {
															
 
																+	if (dst >= MAX_MPI_NODES)
															
 
																+		return;
															
 
																 	unsigned slot = mpi_recvs_used[dst]++;
															
 
																 	if (mpi_recvs_used[dst] > mpi_recvs_list_size[dst])
															
@@ -161,7 +170,8 @@ void _starpu_fxt_mpi_add_recv_transfer(int src STARPU_ATTRIBUTE_UNUSED, int dst,
 
																 	}
															
 
																 	mpi_recvs[dst][slot].matched = 0;
															
 
																-	mpi_recvs[dst][slot].other_rank = dst;
															
 
																+	mpi_recvs[dst][slot].src = src;
															
 
																+	mpi_recvs[dst][slot].dst = dst;
															
 
																 	mpi_recvs[dst][slot].mpi_tag = mpi_tag;
															
 
																 	mpi_recvs[dst][slot].date = date;
															
 
																 }
															
@@ -170,7 +180,7 @@ static
 
																 struct mpi_transfer *try_to_match_send_transfer(int src STARPU_ATTRIBUTE_UNUSED, int dst, int mpi_tag)
															
 
																 {
															
 
																 	unsigned slot;
															
 
																-	unsigned firstslot = mpi_recvs_matched[dst];
															
 
																+	unsigned firstslot = mpi_recvs_matched[src][dst];
															
 
																 	unsigned all_previous_were_matched = 1;
															
@@ -193,7 +203,7 @@ struct mpi_transfer *try_to_match_send_transfer(int src STARPU_ATTRIBUTE_UNUSED,
 
																 			{
															
 
																 				/* All previous transfers are already matched,
															
 
																 				 * we need not consider them anymore */
															
 
																-				mpi_recvs_matched[dst] = slot;
															
 
																+				mpi_recvs_matched[src][dst] = slot;
															
 
																 			}
															
 
																 		}
															
 
																 	}
															
@@ -204,60 +214,146 @@ struct mpi_transfer *try_to_match_send_transfer(int src STARPU_ATTRIBUTE_UNUSED,
 
																 static unsigned long mpi_com_id = 0;
															
 
																-static void display_all_transfers_from_trace(FILE *out_paje_file, int src)
															
 
																+static void display_all_transfers_from_trace(FILE *out_paje_file, unsigned n)
															
 
																 {
															
 
																-	unsigned slot;
															
 
																-	for (slot = 0; slot < mpi_sends_used[src]; slot++)
															
 
																+	unsigned slot[MAX_MPI_NODES] = { 0 }, node, src;
															
 
																+	struct mpi_transfer_list pending_receives; /* Sorted list of matches which have not happened yet */
															
 
																+	double current_out_bandwidth[MAX_MPI_NODES] = { 0. };
															
 
																+	double current_in_bandwidth[MAX_MPI_NODES] = { 0. };
															
 
																+#ifdef STARPU_HAVE_POTI
															
 
																+	char mpi_container[STARPU_POTI_STR_LEN];
															
 
																+#endif
															
 
																+
															
 
																+	for (node = 0; node < n ; node++)
															
 
																 	{
															
 
																-		int dst = mpi_sends[src][slot].other_rank;
															
 
																-		int mpi_tag = mpi_sends[src][slot].mpi_tag;
															
 
																-		float start_date = mpi_sends[src][slot].date;
															
 
																-		size_t size = mpi_sends[src][slot].size;
															
 
																+#ifdef STARPU_HAVE_POTI
															
 
																+		snprintf(mpi_container, sizeof(mpi_container), "%d_mpict", node);
															
 
																+		poti_SetVariable(0., mpi_container, "bwi", 0.);
															
 
																+		poti_SetVariable(0., mpi_container, "bwo", 0.);
															
 
																+#else
															
 
																+		fprintf(out_paje_file, "13	%.9f	%d_mpict	bwi	%f\n", 0., node, 0.);
															
 
																+		fprintf(out_paje_file, "13	%.9f	%d_mpict	bwo	%f\n", 0., node, 0.);
															
 
																+#endif
															
 
																+	}
															
 
																-		struct mpi_transfer *match;
															
 
																-		match = try_to_match_send_transfer(src, dst, mpi_tag);
															
 
																+	mpi_transfer_list_init(&pending_receives);
															
 
																+
															
 
																+	while (1)
															
 
																+	{
															
 
																+		float start_date;
															
 
																+		struct mpi_transfer *cur, *match;
															
 
																+
															
 
																+		/* Find out which event comes first: a pending receive, or a new send */
															
 
																+
															
 
																+		if (mpi_transfer_list_empty(&pending_receives))
															
 
																+			start_date = INFINITY;
															
 
																+		else
															
 
																+			start_date = mpi_transfer_list_front(&pending_receives)->date;
															
 
																+
															
 
																+		src = MAX_MPI_NODES;
															
 
																+		for (node = 0; node < n; node++) {
															
 
																+			if (slot[node] < mpi_sends_used[node] && mpi_sends[node][slot[node]].date < start_date)
															
 
																+			{
															
 
																+				/* next send for node is earlier than others */
															
 
																+				src = node;
															
 
																+				start_date = mpi_sends[src][slot[src]].date;
															
 
																+			}
															
 
																+		}
															
 
																+		if (start_date == INFINITY)
															
 
																+			/* No event any more, we're finished! */
															
 
																+			break;
															
 
																+
															
 
																+		if (src == MAX_MPI_NODES)
															
 
																+		{
															
 
																+			/* Pending match is earlier than all new sends, finish its communication */
															
 
																+			match = mpi_transfer_list_pop_front(&pending_receives);
															
 
																+			current_out_bandwidth[match->src] -= match->bandwidth;
															
 
																+			current_in_bandwidth[match->dst] -= match->bandwidth;
															
 
																+#ifdef STARPU_HAVE_POTI
															
 
																+			snprintf(mpi_container, sizeof(mpi_container), "%d_mpict", match->src);
															
 
																+			poti_SetVariable(match->date, mpi_container, "bwo", current_out_bandwidth[match->src]);
															
 
																+			snprintf(mpi_container, sizeof(mpi_container), "%d_mpict", match->dst);
															
 
																+			poti_SetVariable(match->date, mpi_container, "bwi", current_in_bandwidth[match->dst]);
															
 
																+#else
															
 
																+			fprintf(out_paje_file, "13	%.9f	%d_mpict	bwo	%f\n", match->date, match->src, current_out_bandwidth[match->src]);
															
 
																+			fprintf(out_paje_file, "13	%.9f	%d_mpict	bwi	%f\n", match->date, match->dst, current_in_bandwidth[match->dst]);
															
 
																+#endif
															
 
																+			continue;
															
 
																+		}
															
 
																+
															
 
																+		cur = &mpi_sends[src][slot[src]];
															
 
																+		int dst = cur->dst;
															
 
																+		int mpi_tag = cur->mpi_tag;
															
 
																+		size_t size = cur->size;
															
 
																+
															
 
																+		if (dst < MAX_MPI_NODES)
															
 
																+			match = try_to_match_send_transfer(src, dst, mpi_tag);
															
 
																+		else
															
 
																+			match = NULL;
															
 
																 		if (match)
															
 
																 		{
															
 
																 			float end_date = match->date;
															
 
																+			struct mpi_transfer *prev;
															
 
																+
															
 
																+			match->bandwidth = (0.001*size)/(end_date - start_date);
															
 
																+			current_out_bandwidth[src] += match->bandwidth;
															
 
																+			current_in_bandwidth[dst] += match->bandwidth;
															
 
																+
															
 
																+			/* Insert in sorted list, most probably at the end so let's use a mere insertion sort */
															
 
																+			for (prev = mpi_transfer_list_last(&pending_receives);
															
 
																+				prev != mpi_transfer_list_alpha(&pending_receives);
															
 
																+				prev = mpi_transfer_list_prev(prev))
															
 
																+				if (prev->date <= end_date)
															
 
																+				{
															
 
																+					/* Found its place */
															
 
																+					mpi_transfer_list_insert_after(&pending_receives, match, prev);
															
 
																+					break;
															
 
																+				}
															
 
																+			if (prev == mpi_transfer_list_alpha(&pending_receives))
															
 
																+			{
															
 
																+				/* No element earlier than this one, put it at the head */
															
 
																+				mpi_transfer_list_push_front(&pending_receives, match);
															
 
																+			}
															
 
																 			unsigned long id = mpi_com_id++;
															
 
																-			/* TODO replace 0 by a MPI program ? */
															
 
																-			if (out_paje_file)
															
 
																-			{
															
 
																 #ifdef STARPU_HAVE_POTI
															
 
																-				char paje_value[STARPU_POTI_STR_LEN], paje_key[STARPU_POTI_STR_LEN];
															
 
																-				snprintf(paje_value, STARPU_POTI_STR_LEN, "%lu", (long unsigned) size);
															
 
																-				snprintf(paje_key, STARPU_POTI_STR_LEN, "mpicom_%lu", id);
															
 
																-				char mpi_container[STARPU_POTI_STR_LEN];
															
 
																-				snprintf(mpi_container, sizeof(mpi_container), "%d_mpict", /* XXX */src);
															
 
																-				poti_StartLink(start_date, "MPICt", "MPIL", mpi_container, paje_value, paje_key);
															
 
																-				snprintf(mpi_container, sizeof(mpi_container), "%d_mpict", /* XXX */dst);
															
 
																-				poti_EndLink(end_date, "MPICt", "MPIL", mpi_container, paje_value, paje_key);
															
 
																+			char paje_value[STARPU_POTI_STR_LEN], paje_key[STARPU_POTI_STR_LEN];
															
 
																+			snprintf(paje_value, STARPU_POTI_STR_LEN, "%lu", (long unsigned) size);
															
 
																+			snprintf(paje_key, STARPU_POTI_STR_LEN, "mpicom_%lu", id);
															
 
																+			snprintf(mpi_container, sizeof(mpi_container), "%d_mpict", src);
															
 
																+			poti_StartLink(start_date, "MPICt", "MPIL", mpi_container, paje_value, paje_key);
															
 
																+			poti_SetVariable(start_date, mpi_container, "bwo", current_out_bandwidth[src]);
															
 
																+			snprintf(mpi_container, sizeof(mpi_container), "%d_mpict", dst);
															
 
																+			poti_EndLink(end_date, "MPICt", "MPIL", mpi_container, paje_value, paje_key);
															
 
																+			poti_SetVariable(start_date, mpi_container, "bwo", current_in_bandwidth[dst]);
															
 
																 #else
															
 
																-				fprintf(out_paje_file, "18	%.9f	MPIL	MPIroot	%lu	%d_mpict	mpicom_%lu\n", start_date, (unsigned long)size, /* XXX */src, id);
															
 
																-				fprintf(out_paje_file, "19	%.9f	MPIL	MPIroot	%lu	%d_mpict	mpicom_%lu\n", end_date, (unsigned long)size, /* XXX */dst, id);
															
 
																+			fprintf(out_paje_file, "18	%.9f	MPIL	MPIroot	%lu	%d_mpict	mpicom_%lu\n", start_date, (unsigned long)size, src, id);
															
 
																+			fprintf(out_paje_file, "19	%.9f	MPIL	MPIroot	%lu	%d_mpict	mpicom_%lu\n", end_date, (unsigned long)size, dst, id);
															
 
																+			fprintf(out_paje_file, "13	%.9f	%d_mpict	bwo	%f\n", start_date, src, current_out_bandwidth[src]);
															
 
																+			fprintf(out_paje_file, "13	%.9f	%d_mpict	bwi	%f\n", start_date, dst, current_in_bandwidth[dst]);
															
 
																 #endif
															
 
																-			}
															
 
																 		}
															
 
																 		else
															
 
																 		{
															
 
																 			_STARPU_DISP("Warning, could not match MPI transfer from %d to %d (tag %x) starting at %f\n", src, dst, mpi_tag, start_date);
															
 
																 		}
															
 
																+		slot[src]++;
															
 
																 	}
															
 
																 }
															
 
																-void _starpu_fxt_display_mpi_transfers(struct starpu_fxt_options *options, int *ranks, FILE *out_paje_file)
															
 
																+void _starpu_fxt_display_mpi_transfers(struct starpu_fxt_options *options, int *ranks STARPU_ATTRIBUTE_UNUSED, FILE *out_paje_file)
															
 
																 {
															
 
																-	unsigned inputfile;
															
 
																-
															
 
																-	/* display the MPI transfers if possible */
															
 
																-	for (inputfile = 0; inputfile < options->ninputfiles; inputfile++)
															
 
																+	if (options->ninputfiles > MAX_MPI_NODES)
															
 
																 	{
															
 
																-		int filerank = ranks[inputfile];
															
 
																-		display_all_transfers_from_trace(out_paje_file, filerank);
															
 
																+		_STARPU_DISP("Warning: %u files given, maximum %u supported, truncating to %u\n", options->ninputfiles, MAX_MPI_NODES, MAX_MPI_NODES);
															
 
																+		options->ninputfiles = MAX_MPI_NODES;
															
 
																 	}
															
 
																+
															
 
																+	/* display the MPI transfers if possible */
															
 
																+	if (out_paje_file)
															
 
																+		display_all_transfers_from_trace(out_paje_file, options->ninputfiles);
															
 
																 }
															
 
																 #endif // STARPU_USE_FXT
															
--- a/src/debug/traces/starpu_paje.c
+++ b/src/debug/traces/starpu_paje.c
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2010-2016  Université de Bordeaux
															
 
																+ * Copyright (C) 2010-2017  Université de Bordeaux
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -160,7 +160,9 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED)
 
																 	/* Types for the memory node */
															
 
																 	poti_DefineEventType("invalidate", "Mm", "data invalidation");
															
 
																-	poti_DefineVariableType("bw", "Mm", "Bandwidth", "0 0 0");
															
 
																+	poti_DefineVariableType("use", "Mm", "Used (MB)", "0 0 0");
															
 
																+	poti_DefineVariableType("bwi", "Mm", "Bandwidth In (MB/s)", "0 0 0");
															
 
																+	poti_DefineVariableType("bwo", "Mm", "Bandwidth Out (MB/s)", "0 0 0");
															
 
																 	poti_DefineStateType("MS", "Mm", "Memory Node State");
															
 
																 	poti_DefineEntityValue("A", "MS", "Allocating", ".4 .1 .0");
															
 
																 	poti_DefineEntityValue("Ar", "MS", "AllocatingReuse", ".1 .1 .8");
															
@@ -175,6 +177,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED)
 
																 	/* Types for the Worker of the Memory Node */
															
 
																 	poti_DefineEventType("user_event", "T", "user event type");
															
 
																 	poti_DefineEventType("thread_event", "T", "thread event type");
															
 
																+	poti_DefineVariableType("gf", "T", "GFlops", "0 0 0");
															
 
																 	poti_DefineStateType("S", "T", "Thread State");
															
 
																 	poti_DefineEntityValue("I", "S", "Idle", ".9 .1 0");
															
 
																 	poti_DefineEntityValue("In", "S", "Initializing", "0.0 .7 1.0");
															
@@ -213,6 +216,8 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED)
 
																 	/* Types for the MPI Communication Thread of the Memory Node */
															
 
																 	poti_DefineEventType("MPIev", "MPICt", "MPI event type");
															
 
																+	poti_DefineVariableType("bwi", "MPICt", "Bandwidth In (MB/s)", "0 0 0");
															
 
																+	poti_DefineVariableType("bwo", "MPICt", "Bandwidth Out (MB/s)", "0 0 0");
															
 
																 	poti_DefineStateType("CtS", "MPICt", "Communication Thread State");
															
 
																 	poti_DefineEntityValue("P", "CtS", "Processing", "0 0 0");
															
 
																 	poti_DefineEntityValue("Sl", "CtS", "Sleeping", ".9 .1 .0");
															
@@ -299,7 +304,12 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED)
 
																 3       MS       Mm       \"Memory Node State\"                        \n\
															
 
																 4       nsubmitted    Sc       \"Number of Submitted Uncompleted Tasks\"                        \n\
															
 
																 4       nready    Sc       \"Number of Ready Tasks\"                        \n\
															
 
																-4       bw      Mm       \"Bandwidth\"                        \n\
															
 
																+4       use     Mm       \"Used (MB)\"                        \n\
															
 
																+4       bwi     Mm       \"Bandwidth In (MB/s)\"                        \n\
															
 
																+4       bwo     Mm       \"Bandwidth Out (MB/s)\"                        \n\
															
 
																+4       bwi     MPICt       \"Bandwidth In (MB/s)\"                        \n\
															
 
																+4       bwo     MPICt       \"Bandwidth Out (MB/s)\"                        \n\
															
 
																+4       gf      T       \"GFlops\"                        \n\
															
 
																 6       I       S       Idle         \".9 .1 .0\"		\n\
															
 
																 6       In       S      Initializing       \"0.0 .7 1.0\"            \n\
															
 
																 6       D       S      Deinitializing       \"0.0 .1 .7\"            \n\
															
--- a/src/util/starpu_clusters_create.c
+++ b/src/util/starpu_clusters_create.c
@@ -45,45 +45,67 @@ starpu_binding_function _starpu_cluster_type_get_func(starpu_cluster_types type)
 
																 	return prologue_func;
															
 
																 }
															
 
																-void starpu_openmp_prologue(void *sched_ctx_id)
															
 
																+void starpu_openmp_prologue(void* arg)
															
 
																 {
															
 
																-	int sched_ctx = *(int*)sched_ctx_id;
															
 
																-	int *cpuids = NULL;
															
 
																-	int ncpuids = 0;
															
 
																 	int workerid = starpu_worker_get_id_check();
															
 
																 	if (starpu_worker_get_type(workerid) == STARPU_CPU_WORKER)
															
 
																 	{
															
 
																-		starpu_sched_ctx_get_available_cpuids(sched_ctx, &cpuids, &ncpuids);
															
 
																-		omp_set_num_threads(ncpuids);
															
 
																-#pragma omp parallel
															
 
																+		struct starpu_task *task = starpu_task_get_current();
															
 
																+		int sched_ctx = task->sched_ctx;
															
 
																+		struct _starpu_sched_ctx *ctx_struct = _starpu_get_sched_ctx_struct(sched_ctx);
															
 
																+		/* If the view of the worker doesn't correspond to the view of the task,
															
 
																+			 adapt the thread team */
															
 
																+		if (ctx_struct->parallel_view != task->possibly_parallel)
															
 
																 		{
															
 
																-			starpu_sched_ctx_bind_current_thread_to_cpuid(cpuids[omp_get_thread_num()]);
															
 
																+			int *cpuids = NULL;
															
 
																+			int ncpuids = 0;
															
 
																+
															
 
																+			starpu_sched_ctx_get_available_cpuids(sched_ctx, &cpuids, &ncpuids);
															
 
																+			if (!task->possibly_parallel)
															
 
																+				ncpuids=1;
															
 
																+			omp_set_num_threads(ncpuids);
															
 
																+#pragma omp parallel
															
 
																+			{
															
 
																+				starpu_sched_ctx_bind_current_thread_to_cpuid(cpuids[omp_get_thread_num()]);
															
 
																+			}
															
 
																+			free(cpuids);
															
 
																+			ctx_struct->parallel_view = !ctx_struct->parallel_view;
															
 
																 		}
															
 
																-		free(cpuids);
															
 
																 	}
															
 
																 	return;
															
 
																 }
															
 
																 #ifdef STARPU_MKL
															
 
																-void starpu_gnu_openmp_mkl_prologue(void *sched_ctx_id)
															
 
																+void starpu_gnu_openmp_mkl_prologue(void* arg)
															
 
																 {
															
 
																-	int sched_ctx = *(int*)sched_ctx_id;
															
 
																-	int *cpuids = NULL;
															
 
																-	int ncpuids = 0;
															
 
																 	int workerid = starpu_worker_get_id();
															
 
																 	if (starpu_worker_get_type(workerid) == STARPU_CPU_WORKER)
															
 
																 	{
															
 
																-		starpu_sched_ctx_get_available_cpuids(sched_ctx, &cpuids, &ncpuids);
															
 
																-		omp_set_num_threads(ncpuids);
															
 
																-		mkl_set_num_threads(ncpuids);
															
 
																-		mkl_set_dynamic(0);
															
 
																-#pragma omp parallel
															
 
																+		struct starpu_task *task = starpu_task_get_current();
															
 
																+		int sched_ctx = task->sched_ctx;
															
 
																+		struct _starpu_sched_ctx *ctx_struct = _starpu_get_sched_ctx_struct(sched_ctx);
															
 
																+		/* If the view of the worker doesn't correspond to the view of the task,
															
 
																+			 adapt the thread team */
															
 
																+		if (ctx_struct->parallel_view != task->possibly_parallel)
															
 
																 		{
															
 
																-			starpu_sched_ctx_bind_current_thread_to_cpuid(cpuids[omp_get_thread_num()]);
															
 
																+			int *cpuids = NULL;
															
 
																+			int ncpuids = 0;
															
 
																+
															
 
																+			starpu_sched_ctx_get_available_cpuids(sched_ctx, &cpuids, &ncpuids);
															
 
																+			if (!task->possibly_parallel)
															
 
																+				ncpuids=1;
															
 
																+			omp_set_num_threads(ncpuids);
															
 
																+			mkl_set_num_threads(ncpuids);
															
 
																+			mkl_set_dynamic(0);
															
 
																+#pragma omp parallel
															
 
																+			{
															
 
																+				starpu_sched_ctx_bind_current_thread_to_cpuid(cpuids[omp_get_thread_num()]);
															
 
																+			}
															
 
																+			free(cpuids);
															
 
																+			ctx_struct->parallel_view = !ctx_struct->parallel_view;
															
 
																 		}
															
 
																-		free(cpuids);
															
 
																 	}
															
 
																 	return;
															
 
																 }
															
@@ -324,8 +346,8 @@ int _starpu_cluster_bind(struct _starpu_cluster *cluster)
 
																 	else
															
 
																 	{
															
 
																 		func = _starpu_cluster_type_get_func(cluster->params->type);
															
 
																-		func_arg = (void*) &cluster->id;
															
 
																-		}
															
 
																+		func_arg = NULL;
															
 
																+	}
															
 
																 	return starpu_task_insert(&_starpu_cluster_bind_cl,
															
 
																 				  STARPU_SCHED_CTX, cluster->id,
															
--- a/starpufft/src/Makefile.am
+++ b/starpufft/src/Makefile.am
@@ -18,7 +18,7 @@
 
																 include $(top_srcdir)/starpu.mk
															
 
																 AM_CFLAGS = $(GLOBAL_AM_CFLAGS) $(HWLOC_CFLAGS)
															
 
																-AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/starpufft/include/ -I$(top_builddir)/include $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
															
 
																+AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/starpufft/include/ -I$(top_builddir)/include $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(HWLOC_CFLAGS)
															
 
																 lib_LTLIBRARIES = libstarpufft-@STARPU_EFFECTIVE_VERSION@.la
															
--- a/tests/datawizard/acquire_cb_insert.c
+++ b/tests/datawizard/acquire_cb_insert.c
@@ -111,11 +111,18 @@ int main(int argc, char **argv)
 
																 			starpu_task_insert(&work, STARPU_W, starpu_data_get_sub_data(f_handle, 1, x), 0)
															
 
																 			);
															
 
																 #else
															
 
																-	starpu_data_acquire_cb(x_handle, STARPU_W, callback, NULL);
															
 
																+	starpu_data_acquire_cb(x_handle, STARPU_R, callback, NULL);
															
 
																 #endif
															
 
																+	/* Wait for acquisition (and thus insertion) */
															
 
																+	starpu_data_acquire(x_handle, STARPU_W);
															
 
																+	starpu_data_release(x_handle);
															
 
																+
															
 
																+	/* Now wait for the inserted task */
															
 
																 	ret = starpu_task_wait_for_all();
															
 
																 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
															
 
																+
															
 
																+	/* Can now clean */
															
 
																 	starpu_data_unpartition(f_handle, STARPU_MAIN_RAM);
															
 
																 	starpu_data_unregister(f_handle);
															
 
																 	starpu_data_unregister(x_handle);
															
--- a/tests/loader.c
+++ b/tests/loader.c
@@ -297,6 +297,7 @@ int main(int argc, char *argv[])
 
																 				strcpy(libtool, top_builddir);
															
 
																 				strcat(libtool, "/libtool");
															
 
																+				decode(&launcher, "@top_srcdir@", top_srcdir);
															
 
																 				decode(&launcher_args, "@top_srcdir@", top_srcdir);
															
 
																 				launcher_argv[0] = libtool;
															
--- a/tests/sched_ctx/sched_ctx_hierarchy.c
+++ b/tests/sched_ctx/sched_ctx_hierarchy.c
@@ -23,7 +23,7 @@ void func_cpu_bis(void *descr[], void *_args)
 
																 {
															
 
																 	char msg;
															
 
																 	char worker_name[256];
															
 
																-	int worker_id = starpu_worker_get_id();
															
 
																+	int worker_id = starpu_worker_get_id_check();
															
 
																 	int worker_id_expected;
															
 
																 	int ntasks;
															
@@ -54,7 +54,7 @@ void func_cpu(void *descr[], void *_args)
 
																 {
															
 
																 	char msg;
															
 
																 	char worker_name[256];
															
 
																-	int worker_id = starpu_worker_get_id();
															
 
																+	int worker_id = starpu_worker_get_id_check();
															
 
																 	int worker_id_expected;
															
 
																 	int ntasks;
															
 
																 	unsigned sched_ctx_id;
															
--- a/tools/Makefile.am
+++ b/tools/Makefile.am
@@ -138,16 +138,21 @@ EXTRA_DIST =				\
 
																 	dev/rename.sh			\
															
 
																 	perfmodels/README		\
															
 
																 	perfmodels/sampling/codelets/tmp/mlr_init.out	 \
															
 
																+	valgrind/fxt.suppr		\
															
 
																 	valgrind/hwloc.suppr		\
															
 
																 	valgrind/libc.suppr		\
															
 
																 	valgrind/libgomp.suppr		\
															
 
																 	valgrind/libnuma.suppr		\
															
 
																+	valgrind/madmpi.suppr		\
															
 
																 	valgrind/opencl.suppr		\
															
 
																 	valgrind/openmpi.suppr		\
															
 
																+	valgrind/openmp.suppr		\
															
 
																+	valgrind/padico.suppr		\
															
 
																 	valgrind/pthread.suppr		\
															
 
																 	valgrind/starpu.suppr		\
															
 
																 	valgrind/valgrind.suppr		\
															
 
																 	valgrind/valgrind.sh		\
															
 
																+	valgrind/helgrind.sh		\
															
 
																 	tsan/starpu.suppr		\
															
 
																 	lsan/libc.suppr			\
															
 
																 	lsan/openmpi.suppr		\
															
--- a/tools/cppcheck/suppressions.txt
+++ b/tools/cppcheck/suppressions.txt
@@ -42,11 +42,11 @@ unusedStructMember:src/core/perfmodel/perfmodel_bus.c:65
 
																 unusedStructMember:src/core/perfmodel/perfmodel_bus.c:66
															
 
																 unusedStructMember:src/core/simgrid.c:225
															
 
																 unusedStructMember:src/core/simgrid.c:226
															
 
																-wrongPrintfScanfArgNum:src/core/simgrid.c:719
															
 
																+wrongPrintfScanfArgNum:src/core/simgrid.c:731
															
 
																 duplicateExpression:src/util/starpu_task_insert.c:52
															
 
																 // TODO: this could be an error?
															
 
																-redundantCopy:src/core/disk_ops/disk_leveldb.cpp:192
															
 
																+redundantCopy:src/core/disk_ops/disk_leveldb.cpp:194
															
 
																 nullPointerRedundantCheck:src/common/rbtree.c
															
 
																 unreadVariable:src/datawizard/interfaces/*
															
@@ -62,5 +62,6 @@ allocaCalled:gcc-plugin/src/*
 
																 unusedVariable:gcc-plugin/tests/*
															
 
																 unreadVariable:gcc-plugin/tests/*
															
 
																 duplicateExpression:gcc-plugin/src/*
															
 
																+negativeIndex:gcc-plugin/src/*
															
 
																 pointerSize:socl/src/cl_getcontextinfo.c:33
															
--- a/tools/valgrind/hwloc.suppr
+++ b/tools/valgrind/hwloc.suppr
@@ -20,3 +20,146 @@
 
																    obj:/usr/lib/x86_64-linux-gnu/libhwloc.so.5.6.8
															
 
																    fun:hwloc_topology_init
															
 
																 }
															
 
																+{
															
 
																+   <insert_a_suppression_name_here>
															
 
																+   Memcheck:Leak
															
 
																+   match-leak-kinds: indirect
															
 
																+   fun:realloc
															
 
																+   obj:*
															
 
																+   obj:*
															
 
																+   obj:*
															
 
																+   obj:*
															
 
																+   fun:hwloc_discover
															
 
																+}
															
 
																+{
															
 
																+   <insert_a_suppression_name_here>
															
 
																+   Memcheck:Leak
															
 
																+   match-leak-kinds: indirect
															
 
																+   fun:malloc
															
 
																+   fun:strdup
															
 
																+   obj:*
															
 
																+   obj:*
															
 
																+   obj:*
															
 
																+   obj:*
															
 
																+   fun:hwloc_discover
															
 
																+}
															
 
																+{
															
 
																+   <insert_a_suppression_name_here>
															
 
																+   Memcheck:Leak
															
 
																+   match-leak-kinds: indirect
															
 
																+   fun:calloc
															
 
																+   obj:*
															
 
																+   obj:*
															
 
																+   obj:*
															
 
																+   fun:hwloc_discover
															
 
																+}
															
 
																+{
															
 
																+   <insert_a_suppression_name_here>
															
 
																+   Memcheck:Leak
															
 
																+   match-leak-kinds: definite
															
 
																+   fun:calloc
															
 
																+   obj:*
															
 
																+   obj:*
															
 
																+   obj:*
															
 
																+   fun:hwloc_discover
															
 
																+}
															
 
																+{
															
 
																+   <insert_a_suppression_name_here>
															
 
																+   Memcheck:Leak
															
 
																+   match-leak-kinds: indirect
															
 
																+   fun:malloc
															
 
																+   fun:strdup
															
 
																+   obj:*
															
 
																+   obj:*
															
 
																+   obj:*
															
 
																+   fun:hwloc_discover
															
 
																+}
															
 
																+
															
 
																+{
															
 
																+   <insert_a_suppression_name_here>
															
 
																+   Memcheck:Leak
															
 
																+   match-leak-kinds: possible
															
 
																+   fun:malloc
															
 
																+   fun:strdup
															
 
																+   obj:*
															
 
																+   obj:*
															
 
																+   obj:*
															
 
																+   obj:*
															
 
																+   fun:hwloc_discover
															
 
																+}
															
 
																+{
															
 
																+   <insert_a_suppression_name_here>
															
 
																+   Memcheck:Leak
															
 
																+   match-leak-kinds: possible
															
 
																+   fun:malloc
															
 
																+   fun:strdup
															
 
																+   obj:*
															
 
																+   obj:*
															
 
																+   obj:*
															
 
																+   obj:*
															
 
																+   fun:hwloc_discover
															
 
																+}
															
 
																+{
															
 
																+   <insert_a_suppression_name_here>
															
 
																+   Memcheck:Leak
															
 
																+   match-leak-kinds: possible
															
 
																+   fun:realloc
															
 
																+   obj:*
															
 
																+   obj:*
															
 
																+   obj:*
															
 
																+   obj:*
															
 
																+   fun:hwloc_discover
															
 
																+}
															
 
																+{
															
 
																+   <insert_a_suppression_name_here>
															
 
																+   Memcheck:Leak
															
 
																+   match-leak-kinds: reachable
															
 
																+   fun:malloc
															
 
																+   fun:strdup
															
 
																+   obj:*
															
 
																+   obj:*
															
 
																+   obj:*
															
 
																+   obj:*
															
 
																+   fun:hwloc_discover
															
 
																+}
															
 
																+{
															
 
																+   <insert_a_suppression_name_here>
															
 
																+   Memcheck:Leak
															
 
																+   match-leak-kinds: reachable
															
 
																+   fun:malloc
															
 
																+   fun:strdup
															
 
																+   obj:/usr/lib/x86_64-linux-gnu/libpciaccess.so.0.11.1
															
 
																+   obj:/usr/lib/x86_64-linux-gnu/libpciaccess.so.0.11.1
															
 
																+   fun:pci_device_get_device_name
															
 
																+   fun:hwloc_look_pci
															
 
																+}
															
 
																+
															
 
																+{
															
 
																+   <insert_a_suppression_name_here>
															
 
																+   Memcheck:Leak
															
 
																+   match-leak-kinds: reachable
															
 
																+   fun:realloc
															
 
																+   obj:/usr/lib/x86_64-linux-gnu/libpciaccess.so.0.11.1
															
 
																+   obj:/usr/lib/x86_64-linux-gnu/libpciaccess.so.0.11.1
															
 
																+   fun:pci_device_get_device_name
															
 
																+   fun:hwloc_look_pci
															
 
																+}
															
 
																+{
															
 
																+   <insert_a_suppression_name_here>
															
 
																+   Memcheck:Leak
															
 
																+   match-leak-kinds: reachable
															
 
																+   fun:calloc
															
 
																+   obj:/usr/lib/x86_64-linux-gnu/libpciaccess.so.0.11.1
															
 
																+   obj:/usr/lib/x86_64-linux-gnu/libpciaccess.so.0.11.1
															
 
																+   fun:hwloc_look_pci
															
 
																+}
															
 
																+{
															
 
																+   <insert_a_suppression_name_here>
															
 
																+   Memcheck:Leak
															
 
																+   match-leak-kinds: reachable
															
 
																+   fun:malloc
															
 
																+   fun:strdup
															
 
																+   obj:/usr/lib/x86_64-linux-gnu/libpciaccess.so.0.11.1
															
 
																+   obj:/usr/lib/x86_64-linux-gnu/libpciaccess.so.0.11.1
															
 
																+   fun:hwloc_look_pci
															
 
																+}
															
--- a/tools/valgrind/openmp.suppr
+++ b/tools/valgrind/openmp.suppr
@@ -0,0 +1,167 @@
 
																+{
															
 
																+   <insert_a_suppression_name_here>
															
 
																+   Memcheck:Leak
															
 
																+   match-leak-kinds: reachable
															
 
																+   fun:realloc
															
 
																+   obj:/usr/lib/x86_64-linux-gnu/libpciaccess.so.0.11.1
															
 
																+   obj:/usr/lib/x86_64-linux-gnu/libpciaccess.so.0.11.1
															
 
																+   fun:pci_device_get_device_name
															
 
																+   fun:hwloc_look_pci
															
 
																+   fun:hwloc_discover
															
 
																+   fun:hwloc_topology_load
															
 
																+   fun:_starpu_init_topology
															
 
																+   fun:_starpu_topology_get_nhwcpu
															
 
																+   fun:check_bus_config_file
															
 
																+   fun:_starpu_load_bus_performance_files
															
 
																+   fun:starpu_initialize
															
 
																+   fun:starpu_init
															
 
																+   fun:omp_initial_thread_setup
															
 
																+   fun:omp_initial_region_setup
															
 
																+   fun:starpu_omp_init
															
 
																+   fun:omp_constructor
															
 
																+   fun:__libc_csu_init
															
 
																+   fun:(below main)
															
 
																+}
															
 
																+{
															
 
																+   <insert_a_suppression_name_here>
															
 
																+   Memcheck:Leak
															
 
																+   match-leak-kinds: reachable
															
 
																+   fun:malloc
															
 
																+   fun:strdup
															
 
																+   obj:/usr/lib/x86_64-linux-gnu/libpciaccess.so.0.11.1
															
 
																+   obj:/usr/lib/x86_64-linux-gnu/libpciaccess.so.0.11.1
															
 
																+   fun:pci_device_get_device_name
															
 
																+   fun:hwloc_look_pci
															
 
																+   fun:hwloc_discover
															
 
																+   fun:hwloc_topology_load
															
 
																+   fun:_starpu_init_topology
															
 
																+   fun:_starpu_topology_get_nhwcpu
															
 
																+   fun:check_bus_config_file
															
 
																+   fun:_starpu_load_bus_performance_files
															
 
																+   fun:starpu_initialize
															
 
																+   fun:starpu_init
															
 
																+   fun:omp_initial_thread_setup
															
 
																+   fun:omp_initial_region_setup
															
 
																+   fun:starpu_omp_init
															
 
																+   fun:omp_constructor
															
 
																+   fun:__libc_csu_init
															
 
																+   fun:(below main)
															
 
																+}
															
 
																+{
															
 
																+   <insert_a_suppression_name_here>
															
 
																+   Memcheck:Leak
															
 
																+   match-leak-kinds: reachable
															
 
																+   fun:malloc
															
 
																+   fun:strdup
															
 
																+   obj:/usr/lib/x86_64-linux-gnu/libpciaccess.so.0.11.1
															
 
																+   obj:/usr/lib/x86_64-linux-gnu/libpciaccess.so.0.11.1
															
 
																+   fun:pci_device_get_device_name
															
 
																+   fun:hwloc_look_pci
															
 
																+   fun:hwloc_discover
															
 
																+   fun:hwloc_topology_load
															
 
																+   fun:_starpu_init_topology
															
 
																+   fun:_starpu_topology_get_nhwcpu
															
 
																+   fun:check_bus_config_file
															
 
																+   fun:_starpu_load_bus_performance_files
															
 
																+   fun:starpu_initialize
															
 
																+   fun:starpu_init
															
 
																+   fun:omp_initial_thread_setup
															
 
																+   fun:omp_initial_region_setup
															
 
																+   fun:starpu_omp_init
															
 
																+   fun:omp_constructor
															
 
																+   fun:__libc_csu_init
															
 
																+   fun:(below main)
															
 
																+}
															
 
																+{
															
 
																+   <insert_a_suppression_name_here>
															
 
																+   Memcheck:Leak
															
 
																+   match-leak-kinds: reachable
															
 
																+   fun:calloc
															
 
																+   obj:/usr/lib/x86_64-linux-gnu/libpciaccess.so.0.11.1
															
 
																+   obj:/usr/lib/x86_64-linux-gnu/libpciaccess.so.0.11.1
															
 
																+   fun:hwloc_look_pci
															
 
																+   fun:hwloc_discover
															
 
																+   fun:hwloc_topology_load
															
 
																+   fun:_starpu_init_topology
															
 
																+   fun:_starpu_topology_get_nhwcpu
															
 
																+   fun:check_bus_config_file
															
 
																+   fun:_starpu_load_bus_performance_files
															
 
																+   fun:starpu_initialize
															
 
																+   fun:starpu_init
															
 
																+   fun:omp_initial_thread_setup
															
 
																+   fun:omp_initial_region_setup
															
 
																+   fun:starpu_omp_init
															
 
																+   fun:omp_constructor
															
 
																+   fun:__libc_csu_init
															
 
																+   fun:(below main)
															
 
																+}
															
 
																+{
															
 
																+   <insert_a_suppression_name_here>
															
 
																+   Memcheck:Leak
															
 
																+   match-leak-kinds: reachable
															
 
																+   fun:calloc
															
 
																+   obj:/usr/lib/x86_64-linux-gnu/libpciaccess.so.0.11.1
															
 
																+   obj:/usr/lib/x86_64-linux-gnu/libpciaccess.so.0.11.1
															
 
																+   fun:hwloc_look_pci
															
 
																+   fun:hwloc_discover
															
 
																+   fun:hwloc_topology_load
															
 
																+   fun:_starpu_init_topology
															
 
																+   fun:_starpu_topology_get_nhwcpu
															
 
																+   fun:check_bus_config_file
															
 
																+   fun:_starpu_load_bus_performance_files
															
 
																+   fun:starpu_initialize
															
 
																+   fun:starpu_init
															
 
																+   fun:omp_initial_thread_setup
															
 
																+   fun:omp_initial_region_setup
															
 
																+   fun:starpu_omp_init
															
 
																+   fun:omp_constructor
															
 
																+   fun:__libc_csu_init
															
 
																+   fun:(below main)
															
 
																+}
															
 
																+{
															
 
																+   <insert_a_suppression_name_here>
															
 
																+   Memcheck:Leak
															
 
																+   match-leak-kinds: reachable
															
 
																+   fun:calloc
															
 
																+   obj:/usr/lib/x86_64-linux-gnu/libpciaccess.so.0.11.1
															
 
																+   obj:/usr/lib/x86_64-linux-gnu/libpciaccess.so.0.11.1
															
 
																+   fun:hwloc_look_pci
															
 
																+   fun:hwloc_discover
															
 
																+   fun:hwloc_topology_load
															
 
																+   fun:_starpu_init_topology
															
 
																+   fun:_starpu_topology_get_nhwcpu
															
 
																+   fun:check_bus_config_file
															
 
																+   fun:_starpu_load_bus_performance_files
															
 
																+   fun:starpu_initialize
															
 
																+   fun:starpu_init
															
 
																+   fun:omp_initial_thread_setup
															
 
																+   fun:omp_initial_region_setup
															
 
																+   fun:starpu_omp_init
															
 
																+   fun:omp_constructor
															
 
																+   fun:__libc_csu_init
															
 
																+   fun:(below main)
															
 
																+}
															
 
																+{
															
 
																+   <insert_a_suppression_name_here>
															
 
																+   Memcheck:Leak
															
 
																+   match-leak-kinds: reachable
															
 
																+   fun:malloc
															
 
																+   fun:strdup
															
 
																+   obj:/usr/lib/x86_64-linux-gnu/libpciaccess.so.0.11.1
															
 
																+   obj:/usr/lib/x86_64-linux-gnu/libpciaccess.so.0.11.1
															
 
																+   fun:hwloc_look_pci
															
 
																+   fun:hwloc_discover
															
 
																+   fun:hwloc_topology_load
															
 
																+   fun:_starpu_init_topology
															
 
																+   fun:_starpu_topology_get_nhwcpu
															
 
																+   fun:check_bus_config_file
															
 
																+   fun:_starpu_load_bus_performance_files
															
 
																+   fun:starpu_initialize
															
 
																+   fun:starpu_init
															
 
																+   fun:omp_initial_thread_setup
															
 
																+   fun:omp_initial_region_setup
															
 
																+   fun:starpu_omp_init
															
 
																+   fun:omp_constructor
															
 
																+   fun:__libc_csu_init
															
 
																+   fun:(below main)
															
 
																+}